Diffstat (limited to 'usr/src/uts/common/os')
37 files changed, 2812 insertions, 597 deletions
diff --git a/usr/src/uts/common/os/acct.c b/usr/src/uts/common/os/acct.c index e598e0d08d..891c4e0836 100644 --- a/usr/src/uts/common/os/acct.c +++ b/usr/src/uts/common/os/acct.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -47,6 +48,7 @@ #include <sys/time.h> #include <sys/msacct.h> #include <sys/zone.h> +#include <sys/brand.h> /* * Each zone has its own accounting settings (on or off) and associated @@ -373,7 +375,7 @@ acct_compress(ulong_t t) * On exit, write a record on the accounting file. */ void -acct(char st) +acct(int st) { struct vnode *vp; struct cred *cr; @@ -402,6 +404,21 @@ acct(char st) * This only gets called from exit after all lwp's have exited so no * cred locking is needed. */ + + /* If there is a brand-specific hook, use it instead */ + if (ZONE_IS_BRANDED(curzone) && ZBROP(curzone)->b_acct_out != NULL) { + ZBROP(curzone)->b_acct_out(vp, st); + mutex_exit(&ag->aclock); + return; + } + + /* + * The 'st' status value was traditionally masked this way by our + * caller, but we now accept the unmasked value for brand handling. + * Zones not using the brand hook mask the status here. + */ + st &= 0xff; + p = curproc; ua = PTOU(p); bcopy(ua->u_comm, ag->acctbuf.ac_comm, sizeof (ag->acctbuf.ac_comm)); diff --git a/usr/src/uts/common/os/brand.c b/usr/src/uts/common/os/brand.c index 8b3177b916..fa3555a82a 100644 --- a/usr/src/uts/common/os/brand.c +++ b/usr/src/uts/common/os/brand.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #include <sys/kmem.h> @@ -45,7 +46,7 @@ struct brand_mach_ops native_mach_ops = { }; #else /* !__sparcv9 */ struct brand_mach_ops native_mach_ops = { - NULL, NULL, NULL, NULL + NULL, NULL, NULL, NULL, NULL, NULL, NULL }; #endif /* !__sparcv9 */ @@ -53,7 +54,8 @@ brand_t native_brand = { BRAND_VER_1, "native", NULL, - &native_mach_ops + &native_mach_ops, + 0 }; /* @@ -310,46 +312,115 @@ brand_unregister_zone(struct brand *bp) mutex_exit(&brand_list_lock); } -void -brand_setbrand(proc_t *p) +int +brand_setbrand(proc_t *p, boolean_t lwps_ok) { brand_t *bp = p->p_zone->zone_brand; + void *brand_data = NULL; - ASSERT(bp != NULL); - ASSERT(p->p_brand == &native_brand); + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + VERIFY(bp != NULL); /* - * We should only be called from exec(), when we know the process - * is single-threaded. + * Process branding occurs during fork() and exec(). When it happens + * during fork(), the LWP count will always be 0 since branding is + * performed as part of getproc(), before LWPs have been associated. + * The same is not true during exec(), where a multi-LWP process may + * undergo branding just prior to gexec(). This is to ensure + * exec-related brand hooks are available. While it may seem + * complicated to brand a multi-LWP process, the two possible outcomes + * simplify things: + * + * 1. The exec() succeeds: LWPs besides the caller will be killed and + * any further branding will occur in a single-LWP context. + * 2. The exec() fails: The process will be promptly unbranded since + * the hooks are no longer needed. + * + * To prevent inconsistent brand state from being encountered during + * the exec(), LWPs beyond the caller which are associated with this + * process must be held temporarily. 
They will be released either when + * they are killed in the exec() success, or when the brand is cleared + * after exec() failure. */ - ASSERT(p->p_tlist == p->p_tlist->t_forw); + if (lwps_ok) { + /* + * We've been called from a exec() context tolerating the + * existence of multiple LWPs during branding is necessary. + */ + VERIFY(p == curproc); + VERIFY(p->p_tlist != NULL); + if (p->p_tlist != p->p_tlist->t_forw) { + /* + * Multiple LWPs are present. Hold all but the caller. + */ + if (!holdlwps(SHOLDFORK1)) { + return (-1); + } + } + } else { + /* + * Processes branded during fork() should not have LWPs at all. + */ + VERIFY(p->p_tlist == NULL); + } + + if (bp->b_data_size > 0) { + brand_data = kmem_zalloc(bp->b_data_size, KM_SLEEP); + } + + mutex_enter(&p->p_lock); + ASSERT(!PROC_IS_BRANDED(p)); p->p_brand = bp; + p->p_brand_data = brand_data; ASSERT(PROC_IS_BRANDED(p)); BROP(p)->b_setbrand(p); + mutex_exit(&p->p_lock); + return (0); } void -brand_clearbrand(proc_t *p, boolean_t no_lwps) +brand_clearbrand(proc_t *p, boolean_t lwps_ok) { brand_t *bp = p->p_zone->zone_brand; - klwp_t *lwp = NULL; - ASSERT(bp != NULL); - ASSERT(!no_lwps || (p->p_tlist == NULL)); + void *brand_data; - /* - * If called from exec_common() or proc_exit(), - * we know the process is single-threaded. - * If called from fork_fail, p_tlist is NULL. - */ - if (!no_lwps) { - ASSERT(p->p_tlist == p->p_tlist->t_forw); - lwp = p->p_tlist->t_lwp; - } + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + VERIFY(bp != NULL); + VERIFY(PROC_IS_BRANDED(p)); - ASSERT(PROC_IS_BRANDED(p)); - BROP(p)->b_proc_exit(p, lwp); + if (BROP(p)->b_clearbrand != NULL) + BROP(p)->b_clearbrand(p, lwps_ok); + + mutex_enter(&p->p_lock); p->p_brand = &native_brand; + brand_data = p->p_brand_data; + p->p_brand_data = NULL; + + if (lwps_ok) { + VERIFY(p == curproc); + /* + * A process with multiple LWPs is being de-branded after + * failing an exec. The other LWPs were held as part of the + * procedure, so they must be resumed now. + */ + if (p->p_tlist != NULL && p->p_tlist != p->p_tlist->t_forw) { + continuelwps(p); + } + } else { + /* + * While clearing the brand, it's ok for one LWP to be present. + * This happens when a native binary is executed inside a + * branded zone, since the brand will be removed during the + * course of a successful exec. + */ + VERIFY(p->p_tlist == NULL || p->p_tlist == p->p_tlist->t_forw); + } + mutex_exit(&p->p_lock); + + if (brand_data != NULL) { + kmem_free(brand_data, bp->b_data_size); + } } #if defined(__sparcv9) @@ -483,7 +554,7 @@ brand_solaris_cmd(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, return (ENOSYS); /* For all other operations this must be a branded process. 
*/ - if (p->p_brand == &native_brand) + if (!PROC_IS_BRANDED(p)) return (ENOSYS); ASSERT(p->p_brand == pbrand); @@ -600,16 +671,16 @@ restoreexecenv(struct execenv *ep, stack_t *sp) /*ARGSUSED*/ int brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, - intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file, - cred_t *cred, int brand_action, struct brand *pbrand, char *bname, - char *brandlib, char *brandlib32, char *brandlinker, char *brandlinker32) + intpdata_t *idatap, int level, size_t *execsz, int setid, + caddr_t exec_file, cred_t *cred, int *brand_action, struct brand *pbrand, + char *bname, char *brandlib, char *brandlib32) { vnode_t *nvp; Ehdr ehdr; Addr uphdr_vaddr; intptr_t voffset; - int interp; + char *interp; int i, err; struct execenv env; struct execenv origenv; @@ -619,7 +690,6 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, klwp_t *lwp = ttolwp(curthread); brand_proc_data_t *spd; brand_elf_data_t sed, *sedp; - char *linker; uintptr_t lddata; /* lddata of executable's linker */ ASSERT(curproc->p_brand == pbrand); @@ -636,12 +706,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, */ if (args->to_model == DATAMODEL_NATIVE) { args->emulator = brandlib; - linker = brandlinker; } #if defined(_LP64) else { args->emulator = brandlib32; - linker = brandlinker32; } #endif /* _LP64 */ @@ -725,7 +793,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, if (args->to_model == DATAMODEL_NATIVE) { err = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr, &voffset, exec_file, &interp, &env.ex_bssbase, - &env.ex_brkbase, &env.ex_brksize, NULL); + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); } #if defined(_LP64) else { @@ -733,7 +801,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, Elf32_Addr uphdr_vaddr32; err = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32, &voffset, exec_file, &interp, &env.ex_bssbase, - &env.ex_brkbase, &env.ex_brksize, NULL); + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); Ehdr32to64(&ehdr32, &ehdr); if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -744,6 +812,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, #endif /* _LP64 */ if (err != 0) { restoreexecenv(&origenv, &orig_sigaltstack); + + if (interp != NULL) + kmem_free(interp, MAXPATHLEN); + return (err); } @@ -761,7 +833,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, sedp->sed_phent = ehdr.e_phentsize; sedp->sed_phnum = ehdr.e_phnum; - if (interp) { + if (interp != NULL) { if (ehdr.e_type == ET_DYN) { /* * This is a shared object executable, so we @@ -777,16 +849,20 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, * it in and store relevant information about it in the * aux vector, where the brand library can find it. 
*/ - if ((err = lookupname(linker, UIO_SYSSPACE, + if ((err = lookupname(interp, UIO_SYSSPACE, FOLLOW, NULLVPP, &nvp)) != 0) { - uprintf("%s: not found.", brandlinker); + uprintf("%s: not found.", interp); restoreexecenv(&origenv, &orig_sigaltstack); + kmem_free(interp, MAXPATHLEN); return (err); } + + kmem_free(interp, MAXPATHLEN); + if (args->to_model == DATAMODEL_NATIVE) { err = mapexec_brand(nvp, args, &ehdr, &uphdr_vaddr, &voffset, exec_file, &interp, - NULL, NULL, NULL, &lddata); + NULL, NULL, NULL, &lddata, NULL); } #if defined(_LP64) else { @@ -794,7 +870,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, Elf32_Addr uphdr_vaddr32; err = mapexec32_brand(nvp, args, &ehdr32, &uphdr_vaddr32, &voffset, exec_file, &interp, - NULL, NULL, NULL, &lddata); + NULL, NULL, NULL, &lddata, NULL); Ehdr32to64(&ehdr32, &ehdr); if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -934,9 +1010,9 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, /* * Third, the /proc aux vectors set up by elfexec() point to - * brand emulation library and it's linker. Copy these to the + * brand emulation library and its linker. Copy these to the * /proc brand specific aux vector, and update the regular - * /proc aux vectors to point to the executable (and it's + * /proc aux vectors to point to the executable (and its * linker). This will enable debuggers to access the * executable via the usual /proc or elf notes aux vectors. * @@ -1078,55 +1154,31 @@ brand_solaris_freelwp(klwp_t *l, struct brand *pbrand) } /*ARGSUSED*/ -int +void brand_solaris_initlwp(klwp_t *l, struct brand *pbrand) { ASSERT(l->lwp_procp->p_brand == pbrand); ASSERT(l->lwp_procp->p_brand_data != NULL); ASSERT(l->lwp_brand == NULL); l->lwp_brand = (void *)-1; - return (0); } /*ARGSUSED*/ void brand_solaris_lwpexit(klwp_t *l, struct brand *pbrand) { - proc_t *p = l->lwp_procp; - ASSERT(l->lwp_procp->p_brand == pbrand); ASSERT(l->lwp_procp->p_brand_data != NULL); ASSERT(l->lwp_brand != NULL); - - /* - * We should never be called for the last thread in a process. - * (That case is handled by brand_solaris_proc_exit().) - * Therefore this lwp must be exiting from a multi-threaded - * process. - */ - ASSERT(p->p_tlist != p->p_tlist->t_forw); - - l->lwp_brand = NULL; } /*ARGSUSED*/ void -brand_solaris_proc_exit(struct proc *p, klwp_t *l, struct brand *pbrand) +brand_solaris_proc_exit(struct proc *p, struct brand *pbrand) { ASSERT(p->p_brand == pbrand); ASSERT(p->p_brand_data != NULL); - /* - * When called from proc_exit(), we know that process is - * single-threaded and free our lwp brand data. - * otherwise just free p_brand_data and return. - */ - if (l != NULL) { - ASSERT(p->p_tlist == p->p_tlist->t_forw); - ASSERT(p->p_tlist->t_lwp == l); - (void) brand_solaris_freelwp(l, pbrand); - } - /* upon exit, free our proc brand data */ kmem_free(p->p_brand_data, sizeof (brand_proc_data_t)); p->p_brand_data = NULL; @@ -1145,5 +1197,4 @@ brand_solaris_setbrand(proc_t *p, struct brand *pbrand) ASSERT(p->p_tlist == p->p_tlist->t_forw); p->p_brand_data = kmem_zalloc(sizeof (brand_proc_data_t), KM_SLEEP); - (void) brand_solaris_initlwp(p->p_tlist->t_lwp, pbrand); } diff --git a/usr/src/uts/common/os/contract.c b/usr/src/uts/common/os/contract.c index 9e498dc1c7..e4b1db84e1 100644 --- a/usr/src/uts/common/os/contract.c +++ b/usr/src/uts/common/os/contract.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ /* * Copyright (c) 2017 by Delphix. 
All rights reserved. @@ -290,7 +291,10 @@ contract_ctor(contract_t *ct, ct_type_t *type, ct_template_t *tmpl, void *data, avl_index_t where; klwp_t *curlwp = ttolwp(curthread); - ASSERT(author == curproc); + /* + * It's possible that author is not curproc if the zone is creating + * a new process as a child of zsched. + */ mutex_init(&ct->ct_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ct->ct_reflock, NULL, MUTEX_DEFAULT, NULL); diff --git a/usr/src/uts/common/os/core.c b/usr/src/uts/common/os/core.c index d5e272c16a..a147b1cf0f 100644 --- a/usr/src/uts/common/os/core.c +++ b/usr/src/uts/common/os/core.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent Inc. All rights reserved. + * Copyright 2019 Joyent Inc. * Copyright (c) 2016 by Delphix. All rights reserved. */ @@ -125,6 +125,7 @@ remove_core_file(char *fp, enum core_types core_type) /* * Determine what rootvp to use. */ + mutex_enter(&curproc->p_lock); if (core_type == CORE_PROC) { rootvp = (PTOU(curproc)->u_rdir == NULL ? curproc->p_zone->zone_rootvp : PTOU(curproc)->u_rdir); @@ -140,6 +141,7 @@ remove_core_file(char *fp, enum core_types core_type) VN_HOLD(startvp); if (rootvp != rootdir) VN_HOLD(rootvp); + mutex_exit(&curproc->p_lock); if ((error = lookuppnvp(&pn, NULL, NO_FOLLOW, &dvp, &vp, rootvp, startvp, CRED())) != 0) { pn_free(&pn); @@ -793,7 +795,7 @@ clock_t core_delay_usec = 10000; * using core_write() below, and so it has the same failure semantics. */ int -core_seg(proc_t *p, vnode_t *vp, offset_t offset, caddr_t addr, size_t size, +core_seg(proc_t *p, vnode_t *vp, u_offset_t offset, caddr_t addr, size_t size, rlim64_t rlimit, cred_t *credp) { caddr_t eaddr; @@ -801,6 +803,11 @@ core_seg(proc_t *p, vnode_t *vp, offset_t offset, caddr_t addr, size_t size, size_t len; int err = 0; + if (offset > OFF_MAX || offset + size > OFF_MAX || + offset + size < offset) { + return (EOVERFLOW); + } + eaddr = addr + size; for (base = addr; base < eaddr; base += len) { len = eaddr - base; @@ -841,15 +848,20 @@ core_seg(proc_t *p, vnode_t *vp, offset_t offset, caddr_t addr, size_t size, * unexpectedly returns zero but no progress has been made, we return ENOSPC. 
*/ int -core_write(vnode_t *vp, enum uio_seg segflg, offset_t offset, +core_write(vnode_t *vp, enum uio_seg segflg, u_offset_t offset, const void *buf, size_t len, rlim64_t rlimit, cred_t *credp) { ssize_t resid = len; int error = 0; + if (offset > OFF_MAX || offset + len > OFF_MAX || + offset + len < offset) { + return (EOVERFLOW); + } + while (len != 0) { - error = vn_rdwr(UIO_WRITE, vp, (caddr_t)buf, len, offset, - segflg, 0, rlimit, credp, &resid); + error = vn_rdwr(UIO_WRITE, vp, (caddr_t)buf, len, + (offset_t)offset, segflg, 0, rlimit, credp, &resid); if (error != 0) break; diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c index 075bb6e70a..6a86dbb8cb 100644 --- a/usr/src/uts/common/os/cpu.c +++ b/usr/src/uts/common/os/cpu.c @@ -112,7 +112,7 @@ cpu_t *cpu_list; /* list of all CPUs */ cpu_t *clock_cpu_list; /* used by clock to walk CPUs */ cpu_t *cpu_active; /* list of active CPUs */ cpuset_t cpu_active_set; /* cached set of active CPUs */ -static cpuset_t cpu_available; /* set of available CPUs */ +cpuset_t cpu_available; /* set of available CPUs */ cpuset_t cpu_seqid_inuse; /* which cpu_seqids are in use */ cpu_t **cpu_seq; /* ptrs to CPUs, indexed by seq_id */ diff --git a/usr/src/uts/common/os/cred.c b/usr/src/uts/common/os/cred.c index 3e1df330b7..5e909667de 100644 --- a/usr/src/uts/common/os/cred.c +++ b/usr/src/uts/common/os/cred.c @@ -730,6 +730,14 @@ crgetzoneid(const cred_t *cr) cr->cr_zone->zone_id); } +zoneid_t +crgetzonedid(const cred_t *cr) +{ + return (cr->cr_zone == NULL ? + (cr->cr_uid == -1 ? (zoneid_t)-1 : GLOBAL_ZONEID) : + cr->cr_zone->zone_did); +} + projid_t crgetprojid(const cred_t *cr) { diff --git a/usr/src/uts/common/os/ddi_intr_irm.c b/usr/src/uts/common/os/ddi_intr_irm.c index 8faa8fea8c..2433c504fc 100644 --- a/usr/src/uts/common/os/ddi_intr_irm.c +++ b/usr/src/uts/common/os/ddi_intr_irm.c @@ -1320,7 +1320,7 @@ i_ddi_irm_notify(ddi_irm_pool_t *pool_p, ddi_irm_req_t *req_p) /* Log callback errors */ if (ret != DDI_SUCCESS) { - cmn_err(CE_WARN, "%s%d: failed callback (action=%d, ret=%d)\n", + cmn_err(CE_WARN, "!%s%d: failed callback (action=%d, ret=%d)\n", ddi_driver_name(req_p->ireq_dip), ddi_get_instance(req_p->ireq_dip), (int)action, ret); } diff --git a/usr/src/uts/common/os/dumpsubr.c b/usr/src/uts/common/os/dumpsubr.c index 484b2042e2..868ed9e5c4 100644 --- a/usr/src/uts/common/os/dumpsubr.c +++ b/usr/src/uts/common/os/dumpsubr.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright 2018 Nexenta Systems, Inc. All rights reserved. 
*/ @@ -75,6 +75,7 @@ #include <sys/cpu.h> #include <bzip2/bzlib.h> +#include <crypto/chacha/chacha.h> #define ONE_GIG (1024 * 1024 * 1024UL) @@ -112,6 +113,8 @@ int dump_timeout = 120; /* timeout for dumping pages */ int dump_timeleft; /* portion of dump_timeout remaining */ int dump_ioerr; /* dump i/o error */ int dump_check_used; /* enable check for used pages */ +uint8_t dump_crypt_key[DUMP_CRYPT_KEYLEN]; /* dump encryption key */ +uint8_t dump_crypt_nonce[DUMP_CRYPT_NONCELEN]; /* dump nonce */ char *dump_stack_scratch; /* scratch area for saving stack summary */ /* @@ -357,6 +360,7 @@ typedef struct dumpsync { hrtime_t iotime; /* time spent writing nwrite bytes */ hrtime_t iowait; /* time spent waiting for output */ hrtime_t iowaitts; /* iowait timestamp */ + hrtime_t crypt; /* time spent encrypting */ perpage_t perpage; /* metrics */ perpage_t perpagets; int dumpcpu; /* master cpu */ @@ -435,6 +439,7 @@ typedef struct dumpbuf { char *cur; /* dump write pointer */ char *start; /* dump buffer address */ char *end; /* dump buffer end */ + char *scratch; /* scratch buffer */ size_t size; /* size of dumpbuf in bytes */ size_t iosize; /* best transfer size for device */ } dumpbuf_t; @@ -493,11 +498,16 @@ dumpbuf_resize(void) if (new_size <= old_size) return; /* no need to reallocate buffer */ - new_buf = kmem_alloc(new_size, KM_SLEEP); + /* + * Allocate thrice the size of buffer to allow for space for the stream + * and its ciphertext should encryption be enabled (or become so). + */ + new_buf = kmem_alloc(new_size * 3, KM_SLEEP); dumpbuf.size = new_size; dumpbuf.start = new_buf; dumpbuf.end = new_buf + new_size; - kmem_free(old_buf, old_size); + dumpbuf.scratch = dumpbuf.end + new_size; + kmem_free(old_buf, old_size * 3); } /* @@ -1125,9 +1135,16 @@ dumphdr_init(void) dumphdr->dump_pagesize = PAGESIZE; dumphdr->dump_utsname = utsname; (void) strcpy(dumphdr->dump_platform, platform); + + /* + * Allocate our buffer, assuring enough room for encryption + * should it become configured. + */ dumpbuf.size = dumpbuf_iosize(maxphys); - dumpbuf.start = kmem_alloc(dumpbuf.size, KM_SLEEP); + dumpbuf.start = kmem_alloc(dumpbuf.size * 3, KM_SLEEP); dumpbuf.end = dumpbuf.start + dumpbuf.size; + dumpbuf.scratch = dumpbuf.end + dumpbuf.size; + dumpcfg.pids = kmem_alloc(v.v_proc * sizeof (pid_t), KM_SLEEP); dumpcfg.helpermap = kmem_zalloc(BT_SIZEOFMAP(NCPU), KM_SLEEP); LOCK_INIT_HELD(&dumpcfg.helper_lock); @@ -1317,6 +1334,41 @@ dumpfini(void) dumppath = NULL; } +static void +dumpvp_encrypt(size_t size) +{ + size_t nelems = size / sizeof (uint64_t), i; + uint64_t *start = (uint64_t *)dumpbuf.start; + uint64_t *stream = (uint64_t *)dumpbuf.end; + uint64_t *crypt = (uint64_t *)dumpbuf.scratch; + uint64_t ctr = dumpbuf.vp_off >> DUMP_CRYPT_BLOCKSHIFT; + hrtime_t ts = gethrtime(); + offset_t dumpoff = dumpbuf.vp_off; + chacha_ctx_t ctx; + + /* + * Our size should be 64-bit aligned and our offset must be aligned + * to our crypto blocksize. 
+ */ + ASSERT(!(size & (sizeof (uint64_t) - 1))); + ASSERT(!(dumpbuf.vp_off & ((1 << DUMP_CRYPT_BLOCKSHIFT) - 1))); + + chacha_keysetup(&ctx, dump_crypt_key, DUMP_CRYPT_KEYLEN * 8, 0); + chacha_ivsetup(&ctx, dump_crypt_nonce, (uint8_t *)&ctr); + + for (i = 0; i < nelems; i++) { + stream[i] = dumpoff; + dumpoff += sizeof (uint64_t); + } + + chacha_encrypt_bytes(&ctx, (uint8_t *)stream, (uint8_t *)crypt, size); + + for (i = 0; i < nelems; i++) + start[i] ^= crypt[i]; + + dumpsync.crypt += gethrtime() - ts; +} + static offset_t dumpvp_flush(void) { @@ -1328,6 +1380,17 @@ dumpvp_flush(void) dump_ioerr = ENOSPC; dumpbuf.vp_off = dumpbuf.vp_limit; } else if (size != 0) { + /* + * If our dump is encrypted and this is neither the initial + * dump header nor the terminal dump header and metrics, + * encrypt the buffer before writing it. + */ + if ((dump_conflags & DUMP_ENCRYPT) && + dumpbuf.vp_off > dumphdr->dump_start && + dumpbuf.vp_off < dumpbuf.vp_limit - DUMP_OFFSET) { + dumpvp_encrypt(size); + } + iotime = gethrtime(); dumpsync.iowait += iotime - dumpsync.iowaitts; if (panicstr) @@ -2618,6 +2681,7 @@ dumpsys_metrics(dumpsync_t *ds, char *buf, size_t size) P("Dump I/O rate MBS,%d.%02d\n", iorate / 100, iorate % 100); P("..total bytes,%lld\n", (u_longlong_t)ds->nwrite); P("..total nsec,%lld\n", (u_longlong_t)ds->iotime); + P("..crypt nsec,%lld\n", (u_longlong_t)ds->crypt); P("dumpbuf.iosize,%ld\n", dumpbuf.iosize); P("dumpbuf.size,%ld\n", dumpbuf.size); @@ -2658,6 +2722,29 @@ dumpsys_metrics(dumpsync_t *ds, char *buf, size_t size) } #endif /* COLLECT_METRICS */ +CTASSERT(DUMP_CRYPT_HMACLEN <= sizeof (struct utsname)); + +/* + * Mark the dump as encrypted and calculate our (crude) HMAC based on the + * dump_utsname. (The purpose of the HMAC is to merely allow for incorrect + * keys to be quickly rejected.) + */ +void +dumpsys_crypt(dumphdr_t *dumphdr, dump_crypt_t *dcrypt) +{ + chacha_ctx_t ctx; + + dumphdr->dump_flags |= DF_ENCRYPTED; + bcopy(dump_crypt_nonce, dcrypt->dump_crypt_nonce, DUMP_CRYPT_NONCELEN); + dcrypt->dump_crypt_algo = DUMP_CRYPT_ALGO_CHACHA20; + + chacha_keysetup(&ctx, dump_crypt_key, DUMP_CRYPT_KEYLEN * 8, 0); + chacha_ivsetup(&ctx, dump_crypt_nonce, NULL); + + chacha_encrypt_bytes(&ctx, (uint8_t *)&dumphdr->dump_utsname, + (uint8_t *)&dcrypt->dump_crypt_hmac, DUMP_CRYPT_HMACLEN); +} + /* * Dump the system. */ @@ -2679,6 +2766,7 @@ dumpsys(void) dumpmlw_t mlw; dumpcsize_t datatag; dumpdatahdr_t datahdr; + dump_crypt_t dcrypt; if (dumpvp == NULL || dumphdr == NULL) { uprintf("skipping system dump - no dump device configured\n"); @@ -2733,6 +2821,9 @@ dumpsys(void) /* Make sure nodename is current */ bcopy(utsname.nodename, dumphdr->dump_utsname.nodename, SYS_NMLN); + if (dump_conflags & DUMP_ENCRYPT) + dumpsys_crypt(dumphdr, &dcrypt); + /* * If this is a live dump, try to open a VCHR vnode for better * performance. 
We must take care to flush the buffer cache @@ -2999,11 +3090,19 @@ dumpsys(void) */ dumpbuf.vp_off = dumphdr->dump_start; dumpvp_write(dumphdr, sizeof (dumphdr_t)); + + if (dump_conflags & DUMP_ENCRYPT) + dumpvp_write(&dcrypt, sizeof (dump_crypt_t)); + (void) dumpvp_flush(); dumpbuf.vp_limit = dumpvp_size; dumpbuf.vp_off = dumpbuf.vp_limit - DUMP_OFFSET; dumpvp_write(dumphdr, sizeof (dumphdr_t)); + + if (dump_conflags & DUMP_ENCRYPT) + dumpvp_write(&dcrypt, sizeof (dump_crypt_t)); + dumpvp_write(&datahdr, sizeof (dumpdatahdr_t)); dumpvp_write(dumpcfg.cbuf[0].buf, datahdr.dump_metrics); diff --git a/usr/src/uts/common/os/exec.c b/usr/src/uts/common/os/exec.c index f51e2c5ca1..62d1e298dd 100644 --- a/usr/src/uts/common/os/exec.c +++ b/usr/src/uts/common/os/exec.c @@ -24,9 +24,9 @@ */ /* Copyright (c) 1988 AT&T */ -/* All Rights Reserved */ +/* All Rights Reserved */ /* - * Copyright 2017 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #include <sys/types.h> @@ -99,6 +99,7 @@ uint_t auxv_hwcap32_2 = 0; /* 32-bit version of auxv_hwcap2 */ #endif #define PSUIDFLAGS (SNOCD|SUGID) +#define RANDOM_LEN 16 /* 16 bytes for AT_RANDOM aux entry */ /* * These are consumed within the specific exec modules, but are defined here @@ -143,7 +144,7 @@ exec_common(const char *fname, const char **argp, const char **envp, proc_t *p = ttoproc(curthread); klwp_t *lwp = ttolwp(curthread); struct user *up = PTOU(p); - long execsz; /* temporary count of exec size */ + size_t execsz; /* temporary count of exec size */ int i; int error; char exec_file[MAXCOMLEN+1]; @@ -265,8 +266,10 @@ exec_common(const char *fname, const char **argp, const char **envp, * only if the pathname does not contain a "/" the resolved path * points to a file in the current working (attribute) directory. */ - if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 && + mutex_enter(&p->p_lock); + if ((PTOU(p)->u_cdir->v_flag & V_XATTRDIR) != 0 && strchr(resolvepn.pn_path, '/') == NULL) { + mutex_exit(&p->p_lock); if (dir != NULL) VN_RELE(dir); error = EACCES; @@ -275,6 +278,7 @@ exec_common(const char *fname, const char **argp, const char **envp, VN_RELE(vp); goto out; } + mutex_exit(&p->p_lock); bzero(exec_file, MAXCOMLEN+1); (void) strncpy(exec_file, pn.pn_path, MAXCOMLEN); @@ -322,14 +326,43 @@ exec_common(const char *fname, const char **argp, const char **envp, ua.argp = argp; ua.envp = envp; - /* If necessary, brand this process before we start the exec. */ - if (brandme) - brand_setbrand(p); + /* If necessary, brand this process/lwp before we start the exec. */ + if (brandme) { + void *brand_data = NULL; + + /* + * Process branding may fail if multiple LWPs are present and + * holdlwps() cannot complete successfully. 
+ */ + error = brand_setbrand(p, B_TRUE); + + if (error == 0 && BROP(p)->b_lwpdata_alloc != NULL) { + brand_data = BROP(p)->b_lwpdata_alloc(p); + if (brand_data == NULL) { + error = 1; + } + } + + if (error == 0) { + mutex_enter(&p->p_lock); + BROP(p)->b_initlwp(lwp, brand_data); + mutex_exit(&p->p_lock); + } else { + VN_RELE(vp); + if (dir != NULL) { + VN_RELE(dir); + } + pn_free(&resolvepn); + goto fail; + } + } if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz, - exec_file, p->p_cred, brand_action)) != 0) { - if (brandme) - brand_clearbrand(p, B_FALSE); + exec_file, p->p_cred, &brand_action)) != 0) { + if (brandme) { + BROP(p)->b_freelwp(lwp); + brand_clearbrand(p, B_TRUE); + } VN_RELE(vp); if (dir != NULL) VN_RELE(dir); @@ -361,7 +394,7 @@ exec_common(const char *fname, const char **argp, const char **envp, /* * Clear contract template state */ - lwp_ctmpl_clear(lwp); + lwp_ctmpl_clear(lwp, B_TRUE); /* * Save the directory in which we found the executable for expanding @@ -385,6 +418,8 @@ exec_common(const char *fname, const char **argp, const char **envp, * pending held signals remain held, so don't clear t_hold. */ mutex_enter(&p->p_lock); + DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp, + uintptr_t, lwp->lwp_oldcontext, uintptr_t, 0); lwp->lwp_oldcontext = 0; lwp->lwp_ustack = 0; lwp->lwp_old_stk_ctl = 0; @@ -444,8 +479,10 @@ exec_common(const char *fname, const char **argp, const char **envp, TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up); /* Unbrand ourself if necessary. */ - if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) + if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) { + BROP(p)->b_freelwp(lwp); brand_clearbrand(p, B_FALSE); + } setregs(&args); @@ -566,10 +603,10 @@ gexec( struct uarg *args, struct intpdata *idatap, int level, - long *execsz, + size_t *execsz, caddr_t exec_file, struct cred *cred, - int brand_action) + int *brand_action) { struct vnode *vp, *execvp = NULL; proc_t *pp = ttoproc(curthread); @@ -890,8 +927,14 @@ gexec( if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE)) args->traceinval = 1; } - if (pp->p_proc_flag & P_PR_PTRACE) + + /* + * If legacy ptrace is enabled, generate the SIGTRAP. + */ + if (pp->p_proc_flag & P_PR_PTRACE) { psignal(pp, SIGTRAP); + } + if (args->traceinval) prinvalidate(&pp->p_user); } @@ -1448,7 +1491,7 @@ noexec( struct uarg *args, struct intpdata *idatap, int level, - long *execsz, + size_t *execsz, int setid, caddr_t exec_file, struct cred *cred) @@ -1555,6 +1598,27 @@ stk_add(uarg_t *args, const char *sp, enum uio_seg segflg) return (0); } +/* + * Add a fixed size byte array to the stack (only from kernel space). + */ +static int +stk_byte_add(uarg_t *args, const uint8_t *sp, size_t len) +{ + int error; + + if (STK_AVAIL(args) < sizeof (int)) + return (E2BIG); + *--args->stk_offp = args->stk_strp - args->stk_base; + + if (len > STK_AVAIL(args)) + return (E2BIG); + bcopy(sp, args->stk_strp, len); + + args->stk_strp += len; + + return (0); +} + static int stk_getptr(uarg_t *args, char *src, char **dst) { @@ -1591,6 +1655,7 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) size_t size, pad; char *argv = (char *)uap->argp; char *envp = (char *)uap->envp; + uint8_t rdata[RANDOM_LEN]; /* * Copy interpreter's name and argument to argv[0] and argv[1]. 
@@ -1647,7 +1712,9 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) } } argc = (int *)(args->stk_base + args->stk_size) - args->stk_offp; - args->arglen = args->stk_strp - args->stk_base; + args->argstrlen = args->stk_strp - args->stk_base; + + const char *envstr = args->stk_strp; /* * Add environ[] strings to the stack. @@ -1669,12 +1736,15 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) envp += ptrsize; } } + + args->envstrlen = args->stk_strp - envstr; args->na = (int *)(args->stk_base + args->stk_size) - args->stk_offp; args->ne = args->na - argc; /* - * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and - * AT_SUN_EMULATOR strings to the stack. + * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, + * AT_SUN_BRAND_NROOT, and AT_SUN_EMULATOR strings, as well as AT_RANDOM + * array, to the stack. */ if (auxvpp != NULL && *auxvpp != NULL) { if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0) @@ -1687,6 +1757,20 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) if (args->emulator != NULL && (error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0) return (error); + + /* + * For the AT_RANDOM aux vector we provide 16 bytes of random + * data. + */ + (void) random_get_pseudo_bytes(rdata, sizeof (rdata)); + + if ((error = stk_byte_add(args, rdata, sizeof (rdata))) != 0) + return (error); + + if (args->brand_nroot != NULL && + (error = stk_add(args, args->brand_nroot, + UIO_SYSSPACE)) != 0) + return (error); } /* @@ -1743,46 +1827,53 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) */ if (stk_putptr(args, usp, (char *)(uintptr_t)argc)) return (-1); + usp += ptrsize; /* - * Add argc space (ptrsize) to usp and record argv for /proc. + * For the benefit of /proc, record the user address of the argv[] array + * as well as the start of the argv string space (argv[0]). */ - up->u_argv = (uintptr_t)(usp += ptrsize); + up->u_argv = (uintptr_t)usp; + up->u_argvstrs = (uintptr_t)(&ustrp[*(offp - 1)]); + up->u_argvstrsize = args->argstrlen; /* - * Put the argv[] pointers on the stack. + * Put the argv[] pointers on the stack, including a NULL terminator. */ for (i = 0; i < argc; i++, usp += ptrsize) if (stk_putptr(args, usp, &ustrp[*--offp])) return (-1); + usp += ptrsize; /* * Copy arguments to u_psargs. */ - pslen = MIN(args->arglen, PSARGSZ) - 1; + pslen = MIN(args->argstrlen, PSARGSZ) - 1; for (i = 0; i < pslen; i++) up->u_psargs[i] = (kstrp[i] == '\0' ? ' ' : kstrp[i]); while (i < PSARGSZ) up->u_psargs[i++] = '\0'; /* - * Add space for argv[]'s NULL terminator (ptrsize) to usp and - * record envp for /proc. + * For the benefit of /proc, record the user address of the envp[] array + * as well as the start of the envp string space (envp[0]). */ - up->u_envp = (uintptr_t)(usp += ptrsize); + up->u_envp = (uintptr_t)usp; + up->u_envstrs = (uintptr_t)(&ustrp[*(offp - 1)]); + up->u_envstrsize = args->envstrlen; /* - * Put the envp[] pointers on the stack. + * Put the envp[] pointers on the stack, including a NULL terminator. */ for (i = 0; i < envc; i++, usp += ptrsize) if (stk_putptr(args, usp, &ustrp[*--offp])) return (-1); + usp += ptrsize; /* - * Add space for envp[]'s NULL terminator (ptrsize) to usp and - * remember where the stack ends, which is also where auxv begins. + * Remember where the stack ends, which is also where auxv begins. */ - args->stackend = usp += ptrsize; + args->stackend = usp; /* * Put all the argv[], envp[], and auxv strings on the stack. 
@@ -1793,7 +1884,7 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) /* * Fill in the aux vector now that we know the user stack addresses * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and - * AT_SUN_EMULATOR strings. + * AT_SUN_EMULATOR strings, as well as the AT_RANDOM array. */ if (auxvpp != NULL && *auxvpp != NULL) { if (args->to_model == DATAMODEL_NATIVE) { @@ -1806,6 +1897,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) if (args->emulator != NULL) ADDAUX(*a, AT_SUN_EMULATOR, (long)&ustrp[*--offp]) + ADDAUX(*a, AT_RANDOM, (long)&ustrp[*--offp]) + if (args->brand_nroot != NULL) { + ADDAUX(*a, + AT_SUN_BRAND_NROOT, (long)&ustrp[*--offp]) + } } else { auxv32_t **a = (auxv32_t **)auxvpp; ADDAUX(*a, @@ -1818,6 +1914,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) if (args->emulator != NULL) ADDAUX(*a, AT_SUN_EMULATOR, (int)(uintptr_t)&ustrp[*--offp]) + ADDAUX(*a, AT_RANDOM, (int)(uintptr_t)&ustrp[*--offp]) + if (args->brand_nroot != NULL) { + ADDAUX(*a, AT_SUN_BRAND_NROOT, + (int)(uintptr_t)&ustrp[*--offp]) + } } } @@ -1961,6 +2062,9 @@ exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) usrstack = (char *)USRSTACK32; } + if (args->maxstack != 0 && (uintptr_t)usrstack > args->maxstack) + usrstack = (char *)args->maxstack; + ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0); #if defined(__sparc) @@ -2056,7 +2160,7 @@ exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) delete_itimer_realprof(); if (AU_AUDITING()) - audit_exec(args->stk_base, args->stk_base + args->arglen, + audit_exec(args->stk_base, args->stk_base + args->argstrlen, args->na - args->ne, args->ne, args->pfcred); /* diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c index 5a9355ae9f..7ccf9b3221 100644 --- a/usr/src/uts/common/os/exit.c +++ b/usr/src/uts/common/os/exit.c @@ -141,11 +141,32 @@ rexit(int rval) } /* + * Bump the init_restarts kstat and let interested parties know about the + * restart. + */ +static void +restart_init_notify(zone_t *zone) +{ + nvlist_t *nvl = NULL; + + zone->zone_proc_init_restarts++; + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0 && + nvlist_add_uint32(nvl, ZONE_CB_RESTARTS, + zone->zone_proc_init_restarts) == 0) { + zone_sysevent_publish(zone, ZONE_EVENT_INIT_CLASS, + ZONE_EVENT_INIT_RESTART_SC, nvl); + } + + nvlist_free(nvl); +} + +/* * Called by proc_exit() when a zone's init exits, presumably because * it failed. As long as the given zone is still in the "running" * state, we will re-exec() init, but first we need to reset things * which are usually inherited across exec() but will break init's - * assumption that it is being exec()'d from a virgin process. Most + * assumption that it is being exec()'d from a virgin process. Most * importantly this includes closing all file descriptors (exec only * closes those marked close-on-exec) and resetting signals (exec only * resets handled signals, and we need to clear any signals which @@ -234,7 +255,7 @@ restart_init(int what, int why) siginfofree(lwp->lwp_curinfo); lwp->lwp_curinfo = NULL; } - lwp_ctmpl_clear(lwp); + lwp_ctmpl_clear(lwp, B_FALSE); /* * Reset both the process root directory and the current working @@ -286,6 +307,8 @@ restart_init(int what, int why) ASSERT(p == curproc); (void) freectty(B_TRUE); + restart_init_notify(p->p_zone); + /* * Now exec() the new init(8) on top of the current process. 
If we * succeed, the caller will treat this like a successful system call. @@ -320,7 +343,7 @@ exit(int why, int what) /* * If proc_exit() fails, then some other lwp in the process * got there first. We just have to call lwp_exit() to allow - * the other lwp to finish exiting the process. Otherwise we're + * the other lwp to finish exiting the process. Otherwise we're * restarting init, and should return. */ if (proc_exit(why, what) != 0) { @@ -333,7 +356,7 @@ exit(int why, int what) /* * Set the SEXITING flag on the process, after making sure /proc does - * not have it locked. This is done in more places than proc_exit(), + * not have it locked. This is done in more places than proc_exit(), * so it is a separate function. */ void @@ -380,8 +403,9 @@ zone_init_exit(zone_t *z, int why, int what) */ if (!z->zone_restart_init) { /* - * The zone has been set up to halt when init exits. + * The zone has been setup to halt when init exits. */ + z->zone_init_status = wstat(why, what); (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, zone_kcred()); z->zone_proc_initpid = -1; return (B_FALSE); @@ -421,6 +445,7 @@ zone_init_exit(zone_t *z, int why, int what) (void) zone_kadmin(A_REBOOT, 0, NULL, zone_kcred()); } + z->zone_init_status = wstat(why, what); z->zone_proc_initpid = -1; return (B_FALSE); } @@ -441,14 +466,16 @@ zone_init_exit(zone_t *z, int why, int what) /* * No restart modifiers on the zone, attempt to restart init. */ - if (restart_init(what, why) == 0) + if (restart_init(what, why) == 0) { return (B_TRUE); + } } /* * The restart failed, or the criteria for a restart are not met; * the zone will shut down. */ + z->zone_init_status = wstat(why, what); (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, zone_kcred()); z->zone_proc_initpid = -1; return (B_FALSE); @@ -483,7 +510,7 @@ proc_exit(int why, int what) /* * Stop and discard the process's lwps except for the current one, - * unless some other lwp beat us to it. If exitlwps() fails then + * unless some other lwp beat us to it. If exitlwps() fails then * return and the calling lwp will call (or continue in) lwp_exit(). */ proc_is_exiting(p); @@ -501,19 +528,6 @@ proc_exit(int why, int what) } mutex_exit(&p->p_lock); - DTRACE_PROC(lwp__exit); - DTRACE_PROC1(exit, int, why); - - /* - * Will perform any brand specific proc exit processing, since this - * is always the last lwp, will also perform lwp_exit and free brand - * data - */ - if (PROC_IS_BRANDED(p)) { - lwp_detach_brand_hdlrs(lwp); - brand_clearbrand(p, B_FALSE); - } - /* * Don't let init exit unless zone_start_init() failed its exec, or * we are shutting down the zone or the machine. @@ -527,6 +541,32 @@ proc_exit(int why, int what) return (0); } + /* + * Delay firing probes (and performing brand cleanup) until after the + * zone_proc_initpid check. Cases which result in zone shutdown or + * restart via zone_kadmin eventually result in a call back to + * proc_exit. + */ + DTRACE_PROC(lwp__exit); + DTRACE_PROC1(exit, int, why); + + /* + * Will perform any brand specific proc exit processing. Since this + * is always the last lwp, will also perform lwp exit/free and proc + * exit. Brand data will be freed when the process is reaped. + */ + if (PROC_IS_BRANDED(p)) { + BROP(p)->b_lwpexit(lwp); + BROP(p)->b_proc_exit(p); + /* + * To ensure that b_proc_exit has access to brand-specific data + * contained by the one remaining lwp, call the freelwp hook as + * the last part of this clean-up process. 
+ */ + BROP(p)->b_freelwp(lwp); + lwp_detach_brand_hdlrs(lwp); + } + lwp_pcb_exit(); /* @@ -693,7 +733,7 @@ proc_exit(int why, int what) semexit(p); rv = wstat(why, what); - acct(rv & 0xff); + acct(rv); exacct_commit_proc(p, rv); /* @@ -786,10 +826,22 @@ proc_exit(int why, int what) if ((q = p->p_child) != NULL && p != proc_init) { struct proc *np; struct proc *initp = proc_init; + pid_t zone_initpid = 1; + struct proc *zoneinitp = NULL; boolean_t setzonetop = B_FALSE; - if (!INGLOBALZONE(curproc)) - setzonetop = B_TRUE; + if (!INGLOBALZONE(curproc)) { + zone_initpid = curproc->p_zone->zone_proc_initpid; + + ASSERT(MUTEX_HELD(&pidlock)); + zoneinitp = prfind(zone_initpid); + if (zoneinitp != NULL) { + initp = zoneinitp; + } else { + zone_initpid = 1; + setzonetop = B_TRUE; + } + } pgdetach(p); @@ -801,7 +853,8 @@ proc_exit(int why, int what) */ delete_ns(q->p_parent, q); - q->p_ppid = 1; + q->p_ppid = zone_initpid; + q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID); if (setzonetop) { mutex_enter(&q->p_lock); @@ -959,7 +1012,7 @@ proc_exit(int why, int what) * curthread's proc pointer is changed to point to the 'sched' * process for the corresponding zone, except in the case when * the exiting process is in fact a zsched instance, in which - * case the proc pointer is set to p0. We do so, so that the + * case the proc pointer is set to p0. We do so, so that the * process still points at the right zone when we call the VN_RELE() * below. * @@ -975,8 +1028,50 @@ proc_exit(int why, int what) mutex_exit(&p->p_lock); if (!evaporate) { - p->p_pidflag &= ~CLDPEND; - sigcld(p, sqp); + /* + * The brand specific code only happens when the brand has a + * function to call in place of sigcld and the parent of the + * exiting process is not the global zone init. If the parent + * is the global zone init, then the process was reparented, + * and we don't want brand code delivering possibly strange + * signals to init. Also, init is not branded, so any brand + * specific exit data will not be picked up by init anyway. + */ + if (PROC_IS_BRANDED(p) && + BROP(p)->b_exit_with_sig != NULL && + p->p_ppid != 1) { + /* + * The code for _fini that could unload the brand_t + * blocks until the count of zones using the module + * reaches zero. Zones decrement the refcount on their + * brands only after all user tasks in that zone have + * exited and been waited on. The decrement on the + * brand's refcount happen in zone_destroy(). That + * depends on zone_shutdown() having been completed. + * zone_shutdown() includes a call to zone_empty(), + * where the zone waits for itself to reach the state + * ZONE_IS_EMPTY. This state is only set in either + * zone_shutdown(), when there are no user processes as + * the zone enters this function, or in + * zone_task_rele(). zone_task_rele() is called from + * code triggered by waiting on processes, not by the + * processes exiting through proc_exit(). This means + * all the branded processes that could exist for a + * specific brand_t must exit and get reaped before the + * refcount on the brand_t can reach 0. _fini will + * never unload the corresponding brand module before + * proc_exit finishes execution for all processes + * branded with a particular brand_t, which makes the + * operation below safe to do. Brands that wish to use + * this mechanism must wait in _fini as described + * above. 
+ */ + BROP(p)->b_exit_with_sig(p, sqp); + } else { + p->p_pidflag &= ~CLDPEND; + sigcld(p, sqp); + } + } else { /* * Do what sigcld() would do if the disposition @@ -1001,7 +1096,7 @@ proc_exit(int why, int what) /* * task_rele() may ultimately cause the zone to go away (or * may cause the last user process in a zone to go away, which - * signals zsched to go away). So prior to this call, we must + * signals zsched to go away). So prior to this call, we must * no longer point at zsched. */ t->t_procp = &p0; @@ -1055,10 +1150,9 @@ winfo(proc_t *pp, k_siginfo_t *ip, int waitflag) int waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) { - int found; proc_t *cp, *pp; - int proc_gone; int waitflag = !(options & WNOWAIT); + boolean_t have_brand_helper = B_FALSE; /* * Obsolete flag, defined here only for binary compatibility @@ -1086,7 +1180,8 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) pp = ttoproc(curthread); /* - * lock parent mutex so that sibling chain can be searched. + * Anytime you are looking for a process, you take pidlock to prevent + * things from changing as you look. */ mutex_enter(&pidlock); @@ -1106,10 +1201,37 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) return (ECHILD); } - while (pp->p_child != NULL) { + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_waitid_helper != NULL) { + have_brand_helper = B_TRUE; + } + + while (pp->p_child != NULL || have_brand_helper) { + boolean_t brand_wants_wait = B_FALSE; + int proc_gone = 0; + int found = 0; - proc_gone = 0; + /* + * Give the brand a chance to return synthetic results from + * this waitid() call before we do the real thing. + */ + if (have_brand_helper) { + int ret; + if (BROP(pp)->b_waitid_helper(idtype, id, ip, options, + &brand_wants_wait, &ret) == 0) { + mutex_exit(&pidlock); + return (ret); + } + + if (pp->p_child == NULL) { + goto no_real_children; + } + } + + /* + * Look for interesting children in the newstate list. + */ + VERIFY(pp->p_child != NULL); for (cp = pp->p_child_ns; cp != NULL; cp = cp->p_sibling_ns) { if (idtype != P_PID && (cp->p_pidflag & CLDWAITPID)) continue; @@ -1117,6 +1239,11 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) continue; if (idtype == P_PGID && id != cp->p_pgrp) continue; + if (PROC_IS_BRANDED(pp)) { + if (BROP(pp)->b_wait_filter != NULL && + BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) + continue; + } switch (cp->p_wcode) { @@ -1161,12 +1288,16 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) * Wow! None of the threads on the p_sibling_ns list were * interesting threads. Check all the kids! */ - found = 0; for (cp = pp->p_child; cp != NULL; cp = cp->p_sibling) { if (idtype == P_PID && id != cp->p_pid) continue; if (idtype == P_PGID && id != cp->p_pgrp) continue; + if (PROC_IS_BRANDED(pp)) { + if (BROP(pp)->b_wait_filter != NULL && + BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) + continue; + } switch (cp->p_wcode) { case CLD_TRAPPED: @@ -1235,11 +1366,12 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) break; } +no_real_children: /* * If we found no interesting processes at all, * break out and return ECHILD. */ - if (found + proc_gone == 0) + if (!brand_wants_wait && (found + proc_gone == 0)) break; if (options & WNOHANG) { @@ -1258,7 +1390,7 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) * change state while we wait, we don't wait at all. * Get out with ECHILD according to SVID. 
*/ - if (found == proc_gone) + if (!brand_wants_wait && (found == proc_gone)) break; if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) { @@ -1354,6 +1486,12 @@ freeproc(proc_t *p) p->p_killsqp = NULL; } + /* Clear any remaining brand data */ + if (PROC_IS_BRANDED(p)) { + brand_clearbrand(p, B_FALSE); + } + + prfree(p); /* inform /proc */ /* diff --git a/usr/src/uts/common/os/fio.c b/usr/src/uts/common/os/fio.c index c25564d85f..f6179cf301 100644 --- a/usr/src/uts/common/os/fio.c +++ b/usr/src/uts/common/os/fio.c @@ -21,7 +21,8 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015, Joyent Inc. + * Copyright 2017, Joyent Inc. + * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -488,7 +489,7 @@ free_afd(afd_t *afd) /* called below and from thread_free() */ afd->a_fd[i] = -1; } -static void +void set_active_fd(int fd) { afd_t *afd = &curthread->t_activefd; @@ -958,7 +959,22 @@ closef(file_t *fp) vp = fp->f_vnode; - error = VOP_CLOSE(vp, flag, count, offset, fp->f_cred, NULL); + /* + * The __FLXPATH flag is a private interface for use by the lx + * brand in order to emulate open(O_NOFOLLOW|O_PATH) which, + * when a symbolic link is encountered, returns a file + * descriptor which references it. + * See uts/common/brand/lx/syscall/lx_open.c + * + * When this flag is set, VOP_OPEN() will not have been called when + * this file descriptor was opened, and VOP_CLOSE() should not be + * called here (for a symlink, most filesystems would return ENOSYS + * anyway) + */ + if (fp->f_flag2 & (__FLXPATH >> 16)) + error = 0; + else + error = VOP_CLOSE(vp, flag, count, offset, fp->f_cred, NULL); if (count > 1) { mutex_exit(&fp->f_tlock); @@ -1118,7 +1134,7 @@ falloc(vnode_t *vp, int flag, file_t **fpp, int *fdp) mutex_enter(&fp->f_tlock); fp->f_count = 1; fp->f_flag = (ushort_t)flag; - fp->f_flag2 = (flag & (FSEARCH|FEXEC)) >> 16; + fp->f_flag2 = (flag & (FSEARCH|FEXEC|__FLXPATH)) >> 16; fp->f_vnode = vp; fp->f_offset = 0; fp->f_audit_data = 0; diff --git a/usr/src/uts/common/os/fork.c b/usr/src/uts/common/os/fork.c index 1caa0b9b7b..183e1f4333 100644 --- a/usr/src/uts/common/os/fork.c +++ b/usr/src/uts/common/os/fork.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright 2016, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -84,6 +84,7 @@ static int64_t cfork(int, int, int); static int getproc(proc_t **, pid_t, uint_t); #define GETPROC_USER 0x0 #define GETPROC_KERNEL 0x1 +#define GETPROC_ZSCHED 0x2 static void fork_fail(proc_t *); static void forklwp_fail(proc_t *); @@ -706,7 +707,7 @@ fork_fail(proc_t *cp) if (PTOU(curproc)->u_cwd) refstr_rele(PTOU(curproc)->u_cwd); if (PROC_IS_BRANDED(cp)) { - brand_clearbrand(cp, B_TRUE); + brand_clearbrand(cp, B_FALSE); } } @@ -755,7 +756,7 @@ forklwp_fail(proc_t *p) kmem_free(t->t_door, sizeof (door_data_t)); t->t_door = NULL; } - lwp_ctmpl_clear(ttolwp(t)); + lwp_ctmpl_clear(ttolwp(t), B_FALSE); /* * Remove the thread from the all threads list. @@ -792,6 +793,9 @@ extern struct as kas; /* * fork a kernel process. + * + * Passing a pid argument of -1 indicates that the new process should be + * launched as a child of 'zsched' within the zone. 
*/ int newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, @@ -810,6 +814,7 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, rctl_set_t *init_set; ASSERT(pid != 1); + ASSERT(pid >= 0); if (getproc(&p, pid, GETPROC_KERNEL) < 0) return (EAGAIN); @@ -853,8 +858,18 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, rctl_set_t *init_set; task_t *tk, *tk_old; klwp_t *lwp; + boolean_t pzsched = B_FALSE; + int flag = GETPROC_USER; + + /* Handle a new user-level thread as child of zsched. */ + if (pid < 0) { + VERIFY(curzone != global_zone); + flag = GETPROC_ZSCHED; + pzsched = B_TRUE; + pid = 0; + } - if (getproc(&p, pid, GETPROC_USER) < 0) + if (getproc(&p, pid, flag) < 0) return (EAGAIN); /* * init creates a new task, distinct from the task @@ -915,7 +930,8 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, } t = lwptot(lwp); - ctp = contract_process_fork(sys_process_tmpl, p, curproc, + ctp = contract_process_fork(sys_process_tmpl, p, + (pzsched ? curproc->p_zone->zone_zsched : curproc), B_FALSE); ASSERT(ctp != NULL); if (ct != NULL) @@ -956,7 +972,11 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN) return (-1); /* no point in starting new processes */ - pp = (flags & GETPROC_KERNEL) ? &p0 : curproc; + if (flags & GETPROC_ZSCHED) { + pp = curproc->p_zone->zone_zsched; + } else { + pp = (flags & GETPROC_KERNEL) ? &p0 : curproc; + } task = pp->p_task; proj = task->tk_proj; zone = pp->p_zone; @@ -1017,6 +1037,9 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) cp->p_t1_lgrpid = LGRP_NONE; cp->p_tr_lgrpid = LGRP_NONE; + /* Default to native brand initially */ + cp->p_brand = &native_brand; + if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) { if (nproc == v.v_proc) { CPU_STATS_ADDQ(CPU, sys, procovf, 1); @@ -1084,9 +1107,6 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD); cp->p_sessp = pp->p_sessp; sess_hold(pp); - cp->p_brand = pp->p_brand; - if (PROC_IS_BRANDED(pp)) - BROP(pp)->b_copy_procdata(cp, pp); cp->p_bssbase = pp->p_bssbase; cp->p_brkbase = pp->p_brkbase; cp->p_brksize = pp->p_brksize; @@ -1171,6 +1191,18 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) mutex_exit(&cp->p_lock); mutex_exit(&pidlock); + if (PROC_IS_BRANDED(pp)) { + /* + * The only reason why process branding should fail is when + * the procedure is complicated by multiple LWPs on the scene. + * With an LWP count of 0, this newly allocated process has no + * reason to fail branding. + */ + VERIFY0(brand_setbrand(cp, B_FALSE)); + + BROP(pp)->b_copy_procdata(cp, pp); + } + avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t), offsetof(contract_t, ct_ctlist)); @@ -1188,6 +1220,7 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) */ fcnt_add(P_FINFO(pp), 1); + mutex_enter(&pp->p_lock); if (PTOU(pp)->u_cdir) { VN_HOLD(PTOU(pp)->u_cdir); } else { @@ -1201,6 +1234,7 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) VN_HOLD(PTOU(pp)->u_rdir); if (PTOU(pp)->u_cwd) refstr_hold(PTOU(pp)->u_cwd); + mutex_exit(&pp->p_lock); /* * copy the parent's uarea. diff --git a/usr/src/uts/common/os/grow.c b/usr/src/uts/common/os/grow.c index da53bce24e..6e2d3c403c 100644 --- a/usr/src/uts/common/os/grow.c +++ b/usr/src/uts/common/os/grow.c @@ -21,7 +21,7 @@ /* * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. 
*/ /* @@ -55,6 +55,7 @@ #include <sys/fcntl.h> #include <sys/lwpchan_impl.h> #include <sys/nbmlock.h> +#include <sys/brand.h> #include <vm/hat.h> #include <vm/as.h> @@ -570,6 +571,20 @@ choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off, return (0); } +caddr_t +map_userlimit(proc_t *pp, struct as *as, int flags) +{ + if (flags & _MAP_LOW32) { + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_map32limit != NULL) { + return ((caddr_t)(uintptr_t)BROP(pp)->b_map32limit(pp)); + } else { + return ((caddr_t)_userlimit32); + } + } + + return (as->a_userlimit); +} + /* * Used for MAP_ANON - fast way to get anonymous pages @@ -585,8 +600,6 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags, return (EACCES); if ((flags & MAP_FIXED) != 0) { - caddr_t userlimit; - /* * Use the user address. First verify that * the address to be used is page aligned. @@ -595,9 +608,8 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags, if (((uintptr_t)*addrp & PAGEOFFSET) != 0) return (EINVAL); - userlimit = flags & _MAP_LOW32 ? - (caddr_t)USERLIMIT32 : as->a_userlimit; - switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { + switch (valid_usr_range(*addrp, len, uprot, as, + map_userlimit(as->a_proc, as, flags))) { case RANGE_OKAY: break; case RANGE_BADPROT: @@ -638,7 +650,7 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags, #define RANDOMIZABLE_MAPPING(addr, flags) (((flags & MAP_FIXED) == 0) && \ !(((flags & MAP_ALIGN) == 0) && (addr != 0) && aslr_respect_mmap_hint)) -static int +int smmap_common(caddr_t *addrp, size_t len, int prot, int flags, struct file *fp, offset_t pos) { @@ -771,8 +783,6 @@ smmap_common(caddr_t *addrp, size_t len, * If the user specified an address, do some simple checks here */ if ((flags & MAP_FIXED) != 0) { - caddr_t userlimit; - /* * Use the user address. First verify that * the address to be used is page aligned. @@ -780,10 +790,8 @@ smmap_common(caddr_t *addrp, size_t len, */ if (((uintptr_t)*addrp & PAGEOFFSET) != 0) return (EINVAL); - - userlimit = flags & _MAP_LOW32 ? - (caddr_t)USERLIMIT32 : as->a_userlimit; - switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { + switch (valid_usr_range(*addrp, len, uprot, as, + map_userlimit(curproc, as, flags))) { case RANGE_OKAY: break; case RANGE_BADPROT: diff --git a/usr/src/uts/common/os/ipc.c b/usr/src/uts/common/os/ipc.c index 86cb867da8..bf917ef716 100644 --- a/usr/src/uts/common/os/ipc.c +++ b/usr/src/uts/common/os/ipc.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -1217,6 +1218,23 @@ ipc_remove(ipc_service_t *service, kipc_perm_t *perm) (IPC_ZONE_USAGE(perm, service) == 0))); } +/* + * Perform actual IPC_RMID, either via ipc_rmid or due to a delayed *_RMID. + */ +void +ipc_rmsvc(ipc_service_t *service, kipc_perm_t *perm) +{ + ASSERT(service->ipcs_count > 0); + ASSERT(MUTEX_HELD(&service->ipcs_lock)); + + ipc_remove(service, perm); + mutex_exit(&service->ipcs_lock); + + /* perform any per-service removal actions */ + service->ipcs_rmid(perm); + + ipc_rele(service, perm); +} /* * Common code to perform an IPC_RMID. Returns an errno value on @@ -1247,13 +1265,7 @@ ipc_rmid(ipc_service_t *service, int id, cred_t *cr) /* * Nothing can fail from this point on. 
*/ - ipc_remove(service, perm); - mutex_exit(&service->ipcs_lock); - - /* perform any per-service removal actions */ - service->ipcs_rmid(perm); - - ipc_rele(service, perm); + ipc_rmsvc(service, perm); return (0); } diff --git a/usr/src/uts/common/os/kmem.c b/usr/src/uts/common/os/kmem.c index 394235f26c..4d2c1e6c10 100644 --- a/usr/src/uts/common/os/kmem.c +++ b/usr/src/uts/common/os/kmem.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2017, Joyent, Inc. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright 2018, Joyent, Inc. diff --git a/usr/src/uts/common/os/kstat_fr.c b/usr/src/uts/common/os/kstat_fr.c index 93c04cff8d..b09b2d3558 100644 --- a/usr/src/uts/common/os/kstat_fr.c +++ b/usr/src/uts/common/os/kstat_fr.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2014, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -198,6 +198,9 @@ struct { kstat_named_t pagesfree; kstat_named_t pageslocked; kstat_named_t pagestotal; + kstat_named_t lowmemscan; + kstat_named_t zonecapscan; + kstat_named_t nthrottle; } system_pages_kstat = { { "physmem", KSTAT_DATA_ULONG }, { "nalloc", KSTAT_DATA_ULONG }, @@ -219,6 +222,9 @@ struct { { "pagesfree", KSTAT_DATA_ULONG }, { "pageslocked", KSTAT_DATA_ULONG }, { "pagestotal", KSTAT_DATA_ULONG }, + { "low_mem_scan", KSTAT_DATA_ULONG }, + { "zone_cap_scan", KSTAT_DATA_ULONG }, + { "n_throttle", KSTAT_DATA_ULONG }, }; static int header_kstat_update(kstat_t *, int); @@ -912,6 +918,9 @@ system_pages_kstat_update(kstat_t *ksp, int rw) system_pages_kstat.pageslocked.value.ul = (ulong_t)(availrmem_initial - availrmem); system_pages_kstat.pagestotal.value.ul = (ulong_t)total_pages; + system_pages_kstat.lowmemscan.value.ul = (ulong_t)low_mem_scan; + system_pages_kstat.zonecapscan.value.ul = (ulong_t)zone_cap_scan; + system_pages_kstat.nthrottle.value.ul = (ulong_t)n_throttle; /* * pp_kernel represents total pages used by the kernel since the * startup. This formula takes into account the boottime kernel diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c index b5f41d93f9..6a922343e7 100644 --- a/usr/src/uts/common/os/logsubr.c +++ b/usr/src/uts/common/os/logsubr.c @@ -23,6 +23,8 @@ * Copyright 2020 Oxide Computer Company * Copyright (c) 2013 Gary Mills * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2022 Joyent, Inc. + * Copyright 2022 MNX Cloud, Inc. */ #include <sys/types.h> @@ -260,8 +262,11 @@ log_init(void) #ifdef LEGACY_BANNER printf("\rSunOS Release %s Version %s %u-bit\n", utsname.release, utsname.version, NBBY * (uint_t)sizeof (void *)); - printf("Copyright (c) 1983, 2010, Oracle and/or its affiliates. " - "All rights reserved.\n"); + /* + * Note: In the future this should be 2022-20XX, and delete this + * comment when we don't need it anymore + */ + printf("Copyright 2022 MNX Cloud, Inc.\n"); #else bootbanner_print(log_bootbanner_print, KM_SLEEP); #endif diff --git a/usr/src/uts/common/os/lwp.c b/usr/src/uts/common/os/lwp.c index b2adae570f..341e4ae356 100644 --- a/usr/src/uts/common/os/lwp.c +++ b/usr/src/uts/common/os/lwp.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc. 
*/ #include <sys/param.h> @@ -57,6 +57,8 @@ #include <sys/lgrp.h> #include <sys/rctl.h> #include <sys/contract_impl.h> +#include <sys/contract/process.h> +#include <sys/contract/process_impl.h> #include <sys/cpc_impl.h> #include <sys/sdt.h> #include <sys/cmn_err.h> @@ -115,7 +117,7 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p, ret_tidhash_t *ret_tidhash = NULL; int i; int rctlfail = 0; - boolean_t branded = 0; + void *brand_data = NULL; struct ctxop *ctx = NULL; ASSERT(cid != sysdccid); /* system threads must start in SYS */ @@ -283,6 +285,19 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p, */ lep = kmem_zalloc(sizeof (*lep), KM_SLEEP); + /* + * If necessary, speculatively allocate lwp brand data. This is done + * ahead of time so p_lock need not be dropped during lwp branding. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_lwpdata_alloc != NULL) { + if ((brand_data = BROP(p)->b_lwpdata_alloc(p)) == NULL) { + mutex_enter(&p->p_lock); + err = 1; + atomic_inc_32(&p->p_zone->zone_ffmisc); + goto error; + } + } + mutex_enter(&p->p_lock); grow: /* @@ -630,18 +645,6 @@ grow: } while (lwp_hash_lookup(p, t->t_tid) != NULL); } - /* - * If this is a branded process, let the brand do any necessary lwp - * initialization. - */ - if (PROC_IS_BRANDED(p)) { - if (BROP(p)->b_initlwp(lwp)) { - err = 1; - atomic_inc_32(&p->p_zone->zone_ffmisc); - goto error; - } - branded = 1; - } if (t->t_tid == 1) { kpreempt_disable(); @@ -654,7 +657,6 @@ grow: } } - p->p_lwpcnt++; t->t_waitfor = -1; /* @@ -696,8 +698,27 @@ grow: t->t_post_sys = 1; /* + * Perform lwp branding + * + * The b_initlwp hook is _not_ allowed to drop p->p_lock as it must be + * continuously held between when the tidhash is sized and when the lwp + * is inserted into it. Operations requiring p->p_lock to be + * temporarily dropped can be performed in b_initlwp_post. + */ + if (PROC_IS_BRANDED(p)) { + BROP(p)->b_initlwp(lwp, brand_data); + /* + * The b_initlwp hook is expected to consume any preallocated + * brand_data in a way that prepares it for deallocation by the + * b_freelwp hook. + */ + brand_data = NULL; + } + + /* * Insert the new thread into the list of all threads. */ + p->p_lwpcnt++; if ((tx = p->p_tlist) == NULL) { t->t_back = t; t->t_forw = t; @@ -718,6 +739,13 @@ grow: lep->le_start = t->t_start; lwp_hash_in(p, lep, p->p_tidhash, p->p_tidhash_sz, 1); + /* + * Complete lwp branding + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_initlwp_post != NULL) { + BROP(p)->b_initlwp_post(lwp); + } + lwp_fp_init(lwp); if (state == TS_RUN) { @@ -755,8 +783,9 @@ error: if (cid != NOCLASS && bufp != NULL) CL_FREE(cid, bufp); - if (branded) - BROP(p)->b_freelwp(lwp); + if (brand_data != NULL) { + BROP(p)->b_lwpdata_free(brand_data); + } mutex_exit(&p->p_lock); t->t_state = TS_FREE; @@ -829,8 +858,27 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src) int i; for (i = 0; i < ct_ntypes; i++) { - dst->lwp_ct_active[i] = ctmpl_dup(src->lwp_ct_active[i]); + ct_template_t *tmpl = src->lwp_ct_active[i]; + + /* + * If the process contract template is setup to be preserved + * across exec, then if we're forking, perform an implicit + * template_clear now. This ensures that future children of + * this child will remain in the same contract unless they're + * explicitly setup differently. We know we're forking if the + * two LWPs belong to different processes. 
+ */ + if (i == CTT_PROCESS && tmpl != NULL) { + ctmpl_process_t *ctp = tmpl->ctmpl_data; + + if (dst->lwp_procp != src->lwp_procp && + (ctp->ctp_params & CT_PR_KEEP_EXEC) != 0) + tmpl = NULL; + } + + dst->lwp_ct_active[i] = ctmpl_dup(tmpl); dst->lwp_ct_latest[i] = NULL; + } } @@ -838,21 +886,33 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src) * Clear an LWP's contract template state. */ void -lwp_ctmpl_clear(klwp_t *lwp) +lwp_ctmpl_clear(klwp_t *lwp, boolean_t is_exec) { ct_template_t *tmpl; int i; for (i = 0; i < ct_ntypes; i++) { - if ((tmpl = lwp->lwp_ct_active[i]) != NULL) { - ctmpl_free(tmpl); - lwp->lwp_ct_active[i] = NULL; - } - if (lwp->lwp_ct_latest[i] != NULL) { contract_rele(lwp->lwp_ct_latest[i]); lwp->lwp_ct_latest[i] = NULL; } + + if ((tmpl = lwp->lwp_ct_active[i]) != NULL) { + /* + * If we're exec-ing a new program and the process + * contract template is setup to be preserved across + * exec, then don't clear it. + */ + if (is_exec && i == CTT_PROCESS) { + ctmpl_process_t *ctp = tmpl->ctmpl_data; + + if ((ctp->ctp_params & CT_PR_KEEP_EXEC) != 0) + continue; + } + + ctmpl_free(tmpl); + lwp->lwp_ct_active[i] = NULL; + } } } @@ -893,13 +953,6 @@ lwp_exit(void) if (t->t_upimutex != NULL) upimutex_cleanup(); - /* - * Perform any brand specific exit processing, then release any - * brand data associated with the lwp - */ - if (PROC_IS_BRANDED(p)) - BROP(p)->b_lwpexit(lwp); - lwp_pcb_exit(); mutex_enter(&p->p_lock); @@ -943,6 +996,18 @@ lwp_exit(void) DTRACE_PROC(lwp__exit); /* + * Perform any brand specific exit processing, then release any + * brand data associated with the lwp + */ + if (PROC_IS_BRANDED(p)) { + mutex_exit(&p->p_lock); + BROP(p)->b_lwpexit(lwp); + BROP(p)->b_freelwp(lwp); + mutex_enter(&p->p_lock); + prbarrier(p); + } + + /* * If the lwp is a detached lwp or if the process is exiting, * remove (lwp_hash_out()) the lwp from the lwp directory. * Otherwise null out the lwp's le_thread pointer in the lwp @@ -1103,7 +1168,7 @@ lwp_cleanup(void) } kpreempt_enable(); - lwp_ctmpl_clear(ttolwp(t)); + lwp_ctmpl_clear(ttolwp(t), B_FALSE); } int diff --git a/usr/src/uts/common/os/main.c b/usr/src/uts/common/os/main.c index 148916d4d8..c57f8a7d2c 100644 --- a/usr/src/uts/common/os/main.c +++ b/usr/src/uts/common/os/main.c @@ -159,7 +159,7 @@ exec_init(const char *initpath, const char *args) int error = 0, count = 0; proc_t *p = ttoproc(curthread); klwp_t *lwp = ttolwp(curthread); - int brand_action; + int brand_action = EBA_NONE; if (args == NULL) args = ""; @@ -289,7 +289,15 @@ exec_init(const char *initpath, const char *args) */ sigemptyset(&curthread->t_hold); - brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE; + /* + * Only instruct exec_common to brand the process if necessary. It is + * possible that the init process is already properly branded due to the + * proc_exit -> restart_init -> exec_init call chain. + */ + if (ZONE_IS_BRANDED(p->p_zone) && + p->p_brand != p->p_zone->zone_brand) { + brand_action = EBA_BRAND; + } again: error = exec_common((const char *)exec_fnamep, (const char **)uap, NULL, brand_action); diff --git a/usr/src/uts/common/os/mem_config.c b/usr/src/uts/common/os/mem_config.c index 4c4e78578b..fd74dd3092 100644 --- a/usr/src/uts/common/os/mem_config.c +++ b/usr/src/uts/common/os/mem_config.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. 
*/ #include <sys/types.h> @@ -1638,7 +1639,7 @@ delthr_get_freemem(struct mem_handle *mhp) * Put pressure on pageout. */ page_needfree(free_get); - cv_signal(&proc_pageout->p_cv); + WAKE_PAGEOUT_SCANNER(); mutex_enter(&mhp->mh_mutex); (void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex, diff --git a/usr/src/uts/common/os/mmapobj.c b/usr/src/uts/common/os/mmapobj.c index d85df39a62..819d32116d 100644 --- a/usr/src/uts/common/os/mmapobj.c +++ b/usr/src/uts/common/os/mmapobj.c @@ -1367,10 +1367,15 @@ calc_loadable(Ehdr *ehdrp, caddr_t phdrbase, int nphdrs, size_t *len, } if (num_segs++ == 0) { /* - * The p_vaddr of the first PT_LOAD segment - * must either be NULL or within the first - * page in order to be interpreted. - * Otherwise, its an invalid file. + * While ELF doesn't specify the meaning of + * p_vaddr for PT_LOAD segments in ET_DYN + * objects, we mandate that is either NULL or + * (to accommodate some historical binaries) + * within the first page. (Note that there + * exist non-native ET_DYN objects that violate + * this constraint that we nonetheless must be + * able to execute; see the ET_DYN handling in + * mapelfexec() for details.) */ if (e_type == ET_DYN && ((caddr_t)((uintptr_t)vaddr & diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c index 37389a6e4d..d48be19c71 100644 --- a/usr/src/uts/common/os/pid.c +++ b/usr/src/uts/common/os/pid.c @@ -113,6 +113,18 @@ pid_lookup(pid_t pid) return (pidp); } +struct pid * +pid_find(pid_t pid) +{ + struct pid *pidp; + + mutex_enter(&pidlinklock); + pidp = pid_lookup(pid); + mutex_exit(&pidlinklock); + + return (pidp); +} + void pid_setmin(void) { @@ -521,6 +533,19 @@ sprunlock(proc_t *p) mutex_exit(&p->p_lock); } +/* + * Undo effects of sprlock but without dropping p->p_lock + */ +void +sprunprlock(proc_t *p) +{ + ASSERT(p->p_proc_flag & P_PR_LOCK); + ASSERT(MUTEX_HELD(&p->p_lock)); + + cv_signal(&pr_pid_cv[p->p_slot]); + p->p_proc_flag &= ~P_PR_LOCK; +} + void pid_init(void) { diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c index 0e4bd2c73d..b3f01cfab2 100644 --- a/usr/src/uts/common/os/policy.c +++ b/usr/src/uts/common/os/policy.c @@ -57,6 +57,7 @@ #include <sys/mntent.h> #include <sys/contract_impl.h> #include <sys/dld_ioc.h> +#include <sys/brand.h> /* * There are two possible layers of privilege routines and two possible @@ -1275,6 +1276,22 @@ secpolicy_vnode_owner(const cred_t *cr, uid_t owner) void secpolicy_setid_clear(vattr_t *vap, cred_t *cr) { + proc_t *p = curproc; + + /* + * Allow the brand to override this behaviour. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_setid_clear != NULL) { + /* + * This brand hook will return 0 if handling is complete, or + * some other value if the brand would like us to fall back to + * the usual behaviour. 
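As a concrete illustration of the hook convention above, this is a minimal sketch of what a brand's b_setid_clear implementation might look like. The prototype is inferred from the call site below, the name mybrand_setid_clear is hypothetical, and the policy check simply reuses secpolicy_vnode_setid_retain() from this file; none of it is part of the change itself.

/*
 * Hypothetical brand hook for secpolicy_setid_clear().  Returning 0 tells
 * the caller that the brand has finished handling the attributes; any other
 * value falls back to the native clearing behaviour.
 */
static int
mybrand_setid_clear(vattr_t *vap, cred_t *cr)
{
	/* No setid bits present, so there is nothing to clear. */
	if ((vap->va_mode & (S_ISUID | S_ISGID)) == 0)
		return (0);

	/* Privileged callers may keep the setid bits in this brand. */
	if (secpolicy_vnode_setid_retain(cr,
	    (vap->va_mode & S_ISUID) != 0) == 0)
		return (0);	/* handled: leave va_mode untouched */

	return (-1);		/* not handled: use the default behaviour */
}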
+ */ + if (BROP(p)->b_setid_clear(vap, cr) == 0) { + return; + } + } + if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(cr, (vap->va_mode & S_ISUID) != 0 && @@ -2123,6 +2140,13 @@ secpolicy_meminfo(const cred_t *cr) } int +secpolicy_fs_import(const cred_t *cr) +{ + return (PRIV_POLICY(cr, PRIV_SYS_FS_IMPORT, B_FALSE, EPERM, NULL)); +} + + +int secpolicy_pfexec_register(const cred_t *cr) { return (PRIV_POLICY(cr, PRIV_SYS_ADMIN, B_TRUE, EPERM, NULL)); @@ -2639,3 +2663,11 @@ secpolicy_ppp_config(const cred_t *cr) return (secpolicy_net_config(cr, B_FALSE)); return (PRIV_POLICY(cr, PRIV_SYS_PPP_CONFIG, B_FALSE, EPERM, NULL)); } + +int +secpolicy_hyprlofs_control(const cred_t *cr) +{ + if (PRIV_POLICY(cr, PRIV_HYPRLOFS_CONTROL, B_FALSE, EPERM, NULL)) + return (EPERM); + return (0); +} diff --git a/usr/src/uts/common/os/priv_defs b/usr/src/uts/common/os/priv_defs index 186aafc460..05979dd236 100644 --- a/usr/src/uts/common/os/priv_defs +++ b/usr/src/uts/common/os/priv_defs @@ -177,6 +177,10 @@ privilege PRIV_GRAPHICS_MAP Allows a process to perform privileged mappings through a graphics device. +privilege PRIV_HYPRLOFS_CONTROL + + Allows a process to manage hyprlofs entries. + privilege PRIV_IPC_DAC_READ Allows a process to read a System V IPC @@ -377,6 +381,10 @@ privilege PRIV_SYS_DEVICES Allows a process to open the real console device directly. Allows a process to open devices that have been exclusively opened. +privilege PRIV_SYS_FS_IMPORT + + Allows a process to import a potentially untrusted file system. + privilege PRIV_SYS_IPC_CONFIG Allows a process to increase the size of a System V IPC Message diff --git a/usr/src/uts/common/os/rctl.c b/usr/src/uts/common/os/rctl.c index 81a1b5454a..8f52f4ef3a 100644 --- a/usr/src/uts/common/os/rctl.c +++ b/usr/src/uts/common/os/rctl.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017, Joyent, Inc. */ #include <sys/atomic.h> @@ -194,6 +195,8 @@ id_space_t *rctl_ids; kmem_cache_t *rctl_cache; /* kmem cache for rctl structures */ kmem_cache_t *rctl_val_cache; /* kmem cache for rctl values */ +extern rctl_hndl_t rc_process_maxlockedmem; + kmutex_t rctl_lists_lock; rctl_dict_entry_t *rctl_lists[RC_MAX_ENTITY + 1]; @@ -2870,12 +2873,12 @@ rctl_init(void) * rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, * int chargeproc) * - * Increments the amount of locked memory on a project, and - * zone. If proj is non-NULL the project must be held by the - * caller; if it is NULL the proj and zone of proc_t p are used. - * If chargeproc is non-zero, then the charged amount is cached - * on p->p_locked_mem so that the charge can be migrated when a - * process changes projects. + * Increments the amount of locked memory on a process, project, and + * zone. If 'proj' is non-NULL, the project must be held by the + * caller; if it is NULL, the project and zone of process 'p' are used. + * If 'chargeproc' is non-zero, then the charged amount is added + * to p->p_locked_mem. This is also used so that the charge can be + * migrated when a process changes projects. 
* * Return values * 0 - success @@ -2893,6 +2896,7 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, ASSERT(p != NULL); ASSERT(MUTEX_HELD(&p->p_lock)); + if (proj != NULL) { projp = proj; zonep = proj->kpj_zone; @@ -2936,11 +2940,23 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, } } - zonep->zone_locked_mem += inc; - projp->kpj_data.kpd_locked_mem += inc; if (chargeproc != 0) { + /* Check for overflow */ + if ((p->p_locked_mem + inc) < p->p_locked_mem) { + ret = EAGAIN; + goto out; + } + if (rctl_test_entity(rc_process_maxlockedmem, p->p_rctls, p, + &e, inc, 0) & RCT_DENY) { + ret = EAGAIN; + goto out; + } + p->p_locked_mem += inc; } + + zonep->zone_locked_mem += inc; + projp->kpj_data.kpd_locked_mem += inc; out: mutex_exit(&zonep->zone_mem_lock); return (ret); diff --git a/usr/src/uts/common/os/rctl_proc.c b/usr/src/uts/common/os/rctl_proc.c index 9b7324fe7b..c62540d2b4 100644 --- a/usr/src/uts/common/os/rctl_proc.c +++ b/usr/src/uts/common/os/rctl_proc.c @@ -21,6 +21,7 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. */ #include <sys/types.h> @@ -32,6 +33,7 @@ #include <sys/port_kernel.h> #include <sys/signal.h> #include <sys/var.h> +#include <sys/policy.h> #include <sys/vmparam.h> #include <sys/machparam.h> @@ -66,6 +68,7 @@ rctl_hndl_t rc_process_semmsl; rctl_hndl_t rc_process_semopm; rctl_hndl_t rc_process_portev; rctl_hndl_t rc_process_sigqueue; +rctl_hndl_t rc_process_maxlockedmem; /* * process.max-cpu-time / RLIMIT_CPU @@ -212,6 +215,26 @@ static rctl_ops_t proc_vmem_ops = { }; /* + * process.max-locked-memory + */ +/*ARGSUSED*/ +static int +proc_maxlockedmem_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e, + struct rctl_val *rv, rctl_qty_t i, uint_t f) +{ + if (secpolicy_lock_memory(CRED()) == 0) + return (0); + return ((p->p_locked_mem + i) > rv->rcv_value); +} + +static rctl_ops_t proc_maxlockedmem_ops = { + rcop_no_action, + rcop_no_usage, + rcop_no_set, + proc_maxlockedmem_test +}; + +/* * void rctlproc_default_init() * * Overview @@ -383,6 +406,11 @@ rctlproc_init(void) rctl_add_default_limit("process.max-sigqueue-size", _SIGQUEUE_SIZE_PRIVILEGED, RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY); + rc_process_maxlockedmem = rctl_register("process.max-locked-memory", + RCENTITY_PROCESS, RCTL_GLOBAL_LOWERABLE | RCTL_GLOBAL_DENY_ALWAYS | + RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_BYTES, + ULONG_MAX, UINT32_MAX, &proc_maxlockedmem_ops); + /* * Place minimal set of controls on "sched" process for inheritance by * processes created via newproc(). diff --git a/usr/src/uts/common/os/sched.c b/usr/src/uts/common/os/sched.c index bc6df6afba..6eb1194af3 100644 --- a/usr/src/uts/common/os/sched.c +++ b/usr/src/uts/common/os/sched.c @@ -27,6 +27,10 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ +/* + * Copyright (c) 2015, Joyent, Inc. All rights reserved. + */ + #include <sys/param.h> #include <sys/types.h> #include <sys/sysmacros.h> @@ -649,16 +653,17 @@ top: klwp_t *lwp = ttolwp(tp); /* - * Swapout eligible lwps (specified by the scheduling - * class) which don't have TS_DONT_SWAP set. Set the - * "intent to swap" flag (TS_SWAPENQ) on threads - * which have TS_DONT_SWAP set so that they can be + * Swapout eligible lwps (specified by the scheduling class) + * which don't have TS_DONT_SWAP set. 
Set the "intent to swap" + * flag (TS_SWAPENQ) on threads which have either TS_DONT_SWAP + * set or are currently on a split stack so that they can be * swapped if and when they reach a safe point. */ thread_lock(tp); thread_pri = CL_SWAPOUT(tp, swapflags); if (thread_pri != -1) { - if (tp->t_schedflag & TS_DONT_SWAP) { + if ((tp->t_schedflag & TS_DONT_SWAP) || + (tp->t_flag & T_SPLITSTK)) { tp->t_schedflag |= TS_SWAPENQ; tp->t_trapret = 1; aston(tp); diff --git a/usr/src/uts/common/os/shm.c b/usr/src/uts/common/os/shm.c index 8f98fcb3f0..d0611eb9bb 100644 --- a/usr/src/uts/common/os/shm.c +++ b/usr/src/uts/common/os/shm.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -319,6 +320,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) size_t share_size; struct shm_data ssd; uintptr_t align_hint; + long curprot; /* * Pick a share pagesize to use, if (!isspt(sp)). @@ -453,6 +455,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) } } + curprot = sp->shm_opts & SHM_PROT_MASK; if (!isspt(sp)) { error = sptcreate(size, &segspt, sp->shm_amp, prot, flags, share_szc); @@ -462,8 +465,8 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) } sp->shm_sptinfo->sptas = segspt->s_as; sp->shm_sptseg = segspt; - sp->shm_sptprot = prot; - } else if ((prot & sp->shm_sptprot) != sp->shm_sptprot) { + sp->shm_opts = (sp->shm_opts & ~SHM_PROT_MASK) | prot; + } else if ((prot & curprot) != curprot) { /* * Ensure we're attaching to an ISM segment with * fewer or equal permissions than what we're @@ -748,6 +751,23 @@ shmctl(int shmid, int cmd, void *arg) } break; + /* Stage segment for removal, but don't remove until last detach */ + case SHM_RMID: + if ((error = secpolicy_ipc_owner(cr, (kipc_perm_t *)sp)) != 0) + break; + + /* + * If attached, just mark it as a pending remove, otherwise + * we must perform the normal ipc_rmid now. + */ + if ((sp->shm_perm.ipc_ref - 1) > 0) { + sp->shm_opts |= SHM_RM_PENDING; + } else { + mutex_exit(lock); + return (ipc_rmid(shm_svc, shmid, cr)); + } + break; + default: error = EINVAL; break; @@ -778,6 +798,23 @@ shm_detach(proc_t *pp, segacct_t *sap) sp->shm_ismattch--; sp->shm_dtime = gethrestime_sec(); sp->shm_lpid = pp->p_pid; + if ((sp->shm_opts & SHM_RM_PENDING) != 0 && + sp->shm_perm.ipc_ref == 2) { + /* + * If this is the last detach of the segment across the whole + * system then now we can perform the delayed IPC_RMID. + * The ipc_ref count has 1 for the original 'get' and one for + * each 'attach' (see 'stat' handling in shmctl). + */ + sp->shm_opts &= ~SHM_RM_PENDING; + mutex_enter(&shm_svc->ipcs_lock); + ipc_rmsvc(shm_svc, (kipc_perm_t *)sp); /* Drops lock */ + ASSERT(!MUTEX_HELD(&shm_svc->ipcs_lock)); + ASSERT(((kipc_perm_t *)sp)->ipc_ref > 0); + + /* Lock was dropped, need to retake it for following rele. */ + (void) ipc_lock(shm_svc, sp->shm_perm.ipc_id); + } ipc_rele(shm_svc, (kipc_perm_t *)sp); /* Drops lock */ kmem_free(sap, sizeof (segacct_t)); diff --git a/usr/src/uts/common/os/sig.c b/usr/src/uts/common/os/sig.c index 453b1f22d4..67a93581dd 100644 --- a/usr/src/uts/common/os/sig.c +++ b/usr/src/uts/common/os/sig.c @@ -22,7 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -60,6 +60,7 @@ #include <sys/cyclic.h> #include <sys/dtrace.h> #include <sys/sdt.h> +#include <sys/brand.h> #include <sys/signalfd.h> const k_sigset_t nullsmask = {0, 0, 0}; @@ -148,6 +149,21 @@ signal_is_blocked(kthread_t *t, int sig) } /* + * Return true if the signal can safely be ignored. + * That is, if the signal is included in the p_ignore mask and doing so is not + * forbidden by any process branding. + */ +static int +sig_ignorable(proc_t *p, klwp_t *lwp, int sig) +{ + return (sigismember(&p->p_ignore, sig) && /* sig in ignore mask */ + !(PROC_IS_BRANDED(p) && /* allowed by brand */ + BROP(p)->b_sig_ignorable != NULL && + BROP(p)->b_sig_ignorable(p, lwp, sig) == B_FALSE)); + +} + +/* * Return true if the signal can safely be discarded on generation. * That is, if there is no need for the signal on the receiving end. * The answer is true if the process is a zombie or @@ -159,12 +175,13 @@ signal_is_blocked(kthread_t *t, int sig) * the signal is not being accepted via sigwait() */ static int -sig_discardable(proc_t *p, int sig) +sig_discardable(proc_t *p, kthread_t *tp, int sig) { kthread_t *t = p->p_tlist; + klwp_t *lwp = (tp == NULL) ? NULL : tp->t_lwp; return (t == NULL || /* if zombie or ... */ - (sigismember(&p->p_ignore, sig) && /* signal is ignored */ + (sig_ignorable(p, lwp, sig) && /* signal is ignored */ t->t_forw == t && /* and single-threaded */ !tracing(p, sig) && /* and no /proc tracing */ !signal_is_blocked(t, sig) && /* and signal not blocked */ @@ -200,7 +217,7 @@ eat_signal(kthread_t *t, int sig) !(ttoproc(t)->p_proc_flag & P_PR_LOCK)) { ttoproc(t)->p_stopsig = 0; t->t_dtrace_stop = 0; - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; setrun_locked(t); } else if (t != curthread && t->t_state == TS_ONPROC) { aston(t); /* make it do issig promptly */ @@ -297,7 +314,7 @@ sigtoproc(proc_t *p, kthread_t *t, int sig) } } - if (sig_discardable(p, sig)) { + if (sig_discardable(p, t, sig)) { DTRACE_PROC3(signal__discard, kthread_t *, p->p_tlist, proc_t *, p, int, sig); return; @@ -497,7 +514,7 @@ issig_justlooking(void) if (sigismember(&set, sig) && (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig))) { + !sig_ignorable(p, lwp, sig))) { /* * Don't promote a signal that will stop * the process when lwp_nostop is set. @@ -623,6 +640,28 @@ issig_forreal(void) } /* + * The brand hook name 'b_issig_stop' is a misnomer. + * Allow the brand the chance to alter (or suppress) delivery + * of this signal. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_issig_stop != NULL) { + int r; + + /* + * The brand hook will return 0 if it would like + * us to drive on, -1 if we should restart + * the loop to check other conditions, or 1 if we + * should terminate the loop. + */ + r = BROP(p)->b_issig_stop(p, lwp); + if (r < 0) { + continue; + } else if (r > 0) { + break; + } + } + + /* * Honor requested stop before dealing with the * current signal; a debugger may change it. 
* Do not want to go back to loop here since this is a special @@ -656,7 +695,7 @@ issig_forreal(void) lwp->lwp_cursig = 0; lwp->lwp_extsig = 0; if (sigismember(&t->t_sigwait, sig) || - (!sigismember(&p->p_ignore, sig) && + (!sig_ignorable(p, lwp, sig) && !isjobstop(sig))) { if (p->p_flag & (SEXITLWPS|SKILLED)) { sig = SIGKILL; @@ -708,7 +747,7 @@ issig_forreal(void) toproc = 0; if (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig)) { + !sig_ignorable(p, lwp, sig)) { if (sigismember(&t->t_extsig, sig)) ext = 1; break; @@ -722,7 +761,7 @@ issig_forreal(void) toproc = 1; if (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig)) { + !sig_ignorable(p, lwp, sig)) { if (sigismember(&p->p_extsig, sig)) ext = 1; break; @@ -954,6 +993,16 @@ stop(int why, int what) } break; + case PR_BRAND: + /* + * We have been stopped by the brand code for a brand-private + * reason. This is an asynchronous stop affecting only this + * LWP. + */ + VERIFY(PROC_IS_BRANDED(p)); + flags &= ~TS_BSTART; + break; + default: /* /proc stop */ flags &= ~TS_PSTART; /* @@ -1065,7 +1114,7 @@ stop(int why, int what) } } - if (why != PR_JOBCONTROL && why != PR_CHECKPOINT) { + if (why != PR_JOBCONTROL && why != PR_CHECKPOINT && why != PR_BRAND) { /* * Do process-level notification when all lwps are * either stopped on events of interest to /proc @@ -1171,6 +1220,13 @@ stop(int why, int what) if (why == PR_CHECKPOINT) del_one_utstop(); + /* + * Allow the brand to post notification of this stop condition. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_stop_notify != NULL) { + BROP(p)->b_stop_notify(p, lwp, why, what); + } + thread_lock(t); ASSERT((t->t_schedflag & TS_ALLSTART) == 0); t->t_schedflag |= flags; @@ -1192,7 +1248,7 @@ stop(int why, int what) (p->p_flag & (SEXITLWPS|SKILLED))) { p->p_stopsig = 0; thread_lock(t); - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; setrun_locked(t); thread_unlock_nopreempt(t); } else if (why == PR_JOBCONTROL) { @@ -1327,7 +1383,7 @@ psig(void) * this signal from pending to current (we dropped p->p_lock). * This can happen only in a multi-threaded process. */ - if (sigismember(&p->p_ignore, sig) || + if (sig_ignorable(p, lwp, sig) || (func == SIG_DFL && sigismember(&stopdefault, sig))) { lwp->lwp_cursig = 0; lwp->lwp_extsig = 0; @@ -1771,9 +1827,12 @@ post_sigcld(proc_t *cp, sigqueue_t *sqp) /* * This can only happen when the parent is init. * (See call to sigcld(q, NULL) in exit().) - * Use KM_NOSLEEP to avoid deadlock. + * Use KM_NOSLEEP to avoid deadlock. The child procs + * initpid can be 1 for zlogin. */ - ASSERT(pp == proc_init); + ASSERT(pp->p_pidp->pid_id == + cp->p_zone->zone_proc_initpid || + pp->p_pidp->pid_id == 1); winfo(cp, &info, 0); sigaddq(pp, NULL, &info, KM_NOSLEEP); } else { @@ -1804,6 +1863,15 @@ sigcld_repost() sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); mutex_enter(&pidlock); + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_sigcld_repost != NULL) { + /* + * Allow the brand to inject synthetic SIGCLD signals. 
+ */ + if (BROP(pp)->b_sigcld_repost(pp, sqp) == 0) { + mutex_exit(&pidlock); + return; + } + } for (cp = pp->p_child; cp; cp = cp->p_sibling) { if (cp->p_pidflag & CLDPEND) { post_sigcld(cp, sqp); @@ -2115,7 +2183,7 @@ sigaddqa(proc_t *p, kthread_t *t, sigqueue_t *sigqp) ASSERT(MUTEX_HELD(&p->p_lock)); ASSERT(sig >= 1 && sig < NSIG); - if (sig_discardable(p, sig)) + if (sig_discardable(p, t, sig)) siginfofree(sigqp); else sigaddqins(p, t, sigqp); @@ -2141,7 +2209,7 @@ sigaddq(proc_t *p, kthread_t *t, k_siginfo_t *infop, int km_flags) * blocking the signal (it *could* change it's mind while * the signal is pending) then don't bother creating one. */ - if (!sig_discardable(p, sig) && + if (!sig_discardable(p, t, sig) && (sigismember(&p->p_siginfo, sig) || (curproc->p_ct_process != p->p_ct_process) || (sig == SIGCLD && SI_FROMKERNEL(infop))) && diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c index c137a498d1..90a9ea6f0f 100644 --- a/usr/src/uts/common/os/streamio.c +++ b/usr/src/uts/common/os/streamio.c @@ -78,6 +78,7 @@ #include <sys/policy.h> #include <sys/dld.h> #include <sys/zone.h> +#include <sys/limits.h> #include <sys/ptms.h> #include <sys/limits.h> #include <c2/audit.h> @@ -3267,6 +3268,7 @@ job_control_type(int cmd) case JAGENT: /* Obsolete */ case JTRUN: /* Obsolete */ case JXTPROTO: /* Obsolete */ + case TIOCSETLD: return (JCSETP); } diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c index fdd0c06aee..f2b91365d9 100644 --- a/usr/src/uts/common/os/strsubr.c +++ b/usr/src/uts/common/os/strsubr.c @@ -26,6 +26,7 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2018 Joyent, Inc. * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. * Copyright 2018 Joyent, Inc. * Copyright 2022 Garrett D'Amore diff --git a/usr/src/uts/common/os/sunddi.c b/usr/src/uts/common/os/sunddi.c index 30cc5744c2..7c094a0f20 100644 --- a/usr/src/uts/common/os/sunddi.c +++ b/usr/src/uts/common/os/sunddi.c @@ -5822,6 +5822,12 @@ ddi_ffs(long mask) return (ffs(mask)); } +int +ddi_ffsll(long long mask) +{ + return (ffs(mask)); +} + /* * Find last bit set. Take mask and clear * all but the most significant bit, and @@ -5833,8 +5839,14 @@ ddi_ffs(long mask) int ddi_fls(long mask) { + return (ddi_flsll(mask)); +} + +int +ddi_flsll(long long mask) +{ while (mask) { - long nx; + long long nx; if ((nx = (mask & (mask - 1))) == 0) break; diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c index e87e6d8d29..dca168b642 100644 --- a/usr/src/uts/common/os/sysent.c +++ b/usr/src/uts/common/os/sysent.c @@ -23,6 +23,7 @@ * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2012 Milan Jurik. All rights reserved. * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright (c) 2018, Joyent, Inc. 
* Copyright 2020 Oxide Computer Company */ @@ -62,8 +63,7 @@ struct mmaplf32a; int access(char *, int); int alarm(int); int auditsys(struct auditcalls *, rval_t *); -int64_t brandsys(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, - uintptr_t); +int64_t brandsys(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t); intptr_t brk(caddr_t); int chdir(char *); int chmod(char *, int); @@ -645,7 +645,7 @@ struct sysent sysent[NSYSCALL] = SYSENT_NOSYS(), SYSENT_C("llseek", llseek32, 4)), /* 176 */ SYSENT_LOADABLE(), /* inst_sync */ - /* 177 */ SYSENT_CI("brandsys", brandsys, 6), + /* 177 */ SYSENT_CI("brandsys", brandsys, 5), /* 178 */ SYSENT_LOADABLE(), /* kaio */ /* 179 */ SYSENT_LOADABLE(), /* cpc */ /* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3), @@ -1000,7 +1000,7 @@ struct sysent sysent32[NSYSCALL] = /* 174 */ SYSENT_CI("pwrite", pwrite32, 4), /* 175 */ SYSENT_C("llseek", llseek32, 4), /* 176 */ SYSENT_LOADABLE32(), /* inst_sync */ - /* 177 */ SYSENT_CI("brandsys", brandsys, 6), + /* 177 */ SYSENT_CI("brandsys", brandsys, 5), /* 178 */ SYSENT_LOADABLE32(), /* kaio */ /* 179 */ SYSENT_LOADABLE32(), /* cpc */ /* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3), @@ -1092,18 +1092,20 @@ char **syscallnames; systrace_sysent_t *systrace_sysent; void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); /*ARGSUSED*/ void systrace_stub(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, - uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, + uintptr_t arg6, uintptr_t arg7) {} /*ARGSUSED*/ int64_t dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, + uintptr_t arg7) { systrace_sysent_t *sy = &systrace_sysent[curthread->t_sysnum]; dtrace_id_t id; @@ -1111,7 +1113,8 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, proc_t *p; if ((id = sy->stsy_entry) != DTRACE_IDNONE) - (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5); + (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, + arg6, arg7); /* * We want to explicitly allow DTrace consumers to stop a process @@ -1125,14 +1128,15 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, } mutex_exit(&p->p_lock); - rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); + rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, + arg6, arg7); if (ttolwp(curthread)->lwp_errno != 0) rval = -1; if ((id = sy->stsy_return) != DTRACE_IDNONE) (*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, - (uintptr_t)((int64_t)rval >> 32), 0, 0, 0); + (uintptr_t)((int64_t)rval >> 32), 0, 0, 0, 0, 0); return (rval); } @@ -1144,7 +1148,8 @@ systrace_sysent_t *systrace_sysent32; /*ARGSUSED*/ int64_t dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, + uintptr_t arg7) { systrace_sysent_t *sy = &systrace_sysent32[curthread->t_sysnum]; dtrace_id_t id; @@ -1152,7 +1157,8 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, proc_t *p; if ((id = sy->stsy_entry) != DTRACE_IDNONE) - (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5); + (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, arg6, + arg7); /* * We want to 
explicitly allow DTrace consumers to stop a process @@ -1166,14 +1172,15 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, } mutex_exit(&p->p_lock); - rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); + rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, arg6, + arg7); if (ttolwp(curthread)->lwp_errno != 0) rval = -1; if ((id = sy->stsy_return) != DTRACE_IDNONE) (*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, - (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0); + (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0, 0, 0); return (rval); } @@ -1201,5 +1208,5 @@ dtrace_systrace_rtt(void) } if ((id = sy->stsy_return) != DTRACE_IDNONE) - (*systrace_probe)(id, 0, 0, 0, 0, 0, 0); + (*systrace_probe)(id, 0, 0, 0, 0, 0, 0, 0, 0); } diff --git a/usr/src/uts/common/os/timer.c b/usr/src/uts/common/os/timer.c index c78a545360..f587430625 100644 --- a/usr/src/uts/common/os/timer.c +++ b/usr/src/uts/common/os/timer.c @@ -82,6 +82,7 @@ timer_lock(proc_t *p, itimer_t *it) * waiters. p_lock must be held on entry; it will not be dropped by * timer_unlock(). */ +/* ARGSUSED */ static void timer_unlock(proc_t *p, itimer_t *it) { @@ -139,7 +140,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it) it->it_backend->clk_timer_delete(it); - if (it->it_portev) { + if (it->it_flags & IT_PORT) { mutex_enter(&it->it_mutex); if (it->it_portev) { port_kevent_t *pev; @@ -201,20 +202,20 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it) static itimer_t * timer_grab(proc_t *p, timer_t tid) { - itimer_t **itp, *it; + itimer_t *it; if (tid < 0) { return (NULL); } mutex_enter(&p->p_lock); - - if ((itp = p->p_itimer) == NULL || tid >= p->p_itimer_sz || - (it = itp[tid]) == NULL) { + if (p->p_itimer == NULL || tid >= p->p_itimer_sz || + (it = p->p_itimer[tid]) == NULL) { mutex_exit(&p->p_lock); return (NULL); } + /* This may drop p_lock temporarily. */ timer_lock(p, it); if (it->it_lock & ITLK_REMOVE) { @@ -236,7 +237,7 @@ timer_grab(proc_t *p, timer_t tid) * should not be held on entry; timer_release() will acquire p_lock but * will drop it before returning. */ -static void +void timer_release(proc_t *p, itimer_t *it) { mutex_enter(&p->p_lock); @@ -249,7 +250,7 @@ timer_release(proc_t *p, itimer_t *it) * p_lock should not be held on entry; timer_delete_grabbed() will acquire * p_lock, but will drop it before returning. */ -static void +void timer_delete_grabbed(proc_t *p, timer_t tid, itimer_t *it) { mutex_enter(&p->p_lock); @@ -464,6 +465,9 @@ timer_fire(itimer_t *it) it->it_pending = 1; port_send_event((port_kevent_t *)it->it_portev); mutex_exit(&it->it_mutex); + } else if (it->it_flags & IT_CALLBACK) { + it->it_cb_func(it); + ASSERT(MUTEX_NOT_HELD(&it->it_mutex)); } else if (it->it_flags & IT_SIGNAL) { it->it_pending = 1; mutex_exit(&it->it_mutex); @@ -580,85 +584,27 @@ done: return (B_TRUE); } +/* + * Setup a timer + * + * This allocates an itimer_t (including a timer_t ID and slot in the process), + * wires it up according to the provided sigevent, and associates it with the + * desired clock backend. Upon successful completion, the timer will be + * locked, preventing it from being armed via timer_settime() or deleted via + * timer_delete(). This gives the caller a chance to perform any last minute + * manipulations (such as configuring the IT_CALLBACK functionality and/or + * copying the timer_t out to userspace) before using timer_release() to unlock + * it or timer_delete_grabbed() to delete it. 
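The timer_setup()/timer_release() flow described above can be illustrated with a short usage sketch. The consumer below is hypothetical (the my_timer_* names are not part of this change); it assumes the IT_CALLBACK notification added in timer_fire(), where the callback is invoked with it_mutex held and is responsible for dropping it.

/*
 * Hypothetical in-kernel consumer: create a timer on an arbitrary clock
 * backend, wire it up for IT_CALLBACK notification while it is still
 * locked, then unlock it with timer_release().
 */
static void
my_timer_cb(itimer_t *it)
{
	/* timer_fire() calls us with it_mutex held; we must drop it. */
	mutex_exit(&it->it_mutex);
}

static int
my_timer_setup(timer_t *tidp)
{
	clock_backend_t *be = CLOCK_BACKEND(CLOCK_REALTIME);
	struct sigevent ev = be->clk_default;
	itimer_t *it;
	int err;

	ev.sigev_notify = SIGEV_NONE;
	if ((err = timer_setup(be, &ev, NULL, &it, tidp)) != 0)
		return (err);

	/*
	 * The timer is still locked, so it cannot be armed by
	 * timer_settime() or deleted by timer_delete() while we finish
	 * configuring it.
	 */
	it->it_flags |= IT_CALLBACK;
	it->it_cb_func = my_timer_cb;

	timer_release(curproc, it);
	return (0);
}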
+ */ int -timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) +timer_setup(clock_backend_t *backend, struct sigevent *evp, port_notify_t *pnp, + itimer_t **itp, timer_t *tidp) { - struct sigevent ev; proc_t *p = curproc; - clock_backend_t *backend; + int error = 0; itimer_t *it; sigqueue_t *sigq; - cred_t *cr = CRED(); - int error = 0; - timer_t i; - port_notify_t tim_pnevp; - port_kevent_t *pkevp = NULL; - - if ((backend = CLOCK_BACKEND(clock)) == NULL) - return (set_errno(EINVAL)); - - if (evp != NULL) { - /* - * short copyin() for binary compatibility - * fetch oldsigevent to determine how much to copy in. - */ - if (get_udatamodel() == DATAMODEL_NATIVE) { - if (copyin(evp, &ev, sizeof (struct oldsigevent))) - return (set_errno(EFAULT)); - - if (ev.sigev_notify == SIGEV_PORT || - ev.sigev_notify == SIGEV_THREAD) { - if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp, - sizeof (port_notify_t))) - return (set_errno(EFAULT)); - } -#ifdef _SYSCALL32_IMPL - } else { - struct sigevent32 ev32; - port_notify32_t tim_pnevp32; - - if (copyin(evp, &ev32, sizeof (struct oldsigevent32))) - return (set_errno(EFAULT)); - ev.sigev_notify = ev32.sigev_notify; - ev.sigev_signo = ev32.sigev_signo; - /* - * See comment in sigqueue32() on handling of 32-bit - * sigvals in a 64-bit kernel. - */ - ev.sigev_value.sival_int = ev32.sigev_value.sival_int; - if (ev.sigev_notify == SIGEV_PORT || - ev.sigev_notify == SIGEV_THREAD) { - if (copyin((void *)(uintptr_t) - ev32.sigev_value.sival_ptr, - (void *)&tim_pnevp32, - sizeof (port_notify32_t))) - return (set_errno(EFAULT)); - tim_pnevp.portnfy_port = - tim_pnevp32.portnfy_port; - tim_pnevp.portnfy_user = - (void *)(uintptr_t)tim_pnevp32.portnfy_user; - } -#endif - } - switch (ev.sigev_notify) { - case SIGEV_NONE: - break; - case SIGEV_SIGNAL: - if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG) - return (set_errno(EINVAL)); - break; - case SIGEV_THREAD: - case SIGEV_PORT: - break; - default: - return (set_errno(EINVAL)); - } - } else { - /* - * Use the clock's default sigevent (this is a structure copy). - */ - ev = backend->clk_default; - } + timer_t tid; /* * We'll allocate our sigqueue now, before we grab p_lock. @@ -674,29 +620,25 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL); mutex_enter(&p->p_lock); - if (!timer_get_id(p, &i)) { + if (!timer_get_id(p, &tid)) { mutex_exit(&p->p_lock); - kmem_cache_free(clock_timer_cache, it); kmem_free(sigq, sizeof (sigqueue_t)); return (set_errno(EAGAIN)); } - ASSERT(i < p->p_itimer_sz && p->p_itimer[i] == NULL); + ASSERT(tid < p->p_itimer_sz && p->p_itimer[tid] == NULL); /* * If we develop other notification mechanisms, this will need * to call into (yet another) backend. 
*/ - sigq->sq_info.si_signo = ev.sigev_signo; - if (evp == NULL) - sigq->sq_info.si_value.sival_int = i; - else - sigq->sq_info.si_value = ev.sigev_value; + sigq->sq_info.si_signo = evp->sigev_signo; + sigq->sq_info.si_value = evp->sigev_value; sigq->sq_info.si_code = SI_TIMER; sigq->sq_info.si_pid = p->p_pid; sigq->sq_info.si_ctid = PRCTID(p); sigq->sq_info.si_zoneid = getzoneid(); - sigq->sq_info.si_uid = crgetruid(cr); + sigq->sq_info.si_uid = crgetruid(CRED()); sigq->sq_func = timer_signal; sigq->sq_next = NULL; sigq->sq_backptr = it; @@ -704,9 +646,12 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) it->it_backend = backend; it->it_lock = ITLK_LOCKED; - if (ev.sigev_notify == SIGEV_THREAD || - ev.sigev_notify == SIGEV_PORT) { + if (evp->sigev_notify == SIGEV_THREAD || + evp->sigev_notify == SIGEV_PORT) { int port; + port_kevent_t *pkevp = NULL; + + ASSERT(pnp != NULL); /* * This timer is programmed to use event port notification when @@ -726,7 +671,7 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) */ it->it_flags |= IT_PORT; - port = tim_pnevp.portnfy_port; + port = pnp->portnfy_port; /* associate timer as event source with the port */ error = port_associate_ksource(port, PORT_SOURCE_TIMER, @@ -736,7 +681,7 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) mutex_exit(&p->p_lock); kmem_cache_free(clock_timer_cache, it); kmem_free(sigq, sizeof (sigqueue_t)); - return (set_errno(error)); + return (error); } /* allocate an event structure/slot */ @@ -748,21 +693,21 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) mutex_exit(&p->p_lock); kmem_cache_free(clock_timer_cache, it); kmem_free(sigq, sizeof (sigqueue_t)); - return (set_errno(error)); + return (error); } /* initialize event data */ - port_init_event(pkevp, i, tim_pnevp.portnfy_user, + port_init_event(pkevp, tid, pnp->portnfy_user, timer_port_callback, it); it->it_portev = pkevp; it->it_portfd = port; } else { - if (ev.sigev_notify == SIGEV_SIGNAL) + if (evp->sigev_notify == SIGEV_SIGNAL) it->it_flags |= IT_SIGNAL; } /* Populate the slot now that the timer is prepped. */ - p->p_itimer[i] = it; + p->p_itimer[tid] = it; mutex_exit(&p->p_lock); /* @@ -775,17 +720,8 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) it->it_lwp = ttolwp(curthread); it->it_proc = p; - if (copyout(&i, tid, sizeof (timer_t)) != 0) { - error = EFAULT; - goto err; - } - - /* - * If we're here, then we have successfully created the timer; we - * just need to release the timer and return. - */ - timer_release(p, it); - + *itp = it; + *tidp = tid; return (0); err: @@ -796,11 +732,115 @@ err: * impossible for a removal to be pending. */ ASSERT(!(it->it_lock & ITLK_REMOVE)); - timer_delete_grabbed(p, i, it); + timer_delete_grabbed(p, tid, it); + + return (error); +} + + +int +timer_create(clockid_t clock, struct sigevent *evp, timer_t *tidp) +{ + int error = 0; + proc_t *p = curproc; + clock_backend_t *backend; + struct sigevent ev; + itimer_t *it; + timer_t tid; + port_notify_t tim_pnevp; + + if ((backend = CLOCK_BACKEND(clock)) == NULL) + return (set_errno(EINVAL)); + + if (evp != NULL) { + /* + * short copyin() for binary compatibility + * fetch oldsigevent to determine how much to copy in. 
+ */ + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(evp, &ev, sizeof (struct oldsigevent))) + return (set_errno(EFAULT)); + + if (ev.sigev_notify == SIGEV_PORT || + ev.sigev_notify == SIGEV_THREAD) { + if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp, + sizeof (port_notify_t))) + return (set_errno(EFAULT)); + } +#ifdef _SYSCALL32_IMPL + } else { + struct sigevent32 ev32; + port_notify32_t tim_pnevp32; - return (set_errno(error)); + if (copyin(evp, &ev32, sizeof (struct oldsigevent32))) + return (set_errno(EFAULT)); + ev.sigev_notify = ev32.sigev_notify; + ev.sigev_signo = ev32.sigev_signo; + /* + * See comment in sigqueue32() on handling of 32-bit + * sigvals in a 64-bit kernel. + */ + ev.sigev_value.sival_int = ev32.sigev_value.sival_int; + if (ev.sigev_notify == SIGEV_PORT || + ev.sigev_notify == SIGEV_THREAD) { + if (copyin((void *)(uintptr_t) + ev32.sigev_value.sival_ptr, + (void *)&tim_pnevp32, + sizeof (port_notify32_t))) + return (set_errno(EFAULT)); + tim_pnevp.portnfy_port = + tim_pnevp32.portnfy_port; + tim_pnevp.portnfy_user = + (void *)(uintptr_t)tim_pnevp32.portnfy_user; + } +#endif + } + switch (ev.sigev_notify) { + case SIGEV_NONE: + break; + case SIGEV_SIGNAL: + if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG) + return (set_errno(EINVAL)); + break; + case SIGEV_THREAD: + case SIGEV_PORT: + break; + default: + return (set_errno(EINVAL)); + } + } else { + /* + * Use the clock's default sigevent (this is a structure copy). + */ + ev = backend->clk_default; + } + + if ((error = timer_setup(backend, &ev, &tim_pnevp, &it, &tid)) != 0) { + return (set_errno(error)); + } + + /* + * Populate si_value with the timer ID if no sigevent was passed in. + */ + if (evp == NULL) { + it->it_sigq->sq_info.si_value.sival_int = tid; + } + + if (copyout(&tid, tidp, sizeof (timer_t)) != 0) { + timer_delete_grabbed(p, tid, it); + return (set_errno(EFAULT)); + } + + /* + * If we're here, then we have successfully created the timer; we + * just need to release the timer and return. + */ + timer_release(p, it); + + return (0); } + int timer_gettime(timer_t tid, itimerspec_t *val) { @@ -923,17 +963,20 @@ timer_lwpexit(void) uint_t i; proc_t *p = curproc; klwp_t *lwp = ttolwp(curthread); - itimer_t *it, **itp; + itimer_t *it; ASSERT(MUTEX_HELD(&p->p_lock)); - if ((itp = p->p_itimer) == NULL) + if (p->p_itimer == NULL) { return; + } for (i = 0; i < p->p_itimer_sz; i++) { - if ((it = itp[i]) == NULL) + if ((it = p->p_itimer[i]) == NULL) { continue; + } + /* This may drop p_lock temporarily. */ timer_lock(p, it); if ((it->it_lock & ITLK_REMOVE) || it->it_lwp != lwp) { @@ -967,17 +1010,19 @@ timer_lwpbind() uint_t i; proc_t *p = curproc; klwp_t *lwp = ttolwp(curthread); - itimer_t *it, **itp; + itimer_t *it; ASSERT(MUTEX_HELD(&p->p_lock)); - if ((itp = p->p_itimer) == NULL) + if (p->p_itimer == NULL) { return; + } for (i = 0; i < p->p_itimer_sz; i++) { - if ((it = itp[i]) == NULL) + if ((it = p->p_itimer[i]) == NULL) continue; + /* This may drop p_lock temporarily. 
*/ timer_lock(p, it); if (!(it->it_lock & ITLK_REMOVE) && it->it_lwp == lwp) { @@ -1068,7 +1113,7 @@ timer_close_port(void *arg, int port, pid_t pid, int lastclose) for (tid = 0; tid < timer_max; tid++) { if ((it = timer_grab(p, tid)) == NULL) continue; - if (it->it_portev) { + if (it->it_flags & IT_PORT) { mutex_enter(&it->it_mutex); if (it->it_portfd == port) { port_kevent_t *pev; diff --git a/usr/src/uts/common/os/vm_pageout.c b/usr/src/uts/common/os/vm_pageout.c index c3fd3658d6..1df2f479a5 100644 --- a/usr/src/uts/common/os/vm_pageout.c +++ b/usr/src/uts/common/os/vm_pageout.c @@ -27,6 +27,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -64,6 +65,7 @@ #include <sys/tnf_probe.h> #include <sys/mem_cage.h> #include <sys/time.h> +#include <sys/zone.h> #include <sys/stdbool.h> #include <vm/hat.h> @@ -240,15 +242,22 @@ pgcnt_t lotsfree = 0; pgcnt_t needfree = 0; pgcnt_t throttlefree = 0; pgcnt_t pageout_reserve = 0; +pri_t pageout_pri; pgcnt_t deficit; pgcnt_t nscan; pgcnt_t desscan; +/* kstats */ +uint64_t low_mem_scan; +uint64_t zone_cap_scan; + +#define MAX_PSCAN_THREADS 16 + /* - * Values for min_pageout_nsec, max_pageout_nsec and pageout_nsec are the - * number of nanoseconds in each wakeup cycle that gives the equivalent of some - * underlying %CPU duty cycle. + * Values for min_pageout_nsec, max_pageout_nsec, pageout_nsec and + * zone_pageout_nsec are the number of nanoseconds in each wakeup cycle + * that gives the equivalent of some underlying %CPU duty cycle. * * min_pageout_nsec: * nanoseconds/wakeup equivalent of min_percent_cpu. @@ -260,15 +269,31 @@ pgcnt_t desscan; * Number of nanoseconds budgeted for each wakeup cycle. * Computed each time around by schedpaging(). * Varies between min_pageout_nsec and max_pageout_nsec, - * depending on memory pressure. + * depending on memory pressure or zones over their cap. + * + * zone_pageout_nsec: + * Number of nanoseconds budget for each cycle when a zone + * is over its memory cap. If this is zero, then the value + * of max_pageout_nsec is used instead. */ static hrtime_t min_pageout_nsec; static hrtime_t max_pageout_nsec; static hrtime_t pageout_nsec; +static hrtime_t zone_pageout_nsec; -static uint_t reset_hands; +static boolean_t reset_hands[MAX_PSCAN_THREADS]; #define PAGES_POLL_MASK 1023 +#define SCHEDPAGING_HZ 4 + +/* + * despagescanners: + * The desired number of page scanner threads. The value can be set in + * /etc/system or tuned directly with 'mdb -kw'. The system will bring + * the actual number of threads into line with the desired number. If set + * to an invalid value, the system will correct the setting. + */ +uint_t despagescanners = 0; /* * pageout_sample_lim: @@ -294,26 +319,29 @@ static uint_t reset_hands; * pageout_scanner(), which then sets this value once per system boot after * enough samples have been recorded (pageout_sample_cnt). Once set, this * new value is used for fastscan and handspreadpages. - * - * sample_start, sample_end: - * The hrtime at which the last pageout_scanner() sample began and ended. 
*/ typedef hrtime_t hrrate_t; static uint64_t pageout_sample_lim = 4; static uint64_t pageout_sample_cnt = 0; static pgcnt_t pageout_sample_pages = 0; +static hrtime_t pageout_sample_etime = 0; static hrrate_t pageout_rate = 0; static pgcnt_t pageout_new_spread = 0; -static hrtime_t pageout_cycle_nsec; -static hrtime_t sample_start, sample_end; -static hrtime_t pageout_sample_etime = 0; +/* True if the page scanner is first starting up */ +#define PAGE_SCAN_STARTUP (pageout_sample_cnt < pageout_sample_lim) + +/* The current number of page scanner threads */ +static uint_t n_page_scanners = 1; +/* The number of page scanner threads that are actively scanning. */ +static uint_t pageouts_running; /* * Record number of times a pageout_scanner() wakeup cycle finished because it * timed out (exceeded its CPU budget), rather than because it visited - * its budgeted number of pages. + * its budgeted number of pages. This is only done when scanning under low + * free memory conditions, not when scanning for zones over their cap. */ uint64_t pageout_timeouts = 0; @@ -357,9 +385,10 @@ static struct clockinit { pgcnt_t ci_fastscan; pgcnt_t ci_slowscan; pgcnt_t ci_handspreadpages; + uint_t ci_despagescanners; } clockinit = { .ci_init = false }; -static pgcnt_t +static inline pgcnt_t clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum) { if (value < minimum) { @@ -382,6 +411,83 @@ tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval) } /* + * Local boolean to control scanning when zones are over their cap. Avoids + * accessing the zone_num_over_cap variable except within schedpaging(), which + * only runs periodically. This is here only to reduce our access to + * zone_num_over_cap, since it is already accessed a lot during paging, and + * the page scanner accesses the zones_over variable on each page during a + * scan. There is no lock needed for zone_num_over_cap since schedpaging() + * doesn't modify the variable, it only cares if the variable is 0 or non-0. + */ +static boolean_t zones_over = B_FALSE; + +/* + * On large memory systems, multiple instances of the page scanner are run, + * each responsible for a separate region of memory. This speeds up page + * invalidation under low memory conditions. + * + * despagescanners can be set in /etc/system or via mdb and it will + * be used as a guide for how many page scanners to create; the value + * will be adjusted if it is not sensible. Otherwise, the number of + * page scanners is determined dynamically based on handspreadpages. + */ +static void +recalc_pagescanners(void) +{ + pgcnt_t sz; + uint_t des; + + /* If the initial calibration has not been done, take no action. */ + if (pageout_new_spread == 0) + return; + + /* + * If the desired number of scanners is set in /etc/system + * then try to use it. + */ + if (despagescanners == 0 && clockinit.ci_despagescanners != 0) + despagescanners = clockinit.ci_despagescanners; + + if (despagescanners != 0) { + /* + * We have a desired number of page scanners, either from + * /etc/system or set via mdb. Try and use it (it will be + * clamped below). + */ + des = despagescanners; + } else { + /* + * Calculate the number of desired scanners based on the + * system's memory size. + * + * A 64GiB region size is used as the basis for calculating how + * many scanner threads should be created. For systems with up + * to 64GiB of RAM, a single thread is used; for very large + * memory systems the threads are limited to MAX_PSCAN_THREADS. 
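To make the sizing rule concrete, consider a hypothetical system with 256 GiB of physical memory (the figures are illustrative, not from this change): sz = btop(64 GiB), and the loop adds sz to tmp while tmp < looppages, so tmp steps through 64, 128, 192 and 256 GiB, leaving des = 4 scanner threads. The result is then clamped so that looppages / des is at least handspreadpages plus a 10% margin, and finally to no more than MAX_PSCAN_THREADS (16).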
+ */ + sz = btop(64ULL << 30); + + if (sz > looppages) { + des = 1; + } else { + pgcnt_t tmp = sz; + + for (des = 1; tmp < looppages; des++) + tmp += sz; + } + } + + /* + * clamp the number of scanners so that we are under MAX_PSCAN_THREADS + * and so that each scanner covers at least 10% more than + * handspreadpages. + */ + des = clamp(des, 1, + looppages / (handspreadpages + handspreadpages / 10)); + despagescanners = clamp(des, 1, MAX_PSCAN_THREADS); +} + +/* * Set up the paging constants for the clock algorithm used by * pageout_scanner(), and by the virtual memory system overall. See the * comments at the top of this file for more information about the threshold @@ -395,7 +501,6 @@ tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval) void setupclock(void) { - pgcnt_t defval; bool half = (pageout_threshold_style == 1); bool recalc = true; @@ -424,6 +529,7 @@ setupclock(void) clockinit.ci_fastscan = fastscan; clockinit.ci_slowscan = slowscan; clockinit.ci_handspreadpages = handspreadpages; + clockinit.ci_despagescanners = despagescanners; /* * The first call does not trigger a recalculation, only @@ -605,7 +711,7 @@ setupclock(void) } /* - * Handspreadpages is distance (in pages) between front and back + * Handspreadpages is the distance (in pages) between front and back * pageout daemon hands. The amount of time to reclaim a page * once pageout examines it increases with this distance and * decreases as the scan rate rises. It must be < the amount @@ -641,12 +747,31 @@ setupclock(void) } /* - * If we have been called to recalculate the parameters, set a flag to - * re-evaluate the clock hand pointers. + * Establish the minimum and maximum length of time to be spent + * scanning pages per wakeup, limiting the scanner duty cycle. The + * input percentage values (0-100) must be converted to a fraction of + * the number of nanoseconds in a second of wall time, then further + * scaled down by the number of scanner wakeups in a second. */ - if (recalc) { - reset_hands = 1; - } + min_pageout_nsec = MAX(1, + NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ); + max_pageout_nsec = MAX(min_pageout_nsec, + NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ); + + /* + * If not called for recalculation, return and skip the remaining + * steps. + */ + if (!recalc) + return; + + /* + * Set a flag to re-evaluate the clock hand positions. + */ + for (uint_t i = 0; i < MAX_PSCAN_THREADS; i++) + reset_hands[i] = B_TRUE; + + recalc_pagescanners(); } /* @@ -660,9 +785,8 @@ setupclock(void) * in its next pass; schedpaging() sets this value based on the amount of * currently available memory. */ -#define SCHEDPAGING_HZ 4 -static kmutex_t pageout_mutex; /* held while pageout or schedpaging running */ +static kmutex_t pageout_mutex; /* * Pool of available async pageout putpage requests. @@ -690,9 +814,9 @@ static bool pageout_pushing = false; static uint64_t pageout_pushcount = 0; static uint64_t pageout_pushcount_seen = 0; -static int async_list_size = 256; /* number of async request structs */ +static int async_list_size = 8192; /* number of async request structs */ -static void pageout_scanner(void); +static void pageout_scanner(void *); /* * If a page is being shared more than "po_share" times @@ -722,24 +846,17 @@ schedpaging(void *arg) kcage_cageout_wakeup(); if (mutex_tryenter(&pageout_mutex)) { - /* pageout() not running */ + + if (pageouts_running != 0) + goto out; + + /* No pageout scanner threads running. 
*/ nscan = 0; vavail = freemem - deficit; if (pageout_new_spread != 0) vavail -= needfree; - if (vavail < 0) - vavail = 0; - if (vavail > lotsfree) - vavail = lotsfree; + vavail = clamp(vavail, 0, lotsfree); - /* - * Fix for 1161438 (CRS SPR# 73922). All variables - * in the original calculation for desscan were 32 bit signed - * ints. As freemem approaches 0x0 on a system with 1 Gig or - * more of memory, the calculation can overflow. When this - * happens, desscan becomes negative and pageout_scanner() - * stops paging out. - */ if (needfree > 0 && pageout_new_spread == 0) { /* * If we've not yet collected enough samples to @@ -765,14 +882,92 @@ pageout_nsec = min_pageout_nsec + (lotsfree - vavail) * (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree); - if (freemem < lotsfree + needfree || - pageout_sample_cnt < pageout_sample_lim) { + DTRACE_PROBE2(schedpage__calc, pgcnt_t, desscan, hrtime_t, + pageout_nsec); + + if (pageout_new_spread != 0 && despagescanners != 0 && + despagescanners != n_page_scanners) { + /* + * We have finished the pagescan initialisation and the + * desired number of page scanners has changed, either + * because initialisation just finished, because of a + * memory DR, or because despagescanners has been + * modified on the fly (i.e. by mdb). + */ + uint_t i, curr_nscan = n_page_scanners; + + /* Re-validate despagescanners */ + recalc_pagescanners(); + + n_page_scanners = despagescanners; + + for (i = 0; i < MAX_PSCAN_THREADS; i++) + reset_hands[i] = B_TRUE; + + /* If we need more scanners, start them now. */ + if (n_page_scanners > curr_nscan) { + for (i = curr_nscan; i < n_page_scanners; i++) { + (void) lwp_kernel_create(proc_pageout, + pageout_scanner, + (void *)(uintptr_t)i, TS_RUN, + pageout_pri); + } + } + + /* + * If the number of scanners has decreased, trigger a + * wakeup so that the excess threads will terminate. + */ + if (n_page_scanners < curr_nscan) { + WAKE_PAGEOUT_SCANNER(); + } + } + + zones_over = B_FALSE; + + if (PAGE_SCAN_STARTUP) { /* - * Either we need more memory, or we still need to - * measure the average scan rate. Wake the scanner. + * We still need to measure the rate at which the + * system is able to scan pages of memory. Each of + * these initial samples is a scan of as much system + * memory as practical, regardless of whether or not we + * are experiencing memory pressure. */ - DTRACE_PROBE(pageout__cv__signal); - cv_signal(&proc_pageout->p_cv); + desscan = total_pages; + pageout_nsec = max_pageout_nsec; + + DTRACE_PROBE(schedpage__wake__sample); + WAKE_PAGEOUT_SCANNER(); + } else if (freemem < lotsfree + needfree) { + /* + * We need more memory. + */ + low_mem_scan++; + + DTRACE_PROBE(schedpage__wake__low); + WAKE_PAGEOUT_SCANNER(); + } else if (zone_num_over_cap > 0) { + /* + * One or more zones are over their cap. + */ + + /* No page limit */ + desscan = total_pages; + + /* + * Increase the scanning CPU% to the max. This implies + * 80% of one CPU/sec if the scanner can run each + * opportunity. Can also be tuned via setting + * zone_pageout_nsec in /etc/system or with mdb. + */ + pageout_nsec = (zone_pageout_nsec != 0) ? 
+ zone_pageout_nsec : max_pageout_nsec; + + zones_over = B_TRUE; + zone_cap_scan++; + + DTRACE_PROBE(schedpage__wake__zone); + WAKE_PAGEOUT_SCANNER(); } else { /* * There are enough free pages, no need to @@ -785,6 +980,7 @@ schedpaging(void *arg) po_share >>= 1; } } +out: mutex_exit(&pageout_mutex); } @@ -813,37 +1009,39 @@ uint_t dopageout = 1; /* * The page out daemon, which runs as process 2. * - * As long as there are at least lotsfree pages, - * this process is not run. When the number of free - * pages stays in the range desfree to lotsfree, - * this daemon runs through the pages in the loop - * at a rate determined in schedpaging(). Pageout manages - * two hands on the clock. The front hand moves through - * memory, clearing the reference bit, - * and stealing pages from procs that are over maxrss. - * The back hand travels a distance behind the front hand, - * freeing the pages that have not been referenced in the time - * since the front hand passed. If modified, they are pushed to - * swap before being freed. + * The daemon treats physical memory as a circular array of pages and scans + * the pages using a 'two-handed clock' algorithm. The front hand moves + * through the pages, clearing the reference bit. The back hand travels a + * distance (handspreadpages) behind the front hand, freeing the pages that + * have not been referenced in the time since the front hand passed. If + * modified, they are first written to their backing store before being + * freed. + * + * In order to make page invalidation more responsive on machines with + * larger memory, multiple pageout_scanner threads may be created. In this + * case, each thread is given a segment of the memory "clock face" so that + * memory can be reclaimed more quickly. * - * There are 2 threads that act on behalf of the pageout process. - * One thread scans pages (pageout_scanner) and frees them up if - * they don't require any VOP_PUTPAGE operation. If a page must be - * written back to its backing store, the request is put on a list - * and the other (pageout) thread is signaled. The pageout thread - * grabs VOP_PUTPAGE requests from the list, and processes them. - * Some filesystems may require resources for the VOP_PUTPAGE - * operations (like memory) and hence can block the pageout - * thread, but the scanner thread can still operate. There is still - * no guarantee that memory deadlocks cannot occur. + * The pageout_scanner threads are not run as long as there are at least + * lotsfree pages and no zones are over their physical memory cap. When the + * threads are scanning because memory is low (case (a)), all pages are + * considered for pageout. When they are scanning only because one or more + * zones are over their cap (case (b)), only pages belonging to those zones + * are considered for pageout. * - * For now, this thing is in very rough form. + * There are multiple threads that act on behalf of the pageout process. A + * set of threads (pageout_scanner) scans pages and frees them up if they + * don't require any VOP_PUTPAGE operation. If a page must be written back + * to its backing store, the request is put on a list and the other + * (pageout) thread is signaled. The pageout thread grabs VOP_PUTPAGE + * requests from the list, and processes them. Some filesystems may require + * resources for the VOP_PUTPAGE operations (like memory) and hence can + * block the pageout thread, but the scanner thread can still operate. + * There is still no guarantee that memory deadlocks cannot occur. 
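The pageout_nsec assignment in schedpaging() above scales the scan budget linearly with memory pressure: as vavail falls from lotsfree toward zero, the budget grows from min_pageout_nsec to max_pageout_nsec. A small user-space model of that interpolation; the page counts are made-up inputs and nz() is modeled as "substitute 1 for 0".

#include <stdio.h>
#include <stdint.h>

#define nz(x)   ((x) != 0 ? (x) : 1)

int
main(void)
{
        int64_t min_pageout_nsec = 10000000;    /* 10ms, see setupclock() */
        int64_t max_pageout_nsec = 200000000;   /* 200ms */
        int64_t lotsfree = 131072;              /* example page count */

        for (int64_t vavail = lotsfree; vavail >= 0; vavail -= lotsfree / 4) {
                int64_t pageout_nsec = min_pageout_nsec +
                    (lotsfree - vavail) *
                    (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree);

                printf("vavail=%7lld -> budget %3lld ms\n",
                    (long long)vavail, (long long)(pageout_nsec / 1000000));
        }
        return (0);
}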
*/ void pageout() { struct async_reqs *arg; - pri_t pageout_pri; int i; pgcnt_t max_pushes; callb_cpr_t cprinfo; @@ -874,11 +1072,12 @@ pageout() push_req[i].a_next = &push_req[i + 1]; } - pageout_pri = curthread->t_pri; + pageout_pri = curthread->t_pri - 1; - /* Create the pageout scanner thread. */ - (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN, - pageout_pri - 1); + /* Create the first pageout scanner thread. */ + (void) lwp_kernel_create(proc_pageout, pageout_scanner, + (void *)0, /* this is instance 0, not NULL */ + TS_RUN, pageout_pri); /* * kick off pageout scheduler. @@ -913,6 +1112,8 @@ pageout() pageout_pushing = true; mutex_exit(&push_lock); + DTRACE_PROBE(pageout__push); + if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off, arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) { pushes++; @@ -935,14 +1136,19 @@ pageout() * Kernel thread that scans pages looking for ones to free */ static void -pageout_scanner(void) +pageout_scanner(void *a) { - struct page *fronthand, *backhand; + struct page *fronthand, *backhand, *fronthandstart; + struct page *regionstart, *regionend; uint_t laps; callb_cpr_t cprinfo; - pgcnt_t nscan_limit; + pgcnt_t nscan_cnt, tick; pgcnt_t pcount; - bool sampling; + bool bhwrapping, fhwrapping; + hrtime_t sample_start, sample_end; + uint_t inst = (uint_t)(uintptr_t)a; + + VERIFY3U(inst, <, MAX_PSCAN_THREADS); CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan"); mutex_enter(&pageout_mutex); @@ -952,113 +1158,153 @@ pageout_scanner(void) * the right point on the assumption that after one circuit things * will have settled down, and restarts shouldn't be that often. */ + reset_hands[inst] = B_TRUE; - /* - * Set the two clock hands to be separated by a reasonable amount, - * but no more than 360 degrees apart. - */ - backhand = page_first(); - if (handspreadpages >= total_pages) { - fronthand = page_nextn(backhand, total_pages - 1); - } else { - fronthand = page_nextn(backhand, handspreadpages); - } - - /* - * Establish the minimum and maximum length of time to be spent - * scanning pages per wakeup, limiting the scanner duty cycle. The - * input percentage values (0-100) must be converted to a fraction of - * the number of nanoseconds in a second of wall time, then further - * scaled down by the number of scanner wakeups in a second: - */ - min_pageout_nsec = MAX(1, - NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ); - max_pageout_nsec = MAX(min_pageout_nsec, - NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ); + pageouts_running++; + mutex_exit(&pageout_mutex); loop: cv_signal_pageout(); + mutex_enter(&pageout_mutex); + pageouts_running--; CALLB_CPR_SAFE_BEGIN(&cprinfo); cv_wait(&proc_pageout->p_cv, &pageout_mutex); CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex); + pageouts_running++; + mutex_exit(&pageout_mutex); /* - * Check if pageout has been disabled for debugging purposes: + * Check if pageout has been disabled for debugging purposes. */ if (!dopageout) { goto loop; } /* - * One may reset the clock hands for debugging purposes. Hands will - * also be reset if memory is added to or removed from the system. + * One may reset the clock hands and scanned region for debugging + * purposes. Hands will also be reset on first thread startup, if + * the number of scanning threads (n_page_scanners) changes, or if + * memory is added to, or removed from, the system. 
*/ - if (reset_hands) { - reset_hands = 0; + if (reset_hands[inst]) { + struct page *first; + + reset_hands[inst] = B_FALSE; + + if (inst >= n_page_scanners) { + /* + * The desired number of page scanners has been + * reduced and this instance is no longer wanted. + * Exit the lwp. + */ + VERIFY3U(inst, !=, 0); + DTRACE_PROBE1(pageout__exit, uint_t, inst); + mutex_enter(&pageout_mutex); + pageouts_running--; + mutex_exit(&pageout_mutex); + mutex_enter(&curproc->p_lock); + lwp_exit(); + /* NOTREACHED */ + } + + first = page_first(); + + /* + * Each scanner thread gets its own sector of the memory + * clock face. + */ + pgcnt_t span, offset; - backhand = page_first(); - if (handspreadpages >= total_pages) { - fronthand = page_nextn(backhand, total_pages - 1); + span = looppages / n_page_scanners; + VERIFY3U(span, >, handspreadpages); + + offset = inst * span; + regionstart = page_nextn(first, offset); + if (inst == n_page_scanners - 1) { + /* The last instance goes up to the last page */ + regionend = page_nextn(first, looppages - 1); } else { - fronthand = page_nextn(backhand, handspreadpages); + regionend = page_nextn(regionstart, span - 1); } + + backhand = regionstart; + fronthand = page_nextn(backhand, handspreadpages); + tick = 1; + + bhwrapping = fhwrapping = B_FALSE; + + DTRACE_PROBE4(pageout__reset, uint_t, inst, + pgcnt_t, regionstart, pgcnt_t, regionend, + pgcnt_t, fronthand); } + /* + * This CPU kstat is only incremented here and we're obviously + * on this CPU, so no lock. + */ CPU_STATS_ADDQ(CPU, vm, pgrrun, 1); /* * Keep track of the number of times we have scanned all the way around - * the loop: + * the loop on this wakeup. */ laps = 0; - DTRACE_PROBE(pageout__start); - /* * Track the number of pages visited during this scan so that we can * periodically measure our duty cycle. */ + nscan_cnt = 0; pcount = 0; - if (pageout_sample_cnt < pageout_sample_lim) { - /* - * We need to measure the rate at which the system is able to - * scan pages of memory. Each of these initial samples is a - * scan of all system memory, regardless of whether or not we - * are experiencing memory pressure. - */ - nscan_limit = total_pages; - sampling = true; - } else { - nscan_limit = desscan; - sampling = false; - } + DTRACE_PROBE5(pageout__start, uint_t, inst, pgcnt_t, desscan, + hrtime_t, pageout_nsec, page_t *, backhand, page_t *, fronthand); + + /* + * Record the initial position of the front hand for this cycle so + * that we can detect when the hand wraps around. + */ + fronthandstart = fronthand; sample_start = gethrtime(); /* * Scan the appropriate number of pages for a single duty cycle. */ - while (nscan < nscan_limit) { + while (nscan_cnt < desscan) { checkpage_result_t rvfront, rvback; - if (!sampling && freemem >= lotsfree + needfree) { + /* + * Only scan while at least one of these is true: + * 1) one or more zones is over its cap + * 2) there is not enough free memory + * 3) during page scan startup when determining sample data + */ + if (!PAGE_SCAN_STARTUP && freemem >= lotsfree + needfree && + !zones_over) { /* * We are not sampling and enough memory has become * available that scanning is no longer required. */ + DTRACE_PROBE1(pageout__memfree, uint_t, inst); break; } + DTRACE_PROBE2(pageout__loop, uint_t, inst, pgcnt_t, pcount); + /* * Periodically check to see if we have exceeded the CPU duty * cycle for a single wakeup. 
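The reset_hands block above carves the memory "clock face" into one region per scanner instance. The sketch below models that arithmetic with arbitrary example values; the real code walks page_t structures with page_nextn() rather than raw indices.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        uint64_t looppages = 1000000;           /* example page count */
        uint32_t n_page_scanners = 4;           /* example scanner count */
        uint64_t span = looppages / n_page_scanners;

        for (uint32_t inst = 0; inst < n_page_scanners; inst++) {
                uint64_t start = inst * span;
                uint64_t end;

                /* The last instance runs up to the final page. */
                if (inst == n_page_scanners - 1)
                        end = looppages - 1;
                else
                        end = start + span - 1;

                printf("scanner %u: pages [%llu, %llu]\n", inst,
                    (unsigned long long)start, (unsigned long long)end);
        }
        return (0);
}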
*/ if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) { + hrtime_t pageout_cycle_nsec; + pageout_cycle_nsec = gethrtime() - sample_start; if (pageout_cycle_nsec >= pageout_nsec) { - ++pageout_timeouts; + if (!zones_over) + atomic_inc_64(&pageout_timeouts); + DTRACE_PROBE1(pageout__timeout, uint_t, inst); break; } } @@ -1077,7 +1323,8 @@ loop: ++pcount; /* - * Protected by pageout_mutex instead of cpu_stat_lock: + * This CPU kstat is only incremented here and we're obviously + * on this CPU, so no lock. */ CPU_STATS_ADDQ(CPU, vm, scan, 1); @@ -1085,26 +1332,48 @@ loop: * Don't include ineligible pages in the number scanned. */ if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) { - nscan++; + nscan_cnt++; } - backhand = page_next(backhand); - fronthand = page_next(fronthand); + if (bhwrapping) { + backhand = regionstart; + bhwrapping = B_FALSE; + } else { + backhand = page_nextn(backhand, tick); + if (backhand == regionend) + bhwrapping = B_TRUE; + } + + if (fhwrapping) { + fronthand = regionstart; + fhwrapping = B_FALSE; + } else { + fronthand = page_nextn(fronthand, tick); + if (fronthand == regionend) + fhwrapping = B_TRUE; + } /* - * The front hand has wrapped around to the first page in the - * loop. + * The front hand has wrapped around during this wakeup. */ - if (fronthand == page_first()) { + if (fronthand == fronthandstart) { laps++; - DTRACE_PROBE1(pageout__hand__wrap, uint_t, laps); + DTRACE_PROBE2(pageout__hand__wrap, uint_t, inst, + uint_t, laps); /* - * Protected by pageout_mutex instead of cpu_stat_lock: + * This CPU kstat is only incremented here and we're + * obviously on this CPU, so no lock. */ CPU_STATS_ADDQ(CPU, vm, rev, 1); - if (laps > 1) { + /* + * If we are scanning due to low memory then, when we + * wrap around memory, we want to try to reclaim more + * pages. + * If scanning only because zones are over their cap, + * then wrapping is common and we simply keep going. + */ + if (laps > 1 && freemem < lotsfree + needfree) { /* * Extremely unlikely, but it happens. * We went around the loop at least once @@ -1123,21 +1392,30 @@ loop: } sample_end = gethrtime(); + atomic_add_long(&nscan, nscan_cnt); - DTRACE_PROBE1(pageout__end, uint_t, laps); + DTRACE_PROBE4(pageout__end, uint_t, inst, uint_t, laps, + pgcnt_t, nscan_cnt, pgcnt_t, pcount) + /* + * The global variables used below are only modified by this thread and + * only during initial scanning when there is a single page scanner + * thread running. + */ if (pageout_new_spread == 0) { - if (pageout_sample_cnt < pageout_sample_lim) { + VERIFY3U(inst, ==, 0); + + if (PAGE_SCAN_STARTUP) { /* * Continue accumulating samples until we have enough - * to get a reasonable value for average scan rate: + * to get a reasonable value for average scan rate. */ pageout_sample_pages += pcount; pageout_sample_etime += sample_end - sample_start; ++pageout_sample_cnt; } - if (pageout_sample_cnt >= pageout_sample_lim) { + if (!PAGE_SCAN_STARTUP) { /* * We have enough samples, set the spread. */ @@ -1223,6 +1501,7 @@ checkpage(struct page *pp, pageout_hand_t whichhand) int isfs = 0; int isexec = 0; int pagesync_flag; + zoneid_t zid = ALL_ZONES; /* * Skip pages: @@ -1265,6 +1544,21 @@ checkpage(struct page *pp, pageout_hand_t whichhand) return (CKP_INELIGIBLE); } + if (zones_over) { + ASSERT(pp->p_zoneid == ALL_ZONES || + pp->p_zoneid >= 0 && pp->p_zoneid <= MAX_ZONEID); + if (pp->p_zoneid == ALL_ZONES || + zone_pdata[pp->p_zoneid].zpers_over == 0) { + /* + * Cross-zone shared page, or zone not over its cap. + * Leave the page alone. 
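The hand-advancement logic above steps each hand forward by tick pages and, once a hand lands on its region's final page, wraps it back to the region start on the following step. A simplified single-hand model with illustrative values (indices stand in for page_t pointers):

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

int
main(void)
{
        uint64_t regionstart = 0, regionend = 9;        /* ten-page region */
        uint64_t tick = 1;
        uint64_t hand = regionstart;
        bool wrapping = false;

        for (int step = 0; step < 15; step++) {
                if (wrapping) {
                        hand = regionstart;
                        wrapping = false;
                } else {
                        hand += tick;           /* page_nextn(hand, tick) */
                        if (hand == regionend)
                                wrapping = true;
                }
                printf("step %2d: hand at page %llu\n", step,
                    (unsigned long long)hand);
        }
        return (0);
}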
+ */ + page_unlock(pp); + return (CKP_INELIGIBLE); + } + zid = pp->p_zoneid; + } + /* * Maintain statistics for what we are freeing */ @@ -1372,6 +1666,11 @@ recheck: VN_RELE(vp); return (CKP_NOT_FREED); } + if (isfs) { + zone_pageout_stat(zid, ZPO_DIRTY); + } else { + zone_pageout_stat(zid, ZPO_ANONDIRTY); + } return (CKP_FREED); } @@ -1398,8 +1697,10 @@ recheck: } else { CPU_STATS_ADD_K(vm, fsfree, 1); } + zone_pageout_stat(zid, ZPO_FS); } else { CPU_STATS_ADD_K(vm, anonfree, 1); + zone_pageout_stat(zid, ZPO_ANON); } return (CKP_FREED); diff --git a/usr/src/uts/common/os/vmem.c b/usr/src/uts/common/os/vmem.c index 7d2b89408a..933834aee9 100644 --- a/usr/src/uts/common/os/vmem.c +++ b/usr/src/uts/common/os/vmem.c @@ -1629,7 +1629,7 @@ vmem_destroy(vmem_t *vmp) leaked = vmem_size(vmp, VMEM_ALLOC); if (leaked != 0) - cmn_err(CE_WARN, "vmem_destroy('%s'): leaked %lu %s", + cmn_err(CE_WARN, "!vmem_destroy('%s'): leaked %lu %s", vmp->vm_name, leaked, (vmp->vm_cflags & VMC_IDENTIFIER) ? "identifiers" : "bytes"); diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index a398830833..fa841df9ff 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015, Joyent Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. */ @@ -106,14 +106,16 @@ * removed from the list of active zones. zone_destroy() returns, and * the zone can be recreated. * - * ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor - * callbacks are executed, and all memory associated with the zone is - * freed. + * ZONE_IS_FREE (internal state): All references have been dropped and + * the zone_t is no longer in the zone_active nor zone_deathrow lists. + * The zone_t is in the process of being freed. This state exists + * only for publishing a sysevent to indicate that the zone by this + * name can be booted again. * - * Threads can wait for the zone to enter a requested state by using - * zone_status_wait() or zone_status_timedwait() with the desired - * state passed in as an argument. Zone state transitions are - * uni-directional; it is not possible to move back to an earlier state. + * Threads can wait for the zone to enter a requested state (other than + * ZONE_IS_FREE) by using zone_status_wait() or zone_status_timedwait() + * with the desired state passed in as an argument. Zone state transitions + * are uni-directional; it is not possible to move back to an earlier state. * * * Zone-Specific Data: @@ -252,6 +254,8 @@ #include <sys/cpucaps.h> #include <vm/seg.h> #include <sys/mac.h> +#include <sys/rt.h> +#include <sys/fx.h> /* * This constant specifies the number of seconds that threads waiting for @@ -312,6 +316,7 @@ static id_space_t *zoneid_space; * 'global_zone'. */ zone_t zone0; +zone_zfs_io_t zone0_zp_zfs; zone_t *global_zone = NULL; /* Set when the global zone is initialized */ /* @@ -327,8 +332,8 @@ static list_t zone_active; static list_t zone_deathrow; static kmutex_t zone_deathrow_lock; -/* number of zones is limited by virtual interface limit in IP */ -uint_t maxzones = 8192; +/* This can be dynamically reduced if various subsystems hit internal limits. 
*/ +uint_t maxzones = MAX_ZONES; /* Event channel to sent zone state change notifications */ evchan_t *zone_event_chan; @@ -350,6 +355,7 @@ const char *zone_status_table[] = { ZONE_EVENT_SHUTTING_DOWN, /* down */ ZONE_EVENT_SHUTTING_DOWN, /* dying */ ZONE_EVENT_UNINITIALIZED, /* dead */ + ZONE_EVENT_FREE, /* free */ }; /* @@ -372,8 +378,12 @@ static char *zone_ref_subsys_names[] = { rctl_hndl_t rc_zone_cpu_shares; rctl_hndl_t rc_zone_locked_mem; rctl_hndl_t rc_zone_max_swap; +rctl_hndl_t rc_zone_phys_mem; rctl_hndl_t rc_zone_max_lofi; rctl_hndl_t rc_zone_cpu_cap; +rctl_hndl_t rc_zone_cpu_baseline; +rctl_hndl_t rc_zone_cpu_burst_time; +rctl_hndl_t rc_zone_zfs_io_pri; rctl_hndl_t rc_zone_nlwps; rctl_hndl_t rc_zone_nprocs; rctl_hndl_t rc_zone_shmmax; @@ -389,6 +399,7 @@ static int zone_remove_datalink(zoneid_t, datalink_id_t); static int zone_list_datalink(zoneid_t, int *, datalink_id_t *); static int zone_set_network(zoneid_t, zone_net_data_t *); static int zone_get_network(zoneid_t, zone_net_data_t *); +static void zone_status_set(zone_t *, zone_status_t); typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t); @@ -419,8 +430,72 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *, * Version 5 alters the zone_boot system call, and converts its old * bootargs parameter to be set by the zone_setattr API instead. * Version 6 adds the flag argument to zone_create. + * Version 7 adds the requested zoneid to zone_create. */ -static const int ZONE_SYSCALL_API_VERSION = 6; +static const int ZONE_SYSCALL_API_VERSION = 7; + +/* + * "zone_pdata" is an array indexed by zoneid. It is used to store "persistent" + * data which can be referenced independently of the zone_t structure. This + * data falls into two categories: + * 1) pages and RSS data associated with processes inside a zone + * 2) in-flight ZFS I/O data + * + * Each member of zone_persist_t stores the zone's current page usage, its page + * limit, a flag indicating if the zone is over its physical memory cap and + * various page-related statistics. The zpers_over flag is the interface for + * the page scanner to use when reclaiming pages for zones that are over their + * cap. The zone_persist_t structure also includes a mutex and a reference to a + * zone_zfs_io_t structure used for tracking the zone's ZFS I/O data. + * + * All zone physical memory cap data is stored in this array instead of within + * the zone structure itself. This is because zone structures come and go, but + * paging-related work can be asynchronous to any particular zone. In + * particular: + * 1) Page scanning to reclaim pages occurs from a kernel thread that is not + * associated with any zone. + * 2) Freeing segkp pages can occur long after the zone which first + * instantiated those pages has gone away. + * We want to be able to account for pages/zone without constantly having to + * take extra locks and find the relevant zone structure, particularly during + * page scanning. + * + * The page scanner can run when "zone_num_over_cap" is non-zero. It can + * do a direct lookup of a zoneid into the "zone_pdata" array to determine + * if that zone is over its cap. + * + * There is no locking for the page scanner to perform these two checks. + * We cannot have the page scanner blocking normal paging activity for + * running processes. Because the physical memory cap is a soft cap, it is + * fine for the scanner to simply read the current state of the counter and + * the zone's zpers_over entry in the array. 
The scanner should never modify + * either of these items. Internally the entries and the counter are managed + * with the "zone_physcap_lock" mutex as we add/remove mappings to pages. We + * take care to ensure that we only take the zone_physcap_lock mutex when a + * zone is transitioning over/under its physical memory cap. + * + * The "zone_incr_capped" and "zone_decr_capped" functions are used to manage + * the "zone_pdata" array and associated counter. + * + * The zone_persist_t structure tracks the zone's physical cap and physical + * usage in terms of pages. These values are currently defined as uint32. Thus, + * the maximum number of pages we can track is UINT_MAX - 1 (4,294,967,294) + * since UINT_MAX means the zone's RSS is unlimited. Assuming a 4k page size, a + * zone's maximum RSS is limited to 17.5 TB and twice that with an 8k page size. + * In the future we may need to expand these counters to 64-bit, but for now + * we're using 32-bit to conserve memory, since this array is statically + * allocated within the kernel based on the maximum number of zones supported. + * + * With respect to the zone_zfs_io_t referenced by the zone_persist_t, under + * a heavy I/O workload, the "zonehash_lock" would become extremely hot if we + * had to continuously find the zone structure associated with an I/O that has + * just completed. To avoid that overhead, we track the I/O data within the + * zone_zfs_io_t instead. We can directly access that data without having to + * lookup the full zone_t structure. + */ +uint_t zone_num_over_cap; +zone_persist_t zone_pdata[MAX_ZONES]; +static kmutex_t zone_physcap_lock; /* * Certain filesystems (such as NFS and autofs) need to know which zone @@ -1379,6 +1454,127 @@ static rctl_qty_t +zone_cpu_base_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_base(p->p_zone)); +} + +/* + * The zone cpu base is used to set the baseline CPU for the zone + * so we can track when the zone is bursting. + */ +/*ARGSUSED*/ +static int +zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_base(zone, nv)); +} + +static rctl_ops_t zone_cpu_base_ops = { + rcop_no_action, + zone_cpu_base_get, + zone_cpu_base_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t +zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_burst_time(p->p_zone)); +} + +/* + * The zone cpu burst time is used to set the amount of time CPU(s) can be + * bursting for the zone. + */ +/*ARGSUSED*/ +static int +zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_burst_time(zone, nv)); +} + +static rctl_ops_t zone_cpu_burst_time_ops = { + rcop_no_action, + zone_cpu_burst_time_get, + zone_cpu_burst_time_set, + rcop_no_test +}; + +/* + * zone.zfs-io-pri resource control support (IO priority). 
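A user-space model of the zone_pdata bookkeeping described above. The real zone_persist_t layout and the zone_incr_capped()/zone_decr_capped() helpers are not shown in this diff, so the structure, the field names beyond those mentioned in the comment, and the transition logic below are an illustrative reconstruction only.

#include <stdio.h>
#include <stdint.h>
#include <pthread.h>

#define MAX_ZONES       64              /* stand-in for the real constant */

typedef struct {
        uint32_t zpers_pg_cnt;          /* current resident pages */
        uint32_t zpers_pg_limit;        /* cap in pages; UINT32_MAX = none */
        uint8_t  zpers_over;            /* read lock-free by the scanner */
} zone_persist_model_t;

static zone_persist_model_t zone_pdata[MAX_ZONES];
static unsigned int zone_num_over_cap;
static pthread_mutex_t zone_physcap_lock = PTHREAD_MUTEX_INITIALIZER;

/* Account for pages added to a zone; take the lock only on a transition. */
static void
zone_pages_added(int zid, uint32_t pages)
{
        zone_persist_model_t *zp = &zone_pdata[zid];

        zp->zpers_pg_cnt += pages;
        if (zp->zpers_over == 0 && zp->zpers_pg_cnt > zp->zpers_pg_limit) {
                pthread_mutex_lock(&zone_physcap_lock);
                zp->zpers_over = 1;
                zone_num_over_cap++;
                pthread_mutex_unlock(&zone_physcap_lock);
        }
}

int
main(void)
{
        zone_pdata[3].zpers_pg_limit = 1000;    /* example cap for zone 3 */
        zone_pages_added(3, 1500);

        /* The page scanner only needs these two lock-free reads. */
        printf("zones over cap: %u, zone 3 over: %u\n",
            zone_num_over_cap, (unsigned)zone_pdata[3].zpers_over);
        return (0);
}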
+ */ +/*ARGSUSED*/ +static rctl_qty_t +zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p) +{ + zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id]; + rctl_qty_t r = 0; + + ASSERT(MUTEX_HELD(&p->p_lock)); + mutex_enter(&zp->zpers_zfs_lock); + if (zp->zpers_zfsp != NULL) + r = (rctl_qty_t)zp->zpers_zfsp->zpers_zfs_io_pri; + mutex_exit(&zp->zpers_zfs_lock); + + return (r); +} + +/*ARGSUSED*/ +static int +zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + zone_persist_t *zp; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + /* + * set priority to the new value. + */ + zp = &zone_pdata[zone->zone_id]; + mutex_enter(&zp->zpers_zfs_lock); + if (zp->zpers_zfsp != NULL) + zp->zpers_zfsp->zpers_zfs_io_pri = (uint16_t)nv; + mutex_exit(&zp->zpers_zfs_lock); + return (0); +} + +static rctl_ops_t zone_zfs_io_pri_ops = { + rcop_no_action, + zone_zfs_io_pri_get, + zone_zfs_io_pri_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_lwps_usage(rctl_t *r, proc_t *p) { rctl_qty_t nlwps; @@ -1705,6 +1901,57 @@ static rctl_ops_t zone_max_swap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_phys_mem_usage(rctl_t *rctl, struct proc *p) +{ + rctl_qty_t q; + zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id]; + + ASSERT(MUTEX_HELD(&p->p_lock)); + q = ptob(zp->zpers_pg_cnt); + return (q); +} + +/*ARGSUSED*/ +static int +zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zoneid_t zid; + uint_t pg_val; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + if (e->rcep_p.zone == NULL) + return (0); + zid = e->rcep_p.zone->zone_id; + if (nv == UINT64_MAX) { + pg_val = UINT32_MAX; + } else { + uint64_t pages = btop(nv); + + /* + * Return from RCTLOP_SET is always ignored so just clamp an + * out-of-range value to our largest "limited" value. + */ + if (pages >= UINT32_MAX) { + pg_val = UINT32_MAX - 1; + } else { + pg_val = (uint_t)pages; + } + } + zone_pdata[zid].zpers_pg_limit = pg_val; + return (0); +} + +static rctl_ops_t zone_phys_mem_ops = { + rcop_no_action, + zone_phys_mem_usage, + zone_phys_mem_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_max_lofi_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; @@ -1798,6 +2045,21 @@ zone_lockedmem_kstat_update(kstat_t *ksp, int rw) } static int +zone_physmem_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_kstat_t *zk = ksp->ks_data; + zone_persist_t *zp = &zone_pdata[zone->zone_id]; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zk->zk_usage.value.ui64 = ptob(zp->zpers_pg_cnt); + zk->zk_value.value.ui64 = ptob(zp->zpers_pg_limit); + return (0); +} + +static int zone_nprocs_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; @@ -1826,7 +2088,7 @@ zone_swapresv_kstat_update(kstat_t *ksp, int rw) } static kstat_t * -zone_kstat_create_common(zone_t *zone, char *name, +zone_rctl_kstat_create_common(zone_t *zone, char *name, int (*updatefunc) (kstat_t *, int)) { kstat_t *ksp; @@ -1851,16 +2113,200 @@ zone_kstat_create_common(zone_t *zone, char *name, return (ksp); } +static int +zone_vfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_vfs_kstat_t *zvp = ksp->ks_data; + kstat_io_t *kiop = &zone->zone_vfs_rwstats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Extract the VFS statistics from the kstat_io_t structure used by + * kstat_runq_enter() and related functions. 
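The byte-to-page clamping in zone_phys_mem_set() above can be checked with a small worked example; a 4 KiB page size is assumed here, with btop(nv) modeled as nv / 4096.

#include <stdio.h>
#include <stdint.h>

static uint32_t
phys_mem_to_pg_limit(uint64_t nv)
{
        if (nv == UINT64_MAX)
                return (UINT32_MAX);            /* unlimited */

        uint64_t pages = nv / 4096;             /* btop(nv), assuming 4 KiB */

        /* Clamp an out-of-range cap to the largest "limited" value. */
        if (pages >= UINT32_MAX)
                return (UINT32_MAX - 1);
        return ((uint32_t)pages);
}

int
main(void)
{
        printf("2 GiB cap -> %u pages\n", phys_mem_to_pg_limit(2ULL << 30));
        printf("unlimited -> %u\n", phys_mem_to_pg_limit(UINT64_MAX));
        printf("huge cap  -> %u pages\n",
            phys_mem_to_pg_limit(UINT64_MAX - 1));
        return (0);
}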
Since the slow ops + * counters are updated directly by the VFS layer, there's no need to + * copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. + */ + zvp->zv_nread.value.ui64 = kiop->nread; + zvp->zv_reads.value.ui64 = kiop->reads; + zvp->zv_rtime.value.ui64 = kiop->rtime; + zvp->zv_rcnt.value.ui64 = kiop->rcnt; + zvp->zv_rlentime.value.ui64 = kiop->rlentime; + zvp->zv_nwritten.value.ui64 = kiop->nwritten; + zvp->zv_writes.value.ui64 = kiop->writes; + zvp->zv_wtime.value.ui64 = kiop->wtime; + zvp->zv_wcnt.value.ui64 = kiop->wcnt; + zvp->zv_wlentime.value.ui64 = kiop->wlentime; + + scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_vfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_vfs_kstat_t *zvp; + + if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id, + zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED, + sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_vfs_lock; + zone->zone_vfs_stats = zvp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zvp->zv_zonename, zone->zone_name); + kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_vfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int +zone_zfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_zfs_kstat_t *zzp = ksp->ks_data; + zone_persist_t *zp = &zone_pdata[zone->zone_id]; + + if (rw == KSTAT_WRITE) + return (EACCES); + + mutex_enter(&zp->zpers_zfs_lock); + if (zp->zpers_zfsp == NULL) { + zzp->zz_nread.value.ui64 = 0; + zzp->zz_reads.value.ui64 = 0; + zzp->zz_rtime.value.ui64 = 0; + zzp->zz_rlentime.value.ui64 = 0; + zzp->zz_nwritten.value.ui64 = 0; + zzp->zz_writes.value.ui64 = 0; + zzp->zz_waittime.value.ui64 = 0; + } else { + kstat_io_t *kiop = &zp->zpers_zfsp->zpers_zfs_rwstats; + + /* + * Extract the ZFS 
statistics from the kstat_io_t structure + * used by kstat_runq_enter() and related functions. Since the + * I/O throttle counters are updated directly by the ZFS layer, + * there's no need to copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. + */ + zzp->zz_nread.value.ui64 = kiop->nread; + zzp->zz_reads.value.ui64 = kiop->reads; + zzp->zz_rtime.value.ui64 = kiop->rtime; + zzp->zz_rlentime.value.ui64 = kiop->rlentime; + zzp->zz_nwritten.value.ui64 = kiop->nwritten; + zzp->zz_writes.value.ui64 = kiop->writes; + zzp->zz_waittime.value.ui64 = + zp->zpers_zfsp->zpers_zfs_rd_waittime; + } + mutex_exit(&zp->zpers_zfs_lock); + + scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64); + scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_zfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_zfs_kstat_t *zzp; + + if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id, + zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED, + sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_zfs_lock; + zone->zone_zfs_stats = zzp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zzp->zz_zonename, zone->zone_name); + kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_zfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} static int zone_mcap_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; zone_mcap_kstat_t *zmp = ksp->ks_data; + zone_persist_t *zp; if (rw == KSTAT_WRITE) return (EACCES); + zp = &zone_pdata[zone->zone_id]; + + zmp->zm_rss.value.ui64 = ptob(zp->zpers_pg_cnt); + zmp->zm_phys_cap.value.ui64 = ptob(zp->zpers_pg_limit); + zmp->zm_swap.value.ui64 = zone->zone_max_swap; + zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl; + zmp->zm_nover.value.ui64 = zp->zpers_nover; +#ifndef DEBUG + zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_out); +#else + zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_fsdirty + + zp->zpers_pg_fs + zp->zpers_pg_anon + zp->zpers_pg_anondirty); +#endif zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin; zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin; zmp->zm_execpgin.value.ui64 = zone->zone_execpgin; @@ -1893,6 +2339,12 @@ zone_mcap_kstat_create(zone_t *zone) /* The kstat "name" field is not large enough for a full zonename */ kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING); kstat_named_setstr(&zmp->zm_zonename, zone->zone_name); + kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64); + 
kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64); @@ -1942,9 +2394,12 @@ zone_misc_kstat_update(kstat_t *ksp, int rw) zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem; zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc; + zmp->zm_mfseglim.value.ui32 = zone->zone_mfseglim; + zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp; zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid; + zmp->zm_init_restarts.value.ui32 = zone->zone_proc_init_restarts; zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time; return (0); @@ -1985,9 +2440,13 @@ zone_misc_kstat_create(zone_t *zone) KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32); + kstat_named_init(&zmp->zm_mfseglim, "mapfail_seglim", + KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_nested_intp, "nested_interp", KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32); + kstat_named_init(&zmp->zm_init_restarts, "init_restarts", + KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64); ksp->ks_update = zone_misc_kstat_update; @@ -2000,13 +2459,25 @@ zone_misc_kstat_create(zone_t *zone) static void zone_kstat_create(zone_t *zone) { - zone->zone_lockedmem_kstat = zone_kstat_create_common(zone, + zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone, "lockedmem", zone_lockedmem_kstat_update); - zone->zone_swapresv_kstat = zone_kstat_create_common(zone, + zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone, "swapresv", zone_swapresv_kstat_update); - zone->zone_nprocs_kstat = zone_kstat_create_common(zone, + zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone, + "physicalmem", zone_physmem_kstat_update); + zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone, "nprocs", zone_nprocs_kstat_update); + if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) { + zone->zone_vfs_stats = kmem_zalloc( + sizeof (zone_vfs_kstat_t), KM_SLEEP); + } + + if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) { + zone->zone_zfs_stats = kmem_zalloc( + sizeof (zone_zfs_kstat_t), KM_SLEEP); + } + if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) { zone->zone_mcap_stats = kmem_zalloc( sizeof (zone_mcap_kstat_t), KM_SLEEP); @@ -2038,8 +2509,15 @@ zone_kstat_delete(zone_t *zone) sizeof (zone_kstat_t)); zone_kstat_delete_common(&zone->zone_swapresv_kstat, sizeof (zone_kstat_t)); + zone_kstat_delete_common(&zone->zone_physmem_kstat, + sizeof (zone_kstat_t)); zone_kstat_delete_common(&zone->zone_nprocs_kstat, sizeof (zone_kstat_t)); + + zone_kstat_delete_common(&zone->zone_vfs_ksp, + sizeof (zone_vfs_kstat_t)); + zone_kstat_delete_common(&zone->zone_zfs_ksp, + sizeof (zone_zfs_kstat_t)); zone_kstat_delete_common(&zone->zone_mcap_ksp, sizeof (zone_mcap_kstat_t)); zone_kstat_delete_common(&zone->zone_misc_ksp, @@ -2101,8 +2579,12 @@ zone_zsd_init(void) zone0.zone_initname = initname; zone0.zone_lockedmem_kstat = NULL; zone0.zone_swapresv_kstat = NULL; + zone0.zone_physmem_kstat = NULL; zone0.zone_nprocs_kstat = NULL; + zone_pdata[0].zpers_zfsp = &zone0_zp_zfs; + 
zone_pdata[0].zpers_zfsp->zpers_zfs_io_pri = 1; + list_create(&zone0.zone_ref_list, sizeof (zone_ref_t), offsetof(zone_ref_t, zref_linkage)); list_create(&zone0.zone_zsd, sizeof (struct zsd_entry), @@ -2209,6 +2691,21 @@ zone_init(void) RCTL_GLOBAL_INFINITE, MAXCAP, MAXCAP, &zone_cpu_cap_ops); + rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + MAXCAP, MAXCAP, &zone_cpu_base_ops); + + rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + INT_MAX, INT_MAX, &zone_cpu_burst_time_ops); + + rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + 16384, 16384, &zone_zfs_io_pri_ops); + rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE, RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT, INT_MAX, INT_MAX, &zone_lwps_ops); @@ -2250,6 +2747,20 @@ zone_init(void) rde = rctl_dict_lookup("zone.cpu-shares"); (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + /* + * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach + * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority'. + */ + dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); + bzero(dval, sizeof (rctl_val_t)); + dval->rcv_value = 1; + dval->rcv_privilege = RCPRIV_PRIVILEGED; + dval->rcv_flagaction = RCTL_LOCAL_NOACTION; + dval->rcv_action_recip_pid = -1; + + rde = rctl_dict_lookup("zone.zfs-io-priority"); + (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + rc_zone_locked_mem = rctl_register("zone.max-locked-memory", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2260,6 +2771,11 @@ zone_init(void) RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, &zone_max_swap_ops); + rc_zone_phys_mem = rctl_register("zone.max-physical-memory", + RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | + RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, + &zone_phys_mem_ops); + rc_zone_max_lofi = rctl_register("zone.max-lofi", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2283,6 +2799,7 @@ zone_init(void) zone0.zone_restart_init = B_TRUE; zone0.zone_reboot_on_init_exit = B_FALSE; zone0.zone_restart_init_0 = B_FALSE; + zone0.zone_init_status = -1; zone0.zone_brand = &native_brand; rctl_prealloc_destroy(gp); /* @@ -2364,6 +2881,8 @@ zone_init(void) static void zone_free(zone_t *zone) { + zone_dl_t *zdl; + ASSERT(zone != global_zone); ASSERT(zone->zone_ntasks == 0); ASSERT(zone->zone_nlwps == 0); @@ -2379,6 +2898,9 @@ zone_free(zone_t *zone) */ cpucaps_zone_remove(zone); + /* Clear physical memory capping data. */ + bzero(&zone_pdata[zone->zone_id], sizeof (zone_persist_t)); + ASSERT(zone->zone_cpucap == NULL); /* remove from deathrow list */ @@ -2392,8 +2914,30 @@ zone_free(zone_t *zone) list_destroy(&zone->zone_ref_list); zone_free_zsd(zone); zone_free_datasets(zone); + + /* + * While dlmgmtd should have removed all of these, it could have left + * something behind or crashed. In which case it's not safe for us to + * assume that the list is empty which list_destroy() will ASSERT. 
We + * clean up for our userland comrades which may have crashed, or worse, + * been disabled by SMF. + */ + while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) { + if (zdl->zdl_net != NULL) + nvlist_free(zdl->zdl_net); + kmem_free(zdl, sizeof (zone_dl_t)); + } list_destroy(&zone->zone_dl_list); + /* + * This zone_t can no longer inhibit creation of another zone_t + * with the same name or debug ID. Generate a sysevent so that + * userspace tools know it is safe to carry on. + */ + mutex_enter(&zone_status_lock); + zone_status_set(zone, ZONE_IS_FREE); + mutex_exit(&zone_status_lock); + cpu_uarray_free(zone->zone_ustate); if (zone->zone_rootvp != NULL) @@ -2438,11 +2982,17 @@ zone_free(zone_t *zone) static void zone_status_set(zone_t *zone, zone_status_t status) { + timestruc_t now; + uint64_t t; nvlist_t *nvl = NULL; ASSERT(MUTEX_HELD(&zone_status_lock)); - ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE && - status >= zone_status_get(zone)); + ASSERT((status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE || + status == ZONE_IS_FREE) && status >= zone_status_get(zone)); + + /* Current time since Jan 1 1970 but consumers expect NS */ + gethrestime(&now); + t = (now.tv_sec * NANOSEC) + now.tv_nsec; if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) || nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) || @@ -2451,12 +3001,14 @@ zone_status_set(zone_t *zone, zone_status_t status) nvlist_add_string(nvl, ZONE_CB_OLDSTATE, zone_status_table[zone->zone_status]) || nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) || - nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) || + nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) || sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS, ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) { #ifdef DEBUG (void) printf( "Failed to allocate and send zone state change event.\n"); +#else + /* EMPTY */ #endif } nvlist_free(nvl); @@ -2476,6 +3028,38 @@ zone_status_get(zone_t *zone) return (zone->zone_status); } +/* + * Publish a zones-related sysevent for purposes other than zone state changes. + * While it is unfortunate that zone_event_chan is associated with + * "com.sun:zones:status" (rather than "com.sun:zones") state changes should be + * the only ones with class "status" and subclass "change". + */ +void +zone_sysevent_publish(zone_t *zone, const char *class, const char *subclass, + nvlist_t *ev_nvl) +{ + nvlist_t *nvl = NULL; + timestruc_t now; + uint64_t t; + + gethrestime(&now); + t = (now.tv_sec * NANOSEC) + now.tv_nsec; + + if (nvlist_dup(ev_nvl, &nvl, KM_SLEEP) != 0 || + nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) != 0 || + nvlist_add_uint64(nvl, ZONE_CB_ZONEID, zone->zone_id) != 0 || + nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) != 0 || + sysevent_evc_publish(zone_event_chan, class, subclass, "sun.com", + "kernel", nvl, EVCH_SLEEP) != 0) { +#ifdef DEBUG + (void) printf("Failed to allocate and send zone misc event.\n"); +#else + /* EMPTY */ +#endif + } + nvlist_free(nvl); +} + static int zone_set_bootargs(zone_t *zone, const char *zone_bootargs) { @@ -2529,9 +3113,14 @@ zone_set_brand(zone_t *zone, const char *brand) return (EINVAL); } - /* set up the brand specific data */ + /* + * Set up the brand specific data. + * Note that it's possible that the hook has to drop the + * zone_status_lock and reaquire it before returning so we can't + * assume the lock has been held the entire time. 
+ */ zone->zone_brand = bp; - ZBROP(zone)->b_init_brand_data(zone); + ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock); mutex_exit(&zone_status_lock); return (0); @@ -2604,18 +3193,6 @@ zone_set_initname(zone_t *zone, const char *zone_initname) } static int -zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap) -{ - uint64_t mcap; - int err = 0; - - if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0) - zone->zone_phys_mcap = mcap; - - return (err); -} - -static int zone_set_sched_class(zone_t *zone, const char *new_class) { char sched_class[PC_CLNMSZ]; @@ -3022,6 +3599,12 @@ getzoneid(void) return (curproc->p_zone->zone_id); } +zoneid_t +getzonedid(void) +{ + return (curproc->p_zone->zone_did); +} + /* * Internal versions of zone_find_by_*(). These don't zone_hold() or * check the validity of a zone's state. @@ -3768,6 +4351,17 @@ zone_start_init(void) */ z->zone_proc_initpid = p->p_pid; + if (z->zone_setup_app_contract == B_TRUE) { + /* + * Normally a process cannot modify its own contract, but we're + * just starting the zone's init process and its contract is + * always initialized from the sys_process_tmpl template, so + * this is the simplest way to setup init's contract to kill + * the process if any other process in the contract exits. + */ + p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT; + } + /* * We maintain zone_boot_err so that we can return the cause of the * failure back to the caller of the zone_boot syscall. @@ -3796,9 +4390,54 @@ zone_start_init(void) lwp_exit(); } } else { + id_t cid = curthread->t_cid; + if (zone_status_get(z) == ZONE_IS_BOOTING) zone_status_set(z, ZONE_IS_RUNNING); mutex_exit(&zone_status_lock); + + mutex_enter(&class_lock); + ASSERT(cid < loaded_classes); + if (strcmp(sclass[cid].cl_name, "FX") == 0 && + z->zone_fixed_hipri) { + /* + * If the zone is using FX then by default all + * processes start at the lowest priority and stay + * there. We provide a mechanism for the zone to + * indicate that it should run at "high priority". In + * this case we setup init to run at the highest FX + * priority (which is one level higher than the + * non-fixed scheduling classes can use). + */ + pcparms_t pcparms; + + pcparms.pc_cid = cid; + ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim = + FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags = + FX_DOUPRILIM | FX_DOUPRI; + + mutex_enter(&pidlock); + mutex_enter(&curproc->p_lock); + + (void) parmsset(&pcparms, curthread); + + mutex_exit(&curproc->p_lock); + mutex_exit(&pidlock); + } else if (strcmp(sclass[cid].cl_name, "RT") == 0) { + /* + * zsched always starts the init lwp at priority + * minclsyspri - 1. This priority gets set in t_pri and + * is invalid for RT, but RT never uses t_pri. However + * t_pri is used by procfs, so we always see processes + * within an RT zone with an invalid priority value. + * We fix that up now. + */ + curthread->t_pri = RTGPPRIO0; + } + mutex_exit(&class_lock); + /* cause the process to return to userland. 
*/ lwp_rtt(); } @@ -3839,7 +4478,11 @@ zsched(void *arg) bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched")); PTOU(pp)->u_argc = 0; PTOU(pp)->u_argv = 0; + PTOU(pp)->u_argvstrs = 0; + PTOU(pp)->u_argvstrsize = 0; PTOU(pp)->u_envp = 0; + PTOU(pp)->u_envstrs = 0; + PTOU(pp)->u_envstrsize = 0; PTOU(pp)->u_commpagep = 0; closeall(P_FINFO(pp)); @@ -4284,8 +4927,9 @@ parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp) error = EINVAL; name = nvpair_name(nvp); - if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1) - != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { + if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 && + strncmp(name, "project.", sizeof ("project.") - 1) != 0) || + nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { goto out; } if ((hndl = rctl_hndl_lookup(name)) == -1) { @@ -4404,7 +5048,7 @@ zone_create(const char *zone_name, const char *zone_root, caddr_t rctlbuf, size_t rctlbufsz, caddr_t zfsbuf, size_t zfsbufsz, int *extended_error, int match, uint32_t doi, const bslabel_t *label, - int flags) + int flags, zoneid_t zone_did) { struct zsched_arg zarg; nvlist_t *rctls = NULL; @@ -4476,6 +5120,7 @@ zone_create(const char *zone_name, const char *zone_root, zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP); zone->zone_id = zoneid; + zone->zone_did = zone_did; zone->zone_status = ZONE_IS_UNINITIALIZED; zone->zone_pool = pool_default; zone->zone_pool_mod = gethrtime(); @@ -4485,6 +5130,7 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_restart_init = B_TRUE; zone->zone_reboot_on_init_exit = B_FALSE; zone->zone_restart_init_0 = B_FALSE; + zone->zone_init_status = -1; zone->zone_brand = &native_brand; zone->zone_initname = NULL; mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); @@ -4551,8 +5197,13 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_max_swap_ctl = UINT64_MAX; zone->zone_max_lofi = 0; zone->zone_max_lofi_ctl = UINT64_MAX; - zone0.zone_lockedmem_kstat = NULL; - zone0.zone_swapresv_kstat = NULL; + zone->zone_lockedmem_kstat = NULL; + zone->zone_swapresv_kstat = NULL; + zone->zone_physmem_kstat = NULL; + + zone_pdata[zoneid].zpers_zfsp = + kmem_zalloc(sizeof (zone_zfs_io_t), KM_SLEEP); + zone_pdata[zoneid].zpers_zfsp->zpers_zfs_io_pri = 1; zone->zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP); @@ -4561,6 +5212,13 @@ zone_create(const char *zone_name, const char *zone_root, */ zone->zone_rctls = NULL; + /* + * Ensure page count is 0 (in case zoneid has wrapped). + * Initialize physical memory cap as unlimited. + */ + zone_pdata[zoneid].zpers_pg_cnt = 0; + zone_pdata[zoneid].zpers_pg_limit = UINT32_MAX; + if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) { zone_free(zone); return (zone_create_error(error, 0, extended_error)); @@ -4709,8 +5367,8 @@ zone_create(const char *zone_name, const char *zone_root, /* * The process, task, and project rctls are probably wrong; * we need an interface to get the default values of all rctls, - * and initialize zsched appropriately. I'm not sure that that - * makes much of a difference, though. + * and initialize zsched appropriately. However, we allow zoneadmd + * to pass down both zone and project rctls for the zone's init. 
*/ error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0); if (error != 0) { @@ -4849,6 +5507,7 @@ zone_boot(zoneid_t zoneid) static int zone_empty(zone_t *zone) { + int cnt = 0; int waitstatus; /* @@ -4859,7 +5518,16 @@ zone_empty(zone_t *zone) ASSERT(MUTEX_NOT_HELD(&zonehash_lock)); while ((waitstatus = zone_status_timedwait_sig(zone, ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) { - killall(zone->zone_id); + boolean_t force = B_FALSE; + + /* Every 30 seconds, try harder */ + if (cnt++ >= 30) { + cmn_err(CE_WARN, "attempt to force kill zone %d\n", + zone->zone_id); + force = B_TRUE; + cnt = 0; + } + killall(zone->zone_id, force); } /* * return EINTR if we were signaled @@ -5188,6 +5856,7 @@ zone_destroy(zoneid_t zoneid) zone_status_t status; clock_t wait_time; boolean_t log_refcounts; + zone_persist_t *zp; if (secpolicy_zone_config(CRED()) != 0) return (set_errno(EPERM)); @@ -5221,6 +5890,12 @@ zone_destroy(zoneid_t zoneid) zone_hold(zone); mutex_exit(&zonehash_lock); + zp = &zone_pdata[zoneid]; + mutex_enter(&zp->zpers_zfs_lock); + kmem_free(zp->zpers_zfsp, sizeof (zone_zfs_io_t)); + zp->zpers_zfsp = NULL; + mutex_exit(&zp->zpers_zfs_lock); + /* * wait for zsched to exit */ @@ -5610,14 +6285,6 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) error = EFAULT; } break; - case ZONE_ATTR_PHYS_MCAP: - size = sizeof (zone->zone_phys_mcap); - if (bufsize > size) - bufsize = size; - if (buf != NULL && - copyout(&zone->zone_phys_mcap, buf, bufsize) != 0) - error = EFAULT; - break; case ZONE_ATTR_SCHED_CLASS: mutex_enter(&class_lock); @@ -5681,6 +6348,23 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) } kmem_free(zbuf, bufsize); break; + case ZONE_ATTR_DID: + size = sizeof (zoneid_t); + if (bufsize > size) + bufsize = size; + + if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0) + error = EFAULT; + break; + case ZONE_ATTR_SCHED_FIXEDHI: + size = sizeof (boolean_t); + if (bufsize > size) + bufsize = size; + + if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf, + bufsize) != 0) + error = EFAULT; + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { size = bufsize; @@ -5712,10 +6396,9 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) return (set_errno(EPERM)); /* - * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the - * global zone. + * No attributes can be set on the global zone. */ - if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) { + if (zoneid == GLOBAL_ZONEID) { return (set_errno(EINVAL)); } @@ -5728,11 +6411,11 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) mutex_exit(&zonehash_lock); /* - * At present most attributes can only be set on non-running, + * At present attributes can only be set on non-running, * non-global zones. 
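The retry loop in zone_empty() above waits roughly one second per attempt for the zone to empty and escalates to a forced kill after about 30 attempts. A bounded user-space sketch of that shape; zone_is_empty() and kill_zone_procs() are hypothetical stand-ins for the kernel primitives.

#include <stdbool.h>
#include <stdio.h>

static bool
zone_is_empty(void)
{
        return (false);                 /* stub: pretend processes remain */
}

static void
kill_zone_procs(bool force)
{
        printf("killall(zone, force=%d)\n", force);
}

int
main(void)
{
        int cnt = 0;

        for (int attempt = 1; attempt <= 35 && !zone_is_empty(); attempt++) {
                bool force = false;

                /* Stand-in for the one-second zone_status_timedwait_sig(). */
                if (cnt++ >= 30) {      /* "try harder" roughly every 30s */
                        force = true;
                        cnt = 0;
                }
                kill_zone_procs(force);
        }
        return (0);
}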
*/ zone_status = zone_status_get(zone); - if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) { + if (zone_status > ZONE_IS_READY) { err = EINVAL; goto done; } @@ -5765,9 +6448,6 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) case ZONE_ATTR_SECFLAGS: err = zone_set_secflags(zone, (psecflags_t *)buf); break; - case ZONE_ATTR_PHYS_MCAP: - err = zone_set_phys_mcap(zone, (const uint64_t *)buf); - break; case ZONE_ATTR_SCHED_CLASS: err = zone_set_sched_class(zone, (const char *)buf); break; @@ -5795,6 +6475,22 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) err = zone_set_network(zoneid, zbuf); kmem_free(zbuf, bufsize); break; + case ZONE_ATTR_APP_SVC_CT: + if (bufsize != sizeof (boolean_t)) { + err = EINVAL; + } else { + zone->zone_setup_app_contract = (boolean_t)buf; + err = 0; + } + break; + case ZONE_ATTR_SCHED_FIXEDHI: + if (bufsize != sizeof (boolean_t)) { + err = EINVAL; + } else { + zone->zone_fixed_hipri = (boolean_t)buf; + err = 0; + } + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize); @@ -6493,6 +7189,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) zs.doi = zs32.doi; zs.label = (const bslabel_t *)(uintptr_t)zs32.label; zs.flags = zs32.flags; + zs.zoneid = zs32.zoneid; #else panic("get_udatamodel() returned bogus result\n"); #endif @@ -6503,7 +7200,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) (caddr_t)zs.rctlbuf, zs.rctlbufsz, (caddr_t)zs.zfsbuf, zs.zfsbufsz, zs.extended_error, zs.match, zs.doi, - zs.label, zs.flags)); + zs.label, zs.flags, zs.zoneid)); case ZONE_BOOT: return (zone_boot((zoneid_t)(uintptr_t)arg1)); case ZONE_DESTROY: @@ -6604,6 +7301,7 @@ zone_ki_call_zoneadmd(struct zarg *zargp) bcopy(zone->zone_name, zone_name, zone_namelen); zoneid = zone->zone_id; uniqid = zone->zone_uniqid; + arg.status = zone->zone_init_status; /* * zoneadmd may be down, but at least we can empty out the zone. * We can ignore the return value of zone_empty() since we're called @@ -6781,7 +7479,7 @@ zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp) * zone_ki_call_zoneadmd() will do a more thorough job of this * later. */ - killall(zone->zone_id); + killall(zone->zone_id, B_FALSE); /* * Now, create the thread to contact zoneadmd and do the rest of the * work. This thread can't be created in our zone otherwise @@ -6844,16 +7542,15 @@ zone_shutdown_global(void) } /* - * Returns true if the named dataset is visible in the current zone. + * Returns true if the named dataset is visible in the specified zone. * The 'write' parameter is set to 1 if the dataset is also writable. */ int -zone_dataset_visible(const char *dataset, int *write) +zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write) { static int zfstype = -1; zone_dataset_t *zd; size_t len; - zone_t *zone = curproc->p_zone; const char *name = NULL; vfs_t *vfsp = NULL; @@ -6921,7 +7618,8 @@ zone_dataset_visible(const char *dataset, int *write) vfs_list_read_lock(); vfsp = zone->zone_vfslist; do { - ASSERT(vfsp); + if (vfsp == NULL) + break; if (vfsp->vfs_fstype == zfstype) { name = refstr_value(vfsp->vfs_resource); @@ -6958,6 +7656,18 @@ zone_dataset_visible(const char *dataset, int *write) } /* + * Returns true if the named dataset is visible in the current zone. + * The 'write' parameter is set to 1 if the dataset is also writable. 
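zone_dataset_visible_inzone() decides visibility largely by prefix-matching the requested dataset against the zone's delegated datasets: the delegated dataset and its children are writable, while ancestors are visible read-only. The sketch below models only that prefix logic in userland; the delegated[] list, dataset_visible() helper and sample names are hypothetical, and the real routine additionally walks the zone's mounted ZFS filesystems and handles snapshot names.

/* Userland sketch; names and data are invented for illustration. */
#include <stdio.h>
#include <string.h>

static const char *delegated[] = { "tank/zones/web/data", NULL };

/* Returns 1 if visible; *write is set to 1 only if also writable. */
static int
dataset_visible(const char *ds, int *write)
{
	size_t qlen = strlen(ds);
	size_t dlen;
	int i;

	for (i = 0; delegated[i] != NULL; i++) {
		const char *d = delegated[i];

		dlen = strlen(d);

		/* The delegated dataset itself, or any child of it: rw. */
		if (strncmp(ds, d, dlen) == 0 &&
		    (ds[dlen] == '\0' || ds[dlen] == '/')) {
			*write = 1;
			return (1);
		}

		/* An ancestor of the delegated dataset: visible, ro. */
		if (strncmp(d, ds, qlen) == 0 && d[qlen] == '/') {
			*write = 0;
			return (1);
		}
	}
	return (0);
}

int
main(void)
{
	const char *q[] = { "tank", "tank/zones/web/data/db", "tank/other" };
	int i, w;

	for (i = 0; i < 3; i++) {
		if (dataset_visible(q[i], &w))
			printf("%-24s visible, %s\n", q[i], w ? "rw" : "ro");
		else
			printf("%-24s not visible\n", q[i]);
	}
	return (0);
}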
+ */ +int +zone_dataset_visible(const char *dataset, int *write) +{ + zone_t *zone = curproc->p_zone; + + return (zone_dataset_visible_inzone(zone, dataset, write)); +} + +/* * zone_find_by_any_path() - * * kernel-private routine similar to zone_find_by_path(), but which @@ -7059,6 +7769,27 @@ zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid) zone_t *zone; zone_t *thiszone; + /* + * Only the GZ may add a datalink to a zone's list. + */ + if (getzoneid() != GLOBAL_ZONEID) + return (set_errno(EPERM)); + + /* + * Only a process with the datalink config priv may add a + * datalink to a zone's list. + */ + if (secpolicy_dl_config(CRED()) != 0) + return (set_errno(EPERM)); + + /* + * When links exist in the GZ, they aren't added to the GZ's + * zone_dl_list. We must enforce this because link_activate() + * depends on zone_check_datalink() returning only NGZs. + */ + if (zoneid == GLOBAL_ZONEID) + return (set_errno(EINVAL)); + if ((thiszone = zone_find_by_id(zoneid)) == NULL) return (set_errno(ENXIO)); @@ -7091,6 +7822,26 @@ zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid) zone_t *zone; int err = 0; + /* + * Only the GZ may remove a datalink from a zone's list. + */ + if (getzoneid() != GLOBAL_ZONEID) + return (set_errno(EPERM)); + + /* + * Only a process with the datalink config priv may remove a + * datalink from a zone's list. + */ + if (secpolicy_dl_config(CRED()) != 0) + return (set_errno(EPERM)); + + /* + * If we can't add a datalink to the GZ's zone_dl_list then we + * certainly can't remove one either. + */ + if (zoneid == GLOBAL_ZONEID) + return (set_errno(EINVAL)); + if ((zone = zone_find_by_id(zoneid)) == NULL) return (set_errno(EINVAL)); @@ -7108,25 +7859,63 @@ zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid) } /* - * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned - * the linkid. Otherwise we just check if the specified zoneidp has been - * assigned the supplied linkid. + * + * This function may be used in two ways: + * + * 1. to get the zoneid of the zone this link is under, or + * + * 2. to verify that the link is under a specific zone. + * + * The first use is achieved by passing a zoneid of ALL_ZONES. The + * function then iterates the datalink list of every zone on the + * system until it finds the linkid. If the linkid is found then the + * function returns 0 and zoneidp is updated. Otherwise, ENXIO is + * returned and zoneidp is not modified. The use of ALL_ZONES is + * limited to callers in the GZ to prevent leaking information to + * NGZs. If an NGZ passes ALL_ZONES its query is implicitly changed + * to the second type in the list above. + * + * The second use is achieved by passing a specific zoneid. The GZ can + * use this to verify a link is under a particular zone. An NGZ can + * use this to verify a link is under itself. But an NGZ cannot use + * this to determine if a link is under some other zone as that would + * result in information leakage. If the link exists under the zone + * then 0 is returned. Otherwise, ENXIO is returned. */ int zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid) { zone_t *zone; + zoneid_t zoneid = *zoneidp; + zoneid_t caller = getzoneid(); int err = ENXIO; - if (*zoneidp != ALL_ZONES) { - if ((zone = zone_find_by_id(*zoneidp)) != NULL) { - if (zone_dl_exists(zone, linkid)) + /* + * Only the GZ may enquire about all zones; an NGZ may only + * enquire about itself.
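The two calling conventions described in the comment above can be illustrated with a toy model. Nothing below is kernel code; check_datalink(), the zone table and the link ids are made up, but the contract follows the description: ALL_ZONES (global-zone callers) performs a lookup and reports the owning zone, while a specific zoneid only verifies ownership.

/* Userland sketch; the table and helper are hypothetical. */
#include <stdio.h>
#include <errno.h>

#define	ALL_ZONES	(-1)
typedef int zoneid_t;
typedef unsigned int datalink_id_t;

/* A made-up ownership table: one link per zone for brevity. */
static struct {
	zoneid_t	z_id;
	datalink_id_t	z_link;
} zones[] = { { 3, 100 }, { 7, 200 } };

static int
check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
{
	size_t i;

	for (i = 0; i < sizeof (zones) / sizeof (zones[0]); i++) {
		if (zones[i].z_link != linkid)
			continue;
		if (*zoneidp == ALL_ZONES || *zoneidp == zones[i].z_id) {
			*zoneidp = zones[i].z_id;	/* mode 1: report owner */
			return (0);
		}
	}
	return (ENXIO);		/* not found, or not under the given zone */
}

int
main(void)
{
	zoneid_t zid;

	zid = ALL_ZONES;	/* mode 1: which zone is link 200 under? */
	if (check_datalink(&zid, 200) == 0)
		printf("link 200 is under zone %d\n", zid);

	zid = 3;		/* mode 2: is link 200 under zone 3? */
	printf("link 200 under zone 3: %s\n",
	    check_datalink(&zid, 200) == 0 ? "yes" : "no");
	return (0);
}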
+ */ + if (zoneid == ALL_ZONES && caller != GLOBAL_ZONEID) + zoneid = caller; + + if (zoneid != caller && caller != GLOBAL_ZONEID) + return (err); + + if (zoneid != ALL_ZONES) { + if ((zone = zone_find_by_id(zoneid)) != NULL) { + if (zone_dl_exists(zone, linkid)) { + /* + * We need to set this in case an NGZ + * passes ALL_ZONES. + */ + *zoneidp = zoneid; err = 0; + } zone_rele(zone); } return (err); } + ASSERT(caller == GLOBAL_ZONEID); mutex_enter(&zonehash_lock); for (zone = list_head(&zone_active); zone != NULL; zone = list_next(&zone_active, zone)) { @@ -7137,6 +7926,7 @@ zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid) } } mutex_exit(&zonehash_lock); + return (err); } @@ -7157,6 +7947,12 @@ zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray) zone_dl_t *zdl; datalink_id_t *idptr = idarray; + /* + * Only the GZ or the owning zone may look at the datalink list. + */ + if ((getzoneid() != GLOBAL_ZONEID) && (getzoneid() != zoneid)) + return (set_errno(EPERM)); + if (copyin(nump, &dlcount, sizeof (dlcount)) != 0) return (set_errno(EFAULT)); if ((zone = zone_find_by_id(zoneid)) == NULL) @@ -7182,6 +7978,13 @@ mutex_exit(&zone->zone_lock); zone_rele(zone); + /* + * Prevent returning negative nump values -- we should never + * have this many links anyway. + */ + if (num > INT_MAX) + return (set_errno(EOVERFLOW)); + /* Increased or decreased, caller should be notified. */ if (num != dlcount) { if (copyout(&num, nump, sizeof (num)) != 0) @@ -7395,3 +8198,231 @@ done: else return (0); } + +static void +zone_incr_capped(zoneid_t zid) +{ + zone_persist_t *zp = &zone_pdata[zid]; + + /* See if over (unlimited is UINT32_MAX), or already marked that way. */ + if (zp->zpers_pg_cnt <= zp->zpers_pg_limit || zp->zpers_over == 1) { + return; + } + + mutex_enter(&zone_physcap_lock); + /* Recheck setting under mutex */ + if (zp->zpers_pg_cnt > zp->zpers_pg_limit && zp->zpers_over == 0) { + zp->zpers_over = 1; + zp->zpers_nover++; + zone_num_over_cap++; + DTRACE_PROBE1(zone__over__pcap, zoneid_t, zid); + } + mutex_exit(&zone_physcap_lock); +} + +/* + * We want some hysteresis when the zone is going under its cap so that we're + * not continuously toggling page scanning back and forth by a single page + * around the cap. Using ~1% of the zone's page limit seems to be a good + * quantity. This table shows various zone memory caps and the number of + * pages (assuming a 4k page size). Given this, we choose to shift the page + * limit by 7 places to get a hysteresis that is slightly less than 1%. + * + * cap pages pages 1% shift7 shift7 + * 128M 32768 0x0008000 327 256 0x00100 + * 512M 131072 0x0020000 1310 1024 0x00400 + * 1G 262144 0x0040000 2621 2048 0x00800 + * 4G 1048576 0x0100000 10485 8192 0x02000 + * 8G 2097152 0x0200000 20971 16384 0x04000 + * 16G 4194304 0x0400000 41943 32768 0x08000 + * 32G 8388608 0x0800000 83886 65536 0x10000 + * 64G 16777216 0x1000000 167772 131072 0x20000 + */ +static void +zone_decr_capped(zoneid_t zid) +{ + zone_persist_t *zp = &zone_pdata[zid]; + uint32_t adjusted_limit; + + /* + * See if under, or already marked that way. There is no need to + * check for an unlimited cap (zpers_pg_limit == UINT32_MAX) + * since we'll never set zpers_over in zone_incr_capped(). + */ + if (zp->zpers_over == 0 || zp->zpers_pg_cnt >= zp->zpers_pg_limit) { + return; + } + + adjusted_limit = zp->zpers_pg_limit - (zp->zpers_pg_limit >> 7); + + /* Recheck, accounting for our hysteresis.
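The shift-by-7 hysteresis is easy to verify with a few lines of arithmetic. The following userland sketch reproduces a few rows of the table above, assuming 4 KiB pages; it is illustrative only.

/* Userland sketch reproducing the hysteresis table rows. */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Caps in MiB; with 4 KiB pages, pages = MiB * 256. */
	const uint32_t caps_mb[] = { 128, 1024, 8192 };
	size_t i;

	for (i = 0; i < sizeof (caps_mb) / sizeof (caps_mb[0]); i++) {
		uint32_t limit = caps_mb[i] * 256;	/* cap in pages */
		uint32_t slack = limit >> 7;		/* just under 1% */
		uint32_t adjusted = limit - slack;

		printf("%5u MiB cap: %8u pages, hysteresis %6u, "
		    "considered under the cap below %8u pages\n",
		    caps_mb[i], limit, slack, adjusted);
	}
	return (0);
}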
*/ + if (zp->zpers_pg_cnt >= adjusted_limit) { + return; + } + + mutex_enter(&zone_physcap_lock); + /* Recheck under mutex. */ + if (zp->zpers_pg_cnt < adjusted_limit && zp->zpers_over == 1) { + zp->zpers_over = 0; + ASSERT(zone_num_over_cap > 0); + zone_num_over_cap--; + DTRACE_PROBE1(zone__under__pcap, zoneid_t, zid); + } + mutex_exit(&zone_physcap_lock); +} + +/* + * For zone_add_page() and zone_rm_page(), access to the page we're touching is + * controlled by our caller's locking. + * On x86 our callers already did: ASSERT(x86_hm_held(pp)) + * On SPARC our callers already did: ASSERT(sfmmu_mlist_held(pp)) + */ +void +zone_add_page(page_t *pp) +{ + uint_t pcnt; + zone_persist_t *zp; + zoneid_t zid; + + /* Skip pages in segkmem, etc. (KV_KVP, ...) */ + if (PP_ISKAS(pp)) + return; + + ASSERT(!PP_ISFREE(pp)); + + zid = curzone->zone_id; + if (pp->p_zoneid == zid) { + /* Another mapping to this page for this zone, do nothing */ + return; + } + + if (pp->p_szc == 0) { + pcnt = 1; + } else { + /* large page */ + pcnt = page_get_pagecnt(pp->p_szc); + } + + if (pp->p_share == 0) { + /* First mapping to this page. */ + pp->p_zoneid = zid; + zp = &zone_pdata[zid]; + ASSERT(zp->zpers_pg_cnt + pcnt < UINT32_MAX); + atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, pcnt); + zone_incr_capped(zid); + return; + } + + if (pp->p_zoneid != ALL_ZONES) { + /* + * The page is now being shared across a different zone. + * Decrement the original zone's usage. + */ + zid = pp->p_zoneid; + pp->p_zoneid = ALL_ZONES; + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pdata[zid]; + + if (zp->zpers_pg_cnt > 0) { + atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt); + } + zone_decr_capped(zid); + } +} + +void +zone_rm_page(page_t *pp) +{ + uint_t pcnt; + zone_persist_t *zp; + zoneid_t zid; + + /* Skip pages in segkmem, etc. (KV_KVP, ...) */ + if (PP_ISKAS(pp)) + return; + + zid = pp->p_zoneid; + if (zid == ALL_ZONES || pp->p_share != 0) + return; + + /* This is the last mapping to the page for a zone. */ + if (pp->p_szc == 0) { + pcnt = 1; + } else { + /* large page */ + pcnt = (int64_t)page_get_pagecnt(pp->p_szc); + } + + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pdata[zid]; + if (zp->zpers_pg_cnt > 0) { + atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt); + } + zone_decr_capped(zid); + pp->p_zoneid = ALL_ZONES; +} + +void +zone_pageout_stat(int zid, zone_pageout_op_t op) +{ + zone_persist_t *zp; + + if (zid == ALL_ZONES) + return; + + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pdata[zid]; + +#ifndef DEBUG + atomic_add_64(&zp->zpers_pg_out, 1); +#else + switch (op) { + case ZPO_DIRTY: + atomic_add_64(&zp->zpers_pg_fsdirty, 1); + break; + case ZPO_FS: + atomic_add_64(&zp->zpers_pg_fs, 1); + break; + case ZPO_ANON: + atomic_add_64(&zp->zpers_pg_anon, 1); + break; + case ZPO_ANONDIRTY: + atomic_add_64(&zp->zpers_pg_anondirty, 1); + break; + default: + cmn_err(CE_PANIC, "Invalid pageout operator %d", op); + break; + } +#endif +} + +/* + * Return the zone's physical memory cap and current free memory (in pages). + */ +void +zone_get_physmem_data(int zid, pgcnt_t *memcap, pgcnt_t *free) +{ + zone_persist_t *zp; + + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pdata[zid]; + + /* + * If memory or swap limits are set on the zone, use those, otherwise + * use the system values. physmem and freemem are also in pages. 
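The net effect of zone_add_page() and zone_rm_page() above is that a page is charged to a zone only while exactly one zone maps it; once a second zone maps the page, the original owner is credited back and the page is attributed to ALL_ZONES. The userland sketch below models that policy with a toy page structure; add_mapping(), rm_mapping() and the counters are invented, and locking, large pages and kernel (kas) pages are omitted.

/* Userland sketch of the single-owner accounting policy; all names invented. */
#include <stdio.h>

#define	ALL_ZONES	(-1)

struct toy_page {
	int	p_zoneid;	/* owning zone, or ALL_ZONES */
	int	p_share;	/* number of existing mappings */
};

static unsigned int zone_pages[8];	/* per-zone resident page counts */

static void
add_mapping(struct toy_page *pp, int zid)
{
	if (pp->p_zoneid == zid) {
		/* Another mapping from the same zone: nothing to do. */
	} else if (pp->p_share == 0) {
		/* First mapping to the page: charge this zone. */
		pp->p_zoneid = zid;
		zone_pages[zid]++;
	} else if (pp->p_zoneid != ALL_ZONES) {
		/* Now shared across zones: credit the original owner. */
		zone_pages[pp->p_zoneid]--;
		pp->p_zoneid = ALL_ZONES;
	}
	pp->p_share++;
}

static void
rm_mapping(struct toy_page *pp)
{
	pp->p_share--;
	if (pp->p_share == 0 && pp->p_zoneid != ALL_ZONES) {
		/* Last mapping went away: credit the owning zone. */
		zone_pages[pp->p_zoneid]--;
		pp->p_zoneid = ALL_ZONES;
	}
}

int
main(void)
{
	struct toy_page pg = { ALL_ZONES, 0 };

	add_mapping(&pg, 3);	/* first mapping: zone 3 charged */
	add_mapping(&pg, 3);	/* same zone again: no change */
	printf("zone 3 maps twice: zone3=%u zone5=%u\n",
	    zone_pages[3], zone_pages[5]);
	add_mapping(&pg, 5);	/* shared: zone 3 credited back */
	printf("zone 5 also maps:  zone3=%u zone5=%u\n",
	    zone_pages[3], zone_pages[5]);
	rm_mapping(&pg);
	rm_mapping(&pg);
	rm_mapping(&pg);
	return (0);
}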
+ */ + if (zp->zpers_pg_limit == UINT32_MAX) { + *memcap = physmem; + *free = freemem; + } else { + int64_t freemem; + + *memcap = (pgcnt_t)zp->zpers_pg_limit; + freemem = zp->zpers_pg_limit - zp->zpers_pg_cnt; + if (freemem > 0) { + *free = (pgcnt_t)freemem; + } else { + *free = (pgcnt_t)0; + } + } +}
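For reference, the free-memory calculation that zone_get_physmem_data() performs can be modeled in userland as follows. system_physmem and system_freemem are stand-ins for the kernel's physmem and freemem globals; as in the code above, a page limit of UINT32_MAX means the zone is uncapped and the system-wide values are reported, otherwise the remaining headroom under the cap is clamped at zero.

/* Userland sketch; the globals and helper name are invented. */
#include <stdio.h>
#include <stdint.h>

static uint64_t system_physmem = 4194304;	/* 16 GiB of 4 KiB pages */
static uint64_t system_freemem = 1048576;	/* 4 GiB of 4 KiB pages */

static void
get_physmem_data(uint32_t pg_limit, uint32_t pg_cnt,
    uint64_t *memcap, uint64_t *freep)
{
	if (pg_limit == UINT32_MAX) {
		/* Uncapped zone: report the system-wide values. */
		*memcap = system_physmem;
		*freep = system_freemem;
	} else {
		int64_t avail = (int64_t)pg_limit - (int64_t)pg_cnt;

		*memcap = pg_limit;
		*freep = (avail > 0) ? (uint64_t)avail : 0;	/* clamp */
	}
}

int
main(void)
{
	uint64_t cap, avail;

	/* A zone with a 1 GiB cap that is currently over it. */
	get_physmem_data(262144, 270000, &cap, &avail);
	printf("cap %llu pages, free %llu pages\n",
	    (unsigned long long)cap, (unsigned long long)avail);
	return (0);
}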