diff options
Diffstat (limited to 'usr/src/uts/common/os')
26 files changed, 1582 insertions, 671 deletions
diff --git a/usr/src/uts/common/os/bio.c b/usr/src/uts/common/os/bio.c index 96502b8230..b8d2e29058 100644 --- a/usr/src/uts/common/os/bio.c +++ b/usr/src/uts/common/os/bio.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -1320,6 +1321,9 @@ pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags) cpup = CPU; /* get pointer AFTER preemption is disabled */ CPU_STATS_ADDQ(cpup, vm, pgin, 1); CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len)); + + atomic_add_64(&curzone->zone_pgpgin, btopr(len)); + if ((flags & B_ASYNC) == 0) { klwp_t *lwp = ttolwp(curthread); if (lwp != NULL) @@ -1336,13 +1340,19 @@ pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags) if (pp != NULL && pp->p_vnode != NULL) { if (IS_SWAPFSVP(pp->p_vnode)) { CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len)); + atomic_add_64(&curzone->zone_anonpgin, + btopr(len)); } else { if (pp->p_vnode->v_flag & VVMEXEC) { CPU_STATS_ADDQ(cpup, vm, execpgin, btopr(len)); + atomic_add_64(&curzone->zone_execpgin, + btopr(len)); } else { CPU_STATS_ADDQ(cpup, vm, fspgin, btopr(len)); + atomic_add_64(&curzone->zone_fspgin, + btopr(len)); } } } diff --git a/usr/src/uts/common/os/brand.c b/usr/src/uts/common/os/brand.c index eb8c6e730a..02901d023d 100644 --- a/usr/src/uts/common/os/brand.c +++ b/usr/src/uts/common/os/brand.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. 
*/ #include <sys/kmem.h> @@ -45,7 +46,7 @@ struct brand_mach_ops native_mach_ops = { }; #else /* !__sparcv9 */ struct brand_mach_ops native_mach_ops = { - NULL, NULL, NULL, NULL + NULL, NULL, NULL, NULL, NULL, NULL, NULL }; #endif /* !__sparcv9 */ @@ -53,7 +54,8 @@ brand_t native_brand = { BRAND_VER_1, "native", NULL, - &native_mach_ops + &native_mach_ops, + 0 }; /* @@ -310,46 +312,112 @@ brand_unregister_zone(struct brand *bp) mutex_exit(&brand_list_lock); } -void -brand_setbrand(proc_t *p) +int +brand_setbrand(proc_t *p, boolean_t lwps_ok) { brand_t *bp = p->p_zone->zone_brand; + void *brand_data = NULL; - ASSERT(bp != NULL); - ASSERT(p->p_brand == &native_brand); + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + VERIFY(bp != NULL); /* - * We should only be called from exec(), when we know the process - * is single-threaded. + * Process branding occurs during fork() and exec(). When it happens + * during fork(), the LWP count will always be 0 since branding is + * performed as part of getproc(), before LWPs have been associated. + * The same is not true during exec(), where a multi-LWP process may + * undergo branding just prior to gexec(). This is to ensure + * exec-related brand hooks are available. While it may seem + * complicated to brand a multi-LWP process, the two possible outcomes + * simplify things: + * + * 1. The exec() succeeds: LWPs besides the caller will be killed and + * any further branding will occur in a single-LWP context. + * 2. The exec() fails: The process will be promptly unbranded since + * the hooks are no longer needed. + * + * To prevent inconsistent brand state from being encountered during + * the exec(), LWPs beyond the caller which are associated with this + * process must be held temporarily. They will be released either when + * they are killed in the exec() success, or when the brand is cleared + * after exec() failure. 
*/ - ASSERT(p->p_tlist == p->p_tlist->t_forw); + if (lwps_ok) { + /* + * We've been called from a exec() context tolerating the + * existence of multiple LWPs during branding is necessary. + */ + VERIFY(p == curproc); + VERIFY(p->p_tlist != NULL); + if (p->p_tlist != p->p_tlist->t_forw) { + /* + * Multiple LWPs are present. Hold all but the caller. + */ + if (!holdlwps(SHOLDFORK1)) { + return (-1); + } + } + } else { + /* + * Processes branded during fork() should not have LWPs at all. + */ + VERIFY(p->p_tlist == NULL); + } + + if (bp->b_data_size > 0) { + brand_data = kmem_zalloc(bp->b_data_size, KM_SLEEP); + } + + mutex_enter(&p->p_lock); + ASSERT(!PROC_IS_BRANDED(p)); p->p_brand = bp; + p->p_brand_data = brand_data; ASSERT(PROC_IS_BRANDED(p)); BROP(p)->b_setbrand(p); + mutex_exit(&p->p_lock); + return (0); } void -brand_clearbrand(proc_t *p, boolean_t no_lwps) +brand_clearbrand(proc_t *p, boolean_t lwps_ok) { brand_t *bp = p->p_zone->zone_brand; - klwp_t *lwp = NULL; - ASSERT(bp != NULL); - ASSERT(!no_lwps || (p->p_tlist == NULL)); + void *brand_data; - /* - * If called from exec_common() or proc_exit(), - * we know the process is single-threaded. - * If called from fork_fail, p_tlist is NULL. - */ - if (!no_lwps) { - ASSERT(p->p_tlist == p->p_tlist->t_forw); - lwp = p->p_tlist->t_lwp; - } + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + VERIFY(bp != NULL); + VERIFY(PROC_IS_BRANDED(p)); - ASSERT(PROC_IS_BRANDED(p)); - BROP(p)->b_proc_exit(p, lwp); + mutex_enter(&p->p_lock); p->p_brand = &native_brand; + brand_data = p->p_brand_data; + p->p_brand_data = NULL; + + if (lwps_ok) { + VERIFY(p == curproc); + /* + * A process with multiple LWPs is being de-branded after + * failing an exec. The other LWPs were held as part of the + * procedure, so they must be resumed now. + */ + if (p->p_tlist != NULL && p->p_tlist != p->p_tlist->t_forw) { + continuelwps(p); + } + } else { + /* + * While clearing the brand, it's ok for one LWP to be present. 
+ * This happens when a native binary is executed inside a + * branded zone, since the brand will be removed during the + * course of a successful exec. + */ + VERIFY(p->p_tlist == NULL || p->p_tlist == p->p_tlist->t_forw); + } + mutex_exit(&p->p_lock); + + if (brand_data != NULL) { + kmem_free(brand_data, bp->b_data_size); + } } #if defined(__sparcv9) @@ -483,7 +551,7 @@ brand_solaris_cmd(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, return (ENOSYS); /* For all other operations this must be a branded process. */ - if (p->p_brand == &native_brand) + if (!PROC_IS_BRANDED(p)) return (ENOSYS); ASSERT(p->p_brand == pbrand); @@ -601,15 +669,15 @@ restoreexecenv(struct execenv *ep, stack_t *sp) int brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file, - cred_t *cred, int brand_action, struct brand *pbrand, char *bname, - char *brandlib, char *brandlib32, char *brandlinker, char *brandlinker32) + cred_t *cred, int *brand_action, struct brand *pbrand, char *bname, + char *brandlib, char *brandlib32) { vnode_t *nvp; Ehdr ehdr; Addr uphdr_vaddr; intptr_t voffset; - int interp; + char *interp; int i, err; struct execenv env; struct execenv origenv; @@ -619,7 +687,6 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, klwp_t *lwp = ttolwp(curthread); brand_proc_data_t *spd; brand_elf_data_t sed, *sedp; - char *linker; uintptr_t lddata; /* lddata of executable's linker */ ASSERT(curproc->p_brand == pbrand); @@ -636,12 +703,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, */ if (args->to_model == DATAMODEL_NATIVE) { args->emulator = brandlib; - linker = brandlinker; } #if defined(_LP64) else { args->emulator = brandlib32; - linker = brandlinker32; } #endif /* _LP64 */ @@ -672,13 +737,13 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, orig_sigaltstack.ss_flags = lwp->lwp_sigaltstack.ss_flags; if (args->to_model == DATAMODEL_NATIVE) { - 
err = elfexec(nvp, uap, args, idatap, level + 1, execsz, + err = elfexec(nvp, uap, args, idatap, INTP_MAXDEPTH + 1, execsz, setid, exec_file, cred, brand_action); } #if defined(_LP64) else { - err = elf32exec(nvp, uap, args, idatap, level + 1, execsz, - setid, exec_file, cred, brand_action); + err = elf32exec(nvp, uap, args, idatap, INTP_MAXDEPTH + 1, + execsz, setid, exec_file, cred, brand_action); } #endif /* _LP64 */ VN_RELE(nvp); @@ -725,7 +790,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, if (args->to_model == DATAMODEL_NATIVE) { err = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr, &voffset, exec_file, &interp, &env.ex_bssbase, - &env.ex_brkbase, &env.ex_brksize, NULL); + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); } #if defined(_LP64) else { @@ -733,7 +798,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, Elf32_Addr uphdr_vaddr32; err = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32, &voffset, exec_file, &interp, &env.ex_bssbase, - &env.ex_brkbase, &env.ex_brksize, NULL); + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); Ehdr32to64(&ehdr32, &ehdr); if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -744,6 +809,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, #endif /* _LP64 */ if (err != 0) { restoreexecenv(&origenv, &orig_sigaltstack); + + if (interp != NULL) + kmem_free(interp, MAXPATHLEN); + return (err); } @@ -761,7 +830,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, sedp->sed_phent = ehdr.e_phentsize; sedp->sed_phnum = ehdr.e_phnum; - if (interp) { + if (interp != NULL) { if (ehdr.e_type == ET_DYN) { /* * This is a shared object executable, so we @@ -777,16 +846,20 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, * it in and store relevant information about it in the * aux vector, where the brand library can find it. 
*/ - if ((err = lookupname(linker, UIO_SYSSPACE, + if ((err = lookupname(interp, UIO_SYSSPACE, FOLLOW, NULLVPP, &nvp)) != 0) { - uprintf("%s: not found.", brandlinker); + uprintf("%s: not found.", interp); restoreexecenv(&origenv, &orig_sigaltstack); + kmem_free(interp, MAXPATHLEN); return (err); } + + kmem_free(interp, MAXPATHLEN); + if (args->to_model == DATAMODEL_NATIVE) { err = mapexec_brand(nvp, args, &ehdr, &uphdr_vaddr, &voffset, exec_file, &interp, - NULL, NULL, NULL, &lddata); + NULL, NULL, NULL, &lddata, NULL); } #if defined(_LP64) else { @@ -794,7 +867,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, Elf32_Addr uphdr_vaddr32; err = mapexec32_brand(nvp, args, &ehdr32, &uphdr_vaddr32, &voffset, exec_file, &interp, - NULL, NULL, NULL, &lddata); + NULL, NULL, NULL, &lddata, NULL); Ehdr32to64(&ehdr32, &ehdr); if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -934,9 +1007,9 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, /* * Third, the /proc aux vectors set up by elfexec() point to - * brand emulation library and it's linker. Copy these to the + * brand emulation library and its linker. Copy these to the * /proc brand specific aux vector, and update the regular - * /proc aux vectors to point to the executable (and it's + * /proc aux vectors to point to the executable (and its * linker). This will enable debuggers to access the * executable via the usual /proc or elf notes aux vectors. 
* @@ -1078,55 +1151,31 @@ brand_solaris_freelwp(klwp_t *l, struct brand *pbrand) } /*ARGSUSED*/ -int +void brand_solaris_initlwp(klwp_t *l, struct brand *pbrand) { ASSERT(l->lwp_procp->p_brand == pbrand); ASSERT(l->lwp_procp->p_brand_data != NULL); ASSERT(l->lwp_brand == NULL); l->lwp_brand = (void *)-1; - return (0); } /*ARGSUSED*/ void brand_solaris_lwpexit(klwp_t *l, struct brand *pbrand) { - proc_t *p = l->lwp_procp; - ASSERT(l->lwp_procp->p_brand == pbrand); ASSERT(l->lwp_procp->p_brand_data != NULL); ASSERT(l->lwp_brand != NULL); - - /* - * We should never be called for the last thread in a process. - * (That case is handled by brand_solaris_proc_exit().) - * Therefore this lwp must be exiting from a multi-threaded - * process. - */ - ASSERT(p->p_tlist != p->p_tlist->t_forw); - - l->lwp_brand = NULL; } /*ARGSUSED*/ void -brand_solaris_proc_exit(struct proc *p, klwp_t *l, struct brand *pbrand) +brand_solaris_proc_exit(struct proc *p, struct brand *pbrand) { ASSERT(p->p_brand == pbrand); ASSERT(p->p_brand_data != NULL); - /* - * When called from proc_exit(), we know that process is - * single-threaded and free our lwp brand data. - * otherwise just free p_brand_data and return. 
- */ - if (l != NULL) { - ASSERT(p->p_tlist == p->p_tlist->t_forw); - ASSERT(p->p_tlist->t_lwp == l); - (void) brand_solaris_freelwp(l, pbrand); - } - /* upon exit, free our proc brand data */ kmem_free(p->p_brand_data, sizeof (brand_proc_data_t)); p->p_brand_data = NULL; @@ -1145,5 +1194,4 @@ brand_solaris_setbrand(proc_t *p, struct brand *pbrand) ASSERT(p->p_tlist == p->p_tlist->t_forw); p->p_brand_data = kmem_zalloc(sizeof (brand_proc_data_t), KM_SLEEP); - (void) brand_solaris_initlwp(p->p_tlist->t_lwp, pbrand); } diff --git a/usr/src/uts/common/os/core.c b/usr/src/uts/common/os/core.c index d4dddbe477..3ca17e1f17 100644 --- a/usr/src/uts/common/os/core.c +++ b/usr/src/uts/common/os/core.c @@ -64,6 +64,7 @@ #include <sys/contract/process_impl.h> #include <sys/ddi.h> +extern int yield(void); /* * Processes running within a zone potentially dump core in 3 locations, * based on the per-process, per-zone, and the global zone's core settings. diff --git a/usr/src/uts/common/os/cred.c b/usr/src/uts/common/os/cred.c index 733fd03a92..b0098946b3 100644 --- a/usr/src/uts/common/os/cred.c +++ b/usr/src/uts/common/os/cred.c @@ -727,6 +727,14 @@ crgetzoneid(const cred_t *cr) cr->cr_zone->zone_id); } +zoneid_t +crgetzonedid(const cred_t *cr) +{ + return (cr->cr_zone == NULL ? + (cr->cr_uid == -1 ? 
(zoneid_t)-1 : GLOBAL_ZONEID) : + cr->cr_zone->zone_did); +} + projid_t crgetprojid(const cred_t *cr) { diff --git a/usr/src/uts/common/os/ddi_intr_irm.c b/usr/src/uts/common/os/ddi_intr_irm.c index c3c0481e7f..a4b35dcb5b 100644 --- a/usr/src/uts/common/os/ddi_intr_irm.c +++ b/usr/src/uts/common/os/ddi_intr_irm.c @@ -1320,7 +1320,7 @@ i_ddi_irm_notify(ddi_irm_pool_t *pool_p, ddi_irm_req_t *req_p) /* Log callback errors */ if (ret != DDI_SUCCESS) { - cmn_err(CE_WARN, "%s%d: failed callback (action=%d, ret=%d)\n", + cmn_err(CE_WARN, "!%s%d: failed callback (action=%d, ret=%d)\n", ddi_driver_name(req_p->ireq_dip), ddi_get_instance(req_p->ireq_dip), (int)action, ret); } diff --git a/usr/src/uts/common/os/exec.c b/usr/src/uts/common/os/exec.c index f7c565e546..d46b8538a9 100644 --- a/usr/src/uts/common/os/exec.c +++ b/usr/src/uts/common/os/exec.c @@ -26,7 +26,7 @@ /* Copyright (c) 1988 AT&T */ /* All Rights Reserved */ /* - * Copyright 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -69,6 +69,7 @@ #include <sys/sdt.h> #include <sys/brand.h> #include <sys/klpd.h> +#include <sys/random.h> #include <c2/audit.h> @@ -97,6 +98,7 @@ uint_t auxv_hwcap32_2 = 0; /* 32-bit version of auxv_hwcap2 */ #endif #define PSUIDFLAGS (SNOCD|SUGID) +#define RANDOM_LEN 16 /* 16 bytes for AT_RANDOM aux entry */ /* * exece() - system call wrapper around exec_common() @@ -297,14 +299,43 @@ exec_common(const char *fname, const char **argp, const char **envp, ua.argp = argp; ua.envp = envp; - /* If necessary, brand this process before we start the exec. */ - if (brandme) - brand_setbrand(p); + /* If necessary, brand this process/lwp before we start the exec. */ + if (brandme) { + void *brand_data = NULL; + + /* + * Process branding may fail if multiple LWPs are present and + * holdlwps() cannot complete successfully. 
+ */ + error = brand_setbrand(p, B_TRUE); + + if (error == 0 && BROP(p)->b_lwpdata_alloc != NULL) { + brand_data = BROP(p)->b_lwpdata_alloc(p); + if (brand_data == NULL) { + error = 1; + } + } + + if (error == 0) { + mutex_enter(&p->p_lock); + BROP(p)->b_initlwp(lwp, brand_data); + mutex_exit(&p->p_lock); + } else { + VN_RELE(vp); + if (dir != NULL) { + VN_RELE(dir); + } + pn_free(&resolvepn); + goto fail; + } + } if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz, - exec_file, p->p_cred, brand_action)) != 0) { - if (brandme) - brand_clearbrand(p, B_FALSE); + exec_file, p->p_cred, &brand_action)) != 0) { + if (brandme) { + BROP(p)->b_freelwp(lwp); + brand_clearbrand(p, B_TRUE); + } VN_RELE(vp); if (dir != NULL) VN_RELE(dir); @@ -336,7 +367,7 @@ exec_common(const char *fname, const char **argp, const char **envp, /* * Clear contract template state */ - lwp_ctmpl_clear(lwp); + lwp_ctmpl_clear(lwp, B_TRUE); /* * Save the directory in which we found the executable for expanding @@ -360,6 +391,8 @@ exec_common(const char *fname, const char **argp, const char **envp, * pending held signals remain held, so don't clear t_hold. */ mutex_enter(&p->p_lock); + DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp, + uintptr_t, lwp->lwp_oldcontext, uintptr_t, 0); lwp->lwp_oldcontext = 0; lwp->lwp_ustack = 0; lwp->lwp_old_stk_ctl = 0; @@ -419,8 +452,10 @@ exec_common(const char *fname, const char **argp, const char **envp, TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up); /* Unbrand ourself if necessary. 
*/ - if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) + if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) { + BROP(p)->b_freelwp(lwp); brand_clearbrand(p, B_FALSE); + } setregs(&args); @@ -544,7 +579,7 @@ gexec( long *execsz, caddr_t exec_file, struct cred *cred, - int brand_action) + int *brand_action) { struct vnode *vp, *execvp = NULL; proc_t *pp = ttoproc(curthread); @@ -858,8 +893,14 @@ gexec( if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE)) args->traceinval = 1; } - if (pp->p_proc_flag & P_PR_PTRACE) + + /* + * If legacy ptrace is enabled, generate the SIGTRAP. + */ + if (pp->p_proc_flag & P_PR_PTRACE) { psignal(pp, SIGTRAP); + } + if (args->traceinval) prinvalidate(&pp->p_user); } @@ -1517,6 +1558,27 @@ stk_add(uarg_t *args, const char *sp, enum uio_seg segflg) return (0); } +/* + * Add a fixed size byte array to the stack (only from kernel space). + */ +static int +stk_byte_add(uarg_t *args, const uint8_t *sp, size_t len) +{ + int error; + + if (STK_AVAIL(args) < sizeof (int)) + return (E2BIG); + *--args->stk_offp = args->stk_strp - args->stk_base; + + if (len > STK_AVAIL(args)) + return (E2BIG); + bcopy(sp, args->stk_strp, len); + + args->stk_strp += len; + + return (0); +} + static int stk_getptr(uarg_t *args, char *src, char **dst) { @@ -1553,16 +1615,30 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) size_t size, pad; char *argv = (char *)uap->argp; char *envp = (char *)uap->envp; + uint8_t rdata[RANDOM_LEN]; /* * Copy interpreter's name and argument to argv[0] and argv[1]. + * In the rare case that we have nested interpreters then those names + * and arguments are also copied to the subsequent slots in argv. 
*/ - if (intp != NULL && intp->intp_name != NULL) { - if ((error = stk_add(args, intp->intp_name, UIO_SYSSPACE)) != 0) - return (error); - if (intp->intp_arg != NULL && - (error = stk_add(args, intp->intp_arg, UIO_SYSSPACE)) != 0) - return (error); + if (intp != NULL && intp->intp_name[0] != NULL) { + int i; + + for (i = 0; i < INTP_MAXDEPTH; i++) { + if (intp->intp_name[i] == NULL) + break; + error = stk_add(args, intp->intp_name[i], UIO_SYSSPACE); + if (error != 0) + return (error); + if (intp->intp_arg[i] != NULL) { + error = stk_add(args, intp->intp_arg[i], + UIO_SYSSPACE); + if (error != 0) + return (error); + } + } + if (args->fname != NULL) error = stk_add(args, args->fname, UIO_SYSSPACE); else @@ -1622,8 +1698,9 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) args->ne = args->na - argc; /* - * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and - * AT_SUN_EMULATOR strings to the stack. + * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, + * AT_SUN_BRAND_NROOT, and AT_SUN_EMULATOR strings, as well as AT_RANDOM + * array, to the stack. */ if (auxvpp != NULL && *auxvpp != NULL) { if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0) @@ -1636,6 +1713,20 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) if (args->emulator != NULL && (error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0) return (error); + + /* + * For the AT_RANDOM aux vector we provide 16 bytes of random + * data. 
+ */ + (void) random_get_pseudo_bytes(rdata, sizeof (rdata)); + + if ((error = stk_byte_add(args, rdata, sizeof (rdata))) != 0) + return (error); + + if (args->brand_nroot != NULL && + (error = stk_add(args, args->brand_nroot, + UIO_SYSSPACE)) != 0) + return (error); } /* @@ -1742,7 +1833,7 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) /* * Fill in the aux vector now that we know the user stack addresses * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and - * AT_SUN_EMULATOR strings. + * AT_SUN_EMULATOR strings, as well as the AT_RANDOM array. */ if (auxvpp != NULL && *auxvpp != NULL) { if (args->to_model == DATAMODEL_NATIVE) { @@ -1755,6 +1846,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) if (args->emulator != NULL) ADDAUX(*a, AT_SUN_EMULATOR, (long)&ustrp[*--offp]) + ADDAUX(*a, AT_RANDOM, (long)&ustrp[*--offp]) + if (args->brand_nroot != NULL) { + ADDAUX(*a, + AT_SUN_BRAND_NROOT, (long)&ustrp[*--offp]) + } } else { auxv32_t **a = (auxv32_t **)auxvpp; ADDAUX(*a, @@ -1767,6 +1863,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) if (args->emulator != NULL) ADDAUX(*a, AT_SUN_EMULATOR, (int)(uintptr_t)&ustrp[*--offp]) + ADDAUX(*a, AT_RANDOM, (int)(uintptr_t)&ustrp[*--offp]) + if (args->brand_nroot != NULL) { + ADDAUX(*a, AT_SUN_BRAND_NROOT, + (int)(uintptr_t)&ustrp[*--offp]) + } } } @@ -1855,6 +1956,9 @@ exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) usrstack = (char *)USRSTACK32; } + if (args->maxstack != 0 && (uintptr_t)usrstack > args->maxstack) + usrstack = (char *)args->maxstack; + ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0); #if defined(__sparc) diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c index f0c0983a3a..0e213deb21 100644 --- a/usr/src/uts/common/os/exit.c +++ b/usr/src/uts/common/os/exit.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. 
- * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -230,7 +230,7 @@ restart_init(int what, int why) siginfofree(lwp->lwp_curinfo); lwp->lwp_curinfo = NULL; } - lwp_ctmpl_clear(lwp); + lwp_ctmpl_clear(lwp, B_FALSE); /* * Reset both the process root directory and the current working @@ -366,19 +366,6 @@ proc_exit(int why, int what) } mutex_exit(&p->p_lock); - DTRACE_PROC(lwp__exit); - DTRACE_PROC1(exit, int, why); - - /* - * Will perform any brand specific proc exit processing, since this - * is always the last lwp, will also perform lwp_exit and free brand - * data - */ - if (PROC_IS_BRANDED(p)) { - lwp_detach_brand_hdlrs(lwp); - brand_clearbrand(p, B_FALSE); - } - /* * Don't let init exit unless zone_start_init() failed its exec, or * we are shutting down the zone or the machine. @@ -390,12 +377,35 @@ proc_exit(int why, int what) if (z->zone_boot_err == 0 && zone_status_get(z) < ZONE_IS_SHUTTING_DOWN && zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN) { - if (z->zone_restart_init == B_TRUE) { - if (restart_init(what, why) == 0) - return (0); + + /* + * If the init process should be restarted, the + * "zone_restart_init" member will be set. Some init + * programs in branded zones do not tolerate a restart + * in the traditional manner; setting the + * "zone_reboot_on_init_exit" member will cause the + * entire zone to be rebooted instead. If neither of + * these flags is set the zone will shut down. + */ + if (z->zone_reboot_on_init_exit == B_TRUE && + z->zone_restart_init == B_TRUE) { + /* + * Trigger a zone reboot and continue + * with exit processing. 
+ */ + z->zone_init_status = wstat(why, what); + (void) zone_kadmin(A_REBOOT, 0, NULL, + zone_kcred()); + } else { + if (z->zone_restart_init == B_TRUE) { + if (restart_init(what, why) == 0) + return (0); + } + + z->zone_init_status = wstat(why, what); (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, - CRED()); + zone_kcred()); } } @@ -407,6 +417,32 @@ proc_exit(int why, int what) z->zone_proc_initpid = -1; } + /* + * Delay firing probes (and performing brand cleanup) until after the + * zone_proc_initpid check. Cases which result in zone shutdown or + * restart via zone_kadmin eventually result in a call back to + * proc_exit. + */ + DTRACE_PROC(lwp__exit); + DTRACE_PROC1(exit, int, why); + + /* + * Will perform any brand specific proc exit processing. Since this + * is always the last lwp, will also perform lwp exit/free and proc + * exit. Brand data will be freed when the process is reaped. + */ + if (PROC_IS_BRANDED(p)) { + BROP(p)->b_lwpexit(lwp); + BROP(p)->b_proc_exit(p); + /* + * To ensure that b_proc_exit has access to brand-specific data + * contained by the one remaining lwp, call the freelwp hook as + * the last part of this clean-up process. 
+ */ + BROP(p)->b_freelwp(lwp); + lwp_detach_brand_hdlrs(lwp); + } + lwp_pcb_exit(); /* @@ -658,10 +694,22 @@ proc_exit(int why, int what) if ((q = p->p_child) != NULL && p != proc_init) { struct proc *np; struct proc *initp = proc_init; + pid_t zone_initpid = 1; + struct proc *zoneinitp = NULL; boolean_t setzonetop = B_FALSE; - if (!INGLOBALZONE(curproc)) - setzonetop = B_TRUE; + if (!INGLOBALZONE(curproc)) { + zone_initpid = curproc->p_zone->zone_proc_initpid; + + ASSERT(MUTEX_HELD(&pidlock)); + zoneinitp = prfind(zone_initpid); + if (zoneinitp != NULL) { + initp = zoneinitp; + } else { + zone_initpid = 1; + setzonetop = B_TRUE; + } + } pgdetach(p); @@ -673,7 +721,8 @@ proc_exit(int why, int what) */ delete_ns(q->p_parent, q); - q->p_ppid = 1; + q->p_ppid = zone_initpid; + q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID); if (setzonetop) { mutex_enter(&q->p_lock); @@ -847,8 +896,50 @@ proc_exit(int why, int what) mutex_exit(&p->p_lock); if (!evaporate) { - p->p_pidflag &= ~CLDPEND; - sigcld(p, sqp); + /* + * The brand specific code only happens when the brand has a + * function to call in place of sigcld and the parent of the + * exiting process is not the global zone init. If the parent + * is the global zone init, then the process was reparented, + * and we don't want brand code delivering possibly strange + * signals to init. Also, init is not branded, so any brand + * specific exit data will not be picked up by init anyway. + */ + if (PROC_IS_BRANDED(p) && + BROP(p)->b_exit_with_sig != NULL && + p->p_ppid != 1) { + /* + * The code for _fini that could unload the brand_t + * blocks until the count of zones using the module + * reaches zero. Zones decrement the refcount on their + * brands only after all user tasks in that zone have + * exited and been waited on. The decrement on the + * brand's refcount happen in zone_destroy(). That + * depends on zone_shutdown() having been completed. 
+ * zone_shutdown() includes a call to zone_empty(), + * where the zone waits for itself to reach the state + * ZONE_IS_EMPTY. This state is only set in either + * zone_shutdown(), when there are no user processes as + * the zone enters this function, or in + * zone_task_rele(). zone_task_rele() is called from + * code triggered by waiting on processes, not by the + * processes exiting through proc_exit(). This means + * all the branded processes that could exist for a + * specific brand_t must exit and get reaped before the + * refcount on the brand_t can reach 0. _fini will + * never unload the corresponding brand module before + * proc_exit finishes execution for all processes + * branded with a particular brand_t, which makes the + * operation below safe to do. Brands that wish to use + * this mechanism must wait in _fini as described + * above. + */ + BROP(p)->b_exit_with_sig(p, sqp); + } else { + p->p_pidflag &= ~CLDPEND; + sigcld(p, sqp); + } + } else { /* * Do what sigcld() would do if the disposition @@ -927,10 +1018,9 @@ winfo(proc_t *pp, k_siginfo_t *ip, int waitflag) int waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) { - int found; proc_t *cp, *pp; - int proc_gone; int waitflag = !(options & WNOWAIT); + boolean_t have_brand_helper = B_FALSE; /* * Obsolete flag, defined here only for binary compatibility @@ -958,7 +1048,8 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) pp = ttoproc(curthread); /* - * lock parent mutex so that sibling chain can be searched. + * Anytime you are looking for a process, you take pidlock to prevent + * things from changing as you look. 
*/ mutex_enter(&pidlock); @@ -978,10 +1069,37 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) return (ECHILD); } - while (pp->p_child != NULL) { + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_waitid_helper != NULL) { + have_brand_helper = B_TRUE; + } + + while (pp->p_child != NULL || have_brand_helper) { + boolean_t brand_wants_wait = B_FALSE; + int proc_gone = 0; + int found = 0; - proc_gone = 0; + /* + * Give the brand a chance to return synthetic results from + * this waitid() call before we do the real thing. + */ + if (have_brand_helper) { + int ret; + if (BROP(pp)->b_waitid_helper(idtype, id, ip, options, + &brand_wants_wait, &ret) == 0) { + mutex_exit(&pidlock); + return (ret); + } + + if (pp->p_child == NULL) { + goto no_real_children; + } + } + + /* + * Look for interesting children in the newstate list. + */ + VERIFY(pp->p_child != NULL); for (cp = pp->p_child_ns; cp != NULL; cp = cp->p_sibling_ns) { if (idtype != P_PID && (cp->p_pidflag & CLDWAITPID)) continue; @@ -989,6 +1107,11 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) continue; if (idtype == P_PGID && id != cp->p_pgrp) continue; + if (PROC_IS_BRANDED(pp)) { + if (BROP(pp)->b_wait_filter != NULL && + BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) + continue; + } switch (cp->p_wcode) { @@ -1033,12 +1156,16 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) * Wow! None of the threads on the p_sibling_ns list were * interesting threads. Check all the kids! 
*/ - found = 0; for (cp = pp->p_child; cp != NULL; cp = cp->p_sibling) { if (idtype == P_PID && id != cp->p_pid) continue; if (idtype == P_PGID && id != cp->p_pgrp) continue; + if (PROC_IS_BRANDED(pp)) { + if (BROP(pp)->b_wait_filter != NULL && + BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) + continue; + } switch (cp->p_wcode) { case CLD_TRAPPED: @@ -1107,11 +1234,12 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) break; } +no_real_children: /* * If we found no interesting processes at all, * break out and return ECHILD. */ - if (found + proc_gone == 0) + if (!brand_wants_wait && (found + proc_gone == 0)) break; if (options & WNOHANG) { @@ -1130,7 +1258,7 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) * change state while we wait, we don't wait at all. * Get out with ECHILD according to SVID. */ - if (found == proc_gone) + if (!brand_wants_wait && (found == proc_gone)) break; if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) { @@ -1226,6 +1354,12 @@ freeproc(proc_t *p) p->p_killsqp = NULL; } + /* Clear any remaining brand data */ + if (PROC_IS_BRANDED(p)) { + brand_clearbrand(p, B_FALSE); + } + + prfree(p); /* inform /proc */ /* diff --git a/usr/src/uts/common/os/fio.c b/usr/src/uts/common/os/fio.c index 76eddd4e50..bfee77130d 100644 --- a/usr/src/uts/common/os/fio.c +++ b/usr/src/uts/common/os/fio.c @@ -852,7 +852,8 @@ flist_fork(uf_info_t *pfip, uf_info_t *cfip) */ cfip->fi_nfiles = nfiles = flist_minsize(pfip); - cfip->fi_list = kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP); + cfip->fi_list = nfiles == 0 ? NULL : + kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP); for (fd = 0, pufp = pfip->fi_list, cufp = cfip->fi_list; fd < nfiles; fd++, pufp++, cufp++) { diff --git a/usr/src/uts/common/os/fork.c b/usr/src/uts/common/os/fork.c index fe3a362fa7..4a9c82d2db 100644 --- a/usr/src/uts/common/os/fork.c +++ b/usr/src/uts/common/os/fork.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. 
All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -696,7 +696,7 @@ fork_fail(proc_t *cp) if (PTOU(curproc)->u_cwd) refstr_rele(PTOU(curproc)->u_cwd); if (PROC_IS_BRANDED(cp)) { - brand_clearbrand(cp, B_TRUE); + brand_clearbrand(cp, B_FALSE); } } @@ -745,7 +745,7 @@ forklwp_fail(proc_t *p) kmem_free(t->t_door, sizeof (door_data_t)); t->t_door = NULL; } - lwp_ctmpl_clear(ttolwp(t)); + lwp_ctmpl_clear(ttolwp(t), B_FALSE); /* * Remove the thread from the all threads list. @@ -1004,6 +1004,9 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) cp->p_t1_lgrpid = LGRP_NONE; cp->p_tr_lgrpid = LGRP_NONE; + /* Default to native brand initially */ + cp->p_brand = &native_brand; + if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) { if (nproc == v.v_proc) { CPU_STATS_ADDQ(CPU, sys, procovf, 1); @@ -1071,9 +1074,6 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD); cp->p_sessp = pp->p_sessp; sess_hold(pp); - cp->p_brand = pp->p_brand; - if (PROC_IS_BRANDED(pp)) - BROP(pp)->b_copy_procdata(cp, pp); cp->p_bssbase = pp->p_bssbase; cp->p_brkbase = pp->p_brkbase; cp->p_brksize = pp->p_brksize; @@ -1153,6 +1153,18 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) mutex_exit(&cp->p_lock); mutex_exit(&pidlock); + if (PROC_IS_BRANDED(pp)) { + /* + * The only reason why process branding should fail is when + * the procedure is complicated by multiple LWPs on the scene. + * With an LWP count of 0, this newly allocated process has no + * reason to fail branding. 
+ */ + VERIFY0(brand_setbrand(cp, B_FALSE)); + + BROP(pp)->b_copy_procdata(cp, pp); + } + avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t), offsetof(contract_t, ct_ctlist)); diff --git a/usr/src/uts/common/os/grow.c b/usr/src/uts/common/os/grow.c index f5e92cfd94..0c4c0bcad6 100644 --- a/usr/src/uts/common/os/grow.c +++ b/usr/src/uts/common/os/grow.c @@ -19,7 +19,10 @@ * CDDL HEADER END */ -/* Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. */ +/* + * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. @@ -52,6 +55,7 @@ #include <sys/fcntl.h> #include <sys/lwpchan_impl.h> #include <sys/nbmlock.h> +#include <sys/brand.h> #include <vm/hat.h> #include <vm/as.h> @@ -522,6 +526,20 @@ choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off, return (0); } +caddr_t +map_userlimit(proc_t *pp, struct as *as, int flags) +{ + if (flags & _MAP_LOW32) { + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_map32limit != NULL) { + return ((caddr_t)(uintptr_t)BROP(pp)->b_map32limit(pp)); + } else { + return ((caddr_t)_userlimit32); + } + } + + return (as->a_userlimit); +} + /* * Used for MAP_ANON - fast way to get anonymous pages @@ -537,8 +555,6 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags, return (EACCES); if ((flags & MAP_FIXED) != 0) { - caddr_t userlimit; - /* * Use the user address. First verify that * the address to be used is page aligned. @@ -547,9 +563,8 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags, if (((uintptr_t)*addrp & PAGEOFFSET) != 0) return (EINVAL); - userlimit = flags & _MAP_LOW32 ? 
- (caddr_t)USERLIMIT32 : as->a_userlimit; - switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { + switch (valid_usr_range(*addrp, len, uprot, as, + map_userlimit(as->a_proc, as, flags))) { case RANGE_OKAY: break; case RANGE_BADPROT: @@ -717,8 +732,6 @@ smmap_common(caddr_t *addrp, size_t len, * If the user specified an address, do some simple checks here */ if ((flags & MAP_FIXED) != 0) { - caddr_t userlimit; - /* * Use the user address. First verify that * the address to be used is page aligned. @@ -726,10 +739,8 @@ smmap_common(caddr_t *addrp, size_t len, */ if (((uintptr_t)*addrp & PAGEOFFSET) != 0) return (EINVAL); - - userlimit = flags & _MAP_LOW32 ? - (caddr_t)USERLIMIT32 : as->a_userlimit; - switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { + switch (valid_usr_range(*addrp, len, uprot, as, + map_userlimit(curproc, as, flags))) { case RANGE_OKAY: break; case RANGE_BADPROT: diff --git a/usr/src/uts/common/os/id_space.c b/usr/src/uts/common/os/id_space.c deleted file mode 100644 index 2dad0cb940..0000000000 --- a/usr/src/uts/common/os/id_space.c +++ /dev/null @@ -1,159 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#include <sys/types.h> -#include <sys/id_space.h> -#include <sys/debug.h> - -/* - * ID Spaces - * - * The id_space_t provides a simple implementation of a managed range of - * integer identifiers using a vmem arena. An ID space guarantees that the - * next identifer returned by an allocation is larger than the previous one, - * unless there are no larger slots remaining in the range. In this case, - * the ID space will return the first available slot in the lower part of the - * range (viewing the previous identifier as a partitioning element). If no - * slots are available, id_alloc()/id_allocff() will sleep until an - * identifier becomes available. Accordingly, id_space allocations must be - * initiated from contexts where sleeping is acceptable. id_alloc_nosleep()/ - * id_allocff_nosleep() will return -1 if no slots are available or if the - * system is low on memory. If id_alloc_nosleep() fails, callers should - * not try to extend the ID space. This is to avoid making a possible - * low-memory situation worse. - * - * As an ID space is designed for representing a range of id_t's, there - * is a preexisting maximal range: [0, MAXUID]. ID space requests outside - * that range will fail on a DEBUG kernel. The id_allocff*() functions - * return the first available id, and should be used when there is benefit - * to having a compact allocated range. - * - * (Presently, the id_space_t abstraction supports only direct allocations; ID - * reservation, in which an ID is allocated but placed in a internal - * dictionary for later use, should be added when a consuming subsystem - * arrives.) 
- */ - -#define ID_TO_ADDR(id) ((void *)(uintptr_t)(id + 1)) -#define ADDR_TO_ID(addr) ((id_t)((uintptr_t)addr - 1)) - -/* - * Create an arena to represent the range [low, high). - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_space_t * -id_space_create(const char *name, id_t low, id_t high) -{ - ASSERT(low >= 0); - ASSERT(low < high); - - return (vmem_create(name, ID_TO_ADDR(low), high - low, 1, - NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER)); -} - -/* - * Destroy a previously created ID space. - * No restrictions on caller's context. - */ -void -id_space_destroy(id_space_t *isp) -{ - vmem_destroy(isp); -} - -void -id_space_extend(id_space_t *isp, id_t low, id_t high) -{ - (void) vmem_add(isp, ID_TO_ADDR(low), high - low, VM_SLEEP); -} - -/* - * Allocate an id_t from specified ID space. - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_t -id_alloc(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_NEXTFIT))); -} - -/* - * Allocate an id_t from specified ID space. - * Returns -1 on failure (see module block comments for more information on - * failure modes). - */ -id_t -id_alloc_nosleep(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_NEXTFIT))); -} - -/* - * Allocate an id_t from specified ID space using FIRSTFIT. - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_t -id_allocff(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_FIRSTFIT))); -} - -/* - * Allocate an id_t from specified ID space using FIRSTFIT - * Returns -1 on failure (see module block comments for more information on - * failure modes). - */ -id_t -id_allocff_nosleep(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_FIRSTFIT))); -} - -/* - * Allocate a specific identifier if possible, returning the id if - * successful, or -1 on failure. 
- */ -id_t -id_alloc_specific_nosleep(id_space_t *isp, id_t id) -{ - void *minaddr = ID_TO_ADDR(id); - void *maxaddr = ID_TO_ADDR(id + 1); - - /* - * Note that even though we're vmem_free()ing this later, it - * should be OK, since there's no quantum cache. - */ - return (ADDR_TO_ID(vmem_xalloc(isp, 1, 1, 0, 0, - minaddr, maxaddr, VM_NOSLEEP))); -} - -/* - * Free a previously allocated ID. - * No restrictions on caller's context. - */ -void -id_free(id_space_t *isp, id_t id) -{ - vmem_free(isp, ID_TO_ADDR(id), 1); -} diff --git a/usr/src/uts/common/os/kmem.c b/usr/src/uts/common/os/kmem.c index cc53c2fb76..734fa910e4 100644 --- a/usr/src/uts/common/os/kmem.c +++ b/usr/src/uts/common/os/kmem.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -158,10 +159,22 @@ * find known objects and is about to free it, or * c) the client has freed the object. * In all these cases (a, b, and c) kmem frees the new object (the - * unused copy destination) and searches for the old object in the - * magazine layer. If found, the object is removed from the magazine - * layer and freed to the slab layer so it will no longer hold the - * slab hostage. + * unused copy destination). In the first case, the object is in + * use and the correct action is that for LATER; in the latter two + * cases, we know that the object is either freed or about to be + * freed, in which case it is either already in a magazine or about + * to be in one. In these cases, we know that the object will either + * be reallocated and reused, or it will end up in a full magazine + * that will be reaped (thereby liberating the slab). 
Because it + * is prohibitively expensive to differentiate these cases, and + * because the defrag code is executed when we're low on memory + * (thereby biasing the system to reclaim full magazines) we treat + * all DONT_KNOW cases as LATER and rely on cache reaping to + * generally clean up full magazines. While we take the same action + * for these cases, we maintain their semantic distinction: if + * defragmentation is not occurring, it is useful to know if this + * is due to objects in use (LATER) or objects in an unknown state + * of transition (DONT_KNOW). * * 2.3 Object States * @@ -284,10 +297,10 @@ * view of the slab layer, making it a candidate for the move callback. Most * objects unrecognized by the client in the move callback fall into this * category and are cheaply distinguished from known objects by the test - * described earlier. Since recognition is cheap for the client, and searching - * magazines is expensive for kmem, kmem defers searching until the client first - * returns KMEM_CBRC_DONT_KNOW. As long as the needed effort is reasonable, kmem - * elsewhere does what it can to avoid bothering the client unnecessarily. + * described earlier. Because searching magazines is prohibitively expensive + * for kmem, clients that do not mark freed objects (and therefore return + * KMEM_CBRC_DONT_KNOW for large numbers of objects) may find defragmentation + * efficacy reduced. 
* * Invalidating the designated pointer member before freeing the object marks * the object to be avoided in the callback, and conversely, assigning a valid @@ -997,6 +1010,7 @@ size_t kmem_transaction_log_size; /* transaction log size [2% of memory] */ size_t kmem_content_log_size; /* content log size [2% of memory] */ size_t kmem_failure_log_size; /* failure log [4 pages per CPU] */ size_t kmem_slab_log_size; /* slab create log [4 pages per CPU] */ +size_t kmem_zerosized_log_size; /* zero-sized log [4 pages per CPU] */ size_t kmem_content_maxsave = 256; /* KMF_CONTENTS max bytes to log */ size_t kmem_lite_minsize = 0; /* minimum buffer size for KMF_LITE */ size_t kmem_lite_maxalign = 1024; /* maximum buffer alignment for KMF_LITE */ @@ -1004,6 +1018,14 @@ int kmem_lite_pcs = 4; /* number of PCs to store in KMF_LITE mode */ size_t kmem_maxverify; /* maximum bytes to inspect in debug routines */ size_t kmem_minfirewall; /* hardware-enforced redzone threshold */ +#ifdef DEBUG +int kmem_warn_zerosized = 1; /* whether to warn on zero-sized KM_SLEEP */ +#else +int kmem_warn_zerosized = 0; /* whether to warn on zero-sized KM_SLEEP */ +#endif + +int kmem_panic_zerosized = 0; /* whether to panic on zero-sized KM_SLEEP */ + #ifdef _LP64 size_t kmem_max_cached = KMEM_BIG_MAXBUF; /* maximum kmem_alloc cache */ #else @@ -1037,21 +1059,7 @@ static vmem_t *kmem_default_arena; static vmem_t *kmem_firewall_va_arena; static vmem_t *kmem_firewall_arena; -/* - * Define KMEM_STATS to turn on statistic gathering. By default, it is only - * turned on when DEBUG is also defined. 
- */ -#ifdef DEBUG -#define KMEM_STATS -#endif /* DEBUG */ - -#ifdef KMEM_STATS -#define KMEM_STAT_ADD(stat) ((stat)++) -#define KMEM_STAT_COND_ADD(cond, stat) ((void) (!(cond) || (stat)++)) -#else -#define KMEM_STAT_ADD(stat) /* nothing */ -#define KMEM_STAT_COND_ADD(cond, stat) /* nothing */ -#endif /* KMEM_STATS */ +static int kmem_zerosized; /* # of zero-sized allocs */ /* * kmem slab consolidator thresholds (tunables) @@ -1070,47 +1078,6 @@ size_t kmem_reclaim_max_slabs = 1; */ size_t kmem_reclaim_scan_range = 12; -#ifdef KMEM_STATS -static struct { - uint64_t kms_callbacks; - uint64_t kms_yes; - uint64_t kms_no; - uint64_t kms_later; - uint64_t kms_dont_need; - uint64_t kms_dont_know; - uint64_t kms_hunt_found_mag; - uint64_t kms_hunt_found_slab; - uint64_t kms_hunt_alloc_fail; - uint64_t kms_hunt_lucky; - uint64_t kms_notify; - uint64_t kms_notify_callbacks; - uint64_t kms_disbelief; - uint64_t kms_already_pending; - uint64_t kms_callback_alloc_fail; - uint64_t kms_callback_taskq_fail; - uint64_t kms_endscan_slab_dead; - uint64_t kms_endscan_slab_destroyed; - uint64_t kms_endscan_nomem; - uint64_t kms_endscan_refcnt_changed; - uint64_t kms_endscan_nomove_changed; - uint64_t kms_endscan_freelist; - uint64_t kms_avl_update; - uint64_t kms_avl_noupdate; - uint64_t kms_no_longer_reclaimable; - uint64_t kms_notify_no_longer_reclaimable; - uint64_t kms_notify_slab_dead; - uint64_t kms_notify_slab_destroyed; - uint64_t kms_alloc_fail; - uint64_t kms_constructor_fail; - uint64_t kms_dead_slabs_freed; - uint64_t kms_defrags; - uint64_t kms_scans; - uint64_t kms_scan_depot_ws_reaps; - uint64_t kms_debug_reaps; - uint64_t kms_debug_scans; -} kmem_move_stats; -#endif /* KMEM_STATS */ - /* consolidator knobs */ static boolean_t kmem_move_noreap; static boolean_t kmem_move_blocked; @@ -1141,6 +1108,7 @@ kmem_log_header_t *kmem_transaction_log; kmem_log_header_t *kmem_content_log; kmem_log_header_t *kmem_failure_log; kmem_log_header_t *kmem_slab_log; +kmem_log_header_t 
*kmem_zerosized_log; static int kmem_lite_count; /* # of PCs in kmem_buftag_lite_t */ @@ -1921,15 +1889,7 @@ kmem_slab_free(kmem_cache_t *cp, void *buf) cp->cache_complete_slab_count--; avl_add(&cp->cache_partial_slabs, sp); } else { -#ifdef DEBUG - if (avl_update_gt(&cp->cache_partial_slabs, sp)) { - KMEM_STAT_ADD(kmem_move_stats.kms_avl_update); - } else { - KMEM_STAT_ADD(kmem_move_stats.kms_avl_noupdate); - } -#else (void) avl_update_gt(&cp->cache_partial_slabs, sp); -#endif } ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) == @@ -2941,8 +2901,33 @@ kmem_alloc(size_t size, int kmflag) /* fall through to kmem_cache_alloc() */ } else { - if (size == 0) + if (size == 0) { + if (kmflag != KM_SLEEP && !(kmflag & KM_PANIC)) + return (NULL); + + /* + * If this is a sleeping allocation or one that has + * been specified to panic on allocation failure, we + * consider it to be deprecated behavior to allocate + * 0 bytes. If we have been configured to panic under + * this condition, we panic; if to warn, we warn -- and + * regardless, we log to the kmem_zerosized_log that + * that this condition has occurred (which gives us + * enough information to be able to debug it). 
+ */ + if (kmem_panic && kmem_panic_zerosized) + panic("attempted to kmem_alloc() size of 0"); + + if (kmem_warn_zerosized) { + cmn_err(CE_WARN, "kmem_alloc(): sleeping " + "allocation with size of 0; " + "see kmem_zerosized_log for details"); + } + + kmem_log_event(kmem_zerosized_log, NULL, NULL, NULL); + return (NULL); + } buf = vmem_alloc(kmem_oversize_arena, size, kmflag & KM_VMFLAGS); @@ -3556,7 +3541,7 @@ kmem_cache_kstat_update(kstat_t *ksp, int rw) kmcp->kmc_move_later.value.ui64 = kd->kmd_later; kmcp->kmc_move_dont_need.value.ui64 = kd->kmd_dont_need; kmcp->kmc_move_dont_know.value.ui64 = kd->kmd_dont_know; - kmcp->kmc_move_hunt_found.value.ui64 = kd->kmd_hunt_found; + kmcp->kmc_move_hunt_found.value.ui64 = 0; kmcp->kmc_move_slabs_freed.value.ui64 = kd->kmd_slabs_freed; kmcp->kmc_defrag.value.ui64 = kd->kmd_defrags; kmcp->kmc_scan.value.ui64 = kd->kmd_scans; @@ -4127,7 +4112,8 @@ kmem_cache_destroy(kmem_cache_t *cp) if (kmem_taskq != NULL) taskq_wait(kmem_taskq); - if (kmem_move_taskq != NULL) + + if (kmem_move_taskq != NULL && cp->cache_defrag != NULL) taskq_wait(kmem_move_taskq); kmem_cache_magazine_purge(cp); @@ -4465,8 +4451,8 @@ kmem_init(void) } kmem_failure_log = kmem_log_init(kmem_failure_log_size); - kmem_slab_log = kmem_log_init(kmem_slab_log_size); + kmem_zerosized_log = kmem_log_init(kmem_zerosized_log_size); /* * Initialize STREAMS message caches so allocb() is available. @@ -4654,94 +4640,6 @@ kmem_slab_is_reclaimable(kmem_cache_t *cp, kmem_slab_t *sp, int flags) (sp->slab_chunks * cp->cache_defrag->kmd_reclaim_numer)); } -static void * -kmem_hunt_mag(kmem_cache_t *cp, kmem_magazine_t *m, int n, void *buf, - void *tbuf) -{ - int i; /* magazine round index */ - - for (i = 0; i < n; i++) { - if (buf == m->mag_round[i]) { - if (cp->cache_flags & KMF_BUFTAG) { - (void) kmem_cache_free_debug(cp, tbuf, - caller()); - } - m->mag_round[i] = tbuf; - return (buf); - } - } - - return (NULL); -} - -/* - * Hunt the magazine layer for the given buffer. 
If found, the buffer is - * removed from the magazine layer and returned, otherwise NULL is returned. - * The state of the returned buffer is freed and constructed. - */ -static void * -kmem_hunt_mags(kmem_cache_t *cp, void *buf) -{ - kmem_cpu_cache_t *ccp; - kmem_magazine_t *m; - int cpu_seqid; - int n; /* magazine rounds */ - void *tbuf; /* temporary swap buffer */ - - ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); - - /* - * Allocated a buffer to swap with the one we hope to pull out of a - * magazine when found. - */ - tbuf = kmem_cache_alloc(cp, KM_NOSLEEP); - if (tbuf == NULL) { - KMEM_STAT_ADD(kmem_move_stats.kms_hunt_alloc_fail); - return (NULL); - } - if (tbuf == buf) { - KMEM_STAT_ADD(kmem_move_stats.kms_hunt_lucky); - if (cp->cache_flags & KMF_BUFTAG) { - (void) kmem_cache_free_debug(cp, buf, caller()); - } - return (buf); - } - - /* Hunt the depot. */ - mutex_enter(&cp->cache_depot_lock); - n = cp->cache_magtype->mt_magsize; - for (m = cp->cache_full.ml_list; m != NULL; m = m->mag_next) { - if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) { - mutex_exit(&cp->cache_depot_lock); - return (buf); - } - } - mutex_exit(&cp->cache_depot_lock); - - /* Hunt the per-CPU magazines. */ - for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) { - ccp = &cp->cache_cpu[cpu_seqid]; - - mutex_enter(&ccp->cc_lock); - m = ccp->cc_loaded; - n = ccp->cc_rounds; - if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) { - mutex_exit(&ccp->cc_lock); - return (buf); - } - m = ccp->cc_ploaded; - n = ccp->cc_prounds; - if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) { - mutex_exit(&ccp->cc_lock); - return (buf); - } - mutex_exit(&ccp->cc_lock); - } - - kmem_cache_free(cp, tbuf); - return (NULL); -} - /* * May be called from the kmem_move_taskq, from kmem_cache_move_notify_task(), * or when the buffer is freed. 
@@ -4805,7 +4703,7 @@ static void kmem_move_end(kmem_cache_t *, kmem_move_t *); * NO kmem frees the new buffer, marks the slab of the old buffer * non-reclaimable to avoid bothering the client again * LATER kmem frees the new buffer, increments slab_later_count - * DONT_KNOW kmem frees the new buffer, searches mags for the old buffer + * DONT_KNOW kmem frees the new buffer * DONT_NEED kmem frees both the old buffer and the new buffer * * The pending callback argument now being processed contains both of the @@ -4839,19 +4737,14 @@ kmem_move_buffer(kmem_move_t *callback) * another buffer on the same slab. */ if (!kmem_slab_is_reclaimable(cp, sp, callback->kmm_flags)) { - KMEM_STAT_ADD(kmem_move_stats.kms_no_longer_reclaimable); - KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY), - kmem_move_stats.kms_notify_no_longer_reclaimable); kmem_slab_free(cp, callback->kmm_to_buf); kmem_move_end(cp, callback); return; } /* - * Hunting magazines is expensive, so we'll wait to do that until the - * client responds KMEM_CBRC_DONT_KNOW. However, checking the slab layer - * is cheap, so we might as well do that here in case we can avoid - * bothering the client. + * Checking the slab layer is easy, so we might as well do that here + * in case we can avoid bothering the client. 
*/ mutex_enter(&cp->cache_lock); free_on_slab = (kmem_slab_allocated(cp, sp, @@ -4859,7 +4752,6 @@ kmem_move_buffer(kmem_move_t *callback) mutex_exit(&cp->cache_lock); if (free_on_slab) { - KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_slab); kmem_slab_free(cp, callback->kmm_to_buf); kmem_move_end(cp, callback); return; @@ -4871,7 +4763,6 @@ kmem_move_buffer(kmem_move_t *callback) */ if (kmem_cache_alloc_debug(cp, callback->kmm_to_buf, KM_NOSLEEP, 1, caller()) != 0) { - KMEM_STAT_ADD(kmem_move_stats.kms_alloc_fail); kmem_move_end(cp, callback); return; } @@ -4879,15 +4770,11 @@ kmem_move_buffer(kmem_move_t *callback) cp->cache_constructor(callback->kmm_to_buf, cp->cache_private, KM_NOSLEEP) != 0) { atomic_inc_64(&cp->cache_alloc_fail); - KMEM_STAT_ADD(kmem_move_stats.kms_constructor_fail); kmem_slab_free(cp, callback->kmm_to_buf); kmem_move_end(cp, callback); return; } - KMEM_STAT_ADD(kmem_move_stats.kms_callbacks); - KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY), - kmem_move_stats.kms_notify_callbacks); cp->cache_defrag->kmd_callbacks++; cp->cache_defrag->kmd_thread = curthread; cp->cache_defrag->kmd_from_buf = callback->kmm_from_buf; @@ -4905,7 +4792,6 @@ kmem_move_buffer(kmem_move_t *callback) cp->cache_defrag->kmd_to_buf = NULL; if (response == KMEM_CBRC_YES) { - KMEM_STAT_ADD(kmem_move_stats.kms_yes); cp->cache_defrag->kmd_yes++; kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE); /* slab safe to access until kmem_move_end() */ @@ -4920,14 +4806,12 @@ kmem_move_buffer(kmem_move_t *callback) switch (response) { case KMEM_CBRC_NO: - KMEM_STAT_ADD(kmem_move_stats.kms_no); cp->cache_defrag->kmd_no++; mutex_enter(&cp->cache_lock); kmem_slab_move_no(cp, sp, callback->kmm_from_buf); mutex_exit(&cp->cache_lock); break; case KMEM_CBRC_LATER: - KMEM_STAT_ADD(kmem_move_stats.kms_later); cp->cache_defrag->kmd_later++; mutex_enter(&cp->cache_lock); if (!KMEM_SLAB_IS_PARTIAL(sp)) { @@ -4936,7 +4820,6 @@ kmem_move_buffer(kmem_move_t *callback) } if 
(++sp->slab_later_count >= KMEM_DISBELIEF) { - KMEM_STAT_ADD(kmem_move_stats.kms_disbelief); kmem_slab_move_no(cp, sp, callback->kmm_from_buf); } else if (!(sp->slab_flags & KMEM_SLAB_NOMOVE)) { sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp, @@ -4945,7 +4828,6 @@ kmem_move_buffer(kmem_move_t *callback) mutex_exit(&cp->cache_lock); break; case KMEM_CBRC_DONT_NEED: - KMEM_STAT_ADD(kmem_move_stats.kms_dont_need); cp->cache_defrag->kmd_dont_need++; kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE); if (sp->slab_refcnt == 0) @@ -4955,19 +4837,21 @@ kmem_move_buffer(kmem_move_t *callback) mutex_exit(&cp->cache_lock); break; case KMEM_CBRC_DONT_KNOW: - KMEM_STAT_ADD(kmem_move_stats.kms_dont_know); + /* + * If we don't know if we can move this buffer or not, we'll + * just assume that we can't: if the buffer is in fact free, + * then it is sitting in one of the per-CPU magazines or in + * a full magazine in the depot layer. Either way, because + * defrag is induced in the same logic that reaps a cache, + * it's likely that full magazines will be returned to the + * system soon (thereby accomplishing what we're trying to + * accomplish here: return those magazines to their slabs). + * Given this, any work that we might do now to locate a buffer + * in a magazine is wasted (and expensive!) work; we bump + * a counter in this case and otherwise assume that we can't + * move it. 
+ */ cp->cache_defrag->kmd_dont_know++; - if (kmem_hunt_mags(cp, callback->kmm_from_buf) != NULL) { - KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_mag); - cp->cache_defrag->kmd_hunt_found++; - kmem_slab_free_constructed(cp, callback->kmm_from_buf, - B_TRUE); - if (sp->slab_refcnt == 0) - cp->cache_defrag->kmd_slabs_freed++; - mutex_enter(&cp->cache_lock); - kmem_slab_move_yes(cp, sp, callback->kmm_from_buf); - mutex_exit(&cp->cache_lock); - } break; default: panic("'%s' (%p) unexpected move callback response %d\n", @@ -4992,10 +4876,9 @@ kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags) ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING); callback = kmem_cache_alloc(kmem_move_cache, KM_NOSLEEP); - if (callback == NULL) { - KMEM_STAT_ADD(kmem_move_stats.kms_callback_alloc_fail); + + if (callback == NULL) return (B_FALSE); - } callback->kmm_from_slab = sp; callback->kmm_from_buf = buf; @@ -5020,7 +4903,6 @@ kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags) pending->kmm_flags |= KMM_DESPERATE; } mutex_exit(&cp->cache_lock); - KMEM_STAT_ADD(kmem_move_stats.kms_already_pending); kmem_cache_free(kmem_move_cache, callback); return (B_TRUE); } @@ -5034,7 +4916,6 @@ kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags) if (!taskq_dispatch(kmem_move_taskq, (task_func_t *)kmem_move_buffer, callback, TQ_NOSLEEP)) { - KMEM_STAT_ADD(kmem_move_stats.kms_callback_taskq_fail); mutex_enter(&cp->cache_lock); avl_remove(&cp->cache_defrag->kmd_moves_pending, callback); mutex_exit(&cp->cache_lock); @@ -5080,7 +4961,6 @@ kmem_move_end(kmem_cache_t *cp, kmem_move_t *callback) cp->cache_slab_destroy++; mutex_exit(&cp->cache_lock); kmem_slab_destroy(cp, sp); - KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed); mutex_enter(&cp->cache_lock); } } @@ -5225,8 +5105,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs, * pending move completes. */ list_insert_head(deadlist, sp); - KMEM_STAT_ADD(kmem_move_stats. 
- kms_endscan_slab_dead); return (-1); } @@ -5241,10 +5119,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs, cp->cache_slab_destroy++; mutex_exit(&cp->cache_lock); kmem_slab_destroy(cp, sp); - KMEM_STAT_ADD(kmem_move_stats. - kms_dead_slabs_freed); - KMEM_STAT_ADD(kmem_move_stats. - kms_endscan_slab_destroyed); mutex_enter(&cp->cache_lock); /* * Since we can't pick up the scan where we left @@ -5260,8 +5134,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs, * for the request and say nothing about the * number of reclaimable slabs. */ - KMEM_STAT_COND_ADD(s < max_slabs, - kmem_move_stats.kms_endscan_nomem); return (-1); } @@ -5277,16 +5149,10 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs, * destination buffer on the same slab. In that * case, we're not interested in counting it. */ - KMEM_STAT_COND_ADD(!(flags & KMM_DEBUG) && - (s < max_slabs), - kmem_move_stats.kms_endscan_refcnt_changed); return (-1); } - if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove) { - KMEM_STAT_COND_ADD(s < max_slabs, - kmem_move_stats.kms_endscan_nomove_changed); + if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove) return (-1); - } /* * Generating a move request allocates a destination @@ -5313,11 +5179,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs, } end_scan: - KMEM_STAT_COND_ADD(!(flags & KMM_DEBUG) && - (s < max_slabs) && - (sp == avl_first(&cp->cache_partial_slabs)), - kmem_move_stats.kms_endscan_freelist); - return (s); } @@ -5377,8 +5238,6 @@ kmem_cache_move_notify_task(void *arg) &cp->cache_defrag->kmd_moves_pending)) { list_insert_head(deadlist, sp); mutex_exit(&cp->cache_lock); - KMEM_STAT_ADD(kmem_move_stats. - kms_notify_slab_dead); return; } @@ -5386,9 +5245,6 @@ kmem_cache_move_notify_task(void *arg) cp->cache_slab_destroy++; mutex_exit(&cp->cache_lock); kmem_slab_destroy(cp, sp); - KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed); - KMEM_STAT_ADD(kmem_move_stats. 
- kms_notify_slab_destroyed); return; } } else { @@ -5402,7 +5258,6 @@ kmem_cache_move_notify(kmem_cache_t *cp, void *buf) { kmem_move_notify_args_t *args; - KMEM_STAT_ADD(kmem_move_stats.kms_notify); args = kmem_alloc(sizeof (kmem_move_notify_args_t), KM_NOSLEEP); if (args != NULL) { args->kmna_cache = cp; @@ -5425,7 +5280,6 @@ kmem_cache_defrag(kmem_cache_t *cp) n = avl_numnodes(&cp->cache_partial_slabs); if (n > 1) { /* kmem_move_buffers() drops and reacquires cache_lock */ - KMEM_STAT_ADD(kmem_move_stats.kms_defrags); cp->cache_defrag->kmd_defrags++; (void) kmem_move_buffers(cp, n, 0, KMM_DESPERATE); } @@ -5524,7 +5378,6 @@ kmem_cache_scan(kmem_cache_t *cp) * * kmem_move_buffers() drops and reacquires cache_lock. */ - KMEM_STAT_ADD(kmem_move_stats.kms_scans); kmd->kmd_scans++; slabs_found = kmem_move_buffers(cp, kmem_reclaim_scan_range, kmem_reclaim_max_slabs, 0); @@ -5565,12 +5418,9 @@ kmem_cache_scan(kmem_cache_t *cp) if (!kmem_move_noreap && ((debug_rand % kmem_mtb_reap) == 0)) { mutex_exit(&cp->cache_lock); - KMEM_STAT_ADD(kmem_move_stats.kms_debug_reaps); kmem_cache_reap(cp); return; } else if ((debug_rand % kmem_mtb_move) == 0) { - KMEM_STAT_ADD(kmem_move_stats.kms_scans); - KMEM_STAT_ADD(kmem_move_stats.kms_debug_scans); kmd->kmd_scans++; (void) kmem_move_buffers(cp, kmem_reclaim_scan_range, 1, KMM_DEBUG); @@ -5581,8 +5431,6 @@ kmem_cache_scan(kmem_cache_t *cp) mutex_exit(&cp->cache_lock); - if (reap) { - KMEM_STAT_ADD(kmem_move_stats.kms_scan_depot_ws_reaps); + if (reap) kmem_depot_ws_reap(cp); - } } diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c index 149f5f8a88..cbc4fa0000 100644 --- a/usr/src/uts/common/os/logsubr.c +++ b/usr/src/uts/common/os/logsubr.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2013 Gary Mills * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016, Joyent, Inc. 
*/ #include <sys/types.h> @@ -249,8 +250,7 @@ log_init(void) */ printf("\rSunOS Release %s Version %s %u-bit\n", utsname.release, utsname.version, NBBY * (uint_t)sizeof (void *)); - printf("Copyright (c) 1983, 2010, Oracle and/or its affiliates. " - "All rights reserved.\n"); + printf("Copyright (c) 2010-2016, Joyent Inc. All rights reserved.\n"); #ifdef DEBUG printf("DEBUG enabled\n"); #endif diff --git a/usr/src/uts/common/os/lwp.c b/usr/src/uts/common/os/lwp.c index feb8e76c42..3aaf2c746c 100644 --- a/usr/src/uts/common/os/lwp.c +++ b/usr/src/uts/common/os/lwp.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #include <sys/param.h> @@ -57,6 +57,8 @@ #include <sys/lgrp.h> #include <sys/rctl.h> #include <sys/contract_impl.h> +#include <sys/contract/process.h> +#include <sys/contract/process_impl.h> #include <sys/cpc_impl.h> #include <sys/sdt.h> #include <sys/cmn_err.h> @@ -115,7 +117,7 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p, ret_tidhash_t *ret_tidhash = NULL; int i; int rctlfail = 0; - boolean_t branded = 0; + void *brand_data = NULL; struct ctxop *ctx = NULL; ASSERT(cid != sysdccid); /* system threads must start in SYS */ @@ -283,6 +285,19 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p, */ lep = kmem_zalloc(sizeof (*lep), KM_SLEEP); + /* + * If necessary, speculatively allocate lwp brand data. This is done + * ahead of time so p_lock need not be dropped during lwp branding. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_lwpdata_alloc != NULL) { + if ((brand_data = BROP(p)->b_lwpdata_alloc(p)) == NULL) { + mutex_enter(&p->p_lock); + err = 1; + atomic_inc_32(&p->p_zone->zone_ffmisc); + goto error; + } + } + mutex_enter(&p->p_lock); grow: /* @@ -630,18 +645,6 @@ grow: } while (lwp_hash_lookup(p, t->t_tid) != NULL); } - /* - * If this is a branded process, let the brand do any necessary lwp - * initialization. 
- */ - if (PROC_IS_BRANDED(p)) { - if (BROP(p)->b_initlwp(lwp)) { - err = 1; - atomic_inc_32(&p->p_zone->zone_ffmisc); - goto error; - } - branded = 1; - } if (t->t_tid == 1) { kpreempt_disable(); @@ -654,7 +657,6 @@ grow: } } - p->p_lwpcnt++; t->t_waitfor = -1; /* @@ -696,8 +698,27 @@ grow: t->t_post_sys = 1; /* + * Perform lwp branding + * + * The b_initlwp hook is _not_ allowed to drop p->p_lock as it must be + * continuously held between when the tidhash is sized and when the lwp + * is inserted into it. Operations requiring p->p_lock to be + * temporarily dropped can be performed in b_initlwp_post. + */ + if (PROC_IS_BRANDED(p)) { + BROP(p)->b_initlwp(lwp, brand_data); + /* + * The b_initlwp hook is expected to consume any preallocated + * brand_data in a way that prepares it for deallocation by the + * b_freelwp hook. + */ + brand_data = NULL; + } + + /* * Insert the new thread into the list of all threads. */ + p->p_lwpcnt++; if ((tx = p->p_tlist) == NULL) { t->t_back = t; t->t_forw = t; @@ -718,6 +739,13 @@ grow: lep->le_start = t->t_start; lwp_hash_in(p, lep, p->p_tidhash, p->p_tidhash_sz, 1); + /* + * Complete lwp branding + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_initlwp_post != NULL) { + BROP(p)->b_initlwp_post(lwp); + } + if (state == TS_RUN) { /* * We set the new lwp running immediately. @@ -753,8 +781,9 @@ error: if (cid != NOCLASS && bufp != NULL) CL_FREE(cid, bufp); - if (branded) - BROP(p)->b_freelwp(lwp); + if (brand_data != NULL) { + BROP(p)->b_lwpdata_free(brand_data); + } mutex_exit(&p->p_lock); t->t_state = TS_FREE; @@ -827,8 +856,25 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src) int i; for (i = 0; i < ct_ntypes; i++) { - dst->lwp_ct_active[i] = ctmpl_dup(src->lwp_ct_active[i]); + ct_template_t *tmpl = src->lwp_ct_active[i]; + + /* + * If the process contract template is setup to be preserved + * across exec, then perform an implicit template_clear now + * since we're forking. 
This ensures that future children of + * this child will remain in the same contract unless they're + * explicitly setup differently. + */ + if (i == CTT_PROCESS && tmpl != NULL) { + ctmpl_process_t *ctp = tmpl->ctmpl_data; + + if ((ctp->ctp_params & CT_PR_KEEP_EXEC) != 0) + tmpl = NULL; + } + + dst->lwp_ct_active[i] = ctmpl_dup(tmpl); dst->lwp_ct_latest[i] = NULL; + } } @@ -836,21 +882,33 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src) * Clear an LWP's contract template state. */ void -lwp_ctmpl_clear(klwp_t *lwp) +lwp_ctmpl_clear(klwp_t *lwp, boolean_t is_exec) { ct_template_t *tmpl; int i; for (i = 0; i < ct_ntypes; i++) { - if ((tmpl = lwp->lwp_ct_active[i]) != NULL) { - ctmpl_free(tmpl); - lwp->lwp_ct_active[i] = NULL; - } - if (lwp->lwp_ct_latest[i] != NULL) { contract_rele(lwp->lwp_ct_latest[i]); lwp->lwp_ct_latest[i] = NULL; } + + if ((tmpl = lwp->lwp_ct_active[i]) != NULL) { + /* + * If we're exec-ing a new program and the process + * contract template is setup to be preserved across + * exec, then don't clear it. + */ + if (is_exec && i == CTT_PROCESS) { + ctmpl_process_t *ctp = tmpl->ctmpl_data; + + if ((ctp->ctp_params & CT_PR_KEEP_EXEC) != 0) + continue; + } + + ctmpl_free(tmpl); + lwp->lwp_ct_active[i] = NULL; + } } } @@ -891,13 +949,6 @@ lwp_exit(void) if (t->t_upimutex != NULL) upimutex_cleanup(); - /* - * Perform any brand specific exit processing, then release any - * brand data associated with the lwp - */ - if (PROC_IS_BRANDED(p)) - BROP(p)->b_lwpexit(lwp); - lwp_pcb_exit(); mutex_enter(&p->p_lock); @@ -941,6 +992,18 @@ lwp_exit(void) DTRACE_PROC(lwp__exit); /* + * Perform any brand specific exit processing, then release any + * brand data associated with the lwp + */ + if (PROC_IS_BRANDED(p)) { + mutex_exit(&p->p_lock); + BROP(p)->b_lwpexit(lwp); + BROP(p)->b_freelwp(lwp); + mutex_enter(&p->p_lock); + prbarrier(p); + } + + /* * If the lwp is a detached lwp or if the process is exiting, * remove (lwp_hash_out()) the lwp from the lwp directory. 
* Otherwise null out the lwp's le_thread pointer in the lwp @@ -1101,7 +1164,7 @@ lwp_cleanup(void) } kpreempt_enable(); - lwp_ctmpl_clear(ttolwp(t)); + lwp_ctmpl_clear(ttolwp(t), B_FALSE); } int diff --git a/usr/src/uts/common/os/main.c b/usr/src/uts/common/os/main.c index 7afc1cfe00..dda0b3e4a6 100644 --- a/usr/src/uts/common/os/main.c +++ b/usr/src/uts/common/os/main.c @@ -27,7 +27,7 @@ /* All Rights Reserved */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #include <sys/types.h> @@ -157,7 +157,7 @@ exec_init(const char *initpath, const char *args) int error = 0, count = 0; proc_t *p = ttoproc(curthread); klwp_t *lwp = ttolwp(curthread); - int brand_action; + int brand_action = EBA_NONE; if (args == NULL) args = ""; @@ -268,7 +268,15 @@ exec_init(const char *initpath, const char *args) */ sigemptyset(&curthread->t_hold); - brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE; + /* + * Only instruct exec_common to brand the process if necessary. It is + * possible that the init process is already properly branded due to the + * proc_exit -> restart_init -> exec_init call chain. + */ + if (ZONE_IS_BRANDED(p->p_zone) && + p->p_brand != p->p_zone->zone_brand) { + brand_action = EBA_BRAND; + } again: error = exec_common((const char *)(uintptr_t)exec_fnamep, (const char **)(uintptr_t)uap, NULL, brand_action); diff --git a/usr/src/uts/common/os/msacct.c b/usr/src/uts/common/os/msacct.c index 928c6b3bb4..66994321f7 100644 --- a/usr/src/uts/common/os/msacct.c +++ b/usr/src/uts/common/os/msacct.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. 
*/ #include <sys/types.h> @@ -566,27 +567,18 @@ cpu_update_pct(kthread_t *t, hrtime_t newtime) */ do { - if (T_ONPROC(t) && t->t_waitrq == 0) { - hrlb = t->t_hrtime; + pctcpu = t->t_pctcpu; + hrlb = t->t_hrtime; + delta = newtime - hrlb; + if (delta < 0) { + newtime = gethrtime_unscaled(); delta = newtime - hrlb; - if (delta < 0) { - newtime = gethrtime_unscaled(); - delta = newtime - hrlb; - } - t->t_hrtime = newtime; - scalehrtime(&delta); - pctcpu = t->t_pctcpu; + } + t->t_hrtime = newtime; + scalehrtime(&delta); + if (T_ONPROC(t) && t->t_waitrq == 0) { npctcpu = cpu_grow(pctcpu, delta); } else { - hrlb = t->t_hrtime; - delta = newtime - hrlb; - if (delta < 0) { - newtime = gethrtime_unscaled(); - delta = newtime - hrlb; - } - t->t_hrtime = newtime; - scalehrtime(&delta); - pctcpu = t->t_pctcpu; npctcpu = cpu_decay(pctcpu, delta); } } while (atomic_cas_32(&t->t_pctcpu, pctcpu, npctcpu) != pctcpu); diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c index b555bb82b7..eba6147fab 100644 --- a/usr/src/uts/common/os/pid.c +++ b/usr/src/uts/common/os/pid.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -112,6 +113,18 @@ pid_lookup(pid_t pid) return (pidp); } +struct pid * +pid_find(pid_t pid) +{ + struct pid *pidp; + + mutex_enter(&pidlinklock); + pidp = pid_lookup(pid); + mutex_exit(&pidlinklock); + + return (pidp); +} + void pid_setmin(void) { @@ -522,6 +535,20 @@ sprunlock(proc_t *p) THREAD_KPRI_RELEASE(); } +/* + * Undo effects of sprlock but without dropping p->p_lock + */ +void +sprunprlock(proc_t *p) +{ + ASSERT(p->p_proc_flag & P_PR_LOCK); + ASSERT(MUTEX_HELD(&p->p_lock)); + + cv_signal(&pr_pid_cv[p->p_slot]); + p->p_proc_flag &= ~P_PR_LOCK; + THREAD_KPRI_RELEASE(); +} + void pid_init(void) { diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c index 07bc2920da..d2bdb4ce37 100644 --- a/usr/src/uts/common/os/policy.c +++ b/usr/src/uts/common/os/policy.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/types.h> @@ -55,6 +55,7 @@ #include <sys/mntent.h> #include <sys/contract_impl.h> #include <sys/dld_ioc.h> +#include <sys/brand.h> /* * There are two possible layers of privilege routines and two possible @@ -1243,6 +1244,22 @@ secpolicy_vnode_owner(const cred_t *cr, uid_t owner) void secpolicy_setid_clear(vattr_t *vap, cred_t *cr) { + proc_t *p = curproc; + + /* + * Allow the brand to override this behaviour. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_setid_clear != NULL) { + /* + * This brand hook will return 0 if handling is complete, or + * some other value if the brand would like us to fall back to + * the usual behaviour. 
+ */ + if (BROP(p)->b_setid_clear(vap, cr) == 0) { + return; + } + } + if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(cr, (vap->va_mode & S_ISUID) != 0 && @@ -2078,6 +2095,13 @@ secpolicy_meminfo(const cred_t *cr) } int +secpolicy_fs_import(const cred_t *cr) +{ + return (PRIV_POLICY(cr, PRIV_SYS_FS_IMPORT, B_FALSE, EPERM, NULL)); +} + + +int secpolicy_pfexec_register(const cred_t *cr) { return (PRIV_POLICY(cr, PRIV_SYS_ADMIN, B_TRUE, EPERM, NULL)); @@ -2581,3 +2605,11 @@ secpolicy_ppp_config(const cred_t *cr) return (secpolicy_net_config(cr, B_FALSE)); return (PRIV_POLICY(cr, PRIV_SYS_PPP_CONFIG, B_FALSE, EPERM, NULL)); } + +int +secpolicy_hyprlofs_control(const cred_t *cr) +{ + if (PRIV_POLICY(cr, PRIV_HYPRLOFS_CONTROL, B_FALSE, EPERM, NULL)) + return (EPERM); + return (0); +} diff --git a/usr/src/uts/common/os/priv_defs b/usr/src/uts/common/os/priv_defs index a3cdaccc2a..cc1c5e03a6 100644 --- a/usr/src/uts/common/os/priv_defs +++ b/usr/src/uts/common/os/priv_defs @@ -177,6 +177,10 @@ privilege PRIV_GRAPHICS_MAP Allows a process to perform privileged mappings through a graphics device. +privilege PRIV_HYPRLOFS_CONTROL + + Allows a process to manage hyprlofs entries. + privilege PRIV_IPC_DAC_READ Allows a process to read a System V IPC @@ -372,6 +376,10 @@ privilege PRIV_SYS_DEVICES Allows a process to open the real console device directly. Allows a process to open devices that have been exclusively opened. +privilege PRIV_SYS_FS_IMPORT + + Allows a process to import a potentially untrusted file system. + privilege PRIV_SYS_IPC_CONFIG Allows a process to increase the size of a System V IPC Message diff --git a/usr/src/uts/common/os/sched.c b/usr/src/uts/common/os/sched.c index c1d6569f11..15e77d39f7 100644 --- a/usr/src/uts/common/os/sched.c +++ b/usr/src/uts/common/os/sched.c @@ -27,6 +27,10 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ +/* + * Copyright (c) 2015, Joyent, Inc. All rights reserved. 
+ */ + #include <sys/param.h> #include <sys/types.h> #include <sys/sysmacros.h> @@ -646,16 +650,17 @@ top: klwp_t *lwp = ttolwp(tp); /* - * Swapout eligible lwps (specified by the scheduling - * class) which don't have TS_DONT_SWAP set. Set the - * "intent to swap" flag (TS_SWAPENQ) on threads - * which have TS_DONT_SWAP set so that they can be + * Swapout eligible lwps (specified by the scheduling class) + * which don't have TS_DONT_SWAP set. Set the "intent to swap" + * flag (TS_SWAPENQ) on threads which have either TS_DONT_SWAP + * set or are currently on a split stack so that they can be * swapped if and when they reach a safe point. */ thread_lock(tp); thread_pri = CL_SWAPOUT(tp, swapflags); if (thread_pri != -1) { - if (tp->t_schedflag & TS_DONT_SWAP) { + if ((tp->t_schedflag & TS_DONT_SWAP) || + (tp->t_flag & T_SPLITSTK)) { tp->t_schedflag |= TS_SWAPENQ; tp->t_trapret = 1; aston(tp); diff --git a/usr/src/uts/common/os/sig.c b/usr/src/uts/common/os/sig.c index 453b1f22d4..5ef12f3ae4 100644 --- a/usr/src/uts/common/os/sig.c +++ b/usr/src/uts/common/os/sig.c @@ -22,7 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -60,6 +60,7 @@ #include <sys/cyclic.h> #include <sys/dtrace.h> #include <sys/sdt.h> +#include <sys/brand.h> #include <sys/signalfd.h> const k_sigset_t nullsmask = {0, 0, 0}; @@ -148,6 +149,21 @@ signal_is_blocked(kthread_t *t, int sig) } /* + * Return true if the signal can safely be ignored. + * That is, if the signal is included in the p_ignore mask and doing so is not + * forbidden by any process branding. 
+ */ +static int +sig_ignorable(proc_t *p, klwp_t *lwp, int sig) +{ + return (sigismember(&p->p_ignore, sig) && /* sig in ignore mask */ + !(PROC_IS_BRANDED(p) && /* allowed by brand */ + BROP(p)->b_sig_ignorable != NULL && + BROP(p)->b_sig_ignorable(p, lwp, sig) == B_FALSE)); + +} + +/* * Return true if the signal can safely be discarded on generation. * That is, if there is no need for the signal on the receiving end. * The answer is true if the process is a zombie or @@ -159,12 +175,13 @@ signal_is_blocked(kthread_t *t, int sig) * the signal is not being accepted via sigwait() */ static int -sig_discardable(proc_t *p, int sig) +sig_discardable(proc_t *p, kthread_t *tp, int sig) { kthread_t *t = p->p_tlist; + klwp_t *lwp = (tp == NULL) ? NULL : tp->t_lwp; return (t == NULL || /* if zombie or ... */ - (sigismember(&p->p_ignore, sig) && /* signal is ignored */ + (sig_ignorable(p, lwp, sig) && /* signal is ignored */ t->t_forw == t && /* and single-threaded */ !tracing(p, sig) && /* and no /proc tracing */ !signal_is_blocked(t, sig) && /* and signal not blocked */ @@ -200,7 +217,7 @@ eat_signal(kthread_t *t, int sig) !(ttoproc(t)->p_proc_flag & P_PR_LOCK)) { ttoproc(t)->p_stopsig = 0; t->t_dtrace_stop = 0; - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; setrun_locked(t); } else if (t != curthread && t->t_state == TS_ONPROC) { aston(t); /* make it do issig promptly */ @@ -297,7 +314,7 @@ sigtoproc(proc_t *p, kthread_t *t, int sig) } } - if (sig_discardable(p, sig)) { + if (sig_discardable(p, t, sig)) { DTRACE_PROC3(signal__discard, kthread_t *, p->p_tlist, proc_t *, p, int, sig); return; @@ -497,7 +514,7 @@ issig_justlooking(void) if (sigismember(&set, sig) && (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig))) { + !sig_ignorable(p, lwp, sig))) { /* * Don't promote a signal that will stop * the process when lwp_nostop is set. 
@@ -623,6 +640,21 @@ issig_forreal(void) } /* + * Allow the brand the chance to alter (or suppress) delivery + * of this signal. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_issig_stop != NULL) { + /* + * The brand hook will return 0 if it would like + * us to drive on, or -1 if we should restart + * the loop to check other conditions. + */ + if (BROP(p)->b_issig_stop(p, lwp) != 0) { + continue; + } + } + + /* * Honor requested stop before dealing with the * current signal; a debugger may change it. * Do not want to go back to loop here since this is a special @@ -656,7 +688,7 @@ issig_forreal(void) lwp->lwp_cursig = 0; lwp->lwp_extsig = 0; if (sigismember(&t->t_sigwait, sig) || - (!sigismember(&p->p_ignore, sig) && + (!sig_ignorable(p, lwp, sig) && !isjobstop(sig))) { if (p->p_flag & (SEXITLWPS|SKILLED)) { sig = SIGKILL; @@ -708,7 +740,7 @@ issig_forreal(void) toproc = 0; if (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig)) { + !sig_ignorable(p, lwp, sig)) { if (sigismember(&t->t_extsig, sig)) ext = 1; break; @@ -722,7 +754,7 @@ issig_forreal(void) toproc = 1; if (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig)) { + !sig_ignorable(p, lwp, sig)) { if (sigismember(&p->p_extsig, sig)) ext = 1; break; @@ -954,6 +986,16 @@ stop(int why, int what) } break; + case PR_BRAND: + /* + * We have been stopped by the brand code for a brand-private + * reason. This is an asynchronous stop affecting only this + * LWP. 
+ */ + VERIFY(PROC_IS_BRANDED(p)); + flags &= ~TS_BSTART; + break; + default: /* /proc stop */ flags &= ~TS_PSTART; /* @@ -1065,7 +1107,7 @@ stop(int why, int what) } } - if (why != PR_JOBCONTROL && why != PR_CHECKPOINT) { + if (why != PR_JOBCONTROL && why != PR_CHECKPOINT && why != PR_BRAND) { /* * Do process-level notification when all lwps are * either stopped on events of interest to /proc @@ -1171,6 +1213,13 @@ stop(int why, int what) if (why == PR_CHECKPOINT) del_one_utstop(); + /* + * Allow the brand to post notification of this stop condition. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_stop_notify != NULL) { + BROP(p)->b_stop_notify(p, lwp, why, what); + } + thread_lock(t); ASSERT((t->t_schedflag & TS_ALLSTART) == 0); t->t_schedflag |= flags; @@ -1192,7 +1241,7 @@ stop(int why, int what) (p->p_flag & (SEXITLWPS|SKILLED))) { p->p_stopsig = 0; thread_lock(t); - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; setrun_locked(t); thread_unlock_nopreempt(t); } else if (why == PR_JOBCONTROL) { @@ -1327,7 +1376,7 @@ psig(void) * this signal from pending to current (we dropped p->p_lock). * This can happen only in a multi-threaded process. */ - if (sigismember(&p->p_ignore, sig) || + if (sig_ignorable(p, lwp, sig) || (func == SIG_DFL && sigismember(&stopdefault, sig))) { lwp->lwp_cursig = 0; lwp->lwp_extsig = 0; @@ -1771,9 +1820,12 @@ post_sigcld(proc_t *cp, sigqueue_t *sqp) /* * This can only happen when the parent is init. * (See call to sigcld(q, NULL) in exit().) - * Use KM_NOSLEEP to avoid deadlock. + * Use KM_NOSLEEP to avoid deadlock. The child procs + * initpid can be 1 for zlogin. 
*/ - ASSERT(pp == proc_init); + ASSERT(pp->p_pidp->pid_id == + cp->p_zone->zone_proc_initpid || + pp->p_pidp->pid_id == 1); winfo(cp, &info, 0); sigaddq(pp, NULL, &info, KM_NOSLEEP); } else { @@ -1804,6 +1856,15 @@ sigcld_repost() sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); mutex_enter(&pidlock); + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_sigcld_repost != NULL) { + /* + * Allow the brand to inject synthetic SIGCLD signals. + */ + if (BROP(pp)->b_sigcld_repost(pp, sqp) == 0) { + mutex_exit(&pidlock); + return; + } + } for (cp = pp->p_child; cp; cp = cp->p_sibling) { if (cp->p_pidflag & CLDPEND) { post_sigcld(cp, sqp); @@ -2115,7 +2176,7 @@ sigaddqa(proc_t *p, kthread_t *t, sigqueue_t *sigqp) ASSERT(MUTEX_HELD(&p->p_lock)); ASSERT(sig >= 1 && sig < NSIG); - if (sig_discardable(p, sig)) + if (sig_discardable(p, t, sig)) siginfofree(sigqp); else sigaddqins(p, t, sigqp); @@ -2141,7 +2202,7 @@ sigaddq(proc_t *p, kthread_t *t, k_siginfo_t *infop, int km_flags) * blocking the signal (it *could* change it's mind while * the signal is pending) then don't bother creating one. */ - if (!sig_discardable(p, sig) && + if (!sig_discardable(p, t, sig) && (sigismember(&p->p_siginfo, sig) || (curproc->p_ct_process != p->p_ct_process) || (sig == SIGCLD && SI_FROMKERNEL(infop))) && diff --git a/usr/src/uts/common/os/smb_subr.c b/usr/src/uts/common/os/smb_subr.c index 6084676b17..6dc7230bed 100644 --- a/usr/src/uts/common/os/smb_subr.c +++ b/usr/src/uts/common/os/smb_subr.c @@ -25,7 +25,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2015 Joyent, Inc. All rights reserved. + */ #include <sys/smbios_impl.h> #include <sys/cmn_err.h> @@ -43,13 +45,13 @@ smb_strerror(int err) void * smb_alloc(size_t len) { - return (kmem_alloc(len, KM_SLEEP)); + return (len > 0 ? kmem_alloc(len, KM_SLEEP) : NULL); } void * smb_zalloc(size_t len) { - return (kmem_zalloc(len, KM_SLEEP)); + return (len > 0 ? 
kmem_zalloc(len, KM_SLEEP) : NULL); } void diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c index 62f94729cf..0a1406e0cd 100644 --- a/usr/src/uts/common/os/streamio.c +++ b/usr/src/uts/common/os/streamio.c @@ -24,7 +24,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -77,6 +77,7 @@ #include <sys/policy.h> #include <sys/dld.h> #include <sys/zone.h> +#include <sys/limits.h> #include <c2/audit.h> /* @@ -985,12 +986,20 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, * (registered in sd_wakeq). */ struiod_t uiod; + struct iovec buf[IOV_MAX_STACK]; + int iovlen = 0; if (first) stp->sd_wakeq &= ~RSLEEP; - (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, - sizeof (uiod.d_iov) / sizeof (*uiod.d_iov)); + if (uiop->uio_iovcnt > IOV_MAX_STACK) { + iovlen = uiop->uio_iovcnt * sizeof (iovec_t); + uiod.d_iov = kmem_alloc(iovlen, KM_SLEEP); + } else { + uiod.d_iov = buf; + } + + (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt); uiod.d_mp = 0; /* * Mark that a thread is in rwnext on the read side @@ -1029,6 +1038,8 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, if ((bp = uiod.d_mp) != NULL) { *errorp = 0; ASSERT(MUTEX_HELD(&stp->sd_lock)); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (bp); } error = 0; @@ -1048,8 +1059,14 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, } else { *errorp = error; ASSERT(MUTEX_HELD(&stp->sd_lock)); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (NULL); } + + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); + /* * Try a getq in case a rwnext() generated mblk * has bubbled up via strrput(). 
@@ -2544,6 +2561,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, int b_flag, int pri, int flags) { struiod_t uiod; + struct iovec buf[IOV_MAX_STACK]; + int iovlen = 0; mblk_t *mp; queue_t *wqp = stp->sd_wrq; int error = 0; @@ -2635,13 +2654,21 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, mp->b_flag |= b_flag; mp->b_band = (uchar_t)pri; - (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, - sizeof (uiod.d_iov) / sizeof (*uiod.d_iov)); + if (uiop->uio_iovcnt > IOV_MAX_STACK) { + iovlen = uiop->uio_iovcnt * sizeof (iovec_t); + uiod.d_iov = (struct iovec *)kmem_alloc(iovlen, KM_SLEEP); + } else { + uiod.d_iov = buf; + } + + (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt); uiod.d_uio.uio_offset = 0; uiod.d_mp = mp; error = rwnext(wqp, &uiod); if (! uiod.d_mp) { uioskip(uiop, *iosize); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (error); } ASSERT(mp == uiod.d_mp); @@ -2659,17 +2686,23 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, error = 0; } else { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (error); } /* Have to check canput before consuming data from the uio */ if (pri == 0) { if (!canputnext(wqp) && !(flags & MSG_IGNFLOW)) { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (EWOULDBLOCK); } } else { if (!bcanputnext(wqp, pri) && !(flags & MSG_IGNFLOW)) { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (EWOULDBLOCK); } } @@ -2677,6 +2710,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, /* Copyin data from the uio */ if ((error = struioget(wqp, mp, &uiod, 0)) != 0) { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (error); } uioskip(uiop, *iosize); @@ -2693,6 +2728,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, putnext(wqp, mp); stream_runservice(stp); } + if (iovlen != 0) + 
kmem_free(uiod.d_iov, iovlen); return (0); } @@ -3178,6 +3215,7 @@ job_control_type(int cmd) case JAGENT: /* Obsolete */ case JTRUN: /* Obsolete */ case JXTPROTO: /* Obsolete */ + case TIOCSETLD: return (JCSETP); } diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c index 0d1bb6a8a1..aa44ccf788 100644 --- a/usr/src/uts/common/os/sysent.c +++ b/usr/src/uts/common/os/sysent.c @@ -1093,18 +1093,20 @@ char **syscallnames; systrace_sysent_t *systrace_sysent; void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); /*ARGSUSED*/ void systrace_stub(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, - uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, + uintptr_t arg6, uintptr_t arg7) {} /*ARGSUSED*/ int64_t dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, + uintptr_t arg7) { systrace_sysent_t *sy = &systrace_sysent[curthread->t_sysnum]; dtrace_id_t id; @@ -1112,7 +1114,8 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, proc_t *p; if ((id = sy->stsy_entry) != DTRACE_IDNONE) - (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5); + (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, + arg6, arg7); /* * We want to explicitly allow DTrace consumers to stop a process @@ -1126,14 +1129,15 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, } mutex_exit(&p->p_lock); - rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); + rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, + arg6, arg7); if (ttolwp(curthread)->lwp_errno != 0) rval = -1; if ((id = sy->stsy_return) != DTRACE_IDNONE) (*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, - 
(uintptr_t)((int64_t)rval >> 32), 0, 0, 0); + (uintptr_t)((int64_t)rval >> 32), 0, 0, 0, 0, 0); return (rval); } @@ -1145,7 +1149,8 @@ systrace_sysent_t *systrace_sysent32; /*ARGSUSED*/ int64_t dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, + uintptr_t arg7) { systrace_sysent_t *sy = &systrace_sysent32[curthread->t_sysnum]; dtrace_id_t id; @@ -1153,7 +1158,8 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, proc_t *p; if ((id = sy->stsy_entry) != DTRACE_IDNONE) - (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5); + (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, arg6, + arg7); /* * We want to explicitly allow DTrace consumers to stop a process @@ -1167,14 +1173,15 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, } mutex_exit(&p->p_lock); - rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); + rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, arg6, + arg7); if (ttolwp(curthread)->lwp_errno != 0) rval = -1; if ((id = sy->stsy_return) != DTRACE_IDNONE) (*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, - (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0); + (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0, 0, 0); return (rval); } @@ -1202,5 +1209,5 @@ dtrace_systrace_rtt(void) } if ((id = sy->stsy_return) != DTRACE_IDNONE) - (*systrace_probe)(id, 0, 0, 0, 0, 0, 0); + (*systrace_probe)(id, 0, 0, 0, 0, 0, 0, 0, 0); } diff --git a/usr/src/uts/common/os/vmem.c b/usr/src/uts/common/os/vmem.c index a554f8c3f3..0a6fe0ef96 100644 --- a/usr/src/uts/common/os/vmem.c +++ b/usr/src/uts/common/os/vmem.c @@ -1618,7 +1618,7 @@ vmem_destroy(vmem_t *vmp) leaked = vmem_size(vmp, VMEM_ALLOC); if (leaked != 0) - cmn_err(CE_WARN, "vmem_destroy('%s'): leaked %lu %s", + cmn_err(CE_WARN, "!vmem_destroy('%s'): leaked %lu %s", vmp->vm_name, leaked, 
(vmp->vm_cflags & VMC_IDENTIFIER) ? "identifiers" : "bytes"); diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index 706e5ed16f..072b0038f1 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent Inc. All rights reserved. + * Copyright 2015, Joyent Inc. All rights reserved. */ /* @@ -250,6 +250,8 @@ #include <sys/cpucaps.h> #include <vm/seg.h> #include <sys/mac.h> +#include <sys/rt.h> +#include <sys/fx.h> /* * This constant specifies the number of seconds that threads waiting for @@ -370,8 +372,12 @@ static char *zone_ref_subsys_names[] = { rctl_hndl_t rc_zone_cpu_shares; rctl_hndl_t rc_zone_locked_mem; rctl_hndl_t rc_zone_max_swap; +rctl_hndl_t rc_zone_phys_mem; rctl_hndl_t rc_zone_max_lofi; rctl_hndl_t rc_zone_cpu_cap; +rctl_hndl_t rc_zone_cpu_baseline; +rctl_hndl_t rc_zone_cpu_burst_time; +rctl_hndl_t rc_zone_zfs_io_pri; rctl_hndl_t rc_zone_nlwps; rctl_hndl_t rc_zone_nprocs; rctl_hndl_t rc_zone_shmmax; @@ -417,8 +423,9 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *, * Version 5 alters the zone_boot system call, and converts its old * bootargs parameter to be set by the zone_setattr API instead. * Version 6 adds the flag argument to zone_create. + * Version 7 adds the requested zoneid to zone_create. */ -static const int ZONE_SYSCALL_API_VERSION = 6; +static const int ZONE_SYSCALL_API_VERSION = 7; /* * Certain filesystems (such as NFS and autofs) need to know which zone @@ -1377,6 +1384,114 @@ static rctl_ops_t zone_cpu_cap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_cpu_base_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_base(p->p_zone)); +} + +/* + * The zone cpu base is used to set the baseline CPU for the zone + * so we can track when the zone is bursting. 
+ */ +/*ARGSUSED*/ +static int +zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_base(zone, nv)); +} + +static rctl_ops_t zone_cpu_base_ops = { + rcop_no_action, + zone_cpu_base_get, + zone_cpu_base_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t +zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_burst_time(p->p_zone)); +} + +/* + * The zone cpu burst time is used to set the amount of time CPU(s) can be + * bursting for the zone. + */ +/*ARGSUSED*/ +static int +zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_burst_time(zone, nv)); +} + +static rctl_ops_t zone_cpu_burst_time_ops = { + rcop_no_action, + zone_cpu_burst_time_get, + zone_cpu_burst_time_set, + rcop_no_test +}; + +/* + * zone.zfs-io-pri resource control support (IO priority). + */ +/*ARGSUSED*/ +static rctl_qty_t +zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (p->p_zone->zone_zfs_io_pri); +} + +/*ARGSUSED*/ +static int +zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + /* + * set priority to the new value. 
+ */ + zone->zone_zfs_io_pri = nv; + return (0); +} + +static rctl_ops_t zone_zfs_io_pri_ops = { + rcop_no_action, + zone_zfs_io_pri_get, + zone_zfs_io_pri_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_lwps_usage(rctl_t *r, proc_t *p) { rctl_qty_t nlwps; @@ -1671,6 +1786,39 @@ static rctl_ops_t zone_max_swap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_phys_mem_usage(rctl_t *rctl, struct proc *p) +{ + rctl_qty_t q; + zone_t *z = p->p_zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + /* No additional lock because not enforced in the kernel */ + q = z->zone_phys_mem; + return (q); +} + +/*ARGSUSED*/ +static int +zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + if (e->rcep_p.zone == NULL) + return (0); + e->rcep_p.zone->zone_phys_mem_ctl = nv; + return (0); +} + +static rctl_ops_t zone_phys_mem_ops = { + rcop_no_action, + zone_phys_mem_usage, + zone_phys_mem_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_max_lofi_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; @@ -1764,6 +1912,20 @@ zone_lockedmem_kstat_update(kstat_t *ksp, int rw) } static int +zone_physmem_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_kstat_t *zk = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zk->zk_usage.value.ui64 = zone->zone_phys_mem; + zk->zk_value.value.ui64 = zone->zone_phys_mem_ctl; + return (0); +} + +static int zone_nprocs_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; @@ -1792,7 +1954,7 @@ zone_swapresv_kstat_update(kstat_t *ksp, int rw) } static kstat_t * -zone_kstat_create_common(zone_t *zone, char *name, +zone_rctl_kstat_create_common(zone_t *zone, char *name, int (*updatefunc) (kstat_t *, int)) { kstat_t *ksp; @@ -1818,6 +1980,230 @@ zone_kstat_create_common(zone_t *zone, char *name, } static int +zone_vfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = 
ksp->ks_private; + zone_vfs_kstat_t *zvp = ksp->ks_data; + kstat_io_t *kiop = &zone->zone_vfs_rwstats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Extract the VFS statistics from the kstat_io_t structure used by + * kstat_runq_enter() and related functions. Since the slow ops + * counters are updated directly by the VFS layer, there's no need to + * copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. + */ + zvp->zv_nread.value.ui64 = kiop->nread; + zvp->zv_reads.value.ui64 = kiop->reads; + zvp->zv_rtime.value.ui64 = kiop->rtime; + zvp->zv_rlentime.value.ui64 = kiop->rlentime; + zvp->zv_nwritten.value.ui64 = kiop->nwritten; + zvp->zv_writes.value.ui64 = kiop->writes; + zvp->zv_wtime.value.ui64 = kiop->wtime; + zvp->zv_wlentime.value.ui64 = kiop->wlentime; + + scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_vfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_vfs_kstat_t *zvp; + + if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id, + zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED, + sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_vfs_lock; + zone->zone_vfs_stats = zvp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zvp->zv_zonename, zone->zone_name); + kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64); + 
kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_vfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int +zone_zfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_zfs_kstat_t *zzp = ksp->ks_data; + kstat_io_t *kiop = &zone->zone_zfs_rwstats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Extract the ZFS statistics from the kstat_io_t structure used by + * kstat_runq_enter() and related functions. Since the I/O throttle + * counters are updated directly by the ZFS layer, there's no need to + * copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. 
+ */ + zzp->zz_nread.value.ui64 = kiop->nread; + zzp->zz_reads.value.ui64 = kiop->reads; + zzp->zz_rtime.value.ui64 = kiop->rtime; + zzp->zz_rlentime.value.ui64 = kiop->rlentime; + zzp->zz_nwritten.value.ui64 = kiop->nwritten; + zzp->zz_writes.value.ui64 = kiop->writes; + + scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64); + scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_zfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_zfs_kstat_t *zzp; + + if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id, + zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED, + sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_zfs_lock; + zone->zone_zfs_stats = zzp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zzp->zz_zonename, zone->zone_name); + kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_zfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int +zone_mcap_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_mcap_kstat_t *zmp = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zmp->zm_rss.value.ui64 = 
zone->zone_phys_mem; + zmp->zm_phys_cap.value.ui64 = zone->zone_phys_mem_ctl; + zmp->zm_swap.value.ui64 = zone->zone_max_swap; + zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl; + zmp->zm_nover.value.ui64 = zone->zone_mcap_nover; + zmp->zm_pagedout.value.ui64 = zone->zone_mcap_pagedout; + zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin; + zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin; + zmp->zm_execpgin.value.ui64 = zone->zone_execpgin; + zmp->zm_fspgin.value.ui64 = zone->zone_fspgin; + zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail; + zmp->zm_pf_throttle.value.ui64 = zone->zone_pf_throttle; + zmp->zm_pf_throttle_usec.value.ui64 = zone->zone_pf_throttle_usec; + + return (0); +} + +static kstat_t * +zone_mcap_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_mcap_kstat_t *zmp; + + if ((ksp = kstat_create_zone("memory_cap", zone->zone_id, + zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED, + sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_mcap_lock; + zone->zone_mcap_stats = zmp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zmp->zm_zonename, zone->zone_name); + kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64); + 
kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail", + KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle", + KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec", + KSTAT_DATA_UINT64); + + ksp->ks_update = zone_mcap_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int zone_misc_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; @@ -1846,6 +2232,11 @@ zone_misc_kstat_update(kstat_t *ksp, int rw) zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem; zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc; + zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp; + + zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid; + zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time; + return (0); } @@ -1884,7 +2275,10 @@ zone_misc_kstat_create(zone_t *zone) KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32); - + kstat_named_init(&zmp->zm_nested_intp, "nested_interp", + KSTAT_DATA_UINT32); + kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32); + kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64); ksp->ks_update = zone_misc_kstat_update; ksp->ks_private = zone; @@ -1896,13 +2290,30 @@ zone_misc_kstat_create(zone_t *zone) static void zone_kstat_create(zone_t *zone) { - zone->zone_lockedmem_kstat = zone_kstat_create_common(zone, + zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone, "lockedmem", zone_lockedmem_kstat_update); - zone->zone_swapresv_kstat = zone_kstat_create_common(zone, + zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone, "swapresv", zone_swapresv_kstat_update); - 
zone->zone_nprocs_kstat = zone_kstat_create_common(zone, + zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone, + "physicalmem", zone_physmem_kstat_update); + zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone, "nprocs", zone_nprocs_kstat_update); + if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) { + zone->zone_vfs_stats = kmem_zalloc( + sizeof (zone_vfs_kstat_t), KM_SLEEP); + } + + if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) { + zone->zone_zfs_stats = kmem_zalloc( + sizeof (zone_zfs_kstat_t), KM_SLEEP); + } + + if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) { + zone->zone_mcap_stats = kmem_zalloc( + sizeof (zone_mcap_kstat_t), KM_SLEEP); + } + if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) { zone->zone_misc_stats = kmem_zalloc( sizeof (zone_misc_kstat_t), KM_SLEEP); @@ -1929,8 +2340,17 @@ zone_kstat_delete(zone_t *zone) sizeof (zone_kstat_t)); zone_kstat_delete_common(&zone->zone_swapresv_kstat, sizeof (zone_kstat_t)); + zone_kstat_delete_common(&zone->zone_physmem_kstat, + sizeof (zone_kstat_t)); zone_kstat_delete_common(&zone->zone_nprocs_kstat, sizeof (zone_kstat_t)); + + zone_kstat_delete_common(&zone->zone_vfs_ksp, + sizeof (zone_vfs_kstat_t)); + zone_kstat_delete_common(&zone->zone_zfs_ksp, + sizeof (zone_zfs_kstat_t)); + zone_kstat_delete_common(&zone->zone_mcap_ksp, + sizeof (zone_mcap_kstat_t)); zone_kstat_delete_common(&zone->zone_misc_ksp, sizeof (zone_misc_kstat_t)); } @@ -1966,6 +2386,8 @@ zone_zsd_init(void) zone0.zone_locked_mem_ctl = UINT64_MAX; ASSERT(zone0.zone_max_swap == 0); zone0.zone_max_swap_ctl = UINT64_MAX; + zone0.zone_phys_mem = 0; + zone0.zone_phys_mem_ctl = UINT64_MAX; zone0.zone_max_lofi = 0; zone0.zone_max_lofi_ctl = UINT64_MAX; zone0.zone_shmmax = 0; @@ -1989,8 +2411,9 @@ zone_zsd_init(void) zone0.zone_initname = initname; zone0.zone_lockedmem_kstat = NULL; zone0.zone_swapresv_kstat = NULL; + zone0.zone_physmem_kstat = NULL; 
zone0.zone_nprocs_kstat = NULL; - + zone0.zone_zfs_io_pri = 1; zone0.zone_stime = 0; zone0.zone_utime = 0; zone0.zone_wtime = 0; @@ -2101,6 +2524,21 @@ zone_init(void) RCTL_GLOBAL_INFINITE, MAXCAP, MAXCAP, &zone_cpu_cap_ops); + rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + MAXCAP, MAXCAP, &zone_cpu_base_ops); + + rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + INT_MAX, INT_MAX, &zone_cpu_burst_time_ops); + + rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + 16384, 16384, &zone_zfs_io_pri_ops); + rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE, RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT, INT_MAX, INT_MAX, &zone_lwps_ops); @@ -2142,6 +2580,20 @@ zone_init(void) rde = rctl_dict_lookup("zone.cpu-shares"); (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + /* + * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach + * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority''. 
+ */ + dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); + bzero(dval, sizeof (rctl_val_t)); + dval->rcv_value = 1; + dval->rcv_privilege = RCPRIV_PRIVILEGED; + dval->rcv_flagaction = RCTL_LOCAL_NOACTION; + dval->rcv_action_recip_pid = -1; + + rde = rctl_dict_lookup("zone.zfs-io-priority"); + (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + rc_zone_locked_mem = rctl_register("zone.max-locked-memory", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2152,6 +2604,11 @@ zone_init(void) RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, &zone_max_swap_ops); + rc_zone_phys_mem = rctl_register("zone.max-physical-memory", + RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | + RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, + &zone_phys_mem_ops); + rc_zone_max_lofi = rctl_register("zone.max-lofi", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2173,6 +2630,8 @@ zone_init(void) zone0.zone_ntasks = 1; mutex_exit(&p0.p_lock); zone0.zone_restart_init = B_TRUE; + zone0.zone_reboot_on_init_exit = B_FALSE; + zone0.zone_init_status = -1; zone0.zone_brand = &native_brand; rctl_prealloc_destroy(gp); /* @@ -2252,6 +2711,8 @@ zone_init(void) static void zone_free(zone_t *zone) { + zone_dl_t *zdl; + ASSERT(zone != global_zone); ASSERT(zone->zone_ntasks == 0); ASSERT(zone->zone_nlwps == 0); @@ -2280,6 +2741,19 @@ zone_free(zone_t *zone) list_destroy(&zone->zone_ref_list); zone_free_zsd(zone); zone_free_datasets(zone); + + /* + * While dlmgmtd should have removed all of these, it could have left + * something behind or crashed. In which case it's not safe for us to + * assume that the list is empty which list_destroy() will ASSERT. We + * clean up for our userland comrades which may have crashed, or worse, + * been disabled by SMF. 
+ */ + while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) { + if (zdl->zdl_net != NULL) + nvlist_free(zdl->zdl_net); + kmem_free(zdl, sizeof (zone_dl_t)); + } list_destroy(&zone->zone_dl_list); if (zone->zone_rootvp != NULL) @@ -2324,12 +2798,18 @@ zone_free(zone_t *zone) static void zone_status_set(zone_t *zone, zone_status_t status) { + timestruc_t now; + uint64_t t; nvlist_t *nvl = NULL; ASSERT(MUTEX_HELD(&zone_status_lock)); ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE && status >= zone_status_get(zone)); + /* Current time since Jan 1 1970 but consumers expect NS */ + gethrestime(&now); + t = (now.tv_sec * NANOSEC) + now.tv_nsec; + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) || nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) || nvlist_add_string(nvl, ZONE_CB_NEWSTATE, @@ -2337,7 +2817,7 @@ zone_status_set(zone_t *zone, zone_status_t status) nvlist_add_string(nvl, ZONE_CB_OLDSTATE, zone_status_table[zone->zone_status]) || nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) || - nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) || + nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) || sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS, ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) { #ifdef DEBUG @@ -2463,14 +2943,65 @@ zone_set_initname(zone_t *zone, const char *zone_initname) return (0); } +/* + * The zone_set_mcap_nover and zone_set_mcap_pageout functions are used + * to provide the physical memory capping kstats. Since physical memory + * capping is currently implemented in userland, that code uses the setattr + * entry point to increment the kstats. We always simply increment nover + * every time that setattr is called and we always add in the input value + * to zone_mcap_pagedout every time that is called. 
+ */ +/*ARGSUSED*/ +static int -zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap) +zone_set_mcap_nover(zone_t *zone, const uint64_t *zone_nover) { - uint64_t mcap; - int err = 0; + zone->zone_mcap_nover++; + + return (0); +} + +static int +zone_set_mcap_pageout(zone_t *zone, const uint64_t *zone_pageout) +{ + uint64_t pageout; + int err; + + if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0) + zone->zone_mcap_pagedout += pageout; + + return (err); +} + +/* + * The zone_set_page_fault_delay function is used to set the number of usecs + * to throttle page faults. This is normally 0 but can be set to a non-0 value + * by the user-land memory capping code when the zone is over its physical + * memory cap. + */ +static int +zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay) +{ + uint32_t dusec; + int err; - if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0) - zone->zone_phys_mcap = mcap; + if ((err = copyin(pfdelay, &dusec, sizeof (uint32_t))) == 0) + zone->zone_pg_flt_delay = dusec; + + return (err); +} + +/* + * The zone_set_rss function is used to set the zone's RSS when we do the + * fast, approximate calculation in user-land. + */ +static int +zone_set_rss(zone_t *zone, const uint64_t *prss) +{ + uint64_t rss; + int err; + + if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0) + zone->zone_phys_mem = rss; return (err); } @@ -2882,6 +3413,12 @@ getzoneid(void) return (curproc->p_zone->zone_id); } +zoneid_t +getzonedid(void) +{ + return (curproc->p_zone->zone_did); +} + /* * Internal versions of zone_find_by_*(). These don't zone_hold() or * check the validity of a zone's state. 
@@ -3625,6 +4162,17 @@ zone_start_init(void) */ z->zone_proc_initpid = p->p_pid; + if (z->zone_setup_app_contract == B_TRUE) { + /* + * Normally a process cannot modify its own contract, but we're + * just starting the zone's init process and its contract is + * always initialized from the sys_process_tmpl template, so + * this is the simplest way to setup init's contract to kill + * the process if any other process in the contract exits. + */ + p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT; + } + /* * We maintain zone_boot_err so that we can return the cause of the * failure back to the caller of the zone_boot syscall. @@ -3653,9 +4201,54 @@ zone_start_init(void) lwp_exit(); } } else { + id_t cid = curthread->t_cid; + if (zone_status_get(z) == ZONE_IS_BOOTING) zone_status_set(z, ZONE_IS_RUNNING); mutex_exit(&zone_status_lock); + + mutex_enter(&class_lock); + ASSERT(cid < loaded_classes); + if (strcmp(sclass[cid].cl_name, "FX") == 0 && + z->zone_fixed_hipri) { + /* + * If the zone is using FX then by default all + * processes start at the lowest priority and stay + * there. We provide a mechanism for the zone to + * indicate that it should run at "high priority". In + * this case we setup init to run at the highest FX + * priority (which is one level higher than the + * non-fixed scheduling classes can use). + */ + pcparms_t pcparms; + + pcparms.pc_cid = cid; + ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim = + FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags = + FX_DOUPRILIM | FX_DOUPRI; + + mutex_enter(&pidlock); + mutex_enter(&curproc->p_lock); + + (void) parmsset(&pcparms, curthread); + + mutex_exit(&curproc->p_lock); + mutex_exit(&pidlock); + } else if (strcmp(sclass[cid].cl_name, "RT") == 0) { + /* + * zsched always starts the init lwp at priority + * minclsyspri - 1. This priority gets set in t_pri and + * is invalid for RT, but RT never uses t_pri. 
However + * t_pri is used by procfs, so we always see processes + * within an RT zone with an invalid priority value. + * We fix that up now. + */ + curthread->t_pri = RTGPPRIO0; + } + mutex_exit(&class_lock); + /* cause the process to return to userland. */ lwp_rtt(); } @@ -4139,8 +4732,9 @@ parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp) error = EINVAL; name = nvpair_name(nvp); - if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1) - != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { + if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 && + strncmp(name, "project.", sizeof ("project.") - 1) != 0) || + nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { goto out; } if ((hndl = rctl_hndl_lookup(name)) == -1) { @@ -4258,7 +4852,7 @@ zone_create(const char *zone_name, const char *zone_root, caddr_t rctlbuf, size_t rctlbufsz, caddr_t zfsbuf, size_t zfsbufsz, int *extended_error, int match, uint32_t doi, const bslabel_t *label, - int flags) + int flags, zoneid_t zone_did) { struct zsched_arg zarg; nvlist_t *rctls = NULL; @@ -4281,6 +4875,7 @@ zone_create(const char *zone_name, const char *zone_root, zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP); zoneid = zone->zone_id = id_alloc(zoneid_space); + zone->zone_did = zone_did; zone->zone_status = ZONE_IS_UNINITIALIZED; zone->zone_pool = pool_default; zone->zone_pool_mod = gethrtime(); @@ -4288,6 +4883,8 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_ncpus = 0; zone->zone_ncpus_online = 0; zone->zone_restart_init = B_TRUE; + zone->zone_reboot_on_init_exit = B_FALSE; + zone->zone_init_status = -1; zone->zone_brand = &native_brand; zone->zone_initname = NULL; mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); @@ -4349,10 +4946,14 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_locked_mem_ctl = UINT64_MAX; zone->zone_max_swap = 0; zone->zone_max_swap_ctl = UINT64_MAX; + zone->zone_phys_mem = 0; + zone->zone_phys_mem_ctl = UINT64_MAX; zone->zone_max_lofi = 
0; zone->zone_max_lofi_ctl = UINT64_MAX; - zone0.zone_lockedmem_kstat = NULL; - zone0.zone_swapresv_kstat = NULL; + zone->zone_lockedmem_kstat = NULL; + zone->zone_swapresv_kstat = NULL; + zone->zone_physmem_kstat = NULL; + zone->zone_zfs_io_pri = 1; /* * Zsched initializes the rctls. @@ -4509,8 +5110,8 @@ zone_create(const char *zone_name, const char *zone_root, /* * The process, task, and project rctls are probably wrong; * we need an interface to get the default values of all rctls, - * and initialize zsched appropriately. I'm not sure that that - * makes much of a difference, though. + * and initialize zsched appropriately. However, we allow zoneadmd + * to pass down both zone and project rctls for the zone's init. */ error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0); if (error != 0) { @@ -4651,6 +5252,7 @@ zone_boot(zoneid_t zoneid) static int zone_empty(zone_t *zone) { + int cnt = 0; int waitstatus; /* @@ -4661,7 +5263,16 @@ zone_empty(zone_t *zone) ASSERT(MUTEX_NOT_HELD(&zonehash_lock)); while ((waitstatus = zone_status_timedwait_sig(zone, ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) { - killall(zone->zone_id); + boolean_t force = B_FALSE; + + /* Every 30 seconds, try harder */ + if (cnt++ >= 30) { + cmn_err(CE_WARN, "attempt to force kill zone %d\n", + zone->zone_id); + force = B_TRUE; + cnt = 0; + } + killall(zone->zone_id, force); } /* * return EINTR if we were signaled @@ -5412,14 +6023,6 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) error = EFAULT; } break; - case ZONE_ATTR_PHYS_MCAP: - size = sizeof (zone->zone_phys_mcap); - if (bufsize > size) - bufsize = size; - if (buf != NULL && - copyout(&zone->zone_phys_mcap, buf, bufsize) != 0) - error = EFAULT; - break; case ZONE_ATTR_SCHED_CLASS: mutex_enter(&class_lock); @@ -5474,6 +6077,23 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) } kmem_free(zbuf, bufsize); break; + case ZONE_ATTR_DID: + size = sizeof (zoneid_t); + if (bufsize > size) + 
bufsize = size; + + if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0) + error = EFAULT; + break; + case ZONE_ATTR_SCHED_FIXEDHI: + size = sizeof (boolean_t); + if (bufsize > size) + bufsize = size; + + if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf, + bufsize) != 0) + error = EFAULT; + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { size = bufsize; @@ -5505,10 +6125,11 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) return (set_errno(EPERM)); /* - * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the - * global zone. + * Only the ZONE_ATTR_PMCAP_NOVER and ZONE_ATTR_PMCAP_PAGEOUT + * attributes can be set on the global zone. */ - if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) { + if (zoneid == GLOBAL_ZONEID && + attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT) { return (set_errno(EINVAL)); } @@ -5525,7 +6146,9 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) * non-global zones. 
*/ zone_status = zone_status_get(zone); - if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) { + if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT && + attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS && + zone_status > ZONE_IS_READY) { err = EINVAL; goto done; } @@ -5547,8 +6170,17 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) case ZONE_ATTR_FS_ALLOWED: err = zone_set_fs_allowed(zone, (const char *)buf); break; - case ZONE_ATTR_PHYS_MCAP: - err = zone_set_phys_mcap(zone, (const uint64_t *)buf); + case ZONE_ATTR_PMCAP_NOVER: + err = zone_set_mcap_nover(zone, (const uint64_t *)buf); + break; + case ZONE_ATTR_PMCAP_PAGEOUT: + err = zone_set_mcap_pageout(zone, (const uint64_t *)buf); + break; + case ZONE_ATTR_PG_FLT_DELAY: + err = zone_set_page_fault_delay(zone, (const uint32_t *)buf); + break; + case ZONE_ATTR_RSS: + err = zone_set_rss(zone, (const uint64_t *)buf); break; case ZONE_ATTR_SCHED_CLASS: err = zone_set_sched_class(zone, (const char *)buf); @@ -5577,6 +6209,22 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) err = zone_set_network(zoneid, zbuf); kmem_free(zbuf, bufsize); break; + case ZONE_ATTR_APP_SVC_CT: + if (bufsize != sizeof (boolean_t)) { + err = EINVAL; + } else { + zone->zone_setup_app_contract = (boolean_t)buf; + err = 0; + } + break; + case ZONE_ATTR_SCHED_FIXEDHI: + if (bufsize != sizeof (boolean_t)) { + err = EINVAL; + } else { + zone->zone_fixed_hipri = (boolean_t)buf; + err = 0; + } + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize); @@ -6269,6 +6917,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) zs.doi = zs32.doi; zs.label = (const bslabel_t *)(uintptr_t)zs32.label; zs.flags = zs32.flags; + zs.zoneid = zs32.zoneid; #else panic("get_udatamodel() returned bogus result\n"); #endif @@ -6279,7 +6928,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void 
*arg4) (caddr_t)zs.rctlbuf, zs.rctlbufsz, (caddr_t)zs.zfsbuf, zs.zfsbufsz, zs.extended_error, zs.match, zs.doi, - zs.label, zs.flags)); + zs.label, zs.flags, zs.zoneid)); case ZONE_BOOT: return (zone_boot((zoneid_t)(uintptr_t)arg1)); case ZONE_DESTROY: @@ -6380,6 +7029,7 @@ zone_ki_call_zoneadmd(struct zarg *zargp) bcopy(zone->zone_name, zone_name, zone_namelen); zoneid = zone->zone_id; uniqid = zone->zone_uniqid; + arg.status = zone->zone_init_status; /* * zoneadmd may be down, but at least we can empty out the zone. * We can ignore the return value of zone_empty() since we're called @@ -6557,7 +7207,7 @@ zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp) * zone_ki_call_zoneadmd() will do a more thorough job of this * later. */ - killall(zone->zone_id); + killall(zone->zone_id, B_FALSE); /* * Now, create the thread to contact zoneadmd and do the rest of the * work. This thread can't be created in our zone otherwise |
