Diffstat (limited to 'usr/src/uts/common/os')
37 files changed, 2231 insertions, 911 deletions
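The bulk of this changeset threads new, optional brand hooks through the core process-lifecycle paths: accounting (b_acct_out), exec (b_lwpdata_alloc, b_initlwp, b_freelwp), exit (b_proc_exit, b_exit_with_sig), waitid (b_waitid_helper, b_wait_filter), and 32-bit mapping limits (b_map32limit). A hedged sketch of the dispatch pattern used throughout; the hook name and the PROC_IS_BRANDED()/BROP() macros appear in the hunks below, but the surrounding function is illustrative only:

/*
 * Illustrative only: every hook added by this change is optional.
 * Callers test for a branded process and a non-NULL hook before
 * dispatching, and otherwise fall back to the native behavior,
 * exactly as acct() and waitid() do in the diffs below.
 */
static void
example_dispatch(proc_t *p, struct vnode *vp, int st)
{
	if (PROC_IS_BRANDED(p) && BROP(p)->b_acct_out != NULL) {
		/* brand-specific accounting; st is still unmasked here */
		BROP(p)->b_acct_out(vp, st);
		return;
	}

	st &= 0xff;	/* the native path keeps the traditional mask */
	/* ... write the native accounting record ... */
}

Because each hook is NULL-checked at its call site, brands that do not implement a given hook keep the native behavior unchanged.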
diff --git a/usr/src/uts/common/os/acct.c b/usr/src/uts/common/os/acct.c index e598e0d08d..891c4e0836 100644 --- a/usr/src/uts/common/os/acct.c +++ b/usr/src/uts/common/os/acct.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -47,6 +48,7 @@ #include <sys/time.h> #include <sys/msacct.h> #include <sys/zone.h> +#include <sys/brand.h> /* * Each zone has its own accounting settings (on or off) and associated @@ -373,7 +375,7 @@ acct_compress(ulong_t t) * On exit, write a record on the accounting file. */ void -acct(char st) +acct(int st) { struct vnode *vp; struct cred *cr; @@ -402,6 +404,21 @@ acct(char st) * This only gets called from exit after all lwp's have exited so no * cred locking is needed. */ + + /* If there is a brand-specific hook, use it instead */ + if (ZONE_IS_BRANDED(curzone) && ZBROP(curzone)->b_acct_out != NULL) { + ZBROP(curzone)->b_acct_out(vp, st); + mutex_exit(&ag->aclock); + return; + } + + /* + * The 'st' status value was traditionally masked this way by our + * caller, but we now accept the unmasked value for brand handling. + * Zones not using the brand hook mask the status here. + */ + st &= 0xff; + p = curproc; ua = PTOU(p); bcopy(ua->u_comm, ag->acctbuf.ac_comm, sizeof (ag->acctbuf.ac_comm)); diff --git a/usr/src/uts/common/os/brand.c b/usr/src/uts/common/os/brand.c index 0af67f5d98..62c3bbe2d6 100644 --- a/usr/src/uts/common/os/brand.c +++ b/usr/src/uts/common/os/brand.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016, Joyent, Inc. */ #include <sys/kmem.h> @@ -45,7 +46,7 @@ struct brand_mach_ops native_mach_ops = { }; #else /* !__sparcv9 */ struct brand_mach_ops native_mach_ops = { - NULL, NULL, NULL, NULL + NULL, NULL, NULL, NULL, NULL, NULL, NULL }; #endif /* !__sparcv9 */ @@ -53,7 +54,8 @@ brand_t native_brand = { BRAND_VER_1, "native", NULL, - &native_mach_ops + &native_mach_ops, + 0 }; /* @@ -310,46 +312,115 @@ brand_unregister_zone(struct brand *bp) mutex_exit(&brand_list_lock); } -void -brand_setbrand(proc_t *p) +int +brand_setbrand(proc_t *p, boolean_t lwps_ok) { brand_t *bp = p->p_zone->zone_brand; + void *brand_data = NULL; - ASSERT(bp != NULL); - ASSERT(p->p_brand == &native_brand); + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + VERIFY(bp != NULL); /* - * We should only be called from exec(), when we know the process - * is single-threaded. + * Process branding occurs during fork() and exec(). When it happens + * during fork(), the LWP count will always be 0 since branding is + * performed as part of getproc(), before LWPs have been associated. + * The same is not true during exec(), where a multi-LWP process may + * undergo branding just prior to gexec(). This is to ensure + * exec-related brand hooks are available. While it may seem + * complicated to brand a multi-LWP process, the two possible outcomes + * simplify things: + * + * 1. The exec() succeeds: LWPs besides the caller will be killed and + * any further branding will occur in a single-LWP context. + * 2. The exec() fails: The process will be promptly unbranded since + * the hooks are no longer needed. + * + * To prevent inconsistent brand state from being encountered during + * the exec(), LWPs beyond the caller which are associated with this + * process must be held temporarily. 
They will be released either when
+	 * they are killed upon exec() success, or when the brand is
+	 * cleared after exec() failure.
 	 */
-	ASSERT(p->p_tlist == p->p_tlist->t_forw);
+	if (lwps_ok) {
+		/*
+		 * We've been called from an exec() context, where tolerating
+		 * the existence of multiple LWPs during branding is
+		 * necessary.
+		 */
+		VERIFY(p == curproc);
+		VERIFY(p->p_tlist != NULL);
+		if (p->p_tlist != p->p_tlist->t_forw) {
+			/*
+			 * Multiple LWPs are present.  Hold all but the
+			 * caller.
+			 */
+			if (!holdlwps(SHOLDFORK1)) {
+				return (-1);
+			}
+		}
+	} else {
+		/*
+		 * Processes branded during fork() should not have LWPs at
+		 * all.
+		 */
+		VERIFY(p->p_tlist == NULL);
+	}
+
+	if (bp->b_data_size > 0) {
+		brand_data = kmem_zalloc(bp->b_data_size, KM_SLEEP);
+	}
+
+	mutex_enter(&p->p_lock);
+	ASSERT(!PROC_IS_BRANDED(p));
 	p->p_brand = bp;
+	p->p_brand_data = brand_data;
 	ASSERT(PROC_IS_BRANDED(p));
 	BROP(p)->b_setbrand(p);
+	mutex_exit(&p->p_lock);
+	return (0);
 }

 void
-brand_clearbrand(proc_t *p, boolean_t no_lwps)
+brand_clearbrand(proc_t *p, boolean_t lwps_ok)
 {
 	brand_t *bp = p->p_zone->zone_brand;
-	klwp_t *lwp = NULL;
-	ASSERT(bp != NULL);
-	ASSERT(!no_lwps || (p->p_tlist == NULL));
+	void *brand_data;

-	/*
-	 * If called from exec_common() or proc_exit(),
-	 * we know the process is single-threaded.
-	 * If called from fork_fail, p_tlist is NULL.
-	 */
-	if (!no_lwps) {
-		ASSERT(p->p_tlist == p->p_tlist->t_forw);
-		lwp = p->p_tlist->t_lwp;
-	}
+	VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+	VERIFY(bp != NULL);
+	VERIFY(PROC_IS_BRANDED(p));

-	ASSERT(PROC_IS_BRANDED(p));
-	BROP(p)->b_proc_exit(p, lwp);
+	if (BROP(p)->b_clearbrand != NULL)
+		BROP(p)->b_clearbrand(p, lwps_ok);
+
+	mutex_enter(&p->p_lock);
 	p->p_brand = &native_brand;
+	brand_data = p->p_brand_data;
+	p->p_brand_data = NULL;
+
+	if (lwps_ok) {
+		VERIFY(p == curproc);
+		/*
+		 * A process with multiple LWPs is being de-branded after
+		 * failing an exec.  The other LWPs were held as part of the
+		 * procedure, so they must be resumed now.
+		 */
+		if (p->p_tlist != NULL && p->p_tlist != p->p_tlist->t_forw) {
+			continuelwps(p);
+		}
+	} else {
+		/*
+		 * While clearing the brand, it's ok for one LWP to be
+		 * present.  This happens when a native binary is executed
+		 * inside a branded zone, since the brand will be removed
+		 * during the course of a successful exec.
+		 */
+		VERIFY(p->p_tlist == NULL || p->p_tlist == p->p_tlist->t_forw);
+	}
+	mutex_exit(&p->p_lock);
+
+	if (brand_data != NULL) {
+		kmem_free(brand_data, bp->b_data_size);
+	}
 }

 #if defined(__sparcv9)
@@ -483,7 +554,7 @@ brand_solaris_cmd(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3,
 		return (ENOSYS);

 	/* For all other operations this must be a branded process.
*/ - if (p->p_brand == &native_brand) + if (!PROC_IS_BRANDED(p)) return (ENOSYS); ASSERT(p->p_brand == pbrand); @@ -601,15 +672,15 @@ restoreexecenv(struct execenv *ep, stack_t *sp) int brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file, - cred_t *cred, int brand_action, struct brand *pbrand, char *bname, - char *brandlib, char *brandlib32, char *brandlinker, char *brandlinker32) + cred_t *cred, int *brand_action, struct brand *pbrand, char *bname, + char *brandlib, char *brandlib32) { vnode_t *nvp; Ehdr ehdr; Addr uphdr_vaddr; intptr_t voffset; - int interp; + char *interp; int i, err; struct execenv env; struct execenv origenv; @@ -619,7 +690,6 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, klwp_t *lwp = ttolwp(curthread); brand_proc_data_t *spd; brand_elf_data_t sed, *sedp; - char *linker; uintptr_t lddata; /* lddata of executable's linker */ ASSERT(curproc->p_brand == pbrand); @@ -636,12 +706,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, */ if (args->to_model == DATAMODEL_NATIVE) { args->emulator = brandlib; - linker = brandlinker; } #if defined(_LP64) else { args->emulator = brandlib32; - linker = brandlinker32; } #endif /* _LP64 */ @@ -725,7 +793,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, if (args->to_model == DATAMODEL_NATIVE) { err = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr, &voffset, exec_file, &interp, &env.ex_bssbase, - &env.ex_brkbase, &env.ex_brksize, NULL); + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); } #if defined(_LP64) else { @@ -733,7 +801,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, Elf32_Addr uphdr_vaddr32; err = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32, &voffset, exec_file, &interp, &env.ex_bssbase, - &env.ex_brkbase, &env.ex_brksize, NULL); + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); Ehdr32to64(&ehdr32, &ehdr); if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -744,6 +812,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, #endif /* _LP64 */ if (err != 0) { restoreexecenv(&origenv, &orig_sigaltstack); + + if (interp != NULL) + kmem_free(interp, MAXPATHLEN); + return (err); } @@ -761,7 +833,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, sedp->sed_phent = ehdr.e_phentsize; sedp->sed_phnum = ehdr.e_phnum; - if (interp) { + if (interp != NULL) { if (ehdr.e_type == ET_DYN) { /* * This is a shared object executable, so we @@ -777,16 +849,20 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, * it in and store relevant information about it in the * aux vector, where the brand library can find it. 
*/ - if ((err = lookupname(linker, UIO_SYSSPACE, + if ((err = lookupname(interp, UIO_SYSSPACE, FOLLOW, NULLVPP, &nvp)) != 0) { - uprintf("%s: not found.", brandlinker); + uprintf("%s: not found.", interp); restoreexecenv(&origenv, &orig_sigaltstack); + kmem_free(interp, MAXPATHLEN); return (err); } + + kmem_free(interp, MAXPATHLEN); + if (args->to_model == DATAMODEL_NATIVE) { err = mapexec_brand(nvp, args, &ehdr, &uphdr_vaddr, &voffset, exec_file, &interp, - NULL, NULL, NULL, &lddata); + NULL, NULL, NULL, &lddata, NULL); } #if defined(_LP64) else { @@ -794,7 +870,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, Elf32_Addr uphdr_vaddr32; err = mapexec32_brand(nvp, args, &ehdr32, &uphdr_vaddr32, &voffset, exec_file, &interp, - NULL, NULL, NULL, &lddata); + NULL, NULL, NULL, &lddata, NULL); Ehdr32to64(&ehdr32, &ehdr); if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -934,9 +1010,9 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, /* * Third, the /proc aux vectors set up by elfexec() point to - * brand emulation library and it's linker. Copy these to the + * brand emulation library and its linker. Copy these to the * /proc brand specific aux vector, and update the regular - * /proc aux vectors to point to the executable (and it's + * /proc aux vectors to point to the executable (and its * linker). This will enable debuggers to access the * executable via the usual /proc or elf notes aux vectors. * @@ -1078,55 +1154,31 @@ brand_solaris_freelwp(klwp_t *l, struct brand *pbrand) } /*ARGSUSED*/ -int +void brand_solaris_initlwp(klwp_t *l, struct brand *pbrand) { ASSERT(l->lwp_procp->p_brand == pbrand); ASSERT(l->lwp_procp->p_brand_data != NULL); ASSERT(l->lwp_brand == NULL); l->lwp_brand = (void *)-1; - return (0); } /*ARGSUSED*/ void brand_solaris_lwpexit(klwp_t *l, struct brand *pbrand) { - proc_t *p = l->lwp_procp; - ASSERT(l->lwp_procp->p_brand == pbrand); ASSERT(l->lwp_procp->p_brand_data != NULL); ASSERT(l->lwp_brand != NULL); - - /* - * We should never be called for the last thread in a process. - * (That case is handled by brand_solaris_proc_exit().) - * Therefore this lwp must be exiting from a multi-threaded - * process. - */ - ASSERT(p->p_tlist != p->p_tlist->t_forw); - - l->lwp_brand = NULL; } /*ARGSUSED*/ void -brand_solaris_proc_exit(struct proc *p, klwp_t *l, struct brand *pbrand) +brand_solaris_proc_exit(struct proc *p, struct brand *pbrand) { ASSERT(p->p_brand == pbrand); ASSERT(p->p_brand_data != NULL); - /* - * When called from proc_exit(), we know that process is - * single-threaded and free our lwp brand data. - * otherwise just free p_brand_data and return. - */ - if (l != NULL) { - ASSERT(p->p_tlist == p->p_tlist->t_forw); - ASSERT(p->p_tlist->t_lwp == l); - (void) brand_solaris_freelwp(l, pbrand); - } - /* upon exit, free our proc brand data */ kmem_free(p->p_brand_data, sizeof (brand_proc_data_t)); p->p_brand_data = NULL; @@ -1145,5 +1197,4 @@ brand_solaris_setbrand(proc_t *p, struct brand *pbrand) ASSERT(p->p_tlist == p->p_tlist->t_forw); p->p_brand_data = kmem_zalloc(sizeof (brand_proc_data_t), KM_SLEEP); - (void) brand_solaris_initlwp(p->p_tlist->t_lwp, pbrand); } diff --git a/usr/src/uts/common/os/clock_highres.c b/usr/src/uts/common/os/clock_highres.c index 805813037d..1280c8a1b6 100644 --- a/usr/src/uts/common/os/clock_highres.c +++ b/usr/src/uts/common/os/clock_highres.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2015, Joyent Inc. All rights reserved. + * Copyright 2016, Joyent Inc. 
*/ #include <sys/timer.h> @@ -41,6 +41,9 @@ static clock_backend_t clock_highres; +/* minimum non-privileged interval (200us) */ +long clock_highres_interval_min = 200000; + /*ARGSUSED*/ static int clock_highres_settime(timespec_t *ts) @@ -68,17 +71,6 @@ clock_highres_getres(timespec_t *ts) static int clock_highres_timer_create(itimer_t *it, void (*fire)(itimer_t *)) { - /* - * CLOCK_HIGHRES timers of sufficiently high resolution can deny - * service; only allow privileged users to create such timers. - * Sites that do not wish to have this restriction should - * give users the "proc_clock_highres" privilege. - */ - if (secpolicy_clock_highres(CRED()) != 0) { - it->it_arg = NULL; - return (EPERM); - } - it->it_arg = kmem_zalloc(sizeof (cyclic_id_t), KM_SLEEP); it->it_fire = fire; @@ -111,6 +103,49 @@ clock_highres_timer_settime(itimer_t *it, int flags, cpu_t *cpu; cpupart_t *cpupart; int pset; + boolean_t value_need_clamp = B_FALSE; + boolean_t intval_need_clamp = B_FALSE; + cred_t *cr = CRED(); + struct itimerspec clamped; + + /* + * CLOCK_HIGHRES timers of sufficiently high resolution can deny + * service; only allow privileged users to create such timers. + * Non-privileged users (those without the "proc_clock_highres" + * privilege) can create timers with lower resolution but if they + * attempt to use a very low time value (< 200us) then their + * timer will be clamped at 200us. + */ + if (when->it_value.tv_sec == 0 && + when->it_value.tv_nsec > 0 && + when->it_value.tv_nsec < clock_highres_interval_min) + value_need_clamp = B_TRUE; + + if (when->it_interval.tv_sec == 0 && + when->it_interval.tv_nsec > 0 && + when->it_interval.tv_nsec < clock_highres_interval_min) + intval_need_clamp = B_TRUE; + + if ((value_need_clamp || intval_need_clamp) && + secpolicy_clock_highres(cr) != 0) { + clamped.it_value.tv_sec = when->it_value.tv_sec; + clamped.it_interval.tv_sec = when->it_interval.tv_sec; + + if (value_need_clamp) { + clamped.it_value.tv_nsec = clock_highres_interval_min; + } else { + clamped.it_value.tv_nsec = when->it_value.tv_nsec; + } + + if (intval_need_clamp) { + clamped.it_interval.tv_nsec = + clock_highres_interval_min; + } else { + clamped.it_interval.tv_nsec = when->it_interval.tv_nsec; + } + + when = &clamped; + } cyctime.cyt_when = ts2hrt(&when->it_value); cyctime.cyt_interval = ts2hrt(&when->it_interval); diff --git a/usr/src/uts/common/os/contract.c b/usr/src/uts/common/os/contract.c index 909a6c2860..1a3502a710 100644 --- a/usr/src/uts/common/os/contract.c +++ b/usr/src/uts/common/os/contract.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ /* * Copyright (c) 2017 by Delphix. All rights reserved. @@ -290,7 +291,10 @@ contract_ctor(contract_t *ct, ct_type_t *type, ct_template_t *tmpl, void *data, avl_index_t where; klwp_t *curlwp = ttolwp(curthread); - ASSERT(author == curproc); + /* + * It's possible that author is not curproc if the zone is creating + * a new process as a child of zsched. + */ mutex_init(&ct->ct_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ct->ct_reflock, NULL, MUTEX_DEFAULT, NULL); diff --git a/usr/src/uts/common/os/core.c b/usr/src/uts/common/os/core.c index d5e272c16a..437f26e6e0 100644 --- a/usr/src/uts/common/os/core.c +++ b/usr/src/uts/common/os/core.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent Inc. All rights reserved. + * Copyright 2016, Joyent Inc. 
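The clock_highres change above moves the privilege check from timer creation to timer programming: unprivileged callers may now create CLOCK_HIGHRES timers, but initial values and intervals below clock_highres_interval_min (200us) are quietly clamped unless the caller holds the proc_clock_highres privilege. A small userland sketch to observe this (CLOCK_HIGHRES is illumos-specific; compile with -lrt if needed; fire counts are approximate):

#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static volatile sig_atomic_t fires;

static void
onsig(int sig)
{
	fires++;
}

int
main(void)
{
	timer_t tid;
	struct sigevent ev = { 0 };
	struct itimerspec its = { 0 };
	time_t end;

	(void) signal(SIGUSR1, onsig);
	ev.sigev_notify = SIGEV_SIGNAL;
	ev.sigev_signo = SIGUSR1;

	/* Before this change, unprivileged creation failed with EPERM. */
	if (timer_create(CLOCK_HIGHRES, &ev, &tid) != 0) {
		perror("timer_create");
		return (1);
	}

	/* Ask for 50us; without proc_clock_highres, expect a 200us clamp. */
	its.it_value.tv_nsec = 50000;
	its.it_interval.tv_nsec = 50000;
	if (timer_settime(tid, 0, &its, NULL) != 0) {
		perror("timer_settime");
		return (1);
	}

	/* Count fires for roughly one second. */
	for (end = time(NULL) + 1; time(NULL) <= end; )
		(void) pause();

	/* Roughly 5000 when clamped; roughly 20000 with the privilege. */
	(void) printf("%d fires\n", (int)fires);
	return (0);
}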
* Copyright (c) 2016 by Delphix. All rights reserved. */ @@ -125,6 +125,7 @@ remove_core_file(char *fp, enum core_types core_type) /* * Determine what rootvp to use. */ + mutex_enter(&curproc->p_lock); if (core_type == CORE_PROC) { rootvp = (PTOU(curproc)->u_rdir == NULL ? curproc->p_zone->zone_rootvp : PTOU(curproc)->u_rdir); @@ -140,6 +141,7 @@ remove_core_file(char *fp, enum core_types core_type) VN_HOLD(startvp); if (rootvp != rootdir) VN_HOLD(rootvp); + mutex_exit(&curproc->p_lock); if ((error = lookuppnvp(&pn, NULL, NO_FOLLOW, &dvp, &vp, rootvp, startvp, CRED())) != 0) { pn_free(&pn); diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c index 87c0896814..9c7600ea24 100644 --- a/usr/src/uts/common/os/cpu.c +++ b/usr/src/uts/common/os/cpu.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2017 Joyent, Inc. */ /* @@ -108,7 +109,8 @@ kmutex_t cpu_lock; cpu_t *cpu_list; /* list of all CPUs */ cpu_t *clock_cpu_list; /* used by clock to walk CPUs */ cpu_t *cpu_active; /* list of active CPUs */ -static cpuset_t cpu_available; /* set of available CPUs */ +cpuset_t cpu_active_set; /* cached set of active CPUs */ +cpuset_t cpu_available; /* set of available CPUs */ cpuset_t cpu_seqid_inuse; /* which cpu_seqids are in use */ cpu_t **cpu_seq; /* ptrs to CPUs, indexed by seq_id */ @@ -1724,6 +1726,7 @@ cpu_list_init(cpu_t *cp) cp->cpu_part = &cp_default; CPUSET_ADD(cpu_available, cp->cpu_id); + CPUSET_ADD(cpu_active_set, cp->cpu_id); } /* @@ -1895,6 +1898,7 @@ cpu_add_active_internal(cpu_t *cp) cp->cpu_prev_onln = cpu_active->cpu_prev_onln; cpu_active->cpu_prev_onln->cpu_next_onln = cp; cpu_active->cpu_prev_onln = cp; + CPUSET_ADD(cpu_active_set, cp->cpu_id); if (pp->cp_cpulist) { cp->cpu_next_part = pp->cp_cpulist; @@ -1965,6 +1969,7 @@ cpu_remove_active(cpu_t *cp) } cp->cpu_next_onln = cp; cp->cpu_prev_onln = cp; + CPUSET_DEL(cpu_active_set, cp->cpu_id); cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part; cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part; @@ -2704,13 +2709,18 @@ cpu_bind_thread(kthread_id_t tp, processorid_t bind, processorid_t *obind, return (0); } -#if CPUSET_WORDS > 1 -/* - * Functions for implementing cpuset operations when a cpuset is more - * than one word. On platforms where a cpuset is a single word these - * are implemented as macros in cpuvar.h. 
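The comment being deleted here described the old arrangement: real functions only when CPUSET_WORDS > 1, macros in cpuvar.h otherwise. Under the new scheme the cpuset operations introduced just below are first-class functions in all configurations, with added bounds VERIFYs plus new allocation, atomic, and bulk operations. A hedged kernel-context sketch of the resulting API surface:

/*
 * Hedged sketch (kernel context assumed): cpuset_t can now be treated
 * as an opaque, heap-allocatable type.
 */
static void
example_cpuset(void)
{
	cpuset_t *set = cpuset_alloc(KM_SLEEP);

	cpuset_zero(set);
	cpuset_add(set, 0);		/* VERIFYs cpu < NCPU */
	cpuset_or(set, &cpu_active_set); /* union with the new cached set */

	if (cpu_in_set(set, 1)) {
		/* CPU 1 was active (or explicitly added) */
	}

	if (cpuset_atomic_xadd(set, 2) == 0) {
		/* the bit was clear before; we were the one to add CPU 2 */
	}

	cpuset_free(set);
}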
- */ +cpuset_t * +cpuset_alloc(int kmflags) +{ + return (kmem_alloc(sizeof (cpuset_t), kmflags)); +} + +void +cpuset_free(cpuset_t *s) +{ + kmem_free(s, sizeof (cpuset_t)); +} void cpuset_all(cpuset_t *s) @@ -2722,38 +2732,61 @@ cpuset_all(cpuset_t *s) } void -cpuset_all_but(cpuset_t *s, uint_t cpu) +cpuset_all_but(cpuset_t *s, const uint_t cpu) { cpuset_all(s); CPUSET_DEL(*s, cpu); } void -cpuset_only(cpuset_t *s, uint_t cpu) +cpuset_only(cpuset_t *s, const uint_t cpu) { CPUSET_ZERO(*s); CPUSET_ADD(*s, cpu); } +long +cpu_in_set(cpuset_t *s, const uint_t cpu) +{ + VERIFY(cpu < NCPU); + return (BT_TEST(s->cpub, cpu)); +} + +void +cpuset_add(cpuset_t *s, const uint_t cpu) +{ + VERIFY(cpu < NCPU); + BT_SET(s->cpub, cpu); +} + +void +cpuset_del(cpuset_t *s, const uint_t cpu) +{ + VERIFY(cpu < NCPU); + BT_CLEAR(s->cpub, cpu); +} + int cpuset_isnull(cpuset_t *s) { int i; - for (i = 0; i < CPUSET_WORDS; i++) + for (i = 0; i < CPUSET_WORDS; i++) { if (s->cpub[i] != 0) return (0); + } return (1); } int -cpuset_cmp(cpuset_t *s1, cpuset_t *s2) +cpuset_isequal(cpuset_t *s1, cpuset_t *s2) { int i; - for (i = 0; i < CPUSET_WORDS; i++) + for (i = 0; i < CPUSET_WORDS; i++) { if (s1->cpub[i] != s2->cpub[i]) return (0); + } return (1); } @@ -2822,7 +2855,72 @@ cpuset_bounds(cpuset_t *s, uint_t *smallestid, uint_t *largestid) *smallestid = *largestid = CPUSET_NOTINSET; } -#endif /* CPUSET_WORDS */ +void +cpuset_atomic_del(cpuset_t *s, const uint_t cpu) +{ + VERIFY(cpu < NCPU); + BT_ATOMIC_CLEAR(s->cpub, (cpu)) +} + +void +cpuset_atomic_add(cpuset_t *s, const uint_t cpu) +{ + VERIFY(cpu < NCPU); + BT_ATOMIC_SET(s->cpub, (cpu)) +} + +long +cpuset_atomic_xadd(cpuset_t *s, const uint_t cpu) +{ + long res; + + VERIFY(cpu < NCPU); + BT_ATOMIC_SET_EXCL(s->cpub, cpu, res); + return (res); +} + +long +cpuset_atomic_xdel(cpuset_t *s, const uint_t cpu) +{ + long res; + + VERIFY(cpu < NCPU); + BT_ATOMIC_CLEAR_EXCL(s->cpub, cpu, res); + return (res); +} + +void +cpuset_or(cpuset_t *dst, cpuset_t *src) +{ + for (int i = 0; i < CPUSET_WORDS; i++) { + dst->cpub[i] |= src->cpub[i]; + } +} + +void +cpuset_xor(cpuset_t *dst, cpuset_t *src) +{ + for (int i = 0; i < CPUSET_WORDS; i++) { + dst->cpub[i] ^= src->cpub[i]; + } +} + +void +cpuset_and(cpuset_t *dst, cpuset_t *src) +{ + for (int i = 0; i < CPUSET_WORDS; i++) { + dst->cpub[i] &= src->cpub[i]; + } +} + +void +cpuset_zero(cpuset_t *dst) +{ + for (int i = 0; i < CPUSET_WORDS; i++) { + dst->cpub[i] = 0; + } +} + /* * Unbind threads bound to specified CPU. diff --git a/usr/src/uts/common/os/cred.c b/usr/src/uts/common/os/cred.c index 25727d54c5..0bd6cfd44f 100644 --- a/usr/src/uts/common/os/cred.c +++ b/usr/src/uts/common/os/cred.c @@ -729,6 +729,14 @@ crgetzoneid(const cred_t *cr) cr->cr_zone->zone_id); } +zoneid_t +crgetzonedid(const cred_t *cr) +{ + return (cr->cr_zone == NULL ? + (cr->cr_uid == -1 ? 
(zoneid_t)-1 : GLOBAL_ZONEID) : + cr->cr_zone->zone_did); +} + projid_t crgetprojid(const cred_t *cr) { diff --git a/usr/src/uts/common/os/ddi_intr_irm.c b/usr/src/uts/common/os/ddi_intr_irm.c index c3c0481e7f..a4b35dcb5b 100644 --- a/usr/src/uts/common/os/ddi_intr_irm.c +++ b/usr/src/uts/common/os/ddi_intr_irm.c @@ -1320,7 +1320,7 @@ i_ddi_irm_notify(ddi_irm_pool_t *pool_p, ddi_irm_req_t *req_p) /* Log callback errors */ if (ret != DDI_SUCCESS) { - cmn_err(CE_WARN, "%s%d: failed callback (action=%d, ret=%d)\n", + cmn_err(CE_WARN, "!%s%d: failed callback (action=%d, ret=%d)\n", ddi_driver_name(req_p->ireq_dip), ddi_get_instance(req_p->ireq_dip), (int)action, ret); } diff --git a/usr/src/uts/common/os/exec.c b/usr/src/uts/common/os/exec.c index 0065b4945b..2ab4d1f023 100644 --- a/usr/src/uts/common/os/exec.c +++ b/usr/src/uts/common/os/exec.c @@ -98,6 +98,7 @@ uint_t auxv_hwcap32_2 = 0; /* 32-bit version of auxv_hwcap2 */ #endif #define PSUIDFLAGS (SNOCD|SUGID) +#define RANDOM_LEN 16 /* 16 bytes for AT_RANDOM aux entry */ /* * These are consumed within the specific exec modules, but are defined here @@ -256,8 +257,10 @@ exec_common(const char *fname, const char **argp, const char **envp, * only if the pathname does not contain a "/" the resolved path * points to a file in the current working (attribute) directory. */ - if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 && + mutex_enter(&p->p_lock); + if ((PTOU(p)->u_cdir->v_flag & V_XATTRDIR) != 0 && strchr(resolvepn.pn_path, '/') == NULL) { + mutex_exit(&p->p_lock); if (dir != NULL) VN_RELE(dir); error = EACCES; @@ -266,6 +269,7 @@ exec_common(const char *fname, const char **argp, const char **envp, VN_RELE(vp); goto out; } + mutex_exit(&p->p_lock); bzero(exec_file, MAXCOMLEN+1); (void) strncpy(exec_file, pn.pn_path, MAXCOMLEN); @@ -313,14 +317,43 @@ exec_common(const char *fname, const char **argp, const char **envp, ua.argp = argp; ua.envp = envp; - /* If necessary, brand this process before we start the exec. */ - if (brandme) - brand_setbrand(p); + /* If necessary, brand this process/lwp before we start the exec. */ + if (brandme) { + void *brand_data = NULL; + + /* + * Process branding may fail if multiple LWPs are present and + * holdlwps() cannot complete successfully. + */ + error = brand_setbrand(p, B_TRUE); + + if (error == 0 && BROP(p)->b_lwpdata_alloc != NULL) { + brand_data = BROP(p)->b_lwpdata_alloc(p); + if (brand_data == NULL) { + error = 1; + } + } + + if (error == 0) { + mutex_enter(&p->p_lock); + BROP(p)->b_initlwp(lwp, brand_data); + mutex_exit(&p->p_lock); + } else { + VN_RELE(vp); + if (dir != NULL) { + VN_RELE(dir); + } + pn_free(&resolvepn); + goto fail; + } + } if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz, - exec_file, p->p_cred, brand_action)) != 0) { - if (brandme) - brand_clearbrand(p, B_FALSE); + exec_file, p->p_cred, &brand_action)) != 0) { + if (brandme) { + BROP(p)->b_freelwp(lwp); + brand_clearbrand(p, B_TRUE); + } VN_RELE(vp); if (dir != NULL) VN_RELE(dir); @@ -352,7 +385,7 @@ exec_common(const char *fname, const char **argp, const char **envp, /* * Clear contract template state */ - lwp_ctmpl_clear(lwp); + lwp_ctmpl_clear(lwp, B_TRUE); /* * Save the directory in which we found the executable for expanding @@ -376,6 +409,8 @@ exec_common(const char *fname, const char **argp, const char **envp, * pending held signals remain held, so don't clear t_hold. 
*/ mutex_enter(&p->p_lock); + DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp, + uintptr_t, lwp->lwp_oldcontext, uintptr_t, 0); lwp->lwp_oldcontext = 0; lwp->lwp_ustack = 0; lwp->lwp_old_stk_ctl = 0; @@ -435,8 +470,10 @@ exec_common(const char *fname, const char **argp, const char **envp, TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up); /* Unbrand ourself if necessary. */ - if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) + if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) { + BROP(p)->b_freelwp(lwp); brand_clearbrand(p, B_FALSE); + } setregs(&args); @@ -560,7 +597,7 @@ gexec( long *execsz, caddr_t exec_file, struct cred *cred, - int brand_action) + int *brand_action) { struct vnode *vp, *execvp = NULL; proc_t *pp = ttoproc(curthread); @@ -881,8 +918,14 @@ gexec( if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE)) args->traceinval = 1; } - if (pp->p_proc_flag & P_PR_PTRACE) + + /* + * If legacy ptrace is enabled, generate the SIGTRAP. + */ + if (pp->p_proc_flag & P_PR_PTRACE) { psignal(pp, SIGTRAP); + } + if (args->traceinval) prinvalidate(&pp->p_user); } @@ -1546,6 +1589,27 @@ stk_add(uarg_t *args, const char *sp, enum uio_seg segflg) return (0); } +/* + * Add a fixed size byte array to the stack (only from kernel space). + */ +static int +stk_byte_add(uarg_t *args, const uint8_t *sp, size_t len) +{ + int error; + + if (STK_AVAIL(args) < sizeof (int)) + return (E2BIG); + *--args->stk_offp = args->stk_strp - args->stk_base; + + if (len > STK_AVAIL(args)) + return (E2BIG); + bcopy(sp, args->stk_strp, len); + + args->stk_strp += len; + + return (0); +} + static int stk_getptr(uarg_t *args, char *src, char **dst) { @@ -1582,6 +1646,7 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) size_t size, pad; char *argv = (char *)uap->argp; char *envp = (char *)uap->envp; + uint8_t rdata[RANDOM_LEN]; /* * Copy interpreter's name and argument to argv[0] and argv[1]. @@ -1664,8 +1729,9 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) args->ne = args->na - argc; /* - * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and - * AT_SUN_EMULATOR strings to the stack. + * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, + * AT_SUN_BRAND_NROOT, and AT_SUN_EMULATOR strings, as well as AT_RANDOM + * array, to the stack. */ if (auxvpp != NULL && *auxvpp != NULL) { if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0) @@ -1678,6 +1744,20 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) if (args->emulator != NULL && (error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0) return (error); + + /* + * For the AT_RANDOM aux vector we provide 16 bytes of random + * data. + */ + (void) random_get_pseudo_bytes(rdata, sizeof (rdata)); + + if ((error = stk_byte_add(args, rdata, sizeof (rdata))) != 0) + return (error); + + if (args->brand_nroot != NULL && + (error = stk_add(args, args->brand_nroot, + UIO_SYSSPACE)) != 0) + return (error); } /* @@ -1784,7 +1864,7 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) /* * Fill in the aux vector now that we know the user stack addresses * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and - * AT_SUN_EMULATOR strings. + * AT_SUN_EMULATOR strings, as well as the AT_RANDOM array. 
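The new AT_RANDOM entry carries a pointer to 16 bytes (RANDOM_LEN) of kernel-supplied data from random_get_pseudo_bytes(), placed among the stack strings by stk_byte_add() above. A userland sketch that walks the aux vector to find it; it assumes AT_RANDOM is exposed through <sys/auxv.h> as part of this change:

#include <stdio.h>
#include <sys/auxv.h>

extern char **environ;

int
main(void)
{
	char **p = environ;
	auxv_t *av;
	int i;

	while (*p != NULL)		/* skip past the environment */
		p++;

	/* the aux vector begins just after envp's NULL terminator */
	for (av = (auxv_t *)(p + 1); av->a_type != AT_NULL; av++) {
		if (av->a_type == AT_RANDOM) {
			unsigned char *r = av->a_un.a_ptr;

			for (i = 0; i < 16; i++)
				(void) printf("%02x", r[i]);
			(void) printf("\n");
			return (0);
		}
	}

	(void) printf("no AT_RANDOM entry\n");
	return (1);
}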
*/ if (auxvpp != NULL && *auxvpp != NULL) { if (args->to_model == DATAMODEL_NATIVE) { @@ -1797,6 +1877,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) if (args->emulator != NULL) ADDAUX(*a, AT_SUN_EMULATOR, (long)&ustrp[*--offp]) + ADDAUX(*a, AT_RANDOM, (long)&ustrp[*--offp]) + if (args->brand_nroot != NULL) { + ADDAUX(*a, + AT_SUN_BRAND_NROOT, (long)&ustrp[*--offp]) + } } else { auxv32_t **a = (auxv32_t **)auxvpp; ADDAUX(*a, @@ -1809,6 +1894,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) if (args->emulator != NULL) ADDAUX(*a, AT_SUN_EMULATOR, (int)(uintptr_t)&ustrp[*--offp]) + ADDAUX(*a, AT_RANDOM, (int)(uintptr_t)&ustrp[*--offp]) + if (args->brand_nroot != NULL) { + ADDAUX(*a, AT_SUN_BRAND_NROOT, + (int)(uintptr_t)&ustrp[*--offp]) + } } } @@ -1935,6 +2025,9 @@ exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) usrstack = (char *)USRSTACK32; } + if (args->maxstack != 0 && (uintptr_t)usrstack > args->maxstack) + usrstack = (char *)args->maxstack; + ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0); #if defined(__sparc) diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c index 1b9359da47..3edddcf61f 100644 --- a/usr/src/uts/common/os/exit.c +++ b/usr/src/uts/common/os/exit.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright 2017 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -230,7 +230,7 @@ restart_init(int what, int why) siginfofree(lwp->lwp_curinfo); lwp->lwp_curinfo = NULL; } - lwp_ctmpl_clear(lwp); + lwp_ctmpl_clear(lwp, B_FALSE); /* * Reset both the process root directory and the current working @@ -366,19 +366,6 @@ proc_exit(int why, int what) } mutex_exit(&p->p_lock); - DTRACE_PROC(lwp__exit); - DTRACE_PROC1(exit, int, why); - - /* - * Will perform any brand specific proc exit processing, since this - * is always the last lwp, will also perform lwp_exit and free brand - * data - */ - if (PROC_IS_BRANDED(p)) { - lwp_detach_brand_hdlrs(lwp); - brand_clearbrand(p, B_FALSE); - } - /* * Don't let init exit unless zone_start_init() failed its exec, or * we are shutting down the zone or the machine. @@ -390,12 +377,35 @@ proc_exit(int why, int what) if (z->zone_boot_err == 0 && zone_status_get(z) < ZONE_IS_SHUTTING_DOWN && zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN) { - if (z->zone_restart_init == B_TRUE) { - if (restart_init(what, why) == 0) - return (0); + + /* + * If the init process should be restarted, the + * "zone_restart_init" member will be set. Some init + * programs in branded zones do not tolerate a restart + * in the traditional manner; setting the + * "zone_reboot_on_init_exit" member will cause the + * entire zone to be rebooted instead. If neither of + * these flags is set the zone will shut down. + */ + if (z->zone_reboot_on_init_exit == B_TRUE && + z->zone_restart_init == B_TRUE) { + /* + * Trigger a zone reboot and continue + * with exit processing. 
+			 */
+			z->zone_init_status = wstat(why, what);
+			(void) zone_kadmin(A_REBOOT, 0, NULL,
+			    zone_kcred());
+		} else {
+			if (z->zone_restart_init == B_TRUE) {
+				if (restart_init(what, why) == 0)
+					return (0);
+			}
+
+			z->zone_init_status = wstat(why, what);
 			(void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL,
-			    CRED());
+			    zone_kcred());
 		}
 	}

@@ -407,6 +417,32 @@ proc_exit(int why, int what)
 		z->zone_proc_initpid = -1;
 	}

+	/*
+	 * Delay firing probes (and performing brand cleanup) until after the
+	 * zone_proc_initpid check.  Cases which result in zone shutdown or
+	 * restart via zone_kadmin eventually result in a call back to
+	 * proc_exit.
+	 */
+	DTRACE_PROC(lwp__exit);
+	DTRACE_PROC1(exit, int, why);
+
+	/*
+	 * This will perform any brand-specific proc exit processing.  Since
+	 * this is always the last lwp, it will also perform lwp exit/free
+	 * and proc exit.  Brand data will be freed when the process is
+	 * reaped.
+	 */
+	if (PROC_IS_BRANDED(p)) {
+		BROP(p)->b_lwpexit(lwp);
+		BROP(p)->b_proc_exit(p);
+		/*
+		 * To ensure that b_proc_exit has access to brand-specific
+		 * data contained by the one remaining lwp, call the freelwp
+		 * hook as the last part of this clean-up process.
+		 */
+		BROP(p)->b_freelwp(lwp);
+		lwp_detach_brand_hdlrs(lwp);
+	}
+
 	lwp_pcb_exit();

 	/*
@@ -565,7 +601,7 @@ proc_exit(int why, int what)

 	semexit(p);
 	rv = wstat(why, what);
-	acct(rv & 0xff);
+	acct(rv);
 	exacct_commit_proc(p, rv);

 	/*
@@ -658,10 +694,22 @@ proc_exit(int why, int what)
 	if ((q = p->p_child) != NULL && p != proc_init) {
 		struct proc	*np;
 		struct proc	*initp = proc_init;
+		pid_t		zone_initpid = 1;
+		struct proc	*zoneinitp = NULL;
 		boolean_t	setzonetop = B_FALSE;

-		if (!INGLOBALZONE(curproc))
-			setzonetop = B_TRUE;
+		if (!INGLOBALZONE(curproc)) {
+			zone_initpid = curproc->p_zone->zone_proc_initpid;
+
+			ASSERT(MUTEX_HELD(&pidlock));
+			zoneinitp = prfind(zone_initpid);
+			if (zoneinitp != NULL) {
+				initp = zoneinitp;
+			} else {
+				zone_initpid = 1;
+				setzonetop = B_TRUE;
+			}
+		}

 		pgdetach(p);

@@ -673,7 +721,8 @@ proc_exit(int why, int what)
 			 */
 			delete_ns(q->p_parent, q);

-			q->p_ppid = 1;
+			q->p_ppid = zone_initpid;
+			q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID);
 			if (setzonetop) {
 				mutex_enter(&q->p_lock);
@@ -847,8 +896,50 @@ proc_exit(int why, int what)
 	mutex_exit(&p->p_lock);

 	if (!evaporate) {
-		p->p_pidflag &= ~CLDPEND;
-		sigcld(p, sqp);
+		/*
+		 * The brand specific code only happens when the brand has a
+		 * function to call in place of sigcld and the parent of the
+		 * exiting process is not the global zone init.  If the parent
+		 * is the global zone init, then the process was reparented,
+		 * and we don't want brand code delivering possibly strange
+		 * signals to init.  Also, init is not branded, so any brand
+		 * specific exit data will not be picked up by init anyway.
+		 */
+		if (PROC_IS_BRANDED(p) &&
+		    BROP(p)->b_exit_with_sig != NULL &&
+		    p->p_ppid != 1) {
+			/*
+			 * The _fini code that could unload the brand_t
+			 * blocks until the count of zones using the module
+			 * reaches zero.  Zones decrement the refcount on
+			 * their brands only after all user tasks in that
+			 * zone have exited and been waited on.  The decrement
+			 * on the brand's refcount happens in zone_destroy().
+			 * That depends on zone_shutdown() having been
+			 * completed.  zone_shutdown() includes a call to
+			 * zone_empty(), where the zone waits for itself to
+			 * reach the state ZONE_IS_EMPTY.  This state is only
+			 * set in either zone_shutdown(), when there are no
+			 * user processes as the zone enters this function,
+			 * or in zone_task_rele().
zone_task_rele() is called from + * code triggered by waiting on processes, not by the + * processes exiting through proc_exit(). This means + * all the branded processes that could exist for a + * specific brand_t must exit and get reaped before the + * refcount on the brand_t can reach 0. _fini will + * never unload the corresponding brand module before + * proc_exit finishes execution for all processes + * branded with a particular brand_t, which makes the + * operation below safe to do. Brands that wish to use + * this mechanism must wait in _fini as described + * above. + */ + BROP(p)->b_exit_with_sig(p, sqp); + } else { + p->p_pidflag &= ~CLDPEND; + sigcld(p, sqp); + } + } else { /* * Do what sigcld() would do if the disposition @@ -927,10 +1018,9 @@ winfo(proc_t *pp, k_siginfo_t *ip, int waitflag) int waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) { - int found; proc_t *cp, *pp; - int proc_gone; int waitflag = !(options & WNOWAIT); + boolean_t have_brand_helper = B_FALSE; /* * Obsolete flag, defined here only for binary compatibility @@ -958,7 +1048,8 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) pp = ttoproc(curthread); /* - * lock parent mutex so that sibling chain can be searched. + * Anytime you are looking for a process, you take pidlock to prevent + * things from changing as you look. */ mutex_enter(&pidlock); @@ -978,10 +1069,37 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) return (ECHILD); } - while (pp->p_child != NULL) { + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_waitid_helper != NULL) { + have_brand_helper = B_TRUE; + } + + while (pp->p_child != NULL || have_brand_helper) { + boolean_t brand_wants_wait = B_FALSE; + int proc_gone = 0; + int found = 0; - proc_gone = 0; + /* + * Give the brand a chance to return synthetic results from + * this waitid() call before we do the real thing. + */ + if (have_brand_helper) { + int ret; + if (BROP(pp)->b_waitid_helper(idtype, id, ip, options, + &brand_wants_wait, &ret) == 0) { + mutex_exit(&pidlock); + return (ret); + } + + if (pp->p_child == NULL) { + goto no_real_children; + } + } + + /* + * Look for interesting children in the newstate list. + */ + VERIFY(pp->p_child != NULL); for (cp = pp->p_child_ns; cp != NULL; cp = cp->p_sibling_ns) { if (idtype != P_PID && (cp->p_pidflag & CLDWAITPID)) continue; @@ -989,6 +1107,11 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) continue; if (idtype == P_PGID && id != cp->p_pgrp) continue; + if (PROC_IS_BRANDED(pp)) { + if (BROP(pp)->b_wait_filter != NULL && + BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) + continue; + } switch (cp->p_wcode) { @@ -1033,12 +1156,16 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) * Wow! None of the threads on the p_sibling_ns list were * interesting threads. Check all the kids! */ - found = 0; for (cp = pp->p_child; cp != NULL; cp = cp->p_sibling) { if (idtype == P_PID && id != cp->p_pid) continue; if (idtype == P_PGID && id != cp->p_pgrp) continue; + if (PROC_IS_BRANDED(pp)) { + if (BROP(pp)->b_wait_filter != NULL && + BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) + continue; + } switch (cp->p_wcode) { case CLD_TRAPPED: @@ -1107,11 +1234,12 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) break; } +no_real_children: /* * If we found no interesting processes at all, * break out and return ECHILD. 
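The waitid() rework gives a brand two interposition points: b_wait_filter to hide children from a particular wait, and b_waitid_helper to synthesize a result (or keep the parent blocking) before the real child lists are scanned. A hedged sketch of the helper contract as implied by the call site above; the body and name are hypothetical:

/*
 * Hypothetical helper: return 0 with *ret filled in to answer the
 * waitid() immediately; return non-zero to fall through to the real
 * child scan, setting *brand_wants_wait if the caller should keep
 * blocking even when no native children qualify.
 */
static int
mybrand_waitid_helper(idtype_t idtype, id_t id, k_siginfo_t *ip,
    int options, boolean_t *brand_wants_wait, int *ret)
{
	boolean_t have_synthetic_child = B_FALSE;	/* brand-specific */

	if (have_synthetic_child) {
		bzero(ip, sizeof (*ip));
		/* ... fill in the synthetic siginfo ... */
		*ret = 0;
		return (0);	/* waitid() returns *ret immediately */
	}

	*brand_wants_wait = B_FALSE;
	return (-1);		/* proceed with the real child lists */
}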
*/ - if (found + proc_gone == 0) + if (!brand_wants_wait && (found + proc_gone == 0)) break; if (options & WNOHANG) { @@ -1130,7 +1258,7 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) * change state while we wait, we don't wait at all. * Get out with ECHILD according to SVID. */ - if (found == proc_gone) + if (!brand_wants_wait && (found == proc_gone)) break; if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) { @@ -1226,6 +1354,12 @@ freeproc(proc_t *p) p->p_killsqp = NULL; } + /* Clear any remaining brand data */ + if (PROC_IS_BRANDED(p)) { + brand_clearbrand(p, B_FALSE); + } + + prfree(p); /* inform /proc */ /* diff --git a/usr/src/uts/common/os/fio.c b/usr/src/uts/common/os/fio.c index 76eddd4e50..41e7e63d2b 100644 --- a/usr/src/uts/common/os/fio.c +++ b/usr/src/uts/common/os/fio.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015, Joyent Inc. + * Copyright 2017, Joyent Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -386,6 +386,7 @@ flist_grow(int maxfd) dst->uf_flag = src->uf_flag; dst->uf_busy = src->uf_busy; dst->uf_portfd = src->uf_portfd; + dst->uf_gen = src->uf_gen; } /* @@ -487,7 +488,7 @@ free_afd(afd_t *afd) /* called below and from thread_free() */ afd->a_fd[i] = -1; } -static void +void set_active_fd(int fd) { afd_t *afd = &curthread->t_activefd; @@ -575,13 +576,12 @@ is_active_fd(kthread_t *t, int fd) } /* - * Convert a user supplied file descriptor into a pointer to a file - * structure. Only task is to check range of the descriptor (soft - * resource limit was enforced at open time and shouldn't be checked - * here). + * Convert a user supplied file descriptor into a pointer to a file structure. + * Only task is to check range of the descriptor (soft resource limit was + * enforced at open time and shouldn't be checked here). */ file_t * -getf(int fd) +getf_gen(int fd, uf_entry_gen_t *genp) { uf_info_t *fip = P_FINFO(curproc); uf_entry_t *ufp; @@ -607,6 +607,9 @@ getf(int fd) return (NULL); } ufp->uf_refcnt++; + if (genp != NULL) { + *genp = ufp->uf_gen; + } set_active_fd(fd); /* record the active file descriptor */ @@ -615,6 +618,12 @@ getf(int fd) return (fp); } +file_t * +getf(int fd) +{ + return (getf_gen(fd, NULL)); +} + /* * Close whatever file currently occupies the file descriptor slot * and install the new file, usually NULL, in the file descriptor slot. @@ -667,6 +676,7 @@ closeandsetf(int fd, file_t *newfp) ASSERT(ufp->uf_flag == 0); fd_reserve(fip, fd, 1); ufp->uf_file = newfp; + ufp->uf_gen++; UF_EXIT(ufp); mutex_exit(&fip->fi_lock); return (0); @@ -852,7 +862,8 @@ flist_fork(uf_info_t *pfip, uf_info_t *cfip) */ cfip->fi_nfiles = nfiles = flist_minsize(pfip); - cfip->fi_list = kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP); + cfip->fi_list = nfiles == 0 ? 
NULL : + kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP); for (fd = 0, pufp = pfip->fi_list, cufp = cfip->fi_list; fd < nfiles; fd++, pufp++, cufp++) { @@ -860,6 +871,7 @@ flist_fork(uf_info_t *pfip, uf_info_t *cfip) cufp->uf_alloc = pufp->uf_alloc; cufp->uf_flag = pufp->uf_flag; cufp->uf_busy = pufp->uf_busy; + cufp->uf_gen = pufp->uf_gen; if (pufp->uf_file == NULL) { ASSERT(pufp->uf_flag == 0); if (pufp->uf_busy) { @@ -1028,6 +1040,9 @@ ufalloc_file(int start, file_t *fp) fd_reserve(fip, fd, 1); ASSERT(ufp->uf_file == NULL); ufp->uf_file = fp; + if (fp != NULL) { + ufp->uf_gen++; + } UF_EXIT(ufp); mutex_exit(&fip->fi_lock); return (fd); @@ -1183,6 +1198,7 @@ setf(int fd, file_t *fp) } else { UF_ENTER(ufp, fip, fd); ASSERT(ufp->uf_busy); + ufp->uf_gen++; } ASSERT(ufp->uf_fpollinfo == NULL); ASSERT(ufp->uf_flag == 0); @@ -1212,8 +1228,7 @@ f_getfl(int fd, int *flagp) error = EBADF; else { vnode_t *vp = fp->f_vnode; - int flag = fp->f_flag | - ((fp->f_flag2 & ~FEPOLLED) << 16); + int flag = fp->f_flag | (fp->f_flag2 << 16); /* * BSD fcntl() FASYNC compatibility. diff --git a/usr/src/uts/common/os/fork.c b/usr/src/uts/common/os/fork.c index a63931459f..7e198910b4 100644 --- a/usr/src/uts/common/os/fork.c +++ b/usr/src/uts/common/os/fork.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright 2016, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -84,6 +84,7 @@ static int64_t cfork(int, int, int); static int getproc(proc_t **, pid_t, uint_t); #define GETPROC_USER 0x0 #define GETPROC_KERNEL 0x1 +#define GETPROC_ZSCHED 0x2 static void fork_fail(proc_t *); static void forklwp_fail(proc_t *); @@ -705,7 +706,7 @@ fork_fail(proc_t *cp) if (PTOU(curproc)->u_cwd) refstr_rele(PTOU(curproc)->u_cwd); if (PROC_IS_BRANDED(cp)) { - brand_clearbrand(cp, B_TRUE); + brand_clearbrand(cp, B_FALSE); } } @@ -754,7 +755,7 @@ forklwp_fail(proc_t *p) kmem_free(t->t_door, sizeof (door_data_t)); t->t_door = NULL; } - lwp_ctmpl_clear(ttolwp(t)); + lwp_ctmpl_clear(ttolwp(t), B_FALSE); /* * Remove the thread from the all threads list. @@ -791,6 +792,9 @@ extern struct as kas; /* * fork a kernel process. + * + * Passing a pid argument of -1 indicates that the new process should be + * launched as a child of 'zsched' within the zone. */ int newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, @@ -809,6 +813,7 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, rctl_set_t *init_set; ASSERT(pid != 1); + ASSERT(pid >= 0); if (getproc(&p, pid, GETPROC_KERNEL) < 0) return (EAGAIN); @@ -852,8 +857,18 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, rctl_set_t *init_set; task_t *tk, *tk_old; klwp_t *lwp; + boolean_t pzsched = B_FALSE; + int flag = GETPROC_USER; + + /* Handle a new user-level thread as child of zsched. */ + if (pid < 0) { + VERIFY(curzone != global_zone); + flag = GETPROC_ZSCHED; + pzsched = B_TRUE; + pid = 0; + } - if (getproc(&p, pid, GETPROC_USER) < 0) + if (getproc(&p, pid, flag) < 0) return (EAGAIN); /* * init creates a new task, distinct from the task @@ -914,7 +929,8 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, } t = lwptot(lwp); - ctp = contract_process_fork(sys_process_tmpl, p, curproc, + ctp = contract_process_fork(sys_process_tmpl, p, + (pzsched ? 
curproc->p_zone->zone_zsched : curproc), B_FALSE); ASSERT(ctp != NULL); if (ct != NULL) @@ -955,7 +971,11 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN) return (-1); /* no point in starting new processes */ - pp = (flags & GETPROC_KERNEL) ? &p0 : curproc; + if (flags & GETPROC_ZSCHED) { + pp = curproc->p_zone->zone_zsched; + } else { + pp = (flags & GETPROC_KERNEL) ? &p0 : curproc; + } task = pp->p_task; proj = task->tk_proj; zone = pp->p_zone; @@ -1016,6 +1036,9 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) cp->p_t1_lgrpid = LGRP_NONE; cp->p_tr_lgrpid = LGRP_NONE; + /* Default to native brand initially */ + cp->p_brand = &native_brand; + if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) { if (nproc == v.v_proc) { CPU_STATS_ADDQ(CPU, sys, procovf, 1); @@ -1083,9 +1106,6 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD); cp->p_sessp = pp->p_sessp; sess_hold(pp); - cp->p_brand = pp->p_brand; - if (PROC_IS_BRANDED(pp)) - BROP(pp)->b_copy_procdata(cp, pp); cp->p_bssbase = pp->p_bssbase; cp->p_brkbase = pp->p_brkbase; cp->p_brksize = pp->p_brksize; @@ -1170,6 +1190,18 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) mutex_exit(&cp->p_lock); mutex_exit(&pidlock); + if (PROC_IS_BRANDED(pp)) { + /* + * The only reason why process branding should fail is when + * the procedure is complicated by multiple LWPs on the scene. + * With an LWP count of 0, this newly allocated process has no + * reason to fail branding. + */ + VERIFY0(brand_setbrand(cp, B_FALSE)); + + BROP(pp)->b_copy_procdata(cp, pp); + } + avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t), offsetof(contract_t, ct_ctlist)); @@ -1187,6 +1219,7 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) */ fcnt_add(P_FINFO(pp), 1); + mutex_enter(&pp->p_lock); if (PTOU(pp)->u_cdir) { VN_HOLD(PTOU(pp)->u_cdir); } else { @@ -1200,6 +1233,7 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) VN_HOLD(PTOU(pp)->u_rdir); if (PTOU(pp)->u_cwd) refstr_hold(PTOU(pp)->u_cwd); + mutex_exit(&pp->p_lock); /* * copy the parent's uarea. diff --git a/usr/src/uts/common/os/grow.c b/usr/src/uts/common/os/grow.c index 647bca2542..a3de80259f 100644 --- a/usr/src/uts/common/os/grow.c +++ b/usr/src/uts/common/os/grow.c @@ -19,7 +19,10 @@ * CDDL HEADER END */ -/* Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. */ +/* + * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. @@ -52,6 +55,7 @@ #include <sys/fcntl.h> #include <sys/lwpchan_impl.h> #include <sys/nbmlock.h> +#include <sys/brand.h> #include <vm/hat.h> #include <vm/as.h> @@ -540,6 +544,20 @@ choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off, return (0); } +caddr_t +map_userlimit(proc_t *pp, struct as *as, int flags) +{ + if (flags & _MAP_LOW32) { + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_map32limit != NULL) { + return ((caddr_t)(uintptr_t)BROP(pp)->b_map32limit(pp)); + } else { + return ((caddr_t)_userlimit32); + } + } + + return (as->a_userlimit); +} + /* * Used for MAP_ANON - fast way to get anonymous pages @@ -555,8 +573,6 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags, return (EACCES); if ((flags & MAP_FIXED) != 0) { - caddr_t userlimit; - /* * Use the user address. First verify that * the address to be used is page aligned. 
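The new map_userlimit() above gives _MAP_LOW32 mappings a single source of truth for their address ceiling and lets a brand lower it through the b_map32limit hook. A userland sketch of requesting a low mapping; it assumes the x86 MAP_32BIT mmap flag (the user-visible face of _MAP_LOW32), which is not itself part of this diff:

#include <stdio.h>
#include <sys/mman.h>

int
main(void)
{
	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANON | MAP_32BIT, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return (1);
	}

	/*
	 * In a 64-bit process this address stays below 4G, or lower
	 * still if a brand's b_map32limit hook reduces the ceiling.
	 */
	(void) printf("mapped at %p\n", p);
	return (0);
}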
@@ -565,9 +581,8 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags, if (((uintptr_t)*addrp & PAGEOFFSET) != 0) return (EINVAL); - userlimit = flags & _MAP_LOW32 ? - (caddr_t)USERLIMIT32 : as->a_userlimit; - switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { + switch (valid_usr_range(*addrp, len, uprot, as, + map_userlimit(as->a_proc, as, flags))) { case RANGE_OKAY: break; case RANGE_BADPROT: @@ -750,8 +765,6 @@ smmap_common(caddr_t *addrp, size_t len, * If the user specified an address, do some simple checks here */ if ((flags & MAP_FIXED) != 0) { - caddr_t userlimit; - /* * Use the user address. First verify that * the address to be used is page aligned. @@ -759,10 +772,8 @@ smmap_common(caddr_t *addrp, size_t len, */ if (((uintptr_t)*addrp & PAGEOFFSET) != 0) return (EINVAL); - - userlimit = flags & _MAP_LOW32 ? - (caddr_t)USERLIMIT32 : as->a_userlimit; - switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { + switch (valid_usr_range(*addrp, len, uprot, as, + map_userlimit(curproc, as, flags))) { case RANGE_OKAY: break; case RANGE_BADPROT: diff --git a/usr/src/uts/common/os/id_space.c b/usr/src/uts/common/os/id_space.c deleted file mode 100644 index 2dad0cb940..0000000000 --- a/usr/src/uts/common/os/id_space.c +++ /dev/null @@ -1,159 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#include <sys/types.h> -#include <sys/id_space.h> -#include <sys/debug.h> - -/* - * ID Spaces - * - * The id_space_t provides a simple implementation of a managed range of - * integer identifiers using a vmem arena. An ID space guarantees that the - * next identifer returned by an allocation is larger than the previous one, - * unless there are no larger slots remaining in the range. In this case, - * the ID space will return the first available slot in the lower part of the - * range (viewing the previous identifier as a partitioning element). If no - * slots are available, id_alloc()/id_allocff() will sleep until an - * identifier becomes available. Accordingly, id_space allocations must be - * initiated from contexts where sleeping is acceptable. id_alloc_nosleep()/ - * id_allocff_nosleep() will return -1 if no slots are available or if the - * system is low on memory. If id_alloc_nosleep() fails, callers should - * not try to extend the ID space. This is to avoid making a possible - * low-memory situation worse. - * - * As an ID space is designed for representing a range of id_t's, there - * is a preexisting maximal range: [0, MAXUID]. ID space requests outside - * that range will fail on a DEBUG kernel. 
The id_allocff*() functions - * return the first available id, and should be used when there is benefit - * to having a compact allocated range. - * - * (Presently, the id_space_t abstraction supports only direct allocations; ID - * reservation, in which an ID is allocated but placed in a internal - * dictionary for later use, should be added when a consuming subsystem - * arrives.) - */ - -#define ID_TO_ADDR(id) ((void *)(uintptr_t)(id + 1)) -#define ADDR_TO_ID(addr) ((id_t)((uintptr_t)addr - 1)) - -/* - * Create an arena to represent the range [low, high). - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_space_t * -id_space_create(const char *name, id_t low, id_t high) -{ - ASSERT(low >= 0); - ASSERT(low < high); - - return (vmem_create(name, ID_TO_ADDR(low), high - low, 1, - NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER)); -} - -/* - * Destroy a previously created ID space. - * No restrictions on caller's context. - */ -void -id_space_destroy(id_space_t *isp) -{ - vmem_destroy(isp); -} - -void -id_space_extend(id_space_t *isp, id_t low, id_t high) -{ - (void) vmem_add(isp, ID_TO_ADDR(low), high - low, VM_SLEEP); -} - -/* - * Allocate an id_t from specified ID space. - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_t -id_alloc(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_NEXTFIT))); -} - -/* - * Allocate an id_t from specified ID space. - * Returns -1 on failure (see module block comments for more information on - * failure modes). - */ -id_t -id_alloc_nosleep(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_NEXTFIT))); -} - -/* - * Allocate an id_t from specified ID space using FIRSTFIT. - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_t -id_allocff(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_FIRSTFIT))); -} - -/* - * Allocate an id_t from specified ID space using FIRSTFIT - * Returns -1 on failure (see module block comments for more information on - * failure modes). - */ -id_t -id_allocff_nosleep(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_FIRSTFIT))); -} - -/* - * Allocate a specific identifier if possible, returning the id if - * successful, or -1 on failure. - */ -id_t -id_alloc_specific_nosleep(id_space_t *isp, id_t id) -{ - void *minaddr = ID_TO_ADDR(id); - void *maxaddr = ID_TO_ADDR(id + 1); - - /* - * Note that even though we're vmem_free()ing this later, it - * should be OK, since there's no quantum cache. - */ - return (ADDR_TO_ID(vmem_xalloc(isp, 1, 1, 0, 0, - minaddr, maxaddr, VM_NOSLEEP))); -} - -/* - * Free a previously allocated ID. - * No restrictions on caller's context. - */ -void -id_free(id_space_t *isp, id_t id) -{ - vmem_free(isp, ID_TO_ADDR(id), 1); -} diff --git a/usr/src/uts/common/os/ipc.c b/usr/src/uts/common/os/ipc.c index 9381019cd1..6a6f5d84ef 100644 --- a/usr/src/uts/common/os/ipc.c +++ b/usr/src/uts/common/os/ipc.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -1217,6 +1218,23 @@ ipc_remove(ipc_service_t *service, kipc_perm_t *perm) (IPC_ZONE_USAGE(perm, service) == 0))); } +/* + * Perform actual IPC_RMID, either via ipc_rmid or due to a delayed *_RMID. 
+ */ +void +ipc_rmsvc(ipc_service_t *service, kipc_perm_t *perm) +{ + ASSERT(service->ipcs_count > 0); + ASSERT(MUTEX_HELD(&service->ipcs_lock)); + + ipc_remove(service, perm); + mutex_exit(&service->ipcs_lock); + + /* perform any per-service removal actions */ + service->ipcs_rmid(perm); + + ipc_rele(service, perm); +} /* * Common code to perform an IPC_RMID. Returns an errno value on @@ -1247,13 +1265,7 @@ ipc_rmid(ipc_service_t *service, int id, cred_t *cr) /* * Nothing can fail from this point on. */ - ipc_remove(service, perm); - mutex_exit(&service->ipcs_lock); - - /* perform any per-service removal actions */ - service->ipcs_rmid(perm); - - ipc_rele(service, perm); + ipc_rmsvc(service, perm); return (0); } diff --git a/usr/src/uts/common/os/kmem.c b/usr/src/uts/common/os/kmem.c index 9187b4bdeb..1243d0fbbf 100644 --- a/usr/src/uts/common/os/kmem.c +++ b/usr/src/uts/common/os/kmem.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 Joyent, Inc. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -159,10 +160,22 @@ * find known objects and is about to free it, or * c) the client has freed the object. * In all these cases (a, b, and c) kmem frees the new object (the - * unused copy destination) and searches for the old object in the - * magazine layer. If found, the object is removed from the magazine - * layer and freed to the slab layer so it will no longer hold the - * slab hostage. + * unused copy destination). In the first case, the object is in + * use and the correct action is that for LATER; in the latter two + * cases, we know that the object is either freed or about to be + * freed, in which case it is either already in a magazine or about + * to be in one. In these cases, we know that the object will either + * be reallocated and reused, or it will end up in a full magazine + * that will be reaped (thereby liberating the slab). Because it + * is prohibitively expensive to differentiate these cases, and + * because the defrag code is executed when we're low on memory + * (thereby biasing the system to reclaim full magazines) we treat + * all DONT_KNOW cases as LATER and rely on cache reaping to + * generally clean up full magazines. While we take the same action + * for these cases, we maintain their semantic distinction: if + * defragmentation is not occurring, it is useful to know if this + * is due to objects in use (LATER) or objects in an unknown state + * of transition (DONT_KNOW). * * 2.3 Object States * @@ -285,10 +298,10 @@ * view of the slab layer, making it a candidate for the move callback. Most * objects unrecognized by the client in the move callback fall into this * category and are cheaply distinguished from known objects by the test - * described earlier. Since recognition is cheap for the client, and searching - * magazines is expensive for kmem, kmem defers searching until the client first - * returns KMEM_CBRC_DONT_KNOW. As long as the needed effort is reasonable, kmem - * elsewhere does what it can to avoid bothering the client unnecessarily. + * described earlier. Because searching magazines is prohibitively expensive + * for kmem, clients that do not mark freed objects (and therefore return + * KMEM_CBRC_DONT_KNOW for large numbers of objects) may find defragmentation + * efficacy reduced. 
* * Invalidating the designated pointer member before freeing the object marks * the object to be avoided in the callback, and conversely, assigning a valid @@ -998,6 +1011,7 @@ size_t kmem_transaction_log_size; /* transaction log size [2% of memory] */ size_t kmem_content_log_size; /* content log size [2% of memory] */ size_t kmem_failure_log_size; /* failure log [4 pages per CPU] */ size_t kmem_slab_log_size; /* slab create log [4 pages per CPU] */ +size_t kmem_zerosized_log_size; /* zero-sized log [4 pages per CPU] */ size_t kmem_content_maxsave = 256; /* KMF_CONTENTS max bytes to log */ size_t kmem_lite_minsize = 0; /* minimum buffer size for KMF_LITE */ size_t kmem_lite_maxalign = 1024; /* maximum buffer alignment for KMF_LITE */ @@ -1005,6 +1019,14 @@ int kmem_lite_pcs = 4; /* number of PCs to store in KMF_LITE mode */ size_t kmem_maxverify; /* maximum bytes to inspect in debug routines */ size_t kmem_minfirewall; /* hardware-enforced redzone threshold */ +#ifdef DEBUG +int kmem_warn_zerosized = 1; /* whether to warn on zero-sized KM_SLEEP */ +#else +int kmem_warn_zerosized = 0; /* whether to warn on zero-sized KM_SLEEP */ +#endif + +int kmem_panic_zerosized = 0; /* whether to panic on zero-sized KM_SLEEP */ + #ifdef _LP64 size_t kmem_max_cached = KMEM_BIG_MAXBUF; /* maximum kmem_alloc cache */ #else @@ -1038,21 +1060,7 @@ static vmem_t *kmem_default_arena; static vmem_t *kmem_firewall_va_arena; static vmem_t *kmem_firewall_arena; -/* - * Define KMEM_STATS to turn on statistic gathering. By default, it is only - * turned on when DEBUG is also defined. - */ -#ifdef DEBUG -#define KMEM_STATS -#endif /* DEBUG */ - -#ifdef KMEM_STATS -#define KMEM_STAT_ADD(stat) ((stat)++) -#define KMEM_STAT_COND_ADD(cond, stat) ((void) (!(cond) || (stat)++)) -#else -#define KMEM_STAT_ADD(stat) /* nothing */ -#define KMEM_STAT_COND_ADD(cond, stat) /* nothing */ -#endif /* KMEM_STATS */ +static int kmem_zerosized; /* # of zero-sized allocs */ /* * kmem slab consolidator thresholds (tunables) @@ -1071,47 +1079,6 @@ size_t kmem_reclaim_max_slabs = 1; */ size_t kmem_reclaim_scan_range = 12; -#ifdef KMEM_STATS -static struct { - uint64_t kms_callbacks; - uint64_t kms_yes; - uint64_t kms_no; - uint64_t kms_later; - uint64_t kms_dont_need; - uint64_t kms_dont_know; - uint64_t kms_hunt_found_mag; - uint64_t kms_hunt_found_slab; - uint64_t kms_hunt_alloc_fail; - uint64_t kms_hunt_lucky; - uint64_t kms_notify; - uint64_t kms_notify_callbacks; - uint64_t kms_disbelief; - uint64_t kms_already_pending; - uint64_t kms_callback_alloc_fail; - uint64_t kms_callback_taskq_fail; - uint64_t kms_endscan_slab_dead; - uint64_t kms_endscan_slab_destroyed; - uint64_t kms_endscan_nomem; - uint64_t kms_endscan_refcnt_changed; - uint64_t kms_endscan_nomove_changed; - uint64_t kms_endscan_freelist; - uint64_t kms_avl_update; - uint64_t kms_avl_noupdate; - uint64_t kms_no_longer_reclaimable; - uint64_t kms_notify_no_longer_reclaimable; - uint64_t kms_notify_slab_dead; - uint64_t kms_notify_slab_destroyed; - uint64_t kms_alloc_fail; - uint64_t kms_constructor_fail; - uint64_t kms_dead_slabs_freed; - uint64_t kms_defrags; - uint64_t kms_scans; - uint64_t kms_scan_depot_ws_reaps; - uint64_t kms_debug_reaps; - uint64_t kms_debug_scans; -} kmem_move_stats; -#endif /* KMEM_STATS */ - /* consolidator knobs */ boolean_t kmem_move_noreap; boolean_t kmem_move_blocked; @@ -1142,6 +1109,7 @@ kmem_log_header_t *kmem_transaction_log; kmem_log_header_t *kmem_content_log; kmem_log_header_t *kmem_failure_log; kmem_log_header_t *kmem_slab_log; 
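/*
 * [Editor's aside] The kmem_warn_zerosized and kmem_panic_zerosized
 * tunables above, together with the kmem_zerosized_log declared below,
 * support the new zero-sized allocation handling in kmem_alloc(). A
 * sketch of the caller-side guard this encourages (xx_buf_alloc is a
 * hypothetical function, not part of this change):
 *
 *	static void *
 *	xx_buf_alloc(size_t len)
 *	{
 *		if (len == 0)
 *			return (NULL);	/* explicitly: no buffer */
 *		return (kmem_alloc(len, KM_SLEEP));
 *	}
 *
 * With this change, kmem_alloc(0, KM_SLEEP) itself returns NULL after
 * noting the event in kmem_zerosized_log (warning or panicking per the
 * tunables), so a sleeping allocation can no longer be assumed to
 * succeed when a computed size might be zero.
 */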
+kmem_log_header_t *kmem_zerosized_log;
 
 static int kmem_lite_count; /* # of PCs in kmem_buftag_lite_t */
 
@@ -1922,15 +1890,7 @@ kmem_slab_free(kmem_cache_t *cp, void *buf)
 cp->cache_complete_slab_count--;
 avl_add(&cp->cache_partial_slabs, sp);
 } else {
-#ifdef DEBUG
- if (avl_update_gt(&cp->cache_partial_slabs, sp)) {
- KMEM_STAT_ADD(kmem_move_stats.kms_avl_update);
- } else {
- KMEM_STAT_ADD(kmem_move_stats.kms_avl_noupdate);
- }
-#else
 (void) avl_update_gt(&cp->cache_partial_slabs, sp);
-#endif
 }
 
 ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) ==
@@ -2964,8 +2924,33 @@ kmem_alloc(size_t size, int kmflag)
 
 /* fall through to kmem_cache_alloc() */
 
 } else {
- if (size == 0)
+ if (size == 0) {
+ if (kmflag != KM_SLEEP && !(kmflag & KM_PANIC))
+ return (NULL);
+
+ /*
+ * If this is a sleeping allocation or one that has
+ * been specified to panic on allocation failure, we
+ * consider it to be deprecated behavior to allocate
+ * 0 bytes. If we have been configured to panic under
+ * this condition, we panic; if to warn, we warn -- and
+ * regardless, we log to the kmem_zerosized_log that
+ * this condition has occurred (which gives us
+ * enough information to be able to debug it).
+ */
+ if (kmem_panic && kmem_panic_zerosized)
+ panic("attempted to kmem_alloc() size of 0");
+
+ if (kmem_warn_zerosized) {
+ cmn_err(CE_WARN, "kmem_alloc(): sleeping "
+ "allocation with size of 0; "
+ "see kmem_zerosized_log for details");
+ }
+
+ kmem_log_event(kmem_zerosized_log, NULL, NULL, NULL);
+ return (NULL);
+ }
 
 buf = vmem_alloc(kmem_oversize_arena, size,
 kmflag & KM_VMFLAGS);
@@ -3579,7 +3564,7 @@ kmem_cache_kstat_update(kstat_t *ksp, int rw)
 kmcp->kmc_move_later.value.ui64 = kd->kmd_later;
 kmcp->kmc_move_dont_need.value.ui64 = kd->kmd_dont_need;
 kmcp->kmc_move_dont_know.value.ui64 = kd->kmd_dont_know;
- kmcp->kmc_move_hunt_found.value.ui64 = kd->kmd_hunt_found;
+ kmcp->kmc_move_hunt_found.value.ui64 = 0;
 kmcp->kmc_move_slabs_freed.value.ui64 = kd->kmd_slabs_freed;
 kmcp->kmc_defrag.value.ui64 = kd->kmd_defrags;
 kmcp->kmc_scan.value.ui64 = kd->kmd_scans;
@@ -4150,7 +4135,8 @@ kmem_cache_destroy(kmem_cache_t *cp)
 
 if (kmem_taskq != NULL)
 taskq_wait(kmem_taskq);
- if (kmem_move_taskq != NULL)
+
+ if (kmem_move_taskq != NULL && cp->cache_defrag != NULL)
 taskq_wait(kmem_move_taskq);
 
 kmem_cache_magazine_purge(cp);
@@ -4488,8 +4474,8 @@ kmem_init(void)
 }
 
 kmem_failure_log = kmem_log_init(kmem_failure_log_size);
-
 kmem_slab_log = kmem_log_init(kmem_slab_log_size);
+ kmem_zerosized_log = kmem_log_init(kmem_zerosized_log_size);
 
 /*
 * Initialize STREAMS message caches so allocb() is available.
@@ -4677,94 +4663,6 @@ kmem_slab_is_reclaimable(kmem_cache_t *cp, kmem_slab_t *sp, int flags)
 (sp->slab_chunks * cp->cache_defrag->kmd_reclaim_numer));
 }
 
-static void *
-kmem_hunt_mag(kmem_cache_t *cp, kmem_magazine_t *m, int n, void *buf,
- void *tbuf)
-{
- int i; /* magazine round index */
-
- for (i = 0; i < n; i++) {
- if (buf == m->mag_round[i]) {
- if (cp->cache_flags & KMF_BUFTAG) {
- (void) kmem_cache_free_debug(cp, tbuf,
- caller());
- }
- m->mag_round[i] = tbuf;
- return (buf);
- }
- }
-
- return (NULL);
-}
-
-/*
- * Hunt the magazine layer for the given buffer. If found, the buffer is
- * removed from the magazine layer and returned, otherwise NULL is returned.
- * The state of the returned buffer is freed and constructed.
- */ -static void * -kmem_hunt_mags(kmem_cache_t *cp, void *buf) -{ - kmem_cpu_cache_t *ccp; - kmem_magazine_t *m; - int cpu_seqid; - int n; /* magazine rounds */ - void *tbuf; /* temporary swap buffer */ - - ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); - - /* - * Allocated a buffer to swap with the one we hope to pull out of a - * magazine when found. - */ - tbuf = kmem_cache_alloc(cp, KM_NOSLEEP); - if (tbuf == NULL) { - KMEM_STAT_ADD(kmem_move_stats.kms_hunt_alloc_fail); - return (NULL); - } - if (tbuf == buf) { - KMEM_STAT_ADD(kmem_move_stats.kms_hunt_lucky); - if (cp->cache_flags & KMF_BUFTAG) { - (void) kmem_cache_free_debug(cp, buf, caller()); - } - return (buf); - } - - /* Hunt the depot. */ - mutex_enter(&cp->cache_depot_lock); - n = cp->cache_magtype->mt_magsize; - for (m = cp->cache_full.ml_list; m != NULL; m = m->mag_next) { - if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) { - mutex_exit(&cp->cache_depot_lock); - return (buf); - } - } - mutex_exit(&cp->cache_depot_lock); - - /* Hunt the per-CPU magazines. */ - for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) { - ccp = &cp->cache_cpu[cpu_seqid]; - - mutex_enter(&ccp->cc_lock); - m = ccp->cc_loaded; - n = ccp->cc_rounds; - if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) { - mutex_exit(&ccp->cc_lock); - return (buf); - } - m = ccp->cc_ploaded; - n = ccp->cc_prounds; - if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) { - mutex_exit(&ccp->cc_lock); - return (buf); - } - mutex_exit(&ccp->cc_lock); - } - - kmem_cache_free(cp, tbuf); - return (NULL); -} - /* * May be called from the kmem_move_taskq, from kmem_cache_move_notify_task(), * or when the buffer is freed. @@ -4828,7 +4726,7 @@ static void kmem_move_end(kmem_cache_t *, kmem_move_t *); * NO kmem frees the new buffer, marks the slab of the old buffer * non-reclaimable to avoid bothering the client again * LATER kmem frees the new buffer, increments slab_later_count - * DONT_KNOW kmem frees the new buffer, searches mags for the old buffer + * DONT_KNOW kmem frees the new buffer * DONT_NEED kmem frees both the old buffer and the new buffer * * The pending callback argument now being processed contains both of the @@ -4862,19 +4760,14 @@ kmem_move_buffer(kmem_move_t *callback) * another buffer on the same slab. */ if (!kmem_slab_is_reclaimable(cp, sp, callback->kmm_flags)) { - KMEM_STAT_ADD(kmem_move_stats.kms_no_longer_reclaimable); - KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY), - kmem_move_stats.kms_notify_no_longer_reclaimable); kmem_slab_free(cp, callback->kmm_to_buf); kmem_move_end(cp, callback); return; } /* - * Hunting magazines is expensive, so we'll wait to do that until the - * client responds KMEM_CBRC_DONT_KNOW. However, checking the slab layer - * is cheap, so we might as well do that here in case we can avoid - * bothering the client. + * Checking the slab layer is easy, so we might as well do that here + * in case we can avoid bothering the client. 
*/ mutex_enter(&cp->cache_lock); free_on_slab = (kmem_slab_allocated(cp, sp, @@ -4882,7 +4775,6 @@ kmem_move_buffer(kmem_move_t *callback) mutex_exit(&cp->cache_lock); if (free_on_slab) { - KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_slab); kmem_slab_free(cp, callback->kmm_to_buf); kmem_move_end(cp, callback); return; @@ -4894,7 +4786,6 @@ kmem_move_buffer(kmem_move_t *callback) */ if (kmem_cache_alloc_debug(cp, callback->kmm_to_buf, KM_NOSLEEP, 1, caller()) != 0) { - KMEM_STAT_ADD(kmem_move_stats.kms_alloc_fail); kmem_move_end(cp, callback); return; } @@ -4902,15 +4793,11 @@ kmem_move_buffer(kmem_move_t *callback) cp->cache_constructor(callback->kmm_to_buf, cp->cache_private, KM_NOSLEEP) != 0) { atomic_inc_64(&cp->cache_alloc_fail); - KMEM_STAT_ADD(kmem_move_stats.kms_constructor_fail); kmem_slab_free(cp, callback->kmm_to_buf); kmem_move_end(cp, callback); return; } - KMEM_STAT_ADD(kmem_move_stats.kms_callbacks); - KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY), - kmem_move_stats.kms_notify_callbacks); cp->cache_defrag->kmd_callbacks++; cp->cache_defrag->kmd_thread = curthread; cp->cache_defrag->kmd_from_buf = callback->kmm_from_buf; @@ -4928,7 +4815,6 @@ kmem_move_buffer(kmem_move_t *callback) cp->cache_defrag->kmd_to_buf = NULL; if (response == KMEM_CBRC_YES) { - KMEM_STAT_ADD(kmem_move_stats.kms_yes); cp->cache_defrag->kmd_yes++; kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE); /* slab safe to access until kmem_move_end() */ @@ -4943,14 +4829,12 @@ kmem_move_buffer(kmem_move_t *callback) switch (response) { case KMEM_CBRC_NO: - KMEM_STAT_ADD(kmem_move_stats.kms_no); cp->cache_defrag->kmd_no++; mutex_enter(&cp->cache_lock); kmem_slab_move_no(cp, sp, callback->kmm_from_buf); mutex_exit(&cp->cache_lock); break; case KMEM_CBRC_LATER: - KMEM_STAT_ADD(kmem_move_stats.kms_later); cp->cache_defrag->kmd_later++; mutex_enter(&cp->cache_lock); if (!KMEM_SLAB_IS_PARTIAL(sp)) { @@ -4959,7 +4843,6 @@ kmem_move_buffer(kmem_move_t *callback) } if (++sp->slab_later_count >= KMEM_DISBELIEF) { - KMEM_STAT_ADD(kmem_move_stats.kms_disbelief); kmem_slab_move_no(cp, sp, callback->kmm_from_buf); } else if (!(sp->slab_flags & KMEM_SLAB_NOMOVE)) { sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp, @@ -4968,7 +4851,6 @@ kmem_move_buffer(kmem_move_t *callback) mutex_exit(&cp->cache_lock); break; case KMEM_CBRC_DONT_NEED: - KMEM_STAT_ADD(kmem_move_stats.kms_dont_need); cp->cache_defrag->kmd_dont_need++; kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE); if (sp->slab_refcnt == 0) @@ -4978,19 +4860,21 @@ kmem_move_buffer(kmem_move_t *callback) mutex_exit(&cp->cache_lock); break; case KMEM_CBRC_DONT_KNOW: - KMEM_STAT_ADD(kmem_move_stats.kms_dont_know); + /* + * If we don't know if we can move this buffer or not, we'll + * just assume that we can't: if the buffer is in fact free, + * then it is sitting in one of the per-CPU magazines or in + * a full magazine in the depot layer. Either way, because + * defrag is induced in the same logic that reaps a cache, + * it's likely that full magazines will be returned to the + * system soon (thereby accomplishing what we're trying to + * accomplish here: return those magazines to their slabs). + * Given this, any work that we might do now to locate a buffer + * in a magazine is wasted (and expensive!) work; we bump + * a counter in this case and otherwise assume that we can't + * move it. 
+ */ cp->cache_defrag->kmd_dont_know++; - if (kmem_hunt_mags(cp, callback->kmm_from_buf) != NULL) { - KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_mag); - cp->cache_defrag->kmd_hunt_found++; - kmem_slab_free_constructed(cp, callback->kmm_from_buf, - B_TRUE); - if (sp->slab_refcnt == 0) - cp->cache_defrag->kmd_slabs_freed++; - mutex_enter(&cp->cache_lock); - kmem_slab_move_yes(cp, sp, callback->kmm_from_buf); - mutex_exit(&cp->cache_lock); - } break; default: panic("'%s' (%p) unexpected move callback response %d\n", @@ -5015,10 +4899,9 @@ kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags) ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING); callback = kmem_cache_alloc(kmem_move_cache, KM_NOSLEEP); - if (callback == NULL) { - KMEM_STAT_ADD(kmem_move_stats.kms_callback_alloc_fail); + + if (callback == NULL) return (B_FALSE); - } callback->kmm_from_slab = sp; callback->kmm_from_buf = buf; @@ -5043,7 +4926,6 @@ kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags) pending->kmm_flags |= KMM_DESPERATE; } mutex_exit(&cp->cache_lock); - KMEM_STAT_ADD(kmem_move_stats.kms_already_pending); kmem_cache_free(kmem_move_cache, callback); return (B_TRUE); } @@ -5057,7 +4939,6 @@ kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags) if (!taskq_dispatch(kmem_move_taskq, (task_func_t *)kmem_move_buffer, callback, TQ_NOSLEEP)) { - KMEM_STAT_ADD(kmem_move_stats.kms_callback_taskq_fail); mutex_enter(&cp->cache_lock); avl_remove(&cp->cache_defrag->kmd_moves_pending, callback); mutex_exit(&cp->cache_lock); @@ -5103,7 +4984,6 @@ kmem_move_end(kmem_cache_t *cp, kmem_move_t *callback) cp->cache_slab_destroy++; mutex_exit(&cp->cache_lock); kmem_slab_destroy(cp, sp); - KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed); mutex_enter(&cp->cache_lock); } } @@ -5248,8 +5128,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs, * pending move completes. */ list_insert_head(deadlist, sp); - KMEM_STAT_ADD(kmem_move_stats. - kms_endscan_slab_dead); return (-1); } @@ -5264,10 +5142,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs, cp->cache_slab_destroy++; mutex_exit(&cp->cache_lock); kmem_slab_destroy(cp, sp); - KMEM_STAT_ADD(kmem_move_stats. - kms_dead_slabs_freed); - KMEM_STAT_ADD(kmem_move_stats. - kms_endscan_slab_destroyed); mutex_enter(&cp->cache_lock); /* * Since we can't pick up the scan where we left @@ -5283,8 +5157,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs, * for the request and say nothing about the * number of reclaimable slabs. */ - KMEM_STAT_COND_ADD(s < max_slabs, - kmem_move_stats.kms_endscan_nomem); return (-1); } @@ -5300,16 +5172,10 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs, * destination buffer on the same slab. In that * case, we're not interested in counting it. 
*/ - KMEM_STAT_COND_ADD(!(flags & KMM_DEBUG) && - (s < max_slabs), - kmem_move_stats.kms_endscan_refcnt_changed); return (-1); } - if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove) { - KMEM_STAT_COND_ADD(s < max_slabs, - kmem_move_stats.kms_endscan_nomove_changed); + if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove) return (-1); - } /* * Generating a move request allocates a destination @@ -5336,11 +5202,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs, } end_scan: - KMEM_STAT_COND_ADD(!(flags & KMM_DEBUG) && - (s < max_slabs) && - (sp == avl_first(&cp->cache_partial_slabs)), - kmem_move_stats.kms_endscan_freelist); - return (s); } @@ -5400,8 +5261,6 @@ kmem_cache_move_notify_task(void *arg) &cp->cache_defrag->kmd_moves_pending)) { list_insert_head(deadlist, sp); mutex_exit(&cp->cache_lock); - KMEM_STAT_ADD(kmem_move_stats. - kms_notify_slab_dead); return; } @@ -5409,9 +5268,6 @@ kmem_cache_move_notify_task(void *arg) cp->cache_slab_destroy++; mutex_exit(&cp->cache_lock); kmem_slab_destroy(cp, sp); - KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed); - KMEM_STAT_ADD(kmem_move_stats. - kms_notify_slab_destroyed); return; } } else { @@ -5425,7 +5281,6 @@ kmem_cache_move_notify(kmem_cache_t *cp, void *buf) { kmem_move_notify_args_t *args; - KMEM_STAT_ADD(kmem_move_stats.kms_notify); args = kmem_alloc(sizeof (kmem_move_notify_args_t), KM_NOSLEEP); if (args != NULL) { args->kmna_cache = cp; @@ -5448,7 +5303,6 @@ kmem_cache_defrag(kmem_cache_t *cp) n = avl_numnodes(&cp->cache_partial_slabs); if (n > 1) { /* kmem_move_buffers() drops and reacquires cache_lock */ - KMEM_STAT_ADD(kmem_move_stats.kms_defrags); cp->cache_defrag->kmd_defrags++; (void) kmem_move_buffers(cp, n, 0, KMM_DESPERATE); } @@ -5547,7 +5401,6 @@ kmem_cache_scan(kmem_cache_t *cp) * * kmem_move_buffers() drops and reacquires cache_lock. */ - KMEM_STAT_ADD(kmem_move_stats.kms_scans); kmd->kmd_scans++; slabs_found = kmem_move_buffers(cp, kmem_reclaim_scan_range, kmem_reclaim_max_slabs, 0); @@ -5588,12 +5441,9 @@ kmem_cache_scan(kmem_cache_t *cp) if (!kmem_move_noreap && ((debug_rand % kmem_mtb_reap) == 0)) { mutex_exit(&cp->cache_lock); - KMEM_STAT_ADD(kmem_move_stats.kms_debug_reaps); kmem_cache_reap(cp); return; } else if ((debug_rand % kmem_mtb_move) == 0) { - KMEM_STAT_ADD(kmem_move_stats.kms_scans); - KMEM_STAT_ADD(kmem_move_stats.kms_debug_scans); kmd->kmd_scans++; (void) kmem_move_buffers(cp, kmem_reclaim_scan_range, 1, KMM_DEBUG); @@ -5604,8 +5454,6 @@ kmem_cache_scan(kmem_cache_t *cp) mutex_exit(&cp->cache_lock); - if (reap) { - KMEM_STAT_ADD(kmem_move_stats.kms_scan_depot_ws_reaps); + if (reap) kmem_depot_ws_reap(cp); - } } diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c index 149f5f8a88..227efe9190 100644 --- a/usr/src/uts/common/os/logsubr.c +++ b/usr/src/uts/common/os/logsubr.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2013 Gary Mills * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2017, Joyent, Inc. */ #include <sys/types.h> @@ -249,8 +250,7 @@ log_init(void) */ printf("\rSunOS Release %s Version %s %u-bit\n", utsname.release, utsname.version, NBBY * (uint_t)sizeof (void *)); - printf("Copyright (c) 1983, 2010, Oracle and/or its affiliates. " - "All rights reserved.\n"); + printf("Copyright (c) 2010-2017, Joyent Inc. 
All rights reserved.\n"); #ifdef DEBUG printf("DEBUG enabled\n"); #endif @@ -491,7 +491,7 @@ log_console(log_t *lp, log_ctl_t *lc) mblk_t * log_makemsg(int mid, int sid, int level, int sl, int pri, void *msg, - size_t size, int on_intr) + size_t size, int on_intr) { mblk_t *mp = NULL; mblk_t *mp2; diff --git a/usr/src/uts/common/os/lwp.c b/usr/src/uts/common/os/lwp.c index feb8e76c42..341e4ae356 100644 --- a/usr/src/uts/common/os/lwp.c +++ b/usr/src/uts/common/os/lwp.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc. */ #include <sys/param.h> @@ -57,6 +57,8 @@ #include <sys/lgrp.h> #include <sys/rctl.h> #include <sys/contract_impl.h> +#include <sys/contract/process.h> +#include <sys/contract/process_impl.h> #include <sys/cpc_impl.h> #include <sys/sdt.h> #include <sys/cmn_err.h> @@ -115,7 +117,7 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p, ret_tidhash_t *ret_tidhash = NULL; int i; int rctlfail = 0; - boolean_t branded = 0; + void *brand_data = NULL; struct ctxop *ctx = NULL; ASSERT(cid != sysdccid); /* system threads must start in SYS */ @@ -283,6 +285,19 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p, */ lep = kmem_zalloc(sizeof (*lep), KM_SLEEP); + /* + * If necessary, speculatively allocate lwp brand data. This is done + * ahead of time so p_lock need not be dropped during lwp branding. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_lwpdata_alloc != NULL) { + if ((brand_data = BROP(p)->b_lwpdata_alloc(p)) == NULL) { + mutex_enter(&p->p_lock); + err = 1; + atomic_inc_32(&p->p_zone->zone_ffmisc); + goto error; + } + } + mutex_enter(&p->p_lock); grow: /* @@ -630,18 +645,6 @@ grow: } while (lwp_hash_lookup(p, t->t_tid) != NULL); } - /* - * If this is a branded process, let the brand do any necessary lwp - * initialization. - */ - if (PROC_IS_BRANDED(p)) { - if (BROP(p)->b_initlwp(lwp)) { - err = 1; - atomic_inc_32(&p->p_zone->zone_ffmisc); - goto error; - } - branded = 1; - } if (t->t_tid == 1) { kpreempt_disable(); @@ -654,7 +657,6 @@ grow: } } - p->p_lwpcnt++; t->t_waitfor = -1; /* @@ -696,8 +698,27 @@ grow: t->t_post_sys = 1; /* + * Perform lwp branding + * + * The b_initlwp hook is _not_ allowed to drop p->p_lock as it must be + * continuously held between when the tidhash is sized and when the lwp + * is inserted into it. Operations requiring p->p_lock to be + * temporarily dropped can be performed in b_initlwp_post. + */ + if (PROC_IS_BRANDED(p)) { + BROP(p)->b_initlwp(lwp, brand_data); + /* + * The b_initlwp hook is expected to consume any preallocated + * brand_data in a way that prepares it for deallocation by the + * b_freelwp hook. + */ + brand_data = NULL; + } + + /* * Insert the new thread into the list of all threads. */ + p->p_lwpcnt++; if ((tx = p->p_tlist) == NULL) { t->t_back = t; t->t_forw = t; @@ -718,6 +739,15 @@ grow: lep->le_start = t->t_start; lwp_hash_in(p, lep, p->p_tidhash, p->p_tidhash_sz, 1); + /* + * Complete lwp branding + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_initlwp_post != NULL) { + BROP(p)->b_initlwp_post(lwp); + } + + lwp_fp_init(lwp); + if (state == TS_RUN) { /* * We set the new lwp running immediately. 
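/*
 * [Editor's aside] A sketch of the division of labor the reworked
 * branding hooks impose, using hypothetical mybrand_* implementations
 * (mybrand_lwp_data_t is illustrative):
 */
static void *
mybrand_lwpdata_alloc(proc_t *p)
{
	/* runs without p_lock held; a NULL return aborts lwp_create() */
	return (kmem_zalloc(sizeof (mybrand_lwp_data_t), KM_NOSLEEP));
}

static void
mybrand_initlwp(klwp_t *lwp, void *data)
{
	/* p_lock held: consume the preallocated data without blocking */
	lwp->lwp_brand = data;
}

static void
mybrand_initlwp_post(klwp_t *lwp)
{
	/* p_lock may be dropped here for any heavier initialization */
}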
@@ -753,8 +783,9 @@ error: if (cid != NOCLASS && bufp != NULL) CL_FREE(cid, bufp); - if (branded) - BROP(p)->b_freelwp(lwp); + if (brand_data != NULL) { + BROP(p)->b_lwpdata_free(brand_data); + } mutex_exit(&p->p_lock); t->t_state = TS_FREE; @@ -827,8 +858,27 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src) int i; for (i = 0; i < ct_ntypes; i++) { - dst->lwp_ct_active[i] = ctmpl_dup(src->lwp_ct_active[i]); + ct_template_t *tmpl = src->lwp_ct_active[i]; + + /* + * If the process contract template is setup to be preserved + * across exec, then if we're forking, perform an implicit + * template_clear now. This ensures that future children of + * this child will remain in the same contract unless they're + * explicitly setup differently. We know we're forking if the + * two LWPs belong to different processes. + */ + if (i == CTT_PROCESS && tmpl != NULL) { + ctmpl_process_t *ctp = tmpl->ctmpl_data; + + if (dst->lwp_procp != src->lwp_procp && + (ctp->ctp_params & CT_PR_KEEP_EXEC) != 0) + tmpl = NULL; + } + + dst->lwp_ct_active[i] = ctmpl_dup(tmpl); dst->lwp_ct_latest[i] = NULL; + } } @@ -836,21 +886,33 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src) * Clear an LWP's contract template state. */ void -lwp_ctmpl_clear(klwp_t *lwp) +lwp_ctmpl_clear(klwp_t *lwp, boolean_t is_exec) { ct_template_t *tmpl; int i; for (i = 0; i < ct_ntypes; i++) { - if ((tmpl = lwp->lwp_ct_active[i]) != NULL) { - ctmpl_free(tmpl); - lwp->lwp_ct_active[i] = NULL; - } - if (lwp->lwp_ct_latest[i] != NULL) { contract_rele(lwp->lwp_ct_latest[i]); lwp->lwp_ct_latest[i] = NULL; } + + if ((tmpl = lwp->lwp_ct_active[i]) != NULL) { + /* + * If we're exec-ing a new program and the process + * contract template is setup to be preserved across + * exec, then don't clear it. + */ + if (is_exec && i == CTT_PROCESS) { + ctmpl_process_t *ctp = tmpl->ctmpl_data; + + if ((ctp->ctp_params & CT_PR_KEEP_EXEC) != 0) + continue; + } + + ctmpl_free(tmpl); + lwp->lwp_ct_active[i] = NULL; + } } } @@ -891,13 +953,6 @@ lwp_exit(void) if (t->t_upimutex != NULL) upimutex_cleanup(); - /* - * Perform any brand specific exit processing, then release any - * brand data associated with the lwp - */ - if (PROC_IS_BRANDED(p)) - BROP(p)->b_lwpexit(lwp); - lwp_pcb_exit(); mutex_enter(&p->p_lock); @@ -941,6 +996,18 @@ lwp_exit(void) DTRACE_PROC(lwp__exit); /* + * Perform any brand specific exit processing, then release any + * brand data associated with the lwp + */ + if (PROC_IS_BRANDED(p)) { + mutex_exit(&p->p_lock); + BROP(p)->b_lwpexit(lwp); + BROP(p)->b_freelwp(lwp); + mutex_enter(&p->p_lock); + prbarrier(p); + } + + /* * If the lwp is a detached lwp or if the process is exiting, * remove (lwp_hash_out()) the lwp from the lwp directory. * Otherwise null out the lwp's le_thread pointer in the lwp @@ -1101,7 +1168,7 @@ lwp_cleanup(void) } kpreempt_enable(); - lwp_ctmpl_clear(ttolwp(t)); + lwp_ctmpl_clear(ttolwp(t), B_FALSE); } int diff --git a/usr/src/uts/common/os/main.c b/usr/src/uts/common/os/main.c index 7afc1cfe00..dda0b3e4a6 100644 --- a/usr/src/uts/common/os/main.c +++ b/usr/src/uts/common/os/main.c @@ -27,7 +27,7 @@ /* All Rights Reserved */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. 
*/
 
 #include <sys/types.h>
@@ -157,7 +157,7 @@ exec_init(const char *initpath, const char *args)
 int error = 0, count = 0;
 proc_t *p = ttoproc(curthread);
 klwp_t *lwp = ttolwp(curthread);
- int brand_action;
+ int brand_action = EBA_NONE;
 
 if (args == NULL)
 args = "";
@@ -268,7 +268,15 @@ exec_init(const char *initpath, const char *args)
 */
 sigemptyset(&curthread->t_hold);
 
- brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE;
+ /*
+ * Only instruct exec_common to brand the process if necessary. It is
+ * possible that the init process is already properly branded due to the
+ * proc_exit -> restart_init -> exec_init call chain.
+ */
+ if (ZONE_IS_BRANDED(p->p_zone) &&
+ p->p_brand != p->p_zone->zone_brand) {
+ brand_action = EBA_BRAND;
+ }
 again:
 error = exec_common((const char *)(uintptr_t)exec_fnamep,
 (const char **)(uintptr_t)uap, NULL, brand_action);
diff --git a/usr/src/uts/common/os/mmapobj.c b/usr/src/uts/common/os/mmapobj.c
index 142c10754e..0410e6f47b 100644
--- a/usr/src/uts/common/os/mmapobj.c
+++ b/usr/src/uts/common/os/mmapobj.c
@@ -1381,10 +1381,15 @@ calc_loadable(Ehdr *ehdrp, caddr_t phdrbase, int nphdrs, size_t *len,
 }
 if (num_segs++ == 0) {
 /*
- * The p_vaddr of the first PT_LOAD segment
- * must either be NULL or within the first
- * page in order to be interpreted.
- * Otherwise, its an invalid file.
+ * While ELF doesn't specify the meaning of
+ * p_vaddr for PT_LOAD segments in ET_DYN
+ * objects, we mandate that it is either NULL or
+ * (to accommodate some historical binaries)
+ * within the first page. (Note that there
+ * exist non-native ET_DYN objects that violate
+ * this constraint that we nonetheless must be
+ * able to execute; see the ET_DYN handling in
+ * mapelfexec() for details.)
 */
 if (e_type == ET_DYN &&
 ((caddr_t)((uintptr_t)vaddr &
diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c
index b555bb82b7..eba6147fab 100644
--- a/usr/src/uts/common/os/pid.c
+++ b/usr/src/uts/common/os/pid.c
@@ -21,6 +21,7 @@
 /*
 * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
 */
 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -112,6 +113,18 @@ pid_lookup(pid_t pid)
 return (pidp);
 }
 
+struct pid *
+pid_find(pid_t pid)
+{
+ struct pid *pidp;
+
+ mutex_enter(&pidlinklock);
+ pidp = pid_lookup(pid);
+ mutex_exit(&pidlinklock);
+
+ return (pidp);
+}
+
 void
 pid_setmin(void)
 {
@@ -522,6 +535,20 @@ sprunlock(proc_t *p)
 THREAD_KPRI_RELEASE();
 }
 
+/*
+ * Undo effects of sprlock but without dropping p->p_lock
+ */
+void
+sprunprlock(proc_t *p)
+{
+ ASSERT(p->p_proc_flag & P_PR_LOCK);
+ ASSERT(MUTEX_HELD(&p->p_lock));
+
+ cv_signal(&pr_pid_cv[p->p_slot]);
+ p->p_proc_flag &= ~P_PR_LOCK;
+ THREAD_KPRI_RELEASE();
+}
+
 void
 pid_init(void)
 {
diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c
index d6821c83b0..8cc7f009a3 100644
--- a/usr/src/uts/common/os/policy.c
+++ b/usr/src/uts/common/os/policy.c
@@ -56,6 +56,7 @@
 #include <sys/mntent.h>
 #include <sys/contract_impl.h>
 #include <sys/dld_ioc.h>
+#include <sys/brand.h>
 
 /*
 * There are two possible layers of privilege routines and two possible
@@ -1244,6 +1245,22 @@ secpolicy_vnode_owner(const cred_t *cr, uid_t owner)
 void
 secpolicy_setid_clear(vattr_t *vap, cred_t *cr)
 {
+ proc_t *p = curproc;
+
+ /*
+ * Allow the brand to override this behaviour.
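+ *
+ * [Editor's aside] For instance, a brand that wants the setid bits
+ * preserved unconditionally might supply (hypothetical):
+ *
+ *	static int
+ *	mybrand_setid_clear(vattr_t *vap, cred_t *cr)
+ *	{
+ *		return (0);	/* handled: leave S_ISUID/S_ISGID alone */
+ *	}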
+ */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_setid_clear != NULL) { + /* + * This brand hook will return 0 if handling is complete, or + * some other value if the brand would like us to fall back to + * the usual behaviour. + */ + if (BROP(p)->b_setid_clear(vap, cr) == 0) { + return; + } + } + if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(cr, (vap->va_mode & S_ISUID) != 0 && @@ -2092,6 +2109,13 @@ secpolicy_meminfo(const cred_t *cr) } int +secpolicy_fs_import(const cred_t *cr) +{ + return (PRIV_POLICY(cr, PRIV_SYS_FS_IMPORT, B_FALSE, EPERM, NULL)); +} + + +int secpolicy_pfexec_register(const cred_t *cr) { return (PRIV_POLICY(cr, PRIV_SYS_ADMIN, B_TRUE, EPERM, NULL)); @@ -2607,3 +2631,11 @@ secpolicy_ppp_config(const cred_t *cr) return (secpolicy_net_config(cr, B_FALSE)); return (PRIV_POLICY(cr, PRIV_SYS_PPP_CONFIG, B_FALSE, EPERM, NULL)); } + +int +secpolicy_hyprlofs_control(const cred_t *cr) +{ + if (PRIV_POLICY(cr, PRIV_HYPRLOFS_CONTROL, B_FALSE, EPERM, NULL)) + return (EPERM); + return (0); +} diff --git a/usr/src/uts/common/os/priv_defs b/usr/src/uts/common/os/priv_defs index bc1787c9ca..854fb602da 100644 --- a/usr/src/uts/common/os/priv_defs +++ b/usr/src/uts/common/os/priv_defs @@ -177,6 +177,10 @@ privilege PRIV_GRAPHICS_MAP Allows a process to perform privileged mappings through a graphics device. +privilege PRIV_HYPRLOFS_CONTROL + + Allows a process to manage hyprlofs entries. + privilege PRIV_IPC_DAC_READ Allows a process to read a System V IPC @@ -377,6 +381,10 @@ privilege PRIV_SYS_DEVICES Allows a process to open the real console device directly. Allows a process to open devices that have been exclusively opened. +privilege PRIV_SYS_FS_IMPORT + + Allows a process to import a potentially untrusted file system. + privilege PRIV_SYS_IPC_CONFIG Allows a process to increase the size of a System V IPC Message diff --git a/usr/src/uts/common/os/rctl.c b/usr/src/uts/common/os/rctl.c index 09b80323d5..e0a1126567 100644 --- a/usr/src/uts/common/os/rctl.c +++ b/usr/src/uts/common/os/rctl.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017, Joyent, Inc. */ #include <sys/atomic.h> @@ -194,6 +195,8 @@ id_space_t *rctl_ids; kmem_cache_t *rctl_cache; /* kmem cache for rctl structures */ kmem_cache_t *rctl_val_cache; /* kmem cache for rctl values */ +extern rctl_hndl_t rc_process_maxlockedmem; + kmutex_t rctl_lists_lock; rctl_dict_entry_t *rctl_lists[RC_MAX_ENTITY + 1]; @@ -2872,12 +2875,12 @@ rctl_init(void) * rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, * int chargeproc) * - * Increments the amount of locked memory on a project, and - * zone. If proj is non-NULL the project must be held by the - * caller; if it is NULL the proj and zone of proc_t p are used. - * If chargeproc is non-zero, then the charged amount is cached - * on p->p_locked_mem so that the charge can be migrated when a - * process changes projects. + * Increments the amount of locked memory on a process, project, and + * zone. If 'proj' is non-NULL, the project must be held by the + * caller; if it is NULL, the project and zone of process 'p' are used. + * If 'chargeproc' is non-zero, then the charged amount is added + * to p->p_locked_mem. This is also used so that the charge can be + * migrated when a process changes projects. 
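 *
 * [Editor's aside] An illustrative caller (hypothetical, not part of
 * this change), charging a process before locking pages on its behalf:
 *
 *	mutex_enter(&p->p_lock);
 *	error = rctl_incr_locked_mem(p, NULL, size, 1);
 *	mutex_exit(&p->p_lock);
 *	if (error != 0)
 *		return (error);	/* EAGAIN: a limit would be exceeded */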
* * Return values * 0 - success @@ -2895,6 +2898,7 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, ASSERT(p != NULL); ASSERT(MUTEX_HELD(&p->p_lock)); + if (proj != NULL) { projp = proj; zonep = proj->kpj_zone; @@ -2938,11 +2942,23 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, } } - zonep->zone_locked_mem += inc; - projp->kpj_data.kpd_locked_mem += inc; if (chargeproc != 0) { + /* Check for overflow */ + if ((p->p_locked_mem + inc) < p->p_locked_mem) { + ret = EAGAIN; + goto out; + } + if (rctl_test_entity(rc_process_maxlockedmem, p->p_rctls, p, + &e, inc, 0) & RCT_DENY) { + ret = EAGAIN; + goto out; + } + p->p_locked_mem += inc; } + + zonep->zone_locked_mem += inc; + projp->kpj_data.kpd_locked_mem += inc; out: mutex_exit(&zonep->zone_mem_lock); return (ret); diff --git a/usr/src/uts/common/os/rctl_proc.c b/usr/src/uts/common/os/rctl_proc.c index 9b7324fe7b..c62540d2b4 100644 --- a/usr/src/uts/common/os/rctl_proc.c +++ b/usr/src/uts/common/os/rctl_proc.c @@ -21,6 +21,7 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. */ #include <sys/types.h> @@ -32,6 +33,7 @@ #include <sys/port_kernel.h> #include <sys/signal.h> #include <sys/var.h> +#include <sys/policy.h> #include <sys/vmparam.h> #include <sys/machparam.h> @@ -66,6 +68,7 @@ rctl_hndl_t rc_process_semmsl; rctl_hndl_t rc_process_semopm; rctl_hndl_t rc_process_portev; rctl_hndl_t rc_process_sigqueue; +rctl_hndl_t rc_process_maxlockedmem; /* * process.max-cpu-time / RLIMIT_CPU @@ -212,6 +215,26 @@ static rctl_ops_t proc_vmem_ops = { }; /* + * process.max-locked-memory + */ +/*ARGSUSED*/ +static int +proc_maxlockedmem_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e, + struct rctl_val *rv, rctl_qty_t i, uint_t f) +{ + if (secpolicy_lock_memory(CRED()) == 0) + return (0); + return ((p->p_locked_mem + i) > rv->rcv_value); +} + +static rctl_ops_t proc_maxlockedmem_ops = { + rcop_no_action, + rcop_no_usage, + rcop_no_set, + proc_maxlockedmem_test +}; + +/* * void rctlproc_default_init() * * Overview @@ -383,6 +406,11 @@ rctlproc_init(void) rctl_add_default_limit("process.max-sigqueue-size", _SIGQUEUE_SIZE_PRIVILEGED, RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY); + rc_process_maxlockedmem = rctl_register("process.max-locked-memory", + RCENTITY_PROCESS, RCTL_GLOBAL_LOWERABLE | RCTL_GLOBAL_DENY_ALWAYS | + RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_BYTES, + ULONG_MAX, UINT32_MAX, &proc_maxlockedmem_ops); + /* * Place minimal set of controls on "sched" process for inheritance by * processes created via newproc(). diff --git a/usr/src/uts/common/os/sched.c b/usr/src/uts/common/os/sched.c index c1d6569f11..15e77d39f7 100644 --- a/usr/src/uts/common/os/sched.c +++ b/usr/src/uts/common/os/sched.c @@ -27,6 +27,10 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ +/* + * Copyright (c) 2015, Joyent, Inc. All rights reserved. + */ + #include <sys/param.h> #include <sys/types.h> #include <sys/sysmacros.h> @@ -646,16 +650,17 @@ top: klwp_t *lwp = ttolwp(tp); /* - * Swapout eligible lwps (specified by the scheduling - * class) which don't have TS_DONT_SWAP set. Set the - * "intent to swap" flag (TS_SWAPENQ) on threads - * which have TS_DONT_SWAP set so that they can be + * Swapout eligible lwps (specified by the scheduling class) + * which don't have TS_DONT_SWAP set. 
Set the "intent to swap" + * flag (TS_SWAPENQ) on threads which have either TS_DONT_SWAP + * set or are currently on a split stack so that they can be * swapped if and when they reach a safe point. */ thread_lock(tp); thread_pri = CL_SWAPOUT(tp, swapflags); if (thread_pri != -1) { - if (tp->t_schedflag & TS_DONT_SWAP) { + if ((tp->t_schedflag & TS_DONT_SWAP) || + (tp->t_flag & T_SPLITSTK)) { tp->t_schedflag |= TS_SWAPENQ; tp->t_trapret = 1; aston(tp); diff --git a/usr/src/uts/common/os/schedctl.c b/usr/src/uts/common/os/schedctl.c index 5721083751..18b396a765 100644 --- a/usr/src/uts/common/os/schedctl.c +++ b/usr/src/uts/common/os/schedctl.c @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -327,12 +328,17 @@ schedctl_sigblock(kthread_t *t) /* - * If the sc_sigblock field is set for the specified thread, set - * its signal mask to block all maskable signals, then clear the - * sc_sigblock field. This finishes what user-level code requested - * to be done when it set tdp->sc_shared->sc_sigblock non-zero. - * Called from signal-related code either by the current thread for - * itself or by a thread that holds the process's p_lock (/proc code). + * If the sc_sigblock field is set for the specified thread, set its signal + * mask to block all maskable signals, then clear the sc_sigblock field. This + * accomplishes what user-level code requested to be done when it set + * tdp->sc_shared->sc_sigblock non-zero. + * + * This is generally called by signal-related code in the current thread. In + * order to call against a thread other than curthread, p_lock for the + * containing process must be held. Even then, the caller is not protected + * from races with the thread in question updating its own fields. It is the + * responsibility of the caller to perform additional synchronization. + * */ void schedctl_finish_sigblock(kthread_t *t) diff --git a/usr/src/uts/common/os/shm.c b/usr/src/uts/common/os/shm.c index bacc595f78..5deae96d73 100644 --- a/usr/src/uts/common/os/shm.c +++ b/usr/src/uts/common/os/shm.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -319,6 +320,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) size_t share_size; struct shm_data ssd; uintptr_t align_hint; + long curprot; /* * Pick a share pagesize to use, if (!isspt(sp)). @@ -453,6 +455,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) } } + curprot = sp->shm_opts & SHM_PROT_MASK; if (!isspt(sp)) { error = sptcreate(size, &segspt, sp->shm_amp, prot, flags, share_szc); @@ -462,8 +465,8 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) } sp->shm_sptinfo->sptas = segspt->s_as; sp->shm_sptseg = segspt; - sp->shm_sptprot = prot; - } else if ((prot & sp->shm_sptprot) != sp->shm_sptprot) { + sp->shm_opts = (sp->shm_opts & ~SHM_PROT_MASK) | prot; + } else if ((prot & curprot) != curprot) { /* * Ensure we're attaching to an ISM segment with * fewer or equal permissions than what we're @@ -748,6 +751,23 @@ shmctl(int shmid, int cmd, void *arg) } break; + /* Stage segment for removal, but don't remove until last detach */ + case SHM_RMID: + if ((error = secpolicy_ipc_owner(cr, (kipc_perm_t *)sp)) != 0) + break; + + /* + * If attached, just mark it as a pending remove, otherwise + * we must perform the normal ipc_rmid now. 
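+ *
+ * [Editor's aside] From a caller's perspective (illustrative; assumes
+ * a segment currently attached at addr):
+ *
+ *	(void) shmctl(id, SHM_RMID, NULL);	/* staged, not yet removed */
+ *	...
+ *	(void) shmdt(addr);	/* last detach performs the real IPC_RMID */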
+ */ + if ((sp->shm_perm.ipc_ref - 1) > 0) { + sp->shm_opts |= SHM_RM_PENDING; + } else { + mutex_exit(lock); + return (ipc_rmid(shm_svc, shmid, cr)); + } + break; + default: error = EINVAL; break; @@ -778,6 +798,23 @@ shm_detach(proc_t *pp, segacct_t *sap) sp->shm_ismattch--; sp->shm_dtime = gethrestime_sec(); sp->shm_lpid = pp->p_pid; + if ((sp->shm_opts & SHM_RM_PENDING) != 0 && + sp->shm_perm.ipc_ref == 2) { + /* + * If this is the last detach of the segment across the whole + * system then now we can perform the delayed IPC_RMID. + * The ipc_ref count has 1 for the original 'get' and one for + * each 'attach' (see 'stat' handling in shmctl). + */ + sp->shm_opts &= ~SHM_RM_PENDING; + mutex_enter(&shm_svc->ipcs_lock); + ipc_rmsvc(shm_svc, (kipc_perm_t *)sp); /* Drops lock */ + ASSERT(!MUTEX_HELD(&shm_svc->ipcs_lock)); + ASSERT(((kipc_perm_t *)sp)->ipc_ref > 0); + + /* Lock was dropped, need to retake it for following rele. */ + (void) ipc_lock(shm_svc, sp->shm_perm.ipc_id); + } ipc_rele(shm_svc, (kipc_perm_t *)sp); /* Drops lock */ kmem_free(sap, sizeof (segacct_t)); diff --git a/usr/src/uts/common/os/sig.c b/usr/src/uts/common/os/sig.c index 453b1f22d4..5ef12f3ae4 100644 --- a/usr/src/uts/common/os/sig.c +++ b/usr/src/uts/common/os/sig.c @@ -22,7 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -60,6 +60,7 @@ #include <sys/cyclic.h> #include <sys/dtrace.h> #include <sys/sdt.h> +#include <sys/brand.h> #include <sys/signalfd.h> const k_sigset_t nullsmask = {0, 0, 0}; @@ -148,6 +149,21 @@ signal_is_blocked(kthread_t *t, int sig) } /* + * Return true if the signal can safely be ignored. + * That is, if the signal is included in the p_ignore mask and doing so is not + * forbidden by any process branding. + */ +static int +sig_ignorable(proc_t *p, klwp_t *lwp, int sig) +{ + return (sigismember(&p->p_ignore, sig) && /* sig in ignore mask */ + !(PROC_IS_BRANDED(p) && /* allowed by brand */ + BROP(p)->b_sig_ignorable != NULL && + BROP(p)->b_sig_ignorable(p, lwp, sig) == B_FALSE)); + +} + +/* * Return true if the signal can safely be discarded on generation. * That is, if there is no need for the signal on the receiving end. * The answer is true if the process is a zombie or @@ -159,12 +175,13 @@ signal_is_blocked(kthread_t *t, int sig) * the signal is not being accepted via sigwait() */ static int -sig_discardable(proc_t *p, int sig) +sig_discardable(proc_t *p, kthread_t *tp, int sig) { kthread_t *t = p->p_tlist; + klwp_t *lwp = (tp == NULL) ? NULL : tp->t_lwp; return (t == NULL || /* if zombie or ... 
*/ - (sigismember(&p->p_ignore, sig) && /* signal is ignored */ + (sig_ignorable(p, lwp, sig) && /* signal is ignored */ t->t_forw == t && /* and single-threaded */ !tracing(p, sig) && /* and no /proc tracing */ !signal_is_blocked(t, sig) && /* and signal not blocked */ @@ -200,7 +217,7 @@ eat_signal(kthread_t *t, int sig) !(ttoproc(t)->p_proc_flag & P_PR_LOCK)) { ttoproc(t)->p_stopsig = 0; t->t_dtrace_stop = 0; - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; setrun_locked(t); } else if (t != curthread && t->t_state == TS_ONPROC) { aston(t); /* make it do issig promptly */ @@ -297,7 +314,7 @@ sigtoproc(proc_t *p, kthread_t *t, int sig) } } - if (sig_discardable(p, sig)) { + if (sig_discardable(p, t, sig)) { DTRACE_PROC3(signal__discard, kthread_t *, p->p_tlist, proc_t *, p, int, sig); return; @@ -497,7 +514,7 @@ issig_justlooking(void) if (sigismember(&set, sig) && (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig))) { + !sig_ignorable(p, lwp, sig))) { /* * Don't promote a signal that will stop * the process when lwp_nostop is set. @@ -623,6 +640,21 @@ issig_forreal(void) } /* + * Allow the brand the chance to alter (or suppress) delivery + * of this signal. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_issig_stop != NULL) { + /* + * The brand hook will return 0 if it would like + * us to drive on, or -1 if we should restart + * the loop to check other conditions. + */ + if (BROP(p)->b_issig_stop(p, lwp) != 0) { + continue; + } + } + + /* * Honor requested stop before dealing with the * current signal; a debugger may change it. * Do not want to go back to loop here since this is a special @@ -656,7 +688,7 @@ issig_forreal(void) lwp->lwp_cursig = 0; lwp->lwp_extsig = 0; if (sigismember(&t->t_sigwait, sig) || - (!sigismember(&p->p_ignore, sig) && + (!sig_ignorable(p, lwp, sig) && !isjobstop(sig))) { if (p->p_flag & (SEXITLWPS|SKILLED)) { sig = SIGKILL; @@ -708,7 +740,7 @@ issig_forreal(void) toproc = 0; if (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig)) { + !sig_ignorable(p, lwp, sig)) { if (sigismember(&t->t_extsig, sig)) ext = 1; break; @@ -722,7 +754,7 @@ issig_forreal(void) toproc = 1; if (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig)) { + !sig_ignorable(p, lwp, sig)) { if (sigismember(&p->p_extsig, sig)) ext = 1; break; @@ -954,6 +986,16 @@ stop(int why, int what) } break; + case PR_BRAND: + /* + * We have been stopped by the brand code for a brand-private + * reason. This is an asynchronous stop affecting only this + * LWP. + */ + VERIFY(PROC_IS_BRANDED(p)); + flags &= ~TS_BSTART; + break; + default: /* /proc stop */ flags &= ~TS_PSTART; /* @@ -1065,7 +1107,7 @@ stop(int why, int what) } } - if (why != PR_JOBCONTROL && why != PR_CHECKPOINT) { + if (why != PR_JOBCONTROL && why != PR_CHECKPOINT && why != PR_BRAND) { /* * Do process-level notification when all lwps are * either stopped on events of interest to /proc @@ -1171,6 +1213,13 @@ stop(int why, int what) if (why == PR_CHECKPOINT) del_one_utstop(); + /* + * Allow the brand to post notification of this stop condition. 
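+ *
+ * [Editor's aside] The hook has the shape sketched below (hypothetical
+ * brand; argument types follow this call site):
+ *
+ *	static void
+ *	mybrand_stop_notify(proc_t *p, klwp_t *lwp, int why, int what)
+ *	{
+ *		/* record or forward the stop reason; runs in stop() */
+ *	}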
+ */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_stop_notify != NULL) { + BROP(p)->b_stop_notify(p, lwp, why, what); + } + thread_lock(t); ASSERT((t->t_schedflag & TS_ALLSTART) == 0); t->t_schedflag |= flags; @@ -1192,7 +1241,7 @@ stop(int why, int what) (p->p_flag & (SEXITLWPS|SKILLED))) { p->p_stopsig = 0; thread_lock(t); - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; setrun_locked(t); thread_unlock_nopreempt(t); } else if (why == PR_JOBCONTROL) { @@ -1327,7 +1376,7 @@ psig(void) * this signal from pending to current (we dropped p->p_lock). * This can happen only in a multi-threaded process. */ - if (sigismember(&p->p_ignore, sig) || + if (sig_ignorable(p, lwp, sig) || (func == SIG_DFL && sigismember(&stopdefault, sig))) { lwp->lwp_cursig = 0; lwp->lwp_extsig = 0; @@ -1771,9 +1820,12 @@ post_sigcld(proc_t *cp, sigqueue_t *sqp) /* * This can only happen when the parent is init. * (See call to sigcld(q, NULL) in exit().) - * Use KM_NOSLEEP to avoid deadlock. + * Use KM_NOSLEEP to avoid deadlock. The child procs + * initpid can be 1 for zlogin. */ - ASSERT(pp == proc_init); + ASSERT(pp->p_pidp->pid_id == + cp->p_zone->zone_proc_initpid || + pp->p_pidp->pid_id == 1); winfo(cp, &info, 0); sigaddq(pp, NULL, &info, KM_NOSLEEP); } else { @@ -1804,6 +1856,15 @@ sigcld_repost() sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); mutex_enter(&pidlock); + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_sigcld_repost != NULL) { + /* + * Allow the brand to inject synthetic SIGCLD signals. + */ + if (BROP(pp)->b_sigcld_repost(pp, sqp) == 0) { + mutex_exit(&pidlock); + return; + } + } for (cp = pp->p_child; cp; cp = cp->p_sibling) { if (cp->p_pidflag & CLDPEND) { post_sigcld(cp, sqp); @@ -2115,7 +2176,7 @@ sigaddqa(proc_t *p, kthread_t *t, sigqueue_t *sigqp) ASSERT(MUTEX_HELD(&p->p_lock)); ASSERT(sig >= 1 && sig < NSIG); - if (sig_discardable(p, sig)) + if (sig_discardable(p, t, sig)) siginfofree(sigqp); else sigaddqins(p, t, sigqp); @@ -2141,7 +2202,7 @@ sigaddq(proc_t *p, kthread_t *t, k_siginfo_t *infop, int km_flags) * blocking the signal (it *could* change it's mind while * the signal is pending) then don't bother creating one. */ - if (!sig_discardable(p, sig) && + if (!sig_discardable(p, t, sig) && (sigismember(&p->p_siginfo, sig) || (curproc->p_ct_process != p->p_ct_process) || (sig == SIGCLD && SI_FROMKERNEL(infop))) && diff --git a/usr/src/uts/common/os/smb_subr.c b/usr/src/uts/common/os/smb_subr.c index 6084676b17..6dc7230bed 100644 --- a/usr/src/uts/common/os/smb_subr.c +++ b/usr/src/uts/common/os/smb_subr.c @@ -25,7 +25,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2015 Joyent, Inc. All rights reserved. + */ #include <sys/smbios_impl.h> #include <sys/cmn_err.h> @@ -43,13 +45,13 @@ smb_strerror(int err) void * smb_alloc(size_t len) { - return (kmem_alloc(len, KM_SLEEP)); + return (len > 0 ? kmem_alloc(len, KM_SLEEP) : NULL); } void * smb_zalloc(size_t len) { - return (kmem_zalloc(len, KM_SLEEP)); + return (len > 0 ? kmem_zalloc(len, KM_SLEEP) : NULL); } void diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c index 62f94729cf..21ec25b5b3 100644 --- a/usr/src/uts/common/os/streamio.c +++ b/usr/src/uts/common/os/streamio.c @@ -24,7 +24,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2017 Joyent, Inc. 
*/ #include <sys/types.h> @@ -77,6 +77,7 @@ #include <sys/policy.h> #include <sys/dld.h> #include <sys/zone.h> +#include <sys/limits.h> #include <c2/audit.h> /* @@ -985,12 +986,20 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, * (registered in sd_wakeq). */ struiod_t uiod; + struct iovec buf[IOV_MAX_STACK]; + int iovlen = 0; if (first) stp->sd_wakeq &= ~RSLEEP; - (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, - sizeof (uiod.d_iov) / sizeof (*uiod.d_iov)); + if (uiop->uio_iovcnt > IOV_MAX_STACK) { + iovlen = uiop->uio_iovcnt * sizeof (iovec_t); + uiod.d_iov = kmem_alloc(iovlen, KM_SLEEP); + } else { + uiod.d_iov = buf; + } + + (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt); uiod.d_mp = 0; /* * Mark that a thread is in rwnext on the read side @@ -1029,6 +1038,8 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, if ((bp = uiod.d_mp) != NULL) { *errorp = 0; ASSERT(MUTEX_HELD(&stp->sd_lock)); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (bp); } error = 0; @@ -1048,8 +1059,14 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, } else { *errorp = error; ASSERT(MUTEX_HELD(&stp->sd_lock)); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (NULL); } + + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); + /* * Try a getq in case a rwnext() generated mblk * has bubbled up via strrput(). @@ -2544,6 +2561,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, int b_flag, int pri, int flags) { struiod_t uiod; + struct iovec buf[IOV_MAX_STACK]; + int iovlen = 0; mblk_t *mp; queue_t *wqp = stp->sd_wrq; int error = 0; @@ -2635,13 +2654,21 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, mp->b_flag |= b_flag; mp->b_band = (uchar_t)pri; - (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, - sizeof (uiod.d_iov) / sizeof (*uiod.d_iov)); + if (uiop->uio_iovcnt > IOV_MAX_STACK) { + iovlen = uiop->uio_iovcnt * sizeof (iovec_t); + uiod.d_iov = (struct iovec *)kmem_alloc(iovlen, KM_SLEEP); + } else { + uiod.d_iov = buf; + } + + (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt); uiod.d_uio.uio_offset = 0; uiod.d_mp = mp; error = rwnext(wqp, &uiod); if (! 
uiod.d_mp) { uioskip(uiop, *iosize); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (error); } ASSERT(mp == uiod.d_mp); @@ -2659,17 +2686,23 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, error = 0; } else { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (error); } /* Have to check canput before consuming data from the uio */ if (pri == 0) { if (!canputnext(wqp) && !(flags & MSG_IGNFLOW)) { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (EWOULDBLOCK); } } else { if (!bcanputnext(wqp, pri) && !(flags & MSG_IGNFLOW)) { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (EWOULDBLOCK); } } @@ -2677,6 +2710,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, /* Copyin data from the uio */ if ((error = struioget(wqp, mp, &uiod, 0)) != 0) { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (error); } uioskip(uiop, *iosize); @@ -2693,6 +2728,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, putnext(wqp, mp); stream_runservice(stp); } + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (0); } @@ -3178,6 +3215,7 @@ job_control_type(int cmd) case JAGENT: /* Obsolete */ case JTRUN: /* Obsolete */ case JXTPROTO: /* Obsolete */ + case TIOCSETLD: return (JCSETP); } @@ -8174,12 +8212,8 @@ out: * an M_PROTO/M_PCPROTO part). */ int -strpoll( - struct stdata *stp, - short events_arg, - int anyyet, - short *reventsp, - struct pollhead **phpp) +strpoll(struct stdata *stp, short events_arg, int anyyet, short *reventsp, + struct pollhead **phpp) { int events = (ushort_t)events_arg; int retevents = 0; @@ -8316,8 +8350,7 @@ chkrd: retevents |= (events & (POLLIN | POLLRDBAND)); break; } - if (! (retevents & normevents) && - (stp->sd_wakeq & RSLEEP)) { + if (!(retevents & normevents) && (stp->sd_wakeq & RSLEEP)) { /* * Sync stream barrier read queue has data. */ @@ -8328,19 +8361,11 @@ chkrd: retevents |= normevents; } - *reventsp = (short)retevents; - if (retevents && !(events & POLLET)) { - if (headlocked) - mutex_exit(&stp->sd_lock); - return (0); - } - /* - * If poll() has not found any events yet, set up event cell - * to wake up the poll if a requested event occurs on this - * stream. Check for collisions with outstanding poll requests. + * Pass back a pollhead if no events are pending or if edge-triggering + * has been configured on this resource. */ - if (!anyyet) { + if ((retevents == 0 && !anyyet) || (events & POLLET)) { *phpp = &stp->sd_pollist; if (headlocked == 0) { if (polllock(&stp->sd_pollist, &stp->sd_lock) != 0) { @@ -8351,6 +8376,8 @@ chkrd: } stp->sd_rput_opt |= SR_POLLIN; } + + *reventsp = (short)retevents; if (headlocked) mutex_exit(&stp->sd_lock); return (0); diff --git a/usr/src/uts/common/os/sunddi.c b/usr/src/uts/common/os/sunddi.c index ede7da413b..b1727729de 100644 --- a/usr/src/uts/common/os/sunddi.c +++ b/usr/src/uts/common/os/sunddi.c @@ -5903,6 +5903,12 @@ ddi_ffs(long mask) return (ffs(mask)); } +int +ddi_ffsll(long long mask) +{ + return (ffs(mask)); +} + /* * Find last bit set. 
Take mask and clear * all but the most significant bit, and @@ -5914,8 +5920,14 @@ ddi_ffs(long mask) int ddi_fls(long mask) { + return (ddi_flsll(mask)); +} + +int +ddi_flsll(long long mask) +{ while (mask) { - long nx; + long long nx; if ((nx = (mask & (mask - 1))) == 0) break; diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c index b3861dec03..4dd48c7e19 100644 --- a/usr/src/uts/common/os/sysent.c +++ b/usr/src/uts/common/os/sysent.c @@ -23,7 +23,7 @@ * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2012 Milan Jurik. All rights reserved. * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright (c) 2015, Joyent, Inc. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -61,8 +61,7 @@ struct mmaplf32a; int access(char *, int); int alarm(int); int auditsys(struct auditcalls *, rval_t *); -int64_t brandsys(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, - uintptr_t); +int64_t brandsys(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t); intptr_t brk(caddr_t); int chdir(char *); int chmod(char *, int); @@ -647,7 +646,7 @@ struct sysent sysent[NSYSCALL] = SYSENT_NOSYS(), SYSENT_C("llseek", llseek32, 4)), /* 176 */ SYSENT_LOADABLE(), /* inst_sync */ - /* 177 */ SYSENT_CI("brandsys", brandsys, 6), + /* 177 */ SYSENT_CI("brandsys", brandsys, 5), /* 178 */ SYSENT_LOADABLE(), /* kaio */ /* 179 */ SYSENT_LOADABLE(), /* cpc */ /* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3), @@ -1002,7 +1001,7 @@ struct sysent sysent32[NSYSCALL] = /* 174 */ SYSENT_CI("pwrite", pwrite32, 4), /* 175 */ SYSENT_C("llseek", llseek32, 4), /* 176 */ SYSENT_LOADABLE32(), /* inst_sync */ - /* 177 */ SYSENT_CI("brandsys", brandsys, 6), + /* 177 */ SYSENT_CI("brandsys", brandsys, 5), /* 178 */ SYSENT_LOADABLE32(), /* kaio */ /* 179 */ SYSENT_LOADABLE32(), /* cpc */ /* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3), @@ -1094,18 +1093,20 @@ char **syscallnames; systrace_sysent_t *systrace_sysent; void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); /*ARGSUSED*/ void systrace_stub(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, - uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, + uintptr_t arg6, uintptr_t arg7) {} /*ARGSUSED*/ int64_t dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, + uintptr_t arg7) { systrace_sysent_t *sy = &systrace_sysent[curthread->t_sysnum]; dtrace_id_t id; @@ -1113,7 +1114,8 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, proc_t *p; if ((id = sy->stsy_entry) != DTRACE_IDNONE) - (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5); + (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, + arg6, arg7); /* * We want to explicitly allow DTrace consumers to stop a process @@ -1127,14 +1129,15 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, } mutex_exit(&p->p_lock); - rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); + rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, + arg6, arg7); if (ttolwp(curthread)->lwp_errno != 0) rval = -1; if ((id = sy->stsy_return) != DTRACE_IDNONE) (*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, - 
(uintptr_t)((int64_t)rval >> 32), 0, 0, 0); + (uintptr_t)((int64_t)rval >> 32), 0, 0, 0, 0, 0); return (rval); } @@ -1146,7 +1149,8 @@ systrace_sysent_t *systrace_sysent32; /*ARGSUSED*/ int64_t dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, + uintptr_t arg7) { systrace_sysent_t *sy = &systrace_sysent32[curthread->t_sysnum]; dtrace_id_t id; @@ -1154,7 +1158,8 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, proc_t *p; if ((id = sy->stsy_entry) != DTRACE_IDNONE) - (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5); + (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, arg6, + arg7); /* * We want to explicitly allow DTrace consumers to stop a process @@ -1168,14 +1173,15 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, } mutex_exit(&p->p_lock); - rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); + rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, arg6, + arg7); if (ttolwp(curthread)->lwp_errno != 0) rval = -1; if ((id = sy->stsy_return) != DTRACE_IDNONE) (*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, - (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0); + (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0, 0, 0); return (rval); } @@ -1203,5 +1209,5 @@ dtrace_systrace_rtt(void) } if ((id = sy->stsy_return) != DTRACE_IDNONE) - (*systrace_probe)(id, 0, 0, 0, 0, 0, 0); + (*systrace_probe)(id, 0, 0, 0, 0, 0, 0, 0, 0); } diff --git a/usr/src/uts/common/os/timer.c b/usr/src/uts/common/os/timer.c index b25a6cbcf1..5453ebf380 100644 --- a/usr/src/uts/common/os/timer.c +++ b/usr/src/uts/common/os/timer.c @@ -25,11 +25,12 @@ */ /* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright 2017 Joyent, Inc. */ #include <sys/timer.h> #include <sys/systm.h> +#include <sys/sysmacros.h> #include <sys/param.h> #include <sys/kmem.h> #include <sys/debug.h> @@ -81,6 +82,7 @@ timer_lock(proc_t *p, itimer_t *it) * waiters. p_lock must be held on entry; it will not be dropped by * timer_unlock(). */ +/* ARGSUSED */ static void timer_unlock(proc_t *p, itimer_t *it) { @@ -123,6 +125,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it) timer_lock(p, it); } + ASSERT(p->p_itimer_sz > tid); ASSERT(p->p_itimer[tid] == it); p->p_itimer[tid] = NULL; @@ -137,7 +140,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it) it->it_backend->clk_timer_delete(it); - if (it->it_portev) { + if (it->it_flags & IT_PORT) { mutex_enter(&it->it_mutex); if (it->it_portev) { port_kevent_t *pev; @@ -199,18 +202,20 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it) static itimer_t * timer_grab(proc_t *p, timer_t tid) { - itimer_t **itp, *it; + itimer_t *it; - if (tid >= timer_max || tid < 0) + if (tid < 0) { return (NULL); + } mutex_enter(&p->p_lock); - - if ((itp = p->p_itimer) == NULL || (it = itp[tid]) == NULL) { + if (p->p_itimer == NULL || tid >= p->p_itimer_sz || + (it = p->p_itimer[tid]) == NULL) { mutex_exit(&p->p_lock); return (NULL); } + /* This may drop p_lock temporarily. */ timer_lock(p, it); if (it->it_lock & ITLK_REMOVE) { @@ -232,7 +237,7 @@ timer_grab(proc_t *p, timer_t tid) * should not be held on entry; timer_release() will acquire p_lock but * will drop it before returning. 
*/ -static void +void timer_release(proc_t *p, itimer_t *it) { mutex_enter(&p->p_lock); @@ -245,7 +250,7 @@ timer_release(proc_t *p, itimer_t *it) * p_lock should not be held on entry; timer_delete_grabbed() will acquire * p_lock, but will drop it before returning. */ -static void +void timer_delete_grabbed(proc_t *p, timer_t tid, itimer_t *it) { mutex_enter(&p->p_lock); @@ -258,6 +263,13 @@ clock_timer_init() { clock_timer_cache = kmem_cache_create("timer_cache", sizeof (itimer_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + + /* + * Push the timer_max limit up to at least 4 * NCPU. Due to the way + * NCPU is defined, proper initialization of the timer limit is + * performed at runtime. + */ + timer_max = MAX(NCPU * 4, timer_max); } void @@ -453,6 +465,9 @@ timer_fire(itimer_t *it) it->it_pending = 1; port_send_event((port_kevent_t *)it->it_portev); mutex_exit(&it->it_mutex); + } else if (it->it_flags & IT_CALLBACK) { + it->it_cb_func(it); + ASSERT(MUTEX_NOT_HELD(&it->it_mutex)); } else if (it->it_flags & IT_SIGNAL) { it->it_pending = 1; mutex_exit(&it->it_mutex); @@ -466,159 +481,175 @@ timer_fire(itimer_t *it) mutex_exit(&p->p_lock); } -int -timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) +/* + * Allocate an itimer_t and find an appropriate slot for it in p_itimer. + * Acquires p_lock and holds it on return, regardless of success. + */ +static itimer_t * +timer_alloc(proc_t *p, timer_t *id) { - struct sigevent ev; - proc_t *p = curproc; - clock_backend_t *backend; - itimer_t *it, **itp; - sigqueue_t *sigq; - cred_t *cr = CRED(); - int error = 0; - timer_t i; - port_notify_t tim_pnevp; - port_kevent_t *pkevp = NULL; + itimer_t *it, **itp = NULL; + uint_t i; - if ((backend = CLOCK_BACKEND(clock)) == NULL) - return (set_errno(EINVAL)); + ASSERT(MUTEX_NOT_HELD(&p->p_lock)); - if (evp != NULL) { - /* - * short copyin() for binary compatibility - * fetch oldsigevent to determine how much to copy in. - */ - if (get_udatamodel() == DATAMODEL_NATIVE) { - if (copyin(evp, &ev, sizeof (struct oldsigevent))) - return (set_errno(EFAULT)); + it = kmem_cache_alloc(clock_timer_cache, KM_SLEEP); + bzero(it, sizeof (itimer_t)); + mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL); - if (ev.sigev_notify == SIGEV_PORT || - ev.sigev_notify == SIGEV_THREAD) { - if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp, - sizeof (port_notify_t))) - return (set_errno(EFAULT)); + mutex_enter(&p->p_lock); +retry: + if (p->p_itimer != NULL) { + for (i = 0; i < p->p_itimer_sz; i++) { + if (p->p_itimer[i] == NULL) { + itp = &(p->p_itimer[i]); + break; } -#ifdef _SYSCALL32_IMPL - } else { - struct sigevent32 ev32; - port_notify32_t tim_pnevp32; + } + } - if (copyin(evp, &ev32, sizeof (struct oldsigevent32))) - return (set_errno(EFAULT)); - ev.sigev_notify = ev32.sigev_notify; - ev.sigev_signo = ev32.sigev_signo; + /* + * A suitable slot was not found. If possible, allocate (or resize) + * the p_itimer array and try again.
+ */ + if (itp == NULL) { + uint_t target_sz = _TIMER_ALLOC_INIT; + itimer_t **itp_new; + + if (p->p_itimer != NULL) { + ASSERT(p->p_itimer_sz != 0); + + target_sz = p->p_itimer_sz * 2; + } + /* + * Protect against exceeding the max or overflow + */ + if (target_sz > timer_max || target_sz > INT_MAX || + target_sz < p->p_itimer_sz) { + kmem_cache_free(clock_timer_cache, it); + return (NULL); + } + mutex_exit(&p->p_lock); + itp_new = kmem_zalloc(target_sz * sizeof (itimer_t *), + KM_SLEEP); + mutex_enter(&p->p_lock); + if (target_sz <= p->p_itimer_sz) { /* - * See comment in sigqueue32() on handling of 32-bit - * sigvals in a 64-bit kernel. + * A racing thread performed the resize while we were + * waiting outside p_lock. Discard our now-useless + * allocation and retry. */ - ev.sigev_value.sival_int = ev32.sigev_value.sival_int; - if (ev.sigev_notify == SIGEV_PORT || - ev.sigev_notify == SIGEV_THREAD) { - if (copyin((void *)(uintptr_t) - ev32.sigev_value.sival_ptr, - (void *)&tim_pnevp32, - sizeof (port_notify32_t))) - return (set_errno(EFAULT)); - tim_pnevp.portnfy_port = - tim_pnevp32.portnfy_port; - tim_pnevp.portnfy_user = - (void *)(uintptr_t)tim_pnevp32.portnfy_user; + kmem_free(itp_new, target_sz * sizeof (itimer_t *)); + goto retry; + } else { + /* + * Instantiate the larger allocation and select the + * first fresh entry for use. + */ + if (p->p_itimer != NULL) { + uint_t old_sz; + + old_sz = p->p_itimer_sz; + bcopy(p->p_itimer, itp_new, + old_sz * sizeof (itimer_t *)); + kmem_free(p->p_itimer, + old_sz * sizeof (itimer_t *)); + + /* + * Short circuit to use the first free entry in + * the new allocation. It's possible that + * other lower-indexed timers were freed while + * p_lock was dropped, but skipping over them + * is not harmful at all. In the common case, + * we skip the need to walk over an array + * filled with timers before arriving at the + * slot we know is fresh from the allocation. + */ + i = old_sz; + } else { + /* + * For processes lacking any existing timers, + * we can simply select the first entry. + */ + i = 0; } -#endif + p->p_itimer = itp_new; + p->p_itimer_sz = target_sz; } - switch (ev.sigev_notify) { - case SIGEV_NONE: - break; - case SIGEV_SIGNAL: - if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG) - return (set_errno(EINVAL)); - break; - case SIGEV_THREAD: - case SIGEV_PORT: - break; - default: - return (set_errno(EINVAL)); - } - } else { - /* - * Use the clock's default sigevent (this is a structure copy). - */ - ev = backend->clk_default; } + ASSERT(i <= INT_MAX); + *id = (timer_t)i; + return (it); +} + +/* + * Setup a timer + * + * This allocates an itimer_t (including a timer_t ID and slot in the process), + * wires it up according to the provided sigevent, and associates it with the + * desired clock backend. Upon successful completion, the timer will be + * locked, preventing it from being armed via timer_settime() or deleted via + * timer_delete(). This gives the caller a chance to perform any last minute + * manipulations (such as configuring the IT_CALLBACK functionality and/or + * copying the timer_t out to userspace) before using timer_release() to unlock + * it or timer_delete_grabbed() to delete it. + */ +int +timer_setup(clock_backend_t *backend, struct sigevent *evp, port_notify_t *pnp, + itimer_t **itp, timer_t *tidp) +{ + proc_t *p = curproc; + int error = 0; + itimer_t *it; + sigqueue_t *sigq; + timer_t tid; + /* - * We'll allocate our timer and sigqueue now, before we grab p_lock. 
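The resize path above drops p_lock around the allocation and must cope with a racing thread that grew the array first. The same grow-or-retry pattern, reduced to a runnable userland sketch with pthreads (all names hypothetical; error handling trimmed):

	#include <pthread.h>
	#include <stdlib.h>
	#include <string.h>

	static pthread_mutex_t p_lock = PTHREAD_MUTEX_INITIALIZER;
	static void **slots;
	static unsigned slots_sz;

	/*
	 * Find a free slot index, doubling the array as needed; returns -1
	 * when the cap would be exceeded or memory is exhausted.
	 */
	static int
	slot_alloc(unsigned cap)
	{
		for (;;) {
			unsigned i, target;
			void **grown;

			pthread_mutex_lock(&p_lock);
			for (i = 0; i < slots_sz; i++) {
				if (slots[i] == NULL) {
					slots[i] = (void *)1;	/* claim */
					pthread_mutex_unlock(&p_lock);
					return ((int)i);
				}
			}
			target = (slots_sz == 0) ? 4 : slots_sz * 2;
			if (target > cap || target < slots_sz) {
				pthread_mutex_unlock(&p_lock);
				return (-1);	/* cap or overflow */
			}
			/* Drop the lock to allocate, as timer_alloc() does. */
			pthread_mutex_unlock(&p_lock);
			if ((grown = calloc(target, sizeof (void *))) == NULL)
				return (-1);
			pthread_mutex_lock(&p_lock);
			if (target <= slots_sz) {
				/* A racing thread grew the array; retry. */
				pthread_mutex_unlock(&p_lock);
				free(grown);
				continue;
			}
			if (slots_sz != 0)
				memcpy(grown, slots,
				    slots_sz * sizeof (void *));
			free(slots);
			slots = grown;
			slots_sz = target;
			pthread_mutex_unlock(&p_lock);
			/* Loop back; a fresh slot will now be found. */
		}
	}

	int
	main(void)
	{
		return (slot_alloc(64) == 0 && slot_alloc(64) == 1 ? 0 : 1);
	}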
- * If we can't find an empty slot, we'll free them before returning. + * We'll allocate our sigqueue now, before we grab p_lock. + * If we can't find an empty slot, we'll free it before returning. */ - it = kmem_cache_alloc(clock_timer_cache, KM_SLEEP); - bzero(it, sizeof (itimer_t)); - mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL); sigq = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); - mutex_enter(&p->p_lock); - /* - * If this is this process' first timer, we need to attempt to allocate - * an array of timerstr_t pointers. We drop p_lock to perform the - * allocation; if we return to discover that p_itimer is non-NULL, - * we will free our allocation and drive on. + * Allocate a timer and choose a slot for it. This acquires p_lock. */ - if ((itp = p->p_itimer) == NULL) { - mutex_exit(&p->p_lock); - itp = kmem_zalloc(timer_max * sizeof (itimer_t *), KM_SLEEP); - mutex_enter(&p->p_lock); - - if (p->p_itimer == NULL) - p->p_itimer = itp; - else { - kmem_free(itp, timer_max * sizeof (itimer_t *)); - itp = p->p_itimer; - } - } - - for (i = 0; i < timer_max && itp[i] != NULL; i++) - continue; + it = timer_alloc(p, &tid); + ASSERT(MUTEX_HELD(&p->p_lock)); - if (i == timer_max) { - /* - * We couldn't find a slot. Drop p_lock, free the preallocated - * timer and sigqueue, and return an error. - */ + if (it == NULL) { mutex_exit(&p->p_lock); - kmem_cache_free(clock_timer_cache, it); kmem_free(sigq, sizeof (sigqueue_t)); - - return (set_errno(EAGAIN)); + return (EAGAIN); } - ASSERT(i < timer_max && itp[i] == NULL); + ASSERT(tid < p->p_itimer_sz && p->p_itimer[tid] == NULL); + ASSERT(evp != NULL); /* * If we develop other notification mechanisms, this will need * to call into (yet another) backend. */ - sigq->sq_info.si_signo = ev.sigev_signo; - if (evp == NULL) - sigq->sq_info.si_value.sival_int = i; - else - sigq->sq_info.si_value = ev.sigev_value; + sigq->sq_info.si_signo = evp->sigev_signo; + sigq->sq_info.si_value = evp->sigev_value; sigq->sq_info.si_code = SI_TIMER; sigq->sq_info.si_pid = p->p_pid; sigq->sq_info.si_ctid = PRCTID(p); sigq->sq_info.si_zoneid = getzoneid(); - sigq->sq_info.si_uid = crgetruid(cr); + sigq->sq_info.si_uid = crgetruid(CRED()); sigq->sq_func = timer_signal; sigq->sq_next = NULL; sigq->sq_backptr = it; it->it_sigq = sigq; it->it_backend = backend; it->it_lock = ITLK_LOCKED; - itp[i] = it; - - if (ev.sigev_notify == SIGEV_THREAD || - ev.sigev_notify == SIGEV_PORT) { + if (evp->sigev_notify == SIGEV_THREAD || + evp->sigev_notify == SIGEV_PORT) { int port; + port_kevent_t *pkevp = NULL; + + ASSERT(pnp != NULL); /* * This timer is programmed to use event port notification when @@ -638,18 +669,17 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) */ it->it_flags |= IT_PORT; - port = tim_pnevp.portnfy_port; + port = pnp->portnfy_port; /* associate timer as event source with the port */ error = port_associate_ksource(port, PORT_SOURCE_TIMER, (port_source_t **)&it->it_portsrc, timer_close_port, (void *)it, NULL); if (error) { - itp[i] = NULL; /* clear slot */ mutex_exit(&p->p_lock); kmem_cache_free(clock_timer_cache, it); kmem_free(sigq, sizeof (sigqueue_t)); - return (set_errno(error)); + return (error); } /* allocate an event structure/slot */ @@ -658,23 +688,24 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) if (error) { (void) port_dissociate_ksource(port, PORT_SOURCE_TIMER, (port_source_t *)it->it_portsrc); - itp[i] = NULL; /* clear slot */ mutex_exit(&p->p_lock); kmem_cache_free(clock_timer_cache, it); kmem_free(sigq, sizeof 
(sigqueue_t)); - return (set_errno(error)); + return (error); } /* initialize event data */ - port_init_event(pkevp, i, tim_pnevp.portnfy_user, + port_init_event(pkevp, tid, pnp->portnfy_user, timer_port_callback, it); it->it_portev = pkevp; it->it_portfd = port; } else { - if (ev.sigev_notify == SIGEV_SIGNAL) + if (evp->sigev_notify == SIGEV_SIGNAL) it->it_flags |= IT_SIGNAL; } + /* Populate the slot now that the timer is prepped. */ + p->p_itimer[tid] = it; mutex_exit(&p->p_lock); /* @@ -687,17 +718,8 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) it->it_lwp = ttolwp(curthread); it->it_proc = p; - if (copyout(&i, tid, sizeof (timer_t)) != 0) { - error = EFAULT; - goto err; - } - - /* - * If we're here, then we have successfully created the timer; we - * just need to release the timer and return. - */ - timer_release(p, it); - + *itp = it; + *tidp = tid; return (0); err: @@ -708,11 +730,115 @@ err: * impossible for a removal to be pending. */ ASSERT(!(it->it_lock & ITLK_REMOVE)); - timer_delete_grabbed(p, i, it); + timer_delete_grabbed(p, tid, it); - return (set_errno(error)); + return (error); } + +int +timer_create(clockid_t clock, struct sigevent *evp, timer_t *tidp) +{ + int error = 0; + proc_t *p = curproc; + clock_backend_t *backend; + struct sigevent ev; + itimer_t *it; + timer_t tid; + port_notify_t tim_pnevp; + + if ((backend = CLOCK_BACKEND(clock)) == NULL) + return (set_errno(EINVAL)); + + if (evp != NULL) { + /* + * short copyin() for binary compatibility + * fetch oldsigevent to determine how much to copy in. + */ + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(evp, &ev, sizeof (struct oldsigevent))) + return (set_errno(EFAULT)); + + if (ev.sigev_notify == SIGEV_PORT || + ev.sigev_notify == SIGEV_THREAD) { + if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp, + sizeof (port_notify_t))) + return (set_errno(EFAULT)); + } +#ifdef _SYSCALL32_IMPL + } else { + struct sigevent32 ev32; + port_notify32_t tim_pnevp32; + + if (copyin(evp, &ev32, sizeof (struct oldsigevent32))) + return (set_errno(EFAULT)); + ev.sigev_notify = ev32.sigev_notify; + ev.sigev_signo = ev32.sigev_signo; + /* + * See comment in sigqueue32() on handling of 32-bit + * sigvals in a 64-bit kernel. + */ + ev.sigev_value.sival_int = ev32.sigev_value.sival_int; + if (ev.sigev_notify == SIGEV_PORT || + ev.sigev_notify == SIGEV_THREAD) { + if (copyin((void *)(uintptr_t) + ev32.sigev_value.sival_ptr, + (void *)&tim_pnevp32, + sizeof (port_notify32_t))) + return (set_errno(EFAULT)); + tim_pnevp.portnfy_port = + tim_pnevp32.portnfy_port; + tim_pnevp.portnfy_user = + (void *)(uintptr_t)tim_pnevp32.portnfy_user; + } +#endif + } + switch (ev.sigev_notify) { + case SIGEV_NONE: + break; + case SIGEV_SIGNAL: + if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG) + return (set_errno(EINVAL)); + break; + case SIGEV_THREAD: + case SIGEV_PORT: + break; + default: + return (set_errno(EINVAL)); + } + } else { + /* + * Use the clock's default sigevent (this is a structure copy). + */ + ev = backend->clk_default; + } + + if ((error = timer_setup(backend, &ev, &tim_pnevp, &it, &tid)) != 0) { + return (set_errno(error)); + } + + /* + * Populate si_value with the timer ID if no sigevent was passed in. 
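The timer_setup() contract spelled out above (the timer comes back locked, so it cannot yet be armed or deleted) exists precisely so a caller can do last-minute wiring before publishing it. A hedged sketch of how a hypothetical in-kernel consumer of the IT_CALLBACK flavor might use it; only timer_setup(), timer_release(), IT_CALLBACK, and it_cb_func come from this diff, the rest is illustrative:

	/* Hypothetical handler; must not hold it_mutex on return. */
	static void my_fire_handler(itimer_t *it);

	static int
	make_callback_timer(clock_backend_t *be, struct sigevent *ev)
	{
		itimer_t *it;
		timer_t tid;
		int err;

		if ((err = timer_setup(be, ev, NULL, &it, &tid)) != 0)
			return (err);

		/* Still locked: safe to configure before anyone can see it. */
		it->it_flags |= IT_CALLBACK;
		it->it_cb_func = my_fire_handler;

		timer_release(curproc, it);	/* unlock; timer is now live */
		return (0);
	}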
+ */ + if (evp == NULL) { + it->it_sigq->sq_info.si_value.sival_int = tid; + } + + if (copyout(&tid, tidp, sizeof (timer_t)) != 0) { + timer_delete_grabbed(p, tid, it); + return (set_errno(EFAULT)); + } + + /* + * If we're here, then we have successfully created the timer; we + * just need to release the timer and return. + */ + timer_release(p, it); + + return (0); +} + + int timer_gettime(timer_t tid, itimerspec_t *val) { @@ -832,20 +958,23 @@ timer_getoverrun(timer_t tid) void timer_lwpexit(void) { - timer_t i; + uint_t i; proc_t *p = curproc; klwp_t *lwp = ttolwp(curthread); - itimer_t *it, **itp; + itimer_t *it; ASSERT(MUTEX_HELD(&p->p_lock)); - if ((itp = p->p_itimer) == NULL) + if (p->p_itimer == NULL) { return; + } - for (i = 0; i < timer_max; i++) { - if ((it = itp[i]) == NULL) + for (i = 0; i < p->p_itimer_sz; i++) { + if ((it = p->p_itimer[i]) == NULL) { continue; + } + /* This may drop p_lock temporarily. */ timer_lock(p, it); if ((it->it_lock & ITLK_REMOVE) || it->it_lwp != lwp) { @@ -876,20 +1005,22 @@ timer_lwpexit(void) void timer_lwpbind() { - timer_t i; + uint_t i; proc_t *p = curproc; klwp_t *lwp = ttolwp(curthread); - itimer_t *it, **itp; + itimer_t *it; ASSERT(MUTEX_HELD(&p->p_lock)); - if ((itp = p->p_itimer) == NULL) + if (p->p_itimer == NULL) { return; + } - for (i = 0; i < timer_max; i++) { - if ((it = itp[i]) == NULL) + for (i = 0; i < p->p_itimer_sz; i++) { + if ((it = p->p_itimer[i]) == NULL) continue; + /* This may drop p_lock temporarily. */ timer_lock(p, it); if (!(it->it_lock & ITLK_REMOVE) && it->it_lwp == lwp) { @@ -911,16 +1042,19 @@ timer_lwpbind() void timer_exit(void) { - timer_t i; + uint_t i; proc_t *p = curproc; ASSERT(p->p_itimer != NULL); + ASSERT(p->p_itimer_sz != 0); - for (i = 0; i < timer_max; i++) - (void) timer_delete(i); + for (i = 0; i < p->p_itimer_sz; i++) { + (void) timer_delete((timer_t)i); + } - kmem_free(p->p_itimer, timer_max * sizeof (itimer_t *)); + kmem_free(p->p_itimer, p->p_itimer_sz * sizeof (itimer_t *)); p->p_itimer = NULL; + p->p_itimer_sz = 0; } /* @@ -977,7 +1111,7 @@ timer_close_port(void *arg, int port, pid_t pid, int lastclose) for (tid = 0; tid < timer_max; tid++) { if ((it = timer_grab(p, tid)) == NULL) continue; - if (it->it_portev) { + if (it->it_flags & IT_PORT) { mutex_enter(&it->it_mutex); if (it->it_portfd == port) { port_kevent_t *pev; diff --git a/usr/src/uts/common/os/timers.c b/usr/src/uts/common/os/timers.c index 61acc6cf97..53be806026 100644 --- a/usr/src/uts/common/os/timers.c +++ b/usr/src/uts/common/os/timers.c @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ /* @@ -1172,6 +1173,14 @@ timespectohz64(timespec_t *tv) void hrt2ts(hrtime_t hrt, timestruc_t *tsp) { +#if defined(__amd64) + /* + * The cleverness explained above is unnecessary on x86_64 CPUs where + * modern compilers are able to optimize down to faster operations. + */ + tsp->tv_sec = hrt / NANOSEC; + tsp->tv_nsec = hrt % NANOSEC; +#else uint32_t sec, nsec, tmp; tmp = (uint32_t)(hrt >> 30); @@ -1193,20 +1202,28 @@ hrt2ts(hrtime_t hrt, timestruc_t *tsp) } tsp->tv_sec = (time_t)sec; tsp->tv_nsec = nsec; +#endif /* defined(__amd64) */ } /* * Convert from timestruc_t to hrtime_t. - * - * The code below is equivalent to: - * - * hrt = tsp->tv_sec * NANOSEC + tsp->tv_nsec; - * - * but requires no integer multiply.
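The __amd64 branch added above trusts the compiler to lower the divide and modulo by NANOSEC into multiply-by-reciprocal sequences, so the shift-and-add version buys nothing there. A quick userland check that the two decompositions agree (the 125*125*125*512 factorization mirrors the shift sequence in ts2hrt just below):

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	#define	NANOSEC	1000000000LL

	int
	main(void)
	{
		int64_t hrt = 5123456789LL;	/* arbitrary sample */
		int64_t sec = hrt / NANOSEC;
		int64_t nsec = hrt % NANOSEC;

		/* The quotient/remainder pair recomposes exactly. */
		assert(sec * NANOSEC + nsec == hrt);

		/* ts2hrt's shifts encode 1e9 as (125^3) << 9. */
		assert((125LL * 125 * 125) << 9 == NANOSEC);

		(void) printf("%lld s + %lld ns\n",
		    (long long)sec, (long long)nsec);
		return (0);
	}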
*/ hrtime_t ts2hrt(const timestruc_t *tsp) { +#if defined(__amd64) || defined(__i386) + /* + * On modern x86 CPUs, the simple version is faster. + */ + return ((tsp->tv_sec * NANOSEC) + tsp->tv_nsec); +#else + /* + * The code below is equivalent to: + * + * hrt = tsp->tv_sec * NANOSEC + tsp->tv_nsec; + * + * but requires no integer multiply. + */ hrtime_t hrt; hrt = tsp->tv_sec; @@ -1215,6 +1232,7 @@ ts2hrt(const timestruc_t *tsp) hrt = (hrt << 7) - hrt - hrt - hrt; hrt = (hrt << 9) + tsp->tv_nsec; return (hrt); +#endif /* defined(__amd64) || defined(__i386) */ } /* @@ -1246,6 +1264,13 @@ tv2hrt(struct timeval *tvp) void hrt2tv(hrtime_t hrt, struct timeval *tvp) { +#if defined(__amd64) + /* + * Like hrt2ts, the simple version is faster on x86_64. + */ + tvp->tv_sec = hrt / NANOSEC; + tvp->tv_usec = (hrt % NANOSEC) / (NANOSEC / MICROSEC); +#else uint32_t sec, nsec, tmp; uint32_t q, r, t; @@ -1267,17 +1292,17 @@ hrt2tv(hrtime_t hrt, struct timeval *tvp) sec++; } tvp->tv_sec = (time_t)sec; -/* - * this routine is very similar to hr2ts, but requires microseconds - * instead of nanoseconds, so an interger divide by 1000 routine - * completes the conversion - */ + /* + * this routine is very similar to hrt2ts, but requires microseconds + * instead of nanoseconds, so an integer divide by 1000 routine + * completes the conversion + */ t = (nsec >> 7) + (nsec >> 8) + (nsec >> 12); q = (nsec >> 1) + t + (nsec >> 15) + (t >> 11) + (t >> 14); q = q >> 9; r = nsec - q*1000; tvp->tv_usec = q + ((r + 24) >> 10); - +#endif /* defined(__amd64) */ } int diff --git a/usr/src/uts/common/os/vmem.c b/usr/src/uts/common/os/vmem.c index b79b1b4cf7..e3da4df247 100644 --- a/usr/src/uts/common/os/vmem.c +++ b/usr/src/uts/common/os/vmem.c @@ -1621,7 +1621,7 @@ vmem_destroy(vmem_t *vmp) leaked = vmem_size(vmp, VMEM_ALLOC); if (leaked != 0) - cmn_err(CE_WARN, "vmem_destroy('%s'): leaked %lu %s", + cmn_err(CE_WARN, "!vmem_destroy('%s'): leaked %lu %s", vmp->vm_name, leaked, (vmp->vm_cflags & VMC_IDENTIFIER) ? "identifiers" : "bytes"); diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index 5a5dc7d107..a8993524ac 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015, Joyent Inc. All rights reserved. + * Copyright 2016, Joyent Inc. * Copyright (c) 2016 by Delphix. All rights reserved. */ @@ -251,6 +251,8 @@ #include <sys/cpucaps.h> #include <vm/seg.h> #include <sys/mac.h> +#include <sys/rt.h> +#include <sys/fx.h> /* * This constant specifies the number of seconds that threads waiting for @@ -371,8 +373,12 @@ static char *zone_ref_subsys_names[] = { rctl_hndl_t rc_zone_cpu_shares; rctl_hndl_t rc_zone_locked_mem; rctl_hndl_t rc_zone_max_swap; +rctl_hndl_t rc_zone_phys_mem; rctl_hndl_t rc_zone_max_lofi; rctl_hndl_t rc_zone_cpu_cap; +rctl_hndl_t rc_zone_cpu_baseline; +rctl_hndl_t rc_zone_cpu_burst_time; +rctl_hndl_t rc_zone_zfs_io_pri; rctl_hndl_t rc_zone_nlwps; rctl_hndl_t rc_zone_nprocs; rctl_hndl_t rc_zone_shmmax; @@ -418,8 +424,9 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *, * Version 5 alters the zone_boot system call, and converts its old * bootargs parameter to be set by the zone_setattr API instead. * Version 6 adds the flag argument to zone_create. + * Version 7 adds the requested zoneid to zone_create.
*/ -static const int ZONE_SYSCALL_API_VERSION = 6; +static const int ZONE_SYSCALL_API_VERSION = 7; /* * Certain filesystems (such as NFS and autofs) need to know which zone @@ -1378,6 +1385,114 @@ static rctl_ops_t zone_cpu_cap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_cpu_base_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_base(p->p_zone)); +} + +/* + * The zone cpu base is used to set the baseline CPU for the zone + * so we can track when the zone is bursting. + */ +/*ARGSUSED*/ +static int +zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_base(zone, nv)); +} + +static rctl_ops_t zone_cpu_base_ops = { + rcop_no_action, + zone_cpu_base_get, + zone_cpu_base_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t +zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_burst_time(p->p_zone)); +} + +/* + * The zone cpu burst time is used to set the amount of time CPU(s) can be + * bursting for the zone. + */ +/*ARGSUSED*/ +static int +zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_burst_time(zone, nv)); +} + +static rctl_ops_t zone_cpu_burst_time_ops = { + rcop_no_action, + zone_cpu_burst_time_get, + zone_cpu_burst_time_set, + rcop_no_test +}; + +/* + * zone.zfs-io-pri resource control support (IO priority). + */ +/*ARGSUSED*/ +static rctl_qty_t +zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (p->p_zone->zone_zfs_io_pri); +} + +/*ARGSUSED*/ +static int +zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + /* + * set priority to the new value. 
+ */ + zone->zone_zfs_io_pri = nv; + return (0); +} + +static rctl_ops_t zone_zfs_io_pri_ops = { + rcop_no_action, + zone_zfs_io_pri_get, + zone_zfs_io_pri_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_lwps_usage(rctl_t *r, proc_t *p) { rctl_qty_t nlwps; @@ -1704,6 +1819,39 @@ static rctl_ops_t zone_max_swap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_phys_mem_usage(rctl_t *rctl, struct proc *p) +{ + rctl_qty_t q; + zone_t *z = p->p_zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + /* No additional lock because not enforced in the kernel */ + q = z->zone_phys_mem; + return (q); +} + +/*ARGSUSED*/ +static int +zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + if (e->rcep_p.zone == NULL) + return (0); + e->rcep_p.zone->zone_phys_mem_ctl = nv; + return (0); +} + +static rctl_ops_t zone_phys_mem_ops = { + rcop_no_action, + zone_phys_mem_usage, + zone_phys_mem_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_max_lofi_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; @@ -1797,6 +1945,20 @@ zone_lockedmem_kstat_update(kstat_t *ksp, int rw) } static int +zone_physmem_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_kstat_t *zk = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zk->zk_usage.value.ui64 = zone->zone_phys_mem; + zk->zk_value.value.ui64 = zone->zone_phys_mem_ctl; + return (0); +} + +static int zone_nprocs_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; @@ -1825,7 +1987,7 @@ zone_swapresv_kstat_update(kstat_t *ksp, int rw) } static kstat_t * -zone_kstat_create_common(zone_t *zone, char *name, +zone_rctl_kstat_create_common(zone_t *zone, char *name, int (*updatefunc) (kstat_t *, int)) { kstat_t *ksp; @@ -1850,6 +2012,160 @@ zone_kstat_create_common(zone_t *zone, char *name, return (ksp); } +static int +zone_vfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_vfs_kstat_t *zvp = ksp->ks_data; + kstat_io_t *kiop = &zone->zone_vfs_rwstats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Extract the VFS statistics from the kstat_io_t structure used by + * kstat_runq_enter() and related functions. Since the slow ops + * counters are updated directly by the VFS layer, there's no need to + * copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. 
+ */ + zvp->zv_nread.value.ui64 = kiop->nread; + zvp->zv_reads.value.ui64 = kiop->reads; + zvp->zv_rtime.value.ui64 = kiop->rtime; + zvp->zv_rcnt.value.ui64 = kiop->rcnt; + zvp->zv_rlentime.value.ui64 = kiop->rlentime; + zvp->zv_nwritten.value.ui64 = kiop->nwritten; + zvp->zv_writes.value.ui64 = kiop->writes; + zvp->zv_wtime.value.ui64 = kiop->wtime; + zvp->zv_wcnt.value.ui64 = kiop->wcnt; + zvp->zv_wlentime.value.ui64 = kiop->wlentime; + + scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_vfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_vfs_kstat_t *zvp; + + if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id, + zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED, + sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_vfs_lock; + zone->zone_vfs_stats = zvp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zvp->zv_zonename, zone->zone_name); + kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_vfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int +zone_zfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_zfs_kstat_t *zzp = ksp->ks_data; + kstat_io_t *kiop = &zone->zone_zfs_rwstats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Extract the ZFS statistics from the kstat_io_t structure used by + * kstat_runq_enter() and related functions. Since the I/O throttle + * counters are updated directly by the ZFS layer, there's no need to + * copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. 
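The new zone_vfs statistics above are ordinary named kstats, so once installed they can be read from the global zone with libkstat. A minimal consumer, assuming the module name "zone_vfs" and statistic "nread" from the creation code above, with the kstat instance being the zone ID (0 for the global zone); build with cc reader.c -lkstat:

	#include <kstat.h>
	#include <stdio.h>

	int
	main(void)
	{
		kstat_ctl_t *kc;
		kstat_t *ksp;
		kstat_named_t *kn;

		if ((kc = kstat_open()) == NULL)
			return (1);

		/* Instance is the zone ID; 0 is the global zone. */
		if ((ksp = kstat_lookup(kc, "zone_vfs", 0, NULL)) == NULL ||
		    kstat_read(kc, ksp, NULL) == -1) {
			(void) kstat_close(kc);
			return (1);
		}
		if ((kn = kstat_data_lookup(ksp, "nread")) != NULL)
			(void) printf("nread: %llu\n",
			    (unsigned long long)kn->value.ui64);
		(void) kstat_close(kc);
		return (0);
	}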
+ */ + zzp->zz_nread.value.ui64 = kiop->nread; + zzp->zz_reads.value.ui64 = kiop->reads; + zzp->zz_rtime.value.ui64 = kiop->rtime; + zzp->zz_rlentime.value.ui64 = kiop->rlentime; + zzp->zz_nwritten.value.ui64 = kiop->nwritten; + zzp->zz_writes.value.ui64 = kiop->writes; + + scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64); + scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_zfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_zfs_kstat_t *zzp; + + if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id, + zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED, + sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_zfs_lock; + zone->zone_zfs_stats = zzp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zzp->zz_zonename, zone->zone_name); + kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_zfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} static int zone_mcap_kstat_update(kstat_t *ksp, int rw) @@ -1860,11 +2176,19 @@ zone_mcap_kstat_update(kstat_t *ksp, int rw) if (rw == KSTAT_WRITE) return (EACCES); + zmp->zm_rss.value.ui64 = zone->zone_phys_mem; + zmp->zm_phys_cap.value.ui64 = zone->zone_phys_mem_ctl; + zmp->zm_swap.value.ui64 = zone->zone_max_swap; + zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl; + zmp->zm_nover.value.ui64 = zone->zone_mcap_nover; + zmp->zm_pagedout.value.ui64 = zone->zone_mcap_pagedout; zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin; zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin; zmp->zm_execpgin.value.ui64 = zone->zone_execpgin; zmp->zm_fspgin.value.ui64 = zone->zone_fspgin; zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail; + zmp->zm_pf_throttle.value.ui64 = zone->zone_pf_throttle; + zmp->zm_pf_throttle_usec.value.ui64 = zone->zone_pf_throttle_usec; return (0); } @@ -1892,12 +2216,22 @@ zone_mcap_kstat_create(zone_t *zone) /* The kstat "name" field is not large enough for a full zonename */ kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING); kstat_named_setstr(&zmp->zm_zonename, zone->zone_name); + kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64); 
kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle", + KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec", + KSTAT_DATA_UINT64); ksp->ks_update = zone_mcap_kstat_update; ksp->ks_private = zone; @@ -1935,6 +2269,8 @@ zone_misc_kstat_update(kstat_t *ksp, int rw) zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem; zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc; + zmp->zm_mfseglim.value.ui32 = zone->zone_mfseglim; + zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp; zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid; @@ -1978,6 +2314,8 @@ zone_misc_kstat_create(zone_t *zone) KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32); + kstat_named_init(&zmp->zm_mfseglim, "mapfail_seglim", + KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_nested_intp, "nested_interp", KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32); @@ -1993,13 +2331,25 @@ zone_misc_kstat_create(zone_t *zone) static void zone_kstat_create(zone_t *zone) { - zone->zone_lockedmem_kstat = zone_kstat_create_common(zone, + zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone, "lockedmem", zone_lockedmem_kstat_update); - zone->zone_swapresv_kstat = zone_kstat_create_common(zone, + zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone, "swapresv", zone_swapresv_kstat_update); - zone->zone_nprocs_kstat = zone_kstat_create_common(zone, + zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone, + "physicalmem", zone_physmem_kstat_update); + zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone, "nprocs", zone_nprocs_kstat_update); + if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) { + zone->zone_vfs_stats = kmem_zalloc( + sizeof (zone_vfs_kstat_t), KM_SLEEP); + } + + if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) { + zone->zone_zfs_stats = kmem_zalloc( + sizeof (zone_zfs_kstat_t), KM_SLEEP); + } + if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) { zone->zone_mcap_stats = kmem_zalloc( sizeof (zone_mcap_kstat_t), KM_SLEEP); @@ -2031,8 +2381,15 @@ zone_kstat_delete(zone_t *zone) sizeof (zone_kstat_t)); zone_kstat_delete_common(&zone->zone_swapresv_kstat, sizeof (zone_kstat_t)); + zone_kstat_delete_common(&zone->zone_physmem_kstat, + sizeof (zone_kstat_t)); zone_kstat_delete_common(&zone->zone_nprocs_kstat, sizeof (zone_kstat_t)); + + zone_kstat_delete_common(&zone->zone_vfs_ksp, + sizeof (zone_vfs_kstat_t)); + zone_kstat_delete_common(&zone->zone_zfs_ksp, + sizeof (zone_zfs_kstat_t)); zone_kstat_delete_common(&zone->zone_mcap_ksp, sizeof (zone_mcap_kstat_t)); zone_kstat_delete_common(&zone->zone_misc_ksp, @@ -2070,6 +2427,8 @@ zone_zsd_init(void) zone0.zone_locked_mem_ctl = UINT64_MAX; ASSERT(zone0.zone_max_swap == 0); zone0.zone_max_swap_ctl = UINT64_MAX; + zone0.zone_phys_mem = 0; + zone0.zone_phys_mem_ctl = UINT64_MAX; zone0.zone_max_lofi = 0; zone0.zone_max_lofi_ctl = UINT64_MAX; zone0.zone_shmmax = 0; @@ -2094,8 +2453,9 @@ zone_zsd_init(void) zone0.zone_initname = initname; zone0.zone_lockedmem_kstat = NULL; zone0.zone_swapresv_kstat = NULL; + zone0.zone_physmem_kstat = NULL; zone0.zone_nprocs_kstat = NULL; - + zone0.zone_zfs_io_pri = 1; zone0.zone_stime = 0; zone0.zone_utime = 0; zone0.zone_wtime = 0; @@ -2206,6 +2566,21 @@ 
zone_init(void) RCTL_GLOBAL_INFINITE, MAXCAP, MAXCAP, &zone_cpu_cap_ops); + rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + MAXCAP, MAXCAP, &zone_cpu_base_ops); + + rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + INT_MAX, INT_MAX, &zone_cpu_burst_time_ops); + + rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + 16384, 16384, &zone_zfs_io_pri_ops); + rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE, RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT, INT_MAX, INT_MAX, &zone_lwps_ops); @@ -2247,6 +2622,20 @@ zone_init(void) rde = rctl_dict_lookup("zone.cpu-shares"); (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + /* + * Create an rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach + * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority''. + */ + dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); + bzero(dval, sizeof (rctl_val_t)); + dval->rcv_value = 1; + dval->rcv_privilege = RCPRIV_PRIVILEGED; + dval->rcv_flagaction = RCTL_LOCAL_NOACTION; + dval->rcv_action_recip_pid = -1; + + rde = rctl_dict_lookup("zone.zfs-io-priority"); + (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + rc_zone_locked_mem = rctl_register("zone.max-locked-memory", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2257,6 +2646,11 @@ zone_init(void) RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, &zone_max_swap_ops); + rc_zone_phys_mem = rctl_register("zone.max-physical-memory", + RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | + RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, + &zone_phys_mem_ops); + rc_zone_max_lofi = rctl_register("zone.max-lofi", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2278,6 +2672,8 @@ zone_init(void) zone0.zone_ntasks = 1; mutex_exit(&p0.p_lock); zone0.zone_restart_init = B_TRUE; + zone0.zone_reboot_on_init_exit = B_FALSE; + zone0.zone_init_status = -1; zone0.zone_brand = &native_brand; rctl_prealloc_destroy(gp); /* @@ -2357,6 +2753,8 @@ zone_init(void) static void zone_free(zone_t *zone) { + zone_dl_t *zdl; + ASSERT(zone != global_zone); ASSERT(zone->zone_ntasks == 0); ASSERT(zone->zone_nlwps == 0); @@ -2385,6 +2783,19 @@ zone_free(zone_t *zone) list_destroy(&zone->zone_ref_list); zone_free_zsd(zone); zone_free_datasets(zone); + + /* + * While dlmgmtd should have removed all of these, it could have left + * something behind or crashed, in which case it's not safe for us to + * assume that the list is empty, which list_destroy() will ASSERT. We + * clean up for our userland comrades which may have crashed, or worse, + * been disabled by SMF.
+ */ + while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) { + if (zdl->zdl_net != NULL) + nvlist_free(zdl->zdl_net); + kmem_free(zdl, sizeof (zone_dl_t)); + } list_destroy(&zone->zone_dl_list); if (zone->zone_rootvp != NULL) @@ -2429,12 +2840,18 @@ zone_free(zone_t *zone) static void zone_status_set(zone_t *zone, zone_status_t status) { + timestruc_t now; + uint64_t t; nvlist_t *nvl = NULL; ASSERT(MUTEX_HELD(&zone_status_lock)); ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE && status >= zone_status_get(zone)); + /* Current time since Jan 1 1970 but consumers expect NS */ + gethrestime(&now); + t = (now.tv_sec * NANOSEC) + now.tv_nsec; + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) || nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) || nvlist_add_string(nvl, ZONE_CB_NEWSTATE, @@ -2442,7 +2859,7 @@ zone_status_set(zone_t *zone, zone_status_t status) nvlist_add_string(nvl, ZONE_CB_OLDSTATE, zone_status_table[zone->zone_status]) || nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) || - nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) || + nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) || sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS, ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) { #ifdef DEBUG @@ -2520,9 +2937,14 @@ zone_set_brand(zone_t *zone, const char *brand) return (EINVAL); } - /* set up the brand specific data */ + /* + * Set up the brand specific data. + * Note that it's possible that the hook has to drop the + * zone_status_lock and reacquire it before returning so we can't + * assume the lock has been held the entire time. + */ zone->zone_brand = bp; - ZBROP(zone)->b_init_brand_data(zone); + ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock); mutex_exit(&zone_status_lock); return (0); @@ -2594,14 +3016,65 @@ zone_set_initname(zone_t *zone, const char *zone_initname) return (0); } +/* + * The zone_set_mcap_nover and zone_set_mcap_pageout functions are used + * to provide the physical memory capping kstats. Since physical memory + * capping is currently implemented in userland, that code uses the setattr + * entry point to increment the kstats. We always simply increment nover + * every time that setattr is called and we always add in the input value + * to zone_mcap_pagedout every time that is called. + */ +/*ARGSUSED*/ static int -zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap) +zone_set_mcap_nover(zone_t *zone, const uint64_t *zone_nover) { - uint64_t mcap; - int err = 0; + zone->zone_mcap_nover++; + + return (0); +} + +static int +zone_set_mcap_pageout(zone_t *zone, const uint64_t *zone_pageout) +{ + uint64_t pageout; + int err; + + if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0) + zone->zone_mcap_pagedout += pageout; + + return (err); +} + +/* + * The zone_set_page_fault_delay function is used to set the number of usecs + * to throttle page faults. This is normally 0 but can be set to a non-0 value + * by the user-land memory capping code when the zone is over its physical + * memory cap. + */ +static int +zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay) +{ + uint32_t dusec; + int err; + + if ((err = copyin(pfdelay, &dusec, sizeof (uint32_t))) == 0) + zone->zone_pg_flt_delay = dusec; + + return (err); +} - if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0) - zone->zone_phys_mcap = mcap; +/* + * The zone_set_rss function is used to set the zone's RSS when we do the + * fast, approximate calculation in user-land.
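On the userland side, zone_set_rss() is reached through the private zone_setattr(2) interface with the new ZONE_ATTR_RSS attribute added later in this diff. A sketch of the call as a memory-capping daemon such as zoneadmd might issue it; this assumes matching patched headers (ZONE_ATTR_RSS is not in stock <sys/zone.h>) and global-zone privilege, and is not a committed public API:

	#include <zone.h>
	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint64_t rss = 512 * 1024 * 1024ULL;	/* sample RSS, bytes */
		zoneid_t zid = getzoneid();		/* target zone's ID */

		/* ZONE_ATTR_RSS comes from the patched <sys/zone.h>. */
		if (zone_setattr(zid, ZONE_ATTR_RSS, &rss,
		    sizeof (rss)) != 0) {
			perror("zone_setattr(ZONE_ATTR_RSS)");
			return (1);
		}
		return (0);
	}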
+ */ +static int +zone_set_rss(zone_t *zone, const uint64_t *prss) +{ + uint64_t rss; + int err; + + if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0) + zone->zone_phys_mem = rss; return (err); } @@ -3013,6 +3486,12 @@ getzoneid(void) return (curproc->p_zone->zone_id); } +zoneid_t +getzonedid(void) +{ + return (curproc->p_zone->zone_did); +} + /* * Internal versions of zone_find_by_*(). These don't zone_hold() or * check the validity of a zone's state. @@ -3756,6 +4235,17 @@ zone_start_init(void) */ z->zone_proc_initpid = p->p_pid; + if (z->zone_setup_app_contract == B_TRUE) { + /* + * Normally a process cannot modify its own contract, but we're + * just starting the zone's init process and its contract is + * always initialized from the sys_process_tmpl template, so + * this is the simplest way to setup init's contract to kill + * the process if any other process in the contract exits. + */ + p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT; + } + /* * We maintain zone_boot_err so that we can return the cause of the * failure back to the caller of the zone_boot syscall. @@ -3784,9 +4274,54 @@ zone_start_init(void) lwp_exit(); } } else { + id_t cid = curthread->t_cid; + if (zone_status_get(z) == ZONE_IS_BOOTING) zone_status_set(z, ZONE_IS_RUNNING); mutex_exit(&zone_status_lock); + + mutex_enter(&class_lock); + ASSERT(cid < loaded_classes); + if (strcmp(sclass[cid].cl_name, "FX") == 0 && + z->zone_fixed_hipri) { + /* + * If the zone is using FX then by default all + * processes start at the lowest priority and stay + * there. We provide a mechanism for the zone to + * indicate that it should run at "high priority". In + * this case we setup init to run at the highest FX + * priority (which is one level higher than the + * non-fixed scheduling classes can use). + */ + pcparms_t pcparms; + + pcparms.pc_cid = cid; + ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim = + FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags = + FX_DOUPRILIM | FX_DOUPRI; + + mutex_enter(&pidlock); + mutex_enter(&curproc->p_lock); + + (void) parmsset(&pcparms, curthread); + + mutex_exit(&curproc->p_lock); + mutex_exit(&pidlock); + } else if (strcmp(sclass[cid].cl_name, "RT") == 0) { + /* + * zsched always starts the init lwp at priority + * minclsyspri - 1. This priority gets set in t_pri and + * is invalid for RT, but RT never uses t_pri. However + * t_pri is used by procfs, so we always see processes + * within an RT zone with an invalid priority value. + * We fix that up now. + */ + curthread->t_pri = RTGPPRIO0; + } + mutex_exit(&class_lock); + /* cause the process to return to userland. 
*/ lwp_rtt(); } @@ -4272,8 +4807,9 @@ parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp) error = EINVAL; name = nvpair_name(nvp); - if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1) - != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { + if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 && + strncmp(name, "project.", sizeof ("project.") - 1) != 0) || + nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { goto out; } if ((hndl = rctl_hndl_lookup(name)) == -1) { @@ -4392,7 +4928,7 @@ zone_create(const char *zone_name, const char *zone_root, caddr_t rctlbuf, size_t rctlbufsz, caddr_t zfsbuf, size_t zfsbufsz, int *extended_error, int match, uint32_t doi, const bslabel_t *label, - int flags) + int flags, zoneid_t zone_did) { struct zsched_arg zarg; nvlist_t *rctls = NULL; @@ -4464,6 +5000,7 @@ zone_create(const char *zone_name, const char *zone_root, zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP); zone->zone_id = zoneid; + zone->zone_did = zone_did; zone->zone_status = ZONE_IS_UNINITIALIZED; zone->zone_pool = pool_default; zone->zone_pool_mod = gethrtime(); @@ -4471,6 +5008,8 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_ncpus = 0; zone->zone_ncpus_online = 0; zone->zone_restart_init = B_TRUE; + zone->zone_reboot_on_init_exit = B_FALSE; + zone->zone_init_status = -1; zone->zone_brand = &native_brand; zone->zone_initname = NULL; mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); @@ -4538,10 +5077,14 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_locked_mem_ctl = UINT64_MAX; zone->zone_max_swap = 0; zone->zone_max_swap_ctl = UINT64_MAX; + zone->zone_phys_mem = 0; + zone->zone_phys_mem_ctl = UINT64_MAX; zone->zone_max_lofi = 0; zone->zone_max_lofi_ctl = UINT64_MAX; - zone0.zone_lockedmem_kstat = NULL; - zone0.zone_swapresv_kstat = NULL; + zone->zone_lockedmem_kstat = NULL; + zone->zone_swapresv_kstat = NULL; + zone->zone_physmem_kstat = NULL; + zone->zone_zfs_io_pri = 1; /* * Zsched initializes the rctls. @@ -4696,8 +5239,8 @@ zone_create(const char *zone_name, const char *zone_root, /* * The process, task, and project rctls are probably wrong; * we need an interface to get the default values of all rctls, - * and initialize zsched appropriately. I'm not sure that that - * makes much of a difference, though. + * and initialize zsched appropriately. However, we allow zoneadmd + * to pass down both zone and project rctls for the zone's init. 
*/ error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0); if (error != 0) { @@ -4836,6 +5379,7 @@ zone_boot(zoneid_t zoneid) static int zone_empty(zone_t *zone) { + int cnt = 0; int waitstatus; /* @@ -4846,7 +5390,16 @@ zone_empty(zone_t *zone) ASSERT(MUTEX_NOT_HELD(&zonehash_lock)); while ((waitstatus = zone_status_timedwait_sig(zone, ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) { - killall(zone->zone_id); + boolean_t force = B_FALSE; + + /* Every 30 seconds, try harder */ + if (cnt++ >= 30) { + cmn_err(CE_WARN, "attempt to force kill zone %d\n", + zone->zone_id); + force = B_TRUE; + cnt = 0; + } + killall(zone->zone_id, force); } /* * return EINTR if we were signaled @@ -5597,14 +6150,6 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) error = EFAULT; } break; - case ZONE_ATTR_PHYS_MCAP: - size = sizeof (zone->zone_phys_mcap); - if (bufsize > size) - bufsize = size; - if (buf != NULL && - copyout(&zone->zone_phys_mcap, buf, bufsize) != 0) - error = EFAULT; - break; case ZONE_ATTR_SCHED_CLASS: mutex_enter(&class_lock); @@ -5666,6 +6211,23 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) } kmem_free(zbuf, bufsize); break; + case ZONE_ATTR_DID: + size = sizeof (zoneid_t); + if (bufsize > size) + bufsize = size; + + if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0) + error = EFAULT; + break; + case ZONE_ATTR_SCHED_FIXEDHI: + size = sizeof (boolean_t); + if (bufsize > size) + bufsize = size; + + if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf, + bufsize) != 0) + error = EFAULT; + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { size = bufsize; @@ -5697,10 +6259,11 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) return (set_errno(EPERM)); /* - * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the - * global zone. + * Only the ZONE_ATTR_PMCAP_NOVER and ZONE_ATTR_PMCAP_PAGEOUT + * attributes can be set on the global zone. */ - if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) { + if (zoneid == GLOBAL_ZONEID && + attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT) { return (set_errno(EINVAL)); } @@ -5717,7 +6280,9 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) * non-global zones. 
*/ zone_status = zone_status_get(zone); - if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) { + if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT && + attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS && + zone_status > ZONE_IS_READY) { err = EINVAL; goto done; } @@ -5739,12 +6304,21 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) case ZONE_ATTR_FS_ALLOWED: err = zone_set_fs_allowed(zone, (const char *)buf); break; + case ZONE_ATTR_PMCAP_NOVER: + err = zone_set_mcap_nover(zone, (const uint64_t *)buf); + break; + case ZONE_ATTR_PMCAP_PAGEOUT: + err = zone_set_mcap_pageout(zone, (const uint64_t *)buf); + break; + case ZONE_ATTR_PG_FLT_DELAY: + err = zone_set_page_fault_delay(zone, (const uint32_t *)buf); + break; + case ZONE_ATTR_RSS: + err = zone_set_rss(zone, (const uint64_t *)buf); + break; case ZONE_ATTR_SECFLAGS: err = zone_set_secflags(zone, (psecflags_t *)buf); break; - case ZONE_ATTR_PHYS_MCAP: - err = zone_set_phys_mcap(zone, (const uint64_t *)buf); - break; case ZONE_ATTR_SCHED_CLASS: err = zone_set_sched_class(zone, (const char *)buf); break; @@ -5772,6 +6346,22 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) err = zone_set_network(zoneid, zbuf); kmem_free(zbuf, bufsize); break; + case ZONE_ATTR_APP_SVC_CT: + if (bufsize != sizeof (boolean_t)) { + err = EINVAL; + } else { + zone->zone_setup_app_contract = (boolean_t)buf; + err = 0; + } + break; + case ZONE_ATTR_SCHED_FIXEDHI: + if (bufsize != sizeof (boolean_t)) { + err = EINVAL; + } else { + zone->zone_fixed_hipri = (boolean_t)buf; + err = 0; + } + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize); @@ -6475,6 +7065,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) zs.doi = zs32.doi; zs.label = (const bslabel_t *)(uintptr_t)zs32.label; zs.flags = zs32.flags; + zs.zoneid = zs32.zoneid; #else panic("get_udatamodel() returned bogus result\n"); #endif @@ -6485,7 +7076,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) (caddr_t)zs.rctlbuf, zs.rctlbufsz, (caddr_t)zs.zfsbuf, zs.zfsbufsz, zs.extended_error, zs.match, zs.doi, - zs.label, zs.flags)); + zs.label, zs.flags, zs.zoneid)); case ZONE_BOOT: return (zone_boot((zoneid_t)(uintptr_t)arg1)); case ZONE_DESTROY: @@ -6586,6 +7177,7 @@ zone_ki_call_zoneadmd(struct zarg *zargp) bcopy(zone->zone_name, zone_name, zone_namelen); zoneid = zone->zone_id; uniqid = zone->zone_uniqid; + arg.status = zone->zone_init_status; /* * zoneadmd may be down, but at least we can empty out the zone. * We can ignore the return value of zone_empty() since we're called @@ -6763,7 +7355,7 @@ zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp) * zone_ki_call_zoneadmd() will do a more thorough job of this * later. */ - killall(zone->zone_id); + killall(zone->zone_id, B_FALSE); /* * Now, create the thread to contact zoneadmd and do the rest of the * work. This thread can't be created in our zone otherwise @@ -6826,16 +7418,15 @@ zone_shutdown_global(void) } /* - * Returns true if the named dataset is visible in the current zone. + * Returns true if the named dataset is visible in the specified zone. * The 'write' parameter is set to 1 if the dataset is also writable. 
*/ int -zone_dataset_visible(const char *dataset, int *write) +zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write) { static int zfstype = -1; zone_dataset_t *zd; size_t len; - zone_t *zone = curproc->p_zone; const char *name = NULL; vfs_t *vfsp = NULL; @@ -6903,7 +7494,8 @@ zone_dataset_visible(const char *dataset, int *write) vfs_list_read_lock(); vfsp = zone->zone_vfslist; do { - ASSERT(vfsp); + if (vfsp == NULL) + break; if (vfsp->vfs_fstype == zfstype) { name = refstr_value(vfsp->vfs_resource); @@ -6940,6 +7532,18 @@ zone_dataset_visible(const char *dataset, int *write) } /* + * Returns true if the named dataset is visible in the current zone. + * The 'write' parameter is set to 1 if the dataset is also writable. + */ +int +zone_dataset_visible(const char *dataset, int *write) +{ + zone_t *zone = curproc->p_zone; + + return (zone_dataset_visible_inzone(zone, dataset, write)); +} + +/* * zone_find_by_any_path() - * * kernel-private routine similar to zone_find_by_path(), but which @@ -7164,6 +7768,13 @@ zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray) mutex_exit(&zone->zone_lock); zone_rele(zone); + /* + * Prevent returning negative nump values -- we should never + * have this many links anyway. + */ + if (num > INT_MAX) + return (set_errno(EOVERFLOW)); + /* Increased or decreased, caller should be notified. */ if (num != dlcount) { if (copyout(&num, nump, sizeof (num)) != 0)