Diffstat (limited to 'usr/src/uts/common/os')
45 files changed, 3465 insertions, 1037 deletions
diff --git a/usr/src/uts/common/os/acct.c b/usr/src/uts/common/os/acct.c index e598e0d08d..891c4e0836 100644 --- a/usr/src/uts/common/os/acct.c +++ b/usr/src/uts/common/os/acct.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -47,6 +48,7 @@ #include <sys/time.h> #include <sys/msacct.h> #include <sys/zone.h> +#include <sys/brand.h> /* * Each zone has its own accounting settings (on or off) and associated @@ -373,7 +375,7 @@ acct_compress(ulong_t t) * On exit, write a record on the accounting file. */ void -acct(char st) +acct(int st) { struct vnode *vp; struct cred *cr; @@ -402,6 +404,21 @@ acct(char st) * This only gets called from exit after all lwp's have exited so no * cred locking is needed. */ + + /* If there is a brand-specific hook, use it instead */ + if (ZONE_IS_BRANDED(curzone) && ZBROP(curzone)->b_acct_out != NULL) { + ZBROP(curzone)->b_acct_out(vp, st); + mutex_exit(&ag->aclock); + return; + } + + /* + * The 'st' status value was traditionally masked this way by our + * caller, but we now accept the unmasked value for brand handling. + * Zones not using the brand hook mask the status here. + */ + st &= 0xff; + p = curproc; ua = PTOU(p); bcopy(ua->u_comm, ag->acctbuf.ac_comm, sizeof (ag->acctbuf.ac_comm)); diff --git a/usr/src/uts/common/os/brand.c b/usr/src/uts/common/os/brand.c index 0af67f5d98..62c3bbe2d6 100644 --- a/usr/src/uts/common/os/brand.c +++ b/usr/src/uts/common/os/brand.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016, Joyent, Inc. */ #include <sys/kmem.h> @@ -45,7 +46,7 @@ struct brand_mach_ops native_mach_ops = { }; #else /* !__sparcv9 */ struct brand_mach_ops native_mach_ops = { - NULL, NULL, NULL, NULL + NULL, NULL, NULL, NULL, NULL, NULL, NULL }; #endif /* !__sparcv9 */ @@ -53,7 +54,8 @@ brand_t native_brand = { BRAND_VER_1, "native", NULL, - &native_mach_ops + &native_mach_ops, + 0 }; /* @@ -310,46 +312,115 @@ brand_unregister_zone(struct brand *bp) mutex_exit(&brand_list_lock); } -void -brand_setbrand(proc_t *p) +int +brand_setbrand(proc_t *p, boolean_t lwps_ok) { brand_t *bp = p->p_zone->zone_brand; + void *brand_data = NULL; - ASSERT(bp != NULL); - ASSERT(p->p_brand == &native_brand); + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + VERIFY(bp != NULL); /* - * We should only be called from exec(), when we know the process - * is single-threaded. + * Process branding occurs during fork() and exec(). When it happens + * during fork(), the LWP count will always be 0 since branding is + * performed as part of getproc(), before LWPs have been associated. + * The same is not true during exec(), where a multi-LWP process may + * undergo branding just prior to gexec(). This is to ensure + * exec-related brand hooks are available. While it may seem + * complicated to brand a multi-LWP process, the two possible outcomes + * simplify things: + * + * 1. The exec() succeeds: LWPs besides the caller will be killed and + * any further branding will occur in a single-LWP context. + * 2. The exec() fails: The process will be promptly unbranded since + * the hooks are no longer needed. + * + * To prevent inconsistent brand state from being encountered during + * the exec(), LWPs beyond the caller which are associated with this + * process must be held temporarily. 
They will be released either when + * they are killed in the exec() success, or when the brand is cleared + * after exec() failure. */ - ASSERT(p->p_tlist == p->p_tlist->t_forw); + if (lwps_ok) { + /* + * We've been called from a exec() context tolerating the + * existence of multiple LWPs during branding is necessary. + */ + VERIFY(p == curproc); + VERIFY(p->p_tlist != NULL); + if (p->p_tlist != p->p_tlist->t_forw) { + /* + * Multiple LWPs are present. Hold all but the caller. + */ + if (!holdlwps(SHOLDFORK1)) { + return (-1); + } + } + } else { + /* + * Processes branded during fork() should not have LWPs at all. + */ + VERIFY(p->p_tlist == NULL); + } + + if (bp->b_data_size > 0) { + brand_data = kmem_zalloc(bp->b_data_size, KM_SLEEP); + } + + mutex_enter(&p->p_lock); + ASSERT(!PROC_IS_BRANDED(p)); p->p_brand = bp; + p->p_brand_data = brand_data; ASSERT(PROC_IS_BRANDED(p)); BROP(p)->b_setbrand(p); + mutex_exit(&p->p_lock); + return (0); } void -brand_clearbrand(proc_t *p, boolean_t no_lwps) +brand_clearbrand(proc_t *p, boolean_t lwps_ok) { brand_t *bp = p->p_zone->zone_brand; - klwp_t *lwp = NULL; - ASSERT(bp != NULL); - ASSERT(!no_lwps || (p->p_tlist == NULL)); + void *brand_data; - /* - * If called from exec_common() or proc_exit(), - * we know the process is single-threaded. - * If called from fork_fail, p_tlist is NULL. - */ - if (!no_lwps) { - ASSERT(p->p_tlist == p->p_tlist->t_forw); - lwp = p->p_tlist->t_lwp; - } + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + VERIFY(bp != NULL); + VERIFY(PROC_IS_BRANDED(p)); - ASSERT(PROC_IS_BRANDED(p)); - BROP(p)->b_proc_exit(p, lwp); + if (BROP(p)->b_clearbrand != NULL) + BROP(p)->b_clearbrand(p, lwps_ok); + + mutex_enter(&p->p_lock); p->p_brand = &native_brand; + brand_data = p->p_brand_data; + p->p_brand_data = NULL; + + if (lwps_ok) { + VERIFY(p == curproc); + /* + * A process with multiple LWPs is being de-branded after + * failing an exec. The other LWPs were held as part of the + * procedure, so they must be resumed now. + */ + if (p->p_tlist != NULL && p->p_tlist != p->p_tlist->t_forw) { + continuelwps(p); + } + } else { + /* + * While clearing the brand, it's ok for one LWP to be present. + * This happens when a native binary is executed inside a + * branded zone, since the brand will be removed during the + * course of a successful exec. + */ + VERIFY(p->p_tlist == NULL || p->p_tlist == p->p_tlist->t_forw); + } + mutex_exit(&p->p_lock); + + if (brand_data != NULL) { + kmem_free(brand_data, bp->b_data_size); + } } #if defined(__sparcv9) @@ -483,7 +554,7 @@ brand_solaris_cmd(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, return (ENOSYS); /* For all other operations this must be a branded process. 
*/ - if (p->p_brand == &native_brand) + if (!PROC_IS_BRANDED(p)) return (ENOSYS); ASSERT(p->p_brand == pbrand); @@ -601,15 +672,15 @@ restoreexecenv(struct execenv *ep, stack_t *sp) int brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file, - cred_t *cred, int brand_action, struct brand *pbrand, char *bname, - char *brandlib, char *brandlib32, char *brandlinker, char *brandlinker32) + cred_t *cred, int *brand_action, struct brand *pbrand, char *bname, + char *brandlib, char *brandlib32) { vnode_t *nvp; Ehdr ehdr; Addr uphdr_vaddr; intptr_t voffset; - int interp; + char *interp; int i, err; struct execenv env; struct execenv origenv; @@ -619,7 +690,6 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, klwp_t *lwp = ttolwp(curthread); brand_proc_data_t *spd; brand_elf_data_t sed, *sedp; - char *linker; uintptr_t lddata; /* lddata of executable's linker */ ASSERT(curproc->p_brand == pbrand); @@ -636,12 +706,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, */ if (args->to_model == DATAMODEL_NATIVE) { args->emulator = brandlib; - linker = brandlinker; } #if defined(_LP64) else { args->emulator = brandlib32; - linker = brandlinker32; } #endif /* _LP64 */ @@ -725,7 +793,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, if (args->to_model == DATAMODEL_NATIVE) { err = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr, &voffset, exec_file, &interp, &env.ex_bssbase, - &env.ex_brkbase, &env.ex_brksize, NULL); + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); } #if defined(_LP64) else { @@ -733,7 +801,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, Elf32_Addr uphdr_vaddr32; err = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32, &voffset, exec_file, &interp, &env.ex_bssbase, - &env.ex_brkbase, &env.ex_brksize, NULL); + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); Ehdr32to64(&ehdr32, &ehdr); if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -744,6 +812,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, #endif /* _LP64 */ if (err != 0) { restoreexecenv(&origenv, &orig_sigaltstack); + + if (interp != NULL) + kmem_free(interp, MAXPATHLEN); + return (err); } @@ -761,7 +833,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, sedp->sed_phent = ehdr.e_phentsize; sedp->sed_phnum = ehdr.e_phnum; - if (interp) { + if (interp != NULL) { if (ehdr.e_type == ET_DYN) { /* * This is a shared object executable, so we @@ -777,16 +849,20 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, * it in and store relevant information about it in the * aux vector, where the brand library can find it. 
*/ - if ((err = lookupname(linker, UIO_SYSSPACE, + if ((err = lookupname(interp, UIO_SYSSPACE, FOLLOW, NULLVPP, &nvp)) != 0) { - uprintf("%s: not found.", brandlinker); + uprintf("%s: not found.", interp); restoreexecenv(&origenv, &orig_sigaltstack); + kmem_free(interp, MAXPATHLEN); return (err); } + + kmem_free(interp, MAXPATHLEN); + if (args->to_model == DATAMODEL_NATIVE) { err = mapexec_brand(nvp, args, &ehdr, &uphdr_vaddr, &voffset, exec_file, &interp, - NULL, NULL, NULL, &lddata); + NULL, NULL, NULL, &lddata, NULL); } #if defined(_LP64) else { @@ -794,7 +870,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, Elf32_Addr uphdr_vaddr32; err = mapexec32_brand(nvp, args, &ehdr32, &uphdr_vaddr32, &voffset, exec_file, &interp, - NULL, NULL, NULL, &lddata); + NULL, NULL, NULL, &lddata, NULL); Ehdr32to64(&ehdr32, &ehdr); if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -934,9 +1010,9 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, /* * Third, the /proc aux vectors set up by elfexec() point to - * brand emulation library and it's linker. Copy these to the + * brand emulation library and its linker. Copy these to the * /proc brand specific aux vector, and update the regular - * /proc aux vectors to point to the executable (and it's + * /proc aux vectors to point to the executable (and its * linker). This will enable debuggers to access the * executable via the usual /proc or elf notes aux vectors. * @@ -1078,55 +1154,31 @@ brand_solaris_freelwp(klwp_t *l, struct brand *pbrand) } /*ARGSUSED*/ -int +void brand_solaris_initlwp(klwp_t *l, struct brand *pbrand) { ASSERT(l->lwp_procp->p_brand == pbrand); ASSERT(l->lwp_procp->p_brand_data != NULL); ASSERT(l->lwp_brand == NULL); l->lwp_brand = (void *)-1; - return (0); } /*ARGSUSED*/ void brand_solaris_lwpexit(klwp_t *l, struct brand *pbrand) { - proc_t *p = l->lwp_procp; - ASSERT(l->lwp_procp->p_brand == pbrand); ASSERT(l->lwp_procp->p_brand_data != NULL); ASSERT(l->lwp_brand != NULL); - - /* - * We should never be called for the last thread in a process. - * (That case is handled by brand_solaris_proc_exit().) - * Therefore this lwp must be exiting from a multi-threaded - * process. - */ - ASSERT(p->p_tlist != p->p_tlist->t_forw); - - l->lwp_brand = NULL; } /*ARGSUSED*/ void -brand_solaris_proc_exit(struct proc *p, klwp_t *l, struct brand *pbrand) +brand_solaris_proc_exit(struct proc *p, struct brand *pbrand) { ASSERT(p->p_brand == pbrand); ASSERT(p->p_brand_data != NULL); - /* - * When called from proc_exit(), we know that process is - * single-threaded and free our lwp brand data. - * otherwise just free p_brand_data and return. - */ - if (l != NULL) { - ASSERT(p->p_tlist == p->p_tlist->t_forw); - ASSERT(p->p_tlist->t_lwp == l); - (void) brand_solaris_freelwp(l, pbrand); - } - /* upon exit, free our proc brand data */ kmem_free(p->p_brand_data, sizeof (brand_proc_data_t)); p->p_brand_data = NULL; @@ -1145,5 +1197,4 @@ brand_solaris_setbrand(proc_t *p, struct brand *pbrand) ASSERT(p->p_tlist == p->p_tlist->t_forw); p->p_brand_data = kmem_zalloc(sizeof (brand_proc_data_t), KM_SLEEP); - (void) brand_solaris_initlwp(p->p_tlist->t_lwp, pbrand); } diff --git a/usr/src/uts/common/os/clock_highres.c b/usr/src/uts/common/os/clock_highres.c index 805813037d..1280c8a1b6 100644 --- a/usr/src/uts/common/os/clock_highres.c +++ b/usr/src/uts/common/os/clock_highres.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2015, Joyent Inc. All rights reserved. + * Copyright 2016, Joyent Inc. 
*/ #include <sys/timer.h> @@ -41,6 +41,9 @@ static clock_backend_t clock_highres; +/* minimum non-privileged interval (200us) */ +long clock_highres_interval_min = 200000; + /*ARGSUSED*/ static int clock_highres_settime(timespec_t *ts) @@ -68,17 +71,6 @@ clock_highres_getres(timespec_t *ts) static int clock_highres_timer_create(itimer_t *it, void (*fire)(itimer_t *)) { - /* - * CLOCK_HIGHRES timers of sufficiently high resolution can deny - * service; only allow privileged users to create such timers. - * Sites that do not wish to have this restriction should - * give users the "proc_clock_highres" privilege. - */ - if (secpolicy_clock_highres(CRED()) != 0) { - it->it_arg = NULL; - return (EPERM); - } - it->it_arg = kmem_zalloc(sizeof (cyclic_id_t), KM_SLEEP); it->it_fire = fire; @@ -111,6 +103,49 @@ clock_highres_timer_settime(itimer_t *it, int flags, cpu_t *cpu; cpupart_t *cpupart; int pset; + boolean_t value_need_clamp = B_FALSE; + boolean_t intval_need_clamp = B_FALSE; + cred_t *cr = CRED(); + struct itimerspec clamped; + + /* + * CLOCK_HIGHRES timers of sufficiently high resolution can deny + * service; only allow privileged users to create such timers. + * Non-privileged users (those without the "proc_clock_highres" + * privilege) can create timers with lower resolution but if they + * attempt to use a very low time value (< 200us) then their + * timer will be clamped at 200us. + */ + if (when->it_value.tv_sec == 0 && + when->it_value.tv_nsec > 0 && + when->it_value.tv_nsec < clock_highres_interval_min) + value_need_clamp = B_TRUE; + + if (when->it_interval.tv_sec == 0 && + when->it_interval.tv_nsec > 0 && + when->it_interval.tv_nsec < clock_highres_interval_min) + intval_need_clamp = B_TRUE; + + if ((value_need_clamp || intval_need_clamp) && + secpolicy_clock_highres(cr) != 0) { + clamped.it_value.tv_sec = when->it_value.tv_sec; + clamped.it_interval.tv_sec = when->it_interval.tv_sec; + + if (value_need_clamp) { + clamped.it_value.tv_nsec = clock_highres_interval_min; + } else { + clamped.it_value.tv_nsec = when->it_value.tv_nsec; + } + + if (intval_need_clamp) { + clamped.it_interval.tv_nsec = + clock_highres_interval_min; + } else { + clamped.it_interval.tv_nsec = when->it_interval.tv_nsec; + } + + when = &clamped; + } cyctime.cyt_when = ts2hrt(&when->it_value); cyctime.cyt_interval = ts2hrt(&when->it_interval); diff --git a/usr/src/uts/common/os/contract.c b/usr/src/uts/common/os/contract.c index 909a6c2860..1a3502a710 100644 --- a/usr/src/uts/common/os/contract.c +++ b/usr/src/uts/common/os/contract.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ /* * Copyright (c) 2017 by Delphix. All rights reserved. @@ -290,7 +291,10 @@ contract_ctor(contract_t *ct, ct_type_t *type, ct_template_t *tmpl, void *data, avl_index_t where; klwp_t *curlwp = ttolwp(curthread); - ASSERT(author == curproc); + /* + * It's possible that author is not curproc if the zone is creating + * a new process as a child of zsched. + */ mutex_init(&ct->ct_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ct->ct_reflock, NULL, MUTEX_DEFAULT, NULL); diff --git a/usr/src/uts/common/os/core.c b/usr/src/uts/common/os/core.c index d5e272c16a..437f26e6e0 100644 --- a/usr/src/uts/common/os/core.c +++ b/usr/src/uts/common/os/core.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent Inc. All rights reserved. + * Copyright 2016, Joyent Inc. 
* Copyright (c) 2016 by Delphix. All rights reserved. */ @@ -125,6 +125,7 @@ remove_core_file(char *fp, enum core_types core_type) /* * Determine what rootvp to use. */ + mutex_enter(&curproc->p_lock); if (core_type == CORE_PROC) { rootvp = (PTOU(curproc)->u_rdir == NULL ? curproc->p_zone->zone_rootvp : PTOU(curproc)->u_rdir); @@ -140,6 +141,7 @@ remove_core_file(char *fp, enum core_types core_type) VN_HOLD(startvp); if (rootvp != rootdir) VN_HOLD(rootvp); + mutex_exit(&curproc->p_lock); if ((error = lookuppnvp(&pn, NULL, NO_FOLLOW, &dvp, &vp, rootvp, startvp, CRED())) != 0) { pn_free(&pn); diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c index 87c0896814..4648dae9dd 100644 --- a/usr/src/uts/common/os/cpu.c +++ b/usr/src/uts/common/os/cpu.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* @@ -108,7 +109,8 @@ kmutex_t cpu_lock; cpu_t *cpu_list; /* list of all CPUs */ cpu_t *clock_cpu_list; /* used by clock to walk CPUs */ cpu_t *cpu_active; /* list of active CPUs */ -static cpuset_t cpu_available; /* set of available CPUs */ +cpuset_t cpu_active_set; /* cached set of active CPUs */ +cpuset_t cpu_available; /* set of available CPUs */ cpuset_t cpu_seqid_inuse; /* which cpu_seqids are in use */ cpu_t **cpu_seq; /* ptrs to CPUs, indexed by seq_id */ @@ -386,36 +388,56 @@ force_thread_migrate(kthread_id_t tp) /* * Set affinity for a specified CPU. - * A reference count is incremented and the affinity is held until the - * reference count is decremented to zero by thread_affinity_clear(). - * This is so regions of code requiring affinity can be nested. - * Caller needs to ensure that cpu_id remains valid, which can be - * done by holding cpu_lock across this call, unless the caller - * specifies CPU_CURRENT in which case the cpu_lock will be acquired - * by thread_affinity_set and CPU->cpu_id will be the target CPU. + * + * Specifying a cpu_id of CPU_CURRENT, allowed _only_ when setting affinity for + * curthread, will set affinity to the CPU on which the thread is currently + * running. For other cpu_id values, the caller must ensure that the + * referenced CPU remains valid, which can be done by holding cpu_lock across + * this call. + * + * CPU affinity is guaranteed after return of thread_affinity_set(). If a + * caller setting affinity to CPU_CURRENT requires that its thread not migrate + * CPUs prior to a successful return, it should take extra precautions (such as + * their own call to kpreempt_disable) to ensure that safety. + * + * CPU_BEST can be used to pick a "best" CPU to migrate to, including + * potentially the current CPU. + * + * A CPU affinity reference count is maintained by thread_affinity_set and + * thread_affinity_clear (incrementing and decrementing it, respectively), + * maintaining CPU affinity while the count is non-zero, and allowing regions + * of code which require affinity to be nested. 
*/ void thread_affinity_set(kthread_id_t t, int cpu_id) { - cpu_t *cp; - int c; + cpu_t *cp; ASSERT(!(t == curthread && t->t_weakbound_cpu != NULL)); - if ((c = cpu_id) == CPU_CURRENT) { - mutex_enter(&cpu_lock); - cpu_id = CPU->cpu_id; + if (cpu_id == CPU_CURRENT) { + VERIFY3P(t, ==, curthread); + kpreempt_disable(); + cp = CPU; + } else if (cpu_id == CPU_BEST) { + VERIFY3P(t, ==, curthread); + kpreempt_disable(); + cp = disp_choose_best_cpu(); + } else { + /* + * We should be asserting that cpu_lock is held here, but + * the NCA code doesn't acquire it. The following assert + * should be uncommented when the NCA code is fixed. + * + * ASSERT(MUTEX_HELD(&cpu_lock)); + */ + VERIFY((cpu_id >= 0) && (cpu_id < NCPU)); + cp = cpu[cpu_id]; + + /* user must provide a good cpu_id */ + VERIFY(cp != NULL); } - /* - * We should be asserting that cpu_lock is held here, but - * the NCA code doesn't acquire it. The following assert - * should be uncommented when the NCA code is fixed. - * - * ASSERT(MUTEX_HELD(&cpu_lock)); - */ - ASSERT((cpu_id >= 0) && (cpu_id < NCPU)); - cp = cpu[cpu_id]; - ASSERT(cp != NULL); /* user must provide a good cpu_id */ + /* * If there is already a hard affinity requested, and this affinity * conflicts with that, panic. @@ -432,13 +454,14 @@ thread_affinity_set(kthread_id_t t, int cpu_id) * Make sure we're running on the right CPU. */ if (cp != t->t_cpu || t != curthread) { + ASSERT(cpu_id != CPU_CURRENT); force_thread_migrate(t); /* drops thread lock */ } else { thread_unlock(t); } - if (c == CPU_CURRENT) - mutex_exit(&cpu_lock); + if (cpu_id == CPU_CURRENT || cpu_id == CPU_BEST) + kpreempt_enable(); } /* @@ -1473,8 +1496,8 @@ again: for (loop_count = 0; (*bound_func)(cp, 0); loop_count++) { * Update CPU last ran on if it was this CPU */ if (t->t_cpu == cp && t->t_bound_cpu != cp) - t->t_cpu = disp_lowpri_cpu(ncp, - t->t_lpl, t->t_pri, NULL); + t->t_cpu = disp_lowpri_cpu(ncp, t, + t->t_pri); ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp || t->t_weakbound_cpu == cp); @@ -1516,10 +1539,9 @@ again: for (loop_count = 0; (*bound_func)(cp, 0); loop_count++) { * Update CPU last ran on if it was this CPU */ - if (t->t_cpu == cp && t->t_bound_cpu != cp) { - t->t_cpu = disp_lowpri_cpu(ncp, - t->t_lpl, t->t_pri, NULL); - } + if (t->t_cpu == cp && t->t_bound_cpu != cp) + t->t_cpu = disp_lowpri_cpu(ncp, t, t->t_pri); + ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp || t->t_weakbound_cpu == cp); t = t->t_next; @@ -1724,6 +1746,7 @@ cpu_list_init(cpu_t *cp) cp->cpu_part = &cp_default; CPUSET_ADD(cpu_available, cp->cpu_id); + CPUSET_ADD(cpu_active_set, cp->cpu_id); } /* @@ -1895,6 +1918,7 @@ cpu_add_active_internal(cpu_t *cp) cp->cpu_prev_onln = cpu_active->cpu_prev_onln; cpu_active->cpu_prev_onln->cpu_next_onln = cp; cpu_active->cpu_prev_onln = cp; + CPUSET_ADD(cpu_active_set, cp->cpu_id); if (pp->cp_cpulist) { cp->cpu_next_part = pp->cp_cpulist; @@ -1965,6 +1989,7 @@ cpu_remove_active(cpu_t *cp) } cp->cpu_next_onln = cp; cp->cpu_prev_onln = cp; + CPUSET_DEL(cpu_active_set, cp->cpu_id); cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part; cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part; @@ -2704,13 +2729,18 @@ cpu_bind_thread(kthread_id_t tp, processorid_t bind, processorid_t *obind, return (0); } -#if CPUSET_WORDS > 1 -/* - * Functions for implementing cpuset operations when a cpuset is more - * than one word. On platforms where a cpuset is a single word these - * are implemented as macros in cpuvar.h. 
- */ +cpuset_t * +cpuset_alloc(int kmflags) +{ + return (kmem_alloc(sizeof (cpuset_t), kmflags)); +} + +void +cpuset_free(cpuset_t *s) +{ + kmem_free(s, sizeof (cpuset_t)); +} void cpuset_all(cpuset_t *s) @@ -2722,38 +2752,61 @@ cpuset_all(cpuset_t *s) } void -cpuset_all_but(cpuset_t *s, uint_t cpu) +cpuset_all_but(cpuset_t *s, const uint_t cpu) { cpuset_all(s); CPUSET_DEL(*s, cpu); } void -cpuset_only(cpuset_t *s, uint_t cpu) +cpuset_only(cpuset_t *s, const uint_t cpu) { CPUSET_ZERO(*s); CPUSET_ADD(*s, cpu); } +long +cpu_in_set(cpuset_t *s, const uint_t cpu) +{ + VERIFY(cpu < NCPU); + return (BT_TEST(s->cpub, cpu)); +} + +void +cpuset_add(cpuset_t *s, const uint_t cpu) +{ + VERIFY(cpu < NCPU); + BT_SET(s->cpub, cpu); +} + +void +cpuset_del(cpuset_t *s, const uint_t cpu) +{ + VERIFY(cpu < NCPU); + BT_CLEAR(s->cpub, cpu); +} + int cpuset_isnull(cpuset_t *s) { int i; - for (i = 0; i < CPUSET_WORDS; i++) + for (i = 0; i < CPUSET_WORDS; i++) { if (s->cpub[i] != 0) return (0); + } return (1); } int -cpuset_cmp(cpuset_t *s1, cpuset_t *s2) +cpuset_isequal(cpuset_t *s1, cpuset_t *s2) { int i; - for (i = 0; i < CPUSET_WORDS; i++) + for (i = 0; i < CPUSET_WORDS; i++) { if (s1->cpub[i] != s2->cpub[i]) return (0); + } return (1); } @@ -2822,7 +2875,72 @@ cpuset_bounds(cpuset_t *s, uint_t *smallestid, uint_t *largestid) *smallestid = *largestid = CPUSET_NOTINSET; } -#endif /* CPUSET_WORDS */ +void +cpuset_atomic_del(cpuset_t *s, const uint_t cpu) +{ + VERIFY(cpu < NCPU); + BT_ATOMIC_CLEAR(s->cpub, (cpu)) +} + +void +cpuset_atomic_add(cpuset_t *s, const uint_t cpu) +{ + VERIFY(cpu < NCPU); + BT_ATOMIC_SET(s->cpub, (cpu)) +} + +long +cpuset_atomic_xadd(cpuset_t *s, const uint_t cpu) +{ + long res; + + VERIFY(cpu < NCPU); + BT_ATOMIC_SET_EXCL(s->cpub, cpu, res); + return (res); +} + +long +cpuset_atomic_xdel(cpuset_t *s, const uint_t cpu) +{ + long res; + + VERIFY(cpu < NCPU); + BT_ATOMIC_CLEAR_EXCL(s->cpub, cpu, res); + return (res); +} + +void +cpuset_or(cpuset_t *dst, cpuset_t *src) +{ + for (int i = 0; i < CPUSET_WORDS; i++) { + dst->cpub[i] |= src->cpub[i]; + } +} + +void +cpuset_xor(cpuset_t *dst, cpuset_t *src) +{ + for (int i = 0; i < CPUSET_WORDS; i++) { + dst->cpub[i] ^= src->cpub[i]; + } +} + +void +cpuset_and(cpuset_t *dst, cpuset_t *src) +{ + for (int i = 0; i < CPUSET_WORDS; i++) { + dst->cpub[i] &= src->cpub[i]; + } +} + +void +cpuset_zero(cpuset_t *dst) +{ + for (int i = 0; i < CPUSET_WORDS; i++) { + dst->cpub[i] = 0; + } +} + /* * Unbind threads bound to specified CPU. diff --git a/usr/src/uts/common/os/cred.c b/usr/src/uts/common/os/cred.c index 25727d54c5..0bd6cfd44f 100644 --- a/usr/src/uts/common/os/cred.c +++ b/usr/src/uts/common/os/cred.c @@ -729,6 +729,14 @@ crgetzoneid(const cred_t *cr) cr->cr_zone->zone_id); } +zoneid_t +crgetzonedid(const cred_t *cr) +{ + return (cr->cr_zone == NULL ? + (cr->cr_uid == -1 ? (zoneid_t)-1 : GLOBAL_ZONEID) : + cr->cr_zone->zone_did); +} + projid_t crgetprojid(const cred_t *cr) { diff --git a/usr/src/uts/common/os/cyclic.c b/usr/src/uts/common/os/cyclic.c index 21907b4957..45e13ebeab 100644 --- a/usr/src/uts/common/os/cyclic.c +++ b/usr/src/uts/common/os/cyclic.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, Joyent Inc. All rights reserved. + * Copyright 2018 Joyent Inc. 
*/ /* @@ -112,6 +112,7 @@ * cyclic_remove() <-- Removes a cyclic * cyclic_bind() <-- Change a cyclic's CPU or partition binding * cyclic_reprogram() <-- Reprogram a cyclic's expiration + * cyclic_move_here() <-- Shuffle cyclic to current CPU * * Inter-subsystem Interfaces * @@ -3111,6 +3112,61 @@ cyclic_reprogram(cyclic_id_t id, hrtime_t expiration) return (1); } +/* + * void cyclic_move_here(cyclic_id_t) + * + * Overview + * + * cyclic_move_here() attempts to shuffle a cyclic onto the current CPU. + * + * Arguments and notes + * + * The first argument is a cyclic_id returned from cyclic_add(). + * cyclic_move_here() may _not_ be called on a cyclic_id returned from + * cyclic_add_omni() or one bound to a CPU or partition via cyclic_bind(). + * + * This cyclic shuffling is performed on a best-effort basis. If for some + * reason the current CPU is unsuitable or the thread migrates between CPUs + * during the call, the function may return with the cyclic residing on some + * other CPU. + * + * Return value + * + * None; cyclic_move_here() always reports success. + * + * Caller's context + * + * cpu_lock must be held by the caller, and the caller must not be in + * interrupt context. The caller may not hold any locks which are also + * grabbed by any cyclic handler. + */ +void +cyclic_move_here(cyclic_id_t id) +{ + cyc_id_t *idp = (cyc_id_t *)id; + cyc_cpu_t *cc = idp->cyi_cpu; + cpu_t *dest = CPU; + + ASSERT(MUTEX_HELD(&cpu_lock)); + CYC_PTRACE("move_here", idp, dest); + VERIFY3P(cc, !=, NULL); + VERIFY3U(cc->cyp_cyclics[idp->cyi_ndx].cy_flags & + (CYF_CPU_BOUND|CYF_PART_BOUND), ==, 0); + + if (cc->cyp_cpu == dest) { + return; + } + + /* Is the destination CPU suitable for a migration target? */ + if (dest->cpu_cyclic == NULL || + dest->cpu_cyclic->cyp_state == CYS_OFFLINE || + (dest->cpu_flags & CPU_ENABLE) == 0) { + return; + } + + cyclic_juggle_one_to(idp, dest->cpu_cyclic); +} + hrtime_t cyclic_getres() { diff --git a/usr/src/uts/common/os/ddi_intr_irm.c b/usr/src/uts/common/os/ddi_intr_irm.c index c3c0481e7f..a4b35dcb5b 100644 --- a/usr/src/uts/common/os/ddi_intr_irm.c +++ b/usr/src/uts/common/os/ddi_intr_irm.c @@ -1320,7 +1320,7 @@ i_ddi_irm_notify(ddi_irm_pool_t *pool_p, ddi_irm_req_t *req_p) /* Log callback errors */ if (ret != DDI_SUCCESS) { - cmn_err(CE_WARN, "%s%d: failed callback (action=%d, ret=%d)\n", + cmn_err(CE_WARN, "!%s%d: failed callback (action=%d, ret=%d)\n", ddi_driver_name(req_p->ireq_dip), ddi_get_instance(req_p->ireq_dip), (int)action, ret); } diff --git a/usr/src/uts/common/os/exec.c b/usr/src/uts/common/os/exec.c index 53c552f135..96b6081489 100644 --- a/usr/src/uts/common/os/exec.c +++ b/usr/src/uts/common/os/exec.c @@ -99,6 +99,7 @@ uint_t auxv_hwcap32_2 = 0; /* 32-bit version of auxv_hwcap2 */ #endif #define PSUIDFLAGS (SNOCD|SUGID) +#define RANDOM_LEN 16 /* 16 bytes for AT_RANDOM aux entry */ /* * These are consumed within the specific exec modules, but are defined here @@ -265,8 +266,10 @@ exec_common(const char *fname, const char **argp, const char **envp, * only if the pathname does not contain a "/" the resolved path * points to a file in the current working (attribute) directory. 
*/ - if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 && + mutex_enter(&p->p_lock); + if ((PTOU(p)->u_cdir->v_flag & V_XATTRDIR) != 0 && strchr(resolvepn.pn_path, '/') == NULL) { + mutex_exit(&p->p_lock); if (dir != NULL) VN_RELE(dir); error = EACCES; @@ -275,6 +278,7 @@ exec_common(const char *fname, const char **argp, const char **envp, VN_RELE(vp); goto out; } + mutex_exit(&p->p_lock); bzero(exec_file, MAXCOMLEN+1); (void) strncpy(exec_file, pn.pn_path, MAXCOMLEN); @@ -322,14 +326,43 @@ exec_common(const char *fname, const char **argp, const char **envp, ua.argp = argp; ua.envp = envp; - /* If necessary, brand this process before we start the exec. */ - if (brandme) - brand_setbrand(p); + /* If necessary, brand this process/lwp before we start the exec. */ + if (brandme) { + void *brand_data = NULL; + + /* + * Process branding may fail if multiple LWPs are present and + * holdlwps() cannot complete successfully. + */ + error = brand_setbrand(p, B_TRUE); + + if (error == 0 && BROP(p)->b_lwpdata_alloc != NULL) { + brand_data = BROP(p)->b_lwpdata_alloc(p); + if (brand_data == NULL) { + error = 1; + } + } + + if (error == 0) { + mutex_enter(&p->p_lock); + BROP(p)->b_initlwp(lwp, brand_data); + mutex_exit(&p->p_lock); + } else { + VN_RELE(vp); + if (dir != NULL) { + VN_RELE(dir); + } + pn_free(&resolvepn); + goto fail; + } + } if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz, - exec_file, p->p_cred, brand_action)) != 0) { - if (brandme) - brand_clearbrand(p, B_FALSE); + exec_file, p->p_cred, &brand_action)) != 0) { + if (brandme) { + BROP(p)->b_freelwp(lwp); + brand_clearbrand(p, B_TRUE); + } VN_RELE(vp); if (dir != NULL) VN_RELE(dir); @@ -361,7 +394,7 @@ exec_common(const char *fname, const char **argp, const char **envp, /* * Clear contract template state */ - lwp_ctmpl_clear(lwp); + lwp_ctmpl_clear(lwp, B_TRUE); /* * Save the directory in which we found the executable for expanding @@ -385,6 +418,8 @@ exec_common(const char *fname, const char **argp, const char **envp, * pending held signals remain held, so don't clear t_hold. */ mutex_enter(&p->p_lock); + DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp, + uintptr_t, lwp->lwp_oldcontext, uintptr_t, 0); lwp->lwp_oldcontext = 0; lwp->lwp_ustack = 0; lwp->lwp_old_stk_ctl = 0; @@ -444,8 +479,10 @@ exec_common(const char *fname, const char **argp, const char **envp, TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up); /* Unbrand ourself if necessary. */ - if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) + if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) { + BROP(p)->b_freelwp(lwp); brand_clearbrand(p, B_FALSE); + } setregs(&args); @@ -569,7 +606,7 @@ gexec( long *execsz, caddr_t exec_file, struct cred *cred, - int brand_action) + int *brand_action) { struct vnode *vp, *execvp = NULL; proc_t *pp = ttoproc(curthread); @@ -890,8 +927,14 @@ gexec( if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE)) args->traceinval = 1; } - if (pp->p_proc_flag & P_PR_PTRACE) + + /* + * If legacy ptrace is enabled, generate the SIGTRAP. + */ + if (pp->p_proc_flag & P_PR_PTRACE) { psignal(pp, SIGTRAP); + } + if (args->traceinval) prinvalidate(&pp->p_user); } @@ -1555,6 +1598,27 @@ stk_add(uarg_t *args, const char *sp, enum uio_seg segflg) return (0); } +/* + * Add a fixed size byte array to the stack (only from kernel space). 
+ */ +static int +stk_byte_add(uarg_t *args, const uint8_t *sp, size_t len) +{ + int error; + + if (STK_AVAIL(args) < sizeof (int)) + return (E2BIG); + *--args->stk_offp = args->stk_strp - args->stk_base; + + if (len > STK_AVAIL(args)) + return (E2BIG); + bcopy(sp, args->stk_strp, len); + + args->stk_strp += len; + + return (0); +} + static int stk_getptr(uarg_t *args, char *src, char **dst) { @@ -1591,6 +1655,7 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) size_t size, pad; char *argv = (char *)uap->argp; char *envp = (char *)uap->envp; + uint8_t rdata[RANDOM_LEN]; /* * Copy interpreter's name and argument to argv[0] and argv[1]. @@ -1673,8 +1738,9 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) args->ne = args->na - argc; /* - * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and - * AT_SUN_EMULATOR strings to the stack. + * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, + * AT_SUN_BRAND_NROOT, and AT_SUN_EMULATOR strings, as well as AT_RANDOM + * array, to the stack. */ if (auxvpp != NULL && *auxvpp != NULL) { if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0) @@ -1687,6 +1753,20 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) if (args->emulator != NULL && (error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0) return (error); + + /* + * For the AT_RANDOM aux vector we provide 16 bytes of random + * data. + */ + (void) random_get_pseudo_bytes(rdata, sizeof (rdata)); + + if ((error = stk_byte_add(args, rdata, sizeof (rdata))) != 0) + return (error); + + if (args->brand_nroot != NULL && + (error = stk_add(args, args->brand_nroot, + UIO_SYSSPACE)) != 0) + return (error); } /* @@ -1793,7 +1873,7 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) /* * Fill in the aux vector now that we know the user stack addresses * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and - * AT_SUN_EMULATOR strings. + * AT_SUN_EMULATOR strings, as well as the AT_RANDOM array. */ if (auxvpp != NULL && *auxvpp != NULL) { if (args->to_model == DATAMODEL_NATIVE) { @@ -1806,6 +1886,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) if (args->emulator != NULL) ADDAUX(*a, AT_SUN_EMULATOR, (long)&ustrp[*--offp]) + ADDAUX(*a, AT_RANDOM, (long)&ustrp[*--offp]) + if (args->brand_nroot != NULL) { + ADDAUX(*a, + AT_SUN_BRAND_NROOT, (long)&ustrp[*--offp]) + } } else { auxv32_t **a = (auxv32_t **)auxvpp; ADDAUX(*a, @@ -1818,6 +1903,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) if (args->emulator != NULL) ADDAUX(*a, AT_SUN_EMULATOR, (int)(uintptr_t)&ustrp[*--offp]) + ADDAUX(*a, AT_RANDOM, (int)(uintptr_t)&ustrp[*--offp]) + if (args->brand_nroot != NULL) { + ADDAUX(*a, AT_SUN_BRAND_NROOT, + (int)(uintptr_t)&ustrp[*--offp]) + } } } @@ -1961,6 +2051,9 @@ exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) usrstack = (char *)USRSTACK32; } + if (args->maxstack != 0 && (uintptr_t)usrstack > args->maxstack) + usrstack = (char *)args->maxstack; + ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0); #if defined(__sparc) diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c index 1b9359da47..06e0117cd6 100644 --- a/usr/src/uts/common/os/exit.c +++ b/usr/src/uts/common/os/exit.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -138,6 +138,27 @@ rexit(int rval) } /* + * Bump the init_restarts kstat and let interested parties know about the + * restart. + */ +static void +restart_init_notify(zone_t *zone) +{ + nvlist_t *nvl = NULL; + + zone->zone_proc_init_restarts++; + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0 && + nvlist_add_uint32(nvl, ZONE_CB_RESTARTS, + zone->zone_proc_init_restarts) == 0) { + zone_sysevent_publish(zone, ZONE_EVENT_INIT_CLASS, + ZONE_EVENT_INIT_RESTART_SC, nvl); + } + + nvlist_free(nvl); +} + +/* * Called by proc_exit() when a zone's init exits, presumably because * it failed. As long as the given zone is still in the "running" * state, we will re-exec() init, but first we need to reset things @@ -230,7 +251,7 @@ restart_init(int what, int why) siginfofree(lwp->lwp_curinfo); lwp->lwp_curinfo = NULL; } - lwp_ctmpl_clear(lwp); + lwp_ctmpl_clear(lwp, B_FALSE); /* * Reset both the process root directory and the current working @@ -260,6 +281,8 @@ restart_init(int what, int why) ASSERT(p == curproc); (void) freectty(B_TRUE); + restart_init_notify(p->p_zone); + /* * Now exec() the new init(1M) on top of the current process. If we * succeed, the caller will treat this like a successful system call. @@ -320,6 +343,119 @@ proc_is_exiting(proc_t *p) } /* + * Return true if zone's init is restarted, false if exit processing should + * proceeed. + */ +static boolean_t +zone_init_exit(zone_t *z, int why, int what) +{ + /* + * Typically we don't let the zone's init exit unless zone_start_init() + * failed its exec, or we are shutting down the zone or the machine, + * although the various flags handled within this function will control + * the behavior. + * + * Since we are single threaded, we don't need to lock the following + * accesses to zone_proc_initpid. + */ + if (z->zone_boot_err != 0 || + zone_status_get(z) >= ZONE_IS_SHUTTING_DOWN || + zone_status_get(global_zone) >= ZONE_IS_SHUTTING_DOWN) { + /* + * Clear the zone's init pid and proceed with exit processing. + */ + z->zone_proc_initpid = -1; + return (B_FALSE); + } + + /* + * There are a variety of configuration flags on the zone to control + * init exit behavior. + * + * If the init process should be restarted, the "zone_restart_init" + * member will be set. + */ + if (!z->zone_restart_init) { + /* + * The zone has been setup to halt when init exits. + */ + z->zone_init_status = wstat(why, what); + (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, zone_kcred()); + z->zone_proc_initpid = -1; + return (B_FALSE); + } + + /* + * At this point we know we're configured to restart init, but there + * are various modifiers to that behavior. + */ + + if (z->zone_reboot_on_init_exit) { + /* + * Some init programs in branded zones do not tolerate a + * restart in the traditional manner; setting + * "zone_reboot_on_init_exit" will cause the entire zone to be + * rebooted instead. + */ + + if (z->zone_restart_init_0) { + /* + * Some init programs in branded zones only want to + * restart if they exit 0, otherwise the zone should + * shutdown. Setting the "zone_restart_init_0" member + * controls this behavior. 
+ */ + if (why == CLD_EXITED && what == 0) { + /* Trigger a zone reboot */ + (void) zone_kadmin(A_REBOOT, 0, NULL, + zone_kcred()); + } else { + /* Shutdown instead of reboot */ + (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, + zone_kcred()); + } + } else { + /* Trigger a zone reboot */ + (void) zone_kadmin(A_REBOOT, 0, NULL, zone_kcred()); + } + + z->zone_init_status = wstat(why, what); + z->zone_proc_initpid = -1; + return (B_FALSE); + } + + if (z->zone_restart_init_0) { + /* + * Some init programs in branded zones only want to restart if + * they exit 0, otherwise the zone should shutdown. Setting the + * "zone_restart_init_0" member controls this behavior. + * + * In this case we only restart init if it exited successfully. + */ + if (why == CLD_EXITED && what == 0 && + restart_init(what, why) == 0) { + return (B_TRUE); + } + } else { + /* + * No restart modifiers on the zone, attempt to restart init. + */ + if (restart_init(what, why) == 0) { + return (B_TRUE); + } + } + + + /* + * The restart failed, the zone will shut down. + */ + z->zone_init_status = wstat(why, what); + (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, zone_kcred()); + z->zone_proc_initpid = -1; + return (B_FALSE); +} + +/* * Return value: * 1 - exitlwps() failed, call (or continue) lwp_exit() * 0 - restarting init. Return through system call path @@ -366,45 +502,36 @@ proc_exit(int why, int what) } mutex_exit(&p->p_lock); - DTRACE_PROC(lwp__exit); - DTRACE_PROC1(exit, int, why); + if (p->p_pid == z->zone_proc_initpid) { + /* If zone's init restarts, we're done here. */ + if (zone_init_exit(z, why, what)) + return (0); + } /* - * Will perform any brand specific proc exit processing, since this - * is always the last lwp, will also perform lwp_exit and free brand - * data + * Delay firing probes (and performing brand cleanup) until after the + * zone_proc_initpid check. Cases which result in zone shutdown or + * restart via zone_kadmin eventually result in a call back to + * proc_exit. */ - if (PROC_IS_BRANDED(p)) { - lwp_detach_brand_hdlrs(lwp); - brand_clearbrand(p, B_FALSE); - } + DTRACE_PROC(lwp__exit); + DTRACE_PROC1(exit, int, why); /* - * Don't let init exit unless zone_start_init() failed its exec, or - * we are shutting down the zone or the machine. - * - * Since we are single threaded, we don't need to lock the - * following accesses to zone_proc_initpid. + * Will perform any brand specific proc exit processing. Since this + * is always the last lwp, will also perform lwp exit/free and proc + * exit. Brand data will be freed when the process is reaped. */ - if (p->p_pid == z->zone_proc_initpid) { - if (z->zone_boot_err == 0 && - zone_status_get(z) < ZONE_IS_SHUTTING_DOWN && - zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN) { - if (z->zone_restart_init == B_TRUE) { - if (restart_init(what, why) == 0) - return (0); - } else { - (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, - CRED()); - } - } - + if (PROC_IS_BRANDED(p)) { + BROP(p)->b_lwpexit(lwp); + BROP(p)->b_proc_exit(p); /* - * Since we didn't or couldn't restart init, we clear - * the zone's init state and proceed with exit - * processing. + * To ensure that b_proc_exit has access to brand-specific data + * contained by the one remaining lwp, call the freelwp hook as + * the last part of this clean-up process. 
*/ - z->zone_proc_initpid = -1; + BROP(p)->b_freelwp(lwp); + lwp_detach_brand_hdlrs(lwp); } lwp_pcb_exit(); @@ -565,7 +692,7 @@ proc_exit(int why, int what) semexit(p); rv = wstat(why, what); - acct(rv & 0xff); + acct(rv); exacct_commit_proc(p, rv); /* @@ -658,10 +785,22 @@ proc_exit(int why, int what) if ((q = p->p_child) != NULL && p != proc_init) { struct proc *np; struct proc *initp = proc_init; + pid_t zone_initpid = 1; + struct proc *zoneinitp = NULL; boolean_t setzonetop = B_FALSE; - if (!INGLOBALZONE(curproc)) - setzonetop = B_TRUE; + if (!INGLOBALZONE(curproc)) { + zone_initpid = curproc->p_zone->zone_proc_initpid; + + ASSERT(MUTEX_HELD(&pidlock)); + zoneinitp = prfind(zone_initpid); + if (zoneinitp != NULL) { + initp = zoneinitp; + } else { + zone_initpid = 1; + setzonetop = B_TRUE; + } + } pgdetach(p); @@ -673,7 +812,8 @@ proc_exit(int why, int what) */ delete_ns(q->p_parent, q); - q->p_ppid = 1; + q->p_ppid = zone_initpid; + q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID); if (setzonetop) { mutex_enter(&q->p_lock); @@ -847,8 +987,50 @@ proc_exit(int why, int what) mutex_exit(&p->p_lock); if (!evaporate) { - p->p_pidflag &= ~CLDPEND; - sigcld(p, sqp); + /* + * The brand specific code only happens when the brand has a + * function to call in place of sigcld and the parent of the + * exiting process is not the global zone init. If the parent + * is the global zone init, then the process was reparented, + * and we don't want brand code delivering possibly strange + * signals to init. Also, init is not branded, so any brand + * specific exit data will not be picked up by init anyway. + */ + if (PROC_IS_BRANDED(p) && + BROP(p)->b_exit_with_sig != NULL && + p->p_ppid != 1) { + /* + * The code for _fini that could unload the brand_t + * blocks until the count of zones using the module + * reaches zero. Zones decrement the refcount on their + * brands only after all user tasks in that zone have + * exited and been waited on. The decrement on the + * brand's refcount happen in zone_destroy(). That + * depends on zone_shutdown() having been completed. + * zone_shutdown() includes a call to zone_empty(), + * where the zone waits for itself to reach the state + * ZONE_IS_EMPTY. This state is only set in either + * zone_shutdown(), when there are no user processes as + * the zone enters this function, or in + * zone_task_rele(). zone_task_rele() is called from + * code triggered by waiting on processes, not by the + * processes exiting through proc_exit(). This means + * all the branded processes that could exist for a + * specific brand_t must exit and get reaped before the + * refcount on the brand_t can reach 0. _fini will + * never unload the corresponding brand module before + * proc_exit finishes execution for all processes + * branded with a particular brand_t, which makes the + * operation below safe to do. Brands that wish to use + * this mechanism must wait in _fini as described + * above. 
+ */ + BROP(p)->b_exit_with_sig(p, sqp); + } else { + p->p_pidflag &= ~CLDPEND; + sigcld(p, sqp); + } + } else { /* * Do what sigcld() would do if the disposition @@ -927,10 +1109,9 @@ winfo(proc_t *pp, k_siginfo_t *ip, int waitflag) int waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) { - int found; proc_t *cp, *pp; - int proc_gone; int waitflag = !(options & WNOWAIT); + boolean_t have_brand_helper = B_FALSE; /* * Obsolete flag, defined here only for binary compatibility @@ -958,7 +1139,8 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) pp = ttoproc(curthread); /* - * lock parent mutex so that sibling chain can be searched. + * Anytime you are looking for a process, you take pidlock to prevent + * things from changing as you look. */ mutex_enter(&pidlock); @@ -978,10 +1160,37 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) return (ECHILD); } - while (pp->p_child != NULL) { + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_waitid_helper != NULL) { + have_brand_helper = B_TRUE; + } + + while (pp->p_child != NULL || have_brand_helper) { + boolean_t brand_wants_wait = B_FALSE; + int proc_gone = 0; + int found = 0; + + /* + * Give the brand a chance to return synthetic results from + * this waitid() call before we do the real thing. + */ + if (have_brand_helper) { + int ret; + + if (BROP(pp)->b_waitid_helper(idtype, id, ip, options, + &brand_wants_wait, &ret) == 0) { + mutex_exit(&pidlock); + return (ret); + } - proc_gone = 0; + if (pp->p_child == NULL) { + goto no_real_children; + } + } + /* + * Look for interesting children in the newstate list. + */ + VERIFY(pp->p_child != NULL); for (cp = pp->p_child_ns; cp != NULL; cp = cp->p_sibling_ns) { if (idtype != P_PID && (cp->p_pidflag & CLDWAITPID)) continue; @@ -989,6 +1198,11 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) continue; if (idtype == P_PGID && id != cp->p_pgrp) continue; + if (PROC_IS_BRANDED(pp)) { + if (BROP(pp)->b_wait_filter != NULL && + BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) + continue; + } switch (cp->p_wcode) { @@ -1033,12 +1247,16 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) * Wow! None of the threads on the p_sibling_ns list were * interesting threads. Check all the kids! */ - found = 0; for (cp = pp->p_child; cp != NULL; cp = cp->p_sibling) { if (idtype == P_PID && id != cp->p_pid) continue; if (idtype == P_PGID && id != cp->p_pgrp) continue; + if (PROC_IS_BRANDED(pp)) { + if (BROP(pp)->b_wait_filter != NULL && + BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) + continue; + } switch (cp->p_wcode) { case CLD_TRAPPED: @@ -1107,11 +1325,12 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) break; } +no_real_children: /* * If we found no interesting processes at all, * break out and return ECHILD. */ - if (found + proc_gone == 0) + if (!brand_wants_wait && (found + proc_gone == 0)) break; if (options & WNOHANG) { @@ -1130,7 +1349,7 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) * change state while we wait, we don't wait at all. * Get out with ECHILD according to SVID. 
*/ - if (found == proc_gone) + if (!brand_wants_wait && (found == proc_gone)) break; if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) { @@ -1226,6 +1445,12 @@ freeproc(proc_t *p) p->p_killsqp = NULL; } + /* Clear any remaining brand data */ + if (PROC_IS_BRANDED(p)) { + brand_clearbrand(p, B_FALSE); + } + + prfree(p); /* inform /proc */ /* diff --git a/usr/src/uts/common/os/fio.c b/usr/src/uts/common/os/fio.c index 76eddd4e50..41e7e63d2b 100644 --- a/usr/src/uts/common/os/fio.c +++ b/usr/src/uts/common/os/fio.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015, Joyent Inc. + * Copyright 2017, Joyent Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -386,6 +386,7 @@ flist_grow(int maxfd) dst->uf_flag = src->uf_flag; dst->uf_busy = src->uf_busy; dst->uf_portfd = src->uf_portfd; + dst->uf_gen = src->uf_gen; } /* @@ -487,7 +488,7 @@ free_afd(afd_t *afd) /* called below and from thread_free() */ afd->a_fd[i] = -1; } -static void +void set_active_fd(int fd) { afd_t *afd = &curthread->t_activefd; @@ -575,13 +576,12 @@ is_active_fd(kthread_t *t, int fd) } /* - * Convert a user supplied file descriptor into a pointer to a file - * structure. Only task is to check range of the descriptor (soft - * resource limit was enforced at open time and shouldn't be checked - * here). + * Convert a user supplied file descriptor into a pointer to a file structure. + * Only task is to check range of the descriptor (soft resource limit was + * enforced at open time and shouldn't be checked here). */ file_t * -getf(int fd) +getf_gen(int fd, uf_entry_gen_t *genp) { uf_info_t *fip = P_FINFO(curproc); uf_entry_t *ufp; @@ -607,6 +607,9 @@ getf(int fd) return (NULL); } ufp->uf_refcnt++; + if (genp != NULL) { + *genp = ufp->uf_gen; + } set_active_fd(fd); /* record the active file descriptor */ @@ -615,6 +618,12 @@ getf(int fd) return (fp); } +file_t * +getf(int fd) +{ + return (getf_gen(fd, NULL)); +} + /* * Close whatever file currently occupies the file descriptor slot * and install the new file, usually NULL, in the file descriptor slot. @@ -667,6 +676,7 @@ closeandsetf(int fd, file_t *newfp) ASSERT(ufp->uf_flag == 0); fd_reserve(fip, fd, 1); ufp->uf_file = newfp; + ufp->uf_gen++; UF_EXIT(ufp); mutex_exit(&fip->fi_lock); return (0); @@ -852,7 +862,8 @@ flist_fork(uf_info_t *pfip, uf_info_t *cfip) */ cfip->fi_nfiles = nfiles = flist_minsize(pfip); - cfip->fi_list = kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP); + cfip->fi_list = nfiles == 0 ? 
NULL : + kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP); for (fd = 0, pufp = pfip->fi_list, cufp = cfip->fi_list; fd < nfiles; fd++, pufp++, cufp++) { @@ -860,6 +871,7 @@ flist_fork(uf_info_t *pfip, uf_info_t *cfip) cufp->uf_alloc = pufp->uf_alloc; cufp->uf_flag = pufp->uf_flag; cufp->uf_busy = pufp->uf_busy; + cufp->uf_gen = pufp->uf_gen; if (pufp->uf_file == NULL) { ASSERT(pufp->uf_flag == 0); if (pufp->uf_busy) { @@ -1028,6 +1040,9 @@ ufalloc_file(int start, file_t *fp) fd_reserve(fip, fd, 1); ASSERT(ufp->uf_file == NULL); ufp->uf_file = fp; + if (fp != NULL) { + ufp->uf_gen++; + } UF_EXIT(ufp); mutex_exit(&fip->fi_lock); return (fd); @@ -1183,6 +1198,7 @@ setf(int fd, file_t *fp) } else { UF_ENTER(ufp, fip, fd); ASSERT(ufp->uf_busy); + ufp->uf_gen++; } ASSERT(ufp->uf_fpollinfo == NULL); ASSERT(ufp->uf_flag == 0); @@ -1212,8 +1228,7 @@ f_getfl(int fd, int *flagp) error = EBADF; else { vnode_t *vp = fp->f_vnode; - int flag = fp->f_flag | - ((fp->f_flag2 & ~FEPOLLED) << 16); + int flag = fp->f_flag | (fp->f_flag2 << 16); /* * BSD fcntl() FASYNC compatibility. diff --git a/usr/src/uts/common/os/fork.c b/usr/src/uts/common/os/fork.c index a63931459f..7e198910b4 100644 --- a/usr/src/uts/common/os/fork.c +++ b/usr/src/uts/common/os/fork.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright 2016, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -84,6 +84,7 @@ static int64_t cfork(int, int, int); static int getproc(proc_t **, pid_t, uint_t); #define GETPROC_USER 0x0 #define GETPROC_KERNEL 0x1 +#define GETPROC_ZSCHED 0x2 static void fork_fail(proc_t *); static void forklwp_fail(proc_t *); @@ -705,7 +706,7 @@ fork_fail(proc_t *cp) if (PTOU(curproc)->u_cwd) refstr_rele(PTOU(curproc)->u_cwd); if (PROC_IS_BRANDED(cp)) { - brand_clearbrand(cp, B_TRUE); + brand_clearbrand(cp, B_FALSE); } } @@ -754,7 +755,7 @@ forklwp_fail(proc_t *p) kmem_free(t->t_door, sizeof (door_data_t)); t->t_door = NULL; } - lwp_ctmpl_clear(ttolwp(t)); + lwp_ctmpl_clear(ttolwp(t), B_FALSE); /* * Remove the thread from the all threads list. @@ -791,6 +792,9 @@ extern struct as kas; /* * fork a kernel process. + * + * Passing a pid argument of -1 indicates that the new process should be + * launched as a child of 'zsched' within the zone. */ int newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, @@ -809,6 +813,7 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, rctl_set_t *init_set; ASSERT(pid != 1); + ASSERT(pid >= 0); if (getproc(&p, pid, GETPROC_KERNEL) < 0) return (EAGAIN); @@ -852,8 +857,18 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, rctl_set_t *init_set; task_t *tk, *tk_old; klwp_t *lwp; + boolean_t pzsched = B_FALSE; + int flag = GETPROC_USER; + + /* Handle a new user-level thread as child of zsched. */ + if (pid < 0) { + VERIFY(curzone != global_zone); + flag = GETPROC_ZSCHED; + pzsched = B_TRUE; + pid = 0; + } - if (getproc(&p, pid, GETPROC_USER) < 0) + if (getproc(&p, pid, flag) < 0) return (EAGAIN); /* * init creates a new task, distinct from the task @@ -914,7 +929,8 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, } t = lwptot(lwp); - ctp = contract_process_fork(sys_process_tmpl, p, curproc, + ctp = contract_process_fork(sys_process_tmpl, p, + (pzsched ? 
curproc->p_zone->zone_zsched : curproc), B_FALSE); ASSERT(ctp != NULL); if (ct != NULL) @@ -955,7 +971,11 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN) return (-1); /* no point in starting new processes */ - pp = (flags & GETPROC_KERNEL) ? &p0 : curproc; + if (flags & GETPROC_ZSCHED) { + pp = curproc->p_zone->zone_zsched; + } else { + pp = (flags & GETPROC_KERNEL) ? &p0 : curproc; + } task = pp->p_task; proj = task->tk_proj; zone = pp->p_zone; @@ -1016,6 +1036,9 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) cp->p_t1_lgrpid = LGRP_NONE; cp->p_tr_lgrpid = LGRP_NONE; + /* Default to native brand initially */ + cp->p_brand = &native_brand; + if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) { if (nproc == v.v_proc) { CPU_STATS_ADDQ(CPU, sys, procovf, 1); @@ -1083,9 +1106,6 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD); cp->p_sessp = pp->p_sessp; sess_hold(pp); - cp->p_brand = pp->p_brand; - if (PROC_IS_BRANDED(pp)) - BROP(pp)->b_copy_procdata(cp, pp); cp->p_bssbase = pp->p_bssbase; cp->p_brkbase = pp->p_brkbase; cp->p_brksize = pp->p_brksize; @@ -1170,6 +1190,18 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) mutex_exit(&cp->p_lock); mutex_exit(&pidlock); + if (PROC_IS_BRANDED(pp)) { + /* + * The only reason why process branding should fail is when + * the procedure is complicated by multiple LWPs on the scene. + * With an LWP count of 0, this newly allocated process has no + * reason to fail branding. + */ + VERIFY0(brand_setbrand(cp, B_FALSE)); + + BROP(pp)->b_copy_procdata(cp, pp); + } + avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t), offsetof(contract_t, ct_ctlist)); @@ -1187,6 +1219,7 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) */ fcnt_add(P_FINFO(pp), 1); + mutex_enter(&pp->p_lock); if (PTOU(pp)->u_cdir) { VN_HOLD(PTOU(pp)->u_cdir); } else { @@ -1200,6 +1233,7 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) VN_HOLD(PTOU(pp)->u_rdir); if (PTOU(pp)->u_cwd) refstr_hold(PTOU(pp)->u_cwd); + mutex_exit(&pp->p_lock); /* * copy the parent's uarea. diff --git a/usr/src/uts/common/os/grow.c b/usr/src/uts/common/os/grow.c index de2a4f26c4..07fd623a95 100644 --- a/usr/src/uts/common/os/grow.c +++ b/usr/src/uts/common/os/grow.c @@ -21,7 +21,7 @@ /* * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ /* @@ -55,6 +55,7 @@ #include <sys/fcntl.h> #include <sys/lwpchan_impl.h> #include <sys/nbmlock.h> +#include <sys/brand.h> #include <vm/hat.h> #include <vm/as.h> @@ -570,6 +571,20 @@ choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off, return (0); } +caddr_t +map_userlimit(proc_t *pp, struct as *as, int flags) +{ + if (flags & _MAP_LOW32) { + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_map32limit != NULL) { + return ((caddr_t)(uintptr_t)BROP(pp)->b_map32limit(pp)); + } else { + return ((caddr_t)_userlimit32); + } + } + + return (as->a_userlimit); +} + /* * Used for MAP_ANON - fast way to get anonymous pages @@ -585,8 +600,6 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags, return (EACCES); if ((flags & MAP_FIXED) != 0) { - caddr_t userlimit; - /* * Use the user address. First verify that * the address to be used is page aligned. @@ -595,9 +608,8 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags, if (((uintptr_t)*addrp & PAGEOFFSET) != 0) return (EINVAL); - userlimit = flags & _MAP_LOW32 ? 
- (caddr_t)USERLIMIT32 : as->a_userlimit; - switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { + switch (valid_usr_range(*addrp, len, uprot, as, + map_userlimit(as->a_proc, as, flags))) { case RANGE_OKAY: break; case RANGE_BADPROT: @@ -638,7 +650,7 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags, #define RANDOMIZABLE_MAPPING(addr, flags) (((flags & MAP_FIXED) == 0) && \ !(((flags & MAP_ALIGN) == 0) && (addr != 0) && aslr_respect_mmap_hint)) -static int +int smmap_common(caddr_t *addrp, size_t len, int prot, int flags, struct file *fp, offset_t pos) { @@ -780,8 +792,6 @@ smmap_common(caddr_t *addrp, size_t len, * If the user specified an address, do some simple checks here */ if ((flags & MAP_FIXED) != 0) { - caddr_t userlimit; - /* * Use the user address. First verify that * the address to be used is page aligned. @@ -789,10 +799,8 @@ smmap_common(caddr_t *addrp, size_t len, */ if (((uintptr_t)*addrp & PAGEOFFSET) != 0) return (EINVAL); - - userlimit = flags & _MAP_LOW32 ? - (caddr_t)USERLIMIT32 : as->a_userlimit; - switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { + switch (valid_usr_range(*addrp, len, uprot, as, + map_userlimit(curproc, as, flags))) { case RANGE_OKAY: break; case RANGE_BADPROT: diff --git a/usr/src/uts/common/os/id_space.c b/usr/src/uts/common/os/id_space.c deleted file mode 100644 index 2dad0cb940..0000000000 --- a/usr/src/uts/common/os/id_space.c +++ /dev/null @@ -1,159 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#include <sys/types.h> -#include <sys/id_space.h> -#include <sys/debug.h> - -/* - * ID Spaces - * - * The id_space_t provides a simple implementation of a managed range of - * integer identifiers using a vmem arena. An ID space guarantees that the - * next identifer returned by an allocation is larger than the previous one, - * unless there are no larger slots remaining in the range. In this case, - * the ID space will return the first available slot in the lower part of the - * range (viewing the previous identifier as a partitioning element). If no - * slots are available, id_alloc()/id_allocff() will sleep until an - * identifier becomes available. Accordingly, id_space allocations must be - * initiated from contexts where sleeping is acceptable. id_alloc_nosleep()/ - * id_allocff_nosleep() will return -1 if no slots are available or if the - * system is low on memory. If id_alloc_nosleep() fails, callers should - * not try to extend the ID space. This is to avoid making a possible - * low-memory situation worse. 
- * - * As an ID space is designed for representing a range of id_t's, there - * is a preexisting maximal range: [0, MAXUID]. ID space requests outside - * that range will fail on a DEBUG kernel. The id_allocff*() functions - * return the first available id, and should be used when there is benefit - * to having a compact allocated range. - * - * (Presently, the id_space_t abstraction supports only direct allocations; ID - * reservation, in which an ID is allocated but placed in a internal - * dictionary for later use, should be added when a consuming subsystem - * arrives.) - */ - -#define ID_TO_ADDR(id) ((void *)(uintptr_t)(id + 1)) -#define ADDR_TO_ID(addr) ((id_t)((uintptr_t)addr - 1)) - -/* - * Create an arena to represent the range [low, high). - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_space_t * -id_space_create(const char *name, id_t low, id_t high) -{ - ASSERT(low >= 0); - ASSERT(low < high); - - return (vmem_create(name, ID_TO_ADDR(low), high - low, 1, - NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER)); -} - -/* - * Destroy a previously created ID space. - * No restrictions on caller's context. - */ -void -id_space_destroy(id_space_t *isp) -{ - vmem_destroy(isp); -} - -void -id_space_extend(id_space_t *isp, id_t low, id_t high) -{ - (void) vmem_add(isp, ID_TO_ADDR(low), high - low, VM_SLEEP); -} - -/* - * Allocate an id_t from specified ID space. - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_t -id_alloc(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_NEXTFIT))); -} - -/* - * Allocate an id_t from specified ID space. - * Returns -1 on failure (see module block comments for more information on - * failure modes). - */ -id_t -id_alloc_nosleep(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_NEXTFIT))); -} - -/* - * Allocate an id_t from specified ID space using FIRSTFIT. - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_t -id_allocff(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_FIRSTFIT))); -} - -/* - * Allocate an id_t from specified ID space using FIRSTFIT - * Returns -1 on failure (see module block comments for more information on - * failure modes). - */ -id_t -id_allocff_nosleep(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_FIRSTFIT))); -} - -/* - * Allocate a specific identifier if possible, returning the id if - * successful, or -1 on failure. - */ -id_t -id_alloc_specific_nosleep(id_space_t *isp, id_t id) -{ - void *minaddr = ID_TO_ADDR(id); - void *maxaddr = ID_TO_ADDR(id + 1); - - /* - * Note that even though we're vmem_free()ing this later, it - * should be OK, since there's no quantum cache. - */ - return (ADDR_TO_ID(vmem_xalloc(isp, 1, 1, 0, 0, - minaddr, maxaddr, VM_NOSLEEP))); -} - -/* - * Free a previously allocated ID. - * No restrictions on caller's context. - */ -void -id_free(id_space_t *isp, id_t id) -{ - vmem_free(isp, ID_TO_ADDR(id), 1); -} diff --git a/usr/src/uts/common/os/ipc.c b/usr/src/uts/common/os/ipc.c index 9381019cd1..6a6f5d84ef 100644 --- a/usr/src/uts/common/os/ipc.c +++ b/usr/src/uts/common/os/ipc.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -1217,6 +1218,23 @@ ipc_remove(ipc_service_t *service, kipc_perm_t *perm) (IPC_ZONE_USAGE(perm, service) == 0))); } +/* + * Perform actual IPC_RMID, either via ipc_rmid or due to a delayed *_RMID. + */ +void +ipc_rmsvc(ipc_service_t *service, kipc_perm_t *perm) +{ + ASSERT(service->ipcs_count > 0); + ASSERT(MUTEX_HELD(&service->ipcs_lock)); + + ipc_remove(service, perm); + mutex_exit(&service->ipcs_lock); + + /* perform any per-service removal actions */ + service->ipcs_rmid(perm); + + ipc_rele(service, perm); +} /* * Common code to perform an IPC_RMID. Returns an errno value on @@ -1247,13 +1265,7 @@ ipc_rmid(ipc_service_t *service, int id, cred_t *cr) /* * Nothing can fail from this point on. */ - ipc_remove(service, perm); - mutex_exit(&service->ipcs_lock); - - /* perform any per-service removal actions */ - service->ipcs_rmid(perm); - - ipc_rele(service, perm); + ipc_rmsvc(service, perm); return (0); } diff --git a/usr/src/uts/common/os/kmem.c b/usr/src/uts/common/os/kmem.c index b41ab8c465..9a3692053d 100644 --- a/usr/src/uts/common/os/kmem.c +++ b/usr/src/uts/common/os/kmem.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2017, Joyent, Inc. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright 2018, Joyent, Inc. @@ -1011,6 +1012,7 @@ size_t kmem_transaction_log_size; /* transaction log size [2% of memory] */ size_t kmem_content_log_size; /* content log size [2% of memory] */ size_t kmem_failure_log_size; /* failure log [4 pages per CPU] */ size_t kmem_slab_log_size; /* slab create log [4 pages per CPU] */ +size_t kmem_zerosized_log_size; /* zero-sized log [4 pages per CPU] */ size_t kmem_content_maxsave = 256; /* KMF_CONTENTS max bytes to log */ size_t kmem_lite_minsize = 0; /* minimum buffer size for KMF_LITE */ size_t kmem_lite_maxalign = 1024; /* maximum buffer alignment for KMF_LITE */ @@ -1018,6 +1020,14 @@ int kmem_lite_pcs = 4; /* number of PCs to store in KMF_LITE mode */ size_t kmem_maxverify; /* maximum bytes to inspect in debug routines */ size_t kmem_minfirewall; /* hardware-enforced redzone threshold */ +#ifdef DEBUG +int kmem_warn_zerosized = 1; /* whether to warn on zero-sized KM_SLEEP */ +#else +int kmem_warn_zerosized = 0; /* whether to warn on zero-sized KM_SLEEP */ +#endif + +int kmem_panic_zerosized = 0; /* whether to panic on zero-sized KM_SLEEP */ + #ifdef _LP64 size_t kmem_max_cached = KMEM_BIG_MAXBUF; /* maximum kmem_alloc cache */ #else @@ -1098,6 +1108,7 @@ kmem_log_header_t *kmem_transaction_log; kmem_log_header_t *kmem_content_log; kmem_log_header_t *kmem_failure_log; kmem_log_header_t *kmem_slab_log; +kmem_log_header_t *kmem_zerosized_log; static int kmem_lite_count; /* # of PCs in kmem_buftag_lite_t */ @@ -2851,8 +2862,33 @@ kmem_alloc(size_t size, int kmflag) /* fall through to kmem_cache_alloc() */ } else { - if (size == 0) + if (size == 0) { + if (kmflag != KM_SLEEP && !(kmflag & KM_PANIC)) + return (NULL); + + /* + * If this is a sleeping allocation or one that has + * been specified to panic on allocation failure, we + * consider it to be deprecated behavior to allocate + * 0 bytes. If we have been configured to panic under + * this condition, we panic; if to warn, we warn -- and + * regardless, we log to the kmem_zerosized_log that + * that this condition has occurred (which gives us + * enough information to be able to debug it). 
+ */ + if (kmem_panic && kmem_panic_zerosized) + panic("attempted to kmem_alloc() size of 0"); + + if (kmem_warn_zerosized) { + cmn_err(CE_WARN, "kmem_alloc(): sleeping " + "allocation with size of 0; " + "see kmem_zerosized_log for details"); + } + + kmem_log_event(kmem_zerosized_log, NULL, NULL, NULL); + return (NULL); + } buf = vmem_alloc(kmem_oversize_arena, size, kmflag & KM_VMFLAGS); @@ -4392,8 +4428,8 @@ kmem_init(void) } kmem_failure_log = kmem_log_init(kmem_failure_log_size); - kmem_slab_log = kmem_log_init(kmem_slab_log_size); + kmem_zerosized_log = kmem_log_init(kmem_zerosized_log_size); /* * Initialize STREAMS message caches so allocb() is available. diff --git a/usr/src/uts/common/os/kstat_fr.c b/usr/src/uts/common/os/kstat_fr.c index 93c04cff8d..b09b2d3558 100644 --- a/usr/src/uts/common/os/kstat_fr.c +++ b/usr/src/uts/common/os/kstat_fr.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2014, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -198,6 +198,9 @@ struct { kstat_named_t pagesfree; kstat_named_t pageslocked; kstat_named_t pagestotal; + kstat_named_t lowmemscan; + kstat_named_t zonecapscan; + kstat_named_t nthrottle; } system_pages_kstat = { { "physmem", KSTAT_DATA_ULONG }, { "nalloc", KSTAT_DATA_ULONG }, @@ -219,6 +222,9 @@ struct { { "pagesfree", KSTAT_DATA_ULONG }, { "pageslocked", KSTAT_DATA_ULONG }, { "pagestotal", KSTAT_DATA_ULONG }, + { "low_mem_scan", KSTAT_DATA_ULONG }, + { "zone_cap_scan", KSTAT_DATA_ULONG }, + { "n_throttle", KSTAT_DATA_ULONG }, }; static int header_kstat_update(kstat_t *, int); @@ -912,6 +918,9 @@ system_pages_kstat_update(kstat_t *ksp, int rw) system_pages_kstat.pageslocked.value.ul = (ulong_t)(availrmem_initial - availrmem); system_pages_kstat.pagestotal.value.ul = (ulong_t)total_pages; + system_pages_kstat.lowmemscan.value.ul = (ulong_t)low_mem_scan; + system_pages_kstat.zonecapscan.value.ul = (ulong_t)zone_cap_scan; + system_pages_kstat.nthrottle.value.ul = (ulong_t)n_throttle; /* * pp_kernel represents total pages used by the kernel since the * startup. This formula takes into account the boottime kernel diff --git a/usr/src/uts/common/os/lgrp.c b/usr/src/uts/common/os/lgrp.c index 6288f47bed..6f6aced619 100644 --- a/usr/src/uts/common/os/lgrp.c +++ b/usr/src/uts/common/os/lgrp.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ /* @@ -90,6 +91,7 @@ #include <sys/pg.h> #include <sys/promif.h> #include <sys/sdt.h> +#include <sys/ht.h> lgrp_gen_t lgrp_gen = 0; /* generation of lgroup hierarchy */ lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */ @@ -520,6 +522,8 @@ lgrp_main_mp_init(void) { klgrpset_t changed; + ht_init(); + /* * Update lgroup topology (if necessary) */ diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c index 149f5f8a88..06c03dd38e 100644 --- a/usr/src/uts/common/os/logsubr.c +++ b/usr/src/uts/common/os/logsubr.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2013 Gary Mills * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019 Joyent, Inc. 
*/ #include <sys/types.h> @@ -249,8 +250,7 @@ log_init(void) */ printf("\rSunOS Release %s Version %s %u-bit\n", utsname.release, utsname.version, NBBY * (uint_t)sizeof (void *)); - printf("Copyright (c) 1983, 2010, Oracle and/or its affiliates. " - "All rights reserved.\n"); + printf("Copyright (c) 2010-2019, Joyent Inc. All rights reserved.\n"); #ifdef DEBUG printf("DEBUG enabled\n"); #endif @@ -491,7 +491,7 @@ log_console(log_t *lp, log_ctl_t *lc) mblk_t * log_makemsg(int mid, int sid, int level, int sl, int pri, void *msg, - size_t size, int on_intr) + size_t size, int on_intr) { mblk_t *mp = NULL; mblk_t *mp2; diff --git a/usr/src/uts/common/os/lwp.c b/usr/src/uts/common/os/lwp.c index b2adae570f..341e4ae356 100644 --- a/usr/src/uts/common/os/lwp.c +++ b/usr/src/uts/common/os/lwp.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc. */ #include <sys/param.h> @@ -57,6 +57,8 @@ #include <sys/lgrp.h> #include <sys/rctl.h> #include <sys/contract_impl.h> +#include <sys/contract/process.h> +#include <sys/contract/process_impl.h> #include <sys/cpc_impl.h> #include <sys/sdt.h> #include <sys/cmn_err.h> @@ -115,7 +117,7 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p, ret_tidhash_t *ret_tidhash = NULL; int i; int rctlfail = 0; - boolean_t branded = 0; + void *brand_data = NULL; struct ctxop *ctx = NULL; ASSERT(cid != sysdccid); /* system threads must start in SYS */ @@ -283,6 +285,19 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p, */ lep = kmem_zalloc(sizeof (*lep), KM_SLEEP); + /* + * If necessary, speculatively allocate lwp brand data. This is done + * ahead of time so p_lock need not be dropped during lwp branding. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_lwpdata_alloc != NULL) { + if ((brand_data = BROP(p)->b_lwpdata_alloc(p)) == NULL) { + mutex_enter(&p->p_lock); + err = 1; + atomic_inc_32(&p->p_zone->zone_ffmisc); + goto error; + } + } + mutex_enter(&p->p_lock); grow: /* @@ -630,18 +645,6 @@ grow: } while (lwp_hash_lookup(p, t->t_tid) != NULL); } - /* - * If this is a branded process, let the brand do any necessary lwp - * initialization. - */ - if (PROC_IS_BRANDED(p)) { - if (BROP(p)->b_initlwp(lwp)) { - err = 1; - atomic_inc_32(&p->p_zone->zone_ffmisc); - goto error; - } - branded = 1; - } if (t->t_tid == 1) { kpreempt_disable(); @@ -654,7 +657,6 @@ grow: } } - p->p_lwpcnt++; t->t_waitfor = -1; /* @@ -696,8 +698,27 @@ grow: t->t_post_sys = 1; /* + * Perform lwp branding + * + * The b_initlwp hook is _not_ allowed to drop p->p_lock as it must be + * continuously held between when the tidhash is sized and when the lwp + * is inserted into it. Operations requiring p->p_lock to be + * temporarily dropped can be performed in b_initlwp_post. + */ + if (PROC_IS_BRANDED(p)) { + BROP(p)->b_initlwp(lwp, brand_data); + /* + * The b_initlwp hook is expected to consume any preallocated + * brand_data in a way that prepares it for deallocation by the + * b_freelwp hook. + */ + brand_data = NULL; + } + + /* * Insert the new thread into the list of all threads. 
*/ + p->p_lwpcnt++; if ((tx = p->p_tlist) == NULL) { t->t_back = t; t->t_forw = t; @@ -718,6 +739,13 @@ grow: lep->le_start = t->t_start; lwp_hash_in(p, lep, p->p_tidhash, p->p_tidhash_sz, 1); + /* + * Complete lwp branding + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_initlwp_post != NULL) { + BROP(p)->b_initlwp_post(lwp); + } + lwp_fp_init(lwp); if (state == TS_RUN) { @@ -755,8 +783,9 @@ error: if (cid != NOCLASS && bufp != NULL) CL_FREE(cid, bufp); - if (branded) - BROP(p)->b_freelwp(lwp); + if (brand_data != NULL) { + BROP(p)->b_lwpdata_free(brand_data); + } mutex_exit(&p->p_lock); t->t_state = TS_FREE; @@ -829,8 +858,27 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src) int i; for (i = 0; i < ct_ntypes; i++) { - dst->lwp_ct_active[i] = ctmpl_dup(src->lwp_ct_active[i]); + ct_template_t *tmpl = src->lwp_ct_active[i]; + + /* + * If the process contract template is setup to be preserved + * across exec, then if we're forking, perform an implicit + * template_clear now. This ensures that future children of + * this child will remain in the same contract unless they're + * explicitly setup differently. We know we're forking if the + * two LWPs belong to different processes. + */ + if (i == CTT_PROCESS && tmpl != NULL) { + ctmpl_process_t *ctp = tmpl->ctmpl_data; + + if (dst->lwp_procp != src->lwp_procp && + (ctp->ctp_params & CT_PR_KEEP_EXEC) != 0) + tmpl = NULL; + } + + dst->lwp_ct_active[i] = ctmpl_dup(tmpl); dst->lwp_ct_latest[i] = NULL; + } } @@ -838,21 +886,33 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src) * Clear an LWP's contract template state. */ void -lwp_ctmpl_clear(klwp_t *lwp) +lwp_ctmpl_clear(klwp_t *lwp, boolean_t is_exec) { ct_template_t *tmpl; int i; for (i = 0; i < ct_ntypes; i++) { - if ((tmpl = lwp->lwp_ct_active[i]) != NULL) { - ctmpl_free(tmpl); - lwp->lwp_ct_active[i] = NULL; - } - if (lwp->lwp_ct_latest[i] != NULL) { contract_rele(lwp->lwp_ct_latest[i]); lwp->lwp_ct_latest[i] = NULL; } + + if ((tmpl = lwp->lwp_ct_active[i]) != NULL) { + /* + * If we're exec-ing a new program and the process + * contract template is setup to be preserved across + * exec, then don't clear it. + */ + if (is_exec && i == CTT_PROCESS) { + ctmpl_process_t *ctp = tmpl->ctmpl_data; + + if ((ctp->ctp_params & CT_PR_KEEP_EXEC) != 0) + continue; + } + + ctmpl_free(tmpl); + lwp->lwp_ct_active[i] = NULL; + } } } @@ -893,13 +953,6 @@ lwp_exit(void) if (t->t_upimutex != NULL) upimutex_cleanup(); - /* - * Perform any brand specific exit processing, then release any - * brand data associated with the lwp - */ - if (PROC_IS_BRANDED(p)) - BROP(p)->b_lwpexit(lwp); - lwp_pcb_exit(); mutex_enter(&p->p_lock); @@ -943,6 +996,18 @@ lwp_exit(void) DTRACE_PROC(lwp__exit); /* + * Perform any brand specific exit processing, then release any + * brand data associated with the lwp + */ + if (PROC_IS_BRANDED(p)) { + mutex_exit(&p->p_lock); + BROP(p)->b_lwpexit(lwp); + BROP(p)->b_freelwp(lwp); + mutex_enter(&p->p_lock); + prbarrier(p); + } + + /* * If the lwp is a detached lwp or if the process is exiting, * remove (lwp_hash_out()) the lwp from the lwp directory. 
* Otherwise null out the lwp's le_thread pointer in the lwp @@ -1103,7 +1168,7 @@ lwp_cleanup(void) } kpreempt_enable(); - lwp_ctmpl_clear(ttolwp(t)); + lwp_ctmpl_clear(ttolwp(t), B_FALSE); } int diff --git a/usr/src/uts/common/os/main.c b/usr/src/uts/common/os/main.c index ec61ad5c76..db6d74b2c2 100644 --- a/usr/src/uts/common/os/main.c +++ b/usr/src/uts/common/os/main.c @@ -158,7 +158,7 @@ exec_init(const char *initpath, const char *args) int error = 0, count = 0; proc_t *p = ttoproc(curthread); klwp_t *lwp = ttolwp(curthread); - int brand_action; + int brand_action = EBA_NONE; if (args == NULL) args = ""; @@ -288,7 +288,15 @@ exec_init(const char *initpath, const char *args) */ sigemptyset(&curthread->t_hold); - brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE; + /* + * Only instruct exec_common to brand the process if necessary. It is + * possible that the init process is already properly branded due to the + * proc_exit -> restart_init -> exec_init call chain. + */ + if (ZONE_IS_BRANDED(p->p_zone) && + p->p_brand != p->p_zone->zone_brand) { + brand_action = EBA_BRAND; + } again: error = exec_common((const char *)exec_fnamep, (const char **)uap, NULL, brand_action); diff --git a/usr/src/uts/common/os/mem_config.c b/usr/src/uts/common/os/mem_config.c index 3571747e9c..6be46fa422 100644 --- a/usr/src/uts/common/os/mem_config.c +++ b/usr/src/uts/common/os/mem_config.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. */ #include <sys/types.h> @@ -1638,7 +1639,7 @@ delthr_get_freemem(struct mem_handle *mhp) * Put pressure on pageout. */ page_needfree(free_get); - cv_signal(&proc_pageout->p_cv); + WAKE_PAGEOUT_SCANNER(); mutex_enter(&mhp->mh_mutex); (void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex, diff --git a/usr/src/uts/common/os/mmapobj.c b/usr/src/uts/common/os/mmapobj.c index 142c10754e..0410e6f47b 100644 --- a/usr/src/uts/common/os/mmapobj.c +++ b/usr/src/uts/common/os/mmapobj.c @@ -1381,10 +1381,15 @@ calc_loadable(Ehdr *ehdrp, caddr_t phdrbase, int nphdrs, size_t *len, } if (num_segs++ == 0) { /* - * The p_vaddr of the first PT_LOAD segment - * must either be NULL or within the first - * page in order to be interpreted. - * Otherwise, its an invalid file. + * While ELF doesn't specify the meaning of + * p_vaddr for PT_LOAD segments in ET_DYN + * objects, we mandate that is either NULL or + * (to accommodate some historical binaries) + * within the first page. (Note that there + * exist non-native ET_DYN objects that violate + * this constraint that we nonetheless must be + * able to execute; see the ET_DYN handling in + * mapelfexec() for details.) */ if (e_type == ET_DYN && ((caddr_t)((uintptr_t)vaddr & diff --git a/usr/src/uts/common/os/modctl.c b/usr/src/uts/common/os/modctl.c index 35162eb558..c6e9d89d0d 100644 --- a/usr/src/uts/common/os/modctl.c +++ b/usr/src/uts/common/os/modctl.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2017 Joyent, Inc. */ /* @@ -3469,6 +3470,11 @@ mod_load(struct modctl *mp, int usepath) retval = install_stubs_by_name(mp, mp->mod_modname); /* + * Perform hotinlines before module is started. + */ + do_hotinlines(mp->mod_mp); + + /* * Now that the module is loaded, we need to give DTrace * a chance to notify its providers. This is done via * the dtrace_modload function pointer. 
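For readers unfamiliar with the ET_DYN rule documented in the mmapobj.c hunk above, the constraint can be stated as a small standalone predicate. The following is an illustrative sketch only, using a hypothetical helper name and a fixed 4 KB page size; it is not the exact calc_loadable() logic (the truncated hunk suggests the kernel compares against the platform page mask rather than a literal constant).

#include <stdint.h>

#define	SKETCH_PAGESIZE	4096UL		/* assumption: 4 KB pages */
#define	SKETCH_ET_DYN	3		/* ELF e_type value for ET_DYN */

/*
 * Sketch of the documented rule: in an ET_DYN object, the p_vaddr of the
 * first PT_LOAD segment must be NULL or lie within the first page for the
 * mapping to be accepted.
 */
static int
first_load_vaddr_ok(uint16_t e_type, uintptr_t vaddr)
{
	if (e_type != SKETCH_ET_DYN)
		return (1);	/* the rule applies only to ET_DYN objects */

	/* NULL (0) or any address inside page zero is acceptable. */
	return (vaddr < SKETCH_PAGESIZE);
}

A caller in the spirit of calc_loadable() would treat a return of 0 as an invalid file; as the revised comment notes, certain non-native ET_DYN objects violate the constraint and are instead handled by the ET_DYN logic in mapelfexec().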
diff --git a/usr/src/uts/common/os/modsysfile.c b/usr/src/uts/common/os/modsysfile.c index 8dca86880f..37ac089edf 100644 --- a/usr/src/uts/common/os/modsysfile.c +++ b/usr/src/uts/common/os/modsysfile.c @@ -23,6 +23,7 @@ * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2016 Nexenta Systems, Inc. + * Copyright 2018 Joyent, Inc. */ #include <sys/types.h> @@ -57,10 +58,12 @@ struct hwc_class *hcl_head; /* head of list of classes */ static kmutex_t hcl_lock; /* for accessing list of classes */ #define DAFILE "/etc/driver_aliases" +#define PPTFILE "/etc/ppt_aliases" #define CLASSFILE "/etc/driver_classes" #define DACFFILE "/etc/dacf.conf" static char class_file[] = CLASSFILE; +static char pptfile[] = PPTFILE; static char dafile[] = DAFILE; static char dacffile[] = DACFFILE; @@ -2136,14 +2139,13 @@ hwc_parse_now(char *fname, struct par_list **pl, ddi_prop_t **props) return (0); /* always return success */ } -void -make_aliases(struct bind **bhash) +static void +parse_aliases(struct bind **bhash, struct _buf *file) { enum { AL_NEW, AL_DRVNAME, AL_DRVNAME_COMMA, AL_ALIAS, AL_ALIAS_COMMA } state; - struct _buf *file; char tokbuf[MAXPATHLEN]; char drvbuf[MAXPATHLEN]; token_t token; @@ -2152,9 +2154,6 @@ make_aliases(struct bind **bhash) static char dupwarn[] = "!Driver alias \"%s\" conflicts with " "an existing driver name or alias."; - if ((file = kobj_open_file(dafile)) == (struct _buf *)-1) - return; - state = AL_NEW; major = DDI_MAJOR_T_NONE; while (!done) { @@ -2239,8 +2238,22 @@ make_aliases(struct bind **bhash) kobj_file_err(CE_WARN, file, tok_err, tokbuf); } } +} - kobj_close_file(file); +void +make_aliases(struct bind **bhash) +{ + struct _buf *file; + + if ((file = kobj_open_file(pptfile)) != (struct _buf *)-1) { + parse_aliases(bhash, file); + kobj_close_file(file); + } + + if ((file = kobj_open_file(dafile)) != (struct _buf *)-1) { + parse_aliases(bhash, file); + kobj_close_file(file); + } } diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c index b555bb82b7..eba6147fab 100644 --- a/usr/src/uts/common/os/pid.c +++ b/usr/src/uts/common/os/pid.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -112,6 +113,18 @@ pid_lookup(pid_t pid) return (pidp); } +struct pid * +pid_find(pid_t pid) +{ + struct pid *pidp; + + mutex_enter(&pidlinklock); + pidp = pid_lookup(pid); + mutex_exit(&pidlinklock); + + return (pidp); +} + void pid_setmin(void) { @@ -522,6 +535,20 @@ sprunlock(proc_t *p) THREAD_KPRI_RELEASE(); } +/* + * Undo effects of sprlock but without dropping p->p_lock + */ +void +sprunprlock(proc_t *p) +{ + ASSERT(p->p_proc_flag & P_PR_LOCK); + ASSERT(MUTEX_HELD(&p->p_lock)); + + cv_signal(&pr_pid_cv[p->p_slot]); + p->p_proc_flag &= ~P_PR_LOCK; + THREAD_KPRI_RELEASE(); +} + void pid_init(void) { diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c index d6821c83b0..8cc7f009a3 100644 --- a/usr/src/uts/common/os/policy.c +++ b/usr/src/uts/common/os/policy.c @@ -56,6 +56,7 @@ #include <sys/mntent.h> #include <sys/contract_impl.h> #include <sys/dld_ioc.h> +#include <sys/brand.h> /* * There are two possible layers of privilege routines and two possible @@ -1244,6 +1245,22 @@ secpolicy_vnode_owner(const cred_t *cr, uid_t owner) void secpolicy_setid_clear(vattr_t *vap, cred_t *cr) { + proc_t *p = curproc; + + /* + * Allow the brand to override this behaviour. 
+ */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_setid_clear != NULL) { + /* + * This brand hook will return 0 if handling is complete, or + * some other value if the brand would like us to fall back to + * the usual behaviour. + */ + if (BROP(p)->b_setid_clear(vap, cr) == 0) { + return; + } + } + if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(cr, (vap->va_mode & S_ISUID) != 0 && @@ -2092,6 +2109,13 @@ secpolicy_meminfo(const cred_t *cr) } int +secpolicy_fs_import(const cred_t *cr) +{ + return (PRIV_POLICY(cr, PRIV_SYS_FS_IMPORT, B_FALSE, EPERM, NULL)); +} + + +int secpolicy_pfexec_register(const cred_t *cr) { return (PRIV_POLICY(cr, PRIV_SYS_ADMIN, B_TRUE, EPERM, NULL)); @@ -2607,3 +2631,11 @@ secpolicy_ppp_config(const cred_t *cr) return (secpolicy_net_config(cr, B_FALSE)); return (PRIV_POLICY(cr, PRIV_SYS_PPP_CONFIG, B_FALSE, EPERM, NULL)); } + +int +secpolicy_hyprlofs_control(const cred_t *cr) +{ + if (PRIV_POLICY(cr, PRIV_HYPRLOFS_CONTROL, B_FALSE, EPERM, NULL)) + return (EPERM); + return (0); +} diff --git a/usr/src/uts/common/os/priv_defs b/usr/src/uts/common/os/priv_defs index bc1787c9ca..854fb602da 100644 --- a/usr/src/uts/common/os/priv_defs +++ b/usr/src/uts/common/os/priv_defs @@ -177,6 +177,10 @@ privilege PRIV_GRAPHICS_MAP Allows a process to perform privileged mappings through a graphics device. +privilege PRIV_HYPRLOFS_CONTROL + + Allows a process to manage hyprlofs entries. + privilege PRIV_IPC_DAC_READ Allows a process to read a System V IPC @@ -377,6 +381,10 @@ privilege PRIV_SYS_DEVICES Allows a process to open the real console device directly. Allows a process to open devices that have been exclusively opened. +privilege PRIV_SYS_FS_IMPORT + + Allows a process to import a potentially untrusted file system. + privilege PRIV_SYS_IPC_CONFIG Allows a process to increase the size of a System V IPC Message diff --git a/usr/src/uts/common/os/rctl.c b/usr/src/uts/common/os/rctl.c index 09b80323d5..e0a1126567 100644 --- a/usr/src/uts/common/os/rctl.c +++ b/usr/src/uts/common/os/rctl.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017, Joyent, Inc. */ #include <sys/atomic.h> @@ -194,6 +195,8 @@ id_space_t *rctl_ids; kmem_cache_t *rctl_cache; /* kmem cache for rctl structures */ kmem_cache_t *rctl_val_cache; /* kmem cache for rctl values */ +extern rctl_hndl_t rc_process_maxlockedmem; + kmutex_t rctl_lists_lock; rctl_dict_entry_t *rctl_lists[RC_MAX_ENTITY + 1]; @@ -2872,12 +2875,12 @@ rctl_init(void) * rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, * int chargeproc) * - * Increments the amount of locked memory on a project, and - * zone. If proj is non-NULL the project must be held by the - * caller; if it is NULL the proj and zone of proc_t p are used. - * If chargeproc is non-zero, then the charged amount is cached - * on p->p_locked_mem so that the charge can be migrated when a - * process changes projects. + * Increments the amount of locked memory on a process, project, and + * zone. If 'proj' is non-NULL, the project must be held by the + * caller; if it is NULL, the project and zone of process 'p' are used. + * If 'chargeproc' is non-zero, then the charged amount is added + * to p->p_locked_mem. This is also used so that the charge can be + * migrated when a process changes projects. 
* * Return values * 0 - success @@ -2895,6 +2898,7 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, ASSERT(p != NULL); ASSERT(MUTEX_HELD(&p->p_lock)); + if (proj != NULL) { projp = proj; zonep = proj->kpj_zone; @@ -2938,11 +2942,23 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, } } - zonep->zone_locked_mem += inc; - projp->kpj_data.kpd_locked_mem += inc; if (chargeproc != 0) { + /* Check for overflow */ + if ((p->p_locked_mem + inc) < p->p_locked_mem) { + ret = EAGAIN; + goto out; + } + if (rctl_test_entity(rc_process_maxlockedmem, p->p_rctls, p, + &e, inc, 0) & RCT_DENY) { + ret = EAGAIN; + goto out; + } + p->p_locked_mem += inc; } + + zonep->zone_locked_mem += inc; + projp->kpj_data.kpd_locked_mem += inc; out: mutex_exit(&zonep->zone_mem_lock); return (ret); diff --git a/usr/src/uts/common/os/rctl_proc.c b/usr/src/uts/common/os/rctl_proc.c index 9b7324fe7b..c62540d2b4 100644 --- a/usr/src/uts/common/os/rctl_proc.c +++ b/usr/src/uts/common/os/rctl_proc.c @@ -21,6 +21,7 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. */ #include <sys/types.h> @@ -32,6 +33,7 @@ #include <sys/port_kernel.h> #include <sys/signal.h> #include <sys/var.h> +#include <sys/policy.h> #include <sys/vmparam.h> #include <sys/machparam.h> @@ -66,6 +68,7 @@ rctl_hndl_t rc_process_semmsl; rctl_hndl_t rc_process_semopm; rctl_hndl_t rc_process_portev; rctl_hndl_t rc_process_sigqueue; +rctl_hndl_t rc_process_maxlockedmem; /* * process.max-cpu-time / RLIMIT_CPU @@ -212,6 +215,26 @@ static rctl_ops_t proc_vmem_ops = { }; /* + * process.max-locked-memory + */ +/*ARGSUSED*/ +static int +proc_maxlockedmem_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e, + struct rctl_val *rv, rctl_qty_t i, uint_t f) +{ + if (secpolicy_lock_memory(CRED()) == 0) + return (0); + return ((p->p_locked_mem + i) > rv->rcv_value); +} + +static rctl_ops_t proc_maxlockedmem_ops = { + rcop_no_action, + rcop_no_usage, + rcop_no_set, + proc_maxlockedmem_test +}; + +/* * void rctlproc_default_init() * * Overview @@ -383,6 +406,11 @@ rctlproc_init(void) rctl_add_default_limit("process.max-sigqueue-size", _SIGQUEUE_SIZE_PRIVILEGED, RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY); + rc_process_maxlockedmem = rctl_register("process.max-locked-memory", + RCENTITY_PROCESS, RCTL_GLOBAL_LOWERABLE | RCTL_GLOBAL_DENY_ALWAYS | + RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_BYTES, + ULONG_MAX, UINT32_MAX, &proc_maxlockedmem_ops); + /* * Place minimal set of controls on "sched" process for inheritance by * processes created via newproc(). diff --git a/usr/src/uts/common/os/sched.c b/usr/src/uts/common/os/sched.c index c1d6569f11..15e77d39f7 100644 --- a/usr/src/uts/common/os/sched.c +++ b/usr/src/uts/common/os/sched.c @@ -27,6 +27,10 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ +/* + * Copyright (c) 2015, Joyent, Inc. All rights reserved. + */ + #include <sys/param.h> #include <sys/types.h> #include <sys/sysmacros.h> @@ -646,16 +650,17 @@ top: klwp_t *lwp = ttolwp(tp); /* - * Swapout eligible lwps (specified by the scheduling - * class) which don't have TS_DONT_SWAP set. Set the - * "intent to swap" flag (TS_SWAPENQ) on threads - * which have TS_DONT_SWAP set so that they can be + * Swapout eligible lwps (specified by the scheduling class) + * which don't have TS_DONT_SWAP set. 
Set the "intent to swap" + * flag (TS_SWAPENQ) on threads which have either TS_DONT_SWAP + * set or are currently on a split stack so that they can be * swapped if and when they reach a safe point. */ thread_lock(tp); thread_pri = CL_SWAPOUT(tp, swapflags); if (thread_pri != -1) { - if (tp->t_schedflag & TS_DONT_SWAP) { + if ((tp->t_schedflag & TS_DONT_SWAP) || + (tp->t_flag & T_SPLITSTK)) { tp->t_schedflag |= TS_SWAPENQ; tp->t_trapret = 1; aston(tp); diff --git a/usr/src/uts/common/os/schedctl.c b/usr/src/uts/common/os/schedctl.c index 5721083751..18b396a765 100644 --- a/usr/src/uts/common/os/schedctl.c +++ b/usr/src/uts/common/os/schedctl.c @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -327,12 +328,17 @@ schedctl_sigblock(kthread_t *t) /* - * If the sc_sigblock field is set for the specified thread, set - * its signal mask to block all maskable signals, then clear the - * sc_sigblock field. This finishes what user-level code requested - * to be done when it set tdp->sc_shared->sc_sigblock non-zero. - * Called from signal-related code either by the current thread for - * itself or by a thread that holds the process's p_lock (/proc code). + * If the sc_sigblock field is set for the specified thread, set its signal + * mask to block all maskable signals, then clear the sc_sigblock field. This + * accomplishes what user-level code requested to be done when it set + * tdp->sc_shared->sc_sigblock non-zero. + * + * This is generally called by signal-related code in the current thread. In + * order to call against a thread other than curthread, p_lock for the + * containing process must be held. Even then, the caller is not protected + * from races with the thread in question updating its own fields. It is the + * responsibility of the caller to perform additional synchronization. + * */ void schedctl_finish_sigblock(kthread_t *t) diff --git a/usr/src/uts/common/os/shm.c b/usr/src/uts/common/os/shm.c index bacc595f78..5deae96d73 100644 --- a/usr/src/uts/common/os/shm.c +++ b/usr/src/uts/common/os/shm.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -319,6 +320,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) size_t share_size; struct shm_data ssd; uintptr_t align_hint; + long curprot; /* * Pick a share pagesize to use, if (!isspt(sp)). @@ -453,6 +455,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) } } + curprot = sp->shm_opts & SHM_PROT_MASK; if (!isspt(sp)) { error = sptcreate(size, &segspt, sp->shm_amp, prot, flags, share_szc); @@ -462,8 +465,8 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) } sp->shm_sptinfo->sptas = segspt->s_as; sp->shm_sptseg = segspt; - sp->shm_sptprot = prot; - } else if ((prot & sp->shm_sptprot) != sp->shm_sptprot) { + sp->shm_opts = (sp->shm_opts & ~SHM_PROT_MASK) | prot; + } else if ((prot & curprot) != curprot) { /* * Ensure we're attaching to an ISM segment with * fewer or equal permissions than what we're @@ -748,6 +751,23 @@ shmctl(int shmid, int cmd, void *arg) } break; + /* Stage segment for removal, but don't remove until last detach */ + case SHM_RMID: + if ((error = secpolicy_ipc_owner(cr, (kipc_perm_t *)sp)) != 0) + break; + + /* + * If attached, just mark it as a pending remove, otherwise + * we must perform the normal ipc_rmid now. 
+ */ + if ((sp->shm_perm.ipc_ref - 1) > 0) { + sp->shm_opts |= SHM_RM_PENDING; + } else { + mutex_exit(lock); + return (ipc_rmid(shm_svc, shmid, cr)); + } + break; + default: error = EINVAL; break; @@ -778,6 +798,23 @@ shm_detach(proc_t *pp, segacct_t *sap) sp->shm_ismattch--; sp->shm_dtime = gethrestime_sec(); sp->shm_lpid = pp->p_pid; + if ((sp->shm_opts & SHM_RM_PENDING) != 0 && + sp->shm_perm.ipc_ref == 2) { + /* + * If this is the last detach of the segment across the whole + * system then now we can perform the delayed IPC_RMID. + * The ipc_ref count has 1 for the original 'get' and one for + * each 'attach' (see 'stat' handling in shmctl). + */ + sp->shm_opts &= ~SHM_RM_PENDING; + mutex_enter(&shm_svc->ipcs_lock); + ipc_rmsvc(shm_svc, (kipc_perm_t *)sp); /* Drops lock */ + ASSERT(!MUTEX_HELD(&shm_svc->ipcs_lock)); + ASSERT(((kipc_perm_t *)sp)->ipc_ref > 0); + + /* Lock was dropped, need to retake it for following rele. */ + (void) ipc_lock(shm_svc, sp->shm_perm.ipc_id); + } ipc_rele(shm_svc, (kipc_perm_t *)sp); /* Drops lock */ kmem_free(sap, sizeof (segacct_t)); diff --git a/usr/src/uts/common/os/sig.c b/usr/src/uts/common/os/sig.c index 453b1f22d4..67a93581dd 100644 --- a/usr/src/uts/common/os/sig.c +++ b/usr/src/uts/common/os/sig.c @@ -22,7 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -60,6 +60,7 @@ #include <sys/cyclic.h> #include <sys/dtrace.h> #include <sys/sdt.h> +#include <sys/brand.h> #include <sys/signalfd.h> const k_sigset_t nullsmask = {0, 0, 0}; @@ -148,6 +149,21 @@ signal_is_blocked(kthread_t *t, int sig) } /* + * Return true if the signal can safely be ignored. + * That is, if the signal is included in the p_ignore mask and doing so is not + * forbidden by any process branding. + */ +static int +sig_ignorable(proc_t *p, klwp_t *lwp, int sig) +{ + return (sigismember(&p->p_ignore, sig) && /* sig in ignore mask */ + !(PROC_IS_BRANDED(p) && /* allowed by brand */ + BROP(p)->b_sig_ignorable != NULL && + BROP(p)->b_sig_ignorable(p, lwp, sig) == B_FALSE)); + +} + +/* * Return true if the signal can safely be discarded on generation. * That is, if there is no need for the signal on the receiving end. * The answer is true if the process is a zombie or @@ -159,12 +175,13 @@ signal_is_blocked(kthread_t *t, int sig) * the signal is not being accepted via sigwait() */ static int -sig_discardable(proc_t *p, int sig) +sig_discardable(proc_t *p, kthread_t *tp, int sig) { kthread_t *t = p->p_tlist; + klwp_t *lwp = (tp == NULL) ? NULL : tp->t_lwp; return (t == NULL || /* if zombie or ... 
*/ - (sigismember(&p->p_ignore, sig) && /* signal is ignored */ + (sig_ignorable(p, lwp, sig) && /* signal is ignored */ t->t_forw == t && /* and single-threaded */ !tracing(p, sig) && /* and no /proc tracing */ !signal_is_blocked(t, sig) && /* and signal not blocked */ @@ -200,7 +217,7 @@ eat_signal(kthread_t *t, int sig) !(ttoproc(t)->p_proc_flag & P_PR_LOCK)) { ttoproc(t)->p_stopsig = 0; t->t_dtrace_stop = 0; - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; setrun_locked(t); } else if (t != curthread && t->t_state == TS_ONPROC) { aston(t); /* make it do issig promptly */ @@ -297,7 +314,7 @@ sigtoproc(proc_t *p, kthread_t *t, int sig) } } - if (sig_discardable(p, sig)) { + if (sig_discardable(p, t, sig)) { DTRACE_PROC3(signal__discard, kthread_t *, p->p_tlist, proc_t *, p, int, sig); return; @@ -497,7 +514,7 @@ issig_justlooking(void) if (sigismember(&set, sig) && (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig))) { + !sig_ignorable(p, lwp, sig))) { /* * Don't promote a signal that will stop * the process when lwp_nostop is set. @@ -623,6 +640,28 @@ issig_forreal(void) } /* + * The brand hook name 'b_issig_stop' is a misnomer. + * Allow the brand the chance to alter (or suppress) delivery + * of this signal. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_issig_stop != NULL) { + int r; + + /* + * The brand hook will return 0 if it would like + * us to drive on, -1 if we should restart + * the loop to check other conditions, or 1 if we + * should terminate the loop. + */ + r = BROP(p)->b_issig_stop(p, lwp); + if (r < 0) { + continue; + } else if (r > 0) { + break; + } + } + + /* * Honor requested stop before dealing with the * current signal; a debugger may change it. * Do not want to go back to loop here since this is a special @@ -656,7 +695,7 @@ issig_forreal(void) lwp->lwp_cursig = 0; lwp->lwp_extsig = 0; if (sigismember(&t->t_sigwait, sig) || - (!sigismember(&p->p_ignore, sig) && + (!sig_ignorable(p, lwp, sig) && !isjobstop(sig))) { if (p->p_flag & (SEXITLWPS|SKILLED)) { sig = SIGKILL; @@ -708,7 +747,7 @@ issig_forreal(void) toproc = 0; if (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig)) { + !sig_ignorable(p, lwp, sig)) { if (sigismember(&t->t_extsig, sig)) ext = 1; break; @@ -722,7 +761,7 @@ issig_forreal(void) toproc = 1; if (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig)) { + !sig_ignorable(p, lwp, sig)) { if (sigismember(&p->p_extsig, sig)) ext = 1; break; @@ -954,6 +993,16 @@ stop(int why, int what) } break; + case PR_BRAND: + /* + * We have been stopped by the brand code for a brand-private + * reason. This is an asynchronous stop affecting only this + * LWP. + */ + VERIFY(PROC_IS_BRANDED(p)); + flags &= ~TS_BSTART; + break; + default: /* /proc stop */ flags &= ~TS_PSTART; /* @@ -1065,7 +1114,7 @@ stop(int why, int what) } } - if (why != PR_JOBCONTROL && why != PR_CHECKPOINT) { + if (why != PR_JOBCONTROL && why != PR_CHECKPOINT && why != PR_BRAND) { /* * Do process-level notification when all lwps are * either stopped on events of interest to /proc @@ -1171,6 +1220,13 @@ stop(int why, int what) if (why == PR_CHECKPOINT) del_one_utstop(); + /* + * Allow the brand to post notification of this stop condition. 
+ */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_stop_notify != NULL) { + BROP(p)->b_stop_notify(p, lwp, why, what); + } + thread_lock(t); ASSERT((t->t_schedflag & TS_ALLSTART) == 0); t->t_schedflag |= flags; @@ -1192,7 +1248,7 @@ stop(int why, int what) (p->p_flag & (SEXITLWPS|SKILLED))) { p->p_stopsig = 0; thread_lock(t); - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; setrun_locked(t); thread_unlock_nopreempt(t); } else if (why == PR_JOBCONTROL) { @@ -1327,7 +1383,7 @@ psig(void) * this signal from pending to current (we dropped p->p_lock). * This can happen only in a multi-threaded process. */ - if (sigismember(&p->p_ignore, sig) || + if (sig_ignorable(p, lwp, sig) || (func == SIG_DFL && sigismember(&stopdefault, sig))) { lwp->lwp_cursig = 0; lwp->lwp_extsig = 0; @@ -1771,9 +1827,12 @@ post_sigcld(proc_t *cp, sigqueue_t *sqp) /* * This can only happen when the parent is init. * (See call to sigcld(q, NULL) in exit().) - * Use KM_NOSLEEP to avoid deadlock. + * Use KM_NOSLEEP to avoid deadlock. The child procs + * initpid can be 1 for zlogin. */ - ASSERT(pp == proc_init); + ASSERT(pp->p_pidp->pid_id == + cp->p_zone->zone_proc_initpid || + pp->p_pidp->pid_id == 1); winfo(cp, &info, 0); sigaddq(pp, NULL, &info, KM_NOSLEEP); } else { @@ -1804,6 +1863,15 @@ sigcld_repost() sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); mutex_enter(&pidlock); + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_sigcld_repost != NULL) { + /* + * Allow the brand to inject synthetic SIGCLD signals. + */ + if (BROP(pp)->b_sigcld_repost(pp, sqp) == 0) { + mutex_exit(&pidlock); + return; + } + } for (cp = pp->p_child; cp; cp = cp->p_sibling) { if (cp->p_pidflag & CLDPEND) { post_sigcld(cp, sqp); @@ -2115,7 +2183,7 @@ sigaddqa(proc_t *p, kthread_t *t, sigqueue_t *sigqp) ASSERT(MUTEX_HELD(&p->p_lock)); ASSERT(sig >= 1 && sig < NSIG); - if (sig_discardable(p, sig)) + if (sig_discardable(p, t, sig)) siginfofree(sigqp); else sigaddqins(p, t, sigqp); @@ -2141,7 +2209,7 @@ sigaddq(proc_t *p, kthread_t *t, k_siginfo_t *infop, int km_flags) * blocking the signal (it *could* change it's mind while * the signal is pending) then don't bother creating one. */ - if (!sig_discardable(p, sig) && + if (!sig_discardable(p, t, sig) && (sigismember(&p->p_siginfo, sig) || (curproc->p_ct_process != p->p_ct_process) || (sig == SIGCLD && SI_FROMKERNEL(infop))) && diff --git a/usr/src/uts/common/os/smb_subr.c b/usr/src/uts/common/os/smb_subr.c index 6084676b17..6dc7230bed 100644 --- a/usr/src/uts/common/os/smb_subr.c +++ b/usr/src/uts/common/os/smb_subr.c @@ -25,7 +25,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2015 Joyent, Inc. All rights reserved. + */ #include <sys/smbios_impl.h> #include <sys/cmn_err.h> @@ -43,13 +45,13 @@ smb_strerror(int err) void * smb_alloc(size_t len) { - return (kmem_alloc(len, KM_SLEEP)); + return (len > 0 ? kmem_alloc(len, KM_SLEEP) : NULL); } void * smb_zalloc(size_t len) { - return (kmem_zalloc(len, KM_SLEEP)); + return (len > 0 ? kmem_zalloc(len, KM_SLEEP) : NULL); } void diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c index 1786769cfb..1f9ceee188 100644 --- a/usr/src/uts/common/os/streamio.c +++ b/usr/src/uts/common/os/streamio.c @@ -78,6 +78,7 @@ #include <sys/policy.h> #include <sys/dld.h> #include <sys/zone.h> +#include <sys/limits.h> #include <c2/audit.h> /* @@ -986,12 +987,20 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, * (registered in sd_wakeq). 
*/ struiod_t uiod; + struct iovec buf[IOV_MAX_STACK]; + int iovlen = 0; if (first) stp->sd_wakeq &= ~RSLEEP; - (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, - sizeof (uiod.d_iov) / sizeof (*uiod.d_iov)); + if (uiop->uio_iovcnt > IOV_MAX_STACK) { + iovlen = uiop->uio_iovcnt * sizeof (iovec_t); + uiod.d_iov = kmem_alloc(iovlen, KM_SLEEP); + } else { + uiod.d_iov = buf; + } + + (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt); uiod.d_mp = 0; /* * Mark that a thread is in rwnext on the read side @@ -1030,6 +1039,8 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, if ((bp = uiod.d_mp) != NULL) { *errorp = 0; ASSERT(MUTEX_HELD(&stp->sd_lock)); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (bp); } error = 0; @@ -1049,8 +1060,14 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, } else { *errorp = error; ASSERT(MUTEX_HELD(&stp->sd_lock)); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (NULL); } + + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); + /* * Try a getq in case a rwnext() generated mblk * has bubbled up via strrput(). @@ -2545,6 +2562,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, int b_flag, int pri, int flags) { struiod_t uiod; + struct iovec buf[IOV_MAX_STACK]; + int iovlen = 0; mblk_t *mp; queue_t *wqp = stp->sd_wrq; int error = 0; @@ -2636,13 +2655,21 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, mp->b_flag |= b_flag; mp->b_band = (uchar_t)pri; - (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, - sizeof (uiod.d_iov) / sizeof (*uiod.d_iov)); + if (uiop->uio_iovcnt > IOV_MAX_STACK) { + iovlen = uiop->uio_iovcnt * sizeof (iovec_t); + uiod.d_iov = (struct iovec *)kmem_alloc(iovlen, KM_SLEEP); + } else { + uiod.d_iov = buf; + } + + (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt); uiod.d_uio.uio_offset = 0; uiod.d_mp = mp; error = rwnext(wqp, &uiod); if (! 
uiod.d_mp) { uioskip(uiop, *iosize); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (error); } ASSERT(mp == uiod.d_mp); @@ -2660,17 +2687,23 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, error = 0; } else { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (error); } /* Have to check canput before consuming data from the uio */ if (pri == 0) { if (!canputnext(wqp) && !(flags & MSG_IGNFLOW)) { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (EWOULDBLOCK); } } else { if (!bcanputnext(wqp, pri) && !(flags & MSG_IGNFLOW)) { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (EWOULDBLOCK); } } @@ -2678,6 +2711,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, /* Copyin data from the uio */ if ((error = struioget(wqp, mp, &uiod, 0)) != 0) { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (error); } uioskip(uiop, *iosize); @@ -2694,6 +2729,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, putnext(wqp, mp); stream_runservice(stp); } + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (0); } @@ -3179,6 +3216,7 @@ job_control_type(int cmd) case JAGENT: /* Obsolete */ case JTRUN: /* Obsolete */ case JXTPROTO: /* Obsolete */ + case TIOCSETLD: return (JCSETP); } diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c index 8cc27df4eb..959e5576f0 100644 --- a/usr/src/uts/common/os/strsubr.c +++ b/usr/src/uts/common/os/strsubr.c @@ -26,6 +26,7 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2018 Joyent, Inc. * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. */ @@ -8461,6 +8462,12 @@ mblk_copycred(mblk_t *mp, const mblk_t *src) dbp->db_cpid = cpid; } + +/* + * Now that NIC drivers are expected to deal only with M_DATA mblks, the + * hcksum_assoc and hcksum_retrieve functions are deprecated in favor of their + * respective mac_hcksum_set and mac_hcksum_get counterparts. + */ int hcksum_assoc(mblk_t *mp, multidata_t *mmd, pdesc_t *pd, uint32_t start, uint32_t stuff, uint32_t end, uint32_t value, diff --git a/usr/src/uts/common/os/sunddi.c b/usr/src/uts/common/os/sunddi.c index ede7da413b..b1727729de 100644 --- a/usr/src/uts/common/os/sunddi.c +++ b/usr/src/uts/common/os/sunddi.c @@ -5903,6 +5903,12 @@ ddi_ffs(long mask) return (ffs(mask)); } +int +ddi_ffsll(long long mask) +{ + return (ffs(mask)); +} + /* * Find last bit set. Take mask and clear * all but the most significant bit, and @@ -5914,8 +5920,14 @@ ddi_ffs(long mask) int ddi_fls(long mask) { + return (ddi_flsll(mask)); +} + +int +ddi_flsll(long long mask) +{ while (mask) { - long nx; + long long nx; if ((nx = (mask & (mask - 1))) == 0) break; diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c index f1b6f2616c..554ba1b881 100644 --- a/usr/src/uts/common/os/sysent.c +++ b/usr/src/uts/common/os/sysent.c @@ -23,6 +23,7 @@ * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2012 Milan Jurik. All rights reserved. * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright (c) 2018, Joyent, Inc. 
*/ @@ -61,8 +62,7 @@ struct mmaplf32a; int access(char *, int); int alarm(int); int auditsys(struct auditcalls *, rval_t *); -int64_t brandsys(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, - uintptr_t); +int64_t brandsys(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t); intptr_t brk(caddr_t); int chdir(char *); int chmod(char *, int); @@ -647,7 +647,7 @@ struct sysent sysent[NSYSCALL] = SYSENT_NOSYS(), SYSENT_C("llseek", llseek32, 4)), /* 176 */ SYSENT_LOADABLE(), /* inst_sync */ - /* 177 */ SYSENT_CI("brandsys", brandsys, 6), + /* 177 */ SYSENT_CI("brandsys", brandsys, 5), /* 178 */ SYSENT_LOADABLE(), /* kaio */ /* 179 */ SYSENT_LOADABLE(), /* cpc */ /* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3), @@ -1002,7 +1002,7 @@ struct sysent sysent32[NSYSCALL] = /* 174 */ SYSENT_CI("pwrite", pwrite32, 4), /* 175 */ SYSENT_C("llseek", llseek32, 4), /* 176 */ SYSENT_LOADABLE32(), /* inst_sync */ - /* 177 */ SYSENT_CI("brandsys", brandsys, 6), + /* 177 */ SYSENT_CI("brandsys", brandsys, 5), /* 178 */ SYSENT_LOADABLE32(), /* kaio */ /* 179 */ SYSENT_LOADABLE32(), /* cpc */ /* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3), @@ -1094,18 +1094,20 @@ char **syscallnames; systrace_sysent_t *systrace_sysent; void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); /*ARGSUSED*/ void systrace_stub(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, - uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, + uintptr_t arg6, uintptr_t arg7) {} /*ARGSUSED*/ int64_t dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, + uintptr_t arg7) { systrace_sysent_t *sy = &systrace_sysent[curthread->t_sysnum]; dtrace_id_t id; @@ -1113,7 +1115,8 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, proc_t *p; if ((id = sy->stsy_entry) != DTRACE_IDNONE) - (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5); + (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, + arg6, arg7); /* * We want to explicitly allow DTrace consumers to stop a process @@ -1127,14 +1130,15 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, } mutex_exit(&p->p_lock); - rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); + rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, + arg6, arg7); if (ttolwp(curthread)->lwp_errno != 0) rval = -1; if ((id = sy->stsy_return) != DTRACE_IDNONE) (*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, - (uintptr_t)((int64_t)rval >> 32), 0, 0, 0); + (uintptr_t)((int64_t)rval >> 32), 0, 0, 0, 0, 0); return (rval); } @@ -1146,7 +1150,8 @@ systrace_sysent_t *systrace_sysent32; /*ARGSUSED*/ int64_t dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, + uintptr_t arg7) { systrace_sysent_t *sy = &systrace_sysent32[curthread->t_sysnum]; dtrace_id_t id; @@ -1154,7 +1159,8 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, proc_t *p; if ((id = sy->stsy_entry) != DTRACE_IDNONE) - (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5); + (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, arg6, + arg7); /* * We want to explicitly allow DTrace consumers to stop 
a process @@ -1168,14 +1174,15 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, } mutex_exit(&p->p_lock); - rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); + rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, arg6, + arg7); if (ttolwp(curthread)->lwp_errno != 0) rval = -1; if ((id = sy->stsy_return) != DTRACE_IDNONE) (*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, - (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0); + (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0, 0, 0); return (rval); } @@ -1203,5 +1210,5 @@ dtrace_systrace_rtt(void) } if ((id = sy->stsy_return) != DTRACE_IDNONE) - (*systrace_probe)(id, 0, 0, 0, 0, 0, 0); + (*systrace_probe)(id, 0, 0, 0, 0, 0, 0, 0, 0); } diff --git a/usr/src/uts/common/os/timer.c b/usr/src/uts/common/os/timer.c index b25a6cbcf1..5453ebf380 100644 --- a/usr/src/uts/common/os/timer.c +++ b/usr/src/uts/common/os/timer.c @@ -25,11 +25,12 @@ */ /* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright 2017 Joyent, Inc. */ #include <sys/timer.h> #include <sys/systm.h> +#include <sys/sysmacros.h> #include <sys/param.h> #include <sys/kmem.h> #include <sys/debug.h> @@ -81,6 +82,7 @@ timer_lock(proc_t *p, itimer_t *it) * waiters. p_lock must be held on entry; it will not be dropped by * timer_unlock(). */ +/* ARGSUSED */ static void timer_unlock(proc_t *p, itimer_t *it) { @@ -123,6 +125,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it) timer_lock(p, it); } + ASSERT(p->p_itimer_sz > tid); ASSERT(p->p_itimer[tid] == it); p->p_itimer[tid] = NULL; @@ -137,7 +140,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it) it->it_backend->clk_timer_delete(it); - if (it->it_portev) { + if (it->it_flags & IT_PORT) { mutex_enter(&it->it_mutex); if (it->it_portev) { port_kevent_t *pev; @@ -199,18 +202,20 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it) static itimer_t * timer_grab(proc_t *p, timer_t tid) { - itimer_t **itp, *it; + itimer_t *it; - if (tid >= timer_max || tid < 0) + if (tid < 0) { return (NULL); + } mutex_enter(&p->p_lock); - - if ((itp = p->p_itimer) == NULL || (it = itp[tid]) == NULL) { + if (p->p_itimer == NULL || tid >= p->p_itimer_sz || + (it = p->p_itimer[tid]) == NULL) { mutex_exit(&p->p_lock); return (NULL); } + /* This may drop p_lock temporarily. */ timer_lock(p, it); if (it->it_lock & ITLK_REMOVE) { @@ -232,7 +237,7 @@ timer_grab(proc_t *p, timer_t tid) * should not be held on entry; timer_release() will acquire p_lock but * will drop it before returning. */ -static void +void timer_release(proc_t *p, itimer_t *it) { mutex_enter(&p->p_lock); @@ -245,7 +250,7 @@ timer_release(proc_t *p, itimer_t *it) * p_lock should not be held on entry; timer_delete_grabbed() will acquire * p_lock, but will drop it before returning. */ -static void +void timer_delete_grabbed(proc_t *p, timer_t tid, itimer_t *it) { mutex_enter(&p->p_lock); @@ -258,6 +263,13 @@ clock_timer_init() { clock_timer_cache = kmem_cache_create("timer_cache", sizeof (itimer_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + + /* + * Push the timer_max limit up to at least 4 * NCPU. Due to the way + * NCPU is defined, proper initialization of the timer limit is + * performed at runtime. 
+ */ + timer_max = MAX(NCPU * 4, timer_max); } void @@ -453,6 +465,9 @@ timer_fire(itimer_t *it) it->it_pending = 1; port_send_event((port_kevent_t *)it->it_portev); mutex_exit(&it->it_mutex); + } else if (it->it_flags & IT_CALLBACK) { + it->it_cb_func(it); + ASSERT(MUTEX_NOT_HELD(&it->it_mutex)); } else if (it->it_flags & IT_SIGNAL) { it->it_pending = 1; mutex_exit(&it->it_mutex); @@ -466,159 +481,175 @@ timer_fire(itimer_t *it) mutex_exit(&p->p_lock); } -int -timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) +/* + * Allocate an itimer_t and find and appropriate slot for it in p_itimer. + * Acquires p_lock and holds it on return, regardless of success. + */ +static itimer_t * +timer_alloc(proc_t *p, timer_t *id) { - struct sigevent ev; - proc_t *p = curproc; - clock_backend_t *backend; - itimer_t *it, **itp; - sigqueue_t *sigq; - cred_t *cr = CRED(); - int error = 0; - timer_t i; - port_notify_t tim_pnevp; - port_kevent_t *pkevp = NULL; + itimer_t *it, **itp = NULL; + uint_t i; - if ((backend = CLOCK_BACKEND(clock)) == NULL) - return (set_errno(EINVAL)); + ASSERT(MUTEX_NOT_HELD(&p->p_lock)); - if (evp != NULL) { - /* - * short copyin() for binary compatibility - * fetch oldsigevent to determine how much to copy in. - */ - if (get_udatamodel() == DATAMODEL_NATIVE) { - if (copyin(evp, &ev, sizeof (struct oldsigevent))) - return (set_errno(EFAULT)); + it = kmem_cache_alloc(clock_timer_cache, KM_SLEEP); + bzero(it, sizeof (itimer_t)); + mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL); - if (ev.sigev_notify == SIGEV_PORT || - ev.sigev_notify == SIGEV_THREAD) { - if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp, - sizeof (port_notify_t))) - return (set_errno(EFAULT)); + mutex_enter(&p->p_lock); +retry: + if (p->p_itimer != NULL) { + for (i = 0; i < p->p_itimer_sz; i++) { + if (p->p_itimer[i] == NULL) { + itp = &(p->p_itimer[i]); + break; } -#ifdef _SYSCALL32_IMPL - } else { - struct sigevent32 ev32; - port_notify32_t tim_pnevp32; + } + } - if (copyin(evp, &ev32, sizeof (struct oldsigevent32))) - return (set_errno(EFAULT)); - ev.sigev_notify = ev32.sigev_notify; - ev.sigev_signo = ev32.sigev_signo; + /* + * A suitable slot was not found. If possible, allocate (or resize) + * the p_itimer array and try again. + */ + if (itp == NULL) { + uint_t target_sz = _TIMER_ALLOC_INIT; + itimer_t **itp_new; + + if (p->p_itimer != NULL) { + ASSERT(p->p_itimer_sz != 0); + + target_sz = p->p_itimer_sz * 2; + } + /* + * Protect against exceeding the max or overflow + */ + if (target_sz > timer_max || target_sz > INT_MAX || + target_sz < p->p_itimer_sz) { + kmem_cache_free(clock_timer_cache, it); + return (NULL); + } + mutex_exit(&p->p_lock); + itp_new = kmem_zalloc(target_sz * sizeof (itimer_t *), + KM_SLEEP); + mutex_enter(&p->p_lock); + if (target_sz <= p->p_itimer_sz) { /* - * See comment in sigqueue32() on handling of 32-bit - * sigvals in a 64-bit kernel. + * A racing thread performed the resize while we were + * waiting outside p_lock. Discard our now-useless + * allocation and retry. 
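Editor's note: timer_alloc() grows the p_itimer table by doubling, and because the allocation can sleep it must drop p_lock and tolerate a racing thread growing the table first. The following is a hedged userland sketch of that grow-and-retry pattern; the function name, TABLE_ALLOC_INIT, and the use of calloc()/pthreads in place of kmem_zalloc()/p_lock are assumptions for illustration only.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>

#define	TABLE_ALLOC_INIT	8	/* assumed starting size */

/*
 * Grow a pointer table protected by `lock`, which is held on entry and on
 * return.  Returns 0 when the table was grown, 1 when a racing thread grew
 * it while the lock was dropped (caller should rescan for a free slot), and
 * -1 when the doubled size would exceed `max` or the allocation failed.
 */
static int
table_grow(pthread_mutex_t *lock, void ***tablep, size_t *szp, size_t max)
{
	size_t cur = *szp;
	size_t target = (cur == 0) ? TABLE_ALLOC_INIT : cur * 2;
	void **bigger;

	if (target > max || target < cur)	/* cap and overflow check */
		return (-1);

	(void) pthread_mutex_unlock(lock);
	bigger = calloc(target, sizeof (void *));
	(void) pthread_mutex_lock(lock);

	if (bigger == NULL)
		return (-1);

	if (*szp >= target) {
		/* Someone else resized the table while the lock was dropped. */
		free(bigger);
		return (1);
	}

	if (*tablep != NULL)
		memcpy(bigger, *tablep, *szp * sizeof (void *));
	free(*tablep);
	*tablep = bigger;
	*szp = target;
	return (0);
}

int
main(void)
{
	pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	void **table = NULL;
	size_t sz = 0;

	(void) pthread_mutex_lock(&lock);
	while (table_grow(&lock, &table, &sz, 4096) == 0)
		;
	(void) pthread_mutex_unlock(&lock);

	(void) printf("table capped at %zu slots\n", sz);	/* 4096 */
	free(table);
	return (0);
}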
*/ - ev.sigev_value.sival_int = ev32.sigev_value.sival_int; - if (ev.sigev_notify == SIGEV_PORT || - ev.sigev_notify == SIGEV_THREAD) { - if (copyin((void *)(uintptr_t) - ev32.sigev_value.sival_ptr, - (void *)&tim_pnevp32, - sizeof (port_notify32_t))) - return (set_errno(EFAULT)); - tim_pnevp.portnfy_port = - tim_pnevp32.portnfy_port; - tim_pnevp.portnfy_user = - (void *)(uintptr_t)tim_pnevp32.portnfy_user; + kmem_free(itp_new, target_sz * sizeof (itimer_t *)); + goto retry; + } else { + /* + * Instantiate the larger allocation and select the + * first fresh entry for use. + */ + if (p->p_itimer != NULL) { + uint_t old_sz; + + old_sz = p->p_itimer_sz; + bcopy(p->p_itimer, itp_new, + old_sz * sizeof (itimer_t *)); + kmem_free(p->p_itimer, + old_sz * sizeof (itimer_t *)); + + /* + * Short circuit to use the first free entry in + * the new allocation. It's possible that + * other lower-indexed timers were freed while + * p_lock was dropped, but skipping over them + * is not harmful at all. In the common case, + * we skip the need to walk over an array + * filled with timers before arriving at the + * slot we know is fresh from the allocation. + */ + i = old_sz; + } else { + /* + * For processes lacking any existing timers, + * we can simply select the first entry. + */ + i = 0; } -#endif + p->p_itimer = itp_new; + p->p_itimer_sz = target_sz; } - switch (ev.sigev_notify) { - case SIGEV_NONE: - break; - case SIGEV_SIGNAL: - if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG) - return (set_errno(EINVAL)); - break; - case SIGEV_THREAD: - case SIGEV_PORT: - break; - default: - return (set_errno(EINVAL)); - } - } else { - /* - * Use the clock's default sigevent (this is a structure copy). - */ - ev = backend->clk_default; } + ASSERT(i <= INT_MAX); + *id = (timer_t)i; + return (it); +} + +/* + * Setup a timer + * + * This allocates an itimer_t (including a timer_t ID and slot in the process), + * wires it up according to the provided sigevent, and associates it with the + * desired clock backend. Upon successful completion, the timer will be + * locked, preventing it from being armed via timer_settime() or deleted via + * timer_delete(). This gives the caller a chance to perform any last minute + * manipulations (such as configuring the IT_CALLBACK functionality and/or + * copying the timer_t out to userspace) before using timer_release() to unlock + * it or timer_delete_grabbed() to delete it. + */ +int +timer_setup(clock_backend_t *backend, struct sigevent *evp, port_notify_t *pnp, + itimer_t **itp, timer_t *tidp) +{ + proc_t *p = curproc; + int error = 0; + itimer_t *it; + sigqueue_t *sigq; + timer_t tid; + /* - * We'll allocate our timer and sigqueue now, before we grab p_lock. - * If we can't find an empty slot, we'll free them before returning. + * We'll allocate our sigqueue now, before we grab p_lock. + * If we can't find an empty slot, we'll free it before returning. */ - it = kmem_cache_alloc(clock_timer_cache, KM_SLEEP); - bzero(it, sizeof (itimer_t)); - mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL); sigq = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); - mutex_enter(&p->p_lock); - /* - * If this is this process' first timer, we need to attempt to allocate - * an array of timerstr_t pointers. We drop p_lock to perform the - * allocation; if we return to discover that p_itimer is non-NULL, - * we will free our allocation and drive on. + * Allocate a timer and choose a slot for it. This acquires p_lock. 
*/ - if ((itp = p->p_itimer) == NULL) { - mutex_exit(&p->p_lock); - itp = kmem_zalloc(timer_max * sizeof (itimer_t *), KM_SLEEP); - mutex_enter(&p->p_lock); - - if (p->p_itimer == NULL) - p->p_itimer = itp; - else { - kmem_free(itp, timer_max * sizeof (itimer_t *)); - itp = p->p_itimer; - } - } - - for (i = 0; i < timer_max && itp[i] != NULL; i++) - continue; + it = timer_alloc(p, &tid); + ASSERT(MUTEX_HELD(&p->p_lock)); - if (i == timer_max) { - /* - * We couldn't find a slot. Drop p_lock, free the preallocated - * timer and sigqueue, and return an error. - */ + if (it == NULL) { mutex_exit(&p->p_lock); - kmem_cache_free(clock_timer_cache, it); kmem_free(sigq, sizeof (sigqueue_t)); - - return (set_errno(EAGAIN)); + return (EAGAIN); } - ASSERT(i < timer_max && itp[i] == NULL); + ASSERT(tid < p->p_itimer_sz && p->p_itimer[tid] == NULL); + ASSERT(evp != NULL); /* * If we develop other notification mechanisms, this will need * to call into (yet another) backend. */ - sigq->sq_info.si_signo = ev.sigev_signo; - if (evp == NULL) - sigq->sq_info.si_value.sival_int = i; - else - sigq->sq_info.si_value = ev.sigev_value; + sigq->sq_info.si_signo = evp->sigev_signo; + sigq->sq_info.si_value = evp->sigev_value; sigq->sq_info.si_code = SI_TIMER; sigq->sq_info.si_pid = p->p_pid; sigq->sq_info.si_ctid = PRCTID(p); sigq->sq_info.si_zoneid = getzoneid(); - sigq->sq_info.si_uid = crgetruid(cr); + sigq->sq_info.si_uid = crgetruid(CRED()); sigq->sq_func = timer_signal; sigq->sq_next = NULL; sigq->sq_backptr = it; it->it_sigq = sigq; it->it_backend = backend; it->it_lock = ITLK_LOCKED; - itp[i] = it; - - if (ev.sigev_notify == SIGEV_THREAD || - ev.sigev_notify == SIGEV_PORT) { + if (evp->sigev_notify == SIGEV_THREAD || + evp->sigev_notify == SIGEV_PORT) { int port; + port_kevent_t *pkevp = NULL; + + ASSERT(pnp != NULL); /* * This timer is programmed to use event port notification when @@ -638,18 +669,17 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) */ it->it_flags |= IT_PORT; - port = tim_pnevp.portnfy_port; + port = pnp->portnfy_port; /* associate timer as event source with the port */ error = port_associate_ksource(port, PORT_SOURCE_TIMER, (port_source_t **)&it->it_portsrc, timer_close_port, (void *)it, NULL); if (error) { - itp[i] = NULL; /* clear slot */ mutex_exit(&p->p_lock); kmem_cache_free(clock_timer_cache, it); kmem_free(sigq, sizeof (sigqueue_t)); - return (set_errno(error)); + return (error); } /* allocate an event structure/slot */ @@ -658,23 +688,24 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) if (error) { (void) port_dissociate_ksource(port, PORT_SOURCE_TIMER, (port_source_t *)it->it_portsrc); - itp[i] = NULL; /* clear slot */ mutex_exit(&p->p_lock); kmem_cache_free(clock_timer_cache, it); kmem_free(sigq, sizeof (sigqueue_t)); - return (set_errno(error)); + return (error); } /* initialize event data */ - port_init_event(pkevp, i, tim_pnevp.portnfy_user, + port_init_event(pkevp, tid, pnp->portnfy_user, timer_port_callback, it); it->it_portev = pkevp; it->it_portfd = port; } else { - if (ev.sigev_notify == SIGEV_SIGNAL) + if (evp->sigev_notify == SIGEV_SIGNAL) it->it_flags |= IT_SIGNAL; } + /* Populate the slot now that the timer is prepped. 
*/ + p->p_itimer[tid] = it; mutex_exit(&p->p_lock); /* @@ -687,17 +718,8 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) it->it_lwp = ttolwp(curthread); it->it_proc = p; - if (copyout(&i, tid, sizeof (timer_t)) != 0) { - error = EFAULT; - goto err; - } - - /* - * If we're here, then we have successfully created the timer; we - * just need to release the timer and return. - */ - timer_release(p, it); - + *itp = it; + *tidp = tid; return (0); err: @@ -708,11 +730,115 @@ err: * impossible for a removal to be pending. */ ASSERT(!(it->it_lock & ITLK_REMOVE)); - timer_delete_grabbed(p, i, it); + timer_delete_grabbed(p, tid, it); - return (set_errno(error)); + return (error); } + +int +timer_create(clockid_t clock, struct sigevent *evp, timer_t *tidp) +{ + int error = 0; + proc_t *p = curproc; + clock_backend_t *backend; + struct sigevent ev; + itimer_t *it; + timer_t tid; + port_notify_t tim_pnevp; + + if ((backend = CLOCK_BACKEND(clock)) == NULL) + return (set_errno(EINVAL)); + + if (evp != NULL) { + /* + * short copyin() for binary compatibility + * fetch oldsigevent to determine how much to copy in. + */ + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(evp, &ev, sizeof (struct oldsigevent))) + return (set_errno(EFAULT)); + + if (ev.sigev_notify == SIGEV_PORT || + ev.sigev_notify == SIGEV_THREAD) { + if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp, + sizeof (port_notify_t))) + return (set_errno(EFAULT)); + } +#ifdef _SYSCALL32_IMPL + } else { + struct sigevent32 ev32; + port_notify32_t tim_pnevp32; + + if (copyin(evp, &ev32, sizeof (struct oldsigevent32))) + return (set_errno(EFAULT)); + ev.sigev_notify = ev32.sigev_notify; + ev.sigev_signo = ev32.sigev_signo; + /* + * See comment in sigqueue32() on handling of 32-bit + * sigvals in a 64-bit kernel. + */ + ev.sigev_value.sival_int = ev32.sigev_value.sival_int; + if (ev.sigev_notify == SIGEV_PORT || + ev.sigev_notify == SIGEV_THREAD) { + if (copyin((void *)(uintptr_t) + ev32.sigev_value.sival_ptr, + (void *)&tim_pnevp32, + sizeof (port_notify32_t))) + return (set_errno(EFAULT)); + tim_pnevp.portnfy_port = + tim_pnevp32.portnfy_port; + tim_pnevp.portnfy_user = + (void *)(uintptr_t)tim_pnevp32.portnfy_user; + } +#endif + } + switch (ev.sigev_notify) { + case SIGEV_NONE: + break; + case SIGEV_SIGNAL: + if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG) + return (set_errno(EINVAL)); + break; + case SIGEV_THREAD: + case SIGEV_PORT: + break; + default: + return (set_errno(EINVAL)); + } + } else { + /* + * Use the clock's default sigevent (this is a structure copy). + */ + ev = backend->clk_default; + } + + if ((error = timer_setup(backend, &ev, &tim_pnevp, &it, &tid)) != 0) { + return (set_errno(error)); + } + + /* + * Populate si_value with the timer ID if no sigevent was passed in. + */ + if (evp == NULL) { + it->it_sigq->sq_info.si_value.sival_int = tid; + } + + if (copyout(&tid, tidp, sizeof (timer_t)) != 0) { + timer_delete_grabbed(p, tid, it); + return (set_errno(EFAULT)); + } + + /* + * If we're here, then we have successfully created the timer; we + * just need to release the timer and return. 
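Editor's note: the refactoring above changes only the kernel-side plumbing (timer_setup() plus a locked hand-off so the ID can be copied out before the timer is released); the userland-visible POSIX interface is unchanged. For context, a minimal consumer of that interface using signal notification might look like the following. This is illustrative only and not part of the change.

#include <signal.h>
#include <stdio.h>
#include <time.h>

int
main(void)
{
	struct sigevent ev = { 0 };
	struct itimerspec its = { 0 };
	timer_t tid;

	(void) signal(SIGUSR1, SIG_IGN);	/* ignore the notification */

	ev.sigev_notify = SIGEV_SIGNAL;
	ev.sigev_signo = SIGUSR1;
	ev.sigev_value.sival_int = 42;		/* shows up in si_value */

	if (timer_create(CLOCK_REALTIME, &ev, &tid) != 0) {
		perror("timer_create");
		return (1);
	}

	its.it_value.tv_sec = 1;		/* one-shot, 1s from now */
	if (timer_settime(tid, 0, &its, NULL) != 0) {
		perror("timer_settime");
		return (1);
	}

	(void) timer_delete(tid);
	return (0);
}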
+ */ + timer_release(p, it); + + return (0); +} + + int timer_gettime(timer_t tid, itimerspec_t *val) { @@ -832,20 +958,23 @@ timer_getoverrun(timer_t tid) void timer_lwpexit(void) { - timer_t i; + uint_t i; proc_t *p = curproc; klwp_t *lwp = ttolwp(curthread); - itimer_t *it, **itp; + itimer_t *it; ASSERT(MUTEX_HELD(&p->p_lock)); - if ((itp = p->p_itimer) == NULL) + if (p->p_itimer == NULL) { return; + } - for (i = 0; i < timer_max; i++) { - if ((it = itp[i]) == NULL) + for (i = 0; i < p->p_itimer_sz; i++) { + if ((it = p->p_itimer[i]) == NULL) { continue; + } + /* This may drop p_lock temporarily. */ timer_lock(p, it); if ((it->it_lock & ITLK_REMOVE) || it->it_lwp != lwp) { @@ -876,20 +1005,22 @@ timer_lwpexit(void) void timer_lwpbind() { - timer_t i; + uint_t i; proc_t *p = curproc; klwp_t *lwp = ttolwp(curthread); - itimer_t *it, **itp; + itimer_t *it; ASSERT(MUTEX_HELD(&p->p_lock)); - if ((itp = p->p_itimer) == NULL) + if (p->p_itimer == NULL) { return; + } - for (i = 0; i < timer_max; i++) { - if ((it = itp[i]) == NULL) + for (i = 0; i < p->p_itimer_sz; i++) { + if ((it = p->p_itimer[i]) == NULL) continue; + /* This may drop p_lock temporarily. */ timer_lock(p, it); if (!(it->it_lock & ITLK_REMOVE) && it->it_lwp == lwp) { @@ -911,16 +1042,19 @@ timer_lwpbind() void timer_exit(void) { - timer_t i; + uint_t i; proc_t *p = curproc; ASSERT(p->p_itimer != NULL); + ASSERT(p->p_itimer_sz != 0); - for (i = 0; i < timer_max; i++) - (void) timer_delete(i); + for (i = 0; i < p->p_itimer_sz; i++) { + (void) timer_delete((timer_t)i); + } - kmem_free(p->p_itimer, timer_max * sizeof (itimer_t *)); + kmem_free(p->p_itimer, p->p_itimer_sz * sizeof (itimer_t *)); p->p_itimer = NULL; + p->p_itimer_sz = 0; } /* @@ -977,7 +1111,7 @@ timer_close_port(void *arg, int port, pid_t pid, int lastclose) for (tid = 0; tid < timer_max; tid++) { if ((it = timer_grab(p, tid)) == NULL) continue; - if (it->it_portev) { + if (it->it_flags & IT_PORT) { mutex_enter(&it->it_mutex); if (it->it_portfd == port) { port_kevent_t *pev; diff --git a/usr/src/uts/common/os/timers.c b/usr/src/uts/common/os/timers.c index 61acc6cf97..53be806026 100644 --- a/usr/src/uts/common/os/timers.c +++ b/usr/src/uts/common/os/timers.c @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ /* @@ -1172,6 +1173,14 @@ timespectohz64(timespec_t *tv) void hrt2ts(hrtime_t hrt, timestruc_t *tsp) { +#if defined(__amd64) + /* + * The cleverness explained above is unecessary on x86_64 CPUs where + * modern compilers are able to optimize down to faster operations. + */ + tsp->tv_sec = hrt / NANOSEC; + tsp->tv_nsec = hrt % NANOSEC; +#else uint32_t sec, nsec, tmp; tmp = (uint32_t)(hrt >> 30); @@ -1193,20 +1202,28 @@ hrt2ts(hrtime_t hrt, timestruc_t *tsp) } tsp->tv_sec = (time_t)sec; tsp->tv_nsec = nsec; +#endif /* defined(__amd64) */ } /* * Convert from timestruc_t to hrtime_t. - * - * The code below is equivalent to: - * - * hrt = tsp->tv_sec * NANOSEC + tsp->tv_nsec; - * - * but requires no integer multiply. */ hrtime_t ts2hrt(const timestruc_t *tsp) { +#if defined(__amd64) || defined(__i386) + /* + * On modern x86 CPUs, the simple version is faster. + */ + return ((tsp->tv_sec * NANOSEC) + tsp->tv_nsec); +#else + /* + * The code below is equivalent to: + * + * hrt = tsp->tv_sec * NANOSEC + tsp->tv_nsec; + * + * but requires no integer multiply. 
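Editor's note: the new __amd64 paths above simply divide and take the remainder instead of the shift-and-subtract sequences. Below is a self-contained sketch of those straightforward conversions, with NANOSEC and minimal stand-in types defined locally; this is ordinary userland C, not the kernel code.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define	NANOSEC	1000000000LL

typedef int64_t hrtime_t;
typedef struct { int64_t tv_sec; long tv_nsec; } timestruc_t;

static void
hrt2ts(hrtime_t hrt, timestruc_t *tsp)
{
	tsp->tv_sec = hrt / NANOSEC;	/* whole seconds */
	tsp->tv_nsec = hrt % NANOSEC;	/* leftover nanoseconds */
}

static hrtime_t
ts2hrt(const timestruc_t *tsp)
{
	return (tsp->tv_sec * NANOSEC + tsp->tv_nsec);
}

int
main(void)
{
	hrtime_t hrt = 5000000123LL;	/* 5 s + 123 ns */
	timestruc_t ts;

	hrt2ts(hrt, &ts);
	assert(ts.tv_sec == 5 && ts.tv_nsec == 123);
	assert(ts2hrt(&ts) == hrt);	/* conversions round-trip exactly */
	(void) printf("%lld s %ld ns\n", (long long)ts.tv_sec, ts.tv_nsec);
	return (0);
}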
+ */ hrtime_t hrt; hrt = tsp->tv_sec; @@ -1215,6 +1232,7 @@ ts2hrt(const timestruc_t *tsp) hrt = (hrt << 7) - hrt - hrt - hrt; hrt = (hrt << 9) + tsp->tv_nsec; return (hrt); +#endif /* defined(__amd64) || defined(__i386) */ } /* @@ -1246,6 +1264,13 @@ tv2hrt(struct timeval *tvp) void hrt2tv(hrtime_t hrt, struct timeval *tvp) { +#if defined(__amd64) + /* + * Like hrt2ts, the simple version is faster on x86_64. + */ + tvp->tv_sec = hrt / NANOSEC; + tvp->tv_usec = (hrt % NANOSEC) / (NANOSEC / MICROSEC); +#else uint32_t sec, nsec, tmp; uint32_t q, r, t; @@ -1267,17 +1292,17 @@ hrt2tv(hrtime_t hrt, struct timeval *tvp) sec++; } tvp->tv_sec = (time_t)sec; -/* - * this routine is very similar to hr2ts, but requires microseconds - * instead of nanoseconds, so an interger divide by 1000 routine - * completes the conversion - */ + /* + * this routine is very similar to hr2ts, but requires microseconds + * instead of nanoseconds, so an interger divide by 1000 routine + * completes the conversion + */ t = (nsec >> 7) + (nsec >> 8) + (nsec >> 12); q = (nsec >> 1) + t + (nsec >> 15) + (t >> 11) + (t >> 14); q = q >> 9; r = nsec - q*1000; tvp->tv_usec = q + ((r + 24) >> 10); - +#endif /* defined(__amd64) */ } int diff --git a/usr/src/uts/common/os/vm_pageout.c b/usr/src/uts/common/os/vm_pageout.c index 608208bbca..f5ee76a2cb 100644 --- a/usr/src/uts/common/os/vm_pageout.c +++ b/usr/src/uts/common/os/vm_pageout.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -58,6 +59,7 @@ #include <sys/tnf_probe.h> #include <sys/mem_cage.h> #include <sys/time.h> +#include <sys/zone.h> #include <vm/hat.h> #include <vm/as.h> @@ -73,7 +75,7 @@ static int checkpage(page_t *, int); * algorithm. They are initialized to 0, and then computed at boot time * based on the size of the system. If they are patched non-zero in * a loaded vmunix they are left alone and may thus be changed per system - * using adb on the loaded system. + * using mdb on the loaded system. */ pgcnt_t slowscan = 0; pgcnt_t fastscan = 0; @@ -81,6 +83,7 @@ pgcnt_t fastscan = 0; static pgcnt_t handspreadpages = 0; static int loopfraction = 2; static pgcnt_t looppages; +/* See comment below describing 4% and 80% */ static int min_percent_cpu = 4; static int max_percent_cpu = 80; static pgcnt_t maxfastscan = 0; @@ -98,14 +101,34 @@ pgcnt_t deficit; pgcnt_t nscan; pgcnt_t desscan; +/* kstats */ +uint64_t low_mem_scan; +uint64_t zone_cap_scan; +uint64_t n_throttle; + +clock_t zone_pageout_ticks; /* tunable to change zone pagescan ticks */ + /* * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks * are the number of ticks in each wakeup cycle that gives the * equivalent of some underlying %CPU duty cycle. - * When RATETOSCHEDPAGING is 4, and hz is 100, pageout_scanner is - * awakened every 25 clock ticks. So, converting from %CPU to ticks - * per wakeup cycle would be x% of 25, that is (x * 100) / 25. - * So, for example, 4% == 1 tick and 80% == 20 ticks. + * + * For example, when RATETOSCHEDPAGING is 4 (the default), then schedpaging() + * will run 4 times/sec to update pageout scanning parameters and kickoff + * the pageout_scanner() thread if necessary. + * + * Given hz is 100, min_pageout_ticks will be set to 1 (1% of a CPU). 
When + * pageout_ticks is set to min_pageout_ticks, then the total CPU time consumed + * by the scanner in a 1 second interval is 4% of a CPU (RATETOSCHEDPAGING * 1). + * + * Given hz is 100, max_pageout_ticks will be set to 20 (20% of a CPU). When + * pageout_ticks is set to max_pageout_ticks, then the total CPU time consumed + * by the scanner in a 1 second interval is 80% of a CPU + * (RATETOSCHEDPAGING * 20). There is no point making max_pageout_ticks >25 + * since schedpaging() runs RATETOSCHEDPAGING (4) times/sec. + * + * If hz is 1000, then min_pageout_ticks will be 10 and max_pageout_ticks + * will be 200, so the CPU percentages are the same as when hz is 100. * * min_pageout_ticks: * ticks/wakeup equivalent of min_percent_cpu. @@ -117,19 +140,29 @@ pgcnt_t desscan; * Number of clock ticks budgeted for each wakeup cycle. * Computed each time around by schedpaging(). * Varies between min_pageout_ticks .. max_pageout_ticks, - * depending on memory pressure. - * - * pageout_lbolt: - * Timestamp of the last time pageout_scanner woke up and started - * (or resumed) scanning for not recently referenced pages. + * depending on memory pressure or zones over their cap. */ static clock_t min_pageout_ticks; static clock_t max_pageout_ticks; static clock_t pageout_ticks; -static clock_t pageout_lbolt; -static uint_t reset_hands; +#define MAX_PSCAN_THREADS 16 +static boolean_t reset_hands[MAX_PSCAN_THREADS]; + +/* + * These can be tuned in /etc/system or set with mdb. + * 'des_page_scanners' is the desired number of page scanner threads. The + * system will bring the actual number of threads into line with the desired + * number. If des_page_scanners is set to an invalid value, the system will + * correct the setting. + */ +uint_t des_page_scanners; +uint_t pageout_reset_cnt = 64; /* num. cycles for pageout_scanner hand reset */ + +uint_t n_page_scanners; +static pgcnt_t pscan_region_sz; /* informational only */ + #define PAGES_POLL_MASK 1023 @@ -145,33 +178,37 @@ static uint_t reset_hands; * pageout_sample_pages: * The accumulated number of pages scanned during sampling. * - * pageout_sample_ticks: - * The accumulated clock ticks for the sample. + * pageout_sample_etime: + * The accumulated number of nanoseconds for the sample. * * pageout_rate: - * Rate in pages/nanosecond, computed at the end of sampling. + * Rate in pages/second, computed at the end of sampling. * * pageout_new_spread: - * The new value to use for fastscan and handspreadpages. - * Calculated after enough samples have been taken. + * The new value to use for maxfastscan and (perhaps) handspreadpages. + * Intended to be the number pages that can be scanned per sec using ~10% + * of a CPU. Calculated after enough samples have been taken. + * pageout_rate / 10 */ typedef hrtime_t hrrate_t; -static uint64_t pageout_sample_lim = 4; -static uint64_t pageout_sample_cnt = 0; +static uint_t pageout_sample_lim = 4; +static uint_t pageout_sample_cnt = 0; static pgcnt_t pageout_sample_pages = 0; static hrrate_t pageout_rate = 0; static pgcnt_t pageout_new_spread = 0; -static clock_t pageout_cycle_ticks; -static hrtime_t sample_start, sample_end; static hrtime_t pageout_sample_etime = 0; +/* True if page scanner is first starting up */ +#define PAGE_SCAN_STARTUP (pageout_sample_cnt < pageout_sample_lim) + /* * Record number of times a pageout_scanner wakeup cycle finished because it * timed out (exceeded its CPU budget), rather than because it visited - * its budgeted number of pages. + * its budgeted number of pages. 
This is only done when scanning under low + * free memory conditions, not when scanning for zones over their cap. */ uint64_t pageout_timeouts = 0; @@ -194,25 +231,35 @@ kcondvar_t memavail_cv; #define LOOPPAGES total_pages /* - * Set up the paging constants for the clock algorithm. - * Called after the system is initialized and the amount of memory - * and number of paging devices is known. + * Local boolean to control scanning when zones are over their cap. Avoids + * accessing the zone_num_over_cap variable except within schedpaging(), which + * only runs periodically. This is here only to reduce our access to + * zone_num_over_cap, since it is already accessed a lot during paging, and + * the page scanner accesses the zones_over variable on each page during a + * scan. There is no lock needed for zone_num_over_cap since schedpaging() + * doesn't modify the variable, it only cares if the variable is 0 or non-0. + */ +static boolean_t zones_over = B_FALSE; + +/* + * Set up the paging constants for the page scanner clock-hand algorithm. + * Called at startup after the system is initialized and the amount of memory + * and number of paging devices is known (recalc will be 0). Called again once + * PAGE_SCAN_STARTUP is true after the scanner has collected enough samples + * (recalc will be 1). + * + * Will also be called after a memory dynamic reconfiguration operation and + * recalc will be 1 in those cases too. * - * lotsfree is 1/64 of memory, but at least 512K. + * lotsfree is 1/64 of memory, but at least 512K (ha!). * desfree is 1/2 of lotsfree. * minfree is 1/2 of desfree. - * - * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set: - * - * lotsfree = btop(512K) - * desfree = btop(200K) - * minfree = btop(100K) - * throttlefree = INT_MIN - * max_percent_cpu = 4 */ void setupclock(int recalc) { + uint_t i; + pgcnt_t sz, tmp; static spgcnt_t init_lfree, init_dfree, init_mfree; static spgcnt_t init_tfree, init_preserve, init_mpgio; @@ -221,8 +268,8 @@ setupclock(int recalc) looppages = LOOPPAGES; /* - * setupclock can now be called to recalculate the paging - * parameters in the case of dynamic addition of memory. + * setupclock can be called to recalculate the paging + * parameters in the case of dynamic reconfiguration of memory. * So to make sure we make the proper calculations, if such a * situation should arise, we save away the initial values * of each parameter so we can recall them when needed. This @@ -311,105 +358,98 @@ setupclock(int recalc) maxpgio = init_mpgio; /* - * The clock scan rate varies between fastscan and slowscan - * based on the amount of free memory available. Fastscan - * rate should be set based on the number pages that can be - * scanned per sec using ~10% of processor time. Since this - * value depends on the processor, MMU, Mhz etc., it is - * difficult to determine it in a generic manner for all - * architectures. + * When the system is in a low memory state, the page scan rate varies + * between fastscan and slowscan based on the amount of free memory + * available. When only zones are over their memory cap, the scan rate + * is always fastscan. * - * Instead of trying to determine the number of pages scanned - * per sec for every processor, fastscan is set to be the smaller - * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling - * time is limited to ~4% of processor time. + * The fastscan rate should be set based on the number pages that can + * be scanned per sec using ~10% of a CPU. 
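Editor's note: the lotsfree/desfree/minfree derivation quoted above (1/64 of memory but at least 512K, then successive halvings) can be illustrated with a few lines of arithmetic. The 4K page size and the 8GB example below are assumptions for illustration, not values taken from this change.

#include <stdio.h>

#define	PAGESIZE	4096ULL			/* assumed page size */
#define	BTOP(b)		((b) / PAGESIZE)	/* bytes to pages */

int
main(void)
{
	unsigned long long total_pages = BTOP(8ULL << 30);	/* 8GB of RAM */
	unsigned long long lotsfree, desfree, minfree;

	lotsfree = total_pages / 64;
	if (lotsfree < BTOP(512ULL * 1024))
		lotsfree = BTOP(512ULL * 1024);	/* 512K floor */
	desfree = lotsfree / 2;
	minfree = desfree / 2;

	(void) printf("lotsfree %llu desfree %llu minfree %llu (pages)\n",
	    lotsfree, desfree, minfree);
	/* With 8GB of 4K pages: lotsfree 32768, desfree 16384, minfree 8192. */
	return (0);
}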
Since this value depends on + * the processor, MMU, Ghz etc., it must be determined dynamically. * - * Setting fastscan to be 1/2 of memory allows pageout to scan - * all of memory in ~2 secs. This implies that user pages not - * accessed within 1 sec (assuming, handspreadpages == fastscan) - * can be reclaimed when free memory is very low. Stealing pages - * not accessed within 1 sec seems reasonable and ensures that - * active user processes don't thrash. + * When the scanner first starts up, fastscan will be set to 0 and + * maxfastscan will be set to MAXHANDSPREADPAGES (64MB, in pages). + * However, once the scanner has collected enough samples, then fastscan + * is set to be the smaller of 1/2 of memory (looppages / loopfraction) + * or maxfastscan (which is set from pageout_new_spread). Thus, + * MAXHANDSPREADPAGES is irrelevant after the scanner is fully + * initialized. * - * Smaller values of fastscan result in scanning fewer pages - * every second and consequently pageout may not be able to free - * sufficient memory to maintain the minimum threshold. Larger - * values of fastscan result in scanning a lot more pages which - * could lead to thrashing and higher CPU usage. + * pageout_new_spread is calculated when the scanner first starts + * running. During this initial sampling period the nscan_limit + * is set to the total_pages of system memory. Thus, the scanner could + * theoretically scan all of memory in one pass. However, each sample + * is also limited by the %CPU budget. This is controlled by + * pageout_ticks which is set in schedpaging(). During the sampling + * period, pageout_ticks is set to max_pageout_ticks. This tick value + * is derived from the max_percent_cpu (80%) described above. On a + * system with more than a small amount of memory (~8GB), the scanner's + * %CPU will be the limiting factor in calculating pageout_new_spread. * - * Fastscan needs to be limited to a maximum value and should not - * scale with memory to prevent pageout from consuming too much - * time for scanning on slow CPU's and avoid thrashing, as a - * result of scanning too many pages, on faster CPU's. - * The value of 64 Meg was chosen for MAXHANDSPREADPAGES - * (the upper bound for fastscan) based on the average number - * of pages that can potentially be scanned in ~1 sec (using ~4% - * of the CPU) on some of the following machines that currently - * run Solaris 2.x: + * At the end of the sampling period, the pageout_rate indicates how + * many pages could be scanned per second. The pageout_new_spread is + * then set to be 1/10th of that (i.e. approximating 10% of a CPU). + * Of course, this value could still be more than the physical memory + * on the system. If so, fastscan is set to 1/2 of memory, as + * mentioned above. * - * average memory scanned in ~1 sec + * All of this leads up to the setting of handspreadpages, which is + * set to fastscan. This is the distance, in pages, between the front + * and back hands during scanning. It will dictate which pages will + * be considered "hot" on the backhand and which pages will be "cold" + * and reclaimed * - * 25 Mhz SS1+: 23 Meg - * LX: 37 Meg - * 50 Mhz SC2000: 68 Meg + * If the scanner is limited by desscan, then at the highest rate it + * will scan up to fastscan/RATETOSCHEDPAGING pages per cycle. If the + * scanner is limited by the %CPU, then at the highest rate (20% of a + * CPU per cycle) the number of pages scanned could be much less. 
* - * 40 Mhz 486: 26 Meg - * 66 Mhz 486: 42 Meg + * Thus, if the scanner is limited by desscan, then the handspreadpages + * setting means 1sec between the front and back hands, but if the + * scanner is limited by %CPU, it could be several seconds between the + * two hands. * - * When free memory falls just below lotsfree, the scan rate - * goes from 0 to slowscan (i.e., pageout starts running). This + * The basic assumption is that at the worst case, stealing pages + * not accessed within 1 sec seems reasonable and ensures that active + * user processes don't thrash. This is especially true when the system + * is in a low memory state. + * + * There are some additional factors to consider for the case of + * scanning when zones are over their cap. In this situation it is + * also likely that the machine will have a large physical memory which + * will take many seconds to fully scan (due to the %CPU and desscan + * limits per cycle). It is probable that there will be few (or 0) + * pages attributed to these zones in any single scanning cycle. The + * result is that reclaiming enough pages for these zones might take + * several additional seconds (this is generally not a problem since + * the zone physical cap is just a soft cap). + * + * This is similar to the typical multi-processor situation in which + * pageout is often unable to maintain the minimum paging thresholds + * under heavy load due to the fact that user processes running on + * other CPU's can be dirtying memory at a much faster pace than + * pageout can find pages to free. + * + * One potential approach to address both of these cases is to enable + * more than one CPU to run the page scanner, in such a manner that the + * various clock hands don't overlap. However, this also makes it more + * difficult to determine the values for fastscan, slowscan and + * handspreadpages. This is left as a future enhancement, if necessary. + * + * When free memory falls just below lotsfree, the scan rate goes from + * 0 to slowscan (i.e., the page scanner starts running). This * transition needs to be smooth and is achieved by ensuring that * pageout scans a small number of pages to satisfy the transient * memory demand. This is set to not exceed 100 pages/sec (25 per * wakeup) since scanning that many pages has no noticible impact * on system performance. * - * In addition to setting fastscan and slowscan, pageout is - * limited to using ~4% of the CPU. This results in increasing - * the time taken to scan all of memory, which in turn means that - * user processes have a better opportunity of preventing their - * pages from being stolen. This has a positive effect on - * interactive and overall system performance when memory demand - * is high. - * - * Thus, the rate at which pages are scanned for replacement will - * vary linearly between slowscan and the number of pages that - * can be scanned using ~4% of processor time instead of varying - * linearly between slowscan and fastscan. - * - * Also, the processor time used by pageout will vary from ~1% - * at slowscan to ~4% at fastscan instead of varying between - * ~1% at slowscan and ~10% at fastscan. - * - * The values chosen for the various VM parameters (fastscan, - * handspreadpages, etc) are not universally true for all machines, - * but appear to be a good rule of thumb for the machines we've - * tested. 
They have the following ranges: - * - * cpu speed: 20 to 70 Mhz - * page size: 4K to 8K - * memory size: 16M to 5G - * page scan rate: 4000 - 17400 4K pages per sec - * - * The values need to be re-examined for machines which don't - * fall into the various ranges (e.g., slower or faster CPUs, - * smaller or larger pagesizes etc) shown above. - * - * On an MP machine, pageout is often unable to maintain the - * minimum paging thresholds under heavy load. This is due to - * the fact that user processes running on other CPU's can be - * dirtying memory at a much faster pace than pageout can find - * pages to free. The memory demands could be met by enabling - * more than one CPU to run the clock algorithm in such a manner - * that the various clock hands don't overlap. This also makes - * it more difficult to determine the values for fastscan, slowscan - * and handspreadpages. - * - * The swapper is currently used to free up memory when pageout - * is unable to meet memory demands by swapping out processes. - * In addition to freeing up memory, swapping also reduces the - * demand for memory by preventing user processes from running - * and thereby consuming memory. + * The swapper is currently used to free up memory when pageout is + * unable to meet memory demands. It does this by swapping out entire + * processes. In addition to freeing up memory, swapping also reduces + * the demand for memory because the swapped out processes cannot + * run, and thereby consume memory. However, this is a pathological + * state and performance will generally be considered unacceptable. */ if (init_mfscan == 0) { if (pageout_new_spread != 0) @@ -419,12 +459,13 @@ setupclock(int recalc) } else { maxfastscan = init_mfscan; } - if (init_fscan == 0) + if (init_fscan == 0) { fastscan = MIN(looppages / loopfraction, maxfastscan); - else + } else { fastscan = init_fscan; - if (fastscan > looppages / loopfraction) - fastscan = looppages / loopfraction; + if (fastscan > looppages / loopfraction) + fastscan = looppages / loopfraction; + } /* * Set slow scan time to 1/10 the fast scan time, but @@ -444,12 +485,10 @@ setupclock(int recalc) * decreases as the scan rate rises. It must be < the amount * of pageable memory. * - * Since pageout is limited to ~4% of the CPU, setting handspreadpages - * to be "fastscan" results in the front hand being a few secs - * (varies based on the processor speed) ahead of the back hand - * at fastscan rates. This distance can be further reduced, if - * necessary, by increasing the processor time used by pageout - * to be more than ~4% and preferrably not more than ~10%. + * Since pageout is limited to the %CPU per cycle, setting + * handspreadpages to be "fastscan" results in the front hand being + * a few secs (varies based on the processor speed) ahead of the back + * hand at fastscan rates. * * As a result, user processes have a much better chance of * referencing their pages before the back hand examines them. @@ -471,29 +510,78 @@ setupclock(int recalc) if (handspreadpages >= looppages) handspreadpages = looppages - 1; + if (recalc == 0) { + /* + * Setup basic values at initialization. + */ + pscan_region_sz = total_pages; + des_page_scanners = n_page_scanners = 1; + reset_hands[0] = B_TRUE; + return; + } + /* - * If we have been called to recalculate the parameters, - * set a flag to re-evaluate the clock hand pointers. + * Recalculating + * + * We originally set the number of page scanners to 1. 
Now that we + * know what the handspreadpages is for a scanner, figure out how many + * scanners we should run. We want to ensure that the regions don't + * overlap and that they are not touching. + * + * A default 64GB region size is used as the initial value to calculate + * how many scanner threads we should create on lower memory systems. + * The idea is to limit the number of threads to a practical value + * (e.g. a 64GB machine really only needs one scanner thread). For very + * large memory systems, we limit ourselves to MAX_PSCAN_THREADS + * threads. + * + * The scanner threads themselves are evenly spread out around the + * memory "clock" in pageout_scanner when we reset the hands, and each + * thread will scan all of memory. */ - if (recalc) - reset_hands = 1; + sz = (btop(64ULL * 0x40000000ULL)); + if (sz < handspreadpages) { + /* + * 64GB is smaller than the separation between the front + * and back hands; use double handspreadpages. + */ + sz = handspreadpages << 1; + } + if (sz > total_pages) { + sz = total_pages; + } + /* Record region size for inspection with mdb, otherwise unused */ + pscan_region_sz = sz; + + tmp = sz; + for (i = 1; tmp < total_pages; i++) { + tmp += sz; + } + + if (i > MAX_PSCAN_THREADS) + i = MAX_PSCAN_THREADS; + + des_page_scanners = i; } /* * Pageout scheduling. * * Schedpaging controls the rate at which the page out daemon runs by - * setting the global variables nscan and desscan RATETOSCHEDPAGING - * times a second. Nscan records the number of pages pageout has examined - * in its current pass; schedpaging resets this value to zero each time - * it runs. Desscan records the number of pages pageout should examine - * in its next pass; schedpaging sets this value based on the amount of - * currently available memory. + * setting the global variables pageout_ticks and desscan RATETOSCHEDPAGING + * times a second. The pageout_ticks variable controls the percent of one + * CPU that each page scanner thread should consume (see min_percent_cpu + * and max_percent_cpu descriptions). The desscan variable records the number + * of pages pageout should examine in its next pass; schedpaging sets this + * value based on the amount of currently available memory. In addtition, the + * nscan variable records the number of pages pageout has examined in its + * current pass; schedpaging resets this value to zero each time it runs. */ -#define RATETOSCHEDPAGING 4 /* hz that is */ +#define RATETOSCHEDPAGING 4 /* times/second */ -static kmutex_t pageout_mutex; /* held while pageout or schedpaging running */ +/* held while pageout_scanner or schedpaging are modifying shared data */ +static kmutex_t pageout_mutex; /* * Pool of available async pageout putpage requests. 
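Editor's note: the region sizing above boils down to carving memory into roughly 64GB regions (never smaller than twice handspreadpages, never larger than all of memory) and requesting one scanner per region, capped at MAX_PSCAN_THREADS. A standalone sketch of that calculation follows; the 4K page size and the desired_scanners() name are assumed for illustration.

#include <stdint.h>
#include <stdio.h>

#define	PAGESIZE		4096ULL		/* assumed page size */
#define	BTOP(b)			((b) / PAGESIZE)
#define	MAX_PSCAN_THREADS	16

static unsigned int
desired_scanners(uint64_t total_pages, uint64_t handspreadpages)
{
	uint64_t sz = BTOP(64ULL << 30);	/* 64GB region, in pages */
	uint64_t tmp;
	unsigned int i;

	if (sz < handspreadpages)
		sz = handspreadpages * 2;	/* keep regions apart */
	if (sz > total_pages)
		sz = total_pages;

	/* Count how many regions of sz pages it takes to cover memory. */
	for (i = 1, tmp = sz; tmp < total_pages; i++)
		tmp += sz;

	return (i > MAX_PSCAN_THREADS ? MAX_PSCAN_THREADS : i);
}

int
main(void)
{
	/* A hypothetical 256GB machine with 4K pages yields 4 threads. */
	(void) printf("%u\n", desired_scanners(BTOP(256ULL << 30), 16384));
	return (0);
}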
@@ -506,7 +594,7 @@ static kcondvar_t push_cv; static int async_list_size = 256; /* number of async request structs */ -static void pageout_scanner(void); +static void pageout_scanner(void *); /* * If a page is being shared more than "po_share" times @@ -535,67 +623,153 @@ schedpaging(void *arg) if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree)) kcage_cageout_wakeup(); - if (mutex_tryenter(&pageout_mutex)) { - /* pageout() not running */ - nscan = 0; - vavail = freemem - deficit; - if (pageout_new_spread != 0) - vavail -= needfree; - if (vavail < 0) - vavail = 0; - if (vavail > lotsfree) - vavail = lotsfree; + (void) atomic_swap_ulong(&nscan, 0); + vavail = freemem - deficit; + if (pageout_new_spread != 0) + vavail -= needfree; + if (vavail < 0) + vavail = 0; + if (vavail > lotsfree) + vavail = lotsfree; + /* + * Fix for 1161438 (CRS SPR# 73922). All variables + * in the original calculation for desscan were 32 bit signed + * ints. As freemem approaches 0x0 on a system with 1 Gig or + * more of memory, the calculation can overflow. When this + * happens, desscan becomes negative and pageout_scanner() + * stops paging out. + */ + if ((needfree) && (pageout_new_spread == 0)) { /* - * Fix for 1161438 (CRS SPR# 73922). All variables - * in the original calculation for desscan were 32 bit signed - * ints. As freemem approaches 0x0 on a system with 1 Gig or - * more of memory, the calculation can overflow. When this - * happens, desscan becomes negative and pageout_scanner() - * stops paging out. + * If we've not yet collected enough samples to + * calculate a spread, kick into high gear anytime + * needfree is non-zero. Note that desscan will not be + * the limiting factor for systems with larger memory; + * the %CPU will limit the scan. That will also be + * maxed out below. */ - if ((needfree) && (pageout_new_spread == 0)) { - /* - * If we've not yet collected enough samples to - * calculate a spread, use the old logic of kicking - * into high gear anytime needfree is non-zero. - */ - desscan = fastscan / RATETOSCHEDPAGING; - } else { - /* - * Once we've calculated a spread based on system - * memory and usage, just treat needfree as another - * form of deficit. - */ - spgcnt_t faststmp, slowstmp, result; + desscan = fastscan / RATETOSCHEDPAGING; + } else { + /* + * Once we've calculated a spread based on system + * memory and usage, just treat needfree as another + * form of deficit. + */ + spgcnt_t faststmp, slowstmp, result; + + slowstmp = slowscan * vavail; + faststmp = fastscan * (lotsfree - vavail); + result = (slowstmp + faststmp) / + nz(lotsfree) / RATETOSCHEDPAGING; + desscan = (pgcnt_t)result; + } + + /* + * If we've not yet collected enough samples to calculate a + * spread, also kick %CPU to the max. + */ + if (pageout_new_spread == 0) { + pageout_ticks = max_pageout_ticks; + } else { + pageout_ticks = min_pageout_ticks + + (lotsfree - vavail) * + (max_pageout_ticks - min_pageout_ticks) / + nz(lotsfree); + } - slowstmp = slowscan * vavail; - faststmp = fastscan * (lotsfree - vavail); - result = (slowstmp + faststmp) / - nz(lotsfree) / RATETOSCHEDPAGING; - desscan = (pgcnt_t)result; + if (pageout_new_spread != 0 && des_page_scanners != n_page_scanners) { + /* + * We have finished the pagescan initialization and the desired + * number of page scanners has changed, either because + * initialization just finished, because of a memory DR, or + * because des_page_scanners has been modified on the fly (i.e. + * by mdb). 
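Editor's note: the desscan and pageout_ticks computations a little earlier in schedpaging() are two linear interpolations driven by vavail. The sketch below reproduces just that interpolation, ignoring needfree and the startup special cases; the function name and the sample numbers are invented, and lotsfree is assumed non-zero (the kernel guards this with nz()).

#include <stdio.h>

#define	RATETOSCHEDPAGING	4

/*
 * As vavail drops from lotsfree toward 0, the page budget slides from
 * slowscan toward fastscan and the tick budget from min_ticks to max_ticks.
 */
static void
scan_budget(long freemem, long deficit, long lotsfree,
    long slowscan, long fastscan, long min_ticks, long max_ticks,
    long *desscan, long *ticks)
{
	long vavail = freemem - deficit;

	if (vavail < 0)
		vavail = 0;
	if (vavail > lotsfree)
		vavail = lotsfree;

	*desscan = (slowscan * vavail + fastscan * (lotsfree - vavail)) /
	    lotsfree / RATETOSCHEDPAGING;
	*ticks = min_ticks +
	    (lotsfree - vavail) * (max_ticks - min_ticks) / lotsfree;
}

int
main(void)
{
	long desscan, ticks;

	/* Hypothetical numbers: free memory halfway between lotsfree and 0. */
	scan_budget(16384, 0, 32768, 100, 10000, 1, 20, &desscan, &ticks);
	(void) printf("desscan %ld pages, %ld ticks per wakeup\n",
	    desscan, ticks);	/* 1262 pages, 10 ticks */
	return (0);
}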
If we need more scanners, start them now, otherwise + * the excess scanners will terminate on their own when they + * reset their hands. + */ + uint_t i; + uint_t curr_nscan = n_page_scanners; + pgcnt_t max = total_pages / handspreadpages; + + if (des_page_scanners > max) + des_page_scanners = max; + + if (des_page_scanners > MAX_PSCAN_THREADS) { + des_page_scanners = MAX_PSCAN_THREADS; + } else if (des_page_scanners == 0) { + des_page_scanners = 1; } - pageout_ticks = min_pageout_ticks + (lotsfree - vavail) * - (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree); + /* + * Each thread has its own entry in the reset_hands array, so + * we don't need any locking in pageout_scanner to check the + * thread's reset_hands entry. Thus, we use a pre-allocated + * fixed size reset_hands array and upper limit on the number + * of pagescan threads. + * + * The reset_hands entries need to be true before we start new + * scanners, but if we're reducing, we don't want a race on the + * recalculation for the existing threads, so we set + * n_page_scanners first. + */ + n_page_scanners = des_page_scanners; + for (i = 0; i < MAX_PSCAN_THREADS; i++) { + reset_hands[i] = B_TRUE; + } - if (freemem < lotsfree + needfree || - pageout_sample_cnt < pageout_sample_lim) { - TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, - "pageout_cv_signal:freemem %ld", freemem); - cv_signal(&proc_pageout->p_cv); - } else { - /* - * There are enough free pages, no need to - * kick the scanner thread. And next time - * around, keep more of the `highly shared' - * pages. - */ - cv_signal_pageout(); - if (po_share > MIN_PO_SHARE) { - po_share >>= 1; + if (des_page_scanners > curr_nscan) { + /* Create additional pageout scanner threads. */ + for (i = curr_nscan; i < des_page_scanners; i++) { + (void) lwp_kernel_create(proc_pageout, + pageout_scanner, (void *)(uintptr_t)i, + TS_RUN, curthread->t_pri); } } + } + + zones_over = B_FALSE; + + if (freemem < lotsfree + needfree || PAGE_SCAN_STARTUP) { + if (!PAGE_SCAN_STARTUP) + low_mem_scan++; + DTRACE_PROBE(schedpage__wake__low); + WAKE_PAGEOUT_SCANNER(); + + } else if (zone_num_over_cap > 0) { + /* One or more zones are over their cap. */ + + /* No page limit */ + desscan = total_pages; + + /* + * Increase the scanning CPU% to the max. This implies + * 80% of one CPU/sec if the scanner can run each + * opportunity. Can also be tuned via setting + * zone_pageout_ticks in /etc/system or with mdb. + */ + pageout_ticks = (zone_pageout_ticks != 0) ? + zone_pageout_ticks : max_pageout_ticks; + + zones_over = B_TRUE; + zone_cap_scan++; + + DTRACE_PROBE(schedpage__wake__zone); + WAKE_PAGEOUT_SCANNER(); + + } else { + /* + * There are enough free pages, no need to + * kick the scanner thread. And next time + * around, keep more of the `highly shared' + * pages. + */ + cv_signal_pageout(); + + mutex_enter(&pageout_mutex); + if (po_share > MIN_PO_SHARE) { + po_share >>= 1; + } mutex_exit(&pageout_mutex); } @@ -617,36 +791,46 @@ ulong_t push_list_size; /* # of requests on pageout queue */ #define FRONT 1 #define BACK 2 -int dopageout = 1; /* must be non-zero to turn page stealing on */ +int dopageout = 1; /* /etc/system tunable to disable page reclamation */ /* * The page out daemon, which runs as process 2. * - * As long as there are at least lotsfree pages, - * this process is not run. When the number of free - * pages stays in the range desfree to lotsfree, - * this daemon runs through the pages in the loop - * at a rate determined in schedpaging(). Pageout manages - * two hands on the clock. 
The front hand moves through - * memory, clearing the reference bit, - * and stealing pages from procs that are over maxrss. - * The back hand travels a distance behind the front hand, - * freeing the pages that have not been referenced in the time - * since the front hand passed. If modified, they are pushed to - * swap before being freed. + * Page out occurs when either: + * a) there is less than lotsfree pages, + * b) there are one or more zones over their physical memory cap. + * + * The daemon treats physical memory as a circular array of pages and scans the + * pages using a 'two-handed clock' algorithm. The front hand moves through + * the pages, clearing the reference bit. The back hand travels a distance + * (handspreadpages) behind the front hand, freeing the pages that have not + * been referenced in the time since the front hand passed. If modified, they + * are first written to their backing store before being freed. + * + * In order to make page invalidation more responsive on machines with larger + * memory, multiple pageout_scanner threads may be created. In this case, the + * threads are evenly distributed around the the memory "clock face" so that + * memory can be reclaimed more quickly (that is, there can be large regions in + * which no pages can be reclaimed by a single thread, leading to lag which + * causes undesirable behavior such as htable stealing). + * + * As long as there are at least lotsfree pages, or no zones over their cap, + * then pageout_scanner threads are not run. When pageout_scanner threads are + * running for case (a), all pages are considered for pageout. For case (b), + * only pages belonging to a zone over its cap will be considered for pageout. * - * There are 2 threads that act on behalf of the pageout process. - * One thread scans pages (pageout_scanner) and frees them up if + * There are multiple threads that act on behalf of the pageout process. + * A set of threads scan pages (pageout_scanner) and frees them up if * they don't require any VOP_PUTPAGE operation. If a page must be * written back to its backing store, the request is put on a list * and the other (pageout) thread is signaled. The pageout thread * grabs VOP_PUTPAGE requests from the list, and processes them. * Some filesystems may require resources for the VOP_PUTPAGE * operations (like memory) and hence can block the pageout - * thread, but the scanner thread can still operate. There is still + * thread, but the pageout_scanner threads can still operate. There is still * no guarantee that memory deadlocks cannot occur. * - * For now, this thing is in very rough form. + * The pageout_scanner parameters are determined in schedpaging(). */ void pageout() @@ -684,9 +868,9 @@ pageout() pageout_pri = curthread->t_pri; - /* Create the pageout scanner thread. */ - (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN, - pageout_pri - 1); + /* Create the (first) pageout scanner thread. */ + (void) lwp_kernel_create(proc_pageout, pageout_scanner, (void *) 0, + TS_RUN, pageout_pri - 1); /* * kick off pageout scheduler. 
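Editor's note: the two-handed clock described above can be modeled in miniature: the front hand clears each page's reference bit, and the back hand, a fixed distance behind, reclaims whatever has not been touched again in the meantime. Everything in this toy (the array of fake reference bits, the re-reference of page 5) is invented purely to show the mechanism.

#include <stdio.h>

#define	NPAGES		16
#define	HANDSPREAD	4	/* distance between the hands, in pages */

static unsigned char referenced[NPAGES];	/* toy "reference bits" */

int
main(void)
{
	int i, freed = 0;

	for (i = 0; i < NPAGES; i++)
		referenced[i] = 1;

	/* One sweep: the back hand trails the front hand by HANDSPREAD. */
	for (i = 0; i < NPAGES; i++) {
		int front = i;
		int back = (i + NPAGES - HANDSPREAD) % NPAGES;

		referenced[front] = 0;		/* front hand clears the bit */
		if (front == 5)
			referenced[5] = 1;	/* page 5 is touched again */

		if (referenced[back] == 0)
			freed++;	/* cold since the front hand passed */
	}

	/* Prints "freed 11 of 16": page 5 was re-referenced, and pages     */
	/* 12-15 met the back hand before the front hand got to them.      */
	(void) printf("freed %d of %d pages\n", freed, NPAGES);
	return (0);
}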
@@ -720,6 +904,7 @@ pageout() arg->a_next = NULL; mutex_exit(&push_lock); + DTRACE_PROBE(pageout__push); if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off, arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) { pushes++; @@ -740,32 +925,24 @@ pageout() * Kernel thread that scans pages looking for ones to free */ static void -pageout_scanner(void) +pageout_scanner(void *a) { struct page *fronthand, *backhand; - uint_t count; + uint_t count, iter = 0; callb_cpr_t cprinfo; - pgcnt_t nscan_limit; + pgcnt_t nscan_cnt, nscan_limit; pgcnt_t pcount; + uint_t inst = (uint_t)(uintptr_t)a; + hrtime_t sample_start, sample_end; + clock_t pageout_lbolt; + kmutex_t pscan_mutex; - CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan"); - mutex_enter(&pageout_mutex); + VERIFY3U(inst, <, MAX_PSCAN_THREADS); - /* - * The restart case does not attempt to point the hands at roughly - * the right point on the assumption that after one circuit things - * will have settled down - and restarts shouldn't be that often. - */ + mutex_init(&pscan_mutex, NULL, MUTEX_DEFAULT, NULL); - /* - * Set the two clock hands to be separated by a reasonable amount, - * but no more than 360 degrees apart. - */ - backhand = page_first(); - if (handspreadpages >= total_pages) - fronthand = page_nextn(backhand, total_pages - 1); - else - fronthand = page_nextn(backhand, handspreadpages); + CALLB_CPR_INIT(&cprinfo, &pscan_mutex, callb_generic_cpr, "poscan"); + mutex_enter(&pscan_mutex); min_pageout_ticks = MAX(1, ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING); @@ -776,71 +953,116 @@ loop: cv_signal_pageout(); CALLB_CPR_SAFE_BEGIN(&cprinfo); - cv_wait(&proc_pageout->p_cv, &pageout_mutex); - CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex); + cv_wait(&proc_pageout->p_cv, &pscan_mutex); + CALLB_CPR_SAFE_END(&cprinfo, &pscan_mutex); if (!dopageout) goto loop; - if (reset_hands) { - reset_hands = 0; + if (reset_hands[inst]) { + struct page *first; + pgcnt_t offset = total_pages / n_page_scanners; - backhand = page_first(); - if (handspreadpages >= total_pages) + reset_hands[inst] = B_FALSE; + if (inst >= n_page_scanners) { + /* + * The desired number of page scanners has been + * reduced and this instance is no longer wanted. + * Exit the lwp. + */ + VERIFY3U(inst, !=, 0); + mutex_exit(&pscan_mutex); + mutex_enter(&curproc->p_lock); + lwp_exit(); + } + + /* + * The reset case repositions the hands at the proper place + * on the memory clock face to prevent creep into another + * thread's active region or when the number of threads has + * changed. + * + * Set the two clock hands to be separated by a reasonable + * amount, but no more than 360 degrees apart. + * + * If inst == 0, backhand starts at first page, otherwise + * it is (inst * offset) around the memory "clock face" so that + * we spread out each scanner instance evenly. + */ + first = page_first(); + backhand = page_nextn(first, offset * inst); + if (handspreadpages >= total_pages) { fronthand = page_nextn(backhand, total_pages - 1); - else + } else { fronthand = page_nextn(backhand, handspreadpages); + } } + /* + * This CPU kstat is only incremented here and we're obviously on this + * CPU, so no lock. 
+ */ CPU_STATS_ADDQ(CPU, vm, pgrrun, 1); count = 0; - TRACE_4(TR_FAC_VM, TR_PAGEOUT_START, - "pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld", - freemem, lotsfree, nscan, desscan); - /* Kernel probe */ TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */, tnf_ulong, pages_free, freemem, tnf_ulong, pages_needed, needfree); pcount = 0; - if (pageout_sample_cnt < pageout_sample_lim) { + nscan_cnt = 0; + if (PAGE_SCAN_STARTUP) { nscan_limit = total_pages; } else { nscan_limit = desscan; } + + DTRACE_PROBE4(pageout__start, pgcnt_t, nscan_limit, uint_t, inst, + page_t *, backhand, page_t *, fronthand); + pageout_lbolt = ddi_get_lbolt(); sample_start = gethrtime(); /* * Scan the appropriate number of pages for a single duty cycle. - * However, stop scanning as soon as there is enough free memory. - * For a short while, we will be sampling the performance of the - * scanner and need to keep running just to get sample data, in - * which case we keep going and don't pay attention to whether - * or not there is enough free memory. + * Only scan while at least one of these is true: + * 1) one or more zones is over its cap + * 2) there is not enough free memory + * 3) during page scan startup when determining sample data */ - - while (nscan < nscan_limit && (freemem < lotsfree + needfree || - pageout_sample_cnt < pageout_sample_lim)) { + while (nscan_cnt < nscan_limit && + (zones_over || + freemem < lotsfree + needfree || + PAGE_SCAN_STARTUP)) { int rvfront, rvback; + DTRACE_PROBE2(pageout__loop, pgcnt_t, pcount, uint_t, inst); + /* * Check to see if we have exceeded our %CPU budget * for this wakeup, but not on every single page visited, * just every once in a while. */ if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) { + clock_t pageout_cycle_ticks; + pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt; if (pageout_cycle_ticks >= pageout_ticks) { - ++pageout_timeouts; + /* + * This is where we normally break out of the + * loop when scanning zones or sampling. + */ + if (!zones_over) { + atomic_inc_64(&pageout_timeouts); + } + DTRACE_PROBE1(pageout__timeout, uint_t, inst); break; } } /* * If checkpage manages to add a page to the free list, - * we give ourselves another couple of trips around the loop. + * we give ourselves another couple of trips around memory. */ if ((rvfront = checkpage(fronthand, FRONT)) == 1) count = 0; @@ -850,7 +1072,8 @@ loop: ++pcount; /* - * protected by pageout_mutex instead of cpu_stat_lock + * This CPU kstat is only incremented here and we're obviously + * on this CPU, so no lock. */ CPU_STATS_ADDQ(CPU, vm, scan, 1); @@ -858,7 +1081,7 @@ loop: * Don't include ineligible pages in the number scanned. */ if (rvfront != -1 || rvback != -1) - nscan++; + nscan_cnt++; backhand = page_next(backhand); @@ -868,56 +1091,89 @@ loop: */ if ((fronthand = page_next(fronthand)) == page_first()) { - TRACE_2(TR_FAC_VM, TR_PAGEOUT_HAND_WRAP, - "pageout_hand_wrap:freemem %ld whichhand %d", - freemem, FRONT); + DTRACE_PROBE1(pageout__wrap__front, uint_t, inst); /* - * protected by pageout_mutex instead of cpu_stat_lock + * Every 64 wraps we reposition our hands within our + * region to prevent creep into another thread. + */ + if ((++iter % pageout_reset_cnt) == 0) + reset_hands[inst] = B_TRUE; + + /* + * This CPU kstat is only incremented here and we're + * obviously on this CPU, so no lock. 
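Editor's note: the budget test in the scan loop above is deliberately cheap, consulting the clock only once every 1024 pages via PAGES_POLL_MASK rather than on every page. The same pattern in standalone form, with clock() standing in for ddi_get_lbolt() and an arbitrary 10ms budget, both assumptions for illustration:

#include <stdio.h>
#include <time.h>

#define	PAGES_POLL_MASK	1023	/* check the clock every 1024 iterations */

int
main(void)
{
	clock_t start = clock();
	clock_t budget = CLOCKS_PER_SEC / 100;	/* ~10ms of CPU time */
	unsigned long visited = 0;

	for (;;) {
		/* ... visit one page here ... */
		visited++;

		if ((visited & PAGES_POLL_MASK) == PAGES_POLL_MASK &&
		    clock() - start >= budget)
			break;		/* CPU budget exhausted */
	}

	(void) printf("visited %lu pages within the budget\n", visited);
	return (0);
}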
*/ CPU_STATS_ADDQ(CPU, vm, rev, 1); - if (++count > 1) { + + /* + * If scanning because the system is low on memory, + * then when we wraparound memory we want to try to + * reclaim more pages. + * If scanning only because zones are over their cap, + * then wrapping is common and we simply keep going. + */ + if (freemem < lotsfree + needfree && ++count > 1) { /* + * The system is low on memory. * Extremely unlikely, but it happens. - * We went around the loop at least once - * and didn't get far enough. + * We went around memory at least once + * and didn't reclaim enough. * If we are still skipping `highly shared' * pages, skip fewer of them. Otherwise, * give up till the next clock tick. */ + mutex_enter(&pageout_mutex); if (po_share < MAX_PO_SHARE) { po_share <<= 1; + mutex_exit(&pageout_mutex); } else { /* - * Really a "goto loop", but - * if someone is TRACing or - * TNF_PROBE_ing, at least - * make records to show - * where we are. + * Really a "goto loop", but if someone + * is tracing or TNF_PROBE_ing, hit + * those probes first. */ + mutex_exit(&pageout_mutex); break; } } } } + atomic_add_long(&nscan, nscan_cnt); + sample_end = gethrtime(); - TRACE_5(TR_FAC_VM, TR_PAGEOUT_END, - "pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u", - freemem, lotsfree, nscan, desscan, count); + DTRACE_PROBE3(pageout__loop__end, pgcnt_t, nscan_cnt, pgcnt_t, pcount, + uint_t, inst); /* Kernel probe */ TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */, - tnf_ulong, pages_scanned, nscan, tnf_ulong, pages_free, freemem); + tnf_ulong, pages_scanned, nscan_cnt, tnf_ulong, pages_free, + freemem); - if (pageout_sample_cnt < pageout_sample_lim) { + /* + * The following two blocks are only relevant when the scanner is + * first started up. After the scanner runs for a while, neither of + * the conditions will ever be true again. + * + * The global variables used below are only modified by this thread and + * only during initial scanning when there is a single page scanner + * thread running. Thus, we don't use any locking. + */ + if (PAGE_SCAN_STARTUP) { + VERIFY3U(inst, ==, 0); pageout_sample_pages += pcount; pageout_sample_etime += sample_end - sample_start; ++pageout_sample_cnt; - } - if (pageout_sample_cnt >= pageout_sample_lim && - pageout_new_spread == 0) { + + } else if (pageout_new_spread == 0) { + uint_t i; + + /* + * We have run enough samples, set the spread. + */ + VERIFY3U(inst, ==, 0); pageout_rate = (hrrate_t)pageout_sample_pages * (hrrate_t)(NANOSEC) / pageout_sample_etime; pageout_new_spread = pageout_rate / 10; @@ -931,9 +1187,8 @@ loop: * Look at the page at hand. If it is locked (e.g., for physical i/o), * system (u., page table) or free, then leave it alone. Otherwise, * if we are running the front hand, turn off the page's reference bit. - * If the proc is over maxrss, we take it. If running the back hand, - * check whether the page has been reclaimed. If not, free the page, - * pushing it to disk first if necessary. + * If running the back hand, check whether the page has been reclaimed. + * If not, free the page, pushing it to disk first if necessary. 
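Editor's note: the post-sampling calculation above converts the sampled totals into a pages-per-second rate and then takes a tenth of it as the new spread, approximating 10% of one CPU's worth of scanning. A small arithmetic sketch with hypothetical sample totals:

#include <stdint.h>
#include <stdio.h>

#define	NANOSEC	1000000000LL

int
main(void)
{
	uint64_t sample_pages = 2000000;	/* hypothetical pages scanned */
	uint64_t sample_etime = 800000000;	/* 0.8s of scanning, in ns */
	uint64_t rate, spread;

	rate = sample_pages * NANOSEC / sample_etime;	/* pages per second */
	spread = rate / 10;				/* ~10% of one CPU */

	(void) printf("rate %llu pages/sec, new spread %llu pages\n",
	    (unsigned long long)rate, (unsigned long long)spread);
	/* rate 2500000 pages/sec, new spread 250000 pages */
	return (0);
}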
* * Return values: * -1 if the page is not a candidate at all, @@ -947,6 +1202,7 @@ checkpage(struct page *pp, int whichhand) int isfs = 0; int isexec = 0; int pagesync_flag; + zoneid_t zid = ALL_ZONES; /* * Skip pages: @@ -989,6 +1245,21 @@ checkpage(struct page *pp, int whichhand) return (-1); } + if (zones_over) { + ASSERT(pp->p_zoneid == ALL_ZONES || + pp->p_zoneid >= 0 && pp->p_zoneid <= MAX_ZONEID); + if (pp->p_zoneid == ALL_ZONES || + zone_pdata[pp->p_zoneid].zpers_over == 0) { + /* + * Cross-zone shared page, or zone not over it's cap. + * Leave the page alone. + */ + page_unlock(pp); + return (-1); + } + zid = pp->p_zoneid; + } + /* * Maintain statistics for what we are freeing */ @@ -1016,31 +1287,24 @@ checkpage(struct page *pp, int whichhand) recheck: /* - * If page is referenced; make unreferenced but reclaimable. - * If this page is not referenced, then it must be reclaimable - * and we can add it to the free list. + * If page is referenced; fronthand makes unreferenced and reclaimable. + * For the backhand, a process referenced the page since the front hand + * went by, so it's not a candidate for freeing up. */ if (ppattr & P_REF) { - TRACE_2(TR_FAC_VM, TR_PAGEOUT_ISREF, - "pageout_isref:pp %p whichhand %d", pp, whichhand); + DTRACE_PROBE2(pageout__isref, page_t *, pp, int, whichhand); if (whichhand == FRONT) { - /* - * Checking of rss or madvise flags needed here... - * - * If not "well-behaved", fall through into the code - * for not referenced. - */ hat_clrref(pp); } - /* - * Somebody referenced the page since the front - * hand went by, so it's not a candidate for - * freeing up. - */ page_unlock(pp); return (0); } + /* + * This page is not referenced, so it must be reclaimable and we can + * add it to the free list. This can be done by either hand. + */ + VM_STAT_ADD(pageoutvmstats.checkpage[0]); /* @@ -1073,8 +1337,9 @@ recheck: u_offset_t offset = pp->p_offset; /* - * XXX - Test for process being swapped out or about to exit? - * [Can't get back to process(es) using the page.] + * Note: There is no possibility to test for process being + * swapped out or about to exit since we can't get back to + * process(es) from the page. */ /* @@ -1092,6 +1357,11 @@ recheck: VN_RELE(vp); return (0); } + if (isfs) { + zone_pageout_stat(zid, ZPO_DIRTY); + } else { + zone_pageout_stat(zid, ZPO_ANONDIRTY); + } return (1); } @@ -1102,8 +1372,7 @@ recheck: * the pagesync but before it was unloaded we catch it * and handle the page properly. */ - TRACE_2(TR_FAC_VM, TR_PAGEOUT_FREE, - "pageout_free:pp %p whichhand %d", pp, whichhand); + DTRACE_PROBE2(pageout__free, page_t *, pp, int, whichhand); (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); ppattr = hat_page_getattr(pp, P_MOD | P_REF); if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode)) @@ -1120,8 +1389,10 @@ recheck: } else { CPU_STATS_ADD_K(vm, fsfree, 1); } + zone_pageout_stat(zid, ZPO_FS); } else { CPU_STATS_ADD_K(vm, anonfree, 1); + zone_pageout_stat(zid, ZPO_ANON); } return (1); /* freed a page! */ diff --git a/usr/src/uts/common/os/vmem.c b/usr/src/uts/common/os/vmem.c index 4664c52e77..1b027b4409 100644 --- a/usr/src/uts/common/os/vmem.c +++ b/usr/src/uts/common/os/vmem.c @@ -1621,7 +1621,7 @@ vmem_destroy(vmem_t *vmp) leaked = vmem_size(vmp, VMEM_ALLOC); if (leaked != 0) - cmn_err(CE_WARN, "vmem_destroy('%s'): leaked %lu %s", + cmn_err(CE_WARN, "!vmem_destroy('%s'): leaked %lu %s", vmp->vm_name, leaked, (vmp->vm_cflags & VMC_IDENTIFIER) ? 
"identifiers" : "bytes"); diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index e89cf2c06d..ebde0d7850 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015, Joyent Inc. All rights reserved. + * Copyright (c) 2019, Joyent, Inc. * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. */ @@ -106,14 +106,16 @@ * removed from the list of active zones. zone_destroy() returns, and * the zone can be recreated. * - * ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor - * callbacks are executed, and all memory associated with the zone is - * freed. + * ZONE_IS_FREE (internal state): All references have been dropped and + * the zone_t is no longer in the zone_active nor zone_deathrow lists. + * The zone_t is in the process of being freed. This state exists + * only for publishing a sysevent to indicate that the zone by this + * name can be booted again. * - * Threads can wait for the zone to enter a requested state by using - * zone_status_wait() or zone_status_timedwait() with the desired - * state passed in as an argument. Zone state transitions are - * uni-directional; it is not possible to move back to an earlier state. + * Threads can wait for the zone to enter a requested state (other than + * ZONE_IS_FREE) by using zone_status_wait() or zone_status_timedwait() + * with the desired state passed in as an argument. Zone state transitions + * are uni-directional; it is not possible to move back to an earlier state. * * * Zone-Specific Data: @@ -252,6 +254,8 @@ #include <sys/cpucaps.h> #include <vm/seg.h> #include <sys/mac.h> +#include <sys/rt.h> +#include <sys/fx.h> /* * This constant specifies the number of seconds that threads waiting for @@ -312,6 +316,7 @@ static id_space_t *zoneid_space; * 'global_zone'. */ zone_t zone0; +zone_zfs_io_t zone0_zp_zfs; zone_t *global_zone = NULL; /* Set when the global zone is initialized */ /* @@ -327,8 +332,8 @@ static list_t zone_active; static list_t zone_deathrow; static kmutex_t zone_deathrow_lock; -/* number of zones is limited by virtual interface limit in IP */ -uint_t maxzones = 8192; +/* This can be dynamically reduced if various subsystems hit internal limits. 
*/ +uint_t maxzones = MAX_ZONES; /* Event channel to sent zone state change notifications */ evchan_t *zone_event_chan; @@ -350,6 +355,7 @@ const char *zone_status_table[] = { ZONE_EVENT_SHUTTING_DOWN, /* down */ ZONE_EVENT_SHUTTING_DOWN, /* dying */ ZONE_EVENT_UNINITIALIZED, /* dead */ + ZONE_EVENT_FREE, /* free */ }; /* @@ -372,8 +378,12 @@ static char *zone_ref_subsys_names[] = { rctl_hndl_t rc_zone_cpu_shares; rctl_hndl_t rc_zone_locked_mem; rctl_hndl_t rc_zone_max_swap; +rctl_hndl_t rc_zone_phys_mem; rctl_hndl_t rc_zone_max_lofi; rctl_hndl_t rc_zone_cpu_cap; +rctl_hndl_t rc_zone_cpu_baseline; +rctl_hndl_t rc_zone_cpu_burst_time; +rctl_hndl_t rc_zone_zfs_io_pri; rctl_hndl_t rc_zone_nlwps; rctl_hndl_t rc_zone_nprocs; rctl_hndl_t rc_zone_shmmax; @@ -389,6 +399,7 @@ static int zone_remove_datalink(zoneid_t, datalink_id_t); static int zone_list_datalink(zoneid_t, int *, datalink_id_t *); static int zone_set_network(zoneid_t, zone_net_data_t *); static int zone_get_network(zoneid_t, zone_net_data_t *); +static void zone_status_set(zone_t *, zone_status_t); typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t); @@ -419,8 +430,72 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *, * Version 5 alters the zone_boot system call, and converts its old * bootargs parameter to be set by the zone_setattr API instead. * Version 6 adds the flag argument to zone_create. + * Version 7 adds the requested zoneid to zone_create. */ -static const int ZONE_SYSCALL_API_VERSION = 6; +static const int ZONE_SYSCALL_API_VERSION = 7; + +/* + * "zone_pdata" is an array indexed by zoneid. It is used to store "persistent" + * data which can be referenced independently of the zone_t structure. This + * data falls into two categories; + * 1) pages and RSS data associated with processes inside a zone + * 2) in-flight ZFS I/O data + * + * Each member of zone_persist_t stores the zone's current page usage, its page + * limit, a flag indicating if the zone is over its physical memory cap and + * various page-related statistics. The zpers_over flag is the interface for + * the page scanner to use when reclaiming pages for zones that are over their + * cap. The zone_persist_t structure also includes a mutex and a reference to a + * zone_zfs_io_t structure used for tracking the zone's ZFS I/O data. + * + * All zone physical memory cap data is stored in this array instead of within + * the zone structure itself. This is because zone structures come and go, but + * paging-related work can be asynchronous to any particular zone. In, + * particular: + * 1) Page scanning to reclaim pages occurs from a kernel thread that is not + * associated with any zone. + * 2) Freeing segkp pages can occur long after the zone which first + * instantiated those pages has gone away. + * We want to be able to account for pages/zone without constantly having to + * take extra locks and finding the relevant zone structure, particularly during + * page scanning. + * + * The page scanner can run when "zone_num_over_cap" is non-zero. It can + * do a direct lookup of a zoneid into the "zone_pdata" array to determine + * if that zone is over its cap. + * + * There is no locking for the page scanner to perform these two checks. + * We cannot have the page scanner blocking normal paging activity for + * running processes. Because the physical memory cap is a soft cap, it is + * fine for the scanner to simply read the current state of the counter and + * the zone's zpers_over entry in the array. 
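 * As a hypothetical reader-side sketch (the helper name is illustrative and
 * not taken verbatim from this change), the lock-free check described above
 * amounts to:
 *
 *	static boolean_t
 *	zone_page_is_reclaim_candidate(const page_t *pp)
 *	{
 *		zoneid_t zid = pp->p_zoneid;
 *
 *		if (zone_num_over_cap == 0 || zid == ALL_ZONES)
 *			return (B_FALSE);
 *		return (zone_pdata[zid].zpers_over != 0);
 *	}
 *
 * which mirrors the check checkpage() performs when zones_over is set.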
The scanner should never modify + * either of these items. Internally the entries and the counter are managed + * with the "zone_physcap_lock" mutex as we add/remove mappings to pages. We + * take care to ensure that we only take the zone_physcap_lock mutex when a + * zone is transitioning over/under its physical memory cap. + * + * The "zone_incr_capped" and "zone_decr_capped" functions are used to manage + * the "zone_pdata" array and associated counter. + * + * The zone_persist_t structure tracks the zone's physical cap and phyiscal + * usage in terms of pages. These values are currently defined as uint32. Thus, + * the maximum number of pages we can track is a UINT_MAX-1 (4,294,967,295) + * since UINT_MAX means the zone's RSS is unlimited. Assuming a 4k page size, a + * zone's maximum RSS is limited to 17.5 TB and twice that with an 8k page size. + * In the future we may need to expand these counters to 64-bit, but for now + * we're using 32-bit to conserve memory, since this array is statically + * allocated within the kernel based on the maximum number of zones supported. + * + * With respect to the zone_zfs_io_t referenced by the zone_persist_t, under + * a heavy I/O workload, the "zonehash_lock" would become extremely hot if we + * had to continuously find the zone structure associated with an I/O that has + * just completed. To avoid that overhead, we track the I/O data within the + * zone_zfs_io_t instead. We can directly access that data without having to + * lookup the full zone_t structure. + */ +uint_t zone_num_over_cap; +zone_persist_t zone_pdata[MAX_ZONES]; +static kmutex_t zone_physcap_lock; /* * Certain filesystems (such as NFS and autofs) need to know which zone @@ -1379,6 +1454,127 @@ static rctl_ops_t zone_cpu_cap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_cpu_base_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_base(p->p_zone)); +} + +/* + * The zone cpu base is used to set the baseline CPU for the zone + * so we can track when the zone is bursting. + */ +/*ARGSUSED*/ +static int +zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_base(zone, nv)); +} + +static rctl_ops_t zone_cpu_base_ops = { + rcop_no_action, + zone_cpu_base_get, + zone_cpu_base_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t +zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_burst_time(p->p_zone)); +} + +/* + * The zone cpu burst time is used to set the amount of time CPU(s) can be + * bursting for the zone. + */ +/*ARGSUSED*/ +static int +zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_burst_time(zone, nv)); +} + +static rctl_ops_t zone_cpu_burst_time_ops = { + rcop_no_action, + zone_cpu_burst_time_get, + zone_cpu_burst_time_set, + rcop_no_test +}; + +/* + * zone.zfs-io-pri resource control support (IO priority). 
+ */ +/*ARGSUSED*/ +static rctl_qty_t +zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p) +{ + zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id]; + rctl_qty_t r = 0; + + ASSERT(MUTEX_HELD(&p->p_lock)); + mutex_enter(&zp->zpers_zfs_lock); + if (zp->zpers_zfsp != NULL) + r = (rctl_qty_t)zp->zpers_zfsp->zpers_zfs_io_pri; + mutex_exit(&zp->zpers_zfs_lock); + + return (r); +} + +/*ARGSUSED*/ +static int +zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + zone_persist_t *zp; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + /* + * set priority to the new value. + */ + zp = &zone_pdata[zone->zone_id]; + mutex_enter(&zp->zpers_zfs_lock); + if (zp->zpers_zfsp != NULL) + zp->zpers_zfsp->zpers_zfs_io_pri = (uint16_t)nv; + mutex_exit(&zp->zpers_zfs_lock); + return (0); +} + +static rctl_ops_t zone_zfs_io_pri_ops = { + rcop_no_action, + zone_zfs_io_pri_get, + zone_zfs_io_pri_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_lwps_usage(rctl_t *r, proc_t *p) { rctl_qty_t nlwps; @@ -1705,6 +1901,57 @@ static rctl_ops_t zone_max_swap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_phys_mem_usage(rctl_t *rctl, struct proc *p) +{ + rctl_qty_t q; + zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id]; + + ASSERT(MUTEX_HELD(&p->p_lock)); + q = ptob(zp->zpers_pg_cnt); + return (q); +} + +/*ARGSUSED*/ +static int +zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zoneid_t zid; + uint_t pg_val; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + if (e->rcep_p.zone == NULL) + return (0); + zid = e->rcep_p.zone->zone_id; + if (nv == UINT64_MAX) { + pg_val = UINT32_MAX; + } else { + uint64_t pages = btop(nv); + + /* + * Return from RCTLOP_SET is always ignored so just clamp an + * out-of-range value to our largest "limited" value. + */ + if (pages >= UINT32_MAX) { + pg_val = UINT32_MAX - 1; + } else { + pg_val = (uint_t)pages; + } + } + zone_pdata[zid].zpers_pg_limit = pg_val; + return (0); +} + +static rctl_ops_t zone_phys_mem_ops = { + rcop_no_action, + zone_phys_mem_usage, + zone_phys_mem_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_max_lofi_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; @@ -1798,6 +2045,21 @@ zone_lockedmem_kstat_update(kstat_t *ksp, int rw) } static int +zone_physmem_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_kstat_t *zk = ksp->ks_data; + zone_persist_t *zp = &zone_pdata[zone->zone_id]; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zk->zk_usage.value.ui64 = ptob(zp->zpers_pg_cnt); + zk->zk_value.value.ui64 = ptob(zp->zpers_pg_limit); + return (0); +} + +static int zone_nprocs_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; @@ -1826,7 +2088,7 @@ zone_swapresv_kstat_update(kstat_t *ksp, int rw) } static kstat_t * -zone_kstat_create_common(zone_t *zone, char *name, +zone_rctl_kstat_create_common(zone_t *zone, char *name, int (*updatefunc) (kstat_t *, int)) { kstat_t *ksp; @@ -1851,16 +2113,200 @@ zone_kstat_create_common(zone_t *zone, char *name, return (ksp); } +static int +zone_vfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_vfs_kstat_t *zvp = ksp->ks_data; + kstat_io_t *kiop = &zone->zone_vfs_rwstats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Extract the VFS statistics from the kstat_io_t structure used by + * kstat_runq_enter() and related functions. 
Since the slow ops + * counters are updated directly by the VFS layer, there's no need to + * copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. + */ + zvp->zv_nread.value.ui64 = kiop->nread; + zvp->zv_reads.value.ui64 = kiop->reads; + zvp->zv_rtime.value.ui64 = kiop->rtime; + zvp->zv_rcnt.value.ui64 = kiop->rcnt; + zvp->zv_rlentime.value.ui64 = kiop->rlentime; + zvp->zv_nwritten.value.ui64 = kiop->nwritten; + zvp->zv_writes.value.ui64 = kiop->writes; + zvp->zv_wtime.value.ui64 = kiop->wtime; + zvp->zv_wcnt.value.ui64 = kiop->wcnt; + zvp->zv_wlentime.value.ui64 = kiop->wlentime; + + scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_vfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_vfs_kstat_t *zvp; + + if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id, + zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED, + sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_vfs_lock; + zone->zone_vfs_stats = zvp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zvp->zv_zonename, zone->zone_name); + kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_vfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int +zone_zfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_zfs_kstat_t *zzp = ksp->ks_data; + zone_persist_t *zp = &zone_pdata[zone->zone_id]; + + if (rw == KSTAT_WRITE) + return (EACCES); + + mutex_enter(&zp->zpers_zfs_lock); + if (zp->zpers_zfsp == NULL) { + zzp->zz_nread.value.ui64 = 0; + zzp->zz_reads.value.ui64 = 0; + zzp->zz_rtime.value.ui64 = 0; + zzp->zz_rlentime.value.ui64 = 0; + zzp->zz_nwritten.value.ui64 = 0; + zzp->zz_writes.value.ui64 = 0; + zzp->zz_waittime.value.ui64 = 0; + } else { + kstat_io_t *kiop = &zp->zpers_zfsp->zpers_zfs_rwstats; + + /* + * Extract the ZFS 
statistics from the kstat_io_t structure + * used by kstat_runq_enter() and related functions. Since the + * I/O throttle counters are updated directly by the ZFS layer, + * there's no need to copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. + */ + zzp->zz_nread.value.ui64 = kiop->nread; + zzp->zz_reads.value.ui64 = kiop->reads; + zzp->zz_rtime.value.ui64 = kiop->rtime; + zzp->zz_rlentime.value.ui64 = kiop->rlentime; + zzp->zz_nwritten.value.ui64 = kiop->nwritten; + zzp->zz_writes.value.ui64 = kiop->writes; + zzp->zz_waittime.value.ui64 = + zp->zpers_zfsp->zpers_zfs_rd_waittime; + } + mutex_exit(&zp->zpers_zfs_lock); + + scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64); + scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_zfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_zfs_kstat_t *zzp; + + if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id, + zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED, + sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_zfs_lock; + zone->zone_zfs_stats = zzp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zzp->zz_zonename, zone->zone_name); + kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_zfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} static int zone_mcap_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; zone_mcap_kstat_t *zmp = ksp->ks_data; + zone_persist_t *zp; if (rw == KSTAT_WRITE) return (EACCES); + zp = &zone_pdata[zone->zone_id]; + + zmp->zm_rss.value.ui64 = ptob(zp->zpers_pg_cnt); + zmp->zm_phys_cap.value.ui64 = ptob(zp->zpers_pg_limit); + zmp->zm_swap.value.ui64 = zone->zone_max_swap; + zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl; + zmp->zm_nover.value.ui64 = zp->zpers_nover; +#ifndef DEBUG + zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_out); +#else + zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_fsdirty + + zp->zpers_pg_fs + zp->zpers_pg_anon + zp->zpers_pg_anondirty); +#endif zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin; zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin; zmp->zm_execpgin.value.ui64 = zone->zone_execpgin; @@ -1893,6 +2339,12 @@ zone_mcap_kstat_create(zone_t *zone) /* The kstat "name" field is not large enough for a full zonename */ kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING); kstat_named_setstr(&zmp->zm_zonename, zone->zone_name); + kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64); + 
kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64); @@ -1942,9 +2394,12 @@ zone_misc_kstat_update(kstat_t *ksp, int rw) zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem; zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc; + zmp->zm_mfseglim.value.ui32 = zone->zone_mfseglim; + zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp; zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid; + zmp->zm_init_restarts.value.ui32 = zone->zone_proc_init_restarts; zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time; return (0); @@ -1985,9 +2440,13 @@ zone_misc_kstat_create(zone_t *zone) KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32); + kstat_named_init(&zmp->zm_mfseglim, "mapfail_seglim", + KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_nested_intp, "nested_interp", KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32); + kstat_named_init(&zmp->zm_init_restarts, "init_restarts", + KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64); ksp->ks_update = zone_misc_kstat_update; @@ -2000,13 +2459,25 @@ zone_misc_kstat_create(zone_t *zone) static void zone_kstat_create(zone_t *zone) { - zone->zone_lockedmem_kstat = zone_kstat_create_common(zone, + zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone, "lockedmem", zone_lockedmem_kstat_update); - zone->zone_swapresv_kstat = zone_kstat_create_common(zone, + zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone, "swapresv", zone_swapresv_kstat_update); - zone->zone_nprocs_kstat = zone_kstat_create_common(zone, + zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone, + "physicalmem", zone_physmem_kstat_update); + zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone, "nprocs", zone_nprocs_kstat_update); + if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) { + zone->zone_vfs_stats = kmem_zalloc( + sizeof (zone_vfs_kstat_t), KM_SLEEP); + } + + if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) { + zone->zone_zfs_stats = kmem_zalloc( + sizeof (zone_zfs_kstat_t), KM_SLEEP); + } + if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) { zone->zone_mcap_stats = kmem_zalloc( sizeof (zone_mcap_kstat_t), KM_SLEEP); @@ -2038,8 +2509,15 @@ zone_kstat_delete(zone_t *zone) sizeof (zone_kstat_t)); zone_kstat_delete_common(&zone->zone_swapresv_kstat, sizeof (zone_kstat_t)); + zone_kstat_delete_common(&zone->zone_physmem_kstat, + sizeof (zone_kstat_t)); zone_kstat_delete_common(&zone->zone_nprocs_kstat, sizeof (zone_kstat_t)); + + zone_kstat_delete_common(&zone->zone_vfs_ksp, + sizeof (zone_vfs_kstat_t)); + zone_kstat_delete_common(&zone->zone_zfs_ksp, + sizeof (zone_zfs_kstat_t)); zone_kstat_delete_common(&zone->zone_mcap_ksp, sizeof (zone_mcap_kstat_t)); zone_kstat_delete_common(&zone->zone_misc_ksp, @@ -2101,8 +2579,12 @@ zone_zsd_init(void) zone0.zone_initname = initname; zone0.zone_lockedmem_kstat = NULL; zone0.zone_swapresv_kstat = NULL; + zone0.zone_physmem_kstat = NULL; zone0.zone_nprocs_kstat = NULL; + zone_pdata[0].zpers_zfsp = &zone0_zp_zfs; + 
zone_pdata[0].zpers_zfsp->zpers_zfs_io_pri = 1; + list_create(&zone0.zone_ref_list, sizeof (zone_ref_t), offsetof(zone_ref_t, zref_linkage)); list_create(&zone0.zone_zsd, sizeof (struct zsd_entry), @@ -2209,6 +2691,21 @@ zone_init(void) RCTL_GLOBAL_INFINITE, MAXCAP, MAXCAP, &zone_cpu_cap_ops); + rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + MAXCAP, MAXCAP, &zone_cpu_base_ops); + + rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + INT_MAX, INT_MAX, &zone_cpu_burst_time_ops); + + rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + 16384, 16384, &zone_zfs_io_pri_ops); + rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE, RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT, INT_MAX, INT_MAX, &zone_lwps_ops); @@ -2250,6 +2747,20 @@ zone_init(void) rde = rctl_dict_lookup("zone.cpu-shares"); (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + /* + * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach + * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority'. + */ + dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); + bzero(dval, sizeof (rctl_val_t)); + dval->rcv_value = 1; + dval->rcv_privilege = RCPRIV_PRIVILEGED; + dval->rcv_flagaction = RCTL_LOCAL_NOACTION; + dval->rcv_action_recip_pid = -1; + + rde = rctl_dict_lookup("zone.zfs-io-priority"); + (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + rc_zone_locked_mem = rctl_register("zone.max-locked-memory", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2260,6 +2771,11 @@ zone_init(void) RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, &zone_max_swap_ops); + rc_zone_phys_mem = rctl_register("zone.max-physical-memory", + RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | + RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, + &zone_phys_mem_ops); + rc_zone_max_lofi = rctl_register("zone.max-lofi", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2281,6 +2797,9 @@ zone_init(void) zone0.zone_ntasks = 1; mutex_exit(&p0.p_lock); zone0.zone_restart_init = B_TRUE; + zone0.zone_reboot_on_init_exit = B_FALSE; + zone0.zone_restart_init_0 = B_FALSE; + zone0.zone_init_status = -1; zone0.zone_brand = &native_brand; rctl_prealloc_destroy(gp); /* @@ -2362,6 +2881,8 @@ zone_init(void) static void zone_free(zone_t *zone) { + zone_dl_t *zdl; + ASSERT(zone != global_zone); ASSERT(zone->zone_ntasks == 0); ASSERT(zone->zone_nlwps == 0); @@ -2377,6 +2898,9 @@ zone_free(zone_t *zone) */ cpucaps_zone_remove(zone); + /* Clear physical memory capping data. */ + bzero(&zone_pdata[zone->zone_id], sizeof (zone_persist_t)); + ASSERT(zone->zone_cpucap == NULL); /* remove from deathrow list */ @@ -2390,8 +2914,30 @@ zone_free(zone_t *zone) list_destroy(&zone->zone_ref_list); zone_free_zsd(zone); zone_free_datasets(zone); + + /* + * While dlmgmtd should have removed all of these, it could have left + * something behind or crashed. In which case it's not safe for us to + * assume that the list is empty which list_destroy() will ASSERT. 
We + * clean up for our userland comrades which may have crashed, or worse, + * been disabled by SMF. + */ + while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) { + if (zdl->zdl_net != NULL) + nvlist_free(zdl->zdl_net); + kmem_free(zdl, sizeof (zone_dl_t)); + } list_destroy(&zone->zone_dl_list); + /* + * This zone_t can no longer inhibit creation of another zone_t + * with the same name or debug ID. Generate a sysevent so that + * userspace tools know it is safe to carry on. + */ + mutex_enter(&zone_status_lock); + zone_status_set(zone, ZONE_IS_FREE); + mutex_exit(&zone_status_lock); + cpu_uarray_free(zone->zone_ustate); if (zone->zone_rootvp != NULL) @@ -2436,11 +2982,17 @@ zone_free(zone_t *zone) static void zone_status_set(zone_t *zone, zone_status_t status) { + timestruc_t now; + uint64_t t; nvlist_t *nvl = NULL; ASSERT(MUTEX_HELD(&zone_status_lock)); - ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE && - status >= zone_status_get(zone)); + ASSERT((status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE || + status == ZONE_IS_FREE) && status >= zone_status_get(zone)); + + /* Current time since Jan 1 1970 but consumers expect NS */ + gethrestime(&now); + t = (now.tv_sec * NANOSEC) + now.tv_nsec; if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) || nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) || @@ -2449,12 +3001,14 @@ zone_status_set(zone_t *zone, zone_status_t status) nvlist_add_string(nvl, ZONE_CB_OLDSTATE, zone_status_table[zone->zone_status]) || nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) || - nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) || + nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) || sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS, ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) { #ifdef DEBUG (void) printf( "Failed to allocate and send zone state change event.\n"); +#else + /* EMPTY */ #endif } nvlist_free(nvl); @@ -2474,6 +3028,38 @@ zone_status_get(zone_t *zone) return (zone->zone_status); } +/* + * Publish a zones-related sysevent for purposes other than zone state changes. + * While it is unfortunate that zone_event_chan is associated with + * "com.sun:zones:status" (rather than "com.sun:zones") state changes should be + * the only ones with class "status" and subclass "change". + */ +void +zone_sysevent_publish(zone_t *zone, const char *class, const char *subclass, + nvlist_t *ev_nvl) +{ + nvlist_t *nvl = NULL; + timestruc_t now; + uint64_t t; + + gethrestime(&now); + t = (now.tv_sec * NANOSEC) + now.tv_nsec; + + if (nvlist_dup(ev_nvl, &nvl, KM_SLEEP) != 0 || + nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) != 0 || + nvlist_add_uint64(nvl, ZONE_CB_ZONEID, zone->zone_id) != 0 || + nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) != 0 || + sysevent_evc_publish(zone_event_chan, class, subclass, "sun.com", + "kernel", nvl, EVCH_SLEEP) != 0) { +#ifdef DEBUG + (void) printf("Failed to allocate and send zone misc event.\n"); +#else + /* EMPTY */ +#endif + } + nvlist_free(nvl); +} + static int zone_set_bootargs(zone_t *zone, const char *zone_bootargs) { @@ -2527,9 +3113,14 @@ zone_set_brand(zone_t *zone, const char *brand) return (EINVAL); } - /* set up the brand specific data */ + /* + * Set up the brand specific data. + * Note that it's possible that the hook has to drop the + * zone_status_lock and reaquire it before returning so we can't + * assume the lock has been held the entire time. 
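 * As a hypothetical sketch of such a hook (the brand name and its data type
 * are illustrative only), a brand module handed the lock pointer might do:
 *
 *	static void
 *	mybrand_init_brand_data(zone_t *zone, kmutex_t *zsl)
 *	{
 *		mybrand_data_t *data;
 *
 *		mutex_exit(zsl);
 *		data = kmem_zalloc(sizeof (*data), KM_SLEEP);
 *		mutex_enter(zsl);
 *		zone->zone_brand_data = data;
 *	}
 *
 * where the lock is dropped around any work that may block, such as the
 * KM_SLEEP allocation shown here.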
+ */ zone->zone_brand = bp; - ZBROP(zone)->b_init_brand_data(zone); + ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock); mutex_exit(&zone_status_lock); return (0); @@ -2602,18 +3193,6 @@ zone_set_initname(zone_t *zone, const char *zone_initname) } static int -zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap) -{ - uint64_t mcap; - int err = 0; - - if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0) - zone->zone_phys_mcap = mcap; - - return (err); -} - -static int zone_set_sched_class(zone_t *zone, const char *new_class) { char sched_class[PC_CLNMSZ]; @@ -3020,6 +3599,12 @@ getzoneid(void) return (curproc->p_zone->zone_id); } +zoneid_t +getzonedid(void) +{ + return (curproc->p_zone->zone_did); +} + /* * Internal versions of zone_find_by_*(). These don't zone_hold() or * check the validity of a zone's state. @@ -3766,6 +4351,17 @@ zone_start_init(void) */ z->zone_proc_initpid = p->p_pid; + if (z->zone_setup_app_contract == B_TRUE) { + /* + * Normally a process cannot modify its own contract, but we're + * just starting the zone's init process and its contract is + * always initialized from the sys_process_tmpl template, so + * this is the simplest way to setup init's contract to kill + * the process if any other process in the contract exits. + */ + p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT; + } + /* * We maintain zone_boot_err so that we can return the cause of the * failure back to the caller of the zone_boot syscall. @@ -3794,9 +4390,54 @@ zone_start_init(void) lwp_exit(); } } else { + id_t cid = curthread->t_cid; + if (zone_status_get(z) == ZONE_IS_BOOTING) zone_status_set(z, ZONE_IS_RUNNING); mutex_exit(&zone_status_lock); + + mutex_enter(&class_lock); + ASSERT(cid < loaded_classes); + if (strcmp(sclass[cid].cl_name, "FX") == 0 && + z->zone_fixed_hipri) { + /* + * If the zone is using FX then by default all + * processes start at the lowest priority and stay + * there. We provide a mechanism for the zone to + * indicate that it should run at "high priority". In + * this case we setup init to run at the highest FX + * priority (which is one level higher than the + * non-fixed scheduling classes can use). + */ + pcparms_t pcparms; + + pcparms.pc_cid = cid; + ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim = + FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags = + FX_DOUPRILIM | FX_DOUPRI; + + mutex_enter(&pidlock); + mutex_enter(&curproc->p_lock); + + (void) parmsset(&pcparms, curthread); + + mutex_exit(&curproc->p_lock); + mutex_exit(&pidlock); + } else if (strcmp(sclass[cid].cl_name, "RT") == 0) { + /* + * zsched always starts the init lwp at priority + * minclsyspri - 1. This priority gets set in t_pri and + * is invalid for RT, but RT never uses t_pri. However + * t_pri is used by procfs, so we always see processes + * within an RT zone with an invalid priority value. + * We fix that up now. + */ + curthread->t_pri = RTGPPRIO0; + } + mutex_exit(&class_lock); + /* cause the process to return to userland. 
*/ lwp_rtt(); } @@ -4282,8 +4923,9 @@ parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp) error = EINVAL; name = nvpair_name(nvp); - if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1) - != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { + if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 && + strncmp(name, "project.", sizeof ("project.") - 1) != 0) || + nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { goto out; } if ((hndl = rctl_hndl_lookup(name)) == -1) { @@ -4402,7 +5044,7 @@ zone_create(const char *zone_name, const char *zone_root, caddr_t rctlbuf, size_t rctlbufsz, caddr_t zfsbuf, size_t zfsbufsz, int *extended_error, int match, uint32_t doi, const bslabel_t *label, - int flags) + int flags, zoneid_t zone_did) { struct zsched_arg zarg; nvlist_t *rctls = NULL; @@ -4474,6 +5116,7 @@ zone_create(const char *zone_name, const char *zone_root, zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP); zone->zone_id = zoneid; + zone->zone_did = zone_did; zone->zone_status = ZONE_IS_UNINITIALIZED; zone->zone_pool = pool_default; zone->zone_pool_mod = gethrtime(); @@ -4481,6 +5124,9 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_ncpus = 0; zone->zone_ncpus_online = 0; zone->zone_restart_init = B_TRUE; + zone->zone_reboot_on_init_exit = B_FALSE; + zone->zone_restart_init_0 = B_FALSE; + zone->zone_init_status = -1; zone->zone_brand = &native_brand; zone->zone_initname = NULL; mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); @@ -4547,8 +5193,13 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_max_swap_ctl = UINT64_MAX; zone->zone_max_lofi = 0; zone->zone_max_lofi_ctl = UINT64_MAX; - zone0.zone_lockedmem_kstat = NULL; - zone0.zone_swapresv_kstat = NULL; + zone->zone_lockedmem_kstat = NULL; + zone->zone_swapresv_kstat = NULL; + zone->zone_physmem_kstat = NULL; + + zone_pdata[zoneid].zpers_zfsp = + kmem_zalloc(sizeof (zone_zfs_io_t), KM_SLEEP); + zone_pdata[zoneid].zpers_zfsp->zpers_zfs_io_pri = 1; zone->zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP); @@ -4557,6 +5208,13 @@ zone_create(const char *zone_name, const char *zone_root, */ zone->zone_rctls = NULL; + /* + * Ensure page count is 0 (in case zoneid has wrapped). + * Initialize physical memory cap as unlimited. + */ + zone_pdata[zoneid].zpers_pg_cnt = 0; + zone_pdata[zoneid].zpers_pg_limit = UINT32_MAX; + if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) { zone_free(zone); return (zone_create_error(error, 0, extended_error)); @@ -4705,8 +5363,8 @@ zone_create(const char *zone_name, const char *zone_root, /* * The process, task, and project rctls are probably wrong; * we need an interface to get the default values of all rctls, - * and initialize zsched appropriately. I'm not sure that that - * makes much of a difference, though. + * and initialize zsched appropriately. However, we allow zoneadmd + * to pass down both zone and project rctls for the zone's init. 
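 * For illustration (names here are examples, not an exhaustive list), the
 * rctl buffer handed to zone_create() and parsed by parse_rctls() may now
 * carry entries for either entity:
 *
 *	"zone.cpu-shares"	(zone-scoped, accepted as before)
 *	"project.max-lwps"	(project-scoped, now also accepted)
 *
 * with each name still required to resolve through rctl_hndl_lookup()
 * before its values are used.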
*/ error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0); if (error != 0) { @@ -4845,6 +5503,7 @@ zone_boot(zoneid_t zoneid) static int zone_empty(zone_t *zone) { + int cnt = 0; int waitstatus; /* @@ -4855,7 +5514,16 @@ zone_empty(zone_t *zone) ASSERT(MUTEX_NOT_HELD(&zonehash_lock)); while ((waitstatus = zone_status_timedwait_sig(zone, ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) { - killall(zone->zone_id); + boolean_t force = B_FALSE; + + /* Every 30 seconds, try harder */ + if (cnt++ >= 30) { + cmn_err(CE_WARN, "attempt to force kill zone %d\n", + zone->zone_id); + force = B_TRUE; + cnt = 0; + } + killall(zone->zone_id, force); } /* * return EINTR if we were signaled @@ -5184,6 +5852,7 @@ zone_destroy(zoneid_t zoneid) zone_status_t status; clock_t wait_time; boolean_t log_refcounts; + zone_persist_t *zp; if (secpolicy_zone_config(CRED()) != 0) return (set_errno(EPERM)); @@ -5217,6 +5886,12 @@ zone_destroy(zoneid_t zoneid) zone_hold(zone); mutex_exit(&zonehash_lock); + zp = &zone_pdata[zoneid]; + mutex_enter(&zp->zpers_zfs_lock); + kmem_free(zp->zpers_zfsp, sizeof (zone_zfs_io_t)); + zp->zpers_zfsp = NULL; + mutex_exit(&zp->zpers_zfs_lock); + /* * wait for zsched to exit */ @@ -5606,14 +6281,6 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) error = EFAULT; } break; - case ZONE_ATTR_PHYS_MCAP: - size = sizeof (zone->zone_phys_mcap); - if (bufsize > size) - bufsize = size; - if (buf != NULL && - copyout(&zone->zone_phys_mcap, buf, bufsize) != 0) - error = EFAULT; - break; case ZONE_ATTR_SCHED_CLASS: mutex_enter(&class_lock); @@ -5677,6 +6344,23 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) } kmem_free(zbuf, bufsize); break; + case ZONE_ATTR_DID: + size = sizeof (zoneid_t); + if (bufsize > size) + bufsize = size; + + if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0) + error = EFAULT; + break; + case ZONE_ATTR_SCHED_FIXEDHI: + size = sizeof (boolean_t); + if (bufsize > size) + bufsize = size; + + if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf, + bufsize) != 0) + error = EFAULT; + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { size = bufsize; @@ -5708,10 +6392,9 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) return (set_errno(EPERM)); /* - * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the - * global zone. + * No attributes can be set on the global zone. */ - if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) { + if (zoneid == GLOBAL_ZONEID) { return (set_errno(EINVAL)); } @@ -5724,11 +6407,11 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) mutex_exit(&zonehash_lock); /* - * At present most attributes can only be set on non-running, + * At present attributes can only be set on non-running, * non-global zones. 
*/ zone_status = zone_status_get(zone); - if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) { + if (zone_status > ZONE_IS_READY) { err = EINVAL; goto done; } @@ -5741,6 +6424,14 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) zone->zone_restart_init = B_FALSE; err = 0; break; + case ZONE_ATTR_INITRESTART0: + zone->zone_restart_init_0 = B_TRUE; + err = 0; + break; + case ZONE_ATTR_INITREBOOT: + zone->zone_reboot_on_init_exit = B_TRUE; + err = 0; + break; case ZONE_ATTR_BOOTARGS: err = zone_set_bootargs(zone, (const char *)buf); break; @@ -5753,9 +6444,6 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) case ZONE_ATTR_SECFLAGS: err = zone_set_secflags(zone, (psecflags_t *)buf); break; - case ZONE_ATTR_PHYS_MCAP: - err = zone_set_phys_mcap(zone, (const uint64_t *)buf); - break; case ZONE_ATTR_SCHED_CLASS: err = zone_set_sched_class(zone, (const char *)buf); break; @@ -5783,6 +6471,22 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) err = zone_set_network(zoneid, zbuf); kmem_free(zbuf, bufsize); break; + case ZONE_ATTR_APP_SVC_CT: + if (bufsize != sizeof (boolean_t)) { + err = EINVAL; + } else { + zone->zone_setup_app_contract = (boolean_t)buf; + err = 0; + } + break; + case ZONE_ATTR_SCHED_FIXEDHI: + if (bufsize != sizeof (boolean_t)) { + err = EINVAL; + } else { + zone->zone_fixed_hipri = (boolean_t)buf; + err = 0; + } + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize); @@ -6486,6 +7190,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) zs.doi = zs32.doi; zs.label = (const bslabel_t *)(uintptr_t)zs32.label; zs.flags = zs32.flags; + zs.zoneid = zs32.zoneid; #else panic("get_udatamodel() returned bogus result\n"); #endif @@ -6496,7 +7201,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) (caddr_t)zs.rctlbuf, zs.rctlbufsz, (caddr_t)zs.zfsbuf, zs.zfsbufsz, zs.extended_error, zs.match, zs.doi, - zs.label, zs.flags)); + zs.label, zs.flags, zs.zoneid)); case ZONE_BOOT: return (zone_boot((zoneid_t)(uintptr_t)arg1)); case ZONE_DESTROY: @@ -6597,6 +7302,7 @@ zone_ki_call_zoneadmd(struct zarg *zargp) bcopy(zone->zone_name, zone_name, zone_namelen); zoneid = zone->zone_id; uniqid = zone->zone_uniqid; + arg.status = zone->zone_init_status; /* * zoneadmd may be down, but at least we can empty out the zone. * We can ignore the return value of zone_empty() since we're called @@ -6774,7 +7480,7 @@ zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp) * zone_ki_call_zoneadmd() will do a more thorough job of this * later. */ - killall(zone->zone_id); + killall(zone->zone_id, B_FALSE); /* * Now, create the thread to contact zoneadmd and do the rest of the * work. This thread can't be created in our zone otherwise @@ -6837,16 +7543,15 @@ zone_shutdown_global(void) } /* - * Returns true if the named dataset is visible in the current zone. + * Returns true if the named dataset is visible in the specified zone. * The 'write' parameter is set to 1 if the dataset is also writable. 
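 * As a usage sketch (the dataset name and the caller are hypothetical), a
 * consumer that already holds a specific zone can ask directly:
 *
 *	int writable;
 *
 *	if (zone_dataset_visible_inzone(zone, "tank/zones/foo/data",
 *	    &writable) && writable) {
 *		cmn_err(CE_CONT, "dataset writable in zone %s\n",
 *		    zone->zone_name);
 *	}
 *
 * while zone_dataset_visible(), further below, remains the wrapper that
 * evaluates visibility relative to curproc->p_zone.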
*/ int -zone_dataset_visible(const char *dataset, int *write) +zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write) { static int zfstype = -1; zone_dataset_t *zd; size_t len; - zone_t *zone = curproc->p_zone; const char *name = NULL; vfs_t *vfsp = NULL; @@ -6914,7 +7619,8 @@ zone_dataset_visible(const char *dataset, int *write) vfs_list_read_lock(); vfsp = zone->zone_vfslist; do { - ASSERT(vfsp); + if (vfsp == NULL) + break; if (vfsp->vfs_fstype == zfstype) { name = refstr_value(vfsp->vfs_resource); @@ -6951,6 +7657,18 @@ zone_dataset_visible(const char *dataset, int *write) } /* + * Returns true if the named dataset is visible in the current zone. + * The 'write' parameter is set to 1 if the dataset is also writable. + */ +int +zone_dataset_visible(const char *dataset, int *write) +{ + zone_t *zone = curproc->p_zone; + + return (zone_dataset_visible_inzone(zone, dataset, write)); +} + +/* * zone_find_by_any_path() - * * kernel-private routine similar to zone_find_by_path(), but which @@ -7052,6 +7770,27 @@ zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid) zone_t *zone; zone_t *thiszone; + /* + * Only the GZ may add a datalink to a zone's list. + */ + if (getzoneid() != GLOBAL_ZONEID) + return (set_errno(EPERM)); + + /* + * Only a process with the datalink config priv may add a + * datalink to a zone's list. + */ + if (secpolicy_dl_config(CRED()) != 0) + return (set_errno(EPERM)); + + /* + * When links exist in the GZ, they aren't added to the GZ's + * zone_dl_list. We must enforce this because link_activate() + * depends on zone_check_datalink() returning only NGZs. + */ + if (zoneid == GLOBAL_ZONEID) + return (set_errno(EINVAL)); + if ((thiszone = zone_find_by_id(zoneid)) == NULL) return (set_errno(ENXIO)); @@ -7084,6 +7823,26 @@ zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid) zone_t *zone; int err = 0; + /* + * Only the GZ may remove a datalink from a zone's list. + */ + if (getzoneid() != GLOBAL_ZONEID) + return (set_errno(EPERM)); + + /* + * Only a process with the datalink config priv may remove a + * datalink from a zone's list. + */ + if (secpolicy_dl_config(CRED()) != 0) + return (set_errno(EPERM)); + + /* + * If we can't add a datalink to the GZ's zone_dl_list then we + * certainly can't remove them either. + */ + if (zoneid == GLOBAL_ZONEID) + return (set_errno(EINVAL)); + if ((zone = zone_find_by_id(zoneid)) == NULL) return (set_errno(EINVAL)); @@ -7101,25 +7860,63 @@ zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid) } /* - * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned - * the linkid. Otherwise we just check if the specified zoneidp has been - * assigned the supplied linkid. + * + * This function may be used in two ways: + * + * 1. to get the zoneid of the zone this link is under, or + * + * 2. to verify that the link is under a specific zone. + * + * The first use is achieved by passing a zoneid of ALL_ZONES. The + * function then iterates the datalink list of every zone on the + * system until it finds the linkid. If the linkid is found then the + * function returns 0 and zoneidp is updated. Otherwise, ENXIO is + * returned and zoneidp is not modified. The use of ALL_ZONES is + * limited to callers in the GZ to prevent leaking information to + * NGZs. If an NGZ passes ALL_ZONES it's query is implicitly changed + * to the second type in the list above. + * + * The second use is achieved by passing a specific zoneid. The GZ can + * use this to verify a link is under a particular zone. 
An NGZ can + * use this to verify a link is under itself. But an NGZ cannot use + * this to determine if a link is under some other zone as that would + * result in information leakage. If the link exists under the zone + * then 0 is returned. Otherwise, ENXIO is returned. */ int zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid) { zone_t *zone; + zoneid_t zoneid = *zoneidp; + zoneid_t caller = getzoneid(); int err = ENXIO; - if (*zoneidp != ALL_ZONES) { - if ((zone = zone_find_by_id(*zoneidp)) != NULL) { - if (zone_dl_exists(zone, linkid)) + /* + * Only the GZ may enquire about all zones; an NGZ may only + * enuqire about itself. + */ + if (zoneid == ALL_ZONES && caller != GLOBAL_ZONEID) + zoneid = caller; + + if (zoneid != caller && caller != GLOBAL_ZONEID) + return (err); + + if (zoneid != ALL_ZONES) { + if ((zone = zone_find_by_id(zoneid)) != NULL) { + if (zone_dl_exists(zone, linkid)) { + /* + * We need to set this in case an NGZ + * passes ALL_ZONES. + */ + *zoneidp = zoneid; err = 0; + } zone_rele(zone); } return (err); } + ASSERT(caller == GLOBAL_ZONEID); mutex_enter(&zonehash_lock); for (zone = list_head(&zone_active); zone != NULL; zone = list_next(&zone_active, zone)) { @@ -7130,6 +7927,7 @@ zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid) } } mutex_exit(&zonehash_lock); + return (err); } @@ -7150,6 +7948,12 @@ zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray) zone_dl_t *zdl; datalink_id_t *idptr = idarray; + /* + * Only the GZ or the owning zone may look at the datalink list. + */ + if ((getzoneid() != GLOBAL_ZONEID) && (getzoneid() != zoneid)) + return (set_errno(EPERM)); + if (copyin(nump, &dlcount, sizeof (dlcount)) != 0) return (set_errno(EFAULT)); if ((zone = zone_find_by_id(zoneid)) == NULL) @@ -7175,6 +7979,13 @@ zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray) mutex_exit(&zone->zone_lock); zone_rele(zone); + /* + * Prevent returning negative nump values -- we should never + * have this many links anyways. + */ + if (num > INT_MAX) + return (set_errno(EOVERFLOW)); + /* Increased or decreased, caller should be notified. */ if (num != dlcount) { if (copyout(&num, nump, sizeof (num)) != 0) @@ -7388,3 +8199,231 @@ done: else return (0); } + +static void +zone_incr_capped(zoneid_t zid) +{ + zone_persist_t *zp = &zone_pdata[zid]; + + /* See if over (unlimited is UINT32_MAX), or already marked that way. */ + if (zp->zpers_pg_cnt <= zp->zpers_pg_limit || zp->zpers_over == 1) { + return; + } + + mutex_enter(&zone_physcap_lock); + /* Recheck setting under mutex */ + if (zp->zpers_pg_cnt > zp->zpers_pg_limit && zp->zpers_over == 0) { + zp->zpers_over = 1; + zp->zpers_nover++; + zone_num_over_cap++; + DTRACE_PROBE1(zone__over__pcap, zoneid_t, zid); + } + mutex_exit(&zone_physcap_lock); +} + +/* + * We want some hysteresis when the zone is going under its cap so that we're + * not continuously toggling page scanning back and forth by a single page + * around the cap. Using ~1% of the zone's page limit seems to be a good + * quantity. This table shows some various zone memory caps and the number of + * pages (assuming a 4k page size). Given this, we choose to shift the page + * limit by 7 places to get a hysteresis that is slightly less than 1%. 
+ * + * cap pages pages 1% shift7 shift7 + * 128M 32768 0x0008000 327 256 0x00100 + * 512M 131072 0x0020000 1310 1024 0x00400 + * 1G 262144 0x0040000 2621 2048 0x00800 + * 4G 1048576 0x0100000 10485 8192 0x02000 + * 8G 2097152 0x0200000 20971 16384 0x04000 + * 16G 4194304 0x0400000 41943 32768 0x08000 + * 32G 8388608 0x0800000 83886 65536 0x10000 + * 64G 16777216 0x1000000 167772 131072 0x20000 + */ +static void +zone_decr_capped(zoneid_t zid) +{ + zone_persist_t *zp = &zone_pdata[zid]; + uint32_t adjusted_limit; + + /* + * See if under, or already marked that way. There is no need to + * check for an unlimited cap (zpers_pg_limit == UINT32_MAX) + * since we'll never set zpers_over in zone_incr_capped(). + */ + if (zp->zpers_over == 0 || zp->zpers_pg_cnt >= zp->zpers_pg_limit) { + return; + } + + adjusted_limit = zp->zpers_pg_limit - (zp->zpers_pg_limit >> 7); + + /* Recheck, accounting for our hysteresis. */ + if (zp->zpers_pg_cnt >= adjusted_limit) { + return; + } + + mutex_enter(&zone_physcap_lock); + /* Recheck under mutex. */ + if (zp->zpers_pg_cnt < adjusted_limit && zp->zpers_over == 1) { + zp->zpers_over = 0; + ASSERT(zone_num_over_cap > 0); + zone_num_over_cap--; + DTRACE_PROBE1(zone__under__pcap, zoneid_t, zid); + } + mutex_exit(&zone_physcap_lock); +} + +/* + * For zone_add_page() and zone_rm_page(), access to the page we're touching is + * controlled by our caller's locking. + * On x86 our callers already did: ASSERT(x86_hm_held(pp)) + * On SPARC our callers already did: ASSERT(sfmmu_mlist_held(pp)) + */ +void +zone_add_page(page_t *pp) +{ + uint_t pcnt; + zone_persist_t *zp; + zoneid_t zid; + + /* Skip pages in segkmem, etc. (KV_KVP, ...) */ + if (PP_ISKAS(pp)) + return; + + ASSERT(!PP_ISFREE(pp)); + + zid = curzone->zone_id; + if (pp->p_zoneid == zid) { + /* Another mapping to this page for this zone, do nothing */ + return; + } + + if (pp->p_szc == 0) { + pcnt = 1; + } else { + /* large page */ + pcnt = page_get_pagecnt(pp->p_szc); + } + + if (pp->p_share == 0) { + /* First mapping to this page. */ + pp->p_zoneid = zid; + zp = &zone_pdata[zid]; + ASSERT(zp->zpers_pg_cnt + pcnt < UINT32_MAX); + atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, pcnt); + zone_incr_capped(zid); + return; + } + + if (pp->p_zoneid != ALL_ZONES) { + /* + * The page is now being shared across a different zone. + * Decrement the original zone's usage. + */ + zid = pp->p_zoneid; + pp->p_zoneid = ALL_ZONES; + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pdata[zid]; + + if (zp->zpers_pg_cnt > 0) { + atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt); + } + zone_decr_capped(zid); + } +} + +void +zone_rm_page(page_t *pp) +{ + uint_t pcnt; + zone_persist_t *zp; + zoneid_t zid; + + /* Skip pages in segkmem, etc. (KV_KVP, ...) */ + if (PP_ISKAS(pp)) + return; + + zid = pp->p_zoneid; + if (zid == ALL_ZONES || pp->p_share != 0) + return; + + /* This is the last mapping to the page for a zone. 
*/ + if (pp->p_szc == 0) { + pcnt = 1; + } else { + /* large page */ + pcnt = (int64_t)page_get_pagecnt(pp->p_szc); + } + + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pdata[zid]; + if (zp->zpers_pg_cnt > 0) { + atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt); + } + zone_decr_capped(zid); + pp->p_zoneid = ALL_ZONES; +} + +void +zone_pageout_stat(int zid, zone_pageout_op_t op) +{ + zone_persist_t *zp; + + if (zid == ALL_ZONES) + return; + + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pdata[zid]; + +#ifndef DEBUG + atomic_add_64(&zp->zpers_pg_out, 1); +#else + switch (op) { + case ZPO_DIRTY: + atomic_add_64(&zp->zpers_pg_fsdirty, 1); + break; + case ZPO_FS: + atomic_add_64(&zp->zpers_pg_fs, 1); + break; + case ZPO_ANON: + atomic_add_64(&zp->zpers_pg_anon, 1); + break; + case ZPO_ANONDIRTY: + atomic_add_64(&zp->zpers_pg_anondirty, 1); + break; + default: + cmn_err(CE_PANIC, "Invalid pageout operator %d", op); + break; + } +#endif +} + +/* + * Return the zone's physical memory cap and current free memory (in pages). + */ +void +zone_get_physmem_data(int zid, pgcnt_t *memcap, pgcnt_t *free) +{ + zone_persist_t *zp; + + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pdata[zid]; + + /* + * If memory or swap limits are set on the zone, use those, otherwise + * use the system values. physmem and freemem are also in pages. + */ + if (zp->zpers_pg_limit == UINT32_MAX) { + *memcap = physmem; + *free = freemem; + } else { + int64_t freemem; + + *memcap = (pgcnt_t)zp->zpers_pg_limit; + freemem = zp->zpers_pg_limit - zp->zpers_pg_cnt; + if (freemem > 0) { + *free = (pgcnt_t)freemem; + } else { + *free = (pgcnt_t)0; + } + } +} |
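A minimal user-level sketch of the hysteresis arithmetic that zone_decr_capped() applies (illustrative only; it assumes a 4k page size and reproduces the 1G row of the table above):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint32_t pg_limit = 262144;		/* 1G cap at 4k pages */
	uint32_t hysteresis = pg_limit >> 7;	/* 2048 pages, ~0.78% */
	uint32_t adjusted = pg_limit - hysteresis;

	/*
	 * Once a zone has been marked over its cap, page scanning for it
	 * stops only after zpers_pg_cnt drops below the adjusted limit,
	 * i.e. below 260096 pages (roughly 8MB under the cap) here.
	 */
	(void) printf("limit %u adjusted %u hysteresis %u (%.3f%%)\n",
	    pg_limit, adjusted, hysteresis,
	    100.0 * (double)hysteresis / pg_limit);
	return (0);
}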