path: root/usr/src/uts/common/os
Diffstat (limited to 'usr/src/uts/common/os')
-rw-r--r--  usr/src/uts/common/os/acct.c | 19
-rw-r--r--  usr/src/uts/common/os/brand.c | 185
-rw-r--r--  usr/src/uts/common/os/clock_highres.c | 59
-rw-r--r--  usr/src/uts/common/os/contract.c | 6
-rw-r--r--  usr/src/uts/common/os/core.c | 4
-rw-r--r--  usr/src/uts/common/os/cpu.c | 124
-rw-r--r--  usr/src/uts/common/os/cred.c | 8
-rw-r--r--  usr/src/uts/common/os/ddi_intr_irm.c | 2
-rw-r--r--  usr/src/uts/common/os/exec.c | 121
-rw-r--r--  usr/src/uts/common/os/exit.c | 200
-rw-r--r--  usr/src/uts/common/os/fio.c | 35
-rw-r--r--  usr/src/uts/common/os/fork.c | 52
-rw-r--r--  usr/src/uts/common/os/grow.c | 35
-rw-r--r--  usr/src/uts/common/os/id_space.c | 159
-rw-r--r--  usr/src/uts/common/os/ipc.c | 26
-rw-r--r--  usr/src/uts/common/os/kmem.c | 318
-rw-r--r--  usr/src/uts/common/os/logsubr.c | 6
-rw-r--r--  usr/src/uts/common/os/lwp.c | 131
-rw-r--r--  usr/src/uts/common/os/main.c | 14
-rw-r--r--  usr/src/uts/common/os/mmapobj.c | 13
-rw-r--r--  usr/src/uts/common/os/pid.c | 27
-rw-r--r--  usr/src/uts/common/os/policy.c | 32
-rw-r--r--  usr/src/uts/common/os/priv_defs | 8
-rw-r--r--  usr/src/uts/common/os/rctl.c | 32
-rw-r--r--  usr/src/uts/common/os/rctl_proc.c | 28
-rw-r--r--  usr/src/uts/common/os/sched.c | 15
-rw-r--r--  usr/src/uts/common/os/schedctl.c | 18
-rw-r--r--  usr/src/uts/common/os/shm.c | 41
-rw-r--r--  usr/src/uts/common/os/sig.c | 93
-rw-r--r--  usr/src/uts/common/os/smb_subr.c | 8
-rw-r--r--  usr/src/uts/common/os/streamio.c | 75
-rw-r--r--  usr/src/uts/common/os/sunddi.c | 14
-rw-r--r--  usr/src/uts/common/os/sysent.c | 38
-rw-r--r--  usr/src/uts/common/os/timer.c | 446
-rw-r--r--  usr/src/uts/common/os/timers.c | 49
-rw-r--r--  usr/src/uts/common/os/vmem.c | 2
-rw-r--r--  usr/src/uts/common/os/zone.c | 699
37 files changed, 2231 insertions, 911 deletions
diff --git a/usr/src/uts/common/os/acct.c b/usr/src/uts/common/os/acct.c
index e598e0d08d..891c4e0836 100644
--- a/usr/src/uts/common/os/acct.c
+++ b/usr/src/uts/common/os/acct.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2017, Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -47,6 +48,7 @@
#include <sys/time.h>
#include <sys/msacct.h>
#include <sys/zone.h>
+#include <sys/brand.h>
/*
* Each zone has its own accounting settings (on or off) and associated
@@ -373,7 +375,7 @@ acct_compress(ulong_t t)
* On exit, write a record on the accounting file.
*/
void
-acct(char st)
+acct(int st)
{
struct vnode *vp;
struct cred *cr;
@@ -402,6 +404,21 @@ acct(char st)
* This only gets called from exit after all lwp's have exited so no
* cred locking is needed.
*/
+
+ /* If there is a brand-specific hook, use it instead */
+ if (ZONE_IS_BRANDED(curzone) && ZBROP(curzone)->b_acct_out != NULL) {
+ ZBROP(curzone)->b_acct_out(vp, st);
+ mutex_exit(&ag->aclock);
+ return;
+ }
+
+ /*
+ * The 'st' status value was traditionally masked this way by our
+ * caller, but we now accept the unmasked value for brand handling.
+ * Zones not using the brand hook mask the status here.
+ */
+ st &= 0xff;
+
p = curproc;
ua = PTOU(p);
bcopy(ua->u_comm, ag->acctbuf.ac_comm, sizeof (ag->acctbuf.ac_comm));
diff --git a/usr/src/uts/common/os/brand.c b/usr/src/uts/common/os/brand.c
index 0af67f5d98..62c3bbe2d6 100644
--- a/usr/src/uts/common/os/brand.c
+++ b/usr/src/uts/common/os/brand.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016, Joyent, Inc.
*/
#include <sys/kmem.h>
@@ -45,7 +46,7 @@ struct brand_mach_ops native_mach_ops = {
};
#else /* !__sparcv9 */
struct brand_mach_ops native_mach_ops = {
- NULL, NULL, NULL, NULL
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL
};
#endif /* !__sparcv9 */
@@ -53,7 +54,8 @@ brand_t native_brand = {
BRAND_VER_1,
"native",
NULL,
- &native_mach_ops
+ &native_mach_ops,
+ 0
};
/*
@@ -310,46 +312,115 @@ brand_unregister_zone(struct brand *bp)
mutex_exit(&brand_list_lock);
}
-void
-brand_setbrand(proc_t *p)
+int
+brand_setbrand(proc_t *p, boolean_t lwps_ok)
{
brand_t *bp = p->p_zone->zone_brand;
+ void *brand_data = NULL;
- ASSERT(bp != NULL);
- ASSERT(p->p_brand == &native_brand);
+ VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+ VERIFY(bp != NULL);
/*
- * We should only be called from exec(), when we know the process
- * is single-threaded.
+ * Process branding occurs during fork() and exec(). When it happens
+ * during fork(), the LWP count will always be 0 since branding is
+ * performed as part of getproc(), before LWPs have been associated.
+ * The same is not true during exec(), where a multi-LWP process may
+ * undergo branding just prior to gexec(). This is to ensure
+ * exec-related brand hooks are available. While it may seem
+ * complicated to brand a multi-LWP process, the two possible outcomes
+ * simplify things:
+ *
+ * 1. The exec() succeeds: LWPs besides the caller will be killed and
+ * any further branding will occur in a single-LWP context.
+ * 2. The exec() fails: The process will be promptly unbranded since
+ * the hooks are no longer needed.
+ *
+ * To prevent inconsistent brand state from being encountered during
+ * the exec(), LWPs beyond the caller which are associated with this
+ * process must be held temporarily. They will be released either when
+ * they are killed after a successful exec(), or when the brand is
+ * cleared after a failed exec().
*/
- ASSERT(p->p_tlist == p->p_tlist->t_forw);
+ if (lwps_ok) {
+ /*
+ * We've been called from an exec() context, where tolerating the
+ * existence of multiple LWPs during branding is necessary.
+ */
+ VERIFY(p == curproc);
+ VERIFY(p->p_tlist != NULL);
+ if (p->p_tlist != p->p_tlist->t_forw) {
+ /*
+ * Multiple LWPs are present. Hold all but the caller.
+ */
+ if (!holdlwps(SHOLDFORK1)) {
+ return (-1);
+ }
+ }
+ } else {
+ /*
+ * Processes branded during fork() should not have LWPs at all.
+ */
+ VERIFY(p->p_tlist == NULL);
+ }
+
+ if (bp->b_data_size > 0) {
+ brand_data = kmem_zalloc(bp->b_data_size, KM_SLEEP);
+ }
+
+ mutex_enter(&p->p_lock);
+ ASSERT(!PROC_IS_BRANDED(p));
p->p_brand = bp;
+ p->p_brand_data = brand_data;
ASSERT(PROC_IS_BRANDED(p));
BROP(p)->b_setbrand(p);
+ mutex_exit(&p->p_lock);
+ return (0);
}
void
-brand_clearbrand(proc_t *p, boolean_t no_lwps)
+brand_clearbrand(proc_t *p, boolean_t lwps_ok)
{
brand_t *bp = p->p_zone->zone_brand;
- klwp_t *lwp = NULL;
- ASSERT(bp != NULL);
- ASSERT(!no_lwps || (p->p_tlist == NULL));
+ void *brand_data;
- /*
- * If called from exec_common() or proc_exit(),
- * we know the process is single-threaded.
- * If called from fork_fail, p_tlist is NULL.
- */
- if (!no_lwps) {
- ASSERT(p->p_tlist == p->p_tlist->t_forw);
- lwp = p->p_tlist->t_lwp;
- }
+ VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+ VERIFY(bp != NULL);
+ VERIFY(PROC_IS_BRANDED(p));
- ASSERT(PROC_IS_BRANDED(p));
- BROP(p)->b_proc_exit(p, lwp);
+ if (BROP(p)->b_clearbrand != NULL)
+ BROP(p)->b_clearbrand(p, lwps_ok);
+
+ mutex_enter(&p->p_lock);
p->p_brand = &native_brand;
+ brand_data = p->p_brand_data;
+ p->p_brand_data = NULL;
+
+ if (lwps_ok) {
+ VERIFY(p == curproc);
+ /*
+ * A process with multiple LWPs is being de-branded after
+ * failing an exec. The other LWPs were held as part of the
+ * procedure, so they must be resumed now.
+ */
+ if (p->p_tlist != NULL && p->p_tlist != p->p_tlist->t_forw) {
+ continuelwps(p);
+ }
+ } else {
+ /*
+ * While clearing the brand, it's ok for one LWP to be present.
+ * This happens when a native binary is executed inside a
+ * branded zone, since the brand will be removed during the
+ * course of a successful exec.
+ */
+ VERIFY(p->p_tlist == NULL || p->p_tlist == p->p_tlist->t_forw);
+ }
+ mutex_exit(&p->p_lock);
+
+ if (brand_data != NULL) {
+ kmem_free(brand_data, bp->b_data_size);
+ }
}
#if defined(__sparcv9)
@@ -483,7 +554,7 @@ brand_solaris_cmd(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3,
return (ENOSYS);
/* For all other operations this must be a branded process. */
- if (p->p_brand == &native_brand)
+ if (!PROC_IS_BRANDED(p))
return (ENOSYS);
ASSERT(p->p_brand == pbrand);
@@ -601,15 +672,15 @@ restoreexecenv(struct execenv *ep, stack_t *sp)
int
brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file,
- cred_t *cred, int brand_action, struct brand *pbrand, char *bname,
- char *brandlib, char *brandlib32, char *brandlinker, char *brandlinker32)
+ cred_t *cred, int *brand_action, struct brand *pbrand, char *bname,
+ char *brandlib, char *brandlib32)
{
vnode_t *nvp;
Ehdr ehdr;
Addr uphdr_vaddr;
intptr_t voffset;
- int interp;
+ char *interp;
int i, err;
struct execenv env;
struct execenv origenv;
@@ -619,7 +690,6 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
klwp_t *lwp = ttolwp(curthread);
brand_proc_data_t *spd;
brand_elf_data_t sed, *sedp;
- char *linker;
uintptr_t lddata; /* lddata of executable's linker */
ASSERT(curproc->p_brand == pbrand);
@@ -636,12 +706,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
*/
if (args->to_model == DATAMODEL_NATIVE) {
args->emulator = brandlib;
- linker = brandlinker;
}
#if defined(_LP64)
else {
args->emulator = brandlib32;
- linker = brandlinker32;
}
#endif /* _LP64 */
@@ -725,7 +793,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
if (args->to_model == DATAMODEL_NATIVE) {
err = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr,
&voffset, exec_file, &interp, &env.ex_bssbase,
- &env.ex_brkbase, &env.ex_brksize, NULL);
+ &env.ex_brkbase, &env.ex_brksize, NULL, NULL);
}
#if defined(_LP64)
else {
@@ -733,7 +801,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
Elf32_Addr uphdr_vaddr32;
err = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32,
&voffset, exec_file, &interp, &env.ex_bssbase,
- &env.ex_brkbase, &env.ex_brksize, NULL);
+ &env.ex_brkbase, &env.ex_brksize, NULL, NULL);
Ehdr32to64(&ehdr32, &ehdr);
if (uphdr_vaddr32 == (Elf32_Addr)-1)
@@ -744,6 +812,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
#endif /* _LP64 */
if (err != 0) {
restoreexecenv(&origenv, &orig_sigaltstack);
+
+ if (interp != NULL)
+ kmem_free(interp, MAXPATHLEN);
+
return (err);
}
@@ -761,7 +833,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
sedp->sed_phent = ehdr.e_phentsize;
sedp->sed_phnum = ehdr.e_phnum;
- if (interp) {
+ if (interp != NULL) {
if (ehdr.e_type == ET_DYN) {
/*
* This is a shared object executable, so we
@@ -777,16 +849,20 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
* it in and store relevant information about it in the
* aux vector, where the brand library can find it.
*/
- if ((err = lookupname(linker, UIO_SYSSPACE,
+ if ((err = lookupname(interp, UIO_SYSSPACE,
FOLLOW, NULLVPP, &nvp)) != 0) {
- uprintf("%s: not found.", brandlinker);
+ uprintf("%s: not found.", interp);
restoreexecenv(&origenv, &orig_sigaltstack);
+ kmem_free(interp, MAXPATHLEN);
return (err);
}
+
+ kmem_free(interp, MAXPATHLEN);
+
if (args->to_model == DATAMODEL_NATIVE) {
err = mapexec_brand(nvp, args, &ehdr,
&uphdr_vaddr, &voffset, exec_file, &interp,
- NULL, NULL, NULL, &lddata);
+ NULL, NULL, NULL, &lddata, NULL);
}
#if defined(_LP64)
else {
@@ -794,7 +870,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
Elf32_Addr uphdr_vaddr32;
err = mapexec32_brand(nvp, args, &ehdr32,
&uphdr_vaddr32, &voffset, exec_file, &interp,
- NULL, NULL, NULL, &lddata);
+ NULL, NULL, NULL, &lddata, NULL);
Ehdr32to64(&ehdr32, &ehdr);
if (uphdr_vaddr32 == (Elf32_Addr)-1)
@@ -934,9 +1010,9 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
/*
* Third, the /proc aux vectors set up by elfexec() point to
- * brand emulation library and it's linker. Copy these to the
+ * brand emulation library and its linker. Copy these to the
* /proc brand specific aux vector, and update the regular
- * /proc aux vectors to point to the executable (and it's
+ * /proc aux vectors to point to the executable (and its
* linker). This will enable debuggers to access the
* executable via the usual /proc or elf notes aux vectors.
*
@@ -1078,55 +1154,31 @@ brand_solaris_freelwp(klwp_t *l, struct brand *pbrand)
}
/*ARGSUSED*/
-int
+void
brand_solaris_initlwp(klwp_t *l, struct brand *pbrand)
{
ASSERT(l->lwp_procp->p_brand == pbrand);
ASSERT(l->lwp_procp->p_brand_data != NULL);
ASSERT(l->lwp_brand == NULL);
l->lwp_brand = (void *)-1;
- return (0);
}
/*ARGSUSED*/
void
brand_solaris_lwpexit(klwp_t *l, struct brand *pbrand)
{
- proc_t *p = l->lwp_procp;
-
ASSERT(l->lwp_procp->p_brand == pbrand);
ASSERT(l->lwp_procp->p_brand_data != NULL);
ASSERT(l->lwp_brand != NULL);
-
- /*
- * We should never be called for the last thread in a process.
- * (That case is handled by brand_solaris_proc_exit().)
- * Therefore this lwp must be exiting from a multi-threaded
- * process.
- */
- ASSERT(p->p_tlist != p->p_tlist->t_forw);
-
- l->lwp_brand = NULL;
}
/*ARGSUSED*/
void
-brand_solaris_proc_exit(struct proc *p, klwp_t *l, struct brand *pbrand)
+brand_solaris_proc_exit(struct proc *p, struct brand *pbrand)
{
ASSERT(p->p_brand == pbrand);
ASSERT(p->p_brand_data != NULL);
- /*
- * When called from proc_exit(), we know that process is
- * single-threaded and free our lwp brand data.
- * otherwise just free p_brand_data and return.
- */
- if (l != NULL) {
- ASSERT(p->p_tlist == p->p_tlist->t_forw);
- ASSERT(p->p_tlist->t_lwp == l);
- (void) brand_solaris_freelwp(l, pbrand);
- }
-
/* upon exit, free our proc brand data */
kmem_free(p->p_brand_data, sizeof (brand_proc_data_t));
p->p_brand_data = NULL;
@@ -1145,5 +1197,4 @@ brand_solaris_setbrand(proc_t *p, struct brand *pbrand)
ASSERT(p->p_tlist == p->p_tlist->t_forw);
p->p_brand_data = kmem_zalloc(sizeof (brand_proc_data_t), KM_SLEEP);
- (void) brand_solaris_initlwp(p->p_tlist->t_lwp, pbrand);
}
diff --git a/usr/src/uts/common/os/clock_highres.c b/usr/src/uts/common/os/clock_highres.c
index 805813037d..1280c8a1b6 100644
--- a/usr/src/uts/common/os/clock_highres.c
+++ b/usr/src/uts/common/os/clock_highres.c
@@ -25,7 +25,7 @@
*/
/*
- * Copyright (c) 2015, Joyent Inc. All rights reserved.
+ * Copyright 2016, Joyent Inc.
*/
#include <sys/timer.h>
@@ -41,6 +41,9 @@
static clock_backend_t clock_highres;
+/* minimum non-privileged interval (200us) */
+long clock_highres_interval_min = 200000;
+
/*ARGSUSED*/
static int
clock_highres_settime(timespec_t *ts)
@@ -68,17 +71,6 @@ clock_highres_getres(timespec_t *ts)
static int
clock_highres_timer_create(itimer_t *it, void (*fire)(itimer_t *))
{
- /*
- * CLOCK_HIGHRES timers of sufficiently high resolution can deny
- * service; only allow privileged users to create such timers.
- * Sites that do not wish to have this restriction should
- * give users the "proc_clock_highres" privilege.
- */
- if (secpolicy_clock_highres(CRED()) != 0) {
- it->it_arg = NULL;
- return (EPERM);
- }
-
it->it_arg = kmem_zalloc(sizeof (cyclic_id_t), KM_SLEEP);
it->it_fire = fire;
@@ -111,6 +103,49 @@ clock_highres_timer_settime(itimer_t *it, int flags,
cpu_t *cpu;
cpupart_t *cpupart;
int pset;
+ boolean_t value_need_clamp = B_FALSE;
+ boolean_t intval_need_clamp = B_FALSE;
+ cred_t *cr = CRED();
+ struct itimerspec clamped;
+
+ /*
+ * CLOCK_HIGHRES timers of sufficiently high resolution can deny
+ * service; only allow privileged users to create such timers.
+ * Non-privileged users (those without the "proc_clock_highres"
+ * privilege) can create timers with lower resolution but if they
+ * attempt to use a very low time value (< 200us) then their
+ * timer will be clamped at 200us.
+ */
+ if (when->it_value.tv_sec == 0 &&
+ when->it_value.tv_nsec > 0 &&
+ when->it_value.tv_nsec < clock_highres_interval_min)
+ value_need_clamp = B_TRUE;
+
+ if (when->it_interval.tv_sec == 0 &&
+ when->it_interval.tv_nsec > 0 &&
+ when->it_interval.tv_nsec < clock_highres_interval_min)
+ intval_need_clamp = B_TRUE;
+
+ if ((value_need_clamp || intval_need_clamp) &&
+ secpolicy_clock_highres(cr) != 0) {
+ clamped.it_value.tv_sec = when->it_value.tv_sec;
+ clamped.it_interval.tv_sec = when->it_interval.tv_sec;
+
+ if (value_need_clamp) {
+ clamped.it_value.tv_nsec = clock_highres_interval_min;
+ } else {
+ clamped.it_value.tv_nsec = when->it_value.tv_nsec;
+ }
+
+ if (intval_need_clamp) {
+ clamped.it_interval.tv_nsec =
+ clock_highres_interval_min;
+ } else {
+ clamped.it_interval.tv_nsec = when->it_interval.tv_nsec;
+ }
+
+ when = &clamped;
+ }
cyctime.cyt_when = ts2hrt(&when->it_value);
cyctime.cyt_interval = ts2hrt(&when->it_interval);
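A minimal user-level sketch (not part of this patch) of the clamping behaviour described above: without the "proc_clock_highres" privilege, the 50us value and interval below would be silently raised to clock_highres_interval_min (200us) when the timer is programmed.

#include <signal.h>
#include <time.h>
#include <unistd.h>

int
main(void)
{
	timer_t tid;
	struct sigevent ev = { 0 };
	struct itimerspec its = { 0 };

	ev.sigev_notify = SIGEV_SIGNAL;
	ev.sigev_signo = SIGALRM;
	if (timer_create(CLOCK_HIGHRES, &ev, &tid) != 0)
		return (1);

	/* 50us is below the non-privileged minimum; an unprivileged */
	/* caller will have both fields clamped up to 200us. */
	its.it_value.tv_nsec = 50000;
	its.it_interval.tv_nsec = 50000;
	if (timer_settime(tid, 0, &its, NULL) != 0)
		return (1);

	(void) pause();
	return (0);
}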
diff --git a/usr/src/uts/common/os/contract.c b/usr/src/uts/common/os/contract.c
index 909a6c2860..1a3502a710 100644
--- a/usr/src/uts/common/os/contract.c
+++ b/usr/src/uts/common/os/contract.c
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
/*
* Copyright (c) 2017 by Delphix. All rights reserved.
@@ -290,7 +291,10 @@ contract_ctor(contract_t *ct, ct_type_t *type, ct_template_t *tmpl, void *data,
avl_index_t where;
klwp_t *curlwp = ttolwp(curthread);
- ASSERT(author == curproc);
+ /*
+ * It's possible that author is not curproc if the zone is creating
+ * a new process as a child of zsched.
+ */
mutex_init(&ct->ct_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&ct->ct_reflock, NULL, MUTEX_DEFAULT, NULL);
diff --git a/usr/src/uts/common/os/core.c b/usr/src/uts/common/os/core.c
index d5e272c16a..437f26e6e0 100644
--- a/usr/src/uts/common/os/core.c
+++ b/usr/src/uts/common/os/core.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent Inc. All rights reserved.
+ * Copyright 2016, Joyent Inc.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
@@ -125,6 +125,7 @@ remove_core_file(char *fp, enum core_types core_type)
/*
* Determine what rootvp to use.
*/
+ mutex_enter(&curproc->p_lock);
if (core_type == CORE_PROC) {
rootvp = (PTOU(curproc)->u_rdir == NULL ?
curproc->p_zone->zone_rootvp : PTOU(curproc)->u_rdir);
@@ -140,6 +141,7 @@ remove_core_file(char *fp, enum core_types core_type)
VN_HOLD(startvp);
if (rootvp != rootdir)
VN_HOLD(rootvp);
+ mutex_exit(&curproc->p_lock);
if ((error = lookuppnvp(&pn, NULL, NO_FOLLOW, &dvp, &vp, rootvp,
startvp, CRED())) != 0) {
pn_free(&pn);
diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c
index 87c0896814..9c7600ea24 100644
--- a/usr/src/uts/common/os/cpu.c
+++ b/usr/src/uts/common/os/cpu.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
*/
/*
@@ -108,7 +109,8 @@ kmutex_t cpu_lock;
cpu_t *cpu_list; /* list of all CPUs */
cpu_t *clock_cpu_list; /* used by clock to walk CPUs */
cpu_t *cpu_active; /* list of active CPUs */
-static cpuset_t cpu_available; /* set of available CPUs */
+cpuset_t cpu_active_set; /* cached set of active CPUs */
+cpuset_t cpu_available; /* set of available CPUs */
cpuset_t cpu_seqid_inuse; /* which cpu_seqids are in use */
cpu_t **cpu_seq; /* ptrs to CPUs, indexed by seq_id */
@@ -1724,6 +1726,7 @@ cpu_list_init(cpu_t *cp)
cp->cpu_part = &cp_default;
CPUSET_ADD(cpu_available, cp->cpu_id);
+ CPUSET_ADD(cpu_active_set, cp->cpu_id);
}
/*
@@ -1895,6 +1898,7 @@ cpu_add_active_internal(cpu_t *cp)
cp->cpu_prev_onln = cpu_active->cpu_prev_onln;
cpu_active->cpu_prev_onln->cpu_next_onln = cp;
cpu_active->cpu_prev_onln = cp;
+ CPUSET_ADD(cpu_active_set, cp->cpu_id);
if (pp->cp_cpulist) {
cp->cpu_next_part = pp->cp_cpulist;
@@ -1965,6 +1969,7 @@ cpu_remove_active(cpu_t *cp)
}
cp->cpu_next_onln = cp;
cp->cpu_prev_onln = cp;
+ CPUSET_DEL(cpu_active_set, cp->cpu_id);
cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
@@ -2704,13 +2709,18 @@ cpu_bind_thread(kthread_id_t tp, processorid_t bind, processorid_t *obind,
return (0);
}
-#if CPUSET_WORDS > 1
-/*
- * Functions for implementing cpuset operations when a cpuset is more
- * than one word. On platforms where a cpuset is a single word these
- * are implemented as macros in cpuvar.h.
- */
+cpuset_t *
+cpuset_alloc(int kmflags)
+{
+ return (kmem_alloc(sizeof (cpuset_t), kmflags));
+}
+
+void
+cpuset_free(cpuset_t *s)
+{
+ kmem_free(s, sizeof (cpuset_t));
+}
void
cpuset_all(cpuset_t *s)
@@ -2722,38 +2732,61 @@ cpuset_all(cpuset_t *s)
}
void
-cpuset_all_but(cpuset_t *s, uint_t cpu)
+cpuset_all_but(cpuset_t *s, const uint_t cpu)
{
cpuset_all(s);
CPUSET_DEL(*s, cpu);
}
void
-cpuset_only(cpuset_t *s, uint_t cpu)
+cpuset_only(cpuset_t *s, const uint_t cpu)
{
CPUSET_ZERO(*s);
CPUSET_ADD(*s, cpu);
}
+long
+cpu_in_set(cpuset_t *s, const uint_t cpu)
+{
+ VERIFY(cpu < NCPU);
+ return (BT_TEST(s->cpub, cpu));
+}
+
+void
+cpuset_add(cpuset_t *s, const uint_t cpu)
+{
+ VERIFY(cpu < NCPU);
+ BT_SET(s->cpub, cpu);
+}
+
+void
+cpuset_del(cpuset_t *s, const uint_t cpu)
+{
+ VERIFY(cpu < NCPU);
+ BT_CLEAR(s->cpub, cpu);
+}
+
int
cpuset_isnull(cpuset_t *s)
{
int i;
- for (i = 0; i < CPUSET_WORDS; i++)
+ for (i = 0; i < CPUSET_WORDS; i++) {
if (s->cpub[i] != 0)
return (0);
+ }
return (1);
}
int
-cpuset_cmp(cpuset_t *s1, cpuset_t *s2)
+cpuset_isequal(cpuset_t *s1, cpuset_t *s2)
{
int i;
- for (i = 0; i < CPUSET_WORDS; i++)
+ for (i = 0; i < CPUSET_WORDS; i++) {
if (s1->cpub[i] != s2->cpub[i])
return (0);
+ }
return (1);
}
@@ -2822,7 +2855,72 @@ cpuset_bounds(cpuset_t *s, uint_t *smallestid, uint_t *largestid)
*smallestid = *largestid = CPUSET_NOTINSET;
}
-#endif /* CPUSET_WORDS */
+void
+cpuset_atomic_del(cpuset_t *s, const uint_t cpu)
+{
+ VERIFY(cpu < NCPU);
+ BT_ATOMIC_CLEAR(s->cpub, (cpu))
+}
+
+void
+cpuset_atomic_add(cpuset_t *s, const uint_t cpu)
+{
+ VERIFY(cpu < NCPU);
+ BT_ATOMIC_SET(s->cpub, (cpu))
+}
+
+long
+cpuset_atomic_xadd(cpuset_t *s, const uint_t cpu)
+{
+ long res;
+
+ VERIFY(cpu < NCPU);
+ BT_ATOMIC_SET_EXCL(s->cpub, cpu, res);
+ return (res);
+}
+
+long
+cpuset_atomic_xdel(cpuset_t *s, const uint_t cpu)
+{
+ long res;
+
+ VERIFY(cpu < NCPU);
+ BT_ATOMIC_CLEAR_EXCL(s->cpub, cpu, res);
+ return (res);
+}
+
+void
+cpuset_or(cpuset_t *dst, cpuset_t *src)
+{
+ for (int i = 0; i < CPUSET_WORDS; i++) {
+ dst->cpub[i] |= src->cpub[i];
+ }
+}
+
+void
+cpuset_xor(cpuset_t *dst, cpuset_t *src)
+{
+ for (int i = 0; i < CPUSET_WORDS; i++) {
+ dst->cpub[i] ^= src->cpub[i];
+ }
+}
+
+void
+cpuset_and(cpuset_t *dst, cpuset_t *src)
+{
+ for (int i = 0; i < CPUSET_WORDS; i++) {
+ dst->cpub[i] &= src->cpub[i];
+ }
+}
+
+void
+cpuset_zero(cpuset_t *dst)
+{
+ for (int i = 0; i < CPUSET_WORDS; i++) {
+ dst->cpub[i] = 0;
+ }
+}
+
/*
* Unbind threads bound to specified CPU.
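A minimal sketch (not part of this patch) of how the new first-class cpuset accessors might be combined by kernel code, assuming a context where KM_SLEEP allocation is legal:

	cpuset_t *set = cpuset_alloc(KM_SLEEP);

	cpuset_zero(set);
	cpuset_add(set, CPU->cpu_id);		/* start with the current CPU */
	cpuset_or(set, &cpu_active_set);	/* add every active CPU */
	if (!cpuset_isnull(set) && cpu_in_set(set, CPU->cpu_id)) {
		/* ... operate on the selected CPUs ... */
	}
	cpuset_free(set);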
diff --git a/usr/src/uts/common/os/cred.c b/usr/src/uts/common/os/cred.c
index 25727d54c5..0bd6cfd44f 100644
--- a/usr/src/uts/common/os/cred.c
+++ b/usr/src/uts/common/os/cred.c
@@ -729,6 +729,14 @@ crgetzoneid(const cred_t *cr)
cr->cr_zone->zone_id);
}
+zoneid_t
+crgetzonedid(const cred_t *cr)
+{
+ return (cr->cr_zone == NULL ?
+ (cr->cr_uid == -1 ? (zoneid_t)-1 : GLOBAL_ZONEID) :
+ cr->cr_zone->zone_did);
+}
+
projid_t
crgetprojid(const cred_t *cr)
{
diff --git a/usr/src/uts/common/os/ddi_intr_irm.c b/usr/src/uts/common/os/ddi_intr_irm.c
index c3c0481e7f..a4b35dcb5b 100644
--- a/usr/src/uts/common/os/ddi_intr_irm.c
+++ b/usr/src/uts/common/os/ddi_intr_irm.c
@@ -1320,7 +1320,7 @@ i_ddi_irm_notify(ddi_irm_pool_t *pool_p, ddi_irm_req_t *req_p)
/* Log callback errors */
if (ret != DDI_SUCCESS) {
- cmn_err(CE_WARN, "%s%d: failed callback (action=%d, ret=%d)\n",
+ cmn_err(CE_WARN, "!%s%d: failed callback (action=%d, ret=%d)\n",
ddi_driver_name(req_p->ireq_dip),
ddi_get_instance(req_p->ireq_dip), (int)action, ret);
}
diff --git a/usr/src/uts/common/os/exec.c b/usr/src/uts/common/os/exec.c
index 0065b4945b..2ab4d1f023 100644
--- a/usr/src/uts/common/os/exec.c
+++ b/usr/src/uts/common/os/exec.c
@@ -98,6 +98,7 @@ uint_t auxv_hwcap32_2 = 0; /* 32-bit version of auxv_hwcap2 */
#endif
#define PSUIDFLAGS (SNOCD|SUGID)
+#define RANDOM_LEN 16 /* 16 bytes for AT_RANDOM aux entry */
/*
* These are consumed within the specific exec modules, but are defined here
@@ -256,8 +257,10 @@ exec_common(const char *fname, const char **argp, const char **envp,
* only if the pathname does not contain a "/" the resolved path
* points to a file in the current working (attribute) directory.
*/
- if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 &&
+ mutex_enter(&p->p_lock);
+ if ((PTOU(p)->u_cdir->v_flag & V_XATTRDIR) != 0 &&
strchr(resolvepn.pn_path, '/') == NULL) {
+ mutex_exit(&p->p_lock);
if (dir != NULL)
VN_RELE(dir);
error = EACCES;
@@ -266,6 +269,7 @@ exec_common(const char *fname, const char **argp, const char **envp,
VN_RELE(vp);
goto out;
}
+ mutex_exit(&p->p_lock);
bzero(exec_file, MAXCOMLEN+1);
(void) strncpy(exec_file, pn.pn_path, MAXCOMLEN);
@@ -313,14 +317,43 @@ exec_common(const char *fname, const char **argp, const char **envp,
ua.argp = argp;
ua.envp = envp;
- /* If necessary, brand this process before we start the exec. */
- if (brandme)
- brand_setbrand(p);
+ /* If necessary, brand this process/lwp before we start the exec. */
+ if (brandme) {
+ void *brand_data = NULL;
+
+ /*
+ * Process branding may fail if multiple LWPs are present and
+ * holdlwps() cannot complete successfully.
+ */
+ error = brand_setbrand(p, B_TRUE);
+
+ if (error == 0 && BROP(p)->b_lwpdata_alloc != NULL) {
+ brand_data = BROP(p)->b_lwpdata_alloc(p);
+ if (brand_data == NULL) {
+ error = 1;
+ }
+ }
+
+ if (error == 0) {
+ mutex_enter(&p->p_lock);
+ BROP(p)->b_initlwp(lwp, brand_data);
+ mutex_exit(&p->p_lock);
+ } else {
+ VN_RELE(vp);
+ if (dir != NULL) {
+ VN_RELE(dir);
+ }
+ pn_free(&resolvepn);
+ goto fail;
+ }
+ }
if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz,
- exec_file, p->p_cred, brand_action)) != 0) {
- if (brandme)
- brand_clearbrand(p, B_FALSE);
+ exec_file, p->p_cred, &brand_action)) != 0) {
+ if (brandme) {
+ BROP(p)->b_freelwp(lwp);
+ brand_clearbrand(p, B_TRUE);
+ }
VN_RELE(vp);
if (dir != NULL)
VN_RELE(dir);
@@ -352,7 +385,7 @@ exec_common(const char *fname, const char **argp, const char **envp,
/*
* Clear contract template state
*/
- lwp_ctmpl_clear(lwp);
+ lwp_ctmpl_clear(lwp, B_TRUE);
/*
* Save the directory in which we found the executable for expanding
@@ -376,6 +409,8 @@ exec_common(const char *fname, const char **argp, const char **envp,
* pending held signals remain held, so don't clear t_hold.
*/
mutex_enter(&p->p_lock);
+ DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp,
+ uintptr_t, lwp->lwp_oldcontext, uintptr_t, 0);
lwp->lwp_oldcontext = 0;
lwp->lwp_ustack = 0;
lwp->lwp_old_stk_ctl = 0;
@@ -435,8 +470,10 @@ exec_common(const char *fname, const char **argp, const char **envp,
TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up);
/* Unbrand ourself if necessary. */
- if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE))
+ if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) {
+ BROP(p)->b_freelwp(lwp);
brand_clearbrand(p, B_FALSE);
+ }
setregs(&args);
@@ -560,7 +597,7 @@ gexec(
long *execsz,
caddr_t exec_file,
struct cred *cred,
- int brand_action)
+ int *brand_action)
{
struct vnode *vp, *execvp = NULL;
proc_t *pp = ttoproc(curthread);
@@ -881,8 +918,14 @@ gexec(
if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE))
args->traceinval = 1;
}
- if (pp->p_proc_flag & P_PR_PTRACE)
+
+ /*
+ * If legacy ptrace is enabled, generate the SIGTRAP.
+ */
+ if (pp->p_proc_flag & P_PR_PTRACE) {
psignal(pp, SIGTRAP);
+ }
+
if (args->traceinval)
prinvalidate(&pp->p_user);
}
@@ -1546,6 +1589,27 @@ stk_add(uarg_t *args, const char *sp, enum uio_seg segflg)
return (0);
}
+/*
+ * Add a fixed size byte array to the stack (only from kernel space).
+ */
+static int
+stk_byte_add(uarg_t *args, const uint8_t *sp, size_t len)
+{
+ int error;
+
+ if (STK_AVAIL(args) < sizeof (int))
+ return (E2BIG);
+ *--args->stk_offp = args->stk_strp - args->stk_base;
+
+ if (len > STK_AVAIL(args))
+ return (E2BIG);
+ bcopy(sp, args->stk_strp, len);
+
+ args->stk_strp += len;
+
+ return (0);
+}
+
static int
stk_getptr(uarg_t *args, char *src, char **dst)
{
@@ -1582,6 +1646,7 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
size_t size, pad;
char *argv = (char *)uap->argp;
char *envp = (char *)uap->envp;
+ uint8_t rdata[RANDOM_LEN];
/*
* Copy interpreter's name and argument to argv[0] and argv[1].
@@ -1664,8 +1729,9 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
args->ne = args->na - argc;
/*
- * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and
- * AT_SUN_EMULATOR strings to the stack.
+ * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME,
+ * AT_SUN_BRAND_NROOT, and AT_SUN_EMULATOR strings, as well as the
+ * AT_RANDOM array, to the stack.
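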
*/
if (auxvpp != NULL && *auxvpp != NULL) {
if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0)
@@ -1678,6 +1744,20 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
if (args->emulator != NULL &&
(error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0)
return (error);
+
+ /*
+ * For the AT_RANDOM aux vector we provide 16 bytes of random
+ * data.
+ */
+ (void) random_get_pseudo_bytes(rdata, sizeof (rdata));
+
+ if ((error = stk_byte_add(args, rdata, sizeof (rdata))) != 0)
+ return (error);
+
+ if (args->brand_nroot != NULL &&
+ (error = stk_add(args, args->brand_nroot,
+ UIO_SYSSPACE)) != 0)
+ return (error);
}
/*
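A hypothetical user-level sketch of consuming the new AT_RANDOM entry by walking the aux vector that follows the environment on the initial stack; it assumes AT_RANDOM is defined in <sys/auxv.h> by the matching header change, which lies outside this directory.

#include <sys/auxv.h>
#include <stdio.h>

extern char **environ;

int
main(void)
{
	char **p = environ;
	auxv_t *av;

	while (*p != NULL)			/* skip past the environment */
		p++;
	for (av = (auxv_t *)(p + 1); av->a_type != AT_NULL; av++) {
		if (av->a_type == AT_RANDOM) {
			/* 16 bytes of random data provided by exec */
			unsigned char *r = (unsigned char *)av->a_un.a_ptr;
			(void) printf("AT_RANDOM[0] = %02x\n", r[0]);
		}
	}
	return (0);
}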
@@ -1784,7 +1864,7 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
/*
* Fill in the aux vector now that we know the user stack addresses
* for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and
- * AT_SUN_EMULATOR strings.
+ * AT_SUN_EMULATOR strings, as well as the AT_RANDOM array.
*/
if (auxvpp != NULL && *auxvpp != NULL) {
if (args->to_model == DATAMODEL_NATIVE) {
@@ -1797,6 +1877,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
if (args->emulator != NULL)
ADDAUX(*a,
AT_SUN_EMULATOR, (long)&ustrp[*--offp])
+ ADDAUX(*a, AT_RANDOM, (long)&ustrp[*--offp])
+ if (args->brand_nroot != NULL) {
+ ADDAUX(*a,
+ AT_SUN_BRAND_NROOT, (long)&ustrp[*--offp])
+ }
} else {
auxv32_t **a = (auxv32_t **)auxvpp;
ADDAUX(*a,
@@ -1809,6 +1894,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
if (args->emulator != NULL)
ADDAUX(*a, AT_SUN_EMULATOR,
(int)(uintptr_t)&ustrp[*--offp])
+ ADDAUX(*a, AT_RANDOM, (int)(uintptr_t)&ustrp[*--offp])
+ if (args->brand_nroot != NULL) {
+ ADDAUX(*a, AT_SUN_BRAND_NROOT,
+ (int)(uintptr_t)&ustrp[*--offp])
+ }
}
}
@@ -1935,6 +2025,9 @@ exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
usrstack = (char *)USRSTACK32;
}
+ if (args->maxstack != 0 && (uintptr_t)usrstack > args->maxstack)
+ usrstack = (char *)args->maxstack;
+
ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0);
#if defined(__sparc)
diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c
index 1b9359da47..3edddcf61f 100644
--- a/usr/src/uts/common/os/exit.c
+++ b/usr/src/uts/common/os/exit.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -230,7 +230,7 @@ restart_init(int what, int why)
siginfofree(lwp->lwp_curinfo);
lwp->lwp_curinfo = NULL;
}
- lwp_ctmpl_clear(lwp);
+ lwp_ctmpl_clear(lwp, B_FALSE);
/*
* Reset both the process root directory and the current working
@@ -366,19 +366,6 @@ proc_exit(int why, int what)
}
mutex_exit(&p->p_lock);
- DTRACE_PROC(lwp__exit);
- DTRACE_PROC1(exit, int, why);
-
- /*
- * Will perform any brand specific proc exit processing, since this
- * is always the last lwp, will also perform lwp_exit and free brand
- * data
- */
- if (PROC_IS_BRANDED(p)) {
- lwp_detach_brand_hdlrs(lwp);
- brand_clearbrand(p, B_FALSE);
- }
-
/*
* Don't let init exit unless zone_start_init() failed its exec, or
* we are shutting down the zone or the machine.
@@ -390,12 +377,35 @@ proc_exit(int why, int what)
if (z->zone_boot_err == 0 &&
zone_status_get(z) < ZONE_IS_SHUTTING_DOWN &&
zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN) {
- if (z->zone_restart_init == B_TRUE) {
- if (restart_init(what, why) == 0)
- return (0);
+
+ /*
+ * If the init process should be restarted, the
+ * "zone_restart_init" member will be set. Some init
+ * programs in branded zones do not tolerate a restart
+ * in the traditional manner; setting the
+ * "zone_reboot_on_init_exit" member will cause the
+ * entire zone to be rebooted instead. If neither of
+ * these flags is set the zone will shut down.
+ */
+ if (z->zone_reboot_on_init_exit == B_TRUE &&
+ z->zone_restart_init == B_TRUE) {
+ /*
+ * Trigger a zone reboot and continue
+ * with exit processing.
+ */
+ z->zone_init_status = wstat(why, what);
+ (void) zone_kadmin(A_REBOOT, 0, NULL,
+ zone_kcred());
+
} else {
+ if (z->zone_restart_init == B_TRUE) {
+ if (restart_init(what, why) == 0)
+ return (0);
+ }
+
+ z->zone_init_status = wstat(why, what);
(void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL,
- CRED());
+ zone_kcred());
}
}
@@ -407,6 +417,32 @@ proc_exit(int why, int what)
z->zone_proc_initpid = -1;
}
+ /*
+ * Delay firing probes (and performing brand cleanup) until after the
+ * zone_proc_initpid check. Cases which result in zone shutdown or
+ * restart via zone_kadmin eventually result in a call back to
+ * proc_exit.
+ */
+ DTRACE_PROC(lwp__exit);
+ DTRACE_PROC1(exit, int, why);
+
+ /*
+ * Will perform any brand specific proc exit processing. Since this
+ * is always the last lwp, will also perform lwp exit/free and proc
+ * exit. Brand data will be freed when the process is reaped.
+ */
+ if (PROC_IS_BRANDED(p)) {
+ BROP(p)->b_lwpexit(lwp);
+ BROP(p)->b_proc_exit(p);
+ /*
+ * To ensure that b_proc_exit has access to brand-specific data
+ * contained by the one remaining lwp, call the freelwp hook as
+ * the last part of this clean-up process.
+ */
+ BROP(p)->b_freelwp(lwp);
+ lwp_detach_brand_hdlrs(lwp);
+ }
+
lwp_pcb_exit();
/*
@@ -565,7 +601,7 @@ proc_exit(int why, int what)
semexit(p);
rv = wstat(why, what);
- acct(rv & 0xff);
+ acct(rv);
exacct_commit_proc(p, rv);
/*
@@ -658,10 +694,22 @@ proc_exit(int why, int what)
if ((q = p->p_child) != NULL && p != proc_init) {
struct proc *np;
struct proc *initp = proc_init;
+ pid_t zone_initpid = 1;
+ struct proc *zoneinitp = NULL;
boolean_t setzonetop = B_FALSE;
- if (!INGLOBALZONE(curproc))
- setzonetop = B_TRUE;
+ if (!INGLOBALZONE(curproc)) {
+ zone_initpid = curproc->p_zone->zone_proc_initpid;
+
+ ASSERT(MUTEX_HELD(&pidlock));
+ zoneinitp = prfind(zone_initpid);
+ if (zoneinitp != NULL) {
+ initp = zoneinitp;
+ } else {
+ zone_initpid = 1;
+ setzonetop = B_TRUE;
+ }
+ }
pgdetach(p);
@@ -673,7 +721,8 @@ proc_exit(int why, int what)
*/
delete_ns(q->p_parent, q);
- q->p_ppid = 1;
+ q->p_ppid = zone_initpid;
+
q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID);
if (setzonetop) {
mutex_enter(&q->p_lock);
@@ -847,8 +896,50 @@ proc_exit(int why, int what)
mutex_exit(&p->p_lock);
if (!evaporate) {
- p->p_pidflag &= ~CLDPEND;
- sigcld(p, sqp);
+ /*
+ * The brand specific code only happens when the brand has a
+ * function to call in place of sigcld and the parent of the
+ * exiting process is not the global zone init. If the parent
+ * is the global zone init, then the process was reparented,
+ * and we don't want brand code delivering possibly strange
+ * signals to init. Also, init is not branded, so any brand
+ * specific exit data will not be picked up by init anyway.
+ */
+ if (PROC_IS_BRANDED(p) &&
+ BROP(p)->b_exit_with_sig != NULL &&
+ p->p_ppid != 1) {
+ /*
+ * The code for _fini that could unload the brand_t
+ * blocks until the count of zones using the module
+ * reaches zero. Zones decrement the refcount on their
+ * brands only after all user tasks in that zone have
+ * exited and been waited on. The decrement on the
+ * brand's refcount happens in zone_destroy(). That
+ * depends on zone_shutdown() having been completed.
+ * zone_shutdown() includes a call to zone_empty(),
+ * where the zone waits for itself to reach the state
+ * ZONE_IS_EMPTY. This state is only set in either
+ * zone_shutdown(), when there are no user processes as
+ * the zone enters this function, or in
+ * zone_task_rele(). zone_task_rele() is called from
+ * code triggered by waiting on processes, not by the
+ * processes exiting through proc_exit(). This means
+ * all the branded processes that could exist for a
+ * specific brand_t must exit and get reaped before the
+ * refcount on the brand_t can reach 0. _fini will
+ * never unload the corresponding brand module before
+ * proc_exit finishes execution for all processes
+ * branded with a particular brand_t, which makes the
+ * operation below safe to do. Brands that wish to use
+ * this mechanism must wait in _fini as described
+ * above.
+ */
+ BROP(p)->b_exit_with_sig(p, sqp);
+ } else {
+ p->p_pidflag &= ~CLDPEND;
+ sigcld(p, sqp);
+ }
+
} else {
/*
* Do what sigcld() would do if the disposition
@@ -927,10 +1018,9 @@ winfo(proc_t *pp, k_siginfo_t *ip, int waitflag)
int
waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
{
- int found;
proc_t *cp, *pp;
- int proc_gone;
int waitflag = !(options & WNOWAIT);
+ boolean_t have_brand_helper = B_FALSE;
/*
* Obsolete flag, defined here only for binary compatibility
@@ -958,7 +1048,8 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
pp = ttoproc(curthread);
/*
- * lock parent mutex so that sibling chain can be searched.
+ * Anytime you are looking for a process, you take pidlock to prevent
+ * things from changing as you look.
*/
mutex_enter(&pidlock);
@@ -978,10 +1069,37 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
return (ECHILD);
}
- while (pp->p_child != NULL) {
+ if (PROC_IS_BRANDED(pp) && BROP(pp)->b_waitid_helper != NULL) {
+ have_brand_helper = B_TRUE;
+ }
+
+ while (pp->p_child != NULL || have_brand_helper) {
+ boolean_t brand_wants_wait = B_FALSE;
+ int proc_gone = 0;
+ int found = 0;
- proc_gone = 0;
+ /*
+ * Give the brand a chance to return synthetic results from
+ * this waitid() call before we do the real thing.
+ */
+ if (have_brand_helper) {
+ int ret;
+ if (BROP(pp)->b_waitid_helper(idtype, id, ip, options,
+ &brand_wants_wait, &ret) == 0) {
+ mutex_exit(&pidlock);
+ return (ret);
+ }
+
+ if (pp->p_child == NULL) {
+ goto no_real_children;
+ }
+ }
+
+ /*
+ * Look for interesting children in the newstate list.
+ */
+ VERIFY(pp->p_child != NULL);
for (cp = pp->p_child_ns; cp != NULL; cp = cp->p_sibling_ns) {
if (idtype != P_PID && (cp->p_pidflag & CLDWAITPID))
continue;
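A hedged sketch of what a brand's waitid helper might look like, inferred only from the call site above; the name mybrand_waitid_helper is hypothetical, and the only semantics taken from the patch are that a return value of 0 means a synthetic result has been produced.

/*
 * Hypothetical brand hook. Returning 0 indicates the brand produced a
 * result (stored in *ip and *ret) and waitid() should return immediately;
 * any other return value falls through to the normal child search.
 * *brand_wants_wait tells the caller whether the brand still expects it
 * to block even if no eligible children are found.
 */
/*ARGSUSED*/
static int
mybrand_waitid_helper(idtype_t idtype, id_t id, k_siginfo_t *ip, int options,
    boolean_t *brand_wants_wait, int *ret)
{
	*brand_wants_wait = B_FALSE;
	return (-1);		/* nothing synthetic to report */
}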
@@ -989,6 +1107,11 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
continue;
if (idtype == P_PGID && id != cp->p_pgrp)
continue;
+ if (PROC_IS_BRANDED(pp)) {
+ if (BROP(pp)->b_wait_filter != NULL &&
+ BROP(pp)->b_wait_filter(pp, cp) == B_FALSE)
+ continue;
+ }
switch (cp->p_wcode) {
@@ -1033,12 +1156,16 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
* Wow! None of the threads on the p_sibling_ns list were
* interesting threads. Check all the kids!
*/
- found = 0;
for (cp = pp->p_child; cp != NULL; cp = cp->p_sibling) {
if (idtype == P_PID && id != cp->p_pid)
continue;
if (idtype == P_PGID && id != cp->p_pgrp)
continue;
+ if (PROC_IS_BRANDED(pp)) {
+ if (BROP(pp)->b_wait_filter != NULL &&
+ BROP(pp)->b_wait_filter(pp, cp) == B_FALSE)
+ continue;
+ }
switch (cp->p_wcode) {
case CLD_TRAPPED:
@@ -1107,11 +1234,12 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
break;
}
+no_real_children:
/*
* If we found no interesting processes at all,
* break out and return ECHILD.
*/
- if (found + proc_gone == 0)
+ if (!brand_wants_wait && (found + proc_gone == 0))
break;
if (options & WNOHANG) {
@@ -1130,7 +1258,7 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
* change state while we wait, we don't wait at all.
* Get out with ECHILD according to SVID.
*/
- if (found == proc_gone)
+ if (!brand_wants_wait && (found == proc_gone))
break;
if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) {
@@ -1226,6 +1354,12 @@ freeproc(proc_t *p)
p->p_killsqp = NULL;
}
+ /* Clear any remaining brand data */
+ if (PROC_IS_BRANDED(p)) {
+ brand_clearbrand(p, B_FALSE);
+ }
+
+
prfree(p); /* inform /proc */
/*
diff --git a/usr/src/uts/common/os/fio.c b/usr/src/uts/common/os/fio.c
index 76eddd4e50..41e7e63d2b 100644
--- a/usr/src/uts/common/os/fio.c
+++ b/usr/src/uts/common/os/fio.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015, Joyent Inc.
+ * Copyright 2017, Joyent Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -386,6 +386,7 @@ flist_grow(int maxfd)
dst->uf_flag = src->uf_flag;
dst->uf_busy = src->uf_busy;
dst->uf_portfd = src->uf_portfd;
+ dst->uf_gen = src->uf_gen;
}
/*
@@ -487,7 +488,7 @@ free_afd(afd_t *afd) /* called below and from thread_free() */
afd->a_fd[i] = -1;
}
-static void
+void
set_active_fd(int fd)
{
afd_t *afd = &curthread->t_activefd;
@@ -575,13 +576,12 @@ is_active_fd(kthread_t *t, int fd)
}
/*
- * Convert a user supplied file descriptor into a pointer to a file
- * structure. Only task is to check range of the descriptor (soft
- * resource limit was enforced at open time and shouldn't be checked
- * here).
+ * Convert a user supplied file descriptor into a pointer to a file structure.
+ * Only task is to check range of the descriptor (soft resource limit was
+ * enforced at open time and shouldn't be checked here).
*/
file_t *
-getf(int fd)
+getf_gen(int fd, uf_entry_gen_t *genp)
{
uf_info_t *fip = P_FINFO(curproc);
uf_entry_t *ufp;
@@ -607,6 +607,9 @@ getf(int fd)
return (NULL);
}
ufp->uf_refcnt++;
+ if (genp != NULL) {
+ *genp = ufp->uf_gen;
+ }
set_active_fd(fd); /* record the active file descriptor */
@@ -615,6 +618,12 @@ getf(int fd)
return (fp);
}
+file_t *
+getf(int fd)
+{
+ return (getf_gen(fd, NULL));
+}
+
/*
* Close whatever file currently occupies the file descriptor slot
* and install the new file, usually NULL, in the file descriptor slot.
@@ -667,6 +676,7 @@ closeandsetf(int fd, file_t *newfp)
ASSERT(ufp->uf_flag == 0);
fd_reserve(fip, fd, 1);
ufp->uf_file = newfp;
+ ufp->uf_gen++;
UF_EXIT(ufp);
mutex_exit(&fip->fi_lock);
return (0);
@@ -852,7 +862,8 @@ flist_fork(uf_info_t *pfip, uf_info_t *cfip)
*/
cfip->fi_nfiles = nfiles = flist_minsize(pfip);
- cfip->fi_list = kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP);
+ cfip->fi_list = nfiles == 0 ? NULL :
+ kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP);
for (fd = 0, pufp = pfip->fi_list, cufp = cfip->fi_list; fd < nfiles;
fd++, pufp++, cufp++) {
@@ -860,6 +871,7 @@ flist_fork(uf_info_t *pfip, uf_info_t *cfip)
cufp->uf_alloc = pufp->uf_alloc;
cufp->uf_flag = pufp->uf_flag;
cufp->uf_busy = pufp->uf_busy;
+ cufp->uf_gen = pufp->uf_gen;
if (pufp->uf_file == NULL) {
ASSERT(pufp->uf_flag == 0);
if (pufp->uf_busy) {
@@ -1028,6 +1040,9 @@ ufalloc_file(int start, file_t *fp)
fd_reserve(fip, fd, 1);
ASSERT(ufp->uf_file == NULL);
ufp->uf_file = fp;
+ if (fp != NULL) {
+ ufp->uf_gen++;
+ }
UF_EXIT(ufp);
mutex_exit(&fip->fi_lock);
return (fd);
@@ -1183,6 +1198,7 @@ setf(int fd, file_t *fp)
} else {
UF_ENTER(ufp, fip, fd);
ASSERT(ufp->uf_busy);
+ ufp->uf_gen++;
}
ASSERT(ufp->uf_fpollinfo == NULL);
ASSERT(ufp->uf_flag == 0);
@@ -1212,8 +1228,7 @@ f_getfl(int fd, int *flagp)
error = EBADF;
else {
vnode_t *vp = fp->f_vnode;
- int flag = fp->f_flag |
- ((fp->f_flag2 & ~FEPOLLED) << 16);
+ int flag = fp->f_flag | (fp->f_flag2 << 16);
/*
* BSD fcntl() FASYNC compatibility.
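The new uf_gen field acts as a per-slot generation counter, bumped whenever a file is installed in a descriptor slot. A minimal sketch (not from this patch, with fd an already-known descriptor) of how a caller might use getf_gen() to detect that a descriptor was closed and reused between two lookups:

	uf_entry_gen_t gen1, gen2;
	file_t *fp;

	if ((fp = getf_gen(fd, &gen1)) == NULL)
		return (EBADF);
	/* ... use fp, then drop the hold ... */
	releasef(fd);

	/* Later: re-lookup and verify the slot was not recycled. */
	if ((fp = getf_gen(fd, &gen2)) == NULL || gen2 != gen1) {
		/* fd was closed (and possibly reused) in the interim */
	}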
diff --git a/usr/src/uts/common/os/fork.c b/usr/src/uts/common/os/fork.c
index a63931459f..7e198910b4 100644
--- a/usr/src/uts/common/os/fork.c
+++ b/usr/src/uts/common/os/fork.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2016, Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -84,6 +84,7 @@ static int64_t cfork(int, int, int);
static int getproc(proc_t **, pid_t, uint_t);
#define GETPROC_USER 0x0
#define GETPROC_KERNEL 0x1
+#define GETPROC_ZSCHED 0x2
static void fork_fail(proc_t *);
static void forklwp_fail(proc_t *);
@@ -705,7 +706,7 @@ fork_fail(proc_t *cp)
if (PTOU(curproc)->u_cwd)
refstr_rele(PTOU(curproc)->u_cwd);
if (PROC_IS_BRANDED(cp)) {
- brand_clearbrand(cp, B_TRUE);
+ brand_clearbrand(cp, B_FALSE);
}
}
@@ -754,7 +755,7 @@ forklwp_fail(proc_t *p)
kmem_free(t->t_door, sizeof (door_data_t));
t->t_door = NULL;
}
- lwp_ctmpl_clear(ttolwp(t));
+ lwp_ctmpl_clear(ttolwp(t), B_FALSE);
/*
* Remove the thread from the all threads list.
@@ -791,6 +792,9 @@ extern struct as kas;
/*
* fork a kernel process.
+ *
+ * Passing a pid argument of -1 indicates that the new process should be
+ * launched as a child of 'zsched' within the zone.
*/
int
newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
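A hedged sketch of the new calling convention; the start function and scheduling class id are hypothetical, and only the -1 pid argument (requesting that the child be parented to the zone's zsched) is taken from the patch.

	int err;

	/*
	 * Launch a user-level helper process in the current (non-global)
	 * zone, parented to zsched rather than to the calling process.
	 */
	err = newproc(my_start_func, NULL, mycid, minclsyspri, NULL, -1);
	if (err != 0) {
		/* EAGAIN: the process could not be created */
	}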
@@ -809,6 +813,7 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
rctl_set_t *init_set;
ASSERT(pid != 1);
+ ASSERT(pid >= 0);
if (getproc(&p, pid, GETPROC_KERNEL) < 0)
return (EAGAIN);
@@ -852,8 +857,18 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
rctl_set_t *init_set;
task_t *tk, *tk_old;
klwp_t *lwp;
+ boolean_t pzsched = B_FALSE;
+ int flag = GETPROC_USER;
+
+ /* Handle a new user-level thread as child of zsched. */
+ if (pid < 0) {
+ VERIFY(curzone != global_zone);
+ flag = GETPROC_ZSCHED;
+ pzsched = B_TRUE;
+ pid = 0;
+ }
- if (getproc(&p, pid, GETPROC_USER) < 0)
+ if (getproc(&p, pid, flag) < 0)
return (EAGAIN);
/*
* init creates a new task, distinct from the task
@@ -914,7 +929,8 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
}
t = lwptot(lwp);
- ctp = contract_process_fork(sys_process_tmpl, p, curproc,
+ ctp = contract_process_fork(sys_process_tmpl, p,
+ (pzsched ? curproc->p_zone->zone_zsched : curproc),
B_FALSE);
ASSERT(ctp != NULL);
if (ct != NULL)
@@ -955,7 +971,11 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)
if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
return (-1); /* no point in starting new processes */
- pp = (flags & GETPROC_KERNEL) ? &p0 : curproc;
+ if (flags & GETPROC_ZSCHED) {
+ pp = curproc->p_zone->zone_zsched;
+ } else {
+ pp = (flags & GETPROC_KERNEL) ? &p0 : curproc;
+ }
task = pp->p_task;
proj = task->tk_proj;
zone = pp->p_zone;
@@ -1016,6 +1036,9 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)
cp->p_t1_lgrpid = LGRP_NONE;
cp->p_tr_lgrpid = LGRP_NONE;
+ /* Default to native brand initially */
+ cp->p_brand = &native_brand;
+
if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) {
if (nproc == v.v_proc) {
CPU_STATS_ADDQ(CPU, sys, procovf, 1);
@@ -1083,9 +1106,6 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)
cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD);
cp->p_sessp = pp->p_sessp;
sess_hold(pp);
- cp->p_brand = pp->p_brand;
- if (PROC_IS_BRANDED(pp))
- BROP(pp)->b_copy_procdata(cp, pp);
cp->p_bssbase = pp->p_bssbase;
cp->p_brkbase = pp->p_brkbase;
cp->p_brksize = pp->p_brksize;
@@ -1170,6 +1190,18 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)
mutex_exit(&cp->p_lock);
mutex_exit(&pidlock);
+ if (PROC_IS_BRANDED(pp)) {
+ /*
+ * The only reason why process branding should fail is when
+ * the procedure is complicated by multiple LWPs on the scene.
+ * With an LWP count of 0, this newly allocated process has no
+ * reason to fail branding.
+ */
+ VERIFY0(brand_setbrand(cp, B_FALSE));
+
+ BROP(pp)->b_copy_procdata(cp, pp);
+ }
+
avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t),
offsetof(contract_t, ct_ctlist));
@@ -1187,6 +1219,7 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)
*/
fcnt_add(P_FINFO(pp), 1);
+ mutex_enter(&pp->p_lock);
if (PTOU(pp)->u_cdir) {
VN_HOLD(PTOU(pp)->u_cdir);
} else {
@@ -1200,6 +1233,7 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)
VN_HOLD(PTOU(pp)->u_rdir);
if (PTOU(pp)->u_cwd)
refstr_hold(PTOU(pp)->u_cwd);
+ mutex_exit(&pp->p_lock);
/*
* copy the parent's uarea.
diff --git a/usr/src/uts/common/os/grow.c b/usr/src/uts/common/os/grow.c
index 647bca2542..a3de80259f 100644
--- a/usr/src/uts/common/os/grow.c
+++ b/usr/src/uts/common/os/grow.c
@@ -19,7 +19,10 @@
* CDDL HEADER END
*/
-/* Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. */
+/*
+ * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
@@ -52,6 +55,7 @@
#include <sys/fcntl.h>
#include <sys/lwpchan_impl.h>
#include <sys/nbmlock.h>
+#include <sys/brand.h>
#include <vm/hat.h>
#include <vm/as.h>
@@ -540,6 +544,20 @@ choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,
return (0);
}
+caddr_t
+map_userlimit(proc_t *pp, struct as *as, int flags)
+{
+ if (flags & _MAP_LOW32) {
+ if (PROC_IS_BRANDED(pp) && BROP(pp)->b_map32limit != NULL) {
+ return ((caddr_t)(uintptr_t)BROP(pp)->b_map32limit(pp));
+ } else {
+ return ((caddr_t)_userlimit32);
+ }
+ }
+
+ return (as->a_userlimit);
+}
+
/*
* Used for MAP_ANON - fast way to get anonymous pages
@@ -555,8 +573,6 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
return (EACCES);
if ((flags & MAP_FIXED) != 0) {
- caddr_t userlimit;
-
/*
* Use the user address. First verify that
* the address to be used is page aligned.
@@ -565,9 +581,8 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
return (EINVAL);
- userlimit = flags & _MAP_LOW32 ?
- (caddr_t)USERLIMIT32 : as->a_userlimit;
- switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
+ switch (valid_usr_range(*addrp, len, uprot, as,
+ map_userlimit(as->a_proc, as, flags))) {
case RANGE_OKAY:
break;
case RANGE_BADPROT:
@@ -750,8 +765,6 @@ smmap_common(caddr_t *addrp, size_t len,
* If the user specified an address, do some simple checks here
*/
if ((flags & MAP_FIXED) != 0) {
- caddr_t userlimit;
-
/*
* Use the user address. First verify that
* the address to be used is page aligned.
@@ -759,10 +772,8 @@ smmap_common(caddr_t *addrp, size_t len,
*/
if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
return (EINVAL);
-
- userlimit = flags & _MAP_LOW32 ?
- (caddr_t)USERLIMIT32 : as->a_userlimit;
- switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
+ switch (valid_usr_range(*addrp, len, uprot, as,
+ map_userlimit(curproc, as, flags))) {
case RANGE_OKAY:
break;
case RANGE_BADPROT:
diff --git a/usr/src/uts/common/os/id_space.c b/usr/src/uts/common/os/id_space.c
deleted file mode 100644
index 2dad0cb940..0000000000
--- a/usr/src/uts/common/os/id_space.c
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-#include <sys/types.h>
-#include <sys/id_space.h>
-#include <sys/debug.h>
-
-/*
- * ID Spaces
- *
- * The id_space_t provides a simple implementation of a managed range of
- * integer identifiers using a vmem arena. An ID space guarantees that the
- * next identifer returned by an allocation is larger than the previous one,
- * unless there are no larger slots remaining in the range. In this case,
- * the ID space will return the first available slot in the lower part of the
- * range (viewing the previous identifier as a partitioning element). If no
- * slots are available, id_alloc()/id_allocff() will sleep until an
- * identifier becomes available. Accordingly, id_space allocations must be
- * initiated from contexts where sleeping is acceptable. id_alloc_nosleep()/
- * id_allocff_nosleep() will return -1 if no slots are available or if the
- * system is low on memory. If id_alloc_nosleep() fails, callers should
- * not try to extend the ID space. This is to avoid making a possible
- * low-memory situation worse.
- *
- * As an ID space is designed for representing a range of id_t's, there
- * is a preexisting maximal range: [0, MAXUID]. ID space requests outside
- * that range will fail on a DEBUG kernel. The id_allocff*() functions
- * return the first available id, and should be used when there is benefit
- * to having a compact allocated range.
- *
- * (Presently, the id_space_t abstraction supports only direct allocations; ID
- * reservation, in which an ID is allocated but placed in a internal
- * dictionary for later use, should be added when a consuming subsystem
- * arrives.)
- */
-
-#define ID_TO_ADDR(id) ((void *)(uintptr_t)(id + 1))
-#define ADDR_TO_ID(addr) ((id_t)((uintptr_t)addr - 1))
-
-/*
- * Create an arena to represent the range [low, high).
- * Caller must be in a context in which VM_SLEEP is legal.
- */
-id_space_t *
-id_space_create(const char *name, id_t low, id_t high)
-{
- ASSERT(low >= 0);
- ASSERT(low < high);
-
- return (vmem_create(name, ID_TO_ADDR(low), high - low, 1,
- NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER));
-}
-
-/*
- * Destroy a previously created ID space.
- * No restrictions on caller's context.
- */
-void
-id_space_destroy(id_space_t *isp)
-{
- vmem_destroy(isp);
-}
-
-void
-id_space_extend(id_space_t *isp, id_t low, id_t high)
-{
- (void) vmem_add(isp, ID_TO_ADDR(low), high - low, VM_SLEEP);
-}
-
-/*
- * Allocate an id_t from specified ID space.
- * Caller must be in a context in which VM_SLEEP is legal.
- */
-id_t
-id_alloc(id_space_t *isp)
-{
- return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_NEXTFIT)));
-}
-
-/*
- * Allocate an id_t from specified ID space.
- * Returns -1 on failure (see module block comments for more information on
- * failure modes).
- */
-id_t
-id_alloc_nosleep(id_space_t *isp)
-{
- return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_NEXTFIT)));
-}
-
-/*
- * Allocate an id_t from specified ID space using FIRSTFIT.
- * Caller must be in a context in which VM_SLEEP is legal.
- */
-id_t
-id_allocff(id_space_t *isp)
-{
- return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_FIRSTFIT)));
-}
-
-/*
- * Allocate an id_t from specified ID space using FIRSTFIT
- * Returns -1 on failure (see module block comments for more information on
- * failure modes).
- */
-id_t
-id_allocff_nosleep(id_space_t *isp)
-{
- return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_FIRSTFIT)));
-}
-
-/*
- * Allocate a specific identifier if possible, returning the id if
- * successful, or -1 on failure.
- */
-id_t
-id_alloc_specific_nosleep(id_space_t *isp, id_t id)
-{
- void *minaddr = ID_TO_ADDR(id);
- void *maxaddr = ID_TO_ADDR(id + 1);
-
- /*
- * Note that even though we're vmem_free()ing this later, it
- * should be OK, since there's no quantum cache.
- */
- return (ADDR_TO_ID(vmem_xalloc(isp, 1, 1, 0, 0,
- minaddr, maxaddr, VM_NOSLEEP)));
-}
-
-/*
- * Free a previously allocated ID.
- * No restrictions on caller's context.
- */
-void
-id_free(id_space_t *isp, id_t id)
-{
- vmem_free(isp, ID_TO_ADDR(id), 1);
-}
diff --git a/usr/src/uts/common/os/ipc.c b/usr/src/uts/common/os/ipc.c
index 9381019cd1..6a6f5d84ef 100644
--- a/usr/src/uts/common/os/ipc.c
+++ b/usr/src/uts/common/os/ipc.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -1217,6 +1218,23 @@ ipc_remove(ipc_service_t *service, kipc_perm_t *perm)
(IPC_ZONE_USAGE(perm, service) == 0)));
}
+/*
+ * Perform actual IPC_RMID, either via ipc_rmid or due to a delayed *_RMID.
+ */
+void
+ipc_rmsvc(ipc_service_t *service, kipc_perm_t *perm)
+{
+ ASSERT(service->ipcs_count > 0);
+ ASSERT(MUTEX_HELD(&service->ipcs_lock));
+
+ ipc_remove(service, perm);
+ mutex_exit(&service->ipcs_lock);
+
+ /* perform any per-service removal actions */
+ service->ipcs_rmid(perm);
+
+ ipc_rele(service, perm);
+}
/*
* Common code to perform an IPC_RMID. Returns an errno value on
@@ -1247,13 +1265,7 @@ ipc_rmid(ipc_service_t *service, int id, cred_t *cr)
/*
* Nothing can fail from this point on.
*/
- ipc_remove(service, perm);
- mutex_exit(&service->ipcs_lock);
-
- /* perform any per-service removal actions */
- service->ipcs_rmid(perm);
-
- ipc_rele(service, perm);
+ ipc_rmsvc(service, perm);
return (0);
}
diff --git a/usr/src/uts/common/os/kmem.c b/usr/src/uts/common/os/kmem.c
index 9187b4bdeb..1243d0fbbf 100644
--- a/usr/src/uts/common/os/kmem.c
+++ b/usr/src/uts/common/os/kmem.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015 Joyent, Inc. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
@@ -159,10 +160,22 @@
* find known objects and is about to free it, or
* c) the client has freed the object.
* In all these cases (a, b, and c) kmem frees the new object (the
- * unused copy destination) and searches for the old object in the
- * magazine layer. If found, the object is removed from the magazine
- * layer and freed to the slab layer so it will no longer hold the
- * slab hostage.
+ * unused copy destination). In the first case, the object is in
+ * use and the correct action is that for LATER; in the latter two
+ * cases, we know that the object is either freed or about to be
+ * freed, in which case it is either already in a magazine or about
+ * to be in one. In these cases, we know that the object will either
+ * be reallocated and reused, or it will end up in a full magazine
+ * that will be reaped (thereby liberating the slab). Because it
+ * is prohibitively expensive to differentiate these cases, and
+ * because the defrag code is executed when we're low on memory
+ * (thereby biasing the system to reclaim full magazines) we treat
+ * all DONT_KNOW cases as LATER and rely on cache reaping to
+ * generally clean up full magazines. While we take the same action
+ * for these cases, we maintain their semantic distinction: if
+ * defragmentation is not occurring, it is useful to know if this
+ * is due to objects in use (LATER) or objects in an unknown state
+ * of transition (DONT_KNOW).
*
* 2.3 Object States
*
@@ -285,10 +298,10 @@
* view of the slab layer, making it a candidate for the move callback. Most
* objects unrecognized by the client in the move callback fall into this
* category and are cheaply distinguished from known objects by the test
- * described earlier. Since recognition is cheap for the client, and searching
- * magazines is expensive for kmem, kmem defers searching until the client first
- * returns KMEM_CBRC_DONT_KNOW. As long as the needed effort is reasonable, kmem
- * elsewhere does what it can to avoid bothering the client unnecessarily.
+ * described earlier. Because searching magazines is prohibitively expensive
+ * for kmem, clients that do not mark freed objects (and therefore return
+ * KMEM_CBRC_DONT_KNOW for large numbers of objects) may find defragmentation
+ * efficacy reduced.
*
* Invalidating the designated pointer member before freeing the object marks
* the object to be avoided in the callback, and conversely, assigning a valid
@@ -998,6 +1011,7 @@ size_t kmem_transaction_log_size; /* transaction log size [2% of memory] */
size_t kmem_content_log_size; /* content log size [2% of memory] */
size_t kmem_failure_log_size; /* failure log [4 pages per CPU] */
size_t kmem_slab_log_size; /* slab create log [4 pages per CPU] */
+size_t kmem_zerosized_log_size; /* zero-sized log [4 pages per CPU] */
size_t kmem_content_maxsave = 256; /* KMF_CONTENTS max bytes to log */
size_t kmem_lite_minsize = 0; /* minimum buffer size for KMF_LITE */
size_t kmem_lite_maxalign = 1024; /* maximum buffer alignment for KMF_LITE */
@@ -1005,6 +1019,14 @@ int kmem_lite_pcs = 4; /* number of PCs to store in KMF_LITE mode */
size_t kmem_maxverify; /* maximum bytes to inspect in debug routines */
size_t kmem_minfirewall; /* hardware-enforced redzone threshold */
+#ifdef DEBUG
+int kmem_warn_zerosized = 1; /* whether to warn on zero-sized KM_SLEEP */
+#else
+int kmem_warn_zerosized = 0; /* whether to warn on zero-sized KM_SLEEP */
+#endif
+
+int kmem_panic_zerosized = 0; /* whether to panic on zero-sized KM_SLEEP */
+
#ifdef _LP64
size_t kmem_max_cached = KMEM_BIG_MAXBUF; /* maximum kmem_alloc cache */
#else
@@ -1038,21 +1060,7 @@ static vmem_t *kmem_default_arena;
static vmem_t *kmem_firewall_va_arena;
static vmem_t *kmem_firewall_arena;
-/*
- * Define KMEM_STATS to turn on statistic gathering. By default, it is only
- * turned on when DEBUG is also defined.
- */
-#ifdef DEBUG
-#define KMEM_STATS
-#endif /* DEBUG */
-
-#ifdef KMEM_STATS
-#define KMEM_STAT_ADD(stat) ((stat)++)
-#define KMEM_STAT_COND_ADD(cond, stat) ((void) (!(cond) || (stat)++))
-#else
-#define KMEM_STAT_ADD(stat) /* nothing */
-#define KMEM_STAT_COND_ADD(cond, stat) /* nothing */
-#endif /* KMEM_STATS */
+static int kmem_zerosized; /* # of zero-sized allocs */
/*
* kmem slab consolidator thresholds (tunables)
@@ -1071,47 +1079,6 @@ size_t kmem_reclaim_max_slabs = 1;
*/
size_t kmem_reclaim_scan_range = 12;
-#ifdef KMEM_STATS
-static struct {
- uint64_t kms_callbacks;
- uint64_t kms_yes;
- uint64_t kms_no;
- uint64_t kms_later;
- uint64_t kms_dont_need;
- uint64_t kms_dont_know;
- uint64_t kms_hunt_found_mag;
- uint64_t kms_hunt_found_slab;
- uint64_t kms_hunt_alloc_fail;
- uint64_t kms_hunt_lucky;
- uint64_t kms_notify;
- uint64_t kms_notify_callbacks;
- uint64_t kms_disbelief;
- uint64_t kms_already_pending;
- uint64_t kms_callback_alloc_fail;
- uint64_t kms_callback_taskq_fail;
- uint64_t kms_endscan_slab_dead;
- uint64_t kms_endscan_slab_destroyed;
- uint64_t kms_endscan_nomem;
- uint64_t kms_endscan_refcnt_changed;
- uint64_t kms_endscan_nomove_changed;
- uint64_t kms_endscan_freelist;
- uint64_t kms_avl_update;
- uint64_t kms_avl_noupdate;
- uint64_t kms_no_longer_reclaimable;
- uint64_t kms_notify_no_longer_reclaimable;
- uint64_t kms_notify_slab_dead;
- uint64_t kms_notify_slab_destroyed;
- uint64_t kms_alloc_fail;
- uint64_t kms_constructor_fail;
- uint64_t kms_dead_slabs_freed;
- uint64_t kms_defrags;
- uint64_t kms_scans;
- uint64_t kms_scan_depot_ws_reaps;
- uint64_t kms_debug_reaps;
- uint64_t kms_debug_scans;
-} kmem_move_stats;
-#endif /* KMEM_STATS */
-
/* consolidator knobs */
boolean_t kmem_move_noreap;
boolean_t kmem_move_blocked;
@@ -1142,6 +1109,7 @@ kmem_log_header_t *kmem_transaction_log;
kmem_log_header_t *kmem_content_log;
kmem_log_header_t *kmem_failure_log;
kmem_log_header_t *kmem_slab_log;
+kmem_log_header_t *kmem_zerosized_log;
static int kmem_lite_count; /* # of PCs in kmem_buftag_lite_t */
@@ -1922,15 +1890,7 @@ kmem_slab_free(kmem_cache_t *cp, void *buf)
cp->cache_complete_slab_count--;
avl_add(&cp->cache_partial_slabs, sp);
} else {
-#ifdef DEBUG
- if (avl_update_gt(&cp->cache_partial_slabs, sp)) {
- KMEM_STAT_ADD(kmem_move_stats.kms_avl_update);
- } else {
- KMEM_STAT_ADD(kmem_move_stats.kms_avl_noupdate);
- }
-#else
(void) avl_update_gt(&cp->cache_partial_slabs, sp);
-#endif
}
ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) ==
@@ -2964,8 +2924,33 @@ kmem_alloc(size_t size, int kmflag)
/* fall through to kmem_cache_alloc() */
} else {
- if (size == 0)
+ if (size == 0) {
+ if (kmflag != KM_SLEEP && !(kmflag & KM_PANIC))
+ return (NULL);
+
+ /*
+ * If this is a sleeping allocation or one that has
+ * been specified to panic on allocation failure, we
+ * consider it to be deprecated behavior to allocate
+ * 0 bytes. If we have been configured to panic under
+ * this condition, we panic; if to warn, we warn -- and
+ * regardless, we log to the kmem_zerosized_log that
+		 * this condition has occurred (which gives us
+ * enough information to be able to debug it).
+ */
+ if (kmem_panic && kmem_panic_zerosized)
+ panic("attempted to kmem_alloc() size of 0");
+
+ if (kmem_warn_zerosized) {
+ cmn_err(CE_WARN, "kmem_alloc(): sleeping "
+ "allocation with size of 0; "
+ "see kmem_zerosized_log for details");
+ }
+
+ kmem_log_event(kmem_zerosized_log, NULL, NULL, NULL);
+
return (NULL);
+ }
buf = vmem_alloc(kmem_oversize_arena, size,
kmflag & KM_VMFLAGS);
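The caller-facing consequence of the hunk above: zero-sized KM_SLEEP (or KM_PANIC) allocations are now warned about and logged, so size computations should be guarded before calling kmem_alloc(). A minimal sketch with a hypothetical helper, not part of this change:

static void *
table_alloc(size_t nent, size_t entsz)
{
	size_t sz = nent * entsz;

	if (sz == 0)
		return (NULL);	/* avoid a deprecated zero-sized allocation */

	return (kmem_zalloc(sz, KM_SLEEP));
}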
@@ -3579,7 +3564,7 @@ kmem_cache_kstat_update(kstat_t *ksp, int rw)
kmcp->kmc_move_later.value.ui64 = kd->kmd_later;
kmcp->kmc_move_dont_need.value.ui64 = kd->kmd_dont_need;
kmcp->kmc_move_dont_know.value.ui64 = kd->kmd_dont_know;
- kmcp->kmc_move_hunt_found.value.ui64 = kd->kmd_hunt_found;
+ kmcp->kmc_move_hunt_found.value.ui64 = 0;
kmcp->kmc_move_slabs_freed.value.ui64 = kd->kmd_slabs_freed;
kmcp->kmc_defrag.value.ui64 = kd->kmd_defrags;
kmcp->kmc_scan.value.ui64 = kd->kmd_scans;
@@ -4150,7 +4135,8 @@ kmem_cache_destroy(kmem_cache_t *cp)
if (kmem_taskq != NULL)
taskq_wait(kmem_taskq);
- if (kmem_move_taskq != NULL)
+
+ if (kmem_move_taskq != NULL && cp->cache_defrag != NULL)
taskq_wait(kmem_move_taskq);
kmem_cache_magazine_purge(cp);
@@ -4488,8 +4474,8 @@ kmem_init(void)
}
kmem_failure_log = kmem_log_init(kmem_failure_log_size);
-
kmem_slab_log = kmem_log_init(kmem_slab_log_size);
+ kmem_zerosized_log = kmem_log_init(kmem_zerosized_log_size);
/*
* Initialize STREAMS message caches so allocb() is available.
@@ -4677,94 +4663,6 @@ kmem_slab_is_reclaimable(kmem_cache_t *cp, kmem_slab_t *sp, int flags)
(sp->slab_chunks * cp->cache_defrag->kmd_reclaim_numer));
}
-static void *
-kmem_hunt_mag(kmem_cache_t *cp, kmem_magazine_t *m, int n, void *buf,
- void *tbuf)
-{
- int i; /* magazine round index */
-
- for (i = 0; i < n; i++) {
- if (buf == m->mag_round[i]) {
- if (cp->cache_flags & KMF_BUFTAG) {
- (void) kmem_cache_free_debug(cp, tbuf,
- caller());
- }
- m->mag_round[i] = tbuf;
- return (buf);
- }
- }
-
- return (NULL);
-}
-
-/*
- * Hunt the magazine layer for the given buffer. If found, the buffer is
- * removed from the magazine layer and returned, otherwise NULL is returned.
- * The state of the returned buffer is freed and constructed.
- */
-static void *
-kmem_hunt_mags(kmem_cache_t *cp, void *buf)
-{
- kmem_cpu_cache_t *ccp;
- kmem_magazine_t *m;
- int cpu_seqid;
- int n; /* magazine rounds */
- void *tbuf; /* temporary swap buffer */
-
- ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
-
- /*
- * Allocated a buffer to swap with the one we hope to pull out of a
- * magazine when found.
- */
- tbuf = kmem_cache_alloc(cp, KM_NOSLEEP);
- if (tbuf == NULL) {
- KMEM_STAT_ADD(kmem_move_stats.kms_hunt_alloc_fail);
- return (NULL);
- }
- if (tbuf == buf) {
- KMEM_STAT_ADD(kmem_move_stats.kms_hunt_lucky);
- if (cp->cache_flags & KMF_BUFTAG) {
- (void) kmem_cache_free_debug(cp, buf, caller());
- }
- return (buf);
- }
-
- /* Hunt the depot. */
- mutex_enter(&cp->cache_depot_lock);
- n = cp->cache_magtype->mt_magsize;
- for (m = cp->cache_full.ml_list; m != NULL; m = m->mag_next) {
- if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) {
- mutex_exit(&cp->cache_depot_lock);
- return (buf);
- }
- }
- mutex_exit(&cp->cache_depot_lock);
-
- /* Hunt the per-CPU magazines. */
- for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) {
- ccp = &cp->cache_cpu[cpu_seqid];
-
- mutex_enter(&ccp->cc_lock);
- m = ccp->cc_loaded;
- n = ccp->cc_rounds;
- if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) {
- mutex_exit(&ccp->cc_lock);
- return (buf);
- }
- m = ccp->cc_ploaded;
- n = ccp->cc_prounds;
- if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) {
- mutex_exit(&ccp->cc_lock);
- return (buf);
- }
- mutex_exit(&ccp->cc_lock);
- }
-
- kmem_cache_free(cp, tbuf);
- return (NULL);
-}
-
/*
* May be called from the kmem_move_taskq, from kmem_cache_move_notify_task(),
* or when the buffer is freed.
@@ -4828,7 +4726,7 @@ static void kmem_move_end(kmem_cache_t *, kmem_move_t *);
* NO kmem frees the new buffer, marks the slab of the old buffer
* non-reclaimable to avoid bothering the client again
* LATER kmem frees the new buffer, increments slab_later_count
- * DONT_KNOW kmem frees the new buffer, searches mags for the old buffer
+ * DONT_KNOW kmem frees the new buffer
* DONT_NEED kmem frees both the old buffer and the new buffer
*
* The pending callback argument now being processed contains both of the
@@ -4862,19 +4760,14 @@ kmem_move_buffer(kmem_move_t *callback)
* another buffer on the same slab.
*/
if (!kmem_slab_is_reclaimable(cp, sp, callback->kmm_flags)) {
- KMEM_STAT_ADD(kmem_move_stats.kms_no_longer_reclaimable);
- KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY),
- kmem_move_stats.kms_notify_no_longer_reclaimable);
kmem_slab_free(cp, callback->kmm_to_buf);
kmem_move_end(cp, callback);
return;
}
/*
- * Hunting magazines is expensive, so we'll wait to do that until the
- * client responds KMEM_CBRC_DONT_KNOW. However, checking the slab layer
- * is cheap, so we might as well do that here in case we can avoid
- * bothering the client.
+ * Checking the slab layer is easy, so we might as well do that here
+ * in case we can avoid bothering the client.
*/
mutex_enter(&cp->cache_lock);
free_on_slab = (kmem_slab_allocated(cp, sp,
@@ -4882,7 +4775,6 @@ kmem_move_buffer(kmem_move_t *callback)
mutex_exit(&cp->cache_lock);
if (free_on_slab) {
- KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_slab);
kmem_slab_free(cp, callback->kmm_to_buf);
kmem_move_end(cp, callback);
return;
@@ -4894,7 +4786,6 @@ kmem_move_buffer(kmem_move_t *callback)
*/
if (kmem_cache_alloc_debug(cp, callback->kmm_to_buf,
KM_NOSLEEP, 1, caller()) != 0) {
- KMEM_STAT_ADD(kmem_move_stats.kms_alloc_fail);
kmem_move_end(cp, callback);
return;
}
@@ -4902,15 +4793,11 @@ kmem_move_buffer(kmem_move_t *callback)
cp->cache_constructor(callback->kmm_to_buf, cp->cache_private,
KM_NOSLEEP) != 0) {
atomic_inc_64(&cp->cache_alloc_fail);
- KMEM_STAT_ADD(kmem_move_stats.kms_constructor_fail);
kmem_slab_free(cp, callback->kmm_to_buf);
kmem_move_end(cp, callback);
return;
}
- KMEM_STAT_ADD(kmem_move_stats.kms_callbacks);
- KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY),
- kmem_move_stats.kms_notify_callbacks);
cp->cache_defrag->kmd_callbacks++;
cp->cache_defrag->kmd_thread = curthread;
cp->cache_defrag->kmd_from_buf = callback->kmm_from_buf;
@@ -4928,7 +4815,6 @@ kmem_move_buffer(kmem_move_t *callback)
cp->cache_defrag->kmd_to_buf = NULL;
if (response == KMEM_CBRC_YES) {
- KMEM_STAT_ADD(kmem_move_stats.kms_yes);
cp->cache_defrag->kmd_yes++;
kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);
/* slab safe to access until kmem_move_end() */
@@ -4943,14 +4829,12 @@ kmem_move_buffer(kmem_move_t *callback)
switch (response) {
case KMEM_CBRC_NO:
- KMEM_STAT_ADD(kmem_move_stats.kms_no);
cp->cache_defrag->kmd_no++;
mutex_enter(&cp->cache_lock);
kmem_slab_move_no(cp, sp, callback->kmm_from_buf);
mutex_exit(&cp->cache_lock);
break;
case KMEM_CBRC_LATER:
- KMEM_STAT_ADD(kmem_move_stats.kms_later);
cp->cache_defrag->kmd_later++;
mutex_enter(&cp->cache_lock);
if (!KMEM_SLAB_IS_PARTIAL(sp)) {
@@ -4959,7 +4843,6 @@ kmem_move_buffer(kmem_move_t *callback)
}
if (++sp->slab_later_count >= KMEM_DISBELIEF) {
- KMEM_STAT_ADD(kmem_move_stats.kms_disbelief);
kmem_slab_move_no(cp, sp, callback->kmm_from_buf);
} else if (!(sp->slab_flags & KMEM_SLAB_NOMOVE)) {
sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp,
@@ -4968,7 +4851,6 @@ kmem_move_buffer(kmem_move_t *callback)
mutex_exit(&cp->cache_lock);
break;
case KMEM_CBRC_DONT_NEED:
- KMEM_STAT_ADD(kmem_move_stats.kms_dont_need);
cp->cache_defrag->kmd_dont_need++;
kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);
if (sp->slab_refcnt == 0)
@@ -4978,19 +4860,21 @@ kmem_move_buffer(kmem_move_t *callback)
mutex_exit(&cp->cache_lock);
break;
case KMEM_CBRC_DONT_KNOW:
- KMEM_STAT_ADD(kmem_move_stats.kms_dont_know);
+ /*
+ * If we don't know if we can move this buffer or not, we'll
+ * just assume that we can't: if the buffer is in fact free,
+ * then it is sitting in one of the per-CPU magazines or in
+ * a full magazine in the depot layer. Either way, because
+ * defrag is induced in the same logic that reaps a cache,
+ * it's likely that full magazines will be returned to the
+ * system soon (thereby accomplishing what we're trying to
+ * accomplish here: return those magazines to their slabs).
+ * Given this, any work that we might do now to locate a buffer
+ * in a magazine is wasted (and expensive!) work; we bump
+ * a counter in this case and otherwise assume that we can't
+ * move it.
+ */
cp->cache_defrag->kmd_dont_know++;
- if (kmem_hunt_mags(cp, callback->kmm_from_buf) != NULL) {
- KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_mag);
- cp->cache_defrag->kmd_hunt_found++;
- kmem_slab_free_constructed(cp, callback->kmm_from_buf,
- B_TRUE);
- if (sp->slab_refcnt == 0)
- cp->cache_defrag->kmd_slabs_freed++;
- mutex_enter(&cp->cache_lock);
- kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
- mutex_exit(&cp->cache_lock);
- }
break;
default:
panic("'%s' (%p) unexpected move callback response %d\n",
@@ -5015,10 +4899,9 @@ kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags)
ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);
callback = kmem_cache_alloc(kmem_move_cache, KM_NOSLEEP);
- if (callback == NULL) {
- KMEM_STAT_ADD(kmem_move_stats.kms_callback_alloc_fail);
+
+ if (callback == NULL)
return (B_FALSE);
- }
callback->kmm_from_slab = sp;
callback->kmm_from_buf = buf;
@@ -5043,7 +4926,6 @@ kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags)
pending->kmm_flags |= KMM_DESPERATE;
}
mutex_exit(&cp->cache_lock);
- KMEM_STAT_ADD(kmem_move_stats.kms_already_pending);
kmem_cache_free(kmem_move_cache, callback);
return (B_TRUE);
}
@@ -5057,7 +4939,6 @@ kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags)
if (!taskq_dispatch(kmem_move_taskq, (task_func_t *)kmem_move_buffer,
callback, TQ_NOSLEEP)) {
- KMEM_STAT_ADD(kmem_move_stats.kms_callback_taskq_fail);
mutex_enter(&cp->cache_lock);
avl_remove(&cp->cache_defrag->kmd_moves_pending, callback);
mutex_exit(&cp->cache_lock);
@@ -5103,7 +4984,6 @@ kmem_move_end(kmem_cache_t *cp, kmem_move_t *callback)
cp->cache_slab_destroy++;
mutex_exit(&cp->cache_lock);
kmem_slab_destroy(cp, sp);
- KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed);
mutex_enter(&cp->cache_lock);
}
}
@@ -5248,8 +5128,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs,
* pending move completes.
*/
list_insert_head(deadlist, sp);
- KMEM_STAT_ADD(kmem_move_stats.
- kms_endscan_slab_dead);
return (-1);
}
@@ -5264,10 +5142,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs,
cp->cache_slab_destroy++;
mutex_exit(&cp->cache_lock);
kmem_slab_destroy(cp, sp);
- KMEM_STAT_ADD(kmem_move_stats.
- kms_dead_slabs_freed);
- KMEM_STAT_ADD(kmem_move_stats.
- kms_endscan_slab_destroyed);
mutex_enter(&cp->cache_lock);
/*
* Since we can't pick up the scan where we left
@@ -5283,8 +5157,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs,
* for the request and say nothing about the
* number of reclaimable slabs.
*/
- KMEM_STAT_COND_ADD(s < max_slabs,
- kmem_move_stats.kms_endscan_nomem);
return (-1);
}
@@ -5300,16 +5172,10 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs,
* destination buffer on the same slab. In that
* case, we're not interested in counting it.
*/
- KMEM_STAT_COND_ADD(!(flags & KMM_DEBUG) &&
- (s < max_slabs),
- kmem_move_stats.kms_endscan_refcnt_changed);
return (-1);
}
- if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove) {
- KMEM_STAT_COND_ADD(s < max_slabs,
- kmem_move_stats.kms_endscan_nomove_changed);
+ if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove)
return (-1);
- }
/*
* Generating a move request allocates a destination
@@ -5336,11 +5202,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs,
}
end_scan:
- KMEM_STAT_COND_ADD(!(flags & KMM_DEBUG) &&
- (s < max_slabs) &&
- (sp == avl_first(&cp->cache_partial_slabs)),
- kmem_move_stats.kms_endscan_freelist);
-
return (s);
}
@@ -5400,8 +5261,6 @@ kmem_cache_move_notify_task(void *arg)
&cp->cache_defrag->kmd_moves_pending)) {
list_insert_head(deadlist, sp);
mutex_exit(&cp->cache_lock);
- KMEM_STAT_ADD(kmem_move_stats.
- kms_notify_slab_dead);
return;
}
@@ -5409,9 +5268,6 @@ kmem_cache_move_notify_task(void *arg)
cp->cache_slab_destroy++;
mutex_exit(&cp->cache_lock);
kmem_slab_destroy(cp, sp);
- KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed);
- KMEM_STAT_ADD(kmem_move_stats.
- kms_notify_slab_destroyed);
return;
}
} else {
@@ -5425,7 +5281,6 @@ kmem_cache_move_notify(kmem_cache_t *cp, void *buf)
{
kmem_move_notify_args_t *args;
- KMEM_STAT_ADD(kmem_move_stats.kms_notify);
args = kmem_alloc(sizeof (kmem_move_notify_args_t), KM_NOSLEEP);
if (args != NULL) {
args->kmna_cache = cp;
@@ -5448,7 +5303,6 @@ kmem_cache_defrag(kmem_cache_t *cp)
n = avl_numnodes(&cp->cache_partial_slabs);
if (n > 1) {
/* kmem_move_buffers() drops and reacquires cache_lock */
- KMEM_STAT_ADD(kmem_move_stats.kms_defrags);
cp->cache_defrag->kmd_defrags++;
(void) kmem_move_buffers(cp, n, 0, KMM_DESPERATE);
}
@@ -5547,7 +5401,6 @@ kmem_cache_scan(kmem_cache_t *cp)
*
* kmem_move_buffers() drops and reacquires cache_lock.
*/
- KMEM_STAT_ADD(kmem_move_stats.kms_scans);
kmd->kmd_scans++;
slabs_found = kmem_move_buffers(cp, kmem_reclaim_scan_range,
kmem_reclaim_max_slabs, 0);
@@ -5588,12 +5441,9 @@ kmem_cache_scan(kmem_cache_t *cp)
if (!kmem_move_noreap &&
((debug_rand % kmem_mtb_reap) == 0)) {
mutex_exit(&cp->cache_lock);
- KMEM_STAT_ADD(kmem_move_stats.kms_debug_reaps);
kmem_cache_reap(cp);
return;
} else if ((debug_rand % kmem_mtb_move) == 0) {
- KMEM_STAT_ADD(kmem_move_stats.kms_scans);
- KMEM_STAT_ADD(kmem_move_stats.kms_debug_scans);
kmd->kmd_scans++;
(void) kmem_move_buffers(cp,
kmem_reclaim_scan_range, 1, KMM_DEBUG);
@@ -5604,8 +5454,6 @@ kmem_cache_scan(kmem_cache_t *cp)
mutex_exit(&cp->cache_lock);
- if (reap) {
- KMEM_STAT_ADD(kmem_move_stats.kms_scan_depot_ws_reaps);
+ if (reap)
kmem_depot_ws_reap(cp);
- }
}
diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c
index 149f5f8a88..227efe9190 100644
--- a/usr/src/uts/common/os/logsubr.c
+++ b/usr/src/uts/common/os/logsubr.c
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2013 Gary Mills
* Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2017, Joyent, Inc.
*/
#include <sys/types.h>
@@ -249,8 +250,7 @@ log_init(void)
*/
printf("\rSunOS Release %s Version %s %u-bit\n",
utsname.release, utsname.version, NBBY * (uint_t)sizeof (void *));
- printf("Copyright (c) 1983, 2010, Oracle and/or its affiliates. "
- "All rights reserved.\n");
+ printf("Copyright (c) 2010-2017, Joyent Inc. All rights reserved.\n");
#ifdef DEBUG
printf("DEBUG enabled\n");
#endif
@@ -491,7 +491,7 @@ log_console(log_t *lp, log_ctl_t *lc)
mblk_t *
log_makemsg(int mid, int sid, int level, int sl, int pri, void *msg,
- size_t size, int on_intr)
+ size_t size, int on_intr)
{
mblk_t *mp = NULL;
mblk_t *mp2;
diff --git a/usr/src/uts/common/os/lwp.c b/usr/src/uts/common/os/lwp.c
index feb8e76c42..341e4ae356 100644
--- a/usr/src/uts/common/os/lwp.c
+++ b/usr/src/uts/common/os/lwp.c
@@ -25,7 +25,7 @@
*/
/*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2017, Joyent, Inc.
*/
#include <sys/param.h>
@@ -57,6 +57,8 @@
#include <sys/lgrp.h>
#include <sys/rctl.h>
#include <sys/contract_impl.h>
+#include <sys/contract/process.h>
+#include <sys/contract/process_impl.h>
#include <sys/cpc_impl.h>
#include <sys/sdt.h>
#include <sys/cmn_err.h>
@@ -115,7 +117,7 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p,
ret_tidhash_t *ret_tidhash = NULL;
int i;
int rctlfail = 0;
- boolean_t branded = 0;
+ void *brand_data = NULL;
struct ctxop *ctx = NULL;
ASSERT(cid != sysdccid); /* system threads must start in SYS */
@@ -283,6 +285,19 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p,
*/
lep = kmem_zalloc(sizeof (*lep), KM_SLEEP);
+ /*
+ * If necessary, speculatively allocate lwp brand data. This is done
+ * ahead of time so p_lock need not be dropped during lwp branding.
+ */
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_lwpdata_alloc != NULL) {
+ if ((brand_data = BROP(p)->b_lwpdata_alloc(p)) == NULL) {
+ mutex_enter(&p->p_lock);
+ err = 1;
+ atomic_inc_32(&p->p_zone->zone_ffmisc);
+ goto error;
+ }
+ }
+
mutex_enter(&p->p_lock);
grow:
/*
@@ -630,18 +645,6 @@ grow:
} while (lwp_hash_lookup(p, t->t_tid) != NULL);
}
- /*
- * If this is a branded process, let the brand do any necessary lwp
- * initialization.
- */
- if (PROC_IS_BRANDED(p)) {
- if (BROP(p)->b_initlwp(lwp)) {
- err = 1;
- atomic_inc_32(&p->p_zone->zone_ffmisc);
- goto error;
- }
- branded = 1;
- }
if (t->t_tid == 1) {
kpreempt_disable();
@@ -654,7 +657,6 @@ grow:
}
}
- p->p_lwpcnt++;
t->t_waitfor = -1;
/*
@@ -696,8 +698,27 @@ grow:
t->t_post_sys = 1;
/*
+ * Perform lwp branding
+ *
+ * The b_initlwp hook is _not_ allowed to drop p->p_lock as it must be
+ * continuously held between when the tidhash is sized and when the lwp
+ * is inserted into it. Operations requiring p->p_lock to be
+ * temporarily dropped can be performed in b_initlwp_post.
+ */
+ if (PROC_IS_BRANDED(p)) {
+ BROP(p)->b_initlwp(lwp, brand_data);
+ /*
+ * The b_initlwp hook is expected to consume any preallocated
+ * brand_data in a way that prepares it for deallocation by the
+ * b_freelwp hook.
+ */
+ brand_data = NULL;
+ }
+
+ /*
* Insert the new thread into the list of all threads.
*/
+ p->p_lwpcnt++;
if ((tx = p->p_tlist) == NULL) {
t->t_back = t;
t->t_forw = t;
@@ -718,6 +739,15 @@ grow:
lep->le_start = t->t_start;
lwp_hash_in(p, lep, p->p_tidhash, p->p_tidhash_sz, 1);
+ /*
+ * Complete lwp branding
+ */
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_initlwp_post != NULL) {
+ BROP(p)->b_initlwp_post(lwp);
+ }
+
+ lwp_fp_init(lwp);
+
if (state == TS_RUN) {
/*
* We set the new lwp running immediately.
@@ -753,8 +783,9 @@ error:
if (cid != NOCLASS && bufp != NULL)
CL_FREE(cid, bufp);
- if (branded)
- BROP(p)->b_freelwp(lwp);
+ if (brand_data != NULL) {
+ BROP(p)->b_lwpdata_free(brand_data);
+ }
mutex_exit(&p->p_lock);
t->t_state = TS_FREE;
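Taken together, the lwp_create() changes above split lwp branding into a sleeping allocation phase (b_lwpdata_alloc, before p_lock is taken) and an initialization phase (b_initlwp, with p_lock held and never dropped), with b_lwpdata_free covering the error path. A hedged sketch of how a brand might implement these hooks; the mybrand_* names and per-lwp structure are hypothetical.

static void *
mybrand_lwpdata_alloc(proc_t *p)
{
	/* called without p->p_lock held, so sleeping is acceptable */
	return (kmem_zalloc(sizeof (mybrand_lwp_t), KM_SLEEP));
}

static void
mybrand_lwpdata_free(void *data)
{
	kmem_free(data, sizeof (mybrand_lwp_t));
}

static void
mybrand_initlwp(klwp_t *lwp, void *data)
{
	/* p->p_lock is held; this hook must not drop it */
	lwp->lwp_brand = data;
}

static void
mybrand_freelwp(klwp_t *lwp)
{
	kmem_free(lwp->lwp_brand, sizeof (mybrand_lwp_t));
	lwp->lwp_brand = NULL;
}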
@@ -827,8 +858,27 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src)
int i;
for (i = 0; i < ct_ntypes; i++) {
- dst->lwp_ct_active[i] = ctmpl_dup(src->lwp_ct_active[i]);
+ ct_template_t *tmpl = src->lwp_ct_active[i];
+
+ /*
+ * If the process contract template is setup to be preserved
+ * across exec, then if we're forking, perform an implicit
+ * template_clear now. This ensures that future children of
+ * this child will remain in the same contract unless they're
+ * explicitly setup differently. We know we're forking if the
+ * two LWPs belong to different processes.
+ */
+ if (i == CTT_PROCESS && tmpl != NULL) {
+ ctmpl_process_t *ctp = tmpl->ctmpl_data;
+
+ if (dst->lwp_procp != src->lwp_procp &&
+ (ctp->ctp_params & CT_PR_KEEP_EXEC) != 0)
+ tmpl = NULL;
+ }
+
+ dst->lwp_ct_active[i] = ctmpl_dup(tmpl);
dst->lwp_ct_latest[i] = NULL;
+
}
}
@@ -836,21 +886,33 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src)
* Clear an LWP's contract template state.
*/
void
-lwp_ctmpl_clear(klwp_t *lwp)
+lwp_ctmpl_clear(klwp_t *lwp, boolean_t is_exec)
{
ct_template_t *tmpl;
int i;
for (i = 0; i < ct_ntypes; i++) {
- if ((tmpl = lwp->lwp_ct_active[i]) != NULL) {
- ctmpl_free(tmpl);
- lwp->lwp_ct_active[i] = NULL;
- }
-
if (lwp->lwp_ct_latest[i] != NULL) {
contract_rele(lwp->lwp_ct_latest[i]);
lwp->lwp_ct_latest[i] = NULL;
}
+
+ if ((tmpl = lwp->lwp_ct_active[i]) != NULL) {
+ /*
+ * If we're exec-ing a new program and the process
+ * contract template is setup to be preserved across
+ * exec, then don't clear it.
+ */
+ if (is_exec && i == CTT_PROCESS) {
+ ctmpl_process_t *ctp = tmpl->ctmpl_data;
+
+ if ((ctp->ctp_params & CT_PR_KEEP_EXEC) != 0)
+ continue;
+ }
+
+ ctmpl_free(tmpl);
+ lwp->lwp_ct_active[i] = NULL;
+ }
}
}
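The behaviour added here is driven by the CT_PR_KEEP_EXEC template parameter: an exec(2) preserves the active process contract template, while a fork implicitly clears it for the child. A hedged user-level sketch, assuming CT_PR_KEEP_EXEC is accepted by ct_pr_tmpl_set_param(3CONTRACT); error handling omitted.

#include <fcntl.h>
#include <libcontract.h>
#include <sys/contract/process.h>

static void
keep_template_across_exec(void)
{
	int fd = open64("/system/contract/process/template", O_RDWR);

	(void) ct_pr_tmpl_set_param(fd, CT_PR_KEEP_EXEC);
	(void) ct_tmpl_activate(fd);
	/* later exec(2) calls in this process keep the activated template */
}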
@@ -891,13 +953,6 @@ lwp_exit(void)
if (t->t_upimutex != NULL)
upimutex_cleanup();
- /*
- * Perform any brand specific exit processing, then release any
- * brand data associated with the lwp
- */
- if (PROC_IS_BRANDED(p))
- BROP(p)->b_lwpexit(lwp);
-
lwp_pcb_exit();
mutex_enter(&p->p_lock);
@@ -941,6 +996,18 @@ lwp_exit(void)
DTRACE_PROC(lwp__exit);
/*
+ * Perform any brand specific exit processing, then release any
+ * brand data associated with the lwp
+ */
+ if (PROC_IS_BRANDED(p)) {
+ mutex_exit(&p->p_lock);
+ BROP(p)->b_lwpexit(lwp);
+ BROP(p)->b_freelwp(lwp);
+ mutex_enter(&p->p_lock);
+ prbarrier(p);
+ }
+
+ /*
* If the lwp is a detached lwp or if the process is exiting,
* remove (lwp_hash_out()) the lwp from the lwp directory.
* Otherwise null out the lwp's le_thread pointer in the lwp
@@ -1101,7 +1168,7 @@ lwp_cleanup(void)
}
kpreempt_enable();
- lwp_ctmpl_clear(ttolwp(t));
+ lwp_ctmpl_clear(ttolwp(t), B_FALSE);
}
int
diff --git a/usr/src/uts/common/os/main.c b/usr/src/uts/common/os/main.c
index 7afc1cfe00..dda0b3e4a6 100644
--- a/usr/src/uts/common/os/main.c
+++ b/usr/src/uts/common/os/main.c
@@ -27,7 +27,7 @@
/* All Rights Reserved */
/*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
#include <sys/types.h>
@@ -157,7 +157,7 @@ exec_init(const char *initpath, const char *args)
int error = 0, count = 0;
proc_t *p = ttoproc(curthread);
klwp_t *lwp = ttolwp(curthread);
- int brand_action;
+ int brand_action = EBA_NONE;
if (args == NULL)
args = "";
@@ -268,7 +268,15 @@ exec_init(const char *initpath, const char *args)
*/
sigemptyset(&curthread->t_hold);
- brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE;
+ /*
+ * Only instruct exec_common to brand the process if necessary. It is
+ * possible that the init process is already properly branded due to the
+ * proc_exit -> restart_init -> exec_init call chain.
+ */
+ if (ZONE_IS_BRANDED(p->p_zone) &&
+ p->p_brand != p->p_zone->zone_brand) {
+ brand_action = EBA_BRAND;
+ }
again:
error = exec_common((const char *)(uintptr_t)exec_fnamep,
(const char **)(uintptr_t)uap, NULL, brand_action);
diff --git a/usr/src/uts/common/os/mmapobj.c b/usr/src/uts/common/os/mmapobj.c
index 142c10754e..0410e6f47b 100644
--- a/usr/src/uts/common/os/mmapobj.c
+++ b/usr/src/uts/common/os/mmapobj.c
@@ -1381,10 +1381,15 @@ calc_loadable(Ehdr *ehdrp, caddr_t phdrbase, int nphdrs, size_t *len,
}
if (num_segs++ == 0) {
/*
- * The p_vaddr of the first PT_LOAD segment
- * must either be NULL or within the first
- * page in order to be interpreted.
- * Otherwise, its an invalid file.
+ * While ELF doesn't specify the meaning of
+ * p_vaddr for PT_LOAD segments in ET_DYN
+			 * objects, we mandate that it is either NULL or
+ * (to accommodate some historical binaries)
+ * within the first page. (Note that there
+ * exist non-native ET_DYN objects that violate
+ * this constraint that we nonetheless must be
+ * able to execute; see the ET_DYN handling in
+ * mapelfexec() for details.)
*/
if (e_type == ET_DYN &&
((caddr_t)((uintptr_t)vaddr &
diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c
index b555bb82b7..eba6147fab 100644
--- a/usr/src/uts/common/os/pid.c
+++ b/usr/src/uts/common/os/pid.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -112,6 +113,18 @@ pid_lookup(pid_t pid)
return (pidp);
}
+struct pid *
+pid_find(pid_t pid)
+{
+ struct pid *pidp;
+
+ mutex_enter(&pidlinklock);
+ pidp = pid_lookup(pid);
+ mutex_exit(&pidlinklock);
+
+ return (pidp);
+}
+
void
pid_setmin(void)
{
@@ -522,6 +535,20 @@ sprunlock(proc_t *p)
THREAD_KPRI_RELEASE();
}
+/*
+ * Undo effects of sprlock but without dropping p->p_lock
+ */
+void
+sprunprlock(proc_t *p)
+{
+ ASSERT(p->p_proc_flag & P_PR_LOCK);
+ ASSERT(MUTEX_HELD(&p->p_lock));
+
+ cv_signal(&pr_pid_cv[p->p_slot]);
+ p->p_proc_flag &= ~P_PR_LOCK;
+ THREAD_KPRI_RELEASE();
+}
+
void
pid_init(void)
{
diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c
index d6821c83b0..8cc7f009a3 100644
--- a/usr/src/uts/common/os/policy.c
+++ b/usr/src/uts/common/os/policy.c
@@ -56,6 +56,7 @@
#include <sys/mntent.h>
#include <sys/contract_impl.h>
#include <sys/dld_ioc.h>
+#include <sys/brand.h>
/*
* There are two possible layers of privilege routines and two possible
@@ -1244,6 +1245,22 @@ secpolicy_vnode_owner(const cred_t *cr, uid_t owner)
void
secpolicy_setid_clear(vattr_t *vap, cred_t *cr)
{
+ proc_t *p = curproc;
+
+ /*
+ * Allow the brand to override this behaviour.
+ */
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_setid_clear != NULL) {
+ /*
+ * This brand hook will return 0 if handling is complete, or
+ * some other value if the brand would like us to fall back to
+ * the usual behaviour.
+ */
+ if (BROP(p)->b_setid_clear(vap, cr) == 0) {
+ return;
+ }
+ }
+
if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 &&
secpolicy_vnode_setid_retain(cr,
(vap->va_mode & S_ISUID) != 0 &&
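A hedged sketch of a brand hook honoring the contract stated in the comment above: return 0 when the brand has handled the setid clearing itself, or a non-zero value to fall back to the usual behaviour. The brand name and its policy check are hypothetical.

static int
mybrand_setid_clear(vattr_t *vap, cred_t *cr)
{
	if (!mybrand_overrides_setid(cr))
		return (-1);		/* fall back to native handling */

	if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0) {
		/* brand-specific policy: always strip the setid bits */
		vap->va_mask |= AT_MODE;
		vap->va_mode &= ~(S_ISUID | S_ISGID);
	}
	return (0);			/* handled */
}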
@@ -2092,6 +2109,13 @@ secpolicy_meminfo(const cred_t *cr)
}
int
+secpolicy_fs_import(const cred_t *cr)
+{
+ return (PRIV_POLICY(cr, PRIV_SYS_FS_IMPORT, B_FALSE, EPERM, NULL));
+}
+
+
+int
secpolicy_pfexec_register(const cred_t *cr)
{
return (PRIV_POLICY(cr, PRIV_SYS_ADMIN, B_TRUE, EPERM, NULL));
@@ -2607,3 +2631,11 @@ secpolicy_ppp_config(const cred_t *cr)
return (secpolicy_net_config(cr, B_FALSE));
return (PRIV_POLICY(cr, PRIV_SYS_PPP_CONFIG, B_FALSE, EPERM, NULL));
}
+
+int
+secpolicy_hyprlofs_control(const cred_t *cr)
+{
+ if (PRIV_POLICY(cr, PRIV_HYPRLOFS_CONTROL, B_FALSE, EPERM, NULL))
+ return (EPERM);
+ return (0);
+}
diff --git a/usr/src/uts/common/os/priv_defs b/usr/src/uts/common/os/priv_defs
index bc1787c9ca..854fb602da 100644
--- a/usr/src/uts/common/os/priv_defs
+++ b/usr/src/uts/common/os/priv_defs
@@ -177,6 +177,10 @@ privilege PRIV_GRAPHICS_MAP
Allows a process to perform privileged mappings through a
graphics device.
+privilege PRIV_HYPRLOFS_CONTROL
+
+ Allows a process to manage hyprlofs entries.
+
privilege PRIV_IPC_DAC_READ
Allows a process to read a System V IPC
@@ -377,6 +381,10 @@ privilege PRIV_SYS_DEVICES
Allows a process to open the real console device directly.
Allows a process to open devices that have been exclusively opened.
+privilege PRIV_SYS_FS_IMPORT
+
+ Allows a process to import a potentially untrusted file system.
+
privilege PRIV_SYS_IPC_CONFIG
Allows a process to increase the size of a System V IPC Message
diff --git a/usr/src/uts/common/os/rctl.c b/usr/src/uts/common/os/rctl.c
index 09b80323d5..e0a1126567 100644
--- a/usr/src/uts/common/os/rctl.c
+++ b/usr/src/uts/common/os/rctl.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2017, Joyent, Inc.
*/
#include <sys/atomic.h>
@@ -194,6 +195,8 @@ id_space_t *rctl_ids;
kmem_cache_t *rctl_cache; /* kmem cache for rctl structures */
kmem_cache_t *rctl_val_cache; /* kmem cache for rctl values */
+extern rctl_hndl_t rc_process_maxlockedmem;
+
kmutex_t rctl_lists_lock;
rctl_dict_entry_t *rctl_lists[RC_MAX_ENTITY + 1];
@@ -2872,12 +2875,12 @@ rctl_init(void)
* rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
* int chargeproc)
*
- * Increments the amount of locked memory on a project, and
- * zone. If proj is non-NULL the project must be held by the
- * caller; if it is NULL the proj and zone of proc_t p are used.
- * If chargeproc is non-zero, then the charged amount is cached
- * on p->p_locked_mem so that the charge can be migrated when a
- * process changes projects.
+ * Increments the amount of locked memory on a process, project, and
+ * zone. If 'proj' is non-NULL, the project must be held by the
+ * caller; if it is NULL, the project and zone of process 'p' are used.
+ * If 'chargeproc' is non-zero, then the charged amount is added
+ * to p->p_locked_mem. This is also used so that the charge can be
+ * migrated when a process changes projects.
*
* Return values
* 0 - success
@@ -2895,6 +2898,7 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
ASSERT(p != NULL);
ASSERT(MUTEX_HELD(&p->p_lock));
+
if (proj != NULL) {
projp = proj;
zonep = proj->kpj_zone;
@@ -2938,11 +2942,23 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
}
}
- zonep->zone_locked_mem += inc;
- projp->kpj_data.kpd_locked_mem += inc;
if (chargeproc != 0) {
+ /* Check for overflow */
+ if ((p->p_locked_mem + inc) < p->p_locked_mem) {
+ ret = EAGAIN;
+ goto out;
+ }
+ if (rctl_test_entity(rc_process_maxlockedmem, p->p_rctls, p,
+ &e, inc, 0) & RCT_DENY) {
+ ret = EAGAIN;
+ goto out;
+ }
+
p->p_locked_mem += inc;
}
+
+ zonep->zone_locked_mem += inc;
+ projp->kpj_data.kpd_locked_mem += inc;
out:
mutex_exit(&zonep->zone_mem_lock);
return (ret);
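A hedged sketch of the calling convention documented above, with the process-level charge enabled (chargeproc != 0); the matching rctl_decr_locked_mem() call on the unlock path is shown for symmetry, and error handling is elided.

	mutex_enter(&p->p_lock);
	if ((ret = rctl_incr_locked_mem(p, NULL, len, 1)) != 0) {
		mutex_exit(&p->p_lock);
		return (ret);		/* over the locked-memory limit */
	}
	mutex_exit(&p->p_lock);

	/* ... lock the pages ... */

	/* and later, when the pages are unlocked: */
	mutex_enter(&p->p_lock);
	rctl_decr_locked_mem(p, NULL, len, 1);
	mutex_exit(&p->p_lock);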
diff --git a/usr/src/uts/common/os/rctl_proc.c b/usr/src/uts/common/os/rctl_proc.c
index 9b7324fe7b..c62540d2b4 100644
--- a/usr/src/uts/common/os/rctl_proc.c
+++ b/usr/src/uts/common/os/rctl_proc.c
@@ -21,6 +21,7 @@
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc.
*/
#include <sys/types.h>
@@ -32,6 +33,7 @@
#include <sys/port_kernel.h>
#include <sys/signal.h>
#include <sys/var.h>
+#include <sys/policy.h>
#include <sys/vmparam.h>
#include <sys/machparam.h>
@@ -66,6 +68,7 @@ rctl_hndl_t rc_process_semmsl;
rctl_hndl_t rc_process_semopm;
rctl_hndl_t rc_process_portev;
rctl_hndl_t rc_process_sigqueue;
+rctl_hndl_t rc_process_maxlockedmem;
/*
* process.max-cpu-time / RLIMIT_CPU
@@ -212,6 +215,26 @@ static rctl_ops_t proc_vmem_ops = {
};
/*
+ * process.max-locked-memory
+ */
+/*ARGSUSED*/
+static int
+proc_maxlockedmem_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e,
+ struct rctl_val *rv, rctl_qty_t i, uint_t f)
+{
+ if (secpolicy_lock_memory(CRED()) == 0)
+ return (0);
+ return ((p->p_locked_mem + i) > rv->rcv_value);
+}
+
+static rctl_ops_t proc_maxlockedmem_ops = {
+ rcop_no_action,
+ rcop_no_usage,
+ rcop_no_set,
+ proc_maxlockedmem_test
+};
+
+/*
* void rctlproc_default_init()
*
* Overview
@@ -383,6 +406,11 @@ rctlproc_init(void)
rctl_add_default_limit("process.max-sigqueue-size",
_SIGQUEUE_SIZE_PRIVILEGED, RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY);
+ rc_process_maxlockedmem = rctl_register("process.max-locked-memory",
+ RCENTITY_PROCESS, RCTL_GLOBAL_LOWERABLE | RCTL_GLOBAL_DENY_ALWAYS |
+ RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_BYTES,
+ ULONG_MAX, UINT32_MAX, &proc_maxlockedmem_ops);
+
/*
* Place minimal set of controls on "sched" process for inheritance by
* processes created via newproc().
diff --git a/usr/src/uts/common/os/sched.c b/usr/src/uts/common/os/sched.c
index c1d6569f11..15e77d39f7 100644
--- a/usr/src/uts/common/os/sched.c
+++ b/usr/src/uts/common/os/sched.c
@@ -27,6 +27,10 @@
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
+/*
+ * Copyright (c) 2015, Joyent, Inc. All rights reserved.
+ */
+
#include <sys/param.h>
#include <sys/types.h>
#include <sys/sysmacros.h>
@@ -646,16 +650,17 @@ top:
klwp_t *lwp = ttolwp(tp);
/*
- * Swapout eligible lwps (specified by the scheduling
- * class) which don't have TS_DONT_SWAP set. Set the
- * "intent to swap" flag (TS_SWAPENQ) on threads
- * which have TS_DONT_SWAP set so that they can be
+ * Swapout eligible lwps (specified by the scheduling class)
+ * which don't have TS_DONT_SWAP set. Set the "intent to swap"
+ * flag (TS_SWAPENQ) on threads which have either TS_DONT_SWAP
+ * set or are currently on a split stack so that they can be
* swapped if and when they reach a safe point.
*/
thread_lock(tp);
thread_pri = CL_SWAPOUT(tp, swapflags);
if (thread_pri != -1) {
- if (tp->t_schedflag & TS_DONT_SWAP) {
+ if ((tp->t_schedflag & TS_DONT_SWAP) ||
+ (tp->t_flag & T_SPLITSTK)) {
tp->t_schedflag |= TS_SWAPENQ;
tp->t_trapret = 1;
aston(tp);
diff --git a/usr/src/uts/common/os/schedctl.c b/usr/src/uts/common/os/schedctl.c
index 5721083751..18b396a765 100644
--- a/usr/src/uts/common/os/schedctl.c
+++ b/usr/src/uts/common/os/schedctl.c
@@ -22,6 +22,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
#include <sys/types.h>
@@ -327,12 +328,17 @@ schedctl_sigblock(kthread_t *t)
/*
- * If the sc_sigblock field is set for the specified thread, set
- * its signal mask to block all maskable signals, then clear the
- * sc_sigblock field. This finishes what user-level code requested
- * to be done when it set tdp->sc_shared->sc_sigblock non-zero.
- * Called from signal-related code either by the current thread for
- * itself or by a thread that holds the process's p_lock (/proc code).
+ * If the sc_sigblock field is set for the specified thread, set its signal
+ * mask to block all maskable signals, then clear the sc_sigblock field. This
+ * accomplishes what user-level code requested to be done when it set
+ * tdp->sc_shared->sc_sigblock non-zero.
+ *
+ * This is generally called by signal-related code in the current thread. In
+ * order to call against a thread other than curthread, p_lock for the
+ * containing process must be held. Even then, the caller is not protected
+ * from races with the thread in question updating its own fields. It is the
+ * responsibility of the caller to perform additional synchronization.
+ *
*/
void
schedctl_finish_sigblock(kthread_t *t)
diff --git a/usr/src/uts/common/os/shm.c b/usr/src/uts/common/os/shm.c
index bacc595f78..5deae96d73 100644
--- a/usr/src/uts/common/os/shm.c
+++ b/usr/src/uts/common/os/shm.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -319,6 +320,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)
size_t share_size;
struct shm_data ssd;
uintptr_t align_hint;
+ long curprot;
/*
* Pick a share pagesize to use, if (!isspt(sp)).
@@ -453,6 +455,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)
}
}
+ curprot = sp->shm_opts & SHM_PROT_MASK;
if (!isspt(sp)) {
error = sptcreate(size, &segspt, sp->shm_amp, prot,
flags, share_szc);
@@ -462,8 +465,8 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)
}
sp->shm_sptinfo->sptas = segspt->s_as;
sp->shm_sptseg = segspt;
- sp->shm_sptprot = prot;
- } else if ((prot & sp->shm_sptprot) != sp->shm_sptprot) {
+ sp->shm_opts = (sp->shm_opts & ~SHM_PROT_MASK) | prot;
+ } else if ((prot & curprot) != curprot) {
/*
* Ensure we're attaching to an ISM segment with
* fewer or equal permissions than what we're
@@ -748,6 +751,23 @@ shmctl(int shmid, int cmd, void *arg)
}
break;
+ /* Stage segment for removal, but don't remove until last detach */
+ case SHM_RMID:
+ if ((error = secpolicy_ipc_owner(cr, (kipc_perm_t *)sp)) != 0)
+ break;
+
+ /*
+ * If attached, just mark it as a pending remove, otherwise
+ * we must perform the normal ipc_rmid now.
+ */
+ if ((sp->shm_perm.ipc_ref - 1) > 0) {
+ sp->shm_opts |= SHM_RM_PENDING;
+ } else {
+ mutex_exit(lock);
+ return (ipc_rmid(shm_svc, shmid, cr));
+ }
+ break;
+
default:
error = EINVAL;
break;
@@ -778,6 +798,23 @@ shm_detach(proc_t *pp, segacct_t *sap)
sp->shm_ismattch--;
sp->shm_dtime = gethrestime_sec();
sp->shm_lpid = pp->p_pid;
+ if ((sp->shm_opts & SHM_RM_PENDING) != 0 &&
+ sp->shm_perm.ipc_ref == 2) {
+ /*
+ * If this is the last detach of the segment across the whole
+ * system then now we can perform the delayed IPC_RMID.
+		 * The ipc_ref count has one for the original 'get' and one for
+ * each 'attach' (see 'stat' handling in shmctl).
+ */
+ sp->shm_opts &= ~SHM_RM_PENDING;
+ mutex_enter(&shm_svc->ipcs_lock);
+ ipc_rmsvc(shm_svc, (kipc_perm_t *)sp); /* Drops lock */
+ ASSERT(!MUTEX_HELD(&shm_svc->ipcs_lock));
+ ASSERT(((kipc_perm_t *)sp)->ipc_ref > 0);
+
+ /* Lock was dropped, need to retake it for following rele. */
+ (void) ipc_lock(shm_svc, sp->shm_perm.ipc_id);
+ }
ipc_rele(shm_svc, (kipc_perm_t *)sp); /* Drops lock */
kmem_free(sap, sizeof (segacct_t));
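From an application's perspective, the new command provides the delayed-removal semantics described in the comments above. A hedged user-level sketch, assuming SHM_RMID is exposed through <sys/shm.h> as part of this change; error handling omitted.

#include <sys/ipc.h>
#include <sys/shm.h>

int
main(void)
{
	int id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);
	void *addr = shmat(id, NULL, 0);

	(void) shmctl(id, SHM_RMID, NULL);	/* staged; still attached */
	/* ... the segment remains usable through addr ... */
	(void) shmdt(addr);			/* last detach triggers removal */
	return (0);
}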
diff --git a/usr/src/uts/common/os/sig.c b/usr/src/uts/common/os/sig.c
index 453b1f22d4..5ef12f3ae4 100644
--- a/usr/src/uts/common/os/sig.c
+++ b/usr/src/uts/common/os/sig.c
@@ -22,7 +22,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -60,6 +60,7 @@
#include <sys/cyclic.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
+#include <sys/brand.h>
#include <sys/signalfd.h>
const k_sigset_t nullsmask = {0, 0, 0};
@@ -148,6 +149,21 @@ signal_is_blocked(kthread_t *t, int sig)
}
/*
+ * Return true if the signal can safely be ignored.
+ * That is, if the signal is included in the p_ignore mask and doing so is not
+ * forbidden by any process branding.
+ */
+static int
+sig_ignorable(proc_t *p, klwp_t *lwp, int sig)
+{
+ return (sigismember(&p->p_ignore, sig) && /* sig in ignore mask */
+ !(PROC_IS_BRANDED(p) && /* allowed by brand */
+ BROP(p)->b_sig_ignorable != NULL &&
+ BROP(p)->b_sig_ignorable(p, lwp, sig) == B_FALSE));
+
+}
+
+/*
* Return true if the signal can safely be discarded on generation.
* That is, if there is no need for the signal on the receiving end.
* The answer is true if the process is a zombie or
@@ -159,12 +175,13 @@ signal_is_blocked(kthread_t *t, int sig)
* the signal is not being accepted via sigwait()
*/
static int
-sig_discardable(proc_t *p, int sig)
+sig_discardable(proc_t *p, kthread_t *tp, int sig)
{
kthread_t *t = p->p_tlist;
+ klwp_t *lwp = (tp == NULL) ? NULL : tp->t_lwp;
return (t == NULL || /* if zombie or ... */
- (sigismember(&p->p_ignore, sig) && /* signal is ignored */
+ (sig_ignorable(p, lwp, sig) && /* signal is ignored */
t->t_forw == t && /* and single-threaded */
!tracing(p, sig) && /* and no /proc tracing */
!signal_is_blocked(t, sig) && /* and signal not blocked */
@@ -200,7 +217,7 @@ eat_signal(kthread_t *t, int sig)
!(ttoproc(t)->p_proc_flag & P_PR_LOCK)) {
ttoproc(t)->p_stopsig = 0;
t->t_dtrace_stop = 0;
- t->t_schedflag |= TS_XSTART | TS_PSTART;
+ t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART;
setrun_locked(t);
} else if (t != curthread && t->t_state == TS_ONPROC) {
aston(t); /* make it do issig promptly */
@@ -297,7 +314,7 @@ sigtoproc(proc_t *p, kthread_t *t, int sig)
}
}
- if (sig_discardable(p, sig)) {
+ if (sig_discardable(p, t, sig)) {
DTRACE_PROC3(signal__discard, kthread_t *, p->p_tlist,
proc_t *, p, int, sig);
return;
@@ -497,7 +514,7 @@ issig_justlooking(void)
if (sigismember(&set, sig) &&
(tracing(p, sig) ||
sigismember(&t->t_sigwait, sig) ||
- !sigismember(&p->p_ignore, sig))) {
+ !sig_ignorable(p, lwp, sig))) {
/*
* Don't promote a signal that will stop
* the process when lwp_nostop is set.
@@ -623,6 +640,21 @@ issig_forreal(void)
}
/*
+ * Allow the brand the chance to alter (or suppress) delivery
+ * of this signal.
+ */
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_issig_stop != NULL) {
+ /*
+ * The brand hook will return 0 if it would like
+ * us to drive on, or -1 if we should restart
+ * the loop to check other conditions.
+ */
+ if (BROP(p)->b_issig_stop(p, lwp) != 0) {
+ continue;
+ }
+ }
+
+ /*
* Honor requested stop before dealing with the
* current signal; a debugger may change it.
* Do not want to go back to loop here since this is a special
@@ -656,7 +688,7 @@ issig_forreal(void)
lwp->lwp_cursig = 0;
lwp->lwp_extsig = 0;
if (sigismember(&t->t_sigwait, sig) ||
- (!sigismember(&p->p_ignore, sig) &&
+ (!sig_ignorable(p, lwp, sig) &&
!isjobstop(sig))) {
if (p->p_flag & (SEXITLWPS|SKILLED)) {
sig = SIGKILL;
@@ -708,7 +740,7 @@ issig_forreal(void)
toproc = 0;
if (tracing(p, sig) ||
sigismember(&t->t_sigwait, sig) ||
- !sigismember(&p->p_ignore, sig)) {
+ !sig_ignorable(p, lwp, sig)) {
if (sigismember(&t->t_extsig, sig))
ext = 1;
break;
@@ -722,7 +754,7 @@ issig_forreal(void)
toproc = 1;
if (tracing(p, sig) ||
sigismember(&t->t_sigwait, sig) ||
- !sigismember(&p->p_ignore, sig)) {
+ !sig_ignorable(p, lwp, sig)) {
if (sigismember(&p->p_extsig, sig))
ext = 1;
break;
@@ -954,6 +986,16 @@ stop(int why, int what)
}
break;
+ case PR_BRAND:
+ /*
+ * We have been stopped by the brand code for a brand-private
+ * reason. This is an asynchronous stop affecting only this
+ * LWP.
+ */
+ VERIFY(PROC_IS_BRANDED(p));
+ flags &= ~TS_BSTART;
+ break;
+
default: /* /proc stop */
flags &= ~TS_PSTART;
/*
@@ -1065,7 +1107,7 @@ stop(int why, int what)
}
}
- if (why != PR_JOBCONTROL && why != PR_CHECKPOINT) {
+ if (why != PR_JOBCONTROL && why != PR_CHECKPOINT && why != PR_BRAND) {
/*
* Do process-level notification when all lwps are
* either stopped on events of interest to /proc
@@ -1171,6 +1213,13 @@ stop(int why, int what)
if (why == PR_CHECKPOINT)
del_one_utstop();
+ /*
+ * Allow the brand to post notification of this stop condition.
+ */
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_stop_notify != NULL) {
+ BROP(p)->b_stop_notify(p, lwp, why, what);
+ }
+
thread_lock(t);
ASSERT((t->t_schedflag & TS_ALLSTART) == 0);
t->t_schedflag |= flags;
@@ -1192,7 +1241,7 @@ stop(int why, int what)
(p->p_flag & (SEXITLWPS|SKILLED))) {
p->p_stopsig = 0;
thread_lock(t);
- t->t_schedflag |= TS_XSTART | TS_PSTART;
+ t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART;
setrun_locked(t);
thread_unlock_nopreempt(t);
} else if (why == PR_JOBCONTROL) {
@@ -1327,7 +1376,7 @@ psig(void)
* this signal from pending to current (we dropped p->p_lock).
* This can happen only in a multi-threaded process.
*/
- if (sigismember(&p->p_ignore, sig) ||
+ if (sig_ignorable(p, lwp, sig) ||
(func == SIG_DFL && sigismember(&stopdefault, sig))) {
lwp->lwp_cursig = 0;
lwp->lwp_extsig = 0;
@@ -1771,9 +1820,12 @@ post_sigcld(proc_t *cp, sigqueue_t *sqp)
/*
* This can only happen when the parent is init.
* (See call to sigcld(q, NULL) in exit().)
- * Use KM_NOSLEEP to avoid deadlock.
+		 * Use KM_NOSLEEP to avoid deadlock. The child proc's
+		 * initpid can be 1 for zlogin.
*/
- ASSERT(pp == proc_init);
+ ASSERT(pp->p_pidp->pid_id ==
+ cp->p_zone->zone_proc_initpid ||
+ pp->p_pidp->pid_id == 1);
winfo(cp, &info, 0);
sigaddq(pp, NULL, &info, KM_NOSLEEP);
} else {
@@ -1804,6 +1856,15 @@ sigcld_repost()
sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
mutex_enter(&pidlock);
+ if (PROC_IS_BRANDED(pp) && BROP(pp)->b_sigcld_repost != NULL) {
+ /*
+ * Allow the brand to inject synthetic SIGCLD signals.
+ */
+ if (BROP(pp)->b_sigcld_repost(pp, sqp) == 0) {
+ mutex_exit(&pidlock);
+ return;
+ }
+ }
for (cp = pp->p_child; cp; cp = cp->p_sibling) {
if (cp->p_pidflag & CLDPEND) {
post_sigcld(cp, sqp);
@@ -2115,7 +2176,7 @@ sigaddqa(proc_t *p, kthread_t *t, sigqueue_t *sigqp)
ASSERT(MUTEX_HELD(&p->p_lock));
ASSERT(sig >= 1 && sig < NSIG);
- if (sig_discardable(p, sig))
+ if (sig_discardable(p, t, sig))
siginfofree(sigqp);
else
sigaddqins(p, t, sigqp);
@@ -2141,7 +2202,7 @@ sigaddq(proc_t *p, kthread_t *t, k_siginfo_t *infop, int km_flags)
* blocking the signal (it *could* change it's mind while
* the signal is pending) then don't bother creating one.
*/
- if (!sig_discardable(p, sig) &&
+ if (!sig_discardable(p, t, sig) &&
(sigismember(&p->p_siginfo, sig) ||
(curproc->p_ct_process != p->p_ct_process) ||
(sig == SIGCLD && SI_FROMKERNEL(infop))) &&
diff --git a/usr/src/uts/common/os/smb_subr.c b/usr/src/uts/common/os/smb_subr.c
index 6084676b17..6dc7230bed 100644
--- a/usr/src/uts/common/os/smb_subr.c
+++ b/usr/src/uts/common/os/smb_subr.c
@@ -25,7 +25,9 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
+/*
+ * Copyright (c) 2015 Joyent, Inc. All rights reserved.
+ */
#include <sys/smbios_impl.h>
#include <sys/cmn_err.h>
@@ -43,13 +45,13 @@ smb_strerror(int err)
void *
smb_alloc(size_t len)
{
- return (kmem_alloc(len, KM_SLEEP));
+ return (len > 0 ? kmem_alloc(len, KM_SLEEP) : NULL);
}
void *
smb_zalloc(size_t len)
{
- return (kmem_zalloc(len, KM_SLEEP));
+ return (len > 0 ? kmem_zalloc(len, KM_SLEEP) : NULL);
}
void
diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c
index 62f94729cf..21ec25b5b3 100644
--- a/usr/src/uts/common/os/streamio.c
+++ b/usr/src/uts/common/os/streamio.c
@@ -24,7 +24,7 @@
/*
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
*/
#include <sys/types.h>
@@ -77,6 +77,7 @@
#include <sys/policy.h>
#include <sys/dld.h>
#include <sys/zone.h>
+#include <sys/limits.h>
#include <c2/audit.h>
/*
@@ -985,12 +986,20 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,
* (registered in sd_wakeq).
*/
struiod_t uiod;
+ struct iovec buf[IOV_MAX_STACK];
+ int iovlen = 0;
if (first)
stp->sd_wakeq &= ~RSLEEP;
- (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov,
- sizeof (uiod.d_iov) / sizeof (*uiod.d_iov));
+ if (uiop->uio_iovcnt > IOV_MAX_STACK) {
+ iovlen = uiop->uio_iovcnt * sizeof (iovec_t);
+ uiod.d_iov = kmem_alloc(iovlen, KM_SLEEP);
+ } else {
+ uiod.d_iov = buf;
+ }
+
+ (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt);
uiod.d_mp = 0;
/*
* Mark that a thread is in rwnext on the read side
@@ -1029,6 +1038,8 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,
if ((bp = uiod.d_mp) != NULL) {
*errorp = 0;
ASSERT(MUTEX_HELD(&stp->sd_lock));
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
return (bp);
}
error = 0;
@@ -1048,8 +1059,14 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,
} else {
*errorp = error;
ASSERT(MUTEX_HELD(&stp->sd_lock));
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
return (NULL);
}
+
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
+
/*
* Try a getq in case a rwnext() generated mblk
* has bubbled up via strrput().
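The strget() changes above and the strput() changes below apply the same pattern: keep a small iovec array on the stack for the common case, fall back to a heap allocation when uio_iovcnt exceeds IOV_MAX_STACK, and free that allocation on every return path. Distilled for clarity (names mirror the surrounding code):

	struct iovec buf[IOV_MAX_STACK];
	int iovlen = 0;

	if (uiop->uio_iovcnt > IOV_MAX_STACK) {
		iovlen = uiop->uio_iovcnt * sizeof (iovec_t);
		uiod.d_iov = kmem_alloc(iovlen, KM_SLEEP);
	} else {
		uiod.d_iov = buf;
	}

	(void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt);

	/* ... and on every exit path: */
	if (iovlen != 0)
		kmem_free(uiod.d_iov, iovlen);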
@@ -2544,6 +2561,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,
int b_flag, int pri, int flags)
{
struiod_t uiod;
+ struct iovec buf[IOV_MAX_STACK];
+ int iovlen = 0;
mblk_t *mp;
queue_t *wqp = stp->sd_wrq;
int error = 0;
@@ -2635,13 +2654,21 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,
mp->b_flag |= b_flag;
mp->b_band = (uchar_t)pri;
- (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov,
- sizeof (uiod.d_iov) / sizeof (*uiod.d_iov));
+ if (uiop->uio_iovcnt > IOV_MAX_STACK) {
+ iovlen = uiop->uio_iovcnt * sizeof (iovec_t);
+ uiod.d_iov = (struct iovec *)kmem_alloc(iovlen, KM_SLEEP);
+ } else {
+ uiod.d_iov = buf;
+ }
+
+ (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt);
uiod.d_uio.uio_offset = 0;
uiod.d_mp = mp;
error = rwnext(wqp, &uiod);
if (! uiod.d_mp) {
uioskip(uiop, *iosize);
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
return (error);
}
ASSERT(mp == uiod.d_mp);
@@ -2659,17 +2686,23 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,
error = 0;
} else {
freemsg(mp);
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
return (error);
}
/* Have to check canput before consuming data from the uio */
if (pri == 0) {
if (!canputnext(wqp) && !(flags & MSG_IGNFLOW)) {
freemsg(mp);
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
return (EWOULDBLOCK);
}
} else {
if (!bcanputnext(wqp, pri) && !(flags & MSG_IGNFLOW)) {
freemsg(mp);
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
return (EWOULDBLOCK);
}
}
@@ -2677,6 +2710,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,
/* Copyin data from the uio */
if ((error = struioget(wqp, mp, &uiod, 0)) != 0) {
freemsg(mp);
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
return (error);
}
uioskip(uiop, *iosize);
@@ -2693,6 +2728,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,
putnext(wqp, mp);
stream_runservice(stp);
}
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
return (0);
}
@@ -3178,6 +3215,7 @@ job_control_type(int cmd)
case JAGENT: /* Obsolete */
case JTRUN: /* Obsolete */
case JXTPROTO: /* Obsolete */
+ case TIOCSETLD:
return (JCSETP);
}
@@ -8174,12 +8212,8 @@ out:
* an M_PROTO/M_PCPROTO part).
*/
int
-strpoll(
- struct stdata *stp,
- short events_arg,
- int anyyet,
- short *reventsp,
- struct pollhead **phpp)
+strpoll(struct stdata *stp, short events_arg, int anyyet, short *reventsp,
+ struct pollhead **phpp)
{
int events = (ushort_t)events_arg;
int retevents = 0;
@@ -8316,8 +8350,7 @@ chkrd:
retevents |= (events & (POLLIN | POLLRDBAND));
break;
}
- if (! (retevents & normevents) &&
- (stp->sd_wakeq & RSLEEP)) {
+ if (!(retevents & normevents) && (stp->sd_wakeq & RSLEEP)) {
/*
* Sync stream barrier read queue has data.
*/
@@ -8328,19 +8361,11 @@ chkrd:
retevents |= normevents;
}
- *reventsp = (short)retevents;
- if (retevents && !(events & POLLET)) {
- if (headlocked)
- mutex_exit(&stp->sd_lock);
- return (0);
- }
-
/*
- * If poll() has not found any events yet, set up event cell
- * to wake up the poll if a requested event occurs on this
- * stream. Check for collisions with outstanding poll requests.
+ * Pass back a pollhead if no events are pending or if edge-triggering
+ * has been configured on this resource.
*/
- if (!anyyet) {
+ if ((retevents == 0 && !anyyet) || (events & POLLET)) {
*phpp = &stp->sd_pollist;
if (headlocked == 0) {
if (polllock(&stp->sd_pollist, &stp->sd_lock) != 0) {
@@ -8351,6 +8376,8 @@ chkrd:
}
stp->sd_rput_opt |= SR_POLLIN;
}
+
+ *reventsp = (short)retevents;
if (headlocked)
mutex_exit(&stp->sd_lock);
return (0);
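
The reworked tail of strpoll() matches the poll convention used by chpoll(9E) implementations once POLLET came along: a pollhead is handed back either when no requested events are pending yet or whenever the caller asked for edge-triggered semantics, and *reventsp is always set on the way out. A hypothetical driver-side sketch of the same convention; the state structure, lookup helper, and pending-event field are illustrative and not part of this change:

typedef struct xx_state {
	struct pollhead	xx_pollhead;
	short		xx_revents;	/* events currently satisfied */
} xx_state_t;

static int
xx_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	xx_state_t *sp = xx_lookup_state(dev);	/* hypothetical lookup */
	short revents = sp->xx_revents & events;

	/*
	 * Same rule as above: pass back the pollhead when nothing is
	 * pending yet, or unconditionally when edge-triggering was
	 * requested.
	 */
	if ((revents == 0 && !anyyet) || (events & POLLET))
		*phpp = &sp->xx_pollhead;

	*reventsp = revents;
	return (0);
}
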
diff --git a/usr/src/uts/common/os/sunddi.c b/usr/src/uts/common/os/sunddi.c
index ede7da413b..b1727729de 100644
--- a/usr/src/uts/common/os/sunddi.c
+++ b/usr/src/uts/common/os/sunddi.c
@@ -5903,6 +5903,12 @@ ddi_ffs(long mask)
return (ffs(mask));
}
+int
+ddi_ffsll(long long mask)
+{
+ return (ffs(mask));
+}
+
/*
* Find last bit set. Take mask and clear
* all but the most significant bit, and
@@ -5914,8 +5920,14 @@ ddi_ffs(long mask)
int
ddi_fls(long mask)
{
+ return (ddi_flsll(mask));
+}
+
+int
+ddi_flsll(long long mask)
+{
while (mask) {
- long nx;
+ long long nx;
if ((nx = (mask & (mask - 1))) == 0)
break;
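
The loop whose types are widened here isolates the most significant set bit by repeatedly applying mask & (mask - 1), which clears the lowest set bit each time; once a single bit remains, it is the highest one. A standalone check of that identity:

#include <assert.h>

/* Return only the most significant set bit of x (0 if x == 0). */
static long long
msb_only(long long x)
{
	long long nx;

	while (x != 0 && (nx = (x & (x - 1))) != 0)
		x = nx;
	return (x);
}

int
main(void)
{
	assert(msb_only(0x2c) == 0x20);	/* 101100 -> 100000 */
	assert(msb_only(1) == 1);
	assert(msb_only(0) == 0);
	return (0);
}
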
diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c
index b3861dec03..4dd48c7e19 100644
--- a/usr/src/uts/common/os/sysent.c
+++ b/usr/src/uts/common/os/sysent.c
@@ -23,7 +23,7 @@
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012 Milan Jurik. All rights reserved.
* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright (c) 2015, Joyent, Inc.
+ * Copyright 2016 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -61,8 +61,7 @@ struct mmaplf32a;
int access(char *, int);
int alarm(int);
int auditsys(struct auditcalls *, rval_t *);
-int64_t brandsys(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t,
- uintptr_t);
+int64_t brandsys(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
intptr_t brk(caddr_t);
int chdir(char *);
int chmod(char *, int);
@@ -647,7 +646,7 @@ struct sysent sysent[NSYSCALL] =
SYSENT_NOSYS(),
SYSENT_C("llseek", llseek32, 4)),
/* 176 */ SYSENT_LOADABLE(), /* inst_sync */
- /* 177 */ SYSENT_CI("brandsys", brandsys, 6),
+ /* 177 */ SYSENT_CI("brandsys", brandsys, 5),
/* 178 */ SYSENT_LOADABLE(), /* kaio */
/* 179 */ SYSENT_LOADABLE(), /* cpc */
/* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3),
@@ -1002,7 +1001,7 @@ struct sysent sysent32[NSYSCALL] =
/* 174 */ SYSENT_CI("pwrite", pwrite32, 4),
/* 175 */ SYSENT_C("llseek", llseek32, 4),
/* 176 */ SYSENT_LOADABLE32(), /* inst_sync */
- /* 177 */ SYSENT_CI("brandsys", brandsys, 6),
+ /* 177 */ SYSENT_CI("brandsys", brandsys, 5),
/* 178 */ SYSENT_LOADABLE32(), /* kaio */
/* 179 */ SYSENT_LOADABLE32(), /* cpc */
/* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3),
@@ -1094,18 +1093,20 @@ char **syscallnames;
systrace_sysent_t *systrace_sysent;
void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t,
- uintptr_t, uintptr_t, uintptr_t, uintptr_t);
+ uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
/*ARGSUSED*/
void
systrace_stub(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
- uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5)
+ uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5,
+ uintptr_t arg6, uintptr_t arg7)
{}
/*ARGSUSED*/
int64_t
dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
- uintptr_t arg3, uintptr_t arg4, uintptr_t arg5)
+ uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6,
+ uintptr_t arg7)
{
systrace_sysent_t *sy = &systrace_sysent[curthread->t_sysnum];
dtrace_id_t id;
@@ -1113,7 +1114,8 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
proc_t *p;
if ((id = sy->stsy_entry) != DTRACE_IDNONE)
- (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5);
+ (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5,
+ arg6, arg7);
/*
* We want to explicitly allow DTrace consumers to stop a process
@@ -1127,14 +1129,15 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
}
mutex_exit(&p->p_lock);
- rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5);
+ rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5,
+ arg6, arg7);
if (ttolwp(curthread)->lwp_errno != 0)
rval = -1;
if ((id = sy->stsy_return) != DTRACE_IDNONE)
(*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval,
- (uintptr_t)((int64_t)rval >> 32), 0, 0, 0);
+ (uintptr_t)((int64_t)rval >> 32), 0, 0, 0, 0, 0);
return (rval);
}
@@ -1146,7 +1149,8 @@ systrace_sysent_t *systrace_sysent32;
/*ARGSUSED*/
int64_t
dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
- uintptr_t arg3, uintptr_t arg4, uintptr_t arg5)
+ uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6,
+ uintptr_t arg7)
{
systrace_sysent_t *sy = &systrace_sysent32[curthread->t_sysnum];
dtrace_id_t id;
@@ -1154,7 +1158,8 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
proc_t *p;
if ((id = sy->stsy_entry) != DTRACE_IDNONE)
- (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5);
+ (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, arg6,
+ arg7);
/*
* We want to explicitly allow DTrace consumers to stop a process
@@ -1168,14 +1173,15 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
}
mutex_exit(&p->p_lock);
- rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5);
+ rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, arg6,
+ arg7);
if (ttolwp(curthread)->lwp_errno != 0)
rval = -1;
if ((id = sy->stsy_return) != DTRACE_IDNONE)
(*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval,
- (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0);
+ (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0, 0, 0);
return (rval);
}
@@ -1203,5 +1209,5 @@ dtrace_systrace_rtt(void)
}
if ((id = sy->stsy_return) != DTRACE_IDNONE)
- (*systrace_probe)(id, 0, 0, 0, 0, 0, 0);
+ (*systrace_probe)(id, 0, 0, 0, 0, 0, 0, 0, 0);
}
diff --git a/usr/src/uts/common/os/timer.c b/usr/src/uts/common/os/timer.c
index b25a6cbcf1..5453ebf380 100644
--- a/usr/src/uts/common/os/timer.c
+++ b/usr/src/uts/common/os/timer.c
@@ -25,11 +25,12 @@
*/
/*
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
*/
#include <sys/timer.h>
#include <sys/systm.h>
+#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/debug.h>
@@ -81,6 +82,7 @@ timer_lock(proc_t *p, itimer_t *it)
* waiters. p_lock must be held on entry; it will not be dropped by
* timer_unlock().
*/
+/* ARGSUSED */
static void
timer_unlock(proc_t *p, itimer_t *it)
{
@@ -123,6 +125,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it)
timer_lock(p, it);
}
+ ASSERT(p->p_itimer_sz > tid);
ASSERT(p->p_itimer[tid] == it);
p->p_itimer[tid] = NULL;
@@ -137,7 +140,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it)
it->it_backend->clk_timer_delete(it);
- if (it->it_portev) {
+ if (it->it_flags & IT_PORT) {
mutex_enter(&it->it_mutex);
if (it->it_portev) {
port_kevent_t *pev;
@@ -199,18 +202,20 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it)
static itimer_t *
timer_grab(proc_t *p, timer_t tid)
{
- itimer_t **itp, *it;
+ itimer_t *it;
- if (tid >= timer_max || tid < 0)
+ if (tid < 0) {
return (NULL);
+ }
mutex_enter(&p->p_lock);
-
- if ((itp = p->p_itimer) == NULL || (it = itp[tid]) == NULL) {
+ if (p->p_itimer == NULL || tid >= p->p_itimer_sz ||
+ (it = p->p_itimer[tid]) == NULL) {
mutex_exit(&p->p_lock);
return (NULL);
}
+ /* This may drop p_lock temporarily. */
timer_lock(p, it);
if (it->it_lock & ITLK_REMOVE) {
@@ -232,7 +237,7 @@ timer_grab(proc_t *p, timer_t tid)
* should not be held on entry; timer_release() will acquire p_lock but
* will drop it before returning.
*/
-static void
+void
timer_release(proc_t *p, itimer_t *it)
{
mutex_enter(&p->p_lock);
@@ -245,7 +250,7 @@ timer_release(proc_t *p, itimer_t *it)
* p_lock should not be held on entry; timer_delete_grabbed() will acquire
* p_lock, but will drop it before returning.
*/
-static void
+void
timer_delete_grabbed(proc_t *p, timer_t tid, itimer_t *it)
{
mutex_enter(&p->p_lock);
@@ -258,6 +263,13 @@ clock_timer_init()
{
clock_timer_cache = kmem_cache_create("timer_cache",
sizeof (itimer_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ /*
+ * Push the timer_max limit up to at least 4 * NCPU. Due to the way
+ * NCPU is defined, proper initialization of the timer limit is
+ * performed at runtime.
+ */
+ timer_max = MAX(NCPU * 4, timer_max);
}
void
@@ -453,6 +465,9 @@ timer_fire(itimer_t *it)
it->it_pending = 1;
port_send_event((port_kevent_t *)it->it_portev);
mutex_exit(&it->it_mutex);
+ } else if (it->it_flags & IT_CALLBACK) {
+ it->it_cb_func(it);
+ ASSERT(MUTEX_NOT_HELD(&it->it_mutex));
} else if (it->it_flags & IT_SIGNAL) {
it->it_pending = 1;
mutex_exit(&it->it_mutex);
@@ -466,159 +481,175 @@ timer_fire(itimer_t *it)
mutex_exit(&p->p_lock);
}
-int
-timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)
+/*
+ * Allocate an itimer_t and find an appropriate slot for it in p_itimer.
+ * Acquires p_lock and holds it on return, regardless of success.
+ */
+static itimer_t *
+timer_alloc(proc_t *p, timer_t *id)
{
- struct sigevent ev;
- proc_t *p = curproc;
- clock_backend_t *backend;
- itimer_t *it, **itp;
- sigqueue_t *sigq;
- cred_t *cr = CRED();
- int error = 0;
- timer_t i;
- port_notify_t tim_pnevp;
- port_kevent_t *pkevp = NULL;
+ itimer_t *it, **itp = NULL;
+ uint_t i;
- if ((backend = CLOCK_BACKEND(clock)) == NULL)
- return (set_errno(EINVAL));
+ ASSERT(MUTEX_NOT_HELD(&p->p_lock));
- if (evp != NULL) {
- /*
- * short copyin() for binary compatibility
- * fetch oldsigevent to determine how much to copy in.
- */
- if (get_udatamodel() == DATAMODEL_NATIVE) {
- if (copyin(evp, &ev, sizeof (struct oldsigevent)))
- return (set_errno(EFAULT));
+ it = kmem_cache_alloc(clock_timer_cache, KM_SLEEP);
+ bzero(it, sizeof (itimer_t));
+ mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL);
- if (ev.sigev_notify == SIGEV_PORT ||
- ev.sigev_notify == SIGEV_THREAD) {
- if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp,
- sizeof (port_notify_t)))
- return (set_errno(EFAULT));
+ mutex_enter(&p->p_lock);
+retry:
+ if (p->p_itimer != NULL) {
+ for (i = 0; i < p->p_itimer_sz; i++) {
+ if (p->p_itimer[i] == NULL) {
+ itp = &(p->p_itimer[i]);
+ break;
}
-#ifdef _SYSCALL32_IMPL
- } else {
- struct sigevent32 ev32;
- port_notify32_t tim_pnevp32;
+ }
+ }
- if (copyin(evp, &ev32, sizeof (struct oldsigevent32)))
- return (set_errno(EFAULT));
- ev.sigev_notify = ev32.sigev_notify;
- ev.sigev_signo = ev32.sigev_signo;
+ /*
+ * A suitable slot was not found. If possible, allocate (or resize)
+ * the p_itimer array and try again.
+ */
+ if (itp == NULL) {
+ uint_t target_sz = _TIMER_ALLOC_INIT;
+ itimer_t **itp_new;
+
+ if (p->p_itimer != NULL) {
+ ASSERT(p->p_itimer_sz != 0);
+
+ target_sz = p->p_itimer_sz * 2;
+ }
+ /*
+ * Protect against exceeding the max or overflow
+ */
+ if (target_sz > timer_max || target_sz > INT_MAX ||
+ target_sz < p->p_itimer_sz) {
+ kmem_cache_free(clock_timer_cache, it);
+ return (NULL);
+ }
+ mutex_exit(&p->p_lock);
+ itp_new = kmem_zalloc(target_sz * sizeof (itimer_t *),
+ KM_SLEEP);
+ mutex_enter(&p->p_lock);
+ if (target_sz <= p->p_itimer_sz) {
/*
- * See comment in sigqueue32() on handling of 32-bit
- * sigvals in a 64-bit kernel.
+ * A racing thread performed the resize while we were
+ * waiting outside p_lock. Discard our now-useless
+ * allocation and retry.
*/
- ev.sigev_value.sival_int = ev32.sigev_value.sival_int;
- if (ev.sigev_notify == SIGEV_PORT ||
- ev.sigev_notify == SIGEV_THREAD) {
- if (copyin((void *)(uintptr_t)
- ev32.sigev_value.sival_ptr,
- (void *)&tim_pnevp32,
- sizeof (port_notify32_t)))
- return (set_errno(EFAULT));
- tim_pnevp.portnfy_port =
- tim_pnevp32.portnfy_port;
- tim_pnevp.portnfy_user =
- (void *)(uintptr_t)tim_pnevp32.portnfy_user;
+ kmem_free(itp_new, target_sz * sizeof (itimer_t *));
+ goto retry;
+ } else {
+ /*
+ * Instantiate the larger allocation and select the
+ * first fresh entry for use.
+ */
+ if (p->p_itimer != NULL) {
+ uint_t old_sz;
+
+ old_sz = p->p_itimer_sz;
+ bcopy(p->p_itimer, itp_new,
+ old_sz * sizeof (itimer_t *));
+ kmem_free(p->p_itimer,
+ old_sz * sizeof (itimer_t *));
+
+ /*
+ * Short circuit to use the first free entry in
+ * the new allocation. It's possible that
+ * other lower-indexed timers were freed while
+ * p_lock was dropped, but skipping over them
+ * is not harmful at all. In the common case,
+ * we skip the need to walk over an array
+ * filled with timers before arriving at the
+ * slot we know is fresh from the allocation.
+ */
+ i = old_sz;
+ } else {
+ /*
+ * For processes lacking any existing timers,
+ * we can simply select the first entry.
+ */
+ i = 0;
}
-#endif
+ p->p_itimer = itp_new;
+ p->p_itimer_sz = target_sz;
}
- switch (ev.sigev_notify) {
- case SIGEV_NONE:
- break;
- case SIGEV_SIGNAL:
- if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG)
- return (set_errno(EINVAL));
- break;
- case SIGEV_THREAD:
- case SIGEV_PORT:
- break;
- default:
- return (set_errno(EINVAL));
- }
- } else {
- /*
- * Use the clock's default sigevent (this is a structure copy).
- */
- ev = backend->clk_default;
}
+ ASSERT(i <= INT_MAX);
+ *id = (timer_t)i;
+ return (it);
+}
+
+/*
+ * Set up a timer
+ *
+ * This allocates an itimer_t (including a timer_t ID and slot in the process),
+ * wires it up according to the provided sigevent, and associates it with the
+ * desired clock backend. Upon successful completion, the timer will be
+ * locked, preventing it from being armed via timer_settime() or deleted via
+ * timer_delete(). This gives the caller a chance to perform any last-minute
+ * manipulations (such as configuring the IT_CALLBACK functionality and/or
+ * copying the timer_t out to userspace) before using timer_release() to unlock
+ * it or timer_delete_grabbed() to delete it.
+ */
+int
+timer_setup(clock_backend_t *backend, struct sigevent *evp, port_notify_t *pnp,
+ itimer_t **itp, timer_t *tidp)
+{
+ proc_t *p = curproc;
+ int error = 0;
+ itimer_t *it;
+ sigqueue_t *sigq;
+ timer_t tid;
+
/*
- * We'll allocate our timer and sigqueue now, before we grab p_lock.
- * If we can't find an empty slot, we'll free them before returning.
+ * We'll allocate our sigqueue now, before we grab p_lock.
+ * If we can't find an empty slot, we'll free it before returning.
*/
- it = kmem_cache_alloc(clock_timer_cache, KM_SLEEP);
- bzero(it, sizeof (itimer_t));
- mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL);
sigq = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
- mutex_enter(&p->p_lock);
-
/*
- * If this is this process' first timer, we need to attempt to allocate
- * an array of timerstr_t pointers. We drop p_lock to perform the
- * allocation; if we return to discover that p_itimer is non-NULL,
- * we will free our allocation and drive on.
+ * Allocate a timer and choose a slot for it. This acquires p_lock.
*/
- if ((itp = p->p_itimer) == NULL) {
- mutex_exit(&p->p_lock);
- itp = kmem_zalloc(timer_max * sizeof (itimer_t *), KM_SLEEP);
- mutex_enter(&p->p_lock);
-
- if (p->p_itimer == NULL)
- p->p_itimer = itp;
- else {
- kmem_free(itp, timer_max * sizeof (itimer_t *));
- itp = p->p_itimer;
- }
- }
-
- for (i = 0; i < timer_max && itp[i] != NULL; i++)
- continue;
+ it = timer_alloc(p, &tid);
+ ASSERT(MUTEX_HELD(&p->p_lock));
- if (i == timer_max) {
- /*
- * We couldn't find a slot. Drop p_lock, free the preallocated
- * timer and sigqueue, and return an error.
- */
+ if (it == NULL) {
mutex_exit(&p->p_lock);
- kmem_cache_free(clock_timer_cache, it);
kmem_free(sigq, sizeof (sigqueue_t));
-
- return (set_errno(EAGAIN));
+ return (EAGAIN);
}
- ASSERT(i < timer_max && itp[i] == NULL);
+ ASSERT(tid < p->p_itimer_sz && p->p_itimer[tid] == NULL);
+ ASSERT(evp != NULL);
/*
* If we develop other notification mechanisms, this will need
* to call into (yet another) backend.
*/
- sigq->sq_info.si_signo = ev.sigev_signo;
- if (evp == NULL)
- sigq->sq_info.si_value.sival_int = i;
- else
- sigq->sq_info.si_value = ev.sigev_value;
+ sigq->sq_info.si_signo = evp->sigev_signo;
+ sigq->sq_info.si_value = evp->sigev_value;
sigq->sq_info.si_code = SI_TIMER;
sigq->sq_info.si_pid = p->p_pid;
sigq->sq_info.si_ctid = PRCTID(p);
sigq->sq_info.si_zoneid = getzoneid();
- sigq->sq_info.si_uid = crgetruid(cr);
+ sigq->sq_info.si_uid = crgetruid(CRED());
sigq->sq_func = timer_signal;
sigq->sq_next = NULL;
sigq->sq_backptr = it;
it->it_sigq = sigq;
it->it_backend = backend;
it->it_lock = ITLK_LOCKED;
- itp[i] = it;
-
- if (ev.sigev_notify == SIGEV_THREAD ||
- ev.sigev_notify == SIGEV_PORT) {
+ if (evp->sigev_notify == SIGEV_THREAD ||
+ evp->sigev_notify == SIGEV_PORT) {
int port;
+ port_kevent_t *pkevp = NULL;
+
+ ASSERT(pnp != NULL);
/*
* This timer is programmed to use event port notification when
@@ -638,18 +669,17 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)
*/
it->it_flags |= IT_PORT;
- port = tim_pnevp.portnfy_port;
+ port = pnp->portnfy_port;
/* associate timer as event source with the port */
error = port_associate_ksource(port, PORT_SOURCE_TIMER,
(port_source_t **)&it->it_portsrc, timer_close_port,
(void *)it, NULL);
if (error) {
- itp[i] = NULL; /* clear slot */
mutex_exit(&p->p_lock);
kmem_cache_free(clock_timer_cache, it);
kmem_free(sigq, sizeof (sigqueue_t));
- return (set_errno(error));
+ return (error);
}
/* allocate an event structure/slot */
@@ -658,23 +688,24 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)
if (error) {
(void) port_dissociate_ksource(port, PORT_SOURCE_TIMER,
(port_source_t *)it->it_portsrc);
- itp[i] = NULL; /* clear slot */
mutex_exit(&p->p_lock);
kmem_cache_free(clock_timer_cache, it);
kmem_free(sigq, sizeof (sigqueue_t));
- return (set_errno(error));
+ return (error);
}
/* initialize event data */
- port_init_event(pkevp, i, tim_pnevp.portnfy_user,
+ port_init_event(pkevp, tid, pnp->portnfy_user,
timer_port_callback, it);
it->it_portev = pkevp;
it->it_portfd = port;
} else {
- if (ev.sigev_notify == SIGEV_SIGNAL)
+ if (evp->sigev_notify == SIGEV_SIGNAL)
it->it_flags |= IT_SIGNAL;
}
+ /* Populate the slot now that the timer is prepped. */
+ p->p_itimer[tid] = it;
mutex_exit(&p->p_lock);
/*
@@ -687,17 +718,8 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)
it->it_lwp = ttolwp(curthread);
it->it_proc = p;
- if (copyout(&i, tid, sizeof (timer_t)) != 0) {
- error = EFAULT;
- goto err;
- }
-
- /*
- * If we're here, then we have successfully created the timer; we
- * just need to release the timer and return.
- */
- timer_release(p, it);
-
+ *itp = it;
+ *tidp = tid;
return (0);
err:
@@ -708,11 +730,115 @@ err:
* impossible for a removal to be pending.
*/
ASSERT(!(it->it_lock & ITLK_REMOVE));
- timer_delete_grabbed(p, i, it);
+ timer_delete_grabbed(p, tid, it);
- return (set_errno(error));
+ return (error);
}
+
+int
+timer_create(clockid_t clock, struct sigevent *evp, timer_t *tidp)
+{
+ int error = 0;
+ proc_t *p = curproc;
+ clock_backend_t *backend;
+ struct sigevent ev;
+ itimer_t *it;
+ timer_t tid;
+ port_notify_t tim_pnevp;
+
+ if ((backend = CLOCK_BACKEND(clock)) == NULL)
+ return (set_errno(EINVAL));
+
+ if (evp != NULL) {
+ /*
+ * short copyin() for binary compatibility
+ * fetch oldsigevent to determine how much to copy in.
+ */
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ if (copyin(evp, &ev, sizeof (struct oldsigevent)))
+ return (set_errno(EFAULT));
+
+ if (ev.sigev_notify == SIGEV_PORT ||
+ ev.sigev_notify == SIGEV_THREAD) {
+ if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp,
+ sizeof (port_notify_t)))
+ return (set_errno(EFAULT));
+ }
+#ifdef _SYSCALL32_IMPL
+ } else {
+ struct sigevent32 ev32;
+ port_notify32_t tim_pnevp32;
+
+ if (copyin(evp, &ev32, sizeof (struct oldsigevent32)))
+ return (set_errno(EFAULT));
+ ev.sigev_notify = ev32.sigev_notify;
+ ev.sigev_signo = ev32.sigev_signo;
+ /*
+ * See comment in sigqueue32() on handling of 32-bit
+ * sigvals in a 64-bit kernel.
+ */
+ ev.sigev_value.sival_int = ev32.sigev_value.sival_int;
+ if (ev.sigev_notify == SIGEV_PORT ||
+ ev.sigev_notify == SIGEV_THREAD) {
+ if (copyin((void *)(uintptr_t)
+ ev32.sigev_value.sival_ptr,
+ (void *)&tim_pnevp32,
+ sizeof (port_notify32_t)))
+ return (set_errno(EFAULT));
+ tim_pnevp.portnfy_port =
+ tim_pnevp32.portnfy_port;
+ tim_pnevp.portnfy_user =
+ (void *)(uintptr_t)tim_pnevp32.portnfy_user;
+ }
+#endif
+ }
+ switch (ev.sigev_notify) {
+ case SIGEV_NONE:
+ break;
+ case SIGEV_SIGNAL:
+ if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG)
+ return (set_errno(EINVAL));
+ break;
+ case SIGEV_THREAD:
+ case SIGEV_PORT:
+ break;
+ default:
+ return (set_errno(EINVAL));
+ }
+ } else {
+ /*
+ * Use the clock's default sigevent (this is a structure copy).
+ */
+ ev = backend->clk_default;
+ }
+
+ if ((error = timer_setup(backend, &ev, &tim_pnevp, &it, &tid)) != 0) {
+ return (set_errno(error));
+ }
+
+ /*
+ * Populate si_value with the timer ID if no sigevent was passed in.
+ */
+ if (evp == NULL) {
+ it->it_sigq->sq_info.si_value.sival_int = tid;
+ }
+
+ if (copyout(&tid, tidp, sizeof (timer_t)) != 0) {
+ timer_delete_grabbed(p, tid, it);
+ return (set_errno(EFAULT));
+ }
+
+ /*
+ * If we're here, then we have successfully created the timer; we
+ * just need to release the timer and return.
+ */
+ timer_release(p, it);
+
+ return (0);
+}
+
+
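
The block comment above timer_setup() spells out the contract for in-kernel consumers: the function returns with the new timer still locked, so the caller can finish wiring it up (for example, the IT_CALLBACK mode consumed by timer_fire()) before releasing it with timer_release() or discarding it with timer_delete_grabbed(). A hypothetical sketch of such a consumer; the callback, the SIGEV_NONE choice, and the helper names are illustrative only, while timer_setup(), IT_CALLBACK, it_cb_func, and timer_release() come from the hunks above:

static void
xx_timer_fired(itimer_t *it)
{
	/* Invoked from timer_fire(); must not return holding it_mutex. */
}

static int
xx_create_callback_timer(clock_backend_t *be, timer_t *tidp)
{
	struct sigevent ev = be->clk_default;
	itimer_t *it;
	timer_t tid;
	int err;

	ev.sigev_notify = SIGEV_NONE;
	if ((err = timer_setup(be, &ev, NULL, &it, &tid)) != 0)
		return (err);

	/* Still locked: safe to adjust before anyone else can grab it. */
	it->it_flags |= IT_CALLBACK;
	it->it_cb_func = xx_timer_fired;
	timer_release(curproc, it);

	*tidp = tid;
	return (0);
}
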
int
timer_gettime(timer_t tid, itimerspec_t *val)
{
@@ -832,20 +958,23 @@ timer_getoverrun(timer_t tid)
void
timer_lwpexit(void)
{
- timer_t i;
+ uint_t i;
proc_t *p = curproc;
klwp_t *lwp = ttolwp(curthread);
- itimer_t *it, **itp;
+ itimer_t *it;
ASSERT(MUTEX_HELD(&p->p_lock));
- if ((itp = p->p_itimer) == NULL)
+ if (p->p_itimer == NULL) {
return;
+ }
- for (i = 0; i < timer_max; i++) {
- if ((it = itp[i]) == NULL)
+ for (i = 0; i < p->p_itimer_sz; i++) {
+ if ((it = p->p_itimer[i]) == NULL) {
continue;
+ }
+ /* This may drop p_lock temporarily. */
timer_lock(p, it);
if ((it->it_lock & ITLK_REMOVE) || it->it_lwp != lwp) {
@@ -876,20 +1005,22 @@ timer_lwpexit(void)
void
timer_lwpbind()
{
- timer_t i;
+ uint_t i;
proc_t *p = curproc;
klwp_t *lwp = ttolwp(curthread);
- itimer_t *it, **itp;
+ itimer_t *it;
ASSERT(MUTEX_HELD(&p->p_lock));
- if ((itp = p->p_itimer) == NULL)
+ if (p->p_itimer == NULL) {
return;
+ }
- for (i = 0; i < timer_max; i++) {
- if ((it = itp[i]) == NULL)
+ for (i = 0; i < p->p_itimer_sz; i++) {
+ if ((it = p->p_itimer[i]) == NULL)
continue;
+ /* This may drop p_lock temporarily. */
timer_lock(p, it);
if (!(it->it_lock & ITLK_REMOVE) && it->it_lwp == lwp) {
@@ -911,16 +1042,19 @@ timer_lwpbind()
void
timer_exit(void)
{
- timer_t i;
+ uint_t i;
proc_t *p = curproc;
ASSERT(p->p_itimer != NULL);
+ ASSERT(p->p_itimer_sz != 0);
- for (i = 0; i < timer_max; i++)
- (void) timer_delete(i);
+ for (i = 0; i < p->p_itimer_sz; i++) {
+ (void) timer_delete((timer_t)i);
+ }
- kmem_free(p->p_itimer, timer_max * sizeof (itimer_t *));
+ kmem_free(p->p_itimer, p->p_itimer_sz * sizeof (itimer_t *));
p->p_itimer = NULL;
+ p->p_itimer_sz = 0;
}
/*
@@ -977,7 +1111,7 @@ timer_close_port(void *arg, int port, pid_t pid, int lastclose)
for (tid = 0; tid < timer_max; tid++) {
if ((it = timer_grab(p, tid)) == NULL)
continue;
- if (it->it_portev) {
+ if (it->it_flags & IT_PORT) {
mutex_enter(&it->it_mutex);
if (it->it_portfd == port) {
port_kevent_t *pev;
diff --git a/usr/src/uts/common/os/timers.c b/usr/src/uts/common/os/timers.c
index 61acc6cf97..53be806026 100644
--- a/usr/src/uts/common/os/timers.c
+++ b/usr/src/uts/common/os/timers.c
@@ -22,6 +22,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
/*
@@ -1172,6 +1173,14 @@ timespectohz64(timespec_t *tv)
void
hrt2ts(hrtime_t hrt, timestruc_t *tsp)
{
+#if defined(__amd64)
+ /*
+ * The cleverness explained above is unnecessary on x86_64 CPUs where
+ * modern compilers are able to optimize down to faster operations.
+ */
+ tsp->tv_sec = hrt / NANOSEC;
+ tsp->tv_nsec = hrt % NANOSEC;
+#else
uint32_t sec, nsec, tmp;
tmp = (uint32_t)(hrt >> 30);
@@ -1193,20 +1202,28 @@ hrt2ts(hrtime_t hrt, timestruc_t *tsp)
}
tsp->tv_sec = (time_t)sec;
tsp->tv_nsec = nsec;
+#endif /* defined(__amd64) */
}
/*
* Convert from timestruc_t to hrtime_t.
- *
- * The code below is equivalent to:
- *
- * hrt = tsp->tv_sec * NANOSEC + tsp->tv_nsec;
- *
- * but requires no integer multiply.
*/
hrtime_t
ts2hrt(const timestruc_t *tsp)
{
+#if defined(__amd64) || defined(__i386)
+ /*
+ * On modern x86 CPUs, the simple version is faster.
+ */
+ return ((tsp->tv_sec * NANOSEC) + tsp->tv_nsec);
+#else
+ /*
+ * The code below is equivalent to:
+ *
+ * hrt = tsp->tv_sec * NANOSEC + tsp->tv_nsec;
+ *
+ * but requires no integer multiply.
+ */
hrtime_t hrt;
hrt = tsp->tv_sec;
@@ -1215,6 +1232,7 @@ ts2hrt(const timestruc_t *tsp)
hrt = (hrt << 7) - hrt - hrt - hrt;
hrt = (hrt << 9) + tsp->tv_nsec;
return (hrt);
+#endif /* defined(__amd64) || defined(__i386) */
}
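
The retired comment's claim is easy to verify: each (hrt << 7) - hrt - hrt - hrt step computes 128x - 3x = 125x, the unmodified routine applies that step three times (125^3 = 1953125), and the final << 9 multiplies by 512, so the net factor is exactly 1000000000 and the shift form agrees with the plain multiply. A quick standalone check, assuming the three-step structure of the original function:

#include <assert.h>

int
main(void)
{
	long long sec = 1234, nsec = 567, hrt = sec;

	hrt = (hrt << 7) - hrt - hrt - hrt;	/* sec * 125 */
	hrt = (hrt << 7) - hrt - hrt - hrt;	/* sec * 15625 */
	hrt = (hrt << 7) - hrt - hrt - hrt;	/* sec * 1953125 */
	hrt = (hrt << 9) + nsec;		/* sec * 10^9 + nsec */

	assert(hrt == sec * 1000000000LL + nsec);
	return (0);
}
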
/*
@@ -1246,6 +1264,13 @@ tv2hrt(struct timeval *tvp)
void
hrt2tv(hrtime_t hrt, struct timeval *tvp)
{
+#if defined(__amd64)
+ /*
+ * Like hrt2ts, the simple version is faster on x86_64.
+ */
+ tvp->tv_sec = hrt / NANOSEC;
+ tvp->tv_usec = (hrt % NANOSEC) / (NANOSEC / MICROSEC);
+#else
uint32_t sec, nsec, tmp;
uint32_t q, r, t;
@@ -1267,17 +1292,17 @@ hrt2tv(hrtime_t hrt, struct timeval *tvp)
sec++;
}
tvp->tv_sec = (time_t)sec;
-/*
- * this routine is very similar to hr2ts, but requires microseconds
- * instead of nanoseconds, so an interger divide by 1000 routine
- * completes the conversion
- */
+ /*
+ * This routine is very similar to hrt2ts, but requires microseconds
+ * instead of nanoseconds, so an integer divide by 1000 routine
+ * completes the conversion.
+ */
t = (nsec >> 7) + (nsec >> 8) + (nsec >> 12);
q = (nsec >> 1) + t + (nsec >> 15) + (t >> 11) + (t >> 14);
q = q >> 9;
r = nsec - q*1000;
tvp->tv_usec = q + ((r + 24) >> 10);
-
+#endif /* defined(__amd64) */
}
int
diff --git a/usr/src/uts/common/os/vmem.c b/usr/src/uts/common/os/vmem.c
index b79b1b4cf7..e3da4df247 100644
--- a/usr/src/uts/common/os/vmem.c
+++ b/usr/src/uts/common/os/vmem.c
@@ -1621,7 +1621,7 @@ vmem_destroy(vmem_t *vmp)
leaked = vmem_size(vmp, VMEM_ALLOC);
if (leaked != 0)
- cmn_err(CE_WARN, "vmem_destroy('%s'): leaked %lu %s",
+ cmn_err(CE_WARN, "!vmem_destroy('%s'): leaked %lu %s",
vmp->vm_name, leaked, (vmp->vm_cflags & VMC_IDENTIFIER) ?
"identifiers" : "bytes");
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c
index 5a5dc7d107..a8993524ac 100644
--- a/usr/src/uts/common/os/zone.c
+++ b/usr/src/uts/common/os/zone.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015, Joyent Inc. All rights reserved.
+ * Copyright 2016, Joyent Inc.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
@@ -251,6 +251,8 @@
#include <sys/cpucaps.h>
#include <vm/seg.h>
#include <sys/mac.h>
+#include <sys/rt.h>
+#include <sys/fx.h>
/*
* This constant specifies the number of seconds that threads waiting for
@@ -371,8 +373,12 @@ static char *zone_ref_subsys_names[] = {
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_locked_mem;
rctl_hndl_t rc_zone_max_swap;
+rctl_hndl_t rc_zone_phys_mem;
rctl_hndl_t rc_zone_max_lofi;
rctl_hndl_t rc_zone_cpu_cap;
+rctl_hndl_t rc_zone_cpu_baseline;
+rctl_hndl_t rc_zone_cpu_burst_time;
+rctl_hndl_t rc_zone_zfs_io_pri;
rctl_hndl_t rc_zone_nlwps;
rctl_hndl_t rc_zone_nprocs;
rctl_hndl_t rc_zone_shmmax;
@@ -418,8 +424,9 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
* Version 5 alters the zone_boot system call, and converts its old
* bootargs parameter to be set by the zone_setattr API instead.
* Version 6 adds the flag argument to zone_create.
+ * Version 7 adds the requested zoneid to zone_create.
*/
-static const int ZONE_SYSCALL_API_VERSION = 6;
+static const int ZONE_SYSCALL_API_VERSION = 7;
/*
* Certain filesystems (such as NFS and autofs) need to know which zone
@@ -1378,6 +1385,114 @@ static rctl_ops_t zone_cpu_cap_ops = {
/*ARGSUSED*/
static rctl_qty_t
+zone_cpu_base_get(rctl_t *rctl, struct proc *p)
+{
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ return (cpucaps_zone_get_base(p->p_zone));
+}
+
+/*
+ * The zone cpu baseline is used to set the baseline CPU usage for the
+ * zone, so we can track when the zone is bursting.
+ */
+/*ARGSUSED*/
+static int
+zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ zone_t *zone = e->rcep_p.zone;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+
+ if (zone == NULL)
+ return (0);
+
+ return (cpucaps_zone_set_base(zone, nv));
+}
+
+static rctl_ops_t zone_cpu_base_ops = {
+ rcop_no_action,
+ zone_cpu_base_get,
+ zone_cpu_base_set,
+ rcop_no_test
+};
+
+/*ARGSUSED*/
+static rctl_qty_t
+zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p)
+{
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ return (cpucaps_zone_get_burst_time(p->p_zone));
+}
+
+/*
+ * The zone cpu burst time is used to set the amount of time the zone's
+ * CPU(s) can spend bursting.
+ */
+/*ARGSUSED*/
+static int
+zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ zone_t *zone = e->rcep_p.zone;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+
+ if (zone == NULL)
+ return (0);
+
+ return (cpucaps_zone_set_burst_time(zone, nv));
+}
+
+static rctl_ops_t zone_cpu_burst_time_ops = {
+ rcop_no_action,
+ zone_cpu_burst_time_get,
+ zone_cpu_burst_time_set,
+ rcop_no_test
+};
+
+/*
+ * zone.zfs-io-pri resource control support (IO priority).
+ */
+/*ARGSUSED*/
+static rctl_qty_t
+zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p)
+{
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ return (p->p_zone->zone_zfs_io_pri);
+}
+
+/*ARGSUSED*/
+static int
+zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ zone_t *zone = e->rcep_p.zone;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+
+ if (zone == NULL)
+ return (0);
+
+ /*
+ * set priority to the new value.
+ */
+ zone->zone_zfs_io_pri = nv;
+ return (0);
+}
+
+static rctl_ops_t zone_zfs_io_pri_ops = {
+ rcop_no_action,
+ zone_zfs_io_pri_get,
+ zone_zfs_io_pri_set,
+ rcop_no_test
+};
+
+/*ARGSUSED*/
+static rctl_qty_t
zone_lwps_usage(rctl_t *r, proc_t *p)
{
rctl_qty_t nlwps;
@@ -1704,6 +1819,39 @@ static rctl_ops_t zone_max_swap_ops = {
/*ARGSUSED*/
static rctl_qty_t
+zone_phys_mem_usage(rctl_t *rctl, struct proc *p)
+{
+ rctl_qty_t q;
+ zone_t *z = p->p_zone;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ /* No additional lock because not enforced in the kernel */
+ q = z->zone_phys_mem;
+ return (q);
+}
+
+/*ARGSUSED*/
+static int
+zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+ if (e->rcep_p.zone == NULL)
+ return (0);
+ e->rcep_p.zone->zone_phys_mem_ctl = nv;
+ return (0);
+}
+
+static rctl_ops_t zone_phys_mem_ops = {
+ rcop_no_action,
+ zone_phys_mem_usage,
+ zone_phys_mem_set,
+ rcop_no_test
+};
+
+/*ARGSUSED*/
+static rctl_qty_t
zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
{
rctl_qty_t q;
@@ -1797,6 +1945,20 @@ zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
}
static int
+zone_physmem_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_kstat_t *zk = ksp->ks_data;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ zk->zk_usage.value.ui64 = zone->zone_phys_mem;
+ zk->zk_value.value.ui64 = zone->zone_phys_mem_ctl;
+ return (0);
+}
+
+static int
zone_nprocs_kstat_update(kstat_t *ksp, int rw)
{
zone_t *zone = ksp->ks_private;
@@ -1825,7 +1987,7 @@ zone_swapresv_kstat_update(kstat_t *ksp, int rw)
}
static kstat_t *
-zone_kstat_create_common(zone_t *zone, char *name,
+zone_rctl_kstat_create_common(zone_t *zone, char *name,
int (*updatefunc) (kstat_t *, int))
{
kstat_t *ksp;
@@ -1850,6 +2012,160 @@ zone_kstat_create_common(zone_t *zone, char *name,
return (ksp);
}
+static int
+zone_vfs_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_vfs_kstat_t *zvp = ksp->ks_data;
+ kstat_io_t *kiop = &zone->zone_vfs_rwstats;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ /*
+ * Extract the VFS statistics from the kstat_io_t structure used by
+ * kstat_runq_enter() and related functions. Since the slow ops
+ * counters are updated directly by the VFS layer, there's no need to
+ * copy those statistics here.
+ *
+ * Note that kstat_runq_enter() and the related functions use
+ * gethrtime_unscaled(), so scale the time here.
+ */
+ zvp->zv_nread.value.ui64 = kiop->nread;
+ zvp->zv_reads.value.ui64 = kiop->reads;
+ zvp->zv_rtime.value.ui64 = kiop->rtime;
+ zvp->zv_rcnt.value.ui64 = kiop->rcnt;
+ zvp->zv_rlentime.value.ui64 = kiop->rlentime;
+ zvp->zv_nwritten.value.ui64 = kiop->nwritten;
+ zvp->zv_writes.value.ui64 = kiop->writes;
+ zvp->zv_wtime.value.ui64 = kiop->wtime;
+ zvp->zv_wcnt.value.ui64 = kiop->wcnt;
+ zvp->zv_wlentime.value.ui64 = kiop->wlentime;
+
+ scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64);
+ scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64);
+ scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64);
+ scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64);
+
+ return (0);
+}
+
+static kstat_t *
+zone_vfs_kstat_create(zone_t *zone)
+{
+ kstat_t *ksp;
+ zone_vfs_kstat_t *zvp;
+
+ if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id,
+ zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED,
+ sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+ return (NULL);
+
+ if (zone->zone_id != GLOBAL_ZONEID)
+ kstat_zone_add(ksp, GLOBAL_ZONEID);
+
+ zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP);
+ ksp->ks_data_size += strlen(zone->zone_name) + 1;
+ ksp->ks_lock = &zone->zone_vfs_lock;
+ zone->zone_vfs_stats = zvp;
+
+ /* The kstat "name" field is not large enough for a full zonename */
+ kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING);
+ kstat_named_setstr(&zvp->zv_zonename, zone->zone_name);
+ kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64);
+
+ ksp->ks_update = zone_vfs_kstat_update;
+ ksp->ks_private = zone;
+
+ kstat_install(ksp);
+ return (ksp);
+}
+
+static int
+zone_zfs_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_zfs_kstat_t *zzp = ksp->ks_data;
+ kstat_io_t *kiop = &zone->zone_zfs_rwstats;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ /*
+ * Extract the ZFS statistics from the kstat_io_t structure used by
+ * kstat_runq_enter() and related functions. Since the I/O throttle
+ * counters are updated directly by the ZFS layer, there's no need to
+ * copy those statistics here.
+ *
+ * Note that kstat_runq_enter() and the related functions use
+ * gethrtime_unscaled(), so scale the time here.
+ */
+ zzp->zz_nread.value.ui64 = kiop->nread;
+ zzp->zz_reads.value.ui64 = kiop->reads;
+ zzp->zz_rtime.value.ui64 = kiop->rtime;
+ zzp->zz_rlentime.value.ui64 = kiop->rlentime;
+ zzp->zz_nwritten.value.ui64 = kiop->nwritten;
+ zzp->zz_writes.value.ui64 = kiop->writes;
+
+ scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64);
+ scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64);
+
+ return (0);
+}
+
+static kstat_t *
+zone_zfs_kstat_create(zone_t *zone)
+{
+ kstat_t *ksp;
+ zone_zfs_kstat_t *zzp;
+
+ if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id,
+ zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED,
+ sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+ return (NULL);
+
+ if (zone->zone_id != GLOBAL_ZONEID)
+ kstat_zone_add(ksp, GLOBAL_ZONEID);
+
+ zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP);
+ ksp->ks_data_size += strlen(zone->zone_name) + 1;
+ ksp->ks_lock = &zone->zone_zfs_lock;
+ zone->zone_zfs_stats = zzp;
+
+ /* The kstat "name" field is not large enough for a full zonename */
+ kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING);
+ kstat_named_setstr(&zzp->zz_zonename, zone->zone_name);
+ kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64);
+
+ ksp->ks_update = zone_zfs_kstat_update;
+ ksp->ks_private = zone;
+
+ kstat_install(ksp);
+ return (ksp);
+}
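
Both of these kstats are created per zone under the modules "zone_vfs" and "zone_zfs" with the zone name as the kstat name, so a global-zone observer can walk them with libkstat. A hedged userland sketch (compile with -lkstat; only the "reads" counter initialized above is read, and error handling is minimal):

#include <kstat.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	kstat_ctl_t *kc;
	kstat_t *ksp;
	kstat_named_t *kn;

	if ((kc = kstat_open()) == NULL)
		return (1);

	for (ksp = kc->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
		if (strcmp(ksp->ks_module, "zone_vfs") != 0)
			continue;
		if (kstat_read(kc, ksp, NULL) == -1)
			continue;
		if ((kn = kstat_data_lookup(ksp, "reads")) != NULL)
			(void) printf("%s: reads=%llu\n", ksp->ks_name,
			    (u_longlong_t)kn->value.ui64);
	}

	(void) kstat_close(kc);
	return (0);
}
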
static int
zone_mcap_kstat_update(kstat_t *ksp, int rw)
@@ -1860,11 +2176,19 @@ zone_mcap_kstat_update(kstat_t *ksp, int rw)
if (rw == KSTAT_WRITE)
return (EACCES);
+ zmp->zm_rss.value.ui64 = zone->zone_phys_mem;
+ zmp->zm_phys_cap.value.ui64 = zone->zone_phys_mem_ctl;
+ zmp->zm_swap.value.ui64 = zone->zone_max_swap;
+ zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl;
+ zmp->zm_nover.value.ui64 = zone->zone_mcap_nover;
+ zmp->zm_pagedout.value.ui64 = zone->zone_mcap_pagedout;
zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
+ zmp->zm_pf_throttle.value.ui64 = zone->zone_pf_throttle;
+ zmp->zm_pf_throttle_usec.value.ui64 = zone->zone_pf_throttle_usec;
return (0);
}
@@ -1892,12 +2216,22 @@ zone_mcap_kstat_create(zone_t *zone)
/* The kstat "name" field is not large enough for a full zonename */
kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
+ kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);
kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec",
+ KSTAT_DATA_UINT64);
ksp->ks_update = zone_mcap_kstat_update;
ksp->ks_private = zone;
@@ -1935,6 +2269,8 @@ zone_misc_kstat_update(kstat_t *ksp, int rw)
zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
+ zmp->zm_mfseglim.value.ui32 = zone->zone_mfseglim;
+
zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
@@ -1978,6 +2314,8 @@ zone_misc_kstat_create(zone_t *zone)
KSTAT_DATA_UINT32);
kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
+ kstat_named_init(&zmp->zm_mfseglim, "mapfail_seglim",
+ KSTAT_DATA_UINT32);
kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
KSTAT_DATA_UINT32);
kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
@@ -1993,13 +2331,25 @@ zone_misc_kstat_create(zone_t *zone)
static void
zone_kstat_create(zone_t *zone)
{
- zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
+ zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone,
"lockedmem", zone_lockedmem_kstat_update);
- zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
+ zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone,
"swapresv", zone_swapresv_kstat_update);
- zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
+ zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone,
+ "physicalmem", zone_physmem_kstat_update);
+ zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone,
"nprocs", zone_nprocs_kstat_update);
+ if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) {
+ zone->zone_vfs_stats = kmem_zalloc(
+ sizeof (zone_vfs_kstat_t), KM_SLEEP);
+ }
+
+ if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) {
+ zone->zone_zfs_stats = kmem_zalloc(
+ sizeof (zone_zfs_kstat_t), KM_SLEEP);
+ }
+
if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
zone->zone_mcap_stats = kmem_zalloc(
sizeof (zone_mcap_kstat_t), KM_SLEEP);
@@ -2031,8 +2381,15 @@ zone_kstat_delete(zone_t *zone)
sizeof (zone_kstat_t));
zone_kstat_delete_common(&zone->zone_swapresv_kstat,
sizeof (zone_kstat_t));
+ zone_kstat_delete_common(&zone->zone_physmem_kstat,
+ sizeof (zone_kstat_t));
zone_kstat_delete_common(&zone->zone_nprocs_kstat,
sizeof (zone_kstat_t));
+
+ zone_kstat_delete_common(&zone->zone_vfs_ksp,
+ sizeof (zone_vfs_kstat_t));
+ zone_kstat_delete_common(&zone->zone_zfs_ksp,
+ sizeof (zone_zfs_kstat_t));
zone_kstat_delete_common(&zone->zone_mcap_ksp,
sizeof (zone_mcap_kstat_t));
zone_kstat_delete_common(&zone->zone_misc_ksp,
@@ -2070,6 +2427,8 @@ zone_zsd_init(void)
zone0.zone_locked_mem_ctl = UINT64_MAX;
ASSERT(zone0.zone_max_swap == 0);
zone0.zone_max_swap_ctl = UINT64_MAX;
+ zone0.zone_phys_mem = 0;
+ zone0.zone_phys_mem_ctl = UINT64_MAX;
zone0.zone_max_lofi = 0;
zone0.zone_max_lofi_ctl = UINT64_MAX;
zone0.zone_shmmax = 0;
@@ -2094,8 +2453,9 @@ zone_zsd_init(void)
zone0.zone_initname = initname;
zone0.zone_lockedmem_kstat = NULL;
zone0.zone_swapresv_kstat = NULL;
+ zone0.zone_physmem_kstat = NULL;
zone0.zone_nprocs_kstat = NULL;
-
+ zone0.zone_zfs_io_pri = 1;
zone0.zone_stime = 0;
zone0.zone_utime = 0;
zone0.zone_wtime = 0;
@@ -2206,6 +2566,21 @@ zone_init(void)
RCTL_GLOBAL_INFINITE,
MAXCAP, MAXCAP, &zone_cpu_cap_ops);
+ rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline",
+ RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
+ RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
+ MAXCAP, MAXCAP, &zone_cpu_base_ops);
+
+ rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time",
+ RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
+ RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
+ INT_MAX, INT_MAX, &zone_cpu_burst_time_ops);
+
+ rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
+ RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
+ RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
+ 16384, 16384, &zone_zfs_io_pri_ops);
+
rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
INT_MAX, INT_MAX, &zone_lwps_ops);
@@ -2247,6 +2622,20 @@ zone_init(void)
rde = rctl_dict_lookup("zone.cpu-shares");
(void) rctl_val_list_insert(&rde->rcd_default_value, dval);
+ /*
+ * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach
+ * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority''.
+ */
+ dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
+ bzero(dval, sizeof (rctl_val_t));
+ dval->rcv_value = 1;
+ dval->rcv_privilege = RCPRIV_PRIVILEGED;
+ dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
+ dval->rcv_action_recip_pid = -1;
+
+ rde = rctl_dict_lookup("zone.zfs-io-priority");
+ (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
+
rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
@@ -2257,6 +2646,11 @@ zone_init(void)
RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
&zone_max_swap_ops);
+ rc_zone_phys_mem = rctl_register("zone.max-physical-memory",
+ RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
+ RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
+ &zone_phys_mem_ops);
+
rc_zone_max_lofi = rctl_register("zone.max-lofi",
RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
@@ -2278,6 +2672,8 @@ zone_init(void)
zone0.zone_ntasks = 1;
mutex_exit(&p0.p_lock);
zone0.zone_restart_init = B_TRUE;
+ zone0.zone_reboot_on_init_exit = B_FALSE;
+ zone0.zone_init_status = -1;
zone0.zone_brand = &native_brand;
rctl_prealloc_destroy(gp);
/*
@@ -2357,6 +2753,8 @@ zone_init(void)
static void
zone_free(zone_t *zone)
{
+ zone_dl_t *zdl;
+
ASSERT(zone != global_zone);
ASSERT(zone->zone_ntasks == 0);
ASSERT(zone->zone_nlwps == 0);
@@ -2385,6 +2783,19 @@ zone_free(zone_t *zone)
list_destroy(&zone->zone_ref_list);
zone_free_zsd(zone);
zone_free_datasets(zone);
+
+ /*
+ * While dlmgmtd should have removed all of these, it could have left
+ * something behind or crashed, in which case it's not safe for us to
+ * assume that the list is empty (an assumption list_destroy() will
+ * ASSERT). We clean up after our userland comrades, which may have
+ * crashed or, worse, been disabled by SMF.
+ */
+ while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) {
+ if (zdl->zdl_net != NULL)
+ nvlist_free(zdl->zdl_net);
+ kmem_free(zdl, sizeof (zone_dl_t));
+ }
list_destroy(&zone->zone_dl_list);
if (zone->zone_rootvp != NULL)
@@ -2429,12 +2840,18 @@ zone_free(zone_t *zone)
static void
zone_status_set(zone_t *zone, zone_status_t status)
{
+ timestruc_t now;
+ uint64_t t;
nvlist_t *nvl = NULL;
ASSERT(MUTEX_HELD(&zone_status_lock));
ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
status >= zone_status_get(zone));
+ /* Current time since Jan 1 1970, expressed in nanoseconds as consumers expect */
+ gethrestime(&now);
+ t = (now.tv_sec * NANOSEC) + now.tv_nsec;
+
if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
@@ -2442,7 +2859,7 @@ zone_status_set(zone_t *zone, zone_status_t status)
nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
zone_status_table[zone->zone_status]) ||
nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
- nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
+ nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) ||
sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
#ifdef DEBUG
@@ -2520,9 +2937,14 @@ zone_set_brand(zone_t *zone, const char *brand)
return (EINVAL);
}
- /* set up the brand specific data */
+ /*
+ * Set up the brand specific data.
+ * Note that it's possible that the hook has to drop the
+ * zone_status_lock and reacquire it before returning, so we can't
+ * assume the lock has been held the entire time.
+ */
zone->zone_brand = bp;
- ZBROP(zone)->b_init_brand_data(zone);
+ ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock);
mutex_exit(&zone_status_lock);
return (0);
@@ -2594,14 +3016,65 @@ zone_set_initname(zone_t *zone, const char *zone_initname)
return (0);
}
+/*
+ * The zone_set_mcap_nover and zone_set_mcap_pageout functions are used
+ * to provide the physical memory capping kstats. Since physical memory
+ * capping is currently implemented in userland, that code uses the setattr
+ * entry point to increment the kstats. We simply increment zone_mcap_nover
+ * every time the nover setattr is called, and we add the input value to
+ * zone_mcap_pagedout every time the pageout setattr is called.
+ */
+/*ARGSUSED*/
static int
-zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
+zone_set_mcap_nover(zone_t *zone, const uint64_t *zone_nover)
{
- uint64_t mcap;
- int err = 0;
+ zone->zone_mcap_nover++;
+
+ return (0);
+}
+
+static int
+zone_set_mcap_pageout(zone_t *zone, const uint64_t *zone_pageout)
+{
+ uint64_t pageout;
+ int err;
+
+ if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0)
+ zone->zone_mcap_pagedout += pageout;
+
+ return (err);
+}
+
+/*
+ * The zone_set_page_fault_delay function is used to set the number of usecs
+ * to throttle page faults. This is normally 0 but can be set to a non-0 value
+ * by the user-land memory capping code when the zone is over its physical
+ * memory cap.
+ */
+static int
+zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay)
+{
+ uint32_t dusec;
+ int err;
+
+ if ((err = copyin(pfdelay, &dusec, sizeof (uint32_t))) == 0)
+ zone->zone_pg_flt_delay = dusec;
+
+ return (err);
+}
- if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
- zone->zone_phys_mcap = mcap;
+/*
+ * The zone_set_rss function is used to set the zone's RSS when we do the
+ * fast, approximate calculation in user-land.
+ */
+static int
+zone_set_rss(zone_t *zone, const uint64_t *prss)
+{
+ uint64_t rss;
+ int err;
+
+ if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0)
+ zone->zone_phys_mem = rss;
return (err);
}
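
These entry points are intended to be driven from userland (the memory-capping code) through the private zone_setattr() interface, with the attribute constants handled in the zone_setattr() switch below. A hypothetical sketch of a capping daemon reporting a freshly computed RSS and a pageout count; the helper name is invented, and it is assumed that ZONE_ATTR_RSS and ZONE_ATTR_PMCAP_PAGEOUT are defined alongside the other zone attributes in sys/zone.h:

#include <sys/types.h>
#include <sys/zone.h>
#include <zone.h>

/* Hypothetical helper in a userland memory-capping daemon. */
static int
report_usage(zoneid_t zid, uint64_t rss_bytes, uint64_t pagedout)
{
	if (zone_setattr(zid, ZONE_ATTR_RSS, &rss_bytes,
	    sizeof (rss_bytes)) != 0)
		return (-1);
	if (zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT, &pagedout,
	    sizeof (pagedout)) != 0)
		return (-1);
	return (0);
}
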
@@ -3013,6 +3486,12 @@ getzoneid(void)
return (curproc->p_zone->zone_id);
}
+zoneid_t
+getzonedid(void)
+{
+ return (curproc->p_zone->zone_did);
+}
+
/*
* Internal versions of zone_find_by_*(). These don't zone_hold() or
* check the validity of a zone's state.
@@ -3756,6 +4235,17 @@ zone_start_init(void)
*/
z->zone_proc_initpid = p->p_pid;
+ if (z->zone_setup_app_contract == B_TRUE) {
+ /*
+ * Normally a process cannot modify its own contract, but we're
+ * just starting the zone's init process and its contract is
+ * always initialized from the sys_process_tmpl template, so
+ * this is the simplest way to set up init's contract to kill
+ * the process if any other process in the contract exits.
+ */
+ p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT;
+ }
+
/*
* We maintain zone_boot_err so that we can return the cause of the
* failure back to the caller of the zone_boot syscall.
@@ -3784,9 +4274,54 @@ zone_start_init(void)
lwp_exit();
}
} else {
+ id_t cid = curthread->t_cid;
+
if (zone_status_get(z) == ZONE_IS_BOOTING)
zone_status_set(z, ZONE_IS_RUNNING);
mutex_exit(&zone_status_lock);
+
+ mutex_enter(&class_lock);
+ ASSERT(cid < loaded_classes);
+ if (strcmp(sclass[cid].cl_name, "FX") == 0 &&
+ z->zone_fixed_hipri) {
+ /*
+ * If the zone is using FX then by default all
+ * processes start at the lowest priority and stay
+ * there. We provide a mechanism for the zone to
+ * indicate that it should run at "high priority". In
+ * this case we set up init to run at the highest FX
+ * priority (which is one level higher than the
+ * non-fixed scheduling classes can use).
+ */
+ pcparms_t pcparms;
+
+ pcparms.pc_cid = cid;
+ ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI;
+ ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim =
+ FXMAXUPRI;
+ ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags =
+ FX_DOUPRILIM | FX_DOUPRI;
+
+ mutex_enter(&pidlock);
+ mutex_enter(&curproc->p_lock);
+
+ (void) parmsset(&pcparms, curthread);
+
+ mutex_exit(&curproc->p_lock);
+ mutex_exit(&pidlock);
+ } else if (strcmp(sclass[cid].cl_name, "RT") == 0) {
+ /*
+ * zsched always starts the init lwp at priority
+ * minclsyspri - 1. This priority gets set in t_pri and
+ * is invalid for RT, but RT never uses t_pri. However
+ * t_pri is used by procfs, so we always see processes
+ * within an RT zone with an invalid priority value.
+ * We fix that up now.
+ */
+ curthread->t_pri = RTGPPRIO0;
+ }
+ mutex_exit(&class_lock);
+
/* cause the process to return to userland. */
lwp_rtt();
}
@@ -4272,8 +4807,9 @@ parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
error = EINVAL;
name = nvpair_name(nvp);
- if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
- != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
+ if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 &&
+ strncmp(name, "project.", sizeof ("project.") - 1) != 0) ||
+ nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
goto out;
}
if ((hndl = rctl_hndl_lookup(name)) == -1) {
@@ -4392,7 +4928,7 @@ zone_create(const char *zone_name, const char *zone_root,
caddr_t rctlbuf, size_t rctlbufsz,
caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
int match, uint32_t doi, const bslabel_t *label,
- int flags)
+ int flags, zoneid_t zone_did)
{
struct zsched_arg zarg;
nvlist_t *rctls = NULL;
@@ -4464,6 +5000,7 @@ zone_create(const char *zone_name, const char *zone_root,
zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
zone->zone_id = zoneid;
+ zone->zone_did = zone_did;
zone->zone_status = ZONE_IS_UNINITIALIZED;
zone->zone_pool = pool_default;
zone->zone_pool_mod = gethrtime();
@@ -4471,6 +5008,8 @@ zone_create(const char *zone_name, const char *zone_root,
zone->zone_ncpus = 0;
zone->zone_ncpus_online = 0;
zone->zone_restart_init = B_TRUE;
+ zone->zone_reboot_on_init_exit = B_FALSE;
+ zone->zone_init_status = -1;
zone->zone_brand = &native_brand;
zone->zone_initname = NULL;
mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -4538,10 +5077,14 @@ zone_create(const char *zone_name, const char *zone_root,
zone->zone_locked_mem_ctl = UINT64_MAX;
zone->zone_max_swap = 0;
zone->zone_max_swap_ctl = UINT64_MAX;
+ zone->zone_phys_mem = 0;
+ zone->zone_phys_mem_ctl = UINT64_MAX;
zone->zone_max_lofi = 0;
zone->zone_max_lofi_ctl = UINT64_MAX;
- zone0.zone_lockedmem_kstat = NULL;
- zone0.zone_swapresv_kstat = NULL;
+ zone->zone_lockedmem_kstat = NULL;
+ zone->zone_swapresv_kstat = NULL;
+ zone->zone_physmem_kstat = NULL;
+ zone->zone_zfs_io_pri = 1;
/*
* Zsched initializes the rctls.
@@ -4696,8 +5239,8 @@ zone_create(const char *zone_name, const char *zone_root,
/*
* The process, task, and project rctls are probably wrong;
* we need an interface to get the default values of all rctls,
- * and initialize zsched appropriately. I'm not sure that that
- * makes much of a difference, though.
+ * and initialize zsched appropriately. However, we allow zoneadmd
+ * to pass down both zone and project rctls for the zone's init.
*/
error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
if (error != 0) {
@@ -4836,6 +5379,7 @@ zone_boot(zoneid_t zoneid)
static int
zone_empty(zone_t *zone)
{
+ int cnt = 0;
int waitstatus;
/*
@@ -4846,7 +5390,16 @@ zone_empty(zone_t *zone)
ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
while ((waitstatus = zone_status_timedwait_sig(zone,
ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
- killall(zone->zone_id);
+ boolean_t force = B_FALSE;
+
+ /* Every 30 seconds, try harder */
+ if (cnt++ >= 30) {
+ cmn_err(CE_WARN, "attempt to force kill zone %d\n",
+ zone->zone_id);
+ force = B_TRUE;
+ cnt = 0;
+ }
+ killall(zone->zone_id, force);
}
/*
* return EINTR if we were signaled
@@ -5597,14 +6150,6 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
error = EFAULT;
}
break;
- case ZONE_ATTR_PHYS_MCAP:
- size = sizeof (zone->zone_phys_mcap);
- if (bufsize > size)
- bufsize = size;
- if (buf != NULL &&
- copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
- error = EFAULT;
- break;
case ZONE_ATTR_SCHED_CLASS:
mutex_enter(&class_lock);
@@ -5666,6 +6211,23 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
}
kmem_free(zbuf, bufsize);
break;
+ case ZONE_ATTR_DID:
+ size = sizeof (zoneid_t);
+ if (bufsize > size)
+ bufsize = size;
+
+ if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0)
+ error = EFAULT;
+ break;
+ case ZONE_ATTR_SCHED_FIXEDHI:
+ size = sizeof (boolean_t);
+ if (bufsize > size)
+ bufsize = size;
+
+ if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf,
+ bufsize) != 0)
+ error = EFAULT;
+ break;
default:
if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
size = bufsize;
@@ -5697,10 +6259,11 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
return (set_errno(EPERM));
/*
- * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
- * global zone.
+ * Only the ZONE_ATTR_PMCAP_NOVER and ZONE_ATTR_PMCAP_PAGEOUT
+ * attributes can be set on the global zone.
*/
- if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
+ if (zoneid == GLOBAL_ZONEID &&
+ attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT) {
return (set_errno(EINVAL));
}
@@ -5717,7 +6280,9 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
* non-global zones.
*/
zone_status = zone_status_get(zone);
- if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
+ if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT &&
+ attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS &&
+ zone_status > ZONE_IS_READY) {
err = EINVAL;
goto done;
}
@@ -5739,12 +6304,21 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
case ZONE_ATTR_FS_ALLOWED:
err = zone_set_fs_allowed(zone, (const char *)buf);
break;
+ case ZONE_ATTR_PMCAP_NOVER:
+ err = zone_set_mcap_nover(zone, (const uint64_t *)buf);
+ break;
+ case ZONE_ATTR_PMCAP_PAGEOUT:
+ err = zone_set_mcap_pageout(zone, (const uint64_t *)buf);
+ break;
+ case ZONE_ATTR_PG_FLT_DELAY:
+ err = zone_set_page_fault_delay(zone, (const uint32_t *)buf);
+ break;
+ case ZONE_ATTR_RSS:
+ err = zone_set_rss(zone, (const uint64_t *)buf);
+ break;
case ZONE_ATTR_SECFLAGS:
err = zone_set_secflags(zone, (psecflags_t *)buf);
break;
- case ZONE_ATTR_PHYS_MCAP:
- err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
- break;
case ZONE_ATTR_SCHED_CLASS:
err = zone_set_sched_class(zone, (const char *)buf);
break;
@@ -5772,6 +6346,22 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
err = zone_set_network(zoneid, zbuf);
kmem_free(zbuf, bufsize);
break;
+ case ZONE_ATTR_APP_SVC_CT:
+ if (bufsize != sizeof (boolean_t)) {
+ err = EINVAL;
+ } else {
+ zone->zone_setup_app_contract = (boolean_t)buf;
+ err = 0;
+ }
+ break;
+ case ZONE_ATTR_SCHED_FIXEDHI:
+ if (bufsize != sizeof (boolean_t)) {
+ err = EINVAL;
+ } else {
+ zone->zone_fixed_hipri = (boolean_t)buf;
+ err = 0;
+ }
+ break;
default:
if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
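
Also for illustration, not part of the patch: a hedged sketch of how a management daemon might drive the new memory-cap attribute from userland. zone_setattr() is the existing libc wrapper for zone attributes; reading ZONE_ATTR_PMCAP_NOVER as "set the physical memory cap, in bytes, without immediately enforcing any existing overage" is an assumption based on the zone_set_mcap_nover() handler name above.

/*
 * Illustrative sketch only.  The interpretation of ZONE_ATTR_PMCAP_NOVER
 * is an assumption based on the kernel handler name.
 */
#include <sys/types.h>
#include <zone.h>
#include <stdio.h>

int
set_phys_mcap(zoneid_t zid, uint64_t cap_bytes)
{
	/* The kernel handler expects a pointer to a uint64_t value. */
	if (zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, &cap_bytes,
	    sizeof (cap_bytes)) == -1) {
		perror("zone_setattr(ZONE_ATTR_PMCAP_NOVER)");
		return (-1);
	}
	return (0);
}
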
@@ -6475,6 +7065,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
zs.doi = zs32.doi;
zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
zs.flags = zs32.flags;
+ zs.zoneid = zs32.zoneid;
#else
panic("get_udatamodel() returned bogus result\n");
#endif
@@ -6485,7 +7076,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
(caddr_t)zs.rctlbuf, zs.rctlbufsz,
(caddr_t)zs.zfsbuf, zs.zfsbufsz,
zs.extended_error, zs.match, zs.doi,
- zs.label, zs.flags));
+ zs.label, zs.flags, zs.zoneid));
case ZONE_BOOT:
return (zone_boot((zoneid_t)(uintptr_t)arg1));
case ZONE_DESTROY:
@@ -6586,6 +7177,7 @@ zone_ki_call_zoneadmd(struct zarg *zargp)
bcopy(zone->zone_name, zone_name, zone_namelen);
zoneid = zone->zone_id;
uniqid = zone->zone_uniqid;
+ arg.status = zone->zone_init_status;
/*
* zoneadmd may be down, but at least we can empty out the zone.
* We can ignore the return value of zone_empty() since we're called
@@ -6763,7 +7355,7 @@ zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
* zone_ki_call_zoneadmd() will do a more thorough job of this
* later.
*/
- killall(zone->zone_id);
+ killall(zone->zone_id, B_FALSE);
/*
* Now, create the thread to contact zoneadmd and do the rest of the
* work. This thread can't be created in our zone otherwise
@@ -6826,16 +7418,15 @@ zone_shutdown_global(void)
}
/*
- * Returns true if the named dataset is visible in the current zone.
+ * Returns true if the named dataset is visible in the specified zone.
* The 'write' parameter is set to 1 if the dataset is also writable.
*/
int
-zone_dataset_visible(const char *dataset, int *write)
+zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write)
{
static int zfstype = -1;
zone_dataset_t *zd;
size_t len;
- zone_t *zone = curproc->p_zone;
const char *name = NULL;
vfs_t *vfsp = NULL;
@@ -6903,7 +7494,8 @@ zone_dataset_visible(const char *dataset, int *write)
vfs_list_read_lock();
vfsp = zone->zone_vfslist;
do {
- ASSERT(vfsp);
+ if (vfsp == NULL)
+ break;
if (vfsp->vfs_fstype == zfstype) {
name = refstr_value(vfsp->vfs_resource);
@@ -6940,6 +7532,18 @@ zone_dataset_visible(const char *dataset, int *write)
}
/*
+ * Returns true if the named dataset is visible in the current zone.
+ * The 'write' parameter is set to 1 if the dataset is also writable.
+ */
+int
+zone_dataset_visible(const char *dataset, int *write)
+{
+ zone_t *zone = curproc->p_zone;
+
+ return (zone_dataset_visible_inzone(zone, dataset, write));
+}
+
+/*
* zone_find_by_any_path() -
*
* kernel-private routine similar to zone_find_by_path(), but which
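
Not part of the patch: a rough kernel-side sketch of a consumer that already knows a zone ID (rather than running inside the target zone) using the new _inzone variant. zone_find_by_id() and zone_rele() are the existing reference-counting interfaces in this file; the helper name below is hypothetical.

/* Hypothetical kernel-side helper, for illustration only. */
static int
dataset_visible_to_zoneid(zoneid_t zoneid, const char *dataset, int *write)
{
	zone_t *zone;
	int visible;

	/* Take a hold on the zone so it cannot go away during the check. */
	if ((zone = zone_find_by_id(zoneid)) == NULL)
		return (0);

	visible = zone_dataset_visible_inzone(zone, dataset, write);
	zone_rele(zone);

	return (visible);
}
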
@@ -7164,6 +7768,13 @@ zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
mutex_exit(&zone->zone_lock);
zone_rele(zone);
+ /*
+ * Prevent returning a negative count via *nump -- we should
+ * never have this many links anyway.
+ */
+ if (num > INT_MAX)
+ return (set_errno(EOVERFLOW));
+
/* Increased or decreased, caller should be notified. */
if (num != dlcount) {
if (copyout(&num, nump, sizeof (num)) != 0)