Diffstat (limited to 'usr/src/uts/common/os')
-rw-r--r--  usr/src/uts/common/os/acct.c             19
-rw-r--r--  usr/src/uts/common/os/brand.c           185
-rw-r--r--  usr/src/uts/common/os/clock_highres.c    59
-rw-r--r--  usr/src/uts/common/os/contract.c          6
-rw-r--r--  usr/src/uts/common/os/core.c              4
-rw-r--r--  usr/src/uts/common/os/cpu.c             204
-rw-r--r--  usr/src/uts/common/os/cred.c              8
-rw-r--r--  usr/src/uts/common/os/cyclic.c           58
-rw-r--r--  usr/src/uts/common/os/ddi_intr_irm.c      2
-rw-r--r--  usr/src/uts/common/os/exec.c            121
-rw-r--r--  usr/src/uts/common/os/exit.c            319
-rw-r--r--  usr/src/uts/common/os/fio.c              35
-rw-r--r--  usr/src/uts/common/os/fork.c             52
-rw-r--r--  usr/src/uts/common/os/grow.c             34
-rw-r--r--  usr/src/uts/common/os/id_space.c        159
-rw-r--r--  usr/src/uts/common/os/ipc.c              26
-rw-r--r--  usr/src/uts/common/os/kmem.c             40
-rw-r--r--  usr/src/uts/common/os/kstat_fr.c         11
-rw-r--r--  usr/src/uts/common/os/lgrp.c              4
-rw-r--r--  usr/src/uts/common/os/logsubr.c           6
-rw-r--r--  usr/src/uts/common/os/lwp.c             129
-rw-r--r--  usr/src/uts/common/os/main.c             12
-rw-r--r--  usr/src/uts/common/os/mem_config.c        3
-rw-r--r--  usr/src/uts/common/os/mmapobj.c          13
-rw-r--r--  usr/src/uts/common/os/modctl.c            6
-rw-r--r--  usr/src/uts/common/os/modsysfile.c       27
-rw-r--r--  usr/src/uts/common/os/pid.c              27
-rw-r--r--  usr/src/uts/common/os/policy.c           32
-rw-r--r--  usr/src/uts/common/os/priv_defs           8
-rw-r--r--  usr/src/uts/common/os/rctl.c             32
-rw-r--r--  usr/src/uts/common/os/rctl_proc.c        28
-rw-r--r--  usr/src/uts/common/os/sched.c            15
-rw-r--r--  usr/src/uts/common/os/schedctl.c         18
-rw-r--r--  usr/src/uts/common/os/shm.c              41
-rw-r--r--  usr/src/uts/common/os/sig.c             100
-rw-r--r--  usr/src/uts/common/os/smb_subr.c          8
-rw-r--r--  usr/src/uts/common/os/streamio.c         46
-rw-r--r--  usr/src/uts/common/os/strsubr.c           7
-rw-r--r--  usr/src/uts/common/os/sunddi.c           14
-rw-r--r--  usr/src/uts/common/os/sysent.c           37
-rw-r--r--  usr/src/uts/common/os/timer.c           446
-rw-r--r--  usr/src/uts/common/os/timers.c           49
-rw-r--r--  usr/src/uts/common/os/vm_pageout.c      875
-rw-r--r--  usr/src/uts/common/os/vmem.c              2
-rw-r--r--  usr/src/uts/common/os/zone.c           1175
45 files changed, 3465 insertions(+), 1037 deletions(-)
diff --git a/usr/src/uts/common/os/acct.c b/usr/src/uts/common/os/acct.c
index e598e0d08d..891c4e0836 100644
--- a/usr/src/uts/common/os/acct.c
+++ b/usr/src/uts/common/os/acct.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2017, Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -47,6 +48,7 @@
#include <sys/time.h>
#include <sys/msacct.h>
#include <sys/zone.h>
+#include <sys/brand.h>
/*
* Each zone has its own accounting settings (on or off) and associated
@@ -373,7 +375,7 @@ acct_compress(ulong_t t)
* On exit, write a record on the accounting file.
*/
void
-acct(char st)
+acct(int st)
{
struct vnode *vp;
struct cred *cr;
@@ -402,6 +404,21 @@ acct(char st)
* This only gets called from exit after all lwp's have exited so no
* cred locking is needed.
*/
+
+ /* If there is a brand-specific hook, use it instead */
+ if (ZONE_IS_BRANDED(curzone) && ZBROP(curzone)->b_acct_out != NULL) {
+ ZBROP(curzone)->b_acct_out(vp, st);
+ mutex_exit(&ag->aclock);
+ return;
+ }
+
+ /*
+ * The 'st' status value was traditionally masked this way by our
+ * caller, but we now accept the unmasked value for brand handling.
+ * Zones not using the brand hook mask the status here.
+ */
+ st &= 0xff;
+
p = curproc;
ua = PTOU(p);
bcopy(ua->u_comm, ag->acctbuf.ac_comm, sizeof (ag->acctbuf.ac_comm));
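The value of widening acct()'s argument shows up in the brand hook above: a branded zone's b_acct_out receives the full wstat() encoding, while the native path still masks to the traditional low byte. A minimal sketch of a brand-side hook matching the call site (the record-writing helper is hypothetical):

    static void
    mybrand_acct_out(struct vnode *vp, int st)
    {
        /* 'st' arrives unmasked, so the full wstat() encoding is visible */
        mybrand_write_acct_record(vp, st);    /* hypothetical helper */
    }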
diff --git a/usr/src/uts/common/os/brand.c b/usr/src/uts/common/os/brand.c
index 0af67f5d98..62c3bbe2d6 100644
--- a/usr/src/uts/common/os/brand.c
+++ b/usr/src/uts/common/os/brand.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016, Joyent, Inc.
*/
#include <sys/kmem.h>
@@ -45,7 +46,7 @@ struct brand_mach_ops native_mach_ops = {
};
#else /* !__sparcv9 */
struct brand_mach_ops native_mach_ops = {
- NULL, NULL, NULL, NULL
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL
};
#endif /* !__sparcv9 */
@@ -53,7 +54,8 @@ brand_t native_brand = {
BRAND_VER_1,
"native",
NULL,
- &native_mach_ops
+ &native_mach_ops,
+ 0
};
/*
@@ -310,46 +312,115 @@ brand_unregister_zone(struct brand *bp)
mutex_exit(&brand_list_lock);
}
-void
-brand_setbrand(proc_t *p)
+int
+brand_setbrand(proc_t *p, boolean_t lwps_ok)
{
brand_t *bp = p->p_zone->zone_brand;
+ void *brand_data = NULL;
- ASSERT(bp != NULL);
- ASSERT(p->p_brand == &native_brand);
+ VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+ VERIFY(bp != NULL);
/*
- * We should only be called from exec(), when we know the process
- * is single-threaded.
+ * Process branding occurs during fork() and exec(). When it happens
+ * during fork(), the LWP count will always be 0 since branding is
+ * performed as part of getproc(), before LWPs have been associated.
+ * The same is not true during exec(), where a multi-LWP process may
+ * undergo branding just prior to gexec(). This is to ensure
+ * exec-related brand hooks are available. While it may seem
+ * complicated to brand a multi-LWP process, the two possible outcomes
+ * simplify things:
+ *
+ * 1. The exec() succeeds: LWPs besides the caller will be killed and
+ * any further branding will occur in a single-LWP context.
+ * 2. The exec() fails: The process will be promptly unbranded since
+ * the hooks are no longer needed.
+ *
+ * To prevent inconsistent brand state from being encountered during
+ * the exec(), LWPs beyond the caller which are associated with this
+ * process must be held temporarily. They will be released either when
+ * they are killed during a successful exec(), or when the brand is
+ * cleared after exec() failure.
*/
- ASSERT(p->p_tlist == p->p_tlist->t_forw);
+ if (lwps_ok) {
+ /*
+ * We've been called from an exec() context, where tolerating the
+ * existence of multiple LWPs during branding is necessary.
+ */
+ VERIFY(p == curproc);
+ VERIFY(p->p_tlist != NULL);
+ if (p->p_tlist != p->p_tlist->t_forw) {
+ /*
+ * Multiple LWPs are present. Hold all but the caller.
+ */
+ if (!holdlwps(SHOLDFORK1)) {
+ return (-1);
+ }
+ }
+ } else {
+ /*
+ * Processes branded during fork() should not have LWPs at all.
+ */
+ VERIFY(p->p_tlist == NULL);
+ }
+
+ if (bp->b_data_size > 0) {
+ brand_data = kmem_zalloc(bp->b_data_size, KM_SLEEP);
+ }
+
+ mutex_enter(&p->p_lock);
+ ASSERT(!PROC_IS_BRANDED(p));
p->p_brand = bp;
+ p->p_brand_data = brand_data;
ASSERT(PROC_IS_BRANDED(p));
BROP(p)->b_setbrand(p);
+ mutex_exit(&p->p_lock);
+ return (0);
}
void
-brand_clearbrand(proc_t *p, boolean_t no_lwps)
+brand_clearbrand(proc_t *p, boolean_t lwps_ok)
{
brand_t *bp = p->p_zone->zone_brand;
- klwp_t *lwp = NULL;
- ASSERT(bp != NULL);
- ASSERT(!no_lwps || (p->p_tlist == NULL));
+ void *brand_data;
- /*
- * If called from exec_common() or proc_exit(),
- * we know the process is single-threaded.
- * If called from fork_fail, p_tlist is NULL.
- */
- if (!no_lwps) {
- ASSERT(p->p_tlist == p->p_tlist->t_forw);
- lwp = p->p_tlist->t_lwp;
- }
+ VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+ VERIFY(bp != NULL);
+ VERIFY(PROC_IS_BRANDED(p));
- ASSERT(PROC_IS_BRANDED(p));
- BROP(p)->b_proc_exit(p, lwp);
+ if (BROP(p)->b_clearbrand != NULL)
+ BROP(p)->b_clearbrand(p, lwps_ok);
+
+ mutex_enter(&p->p_lock);
p->p_brand = &native_brand;
+ brand_data = p->p_brand_data;
+ p->p_brand_data = NULL;
+
+ if (lwps_ok) {
+ VERIFY(p == curproc);
+ /*
+ * A process with multiple LWPs is being de-branded after
+ * failing an exec. The other LWPs were held as part of the
+ * procedure, so they must be resumed now.
+ */
+ if (p->p_tlist != NULL && p->p_tlist != p->p_tlist->t_forw) {
+ continuelwps(p);
+ }
+ } else {
+ /*
+ * While clearing the brand, it's ok for one LWP to be present.
+ * This happens when a native binary is executed inside a
+ * branded zone, since the brand will be removed during the
+ * course of a successful exec.
+ */
+ VERIFY(p->p_tlist == NULL || p->p_tlist == p->p_tlist->t_forw);
+ }
+ mutex_exit(&p->p_lock);
+
+ if (brand_data != NULL) {
+ kmem_free(brand_data, bp->b_data_size);
+ }
}
#if defined(__sparcv9)
@@ -483,7 +554,7 @@ brand_solaris_cmd(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3,
return (ENOSYS);
/* For all other operations this must be a branded process. */
- if (p->p_brand == &native_brand)
+ if (!PROC_IS_BRANDED(p))
return (ENOSYS);
ASSERT(p->p_brand == pbrand);
@@ -601,15 +672,15 @@ restoreexecenv(struct execenv *ep, stack_t *sp)
int
brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file,
- cred_t *cred, int brand_action, struct brand *pbrand, char *bname,
- char *brandlib, char *brandlib32, char *brandlinker, char *brandlinker32)
+ cred_t *cred, int *brand_action, struct brand *pbrand, char *bname,
+ char *brandlib, char *brandlib32)
{
vnode_t *nvp;
Ehdr ehdr;
Addr uphdr_vaddr;
intptr_t voffset;
- int interp;
+ char *interp;
int i, err;
struct execenv env;
struct execenv origenv;
@@ -619,7 +690,6 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
klwp_t *lwp = ttolwp(curthread);
brand_proc_data_t *spd;
brand_elf_data_t sed, *sedp;
- char *linker;
uintptr_t lddata; /* lddata of executable's linker */
ASSERT(curproc->p_brand == pbrand);
@@ -636,12 +706,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
*/
if (args->to_model == DATAMODEL_NATIVE) {
args->emulator = brandlib;
- linker = brandlinker;
}
#if defined(_LP64)
else {
args->emulator = brandlib32;
- linker = brandlinker32;
}
#endif /* _LP64 */
@@ -725,7 +793,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
if (args->to_model == DATAMODEL_NATIVE) {
err = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr,
&voffset, exec_file, &interp, &env.ex_bssbase,
- &env.ex_brkbase, &env.ex_brksize, NULL);
+ &env.ex_brkbase, &env.ex_brksize, NULL, NULL);
}
#if defined(_LP64)
else {
@@ -733,7 +801,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
Elf32_Addr uphdr_vaddr32;
err = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32,
&voffset, exec_file, &interp, &env.ex_bssbase,
- &env.ex_brkbase, &env.ex_brksize, NULL);
+ &env.ex_brkbase, &env.ex_brksize, NULL, NULL);
Ehdr32to64(&ehdr32, &ehdr);
if (uphdr_vaddr32 == (Elf32_Addr)-1)
@@ -744,6 +812,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
#endif /* _LP64 */
if (err != 0) {
restoreexecenv(&origenv, &orig_sigaltstack);
+
+ if (interp != NULL)
+ kmem_free(interp, MAXPATHLEN);
+
return (err);
}
@@ -761,7 +833,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
sedp->sed_phent = ehdr.e_phentsize;
sedp->sed_phnum = ehdr.e_phnum;
- if (interp) {
+ if (interp != NULL) {
if (ehdr.e_type == ET_DYN) {
/*
* This is a shared object executable, so we
@@ -777,16 +849,20 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
* it in and store relevant information about it in the
* aux vector, where the brand library can find it.
*/
- if ((err = lookupname(linker, UIO_SYSSPACE,
+ if ((err = lookupname(interp, UIO_SYSSPACE,
FOLLOW, NULLVPP, &nvp)) != 0) {
- uprintf("%s: not found.", brandlinker);
+ uprintf("%s: not found.", interp);
restoreexecenv(&origenv, &orig_sigaltstack);
+ kmem_free(interp, MAXPATHLEN);
return (err);
}
+
+ kmem_free(interp, MAXPATHLEN);
+
if (args->to_model == DATAMODEL_NATIVE) {
err = mapexec_brand(nvp, args, &ehdr,
&uphdr_vaddr, &voffset, exec_file, &interp,
- NULL, NULL, NULL, &lddata);
+ NULL, NULL, NULL, &lddata, NULL);
}
#if defined(_LP64)
else {
@@ -794,7 +870,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
Elf32_Addr uphdr_vaddr32;
err = mapexec32_brand(nvp, args, &ehdr32,
&uphdr_vaddr32, &voffset, exec_file, &interp,
- NULL, NULL, NULL, &lddata);
+ NULL, NULL, NULL, &lddata, NULL);
Ehdr32to64(&ehdr32, &ehdr);
if (uphdr_vaddr32 == (Elf32_Addr)-1)
@@ -934,9 +1010,9 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,
/*
* Third, the /proc aux vectors set up by elfexec() point to
- * brand emulation library and it's linker. Copy these to the
+ * brand emulation library and its linker. Copy these to the
* /proc brand specific aux vector, and update the regular
- * /proc aux vectors to point to the executable (and it's
+ * /proc aux vectors to point to the executable (and its
* linker). This will enable debuggers to access the
* executable via the usual /proc or elf notes aux vectors.
*
@@ -1078,55 +1154,31 @@ brand_solaris_freelwp(klwp_t *l, struct brand *pbrand)
}
/*ARGSUSED*/
-int
+void
brand_solaris_initlwp(klwp_t *l, struct brand *pbrand)
{
ASSERT(l->lwp_procp->p_brand == pbrand);
ASSERT(l->lwp_procp->p_brand_data != NULL);
ASSERT(l->lwp_brand == NULL);
l->lwp_brand = (void *)-1;
- return (0);
}
/*ARGSUSED*/
void
brand_solaris_lwpexit(klwp_t *l, struct brand *pbrand)
{
- proc_t *p = l->lwp_procp;
-
ASSERT(l->lwp_procp->p_brand == pbrand);
ASSERT(l->lwp_procp->p_brand_data != NULL);
ASSERT(l->lwp_brand != NULL);
-
- /*
- * We should never be called for the last thread in a process.
- * (That case is handled by brand_solaris_proc_exit().)
- * Therefore this lwp must be exiting from a multi-threaded
- * process.
- */
- ASSERT(p->p_tlist != p->p_tlist->t_forw);
-
- l->lwp_brand = NULL;
}
/*ARGSUSED*/
void
-brand_solaris_proc_exit(struct proc *p, klwp_t *l, struct brand *pbrand)
+brand_solaris_proc_exit(struct proc *p, struct brand *pbrand)
{
ASSERT(p->p_brand == pbrand);
ASSERT(p->p_brand_data != NULL);
- /*
- * When called from proc_exit(), we know that process is
- * single-threaded and free our lwp brand data.
- * otherwise just free p_brand_data and return.
- */
- if (l != NULL) {
- ASSERT(p->p_tlist == p->p_tlist->t_forw);
- ASSERT(p->p_tlist->t_lwp == l);
- (void) brand_solaris_freelwp(l, pbrand);
- }
-
/* upon exit, free our proc brand data */
kmem_free(p->p_brand_data, sizeof (brand_proc_data_t));
p->p_brand_data = NULL;
@@ -1145,5 +1197,4 @@ brand_solaris_setbrand(proc_t *p, struct brand *pbrand)
ASSERT(p->p_tlist == p->p_tlist->t_forw);
p->p_brand_data = kmem_zalloc(sizeof (brand_proc_data_t), KM_SLEEP);
- (void) brand_solaris_initlwp(p->p_tlist->t_lwp, pbrand);
}
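Paired with the exec.c changes later in this diff, the new setbrand/clearbrand contract looks roughly like this from the caller's side; a sketch with error handling elided (see exec_common() for the real flow):

    if (brand_setbrand(p, B_TRUE) != 0)
        goto fail;        /* holdlwps(SHOLDFORK1) failed */
    if (gexec(&vp, &ua, &args, NULL, 0, &execsz,
        exec_file, p->p_cred, &brand_action) != 0) {
        /* unbrand and resume any LWPs held during branding */
        brand_clearbrand(p, B_TRUE);
    }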
diff --git a/usr/src/uts/common/os/clock_highres.c b/usr/src/uts/common/os/clock_highres.c
index 805813037d..1280c8a1b6 100644
--- a/usr/src/uts/common/os/clock_highres.c
+++ b/usr/src/uts/common/os/clock_highres.c
@@ -25,7 +25,7 @@
*/
/*
- * Copyright (c) 2015, Joyent Inc. All rights reserved.
+ * Copyright 2016, Joyent Inc.
*/
#include <sys/timer.h>
@@ -41,6 +41,9 @@
static clock_backend_t clock_highres;
+/* minimum non-privileged interval (200us) */
+long clock_highres_interval_min = 200000;
+
/*ARGSUSED*/
static int
clock_highres_settime(timespec_t *ts)
@@ -68,17 +71,6 @@ clock_highres_getres(timespec_t *ts)
static int
clock_highres_timer_create(itimer_t *it, void (*fire)(itimer_t *))
{
- /*
- * CLOCK_HIGHRES timers of sufficiently high resolution can deny
- * service; only allow privileged users to create such timers.
- * Sites that do not wish to have this restriction should
- * give users the "proc_clock_highres" privilege.
- */
- if (secpolicy_clock_highres(CRED()) != 0) {
- it->it_arg = NULL;
- return (EPERM);
- }
-
it->it_arg = kmem_zalloc(sizeof (cyclic_id_t), KM_SLEEP);
it->it_fire = fire;
@@ -111,6 +103,49 @@ clock_highres_timer_settime(itimer_t *it, int flags,
cpu_t *cpu;
cpupart_t *cpupart;
int pset;
+ boolean_t value_need_clamp = B_FALSE;
+ boolean_t intval_need_clamp = B_FALSE;
+ cred_t *cr = CRED();
+ struct itimerspec clamped;
+
+ /*
+ * CLOCK_HIGHRES timers of sufficiently high resolution can deny
+ * service; only allow privileged users to create such timers.
+ * Non-privileged users (those without the "proc_clock_highres"
+ * privilege) can create timers with lower resolution but if they
+ * attempt to use a very low time value (< 200us) then their
+ * timer will be clamped at 200us.
+ */
+ if (when->it_value.tv_sec == 0 &&
+ when->it_value.tv_nsec > 0 &&
+ when->it_value.tv_nsec < clock_highres_interval_min)
+ value_need_clamp = B_TRUE;
+
+ if (when->it_interval.tv_sec == 0 &&
+ when->it_interval.tv_nsec > 0 &&
+ when->it_interval.tv_nsec < clock_highres_interval_min)
+ intval_need_clamp = B_TRUE;
+
+ if ((value_need_clamp || intval_need_clamp) &&
+ secpolicy_clock_highres(cr) != 0) {
+ clamped.it_value.tv_sec = when->it_value.tv_sec;
+ clamped.it_interval.tv_sec = when->it_interval.tv_sec;
+
+ if (value_need_clamp) {
+ clamped.it_value.tv_nsec = clock_highres_interval_min;
+ } else {
+ clamped.it_value.tv_nsec = when->it_value.tv_nsec;
+ }
+
+ if (intval_need_clamp) {
+ clamped.it_interval.tv_nsec =
+ clock_highres_interval_min;
+ } else {
+ clamped.it_interval.tv_nsec = when->it_interval.tv_nsec;
+ }
+
+ when = &clamped;
+ }
cyctime.cyt_when = ts2hrt(&when->it_value);
cyctime.cyt_interval = ts2hrt(&when->it_interval);
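The clamp is observable from userland with the POSIX timer API. A sketch assuming the caller lacks the "proc_clock_highres" privilege, with error checks elided: timer_create() now succeeds where it previously failed with EPERM, and a 100us request behaves as a 200us one.

    #include <signal.h>
    #include <time.h>

    timer_t tid;
    struct sigevent ev = {
        .sigev_notify = SIGEV_SIGNAL,
        .sigev_signo = SIGUSR1
    };
    struct itimerspec its = {
        .it_value = { .tv_sec = 0, .tv_nsec = 100000 },    /* 100us */
        .it_interval = { .tv_sec = 0, .tv_nsec = 100000 }
    };

    (void) timer_create(CLOCK_HIGHRES, &ev, &tid);
    /* the kernel silently clamps both fields to 200us */
    (void) timer_settime(tid, 0, &its, NULL);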
diff --git a/usr/src/uts/common/os/contract.c b/usr/src/uts/common/os/contract.c
index 909a6c2860..1a3502a710 100644
--- a/usr/src/uts/common/os/contract.c
+++ b/usr/src/uts/common/os/contract.c
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
/*
* Copyright (c) 2017 by Delphix. All rights reserved.
@@ -290,7 +291,10 @@ contract_ctor(contract_t *ct, ct_type_t *type, ct_template_t *tmpl, void *data,
avl_index_t where;
klwp_t *curlwp = ttolwp(curthread);
- ASSERT(author == curproc);
+ /*
+ * It's possible that author is not curproc if the zone is creating
+ * a new process as a child of zsched.
+ */
mutex_init(&ct->ct_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&ct->ct_reflock, NULL, MUTEX_DEFAULT, NULL);
diff --git a/usr/src/uts/common/os/core.c b/usr/src/uts/common/os/core.c
index d5e272c16a..437f26e6e0 100644
--- a/usr/src/uts/common/os/core.c
+++ b/usr/src/uts/common/os/core.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent Inc. All rights reserved.
+ * Copyright 2016, Joyent Inc.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
@@ -125,6 +125,7 @@ remove_core_file(char *fp, enum core_types core_type)
/*
* Determine what rootvp to use.
*/
+ mutex_enter(&curproc->p_lock);
if (core_type == CORE_PROC) {
rootvp = (PTOU(curproc)->u_rdir == NULL ?
curproc->p_zone->zone_rootvp : PTOU(curproc)->u_rdir);
@@ -140,6 +141,7 @@ remove_core_file(char *fp, enum core_types core_type)
VN_HOLD(startvp);
if (rootvp != rootdir)
VN_HOLD(rootvp);
+ mutex_exit(&curproc->p_lock);
if ((error = lookuppnvp(&pn, NULL, NO_FOLLOW, &dvp, &vp, rootvp,
startvp, CRED())) != 0) {
pn_free(&pn);
diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c
index 87c0896814..4648dae9dd 100644
--- a/usr/src/uts/common/os/cpu.c
+++ b/usr/src/uts/common/os/cpu.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -108,7 +109,8 @@ kmutex_t cpu_lock;
cpu_t *cpu_list; /* list of all CPUs */
cpu_t *clock_cpu_list; /* used by clock to walk CPUs */
cpu_t *cpu_active; /* list of active CPUs */
-static cpuset_t cpu_available; /* set of available CPUs */
+cpuset_t cpu_active_set; /* cached set of active CPUs */
+cpuset_t cpu_available; /* set of available CPUs */
cpuset_t cpu_seqid_inuse; /* which cpu_seqids are in use */
cpu_t **cpu_seq; /* ptrs to CPUs, indexed by seq_id */
@@ -386,36 +388,56 @@ force_thread_migrate(kthread_id_t tp)
/*
* Set affinity for a specified CPU.
- * A reference count is incremented and the affinity is held until the
- * reference count is decremented to zero by thread_affinity_clear().
- * This is so regions of code requiring affinity can be nested.
- * Caller needs to ensure that cpu_id remains valid, which can be
- * done by holding cpu_lock across this call, unless the caller
- * specifies CPU_CURRENT in which case the cpu_lock will be acquired
- * by thread_affinity_set and CPU->cpu_id will be the target CPU.
+ *
+ * Specifying a cpu_id of CPU_CURRENT, allowed _only_ when setting affinity for
+ * curthread, will set affinity to the CPU on which the thread is currently
+ * running. For other cpu_id values, the caller must ensure that the
+ * referenced CPU remains valid, which can be done by holding cpu_lock across
+ * this call.
+ *
+ * CPU affinity is guaranteed after return of thread_affinity_set(). If a
+ * caller setting affinity to CPU_CURRENT requires that its thread not migrate
+ * CPUs prior to a successful return, it should take extra precautions (such
+ * as its own call to kpreempt_disable) to ensure that safety.
+ *
+ * CPU_BEST can be used to pick a "best" CPU to migrate to, including
+ * potentially the current CPU.
+ *
+ * A CPU affinity reference count is maintained by thread_affinity_set and
+ * thread_affinity_clear (incrementing and decrementing it, respectively),
+ * maintaining CPU affinity while the count is non-zero, and allowing regions
+ * of code which require affinity to be nested.
*/
void
thread_affinity_set(kthread_id_t t, int cpu_id)
{
- cpu_t *cp;
- int c;
+ cpu_t *cp;
ASSERT(!(t == curthread && t->t_weakbound_cpu != NULL));
- if ((c = cpu_id) == CPU_CURRENT) {
- mutex_enter(&cpu_lock);
- cpu_id = CPU->cpu_id;
+ if (cpu_id == CPU_CURRENT) {
+ VERIFY3P(t, ==, curthread);
+ kpreempt_disable();
+ cp = CPU;
+ } else if (cpu_id == CPU_BEST) {
+ VERIFY3P(t, ==, curthread);
+ kpreempt_disable();
+ cp = disp_choose_best_cpu();
+ } else {
+ /*
+ * We should be asserting that cpu_lock is held here, but
+ * the NCA code doesn't acquire it. The following assert
+ * should be uncommented when the NCA code is fixed.
+ *
+ * ASSERT(MUTEX_HELD(&cpu_lock));
+ */
+ VERIFY((cpu_id >= 0) && (cpu_id < NCPU));
+ cp = cpu[cpu_id];
+
+ /* user must provide a good cpu_id */
+ VERIFY(cp != NULL);
}
- /*
- * We should be asserting that cpu_lock is held here, but
- * the NCA code doesn't acquire it. The following assert
- * should be uncommented when the NCA code is fixed.
- *
- * ASSERT(MUTEX_HELD(&cpu_lock));
- */
- ASSERT((cpu_id >= 0) && (cpu_id < NCPU));
- cp = cpu[cpu_id];
- ASSERT(cp != NULL); /* user must provide a good cpu_id */
+
/*
* If there is already a hard affinity requested, and this affinity
* conflicts with that, panic.
@@ -432,13 +454,14 @@ thread_affinity_set(kthread_id_t t, int cpu_id)
* Make sure we're running on the right CPU.
*/
if (cp != t->t_cpu || t != curthread) {
+ ASSERT(cpu_id != CPU_CURRENT);
force_thread_migrate(t); /* drops thread lock */
} else {
thread_unlock(t);
}
- if (c == CPU_CURRENT)
- mutex_exit(&cpu_lock);
+ if (cpu_id == CPU_CURRENT || cpu_id == CPU_BEST)
+ kpreempt_enable();
}
/*
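A sketch of the CPU_CURRENT pattern the new comment prescribes: affinity is only guaranteed once thread_affinity_set() returns, so a caller that must not migrate even before that point brackets the call with its own preemption control.

    kpreempt_disable();
    thread_affinity_set(curthread, CPU_CURRENT);
    kpreempt_enable();
    /* ... code that must stay on this CPU ... */
    thread_affinity_clear(curthread);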
@@ -1473,8 +1496,8 @@ again: for (loop_count = 0; (*bound_func)(cp, 0); loop_count++) {
* Update CPU last ran on if it was this CPU
*/
if (t->t_cpu == cp && t->t_bound_cpu != cp)
- t->t_cpu = disp_lowpri_cpu(ncp,
- t->t_lpl, t->t_pri, NULL);
+ t->t_cpu = disp_lowpri_cpu(ncp, t,
+ t->t_pri);
ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp ||
t->t_weakbound_cpu == cp);
@@ -1516,10 +1539,9 @@ again: for (loop_count = 0; (*bound_func)(cp, 0); loop_count++) {
* Update CPU last ran on if it was this CPU
*/
- if (t->t_cpu == cp && t->t_bound_cpu != cp) {
- t->t_cpu = disp_lowpri_cpu(ncp,
- t->t_lpl, t->t_pri, NULL);
- }
+ if (t->t_cpu == cp && t->t_bound_cpu != cp)
+ t->t_cpu = disp_lowpri_cpu(ncp, t, t->t_pri);
+
ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp ||
t->t_weakbound_cpu == cp);
t = t->t_next;
@@ -1724,6 +1746,7 @@ cpu_list_init(cpu_t *cp)
cp->cpu_part = &cp_default;
CPUSET_ADD(cpu_available, cp->cpu_id);
+ CPUSET_ADD(cpu_active_set, cp->cpu_id);
}
/*
@@ -1895,6 +1918,7 @@ cpu_add_active_internal(cpu_t *cp)
cp->cpu_prev_onln = cpu_active->cpu_prev_onln;
cpu_active->cpu_prev_onln->cpu_next_onln = cp;
cpu_active->cpu_prev_onln = cp;
+ CPUSET_ADD(cpu_active_set, cp->cpu_id);
if (pp->cp_cpulist) {
cp->cpu_next_part = pp->cp_cpulist;
@@ -1965,6 +1989,7 @@ cpu_remove_active(cpu_t *cp)
}
cp->cpu_next_onln = cp;
cp->cpu_prev_onln = cp;
+ CPUSET_DEL(cpu_active_set, cp->cpu_id);
cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
@@ -2704,13 +2729,18 @@ cpu_bind_thread(kthread_id_t tp, processorid_t bind, processorid_t *obind,
return (0);
}
-#if CPUSET_WORDS > 1
-/*
- * Functions for implementing cpuset operations when a cpuset is more
- * than one word. On platforms where a cpuset is a single word these
- * are implemented as macros in cpuvar.h.
- */
+cpuset_t *
+cpuset_alloc(int kmflags)
+{
+ return (kmem_alloc(sizeof (cpuset_t), kmflags));
+}
+
+void
+cpuset_free(cpuset_t *s)
+{
+ kmem_free(s, sizeof (cpuset_t));
+}
void
cpuset_all(cpuset_t *s)
@@ -2722,38 +2752,61 @@ cpuset_all(cpuset_t *s)
}
void
-cpuset_all_but(cpuset_t *s, uint_t cpu)
+cpuset_all_but(cpuset_t *s, const uint_t cpu)
{
cpuset_all(s);
CPUSET_DEL(*s, cpu);
}
void
-cpuset_only(cpuset_t *s, uint_t cpu)
+cpuset_only(cpuset_t *s, const uint_t cpu)
{
CPUSET_ZERO(*s);
CPUSET_ADD(*s, cpu);
}
+long
+cpu_in_set(cpuset_t *s, const uint_t cpu)
+{
+ VERIFY(cpu < NCPU);
+ return (BT_TEST(s->cpub, cpu));
+}
+
+void
+cpuset_add(cpuset_t *s, const uint_t cpu)
+{
+ VERIFY(cpu < NCPU);
+ BT_SET(s->cpub, cpu);
+}
+
+void
+cpuset_del(cpuset_t *s, const uint_t cpu)
+{
+ VERIFY(cpu < NCPU);
+ BT_CLEAR(s->cpub, cpu);
+}
+
int
cpuset_isnull(cpuset_t *s)
{
int i;
- for (i = 0; i < CPUSET_WORDS; i++)
+ for (i = 0; i < CPUSET_WORDS; i++) {
if (s->cpub[i] != 0)
return (0);
+ }
return (1);
}
int
-cpuset_cmp(cpuset_t *s1, cpuset_t *s2)
+cpuset_isequal(cpuset_t *s1, cpuset_t *s2)
{
int i;
- for (i = 0; i < CPUSET_WORDS; i++)
+ for (i = 0; i < CPUSET_WORDS; i++) {
if (s1->cpub[i] != s2->cpub[i])
return (0);
+ }
return (1);
}
@@ -2822,7 +2875,72 @@ cpuset_bounds(cpuset_t *s, uint_t *smallestid, uint_t *largestid)
*smallestid = *largestid = CPUSET_NOTINSET;
}
-#endif /* CPUSET_WORDS */
+void
+cpuset_atomic_del(cpuset_t *s, const uint_t cpu)
+{
+ VERIFY(cpu < NCPU);
+ BT_ATOMIC_CLEAR(s->cpub, (cpu));
+}
+
+void
+cpuset_atomic_add(cpuset_t *s, const uint_t cpu)
+{
+ VERIFY(cpu < NCPU);
+ BT_ATOMIC_SET(s->cpub, (cpu));
+}
+
+long
+cpuset_atomic_xadd(cpuset_t *s, const uint_t cpu)
+{
+ long res;
+
+ VERIFY(cpu < NCPU);
+ BT_ATOMIC_SET_EXCL(s->cpub, cpu, res);
+ return (res);
+}
+
+long
+cpuset_atomic_xdel(cpuset_t *s, const uint_t cpu)
+{
+ long res;
+
+ VERIFY(cpu < NCPU);
+ BT_ATOMIC_CLEAR_EXCL(s->cpub, cpu, res);
+ return (res);
+}
+
+void
+cpuset_or(cpuset_t *dst, cpuset_t *src)
+{
+ for (int i = 0; i < CPUSET_WORDS; i++) {
+ dst->cpub[i] |= src->cpub[i];
+ }
+}
+
+void
+cpuset_xor(cpuset_t *dst, cpuset_t *src)
+{
+ for (int i = 0; i < CPUSET_WORDS; i++) {
+ dst->cpub[i] ^= src->cpub[i];
+ }
+}
+
+void
+cpuset_and(cpuset_t *dst, cpuset_t *src)
+{
+ for (int i = 0; i < CPUSET_WORDS; i++) {
+ dst->cpub[i] &= src->cpub[i];
+ }
+}
+
+void
+cpuset_zero(cpuset_t *dst)
+{
+ for (int i = 0; i < CPUSET_WORDS; i++) {
+ dst->cpub[i] = 0;
+ }
+}
+
/*
* Unbind threads bound to specified CPU.
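With cpuset manipulation now behind real functions regardless of CPUSET_WORDS, callers can allocate and compose sets without knowing the representation. A minimal sketch (the final consumer is hypothetical):

    cpuset_t *set = cpuset_alloc(KM_SLEEP);

    cpuset_zero(set);
    cpuset_add(set, CPU->cpu_id);
    cpuset_or(set, &cpu_active_set);    /* union with the active CPUs */
    if (!cpuset_isnull(set))
        notify_cpus(set);               /* hypothetical consumer */
    cpuset_free(set);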
diff --git a/usr/src/uts/common/os/cred.c b/usr/src/uts/common/os/cred.c
index 25727d54c5..0bd6cfd44f 100644
--- a/usr/src/uts/common/os/cred.c
+++ b/usr/src/uts/common/os/cred.c
@@ -729,6 +729,14 @@ crgetzoneid(const cred_t *cr)
cr->cr_zone->zone_id);
}
+zoneid_t
+crgetzonedid(const cred_t *cr)
+{
+ return (cr->cr_zone == NULL ?
+ (cr->cr_uid == -1 ? (zoneid_t)-1 : GLOBAL_ZONEID) :
+ cr->cr_zone->zone_did);
+}
+
projid_t
crgetprojid(const cred_t *cr)
{
diff --git a/usr/src/uts/common/os/cyclic.c b/usr/src/uts/common/os/cyclic.c
index 21907b4957..45e13ebeab 100644
--- a/usr/src/uts/common/os/cyclic.c
+++ b/usr/src/uts/common/os/cyclic.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012, Joyent Inc. All rights reserved.
+ * Copyright 2018 Joyent Inc.
*/
/*
@@ -112,6 +112,7 @@
* cyclic_remove() <-- Removes a cyclic
* cyclic_bind() <-- Change a cyclic's CPU or partition binding
* cyclic_reprogram() <-- Reprogram a cyclic's expiration
+ * cyclic_move_here() <-- Shuffle cyclic to current CPU
*
* Inter-subsystem Interfaces
*
@@ -3111,6 +3112,61 @@ cyclic_reprogram(cyclic_id_t id, hrtime_t expiration)
return (1);
}
+/*
+ * void cyclic_move_here(cyclic_id_t)
+ *
+ * Overview
+ *
+ * cyclic_move_here() attempts to shuffle a cyclic onto the current CPU.
+ *
+ * Arguments and notes
+ *
+ * The first argument is a cyclic_id returned from cyclic_add().
+ * cyclic_move_here() may _not_ be called on a cyclic_id returned from
+ * cyclic_add_omni() or one bound to a CPU or partition via cyclic_bind().
+ *
+ * This cyclic shuffling is performed on a best-effort basis. If for some
+ * reason the current CPU is unsuitable or the thread migrates between CPUs
+ * during the call, the function may return with the cyclic residing on some
+ * other CPU.
+ *
+ * Return value
+ *
+ * None; cyclic_move_here() always reports success.
+ *
+ * Caller's context
+ *
+ * cpu_lock must be held by the caller, and the caller must not be in
+ * interrupt context. The caller may not hold any locks which are also
+ * grabbed by any cyclic handler.
+ */
+void
+cyclic_move_here(cyclic_id_t id)
+{
+ cyc_id_t *idp = (cyc_id_t *)id;
+ cyc_cpu_t *cc = idp->cyi_cpu;
+ cpu_t *dest = CPU;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+ CYC_PTRACE("move_here", idp, dest);
+ VERIFY3P(cc, !=, NULL);
+ VERIFY3U(cc->cyp_cyclics[idp->cyi_ndx].cy_flags &
+ (CYF_CPU_BOUND|CYF_PART_BOUND), ==, 0);
+
+ if (cc->cyp_cpu == dest) {
+ return;
+ }
+
+ /* Is the destination CPU suitable for a migration target? */
+ if (dest->cpu_cyclic == NULL ||
+ dest->cpu_cyclic->cyp_state == CYS_OFFLINE ||
+ (dest->cpu_flags & CPU_ENABLE) == 0) {
+ return;
+ }
+
+ cyclic_juggle_one_to(idp, dest->cpu_cyclic);
+}
+
hrtime_t
cyclic_getres()
{
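A sketch of the caller context documented above; the ID is assumed to come from a prior cyclic_add() with no CPU or partition binding.

    mutex_enter(&cpu_lock);
    cyclic_move_here(my_cyc_id);    /* best effort; may stay elsewhere */
    mutex_exit(&cpu_lock);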
diff --git a/usr/src/uts/common/os/ddi_intr_irm.c b/usr/src/uts/common/os/ddi_intr_irm.c
index c3c0481e7f..a4b35dcb5b 100644
--- a/usr/src/uts/common/os/ddi_intr_irm.c
+++ b/usr/src/uts/common/os/ddi_intr_irm.c
@@ -1320,7 +1320,7 @@ i_ddi_irm_notify(ddi_irm_pool_t *pool_p, ddi_irm_req_t *req_p)
/* Log callback errors */
if (ret != DDI_SUCCESS) {
- cmn_err(CE_WARN, "%s%d: failed callback (action=%d, ret=%d)\n",
+ cmn_err(CE_WARN, "!%s%d: failed callback (action=%d, ret=%d)\n",
ddi_driver_name(req_p->ireq_dip),
ddi_get_instance(req_p->ireq_dip), (int)action, ret);
}
diff --git a/usr/src/uts/common/os/exec.c b/usr/src/uts/common/os/exec.c
index 53c552f135..96b6081489 100644
--- a/usr/src/uts/common/os/exec.c
+++ b/usr/src/uts/common/os/exec.c
@@ -99,6 +99,7 @@ uint_t auxv_hwcap32_2 = 0; /* 32-bit version of auxv_hwcap2 */
#endif
#define PSUIDFLAGS (SNOCD|SUGID)
+#define RANDOM_LEN 16 /* 16 bytes for AT_RANDOM aux entry */
/*
* These are consumed within the specific exec modules, but are defined here
@@ -265,8 +266,10 @@ exec_common(const char *fname, const char **argp, const char **envp,
* only if the pathname does not contain a "/" the resolved path
* points to a file in the current working (attribute) directory.
*/
- if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 &&
+ mutex_enter(&p->p_lock);
+ if ((PTOU(p)->u_cdir->v_flag & V_XATTRDIR) != 0 &&
strchr(resolvepn.pn_path, '/') == NULL) {
+ mutex_exit(&p->p_lock);
if (dir != NULL)
VN_RELE(dir);
error = EACCES;
@@ -275,6 +278,7 @@ exec_common(const char *fname, const char **argp, const char **envp,
VN_RELE(vp);
goto out;
}
+ mutex_exit(&p->p_lock);
bzero(exec_file, MAXCOMLEN+1);
(void) strncpy(exec_file, pn.pn_path, MAXCOMLEN);
@@ -322,14 +326,43 @@ exec_common(const char *fname, const char **argp, const char **envp,
ua.argp = argp;
ua.envp = envp;
- /* If necessary, brand this process before we start the exec. */
- if (brandme)
- brand_setbrand(p);
+ /* If necessary, brand this process/lwp before we start the exec. */
+ if (brandme) {
+ void *brand_data = NULL;
+
+ /*
+ * Process branding may fail if multiple LWPs are present and
+ * holdlwps() cannot complete successfully.
+ */
+ error = brand_setbrand(p, B_TRUE);
+
+ if (error == 0 && BROP(p)->b_lwpdata_alloc != NULL) {
+ brand_data = BROP(p)->b_lwpdata_alloc(p);
+ if (brand_data == NULL) {
+ error = 1;
+ }
+ }
+
+ if (error == 0) {
+ mutex_enter(&p->p_lock);
+ BROP(p)->b_initlwp(lwp, brand_data);
+ mutex_exit(&p->p_lock);
+ } else {
+ VN_RELE(vp);
+ if (dir != NULL) {
+ VN_RELE(dir);
+ }
+ pn_free(&resolvepn);
+ goto fail;
+ }
+ }
if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz,
- exec_file, p->p_cred, brand_action)) != 0) {
- if (brandme)
- brand_clearbrand(p, B_FALSE);
+ exec_file, p->p_cred, &brand_action)) != 0) {
+ if (brandme) {
+ BROP(p)->b_freelwp(lwp);
+ brand_clearbrand(p, B_TRUE);
+ }
VN_RELE(vp);
if (dir != NULL)
VN_RELE(dir);
@@ -361,7 +394,7 @@ exec_common(const char *fname, const char **argp, const char **envp,
/*
* Clear contract template state
*/
- lwp_ctmpl_clear(lwp);
+ lwp_ctmpl_clear(lwp, B_TRUE);
/*
* Save the directory in which we found the executable for expanding
@@ -385,6 +418,8 @@ exec_common(const char *fname, const char **argp, const char **envp,
* pending held signals remain held, so don't clear t_hold.
*/
mutex_enter(&p->p_lock);
+ DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp,
+ uintptr_t, lwp->lwp_oldcontext, uintptr_t, 0);
lwp->lwp_oldcontext = 0;
lwp->lwp_ustack = 0;
lwp->lwp_old_stk_ctl = 0;
@@ -444,8 +479,10 @@ exec_common(const char *fname, const char **argp, const char **envp,
TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up);
/* Unbrand ourself if necessary. */
- if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE))
+ if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) {
+ BROP(p)->b_freelwp(lwp);
brand_clearbrand(p, B_FALSE);
+ }
setregs(&args);
@@ -569,7 +606,7 @@ gexec(
long *execsz,
caddr_t exec_file,
struct cred *cred,
- int brand_action)
+ int *brand_action)
{
struct vnode *vp, *execvp = NULL;
proc_t *pp = ttoproc(curthread);
@@ -890,8 +927,14 @@ gexec(
if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE))
args->traceinval = 1;
}
- if (pp->p_proc_flag & P_PR_PTRACE)
+
+ /*
+ * If legacy ptrace is enabled, generate the SIGTRAP.
+ */
+ if (pp->p_proc_flag & P_PR_PTRACE) {
psignal(pp, SIGTRAP);
+ }
+
if (args->traceinval)
prinvalidate(&pp->p_user);
}
@@ -1555,6 +1598,27 @@ stk_add(uarg_t *args, const char *sp, enum uio_seg segflg)
return (0);
}
+/*
+ * Add a fixed size byte array to the stack (only from kernel space).
+ */
+static int
+stk_byte_add(uarg_t *args, const uint8_t *sp, size_t len)
+{
+ int error;
+
+ if (STK_AVAIL(args) < sizeof (int))
+ return (E2BIG);
+ *--args->stk_offp = args->stk_strp - args->stk_base;
+
+ if (len > STK_AVAIL(args))
+ return (E2BIG);
+ bcopy(sp, args->stk_strp, len);
+
+ args->stk_strp += len;
+
+ return (0);
+}
+
static int
stk_getptr(uarg_t *args, char *src, char **dst)
{
@@ -1591,6 +1655,7 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
size_t size, pad;
char *argv = (char *)uap->argp;
char *envp = (char *)uap->envp;
+ uint8_t rdata[RANDOM_LEN];
/*
* Copy interpreter's name and argument to argv[0] and argv[1].
@@ -1673,8 +1738,9 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
args->ne = args->na - argc;
/*
- * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and
- * AT_SUN_EMULATOR strings to the stack.
+ * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME,
+ * AT_SUN_BRAND_NROOT, and AT_SUN_EMULATOR strings, as well as AT_RANDOM
+ * array, to the stack.
*/
if (auxvpp != NULL && *auxvpp != NULL) {
if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0)
@@ -1687,6 +1753,20 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
if (args->emulator != NULL &&
(error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0)
return (error);
+
+ /*
+ * For the AT_RANDOM aux vector we provide 16 bytes of random
+ * data.
+ */
+ (void) random_get_pseudo_bytes(rdata, sizeof (rdata));
+
+ if ((error = stk_byte_add(args, rdata, sizeof (rdata))) != 0)
+ return (error);
+
+ if (args->brand_nroot != NULL &&
+ (error = stk_add(args, args->brand_nroot,
+ UIO_SYSSPACE)) != 0)
+ return (error);
}
/*
@@ -1793,7 +1873,7 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
/*
* Fill in the aux vector now that we know the user stack addresses
* for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and
- * AT_SUN_EMULATOR strings.
+ * AT_SUN_EMULATOR strings, as well as the AT_RANDOM array.
*/
if (auxvpp != NULL && *auxvpp != NULL) {
if (args->to_model == DATAMODEL_NATIVE) {
@@ -1806,6 +1886,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
if (args->emulator != NULL)
ADDAUX(*a,
AT_SUN_EMULATOR, (long)&ustrp[*--offp])
+ ADDAUX(*a, AT_RANDOM, (long)&ustrp[*--offp])
+ if (args->brand_nroot != NULL) {
+ ADDAUX(*a,
+ AT_SUN_BRAND_NROOT, (long)&ustrp[*--offp])
+ }
} else {
auxv32_t **a = (auxv32_t **)auxvpp;
ADDAUX(*a,
@@ -1818,6 +1903,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
if (args->emulator != NULL)
ADDAUX(*a, AT_SUN_EMULATOR,
(int)(uintptr_t)&ustrp[*--offp])
+ ADDAUX(*a, AT_RANDOM, (int)(uintptr_t)&ustrp[*--offp])
+ if (args->brand_nroot != NULL) {
+ ADDAUX(*a, AT_SUN_BRAND_NROOT,
+ (int)(uintptr_t)&ustrp[*--offp])
+ }
}
}
@@ -1961,6 +2051,9 @@ exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
usrstack = (char *)USRSTACK32;
}
+ if (args->maxstack != 0 && (uintptr_t)usrstack > args->maxstack)
+ usrstack = (char *)args->maxstack;
+
ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0);
#if defined(__sparc)
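Userland consumes the new AT_RANDOM entry by walking the aux vector that sits past the environment. A sketch, assuming AT_RANDOM is defined in <sys/auxv.h> alongside this change (illumos has no getauxval()):

    #include <sys/types.h>
    #include <sys/auxv.h>
    #include <string.h>

    static uint8_t seed[16];

    static void
    find_at_random(char **envp)
    {
        char **p = envp;
        auxv_t *av;

        while (*p != NULL)    /* step past the environment strings */
            p++;
        for (av = (auxv_t *)(p + 1); av->a_type != AT_NULL; av++) {
            if (av->a_type == AT_RANDOM) {
                (void) memcpy(seed, av->a_un.a_ptr, sizeof (seed));
                break;
            }
        }
    }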
diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c
index 1b9359da47..06e0117cd6 100644
--- a/usr/src/uts/common/os/exit.c
+++ b/usr/src/uts/common/os/exit.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -138,6 +138,27 @@ rexit(int rval)
}
/*
+ * Bump the init_restarts kstat and let interested parties know about the
+ * restart.
+ */
+static void
+restart_init_notify(zone_t *zone)
+{
+ nvlist_t *nvl = NULL;
+
+ zone->zone_proc_init_restarts++;
+
+ if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0 &&
+ nvlist_add_uint32(nvl, ZONE_CB_RESTARTS,
+ zone->zone_proc_init_restarts) == 0) {
+ zone_sysevent_publish(zone, ZONE_EVENT_INIT_CLASS,
+ ZONE_EVENT_INIT_RESTART_SC, nvl);
+ }
+
+ nvlist_free(nvl);
+}
+
+/*
* Called by proc_exit() when a zone's init exits, presumably because
* it failed. As long as the given zone is still in the "running"
* state, we will re-exec() init, but first we need to reset things
@@ -230,7 +251,7 @@ restart_init(int what, int why)
siginfofree(lwp->lwp_curinfo);
lwp->lwp_curinfo = NULL;
}
- lwp_ctmpl_clear(lwp);
+ lwp_ctmpl_clear(lwp, B_FALSE);
/*
* Reset both the process root directory and the current working
@@ -260,6 +281,8 @@ restart_init(int what, int why)
ASSERT(p == curproc);
(void) freectty(B_TRUE);
+ restart_init_notify(p->p_zone);
+
/*
* Now exec() the new init(1M) on top of the current process. If we
* succeed, the caller will treat this like a successful system call.
@@ -320,6 +343,119 @@ proc_is_exiting(proc_t *p)
}
/*
+ * Return B_TRUE if the zone's init has been restarted, B_FALSE if exit
+ * processing should proceed.
+ */
+static boolean_t
+zone_init_exit(zone_t *z, int why, int what)
+{
+ /*
+ * Typically we don't let the zone's init exit unless zone_start_init()
+ * failed its exec, or we are shutting down the zone or the machine,
+ * although the various flags handled within this function will control
+ * the behavior.
+ *
+ * Since we are single threaded, we don't need to lock the following
+ * accesses to zone_proc_initpid.
+ */
+ if (z->zone_boot_err != 0 ||
+ zone_status_get(z) >= ZONE_IS_SHUTTING_DOWN ||
+ zone_status_get(global_zone) >= ZONE_IS_SHUTTING_DOWN) {
+ /*
+ * Clear the zone's init pid and proceed with exit processing.
+ */
+ z->zone_proc_initpid = -1;
+ return (B_FALSE);
+ }
+
+ /*
+ * There are a variety of configuration flags on the zone to control
+ * init exit behavior.
+ *
+ * If the init process should be restarted, the "zone_restart_init"
+ * member will be set.
+ */
+ if (!z->zone_restart_init) {
+ /*
+ * The zone has been setup to halt when init exits.
+ */
+ z->zone_init_status = wstat(why, what);
+ (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, zone_kcred());
+ z->zone_proc_initpid = -1;
+ return (B_FALSE);
+ }
+
+ /*
+ * At this point we know we're configured to restart init, but there
+ * are various modifiers to that behavior.
+ */
+
+ if (z->zone_reboot_on_init_exit) {
+ /*
+ * Some init programs in branded zones do not tolerate a
+ * restart in the traditional manner; setting
+ * "zone_reboot_on_init_exit" will cause the entire zone to be
+ * rebooted instead.
+ */
+
+ if (z->zone_restart_init_0) {
+ /*
+ * Some init programs in branded zones only want to
+ * restart if they exit 0, otherwise the zone should
+ * shutdown. Setting the "zone_restart_init_0" member
+ * controls this behavior.
+ */
+ if (why == CLD_EXITED && what == 0) {
+ /* Trigger a zone reboot */
+ (void) zone_kadmin(A_REBOOT, 0, NULL,
+ zone_kcred());
+ } else {
+ /* Shutdown instead of reboot */
+ (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL,
+ zone_kcred());
+ }
+ } else {
+ /* Trigger a zone reboot */
+ (void) zone_kadmin(A_REBOOT, 0, NULL, zone_kcred());
+ }
+
+ z->zone_init_status = wstat(why, what);
+ z->zone_proc_initpid = -1;
+ return (B_FALSE);
+ }
+
+ if (z->zone_restart_init_0) {
+ /*
+ * Some init programs in branded zones only want to restart if
+ * they exit 0, otherwise the zone should shutdown. Setting the
+ * "zone_restart_init_0" member controls this behavior.
+ *
+ * In this case we only restart init if it exited successfully.
+ */
+ if (why == CLD_EXITED && what == 0 &&
+ restart_init(what, why) == 0) {
+ return (B_TRUE);
+ }
+ } else {
+ /*
+ * No restart modifiers on the zone, attempt to restart init.
+ */
+ if (restart_init(what, why) == 0) {
+ return (B_TRUE);
+ }
+ }
+
+ /*
+ * The restart failed, the zone will shut down.
+ */
+ z->zone_init_status = wstat(why, what);
+ (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, zone_kcred());
+ z->zone_proc_initpid = -1;
+ return (B_FALSE);
+}
+
+/*
* Return value:
* 1 - exitlwps() failed, call (or continue) lwp_exit()
* 0 - restarting init. Return through system call path
@@ -366,45 +502,36 @@ proc_exit(int why, int what)
}
mutex_exit(&p->p_lock);
- DTRACE_PROC(lwp__exit);
- DTRACE_PROC1(exit, int, why);
+ if (p->p_pid == z->zone_proc_initpid) {
+ /* If zone's init restarts, we're done here. */
+ if (zone_init_exit(z, why, what))
+ return (0);
+ }
/*
- * Will perform any brand specific proc exit processing, since this
- * is always the last lwp, will also perform lwp_exit and free brand
- * data
+ * Delay firing probes (and performing brand cleanup) until after the
+ * zone_proc_initpid check. Cases which result in zone shutdown or
+ * restart via zone_kadmin eventually result in a call back to
+ * proc_exit.
*/
- if (PROC_IS_BRANDED(p)) {
- lwp_detach_brand_hdlrs(lwp);
- brand_clearbrand(p, B_FALSE);
- }
+ DTRACE_PROC(lwp__exit);
+ DTRACE_PROC1(exit, int, why);
/*
- * Don't let init exit unless zone_start_init() failed its exec, or
- * we are shutting down the zone or the machine.
- *
- * Since we are single threaded, we don't need to lock the
- * following accesses to zone_proc_initpid.
+ * Will perform any brand specific proc exit processing. Since this
+ * is always the last lwp, will also perform lwp exit/free and proc
+ * exit. Brand data will be freed when the process is reaped.
*/
- if (p->p_pid == z->zone_proc_initpid) {
- if (z->zone_boot_err == 0 &&
- zone_status_get(z) < ZONE_IS_SHUTTING_DOWN &&
- zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN) {
- if (z->zone_restart_init == B_TRUE) {
- if (restart_init(what, why) == 0)
- return (0);
- } else {
- (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL,
- CRED());
- }
- }
-
+ if (PROC_IS_BRANDED(p)) {
+ BROP(p)->b_lwpexit(lwp);
+ BROP(p)->b_proc_exit(p);
/*
- * Since we didn't or couldn't restart init, we clear
- * the zone's init state and proceed with exit
- * processing.
+ * To ensure that b_proc_exit has access to brand-specific data
+ * contained by the one remaining lwp, call the freelwp hook as
+ * the last part of this clean-up process.
*/
- z->zone_proc_initpid = -1;
+ BROP(p)->b_freelwp(lwp);
+ lwp_detach_brand_hdlrs(lwp);
}
lwp_pcb_exit();
@@ -565,7 +692,7 @@ proc_exit(int why, int what)
semexit(p);
rv = wstat(why, what);
- acct(rv & 0xff);
+ acct(rv);
exacct_commit_proc(p, rv);
/*
@@ -658,10 +785,22 @@ proc_exit(int why, int what)
if ((q = p->p_child) != NULL && p != proc_init) {
struct proc *np;
struct proc *initp = proc_init;
+ pid_t zone_initpid = 1;
+ struct proc *zoneinitp = NULL;
boolean_t setzonetop = B_FALSE;
- if (!INGLOBALZONE(curproc))
- setzonetop = B_TRUE;
+ if (!INGLOBALZONE(curproc)) {
+ zone_initpid = curproc->p_zone->zone_proc_initpid;
+
+ ASSERT(MUTEX_HELD(&pidlock));
+ zoneinitp = prfind(zone_initpid);
+ if (zoneinitp != NULL) {
+ initp = zoneinitp;
+ } else {
+ zone_initpid = 1;
+ setzonetop = B_TRUE;
+ }
+ }
pgdetach(p);
@@ -673,7 +812,8 @@ proc_exit(int why, int what)
*/
delete_ns(q->p_parent, q);
- q->p_ppid = 1;
+ q->p_ppid = zone_initpid;
+
q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID);
if (setzonetop) {
mutex_enter(&q->p_lock);
@@ -847,8 +987,50 @@ proc_exit(int why, int what)
mutex_exit(&p->p_lock);
if (!evaporate) {
- p->p_pidflag &= ~CLDPEND;
- sigcld(p, sqp);
+ /*
+ * The brand specific code only happens when the brand has a
+ * function to call in place of sigcld and the parent of the
+ * exiting process is not the global zone init. If the parent
+ * is the global zone init, then the process was reparented,
+ * and we don't want brand code delivering possibly strange
+ * signals to init. Also, init is not branded, so any brand
+ * specific exit data will not be picked up by init anyway.
+ */
+ if (PROC_IS_BRANDED(p) &&
+ BROP(p)->b_exit_with_sig != NULL &&
+ p->p_ppid != 1) {
+ /*
+ * The code for _fini that could unload the brand_t
+ * blocks until the count of zones using the module
+ * reaches zero. Zones decrement the refcount on their
+ * brands only after all user tasks in that zone have
+ * exited and been waited on. The decrement on the
+ * brand's refcount happen in zone_destroy(). That
+ * depends on zone_shutdown() having been completed.
+ * zone_shutdown() includes a call to zone_empty(),
+ * where the zone waits for itself to reach the state
+ * ZONE_IS_EMPTY. This state is only set in either
+ * zone_shutdown(), when there are no user processes as
+ * the zone enters this function, or in
+ * zone_task_rele(). zone_task_rele() is called from
+ * code triggered by waiting on processes, not by the
+ * processes exiting through proc_exit(). This means
+ * all the branded processes that could exist for a
+ * specific brand_t must exit and get reaped before the
+ * refcount on the brand_t can reach 0. _fini will
+ * never unload the corresponding brand module before
+ * proc_exit finishes execution for all processes
+ * branded with a particular brand_t, which makes the
+ * operation below safe to do. Brands that wish to use
+ * this mechanism must wait in _fini as described
+ * above.
+ */
+ BROP(p)->b_exit_with_sig(p, sqp);
+ } else {
+ p->p_pidflag &= ~CLDPEND;
+ sigcld(p, sqp);
+ }
+
} else {
/*
* Do what sigcld() would do if the disposition
@@ -927,10 +1109,9 @@ winfo(proc_t *pp, k_siginfo_t *ip, int waitflag)
int
waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
{
- int found;
proc_t *cp, *pp;
- int proc_gone;
int waitflag = !(options & WNOWAIT);
+ boolean_t have_brand_helper = B_FALSE;
/*
* Obsolete flag, defined here only for binary compatibility
@@ -958,7 +1139,8 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
pp = ttoproc(curthread);
/*
- * lock parent mutex so that sibling chain can be searched.
+ * Any time we search for a process, we take pidlock to prevent the
+ * process tree from changing while we look.
*/
mutex_enter(&pidlock);
@@ -978,10 +1160,37 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
return (ECHILD);
}
- while (pp->p_child != NULL) {
+ if (PROC_IS_BRANDED(pp) && BROP(pp)->b_waitid_helper != NULL) {
+ have_brand_helper = B_TRUE;
+ }
+
+ while (pp->p_child != NULL || have_brand_helper) {
+ boolean_t brand_wants_wait = B_FALSE;
+ int proc_gone = 0;
+ int found = 0;
+
+ /*
+ * Give the brand a chance to return synthetic results from
+ * this waitid() call before we do the real thing.
+ */
+ if (have_brand_helper) {
+ int ret;
+
+ if (BROP(pp)->b_waitid_helper(idtype, id, ip, options,
+ &brand_wants_wait, &ret) == 0) {
+ mutex_exit(&pidlock);
+ return (ret);
+ }
- proc_gone = 0;
+ if (pp->p_child == NULL) {
+ goto no_real_children;
+ }
+ }
+ /*
+ * Look for interesting children in the newstate list.
+ */
+ VERIFY(pp->p_child != NULL);
for (cp = pp->p_child_ns; cp != NULL; cp = cp->p_sibling_ns) {
if (idtype != P_PID && (cp->p_pidflag & CLDWAITPID))
continue;
@@ -989,6 +1198,11 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
continue;
if (idtype == P_PGID && id != cp->p_pgrp)
continue;
+ if (PROC_IS_BRANDED(pp)) {
+ if (BROP(pp)->b_wait_filter != NULL &&
+ BROP(pp)->b_wait_filter(pp, cp) == B_FALSE)
+ continue;
+ }
switch (cp->p_wcode) {
@@ -1033,12 +1247,16 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
* Wow! None of the threads on the p_sibling_ns list were
* interesting threads. Check all the kids!
*/
- found = 0;
for (cp = pp->p_child; cp != NULL; cp = cp->p_sibling) {
if (idtype == P_PID && id != cp->p_pid)
continue;
if (idtype == P_PGID && id != cp->p_pgrp)
continue;
+ if (PROC_IS_BRANDED(pp)) {
+ if (BROP(pp)->b_wait_filter != NULL &&
+ BROP(pp)->b_wait_filter(pp, cp) == B_FALSE)
+ continue;
+ }
switch (cp->p_wcode) {
case CLD_TRAPPED:
@@ -1107,11 +1325,12 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
break;
}
+no_real_children:
/*
* If we found no interesting processes at all,
* break out and return ECHILD.
*/
- if (found + proc_gone == 0)
+ if (!brand_wants_wait && (found + proc_gone == 0))
break;
if (options & WNOHANG) {
@@ -1130,7 +1349,7 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
* change state while we wait, we don't wait at all.
* Get out with ECHILD according to SVID.
*/
- if (found == proc_gone)
+ if (!brand_wants_wait && (found == proc_gone))
break;
if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) {
@@ -1226,6 +1445,12 @@ freeproc(proc_t *p)
p->p_killsqp = NULL;
}
+ /* Clear any remaining brand data */
+ if (PROC_IS_BRANDED(p)) {
+ brand_clearbrand(p, B_FALSE);
+ }
+
prfree(p); /* inform /proc */
/*
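The flag interplay in zone_init_exit() condenses to the table below; this only summarizes the code above (a failed restart_init() likewise falls through to a zone halt):

    restart_init  reboot_on_init_exit  restart_init_0  init status  action
    false         -                    -               any          halt zone
    true          true                 false           any          reboot zone
    true          true                 true            exit 0       reboot zone
    true          true                 true            other        halt zone
    true          false                true            exit 0       restart init
    true          false                true            other        halt zone
    true          false                false           any          restart init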
diff --git a/usr/src/uts/common/os/fio.c b/usr/src/uts/common/os/fio.c
index 76eddd4e50..41e7e63d2b 100644
--- a/usr/src/uts/common/os/fio.c
+++ b/usr/src/uts/common/os/fio.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015, Joyent Inc.
+ * Copyright 2017, Joyent Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -386,6 +386,7 @@ flist_grow(int maxfd)
dst->uf_flag = src->uf_flag;
dst->uf_busy = src->uf_busy;
dst->uf_portfd = src->uf_portfd;
+ dst->uf_gen = src->uf_gen;
}
/*
@@ -487,7 +488,7 @@ free_afd(afd_t *afd) /* called below and from thread_free() */
afd->a_fd[i] = -1;
}
-static void
+void
set_active_fd(int fd)
{
afd_t *afd = &curthread->t_activefd;
@@ -575,13 +576,12 @@ is_active_fd(kthread_t *t, int fd)
}
/*
- * Convert a user supplied file descriptor into a pointer to a file
- * structure. Only task is to check range of the descriptor (soft
- * resource limit was enforced at open time and shouldn't be checked
- * here).
+ * Convert a user supplied file descriptor into a pointer to a file structure.
+ * Only task is to check range of the descriptor (soft resource limit was
+ * enforced at open time and shouldn't be checked here).
*/
file_t *
-getf(int fd)
+getf_gen(int fd, uf_entry_gen_t *genp)
{
uf_info_t *fip = P_FINFO(curproc);
uf_entry_t *ufp;
@@ -607,6 +607,9 @@ getf(int fd)
return (NULL);
}
ufp->uf_refcnt++;
+ if (genp != NULL) {
+ *genp = ufp->uf_gen;
+ }
set_active_fd(fd); /* record the active file descriptor */
@@ -615,6 +618,12 @@ getf(int fd)
return (fp);
}
+file_t *
+getf(int fd)
+{
+ return (getf_gen(fd, NULL));
+}
+
/*
* Close whatever file currently occupies the file descriptor slot
* and install the new file, usually NULL, in the file descriptor slot.
@@ -667,6 +676,7 @@ closeandsetf(int fd, file_t *newfp)
ASSERT(ufp->uf_flag == 0);
fd_reserve(fip, fd, 1);
ufp->uf_file = newfp;
+ ufp->uf_gen++;
UF_EXIT(ufp);
mutex_exit(&fip->fi_lock);
return (0);
@@ -852,7 +862,8 @@ flist_fork(uf_info_t *pfip, uf_info_t *cfip)
*/
cfip->fi_nfiles = nfiles = flist_minsize(pfip);
- cfip->fi_list = kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP);
+ cfip->fi_list = nfiles == 0 ? NULL :
+ kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP);
for (fd = 0, pufp = pfip->fi_list, cufp = cfip->fi_list; fd < nfiles;
fd++, pufp++, cufp++) {
@@ -860,6 +871,7 @@ flist_fork(uf_info_t *pfip, uf_info_t *cfip)
cufp->uf_alloc = pufp->uf_alloc;
cufp->uf_flag = pufp->uf_flag;
cufp->uf_busy = pufp->uf_busy;
+ cufp->uf_gen = pufp->uf_gen;
if (pufp->uf_file == NULL) {
ASSERT(pufp->uf_flag == 0);
if (pufp->uf_busy) {
@@ -1028,6 +1040,9 @@ ufalloc_file(int start, file_t *fp)
fd_reserve(fip, fd, 1);
ASSERT(ufp->uf_file == NULL);
ufp->uf_file = fp;
+ if (fp != NULL) {
+ ufp->uf_gen++;
+ }
UF_EXIT(ufp);
mutex_exit(&fip->fi_lock);
return (fd);
@@ -1183,6 +1198,7 @@ setf(int fd, file_t *fp)
} else {
UF_ENTER(ufp, fip, fd);
ASSERT(ufp->uf_busy);
+ ufp->uf_gen++;
}
ASSERT(ufp->uf_fpollinfo == NULL);
ASSERT(ufp->uf_flag == 0);
@@ -1212,8 +1228,7 @@ f_getfl(int fd, int *flagp)
error = EBADF;
else {
vnode_t *vp = fp->f_vnode;
- int flag = fp->f_flag |
- ((fp->f_flag2 & ~FEPOLLED) << 16);
+ int flag = fp->f_flag | (fp->f_flag2 << 16);
/*
* BSD fcntl() FASYNC compatibility.
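The uf_gen counter added above increments whenever a file is installed in a descriptor slot, so code that must drop and reacquire a descriptor can detect reuse. A sketch of the pattern getf_gen() enables:

    uf_entry_gen_t gen, gen2;
    file_t *fp;

    if ((fp = getf_gen(fd, &gen)) == NULL)
        return (EBADF);
    releasef(fd);
    /* ... blocking work without holding the descriptor ... */
    if ((fp = getf_gen(fd, &gen2)) == NULL || gen2 != gen) {
        /* the slot was closed, or reused for a different file */
        if (fp != NULL)
            releasef(fd);
        return (EBADF);
    }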
diff --git a/usr/src/uts/common/os/fork.c b/usr/src/uts/common/os/fork.c
index a63931459f..7e198910b4 100644
--- a/usr/src/uts/common/os/fork.c
+++ b/usr/src/uts/common/os/fork.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2016, Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -84,6 +84,7 @@ static int64_t cfork(int, int, int);
static int getproc(proc_t **, pid_t, uint_t);
#define GETPROC_USER 0x0
#define GETPROC_KERNEL 0x1
+#define GETPROC_ZSCHED 0x2
static void fork_fail(proc_t *);
static void forklwp_fail(proc_t *);
@@ -705,7 +706,7 @@ fork_fail(proc_t *cp)
if (PTOU(curproc)->u_cwd)
refstr_rele(PTOU(curproc)->u_cwd);
if (PROC_IS_BRANDED(cp)) {
- brand_clearbrand(cp, B_TRUE);
+ brand_clearbrand(cp, B_FALSE);
}
}
@@ -754,7 +755,7 @@ forklwp_fail(proc_t *p)
kmem_free(t->t_door, sizeof (door_data_t));
t->t_door = NULL;
}
- lwp_ctmpl_clear(ttolwp(t));
+ lwp_ctmpl_clear(ttolwp(t), B_FALSE);
/*
* Remove the thread from the all threads list.
@@ -791,6 +792,9 @@ extern struct as kas;
/*
* fork a kernel process.
+ *
+ * Passing a pid argument of -1 indicates that the new process should be
+ * launched as a child of 'zsched' within the zone.
*/
int
newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
@@ -809,6 +813,7 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
rctl_set_t *init_set;
ASSERT(pid != 1);
+ ASSERT(pid >= 0);
if (getproc(&p, pid, GETPROC_KERNEL) < 0)
return (EAGAIN);
@@ -852,8 +857,18 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
rctl_set_t *init_set;
task_t *tk, *tk_old;
klwp_t *lwp;
+ boolean_t pzsched = B_FALSE;
+ int flag = GETPROC_USER;
+
+ /* Handle a new user-level thread as child of zsched. */
+ if (pid < 0) {
+ VERIFY(curzone != global_zone);
+ flag = GETPROC_ZSCHED;
+ pzsched = B_TRUE;
+ pid = 0;
+ }
- if (getproc(&p, pid, GETPROC_USER) < 0)
+ if (getproc(&p, pid, flag) < 0)
return (EAGAIN);
/*
* init creates a new task, distinct from the task
@@ -914,7 +929,8 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
}
t = lwptot(lwp);
- ctp = contract_process_fork(sys_process_tmpl, p, curproc,
+ ctp = contract_process_fork(sys_process_tmpl, p,
+ (pzsched ? curproc->p_zone->zone_zsched : curproc),
B_FALSE);
ASSERT(ctp != NULL);
if (ct != NULL)
@@ -955,7 +971,11 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)
if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
return (-1); /* no point in starting new processes */
- pp = (flags & GETPROC_KERNEL) ? &p0 : curproc;
+ if (flags & GETPROC_ZSCHED) {
+ pp = curproc->p_zone->zone_zsched;
+ } else {
+ pp = (flags & GETPROC_KERNEL) ? &p0 : curproc;
+ }
task = pp->p_task;
proj = task->tk_proj;
zone = pp->p_zone;
@@ -1016,6 +1036,9 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)
cp->p_t1_lgrpid = LGRP_NONE;
cp->p_tr_lgrpid = LGRP_NONE;
+ /* Default to native brand initially */
+ cp->p_brand = &native_brand;
+
if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) {
if (nproc == v.v_proc) {
CPU_STATS_ADDQ(CPU, sys, procovf, 1);
@@ -1083,9 +1106,6 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)
cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD);
cp->p_sessp = pp->p_sessp;
sess_hold(pp);
- cp->p_brand = pp->p_brand;
- if (PROC_IS_BRANDED(pp))
- BROP(pp)->b_copy_procdata(cp, pp);
cp->p_bssbase = pp->p_bssbase;
cp->p_brkbase = pp->p_brkbase;
cp->p_brksize = pp->p_brksize;
@@ -1170,6 +1190,18 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)
mutex_exit(&cp->p_lock);
mutex_exit(&pidlock);
+ if (PROC_IS_BRANDED(pp)) {
+ /*
+		 * Process branding can only fail when the operation is
+		 * complicated by the presence of multiple LWPs. With an
+		 * LWP count of 0, this newly allocated process has no
+		 * reason to fail branding.
+ */
+ VERIFY0(brand_setbrand(cp, B_FALSE));
+
+ BROP(pp)->b_copy_procdata(cp, pp);
+ }
+
avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t),
offsetof(contract_t, ct_ctlist));
@@ -1187,6 +1219,7 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)
*/
fcnt_add(P_FINFO(pp), 1);
+ mutex_enter(&pp->p_lock);
if (PTOU(pp)->u_cdir) {
VN_HOLD(PTOU(pp)->u_cdir);
} else {
@@ -1200,6 +1233,7 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)
VN_HOLD(PTOU(pp)->u_rdir);
if (PTOU(pp)->u_cwd)
refstr_hold(PTOU(pp)->u_cwd);
+ mutex_exit(&pp->p_lock);
/*
* copy the parent's uarea.
diff --git a/usr/src/uts/common/os/grow.c b/usr/src/uts/common/os/grow.c
index de2a4f26c4..07fd623a95 100644
--- a/usr/src/uts/common/os/grow.c
+++ b/usr/src/uts/common/os/grow.c
@@ -21,7 +21,7 @@
/*
* Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -55,6 +55,7 @@
#include <sys/fcntl.h>
#include <sys/lwpchan_impl.h>
#include <sys/nbmlock.h>
+#include <sys/brand.h>
#include <vm/hat.h>
#include <vm/as.h>
@@ -570,6 +571,20 @@ choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,
return (0);
}
+caddr_t
+map_userlimit(proc_t *pp, struct as *as, int flags)
+{
+ if (flags & _MAP_LOW32) {
+ if (PROC_IS_BRANDED(pp) && BROP(pp)->b_map32limit != NULL) {
+ return ((caddr_t)(uintptr_t)BROP(pp)->b_map32limit(pp));
+ } else {
+ return ((caddr_t)_userlimit32);
+ }
+ }
+
+ return (as->a_userlimit);
+}
+
/*
* Used for MAP_ANON - fast way to get anonymous pages
@@ -585,8 +600,6 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
return (EACCES);
if ((flags & MAP_FIXED) != 0) {
- caddr_t userlimit;
-
/*
* Use the user address. First verify that
* the address to be used is page aligned.
@@ -595,9 +608,8 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
return (EINVAL);
- userlimit = flags & _MAP_LOW32 ?
- (caddr_t)USERLIMIT32 : as->a_userlimit;
- switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
+ switch (valid_usr_range(*addrp, len, uprot, as,
+ map_userlimit(as->a_proc, as, flags))) {
case RANGE_OKAY:
break;
case RANGE_BADPROT:
@@ -638,7 +650,7 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
#define RANDOMIZABLE_MAPPING(addr, flags) (((flags & MAP_FIXED) == 0) && \
!(((flags & MAP_ALIGN) == 0) && (addr != 0) && aslr_respect_mmap_hint))
-static int
+int
smmap_common(caddr_t *addrp, size_t len,
int prot, int flags, struct file *fp, offset_t pos)
{
@@ -780,8 +792,6 @@ smmap_common(caddr_t *addrp, size_t len,
* If the user specified an address, do some simple checks here
*/
if ((flags & MAP_FIXED) != 0) {
- caddr_t userlimit;
-
/*
* Use the user address. First verify that
* the address to be used is page aligned.
@@ -789,10 +799,8 @@ smmap_common(caddr_t *addrp, size_t len,
*/
if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
return (EINVAL);
-
- userlimit = flags & _MAP_LOW32 ?
- (caddr_t)USERLIMIT32 : as->a_userlimit;
- switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
+ switch (valid_usr_range(*addrp, len, uprot, as,
+ map_userlimit(curproc, as, flags))) {
case RANGE_OKAY:
break;
case RANGE_BADPROT:
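
The new map_userlimit() gathers the MAP_FIXED upper-bound logic into one
place: when _MAP_LOW32 is set, a branded process may supply its own 32-bit
ceiling through the b_map32limit hook, with USERLIMIT32 as the fallback and
the full address-space limit otherwise. A rough user-space sketch of that
decision; the flag value, proc layout, and hook signature below are
assumptions for illustration, not the kernel's definitions:

    #include <stdint.h>
    #include <stddef.h>

    #define _MAP_LOW32  0x1000         /* assumed flag value */
    #define USERLIMIT32 0xfffff000UL   /* illustrative 32-bit cap */

    typedef struct proc {
            uint32_t (*b_map32limit)(struct proc *); /* NULL if unbranded */
    } proc_t;

    static uintptr_t
    map_userlimit_sketch(proc_t *pp, uintptr_t as_userlimit, int flags)
    {
            if (flags & _MAP_LOW32) {
                    /* A branded process may substitute its own cap. */
                    if (pp->b_map32limit != NULL)
                            return ((uintptr_t)pp->b_map32limit(pp));
                    return ((uintptr_t)USERLIMIT32);
            }
            return (as_userlimit);  /* default: address-space limit */
    }
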
diff --git a/usr/src/uts/common/os/id_space.c b/usr/src/uts/common/os/id_space.c
deleted file mode 100644
index 2dad0cb940..0000000000
--- a/usr/src/uts/common/os/id_space.c
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-#include <sys/types.h>
-#include <sys/id_space.h>
-#include <sys/debug.h>
-
-/*
- * ID Spaces
- *
- * The id_space_t provides a simple implementation of a managed range of
- * integer identifiers using a vmem arena. An ID space guarantees that the
- * next identifer returned by an allocation is larger than the previous one,
- * unless there are no larger slots remaining in the range. In this case,
- * the ID space will return the first available slot in the lower part of the
- * range (viewing the previous identifier as a partitioning element). If no
- * slots are available, id_alloc()/id_allocff() will sleep until an
- * identifier becomes available. Accordingly, id_space allocations must be
- * initiated from contexts where sleeping is acceptable. id_alloc_nosleep()/
- * id_allocff_nosleep() will return -1 if no slots are available or if the
- * system is low on memory. If id_alloc_nosleep() fails, callers should
- * not try to extend the ID space. This is to avoid making a possible
- * low-memory situation worse.
- *
- * As an ID space is designed for representing a range of id_t's, there
- * is a preexisting maximal range: [0, MAXUID]. ID space requests outside
- * that range will fail on a DEBUG kernel. The id_allocff*() functions
- * return the first available id, and should be used when there is benefit
- * to having a compact allocated range.
- *
- * (Presently, the id_space_t abstraction supports only direct allocations; ID
- * reservation, in which an ID is allocated but placed in a internal
- * dictionary for later use, should be added when a consuming subsystem
- * arrives.)
- */
-
-#define ID_TO_ADDR(id) ((void *)(uintptr_t)(id + 1))
-#define ADDR_TO_ID(addr) ((id_t)((uintptr_t)addr - 1))
-
-/*
- * Create an arena to represent the range [low, high).
- * Caller must be in a context in which VM_SLEEP is legal.
- */
-id_space_t *
-id_space_create(const char *name, id_t low, id_t high)
-{
- ASSERT(low >= 0);
- ASSERT(low < high);
-
- return (vmem_create(name, ID_TO_ADDR(low), high - low, 1,
- NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER));
-}
-
-/*
- * Destroy a previously created ID space.
- * No restrictions on caller's context.
- */
-void
-id_space_destroy(id_space_t *isp)
-{
- vmem_destroy(isp);
-}
-
-void
-id_space_extend(id_space_t *isp, id_t low, id_t high)
-{
- (void) vmem_add(isp, ID_TO_ADDR(low), high - low, VM_SLEEP);
-}
-
-/*
- * Allocate an id_t from specified ID space.
- * Caller must be in a context in which VM_SLEEP is legal.
- */
-id_t
-id_alloc(id_space_t *isp)
-{
- return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_NEXTFIT)));
-}
-
-/*
- * Allocate an id_t from specified ID space.
- * Returns -1 on failure (see module block comments for more information on
- * failure modes).
- */
-id_t
-id_alloc_nosleep(id_space_t *isp)
-{
- return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_NEXTFIT)));
-}
-
-/*
- * Allocate an id_t from specified ID space using FIRSTFIT.
- * Caller must be in a context in which VM_SLEEP is legal.
- */
-id_t
-id_allocff(id_space_t *isp)
-{
- return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_FIRSTFIT)));
-}
-
-/*
- * Allocate an id_t from specified ID space using FIRSTFIT
- * Returns -1 on failure (see module block comments for more information on
- * failure modes).
- */
-id_t
-id_allocff_nosleep(id_space_t *isp)
-{
- return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_FIRSTFIT)));
-}
-
-/*
- * Allocate a specific identifier if possible, returning the id if
- * successful, or -1 on failure.
- */
-id_t
-id_alloc_specific_nosleep(id_space_t *isp, id_t id)
-{
- void *minaddr = ID_TO_ADDR(id);
- void *maxaddr = ID_TO_ADDR(id + 1);
-
- /*
- * Note that even though we're vmem_free()ing this later, it
- * should be OK, since there's no quantum cache.
- */
- return (ADDR_TO_ID(vmem_xalloc(isp, 1, 1, 0, 0,
- minaddr, maxaddr, VM_NOSLEEP)));
-}
-
-/*
- * Free a previously allocated ID.
- * No restrictions on caller's context.
- */
-void
-id_free(id_space_t *isp, id_t id)
-{
- vmem_free(isp, ID_TO_ADDR(id), 1);
-}
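
Note that while this file is deleted, the ID space facility itself survives
(other files in this patch still allocate from ID spaces), so the deletion is
presumably a relocation into shared common code rather than a removal. The
implementation's one subtlety is the +1 bias that stores id N as vmem address
N+1, ensuring that id 0 never maps to a NULL vmem address; a stand-alone
round-trip check of that encoding:

    #include <assert.h>
    #include <stdint.h>
    #include <stddef.h>

    #define ID_TO_ADDR(id)   ((void *)(uintptr_t)((id) + 1))
    #define ADDR_TO_ID(addr) ((int)((uintptr_t)(addr) - 1))

    int
    main(void)
    {
            int id;

            for (id = 0; id < 1000; id++)
                    assert(ADDR_TO_ID(ID_TO_ADDR(id)) == id);
            assert(ID_TO_ADDR(0) != NULL);  /* why the bias exists */
            return (0);
    }
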
diff --git a/usr/src/uts/common/os/ipc.c b/usr/src/uts/common/os/ipc.c
index 9381019cd1..6a6f5d84ef 100644
--- a/usr/src/uts/common/os/ipc.c
+++ b/usr/src/uts/common/os/ipc.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -1217,6 +1218,23 @@ ipc_remove(ipc_service_t *service, kipc_perm_t *perm)
(IPC_ZONE_USAGE(perm, service) == 0)));
}
+/*
+ * Perform the actual IPC_RMID, either via ipc_rmid() or as a delayed *_RMID.
+ */
+void
+ipc_rmsvc(ipc_service_t *service, kipc_perm_t *perm)
+{
+ ASSERT(service->ipcs_count > 0);
+ ASSERT(MUTEX_HELD(&service->ipcs_lock));
+
+ ipc_remove(service, perm);
+ mutex_exit(&service->ipcs_lock);
+
+ /* perform any per-service removal actions */
+ service->ipcs_rmid(perm);
+
+ ipc_rele(service, perm);
+}
/*
* Common code to perform an IPC_RMID. Returns an errno value on
@@ -1247,13 +1265,7 @@ ipc_rmid(ipc_service_t *service, int id, cred_t *cr)
/*
* Nothing can fail from this point on.
*/
- ipc_remove(service, perm);
- mutex_exit(&service->ipcs_lock);
-
- /* perform any per-service removal actions */
- service->ipcs_rmid(perm);
-
- ipc_rele(service, perm);
+ ipc_rmsvc(service, perm);
return (0);
}
diff --git a/usr/src/uts/common/os/kmem.c b/usr/src/uts/common/os/kmem.c
index b41ab8c465..9a3692053d 100644
--- a/usr/src/uts/common/os/kmem.c
+++ b/usr/src/uts/common/os/kmem.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2017, Joyent, Inc.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright 2018, Joyent, Inc.
@@ -1011,6 +1012,7 @@ size_t kmem_transaction_log_size; /* transaction log size [2% of memory] */
size_t kmem_content_log_size; /* content log size [2% of memory] */
size_t kmem_failure_log_size; /* failure log [4 pages per CPU] */
size_t kmem_slab_log_size; /* slab create log [4 pages per CPU] */
+size_t kmem_zerosized_log_size; /* zero-sized log [4 pages per CPU] */
size_t kmem_content_maxsave = 256; /* KMF_CONTENTS max bytes to log */
size_t kmem_lite_minsize = 0; /* minimum buffer size for KMF_LITE */
size_t kmem_lite_maxalign = 1024; /* maximum buffer alignment for KMF_LITE */
@@ -1018,6 +1020,14 @@ int kmem_lite_pcs = 4; /* number of PCs to store in KMF_LITE mode */
size_t kmem_maxverify; /* maximum bytes to inspect in debug routines */
size_t kmem_minfirewall; /* hardware-enforced redzone threshold */
+#ifdef DEBUG
+int kmem_warn_zerosized = 1; /* whether to warn on zero-sized KM_SLEEP */
+#else
+int kmem_warn_zerosized = 0; /* whether to warn on zero-sized KM_SLEEP */
+#endif
+
+int kmem_panic_zerosized = 0; /* whether to panic on zero-sized KM_SLEEP */
+
#ifdef _LP64
size_t kmem_max_cached = KMEM_BIG_MAXBUF; /* maximum kmem_alloc cache */
#else
@@ -1098,6 +1108,7 @@ kmem_log_header_t *kmem_transaction_log;
kmem_log_header_t *kmem_content_log;
kmem_log_header_t *kmem_failure_log;
kmem_log_header_t *kmem_slab_log;
+kmem_log_header_t *kmem_zerosized_log;
static int kmem_lite_count; /* # of PCs in kmem_buftag_lite_t */
@@ -2851,8 +2862,33 @@ kmem_alloc(size_t size, int kmflag)
/* fall through to kmem_cache_alloc() */
} else {
- if (size == 0)
+ if (size == 0) {
+ if (kmflag != KM_SLEEP && !(kmflag & KM_PANIC))
+ return (NULL);
+
+ /*
+ * If this is a sleeping allocation or one that has
+ * been specified to panic on allocation failure, we
+ * consider it to be deprecated behavior to allocate
+ * 0 bytes. If we have been configured to panic under
+ * this condition, we panic; if to warn, we warn -- and
+		 * regardless, we log to the kmem_zerosized_log that this
+		 * condition has occurred (which gives us
+ * enough information to be able to debug it).
+ */
+ if (kmem_panic && kmem_panic_zerosized)
+ panic("attempted to kmem_alloc() size of 0");
+
+ if (kmem_warn_zerosized) {
+ cmn_err(CE_WARN, "kmem_alloc(): sleeping "
+ "allocation with size of 0; "
+ "see kmem_zerosized_log for details");
+ }
+
+ kmem_log_event(kmem_zerosized_log, NULL, NULL, NULL);
+
return (NULL);
+ }
buf = vmem_alloc(kmem_oversize_arena, size,
kmflag & KM_VMFLAGS);
@@ -4392,8 +4428,8 @@ kmem_init(void)
}
kmem_failure_log = kmem_log_init(kmem_failure_log_size);
-
kmem_slab_log = kmem_log_init(kmem_slab_log_size);
+ kmem_zerosized_log = kmem_log_init(kmem_zerosized_log_size);
/*
* Initialize STREAMS message caches so allocb() is available.
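
In user-space terms, the new zero-size policy distinguishes non-sleeping
callers, which simply get NULL back, from KM_SLEEP/KM_PANIC callers, which
additionally warn, optionally panic, and always log. A minimal sketch under
assumed flag values (the real kernel also gates the panic on the global
kmem_panic and records each event in kmem_zerosized_log):

    #include <stdio.h>
    #include <stdlib.h>

    #define KM_SLEEP   0x0000
    #define KM_NOSLEEP 0x0001
    #define KM_PANIC   0x0002

    static int warn_zerosized = 1;   /* cf. kmem_warn_zerosized */
    static int panic_zerosized = 0;  /* cf. kmem_panic_zerosized */

    static void *
    alloc_sketch(size_t size, int kmflag)
    {
            if (size == 0) {
                    if (kmflag != KM_SLEEP && !(kmflag & KM_PANIC))
                            return (NULL);  /* quiet failure */
                    if (panic_zerosized)
                            abort();        /* kernel would panic() */
                    if (warn_zerosized)
                            (void) fprintf(stderr,
                                "zero-sized sleeping allocation\n");
                    return (NULL);          /* kernel logs, then fails */
            }
            return (malloc(size));
    }
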
diff --git a/usr/src/uts/common/os/kstat_fr.c b/usr/src/uts/common/os/kstat_fr.c
index 93c04cff8d..b09b2d3558 100644
--- a/usr/src/uts/common/os/kstat_fr.c
+++ b/usr/src/uts/common/os/kstat_fr.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2017, Joyent, Inc. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
@@ -198,6 +198,9 @@ struct {
kstat_named_t pagesfree;
kstat_named_t pageslocked;
kstat_named_t pagestotal;
+ kstat_named_t lowmemscan;
+ kstat_named_t zonecapscan;
+ kstat_named_t nthrottle;
} system_pages_kstat = {
{ "physmem", KSTAT_DATA_ULONG },
{ "nalloc", KSTAT_DATA_ULONG },
@@ -219,6 +222,9 @@ struct {
{ "pagesfree", KSTAT_DATA_ULONG },
{ "pageslocked", KSTAT_DATA_ULONG },
{ "pagestotal", KSTAT_DATA_ULONG },
+ { "low_mem_scan", KSTAT_DATA_ULONG },
+ { "zone_cap_scan", KSTAT_DATA_ULONG },
+ { "n_throttle", KSTAT_DATA_ULONG },
};
static int header_kstat_update(kstat_t *, int);
@@ -912,6 +918,9 @@ system_pages_kstat_update(kstat_t *ksp, int rw)
system_pages_kstat.pageslocked.value.ul = (ulong_t)(availrmem_initial -
availrmem);
system_pages_kstat.pagestotal.value.ul = (ulong_t)total_pages;
+ system_pages_kstat.lowmemscan.value.ul = (ulong_t)low_mem_scan;
+ system_pages_kstat.zonecapscan.value.ul = (ulong_t)zone_cap_scan;
+ system_pages_kstat.nthrottle.value.ul = (ulong_t)n_throttle;
/*
* pp_kernel represents total pages used by the kernel since the
* startup. This formula takes into account the boottime kernel
diff --git a/usr/src/uts/common/os/lgrp.c b/usr/src/uts/common/os/lgrp.c
index 6288f47bed..6f6aced619 100644
--- a/usr/src/uts/common/os/lgrp.c
+++ b/usr/src/uts/common/os/lgrp.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -90,6 +91,7 @@
#include <sys/pg.h>
#include <sys/promif.h>
#include <sys/sdt.h>
+#include <sys/ht.h>
lgrp_gen_t lgrp_gen = 0; /* generation of lgroup hierarchy */
lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
@@ -520,6 +522,8 @@ lgrp_main_mp_init(void)
{
klgrpset_t changed;
+ ht_init();
+
/*
* Update lgroup topology (if necessary)
*/
diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c
index 149f5f8a88..06c03dd38e 100644
--- a/usr/src/uts/common/os/logsubr.c
+++ b/usr/src/uts/common/os/logsubr.c
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2013 Gary Mills
* Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
#include <sys/types.h>
@@ -249,8 +250,7 @@ log_init(void)
*/
printf("\rSunOS Release %s Version %s %u-bit\n",
utsname.release, utsname.version, NBBY * (uint_t)sizeof (void *));
- printf("Copyright (c) 1983, 2010, Oracle and/or its affiliates. "
- "All rights reserved.\n");
+ printf("Copyright (c) 2010-2019, Joyent Inc. All rights reserved.\n");
#ifdef DEBUG
printf("DEBUG enabled\n");
#endif
@@ -491,7 +491,7 @@ log_console(log_t *lp, log_ctl_t *lc)
mblk_t *
log_makemsg(int mid, int sid, int level, int sl, int pri, void *msg,
- size_t size, int on_intr)
+ size_t size, int on_intr)
{
mblk_t *mp = NULL;
mblk_t *mp2;
diff --git a/usr/src/uts/common/os/lwp.c b/usr/src/uts/common/os/lwp.c
index b2adae570f..341e4ae356 100644
--- a/usr/src/uts/common/os/lwp.c
+++ b/usr/src/uts/common/os/lwp.c
@@ -25,7 +25,7 @@
*/
/*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2017, Joyent, Inc.
*/
#include <sys/param.h>
@@ -57,6 +57,8 @@
#include <sys/lgrp.h>
#include <sys/rctl.h>
#include <sys/contract_impl.h>
+#include <sys/contract/process.h>
+#include <sys/contract/process_impl.h>
#include <sys/cpc_impl.h>
#include <sys/sdt.h>
#include <sys/cmn_err.h>
@@ -115,7 +117,7 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p,
ret_tidhash_t *ret_tidhash = NULL;
int i;
int rctlfail = 0;
- boolean_t branded = 0;
+ void *brand_data = NULL;
struct ctxop *ctx = NULL;
ASSERT(cid != sysdccid); /* system threads must start in SYS */
@@ -283,6 +285,19 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p,
*/
lep = kmem_zalloc(sizeof (*lep), KM_SLEEP);
+ /*
+ * If necessary, speculatively allocate lwp brand data. This is done
+ * ahead of time so p_lock need not be dropped during lwp branding.
+ */
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_lwpdata_alloc != NULL) {
+ if ((brand_data = BROP(p)->b_lwpdata_alloc(p)) == NULL) {
+ mutex_enter(&p->p_lock);
+ err = 1;
+ atomic_inc_32(&p->p_zone->zone_ffmisc);
+ goto error;
+ }
+ }
+
mutex_enter(&p->p_lock);
grow:
/*
@@ -630,18 +645,6 @@ grow:
} while (lwp_hash_lookup(p, t->t_tid) != NULL);
}
- /*
- * If this is a branded process, let the brand do any necessary lwp
- * initialization.
- */
- if (PROC_IS_BRANDED(p)) {
- if (BROP(p)->b_initlwp(lwp)) {
- err = 1;
- atomic_inc_32(&p->p_zone->zone_ffmisc);
- goto error;
- }
- branded = 1;
- }
if (t->t_tid == 1) {
kpreempt_disable();
@@ -654,7 +657,6 @@ grow:
}
}
- p->p_lwpcnt++;
t->t_waitfor = -1;
/*
@@ -696,8 +698,27 @@ grow:
t->t_post_sys = 1;
/*
+ * Perform lwp branding
+ *
+ * The b_initlwp hook is _not_ allowed to drop p->p_lock as it must be
+ * continuously held between when the tidhash is sized and when the lwp
+ * is inserted into it. Operations requiring p->p_lock to be
+ * temporarily dropped can be performed in b_initlwp_post.
+ */
+ if (PROC_IS_BRANDED(p)) {
+ BROP(p)->b_initlwp(lwp, brand_data);
+ /*
+ * The b_initlwp hook is expected to consume any preallocated
+ * brand_data in a way that prepares it for deallocation by the
+ * b_freelwp hook.
+ */
+ brand_data = NULL;
+ }
+
+ /*
* Insert the new thread into the list of all threads.
*/
+ p->p_lwpcnt++;
if ((tx = p->p_tlist) == NULL) {
t->t_back = t;
t->t_forw = t;
@@ -718,6 +739,13 @@ grow:
lep->le_start = t->t_start;
lwp_hash_in(p, lep, p->p_tidhash, p->p_tidhash_sz, 1);
+ /*
+ * Complete lwp branding
+ */
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_initlwp_post != NULL) {
+ BROP(p)->b_initlwp_post(lwp);
+ }
+
lwp_fp_init(lwp);
if (state == TS_RUN) {
@@ -755,8 +783,9 @@ error:
if (cid != NOCLASS && bufp != NULL)
CL_FREE(cid, bufp);
- if (branded)
- BROP(p)->b_freelwp(lwp);
+ if (brand_data != NULL) {
+ BROP(p)->b_lwpdata_free(brand_data);
+ }
mutex_exit(&p->p_lock);
t->t_state = TS_FREE;
@@ -829,8 +858,27 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src)
int i;
for (i = 0; i < ct_ntypes; i++) {
- dst->lwp_ct_active[i] = ctmpl_dup(src->lwp_ct_active[i]);
+ ct_template_t *tmpl = src->lwp_ct_active[i];
+
+ /*
+		 * If the process contract template is set up to be preserved
+		 * across exec and we are forking, perform an implicit
+		 * template_clear now. This ensures that future children of
+		 * this child will remain in the same contract unless they're
+		 * explicitly set up differently. We know we're forking if the
+ * two LWPs belong to different processes.
+ */
+ if (i == CTT_PROCESS && tmpl != NULL) {
+ ctmpl_process_t *ctp = tmpl->ctmpl_data;
+
+ if (dst->lwp_procp != src->lwp_procp &&
+ (ctp->ctp_params & CT_PR_KEEP_EXEC) != 0)
+ tmpl = NULL;
+ }
+
+ dst->lwp_ct_active[i] = ctmpl_dup(tmpl);
dst->lwp_ct_latest[i] = NULL;
}
}
@@ -838,21 +886,33 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src)
* Clear an LWP's contract template state.
*/
void
-lwp_ctmpl_clear(klwp_t *lwp)
+lwp_ctmpl_clear(klwp_t *lwp, boolean_t is_exec)
{
ct_template_t *tmpl;
int i;
for (i = 0; i < ct_ntypes; i++) {
- if ((tmpl = lwp->lwp_ct_active[i]) != NULL) {
- ctmpl_free(tmpl);
- lwp->lwp_ct_active[i] = NULL;
- }
-
if (lwp->lwp_ct_latest[i] != NULL) {
contract_rele(lwp->lwp_ct_latest[i]);
lwp->lwp_ct_latest[i] = NULL;
}
+
+ if ((tmpl = lwp->lwp_ct_active[i]) != NULL) {
+ /*
+ * If we're exec-ing a new program and the process
+			 * contract template is set up to be preserved across
+ * exec, then don't clear it.
+ */
+ if (is_exec && i == CTT_PROCESS) {
+ ctmpl_process_t *ctp = tmpl->ctmpl_data;
+
+ if ((ctp->ctp_params & CT_PR_KEEP_EXEC) != 0)
+ continue;
+ }
+
+ ctmpl_free(tmpl);
+ lwp->lwp_ct_active[i] = NULL;
+ }
}
}
@@ -893,13 +953,6 @@ lwp_exit(void)
if (t->t_upimutex != NULL)
upimutex_cleanup();
- /*
- * Perform any brand specific exit processing, then release any
- * brand data associated with the lwp
- */
- if (PROC_IS_BRANDED(p))
- BROP(p)->b_lwpexit(lwp);
-
lwp_pcb_exit();
mutex_enter(&p->p_lock);
@@ -943,6 +996,18 @@ lwp_exit(void)
DTRACE_PROC(lwp__exit);
/*
+ * Perform any brand specific exit processing, then release any
+ * brand data associated with the lwp
+ */
+ if (PROC_IS_BRANDED(p)) {
+ mutex_exit(&p->p_lock);
+ BROP(p)->b_lwpexit(lwp);
+ BROP(p)->b_freelwp(lwp);
+ mutex_enter(&p->p_lock);
+ prbarrier(p);
+ }
+
+ /*
* If the lwp is a detached lwp or if the process is exiting,
* remove (lwp_hash_out()) the lwp from the lwp directory.
* Otherwise null out the lwp's le_thread pointer in the lwp
@@ -1103,7 +1168,7 @@ lwp_cleanup(void)
}
kpreempt_enable();
- lwp_ctmpl_clear(ttolwp(t));
+ lwp_ctmpl_clear(ttolwp(t), B_FALSE);
}
int
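
The relocated lwp branding follows a pattern worth noting: allocate brand
data speculatively before taking p_lock, have b_initlwp consume it inside the
critical section, and free it on the error path only if it was never
consumed, so the lock never needs to be dropped mid-branding. A generic
pthreads rendering of the idiom (names and the 64-byte size are
illustrative):

    #include <pthread.h>
    #include <stdlib.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static void *slot;                      /* cf. the lwp's brand data */

    static int
    attach_brand_data(void)
    {
            void *data = malloc(64);        /* cf. b_lwpdata_alloc() */

            if (data == NULL)
                    return (-1);            /* fail before locking */

            pthread_mutex_lock(&lock);
            if (slot == NULL) {
                    slot = data;            /* cf. b_initlwp() consuming */
                    data = NULL;
            }
            pthread_mutex_unlock(&lock);

            free(data);     /* cf. b_lwpdata_free(): only if unconsumed */
            return (0);
    }
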
diff --git a/usr/src/uts/common/os/main.c b/usr/src/uts/common/os/main.c
index ec61ad5c76..db6d74b2c2 100644
--- a/usr/src/uts/common/os/main.c
+++ b/usr/src/uts/common/os/main.c
@@ -158,7 +158,7 @@ exec_init(const char *initpath, const char *args)
int error = 0, count = 0;
proc_t *p = ttoproc(curthread);
klwp_t *lwp = ttolwp(curthread);
- int brand_action;
+ int brand_action = EBA_NONE;
if (args == NULL)
args = "";
@@ -288,7 +288,15 @@ exec_init(const char *initpath, const char *args)
*/
sigemptyset(&curthread->t_hold);
- brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE;
+ /*
+ * Only instruct exec_common to brand the process if necessary. It is
+ * possible that the init process is already properly branded due to the
+ * proc_exit -> restart_init -> exec_init call chain.
+ */
+ if (ZONE_IS_BRANDED(p->p_zone) &&
+ p->p_brand != p->p_zone->zone_brand) {
+ brand_action = EBA_BRAND;
+ }
again:
error = exec_common((const char *)exec_fnamep,
(const char **)uap, NULL, brand_action);
diff --git a/usr/src/uts/common/os/mem_config.c b/usr/src/uts/common/os/mem_config.c
index 3571747e9c..6be46fa422 100644
--- a/usr/src/uts/common/os/mem_config.c
+++ b/usr/src/uts/common/os/mem_config.c
@@ -21,6 +21,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc.
*/
#include <sys/types.h>
@@ -1638,7 +1639,7 @@ delthr_get_freemem(struct mem_handle *mhp)
* Put pressure on pageout.
*/
page_needfree(free_get);
- cv_signal(&proc_pageout->p_cv);
+ WAKE_PAGEOUT_SCANNER();
mutex_enter(&mhp->mh_mutex);
(void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
diff --git a/usr/src/uts/common/os/mmapobj.c b/usr/src/uts/common/os/mmapobj.c
index 142c10754e..0410e6f47b 100644
--- a/usr/src/uts/common/os/mmapobj.c
+++ b/usr/src/uts/common/os/mmapobj.c
@@ -1381,10 +1381,15 @@ calc_loadable(Ehdr *ehdrp, caddr_t phdrbase, int nphdrs, size_t *len,
}
if (num_segs++ == 0) {
/*
- * The p_vaddr of the first PT_LOAD segment
- * must either be NULL or within the first
- * page in order to be interpreted.
- * Otherwise, its an invalid file.
+ * While ELF doesn't specify the meaning of
+ * p_vaddr for PT_LOAD segments in ET_DYN
+			 * objects, we mandate that it is either NULL or
+ * (to accommodate some historical binaries)
+ * within the first page. (Note that there
+ * exist non-native ET_DYN objects that violate
+ * this constraint that we nonetheless must be
+ * able to execute; see the ET_DYN handling in
+ * mapelfexec() for details.)
*/
if (e_type == ET_DYN &&
((caddr_t)((uintptr_t)vaddr &
diff --git a/usr/src/uts/common/os/modctl.c b/usr/src/uts/common/os/modctl.c
index 35162eb558..c6e9d89d0d 100644
--- a/usr/src/uts/common/os/modctl.c
+++ b/usr/src/uts/common/os/modctl.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2017 Joyent, Inc.
*/
/*
@@ -3469,6 +3470,11 @@ mod_load(struct modctl *mp, int usepath)
retval = install_stubs_by_name(mp, mp->mod_modname);
/*
+ * Perform hotinlines before module is started.
+ */
+ do_hotinlines(mp->mod_mp);
+
+ /*
* Now that the module is loaded, we need to give DTrace
* a chance to notify its providers. This is done via
* the dtrace_modload function pointer.
diff --git a/usr/src/uts/common/os/modsysfile.c b/usr/src/uts/common/os/modsysfile.c
index 8dca86880f..37ac089edf 100644
--- a/usr/src/uts/common/os/modsysfile.c
+++ b/usr/src/uts/common/os/modsysfile.c
@@ -23,6 +23,7 @@
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright 2016 Nexenta Systems, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
#include <sys/types.h>
@@ -57,10 +58,12 @@ struct hwc_class *hcl_head; /* head of list of classes */
static kmutex_t hcl_lock; /* for accessing list of classes */
#define DAFILE "/etc/driver_aliases"
+#define PPTFILE "/etc/ppt_aliases"
#define CLASSFILE "/etc/driver_classes"
#define DACFFILE "/etc/dacf.conf"
static char class_file[] = CLASSFILE;
+static char pptfile[] = PPTFILE;
static char dafile[] = DAFILE;
static char dacffile[] = DACFFILE;
@@ -2136,14 +2139,13 @@ hwc_parse_now(char *fname, struct par_list **pl, ddi_prop_t **props)
return (0); /* always return success */
}
-void
-make_aliases(struct bind **bhash)
+static void
+parse_aliases(struct bind **bhash, struct _buf *file)
{
enum {
AL_NEW, AL_DRVNAME, AL_DRVNAME_COMMA, AL_ALIAS, AL_ALIAS_COMMA
} state;
- struct _buf *file;
char tokbuf[MAXPATHLEN];
char drvbuf[MAXPATHLEN];
token_t token;
@@ -2152,9 +2154,6 @@ make_aliases(struct bind **bhash)
static char dupwarn[] = "!Driver alias \"%s\" conflicts with "
"an existing driver name or alias.";
- if ((file = kobj_open_file(dafile)) == (struct _buf *)-1)
- return;
-
state = AL_NEW;
major = DDI_MAJOR_T_NONE;
while (!done) {
@@ -2239,8 +2238,22 @@ make_aliases(struct bind **bhash)
kobj_file_err(CE_WARN, file, tok_err, tokbuf);
}
}
+}
- kobj_close_file(file);
+void
+make_aliases(struct bind **bhash)
+{
+ struct _buf *file;
+
+ if ((file = kobj_open_file(pptfile)) != (struct _buf *)-1) {
+ parse_aliases(bhash, file);
+ kobj_close_file(file);
+ }
+
+ if ((file = kobj_open_file(dafile)) != (struct _buf *)-1) {
+ parse_aliases(bhash, file);
+ kobj_close_file(file);
+ }
}
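
The refactor's shape: the tokenizer now operates on an already-open file, and
make_aliases() walks each alias file in turn, silently skipping any that are
absent. The same structure in stdio terms, with fopen()/fclose() standing in
for kobj_open_file()/kobj_close_file():

    #include <stdio.h>

    static void
    parse_aliases_sketch(FILE *fp)
    {
            (void) fp;      /* tokenizer elided */
    }

    static void
    make_aliases_sketch(void)
    {
            const char *files[] = {
                    "/etc/ppt_aliases", "/etc/driver_aliases"
            };
            unsigned i;

            for (i = 0; i < sizeof (files) / sizeof (files[0]); i++) {
                    FILE *fp = fopen(files[i], "r");

                    if (fp == NULL)
                            continue;  /* a missing file is not an error */
                    parse_aliases_sketch(fp);
                    (void) fclose(fp);
            }
    }
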
diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c
index b555bb82b7..eba6147fab 100644
--- a/usr/src/uts/common/os/pid.c
+++ b/usr/src/uts/common/os/pid.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -112,6 +113,18 @@ pid_lookup(pid_t pid)
return (pidp);
}
+struct pid *
+pid_find(pid_t pid)
+{
+ struct pid *pidp;
+
+ mutex_enter(&pidlinklock);
+ pidp = pid_lookup(pid);
+ mutex_exit(&pidlinklock);
+
+ return (pidp);
+}
+
void
pid_setmin(void)
{
@@ -522,6 +535,20 @@ sprunlock(proc_t *p)
THREAD_KPRI_RELEASE();
}
+/*
+ * Undo effects of sprlock but without dropping p->p_lock
+ */
+void
+sprunprlock(proc_t *p)
+{
+ ASSERT(p->p_proc_flag & P_PR_LOCK);
+ ASSERT(MUTEX_HELD(&p->p_lock));
+
+ cv_signal(&pr_pid_cv[p->p_slot]);
+ p->p_proc_flag &= ~P_PR_LOCK;
+ THREAD_KPRI_RELEASE();
+}
+
void
pid_init(void)
{
diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c
index d6821c83b0..8cc7f009a3 100644
--- a/usr/src/uts/common/os/policy.c
+++ b/usr/src/uts/common/os/policy.c
@@ -56,6 +56,7 @@
#include <sys/mntent.h>
#include <sys/contract_impl.h>
#include <sys/dld_ioc.h>
+#include <sys/brand.h>
/*
* There are two possible layers of privilege routines and two possible
@@ -1244,6 +1245,22 @@ secpolicy_vnode_owner(const cred_t *cr, uid_t owner)
void
secpolicy_setid_clear(vattr_t *vap, cred_t *cr)
{
+ proc_t *p = curproc;
+
+ /*
+ * Allow the brand to override this behaviour.
+ */
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_setid_clear != NULL) {
+ /*
+ * This brand hook will return 0 if handling is complete, or
+ * some other value if the brand would like us to fall back to
+ * the usual behaviour.
+ */
+ if (BROP(p)->b_setid_clear(vap, cr) == 0) {
+ return;
+ }
+ }
+
if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 &&
secpolicy_vnode_setid_retain(cr,
(vap->va_mode & S_ISUID) != 0 &&
@@ -2092,6 +2109,13 @@ secpolicy_meminfo(const cred_t *cr)
}
int
+secpolicy_fs_import(const cred_t *cr)
+{
+ return (PRIV_POLICY(cr, PRIV_SYS_FS_IMPORT, B_FALSE, EPERM, NULL));
+}
+
+
+int
secpolicy_pfexec_register(const cred_t *cr)
{
return (PRIV_POLICY(cr, PRIV_SYS_ADMIN, B_TRUE, EPERM, NULL));
@@ -2607,3 +2631,11 @@ secpolicy_ppp_config(const cred_t *cr)
return (secpolicy_net_config(cr, B_FALSE));
return (PRIV_POLICY(cr, PRIV_SYS_PPP_CONFIG, B_FALSE, EPERM, NULL));
}
+
+int
+secpolicy_hyprlofs_control(const cred_t *cr)
+{
+ if (PRIV_POLICY(cr, PRIV_HYPRLOFS_CONTROL, B_FALSE, EPERM, NULL))
+ return (EPERM);
+ return (0);
+}
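
secpolicy_setid_clear() adopts the hook-with-fallback convention seen
throughout this patch: a brand hook returning 0 has handled the operation
completely; any other value means fall through to the stock behaviour.
Schematically, with generic types rather than the kernel's:

    #include <stddef.h>

    typedef int (*brand_hook_t)(void *);

    static void
    apply_with_override(brand_hook_t hook, void *arg,
        void (*fallback)(void *))
    {
            /* 0 from the hook means "fully handled". */
            if (hook != NULL && hook(arg) == 0)
                    return;
            fallback(arg);
    }
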
diff --git a/usr/src/uts/common/os/priv_defs b/usr/src/uts/common/os/priv_defs
index bc1787c9ca..854fb602da 100644
--- a/usr/src/uts/common/os/priv_defs
+++ b/usr/src/uts/common/os/priv_defs
@@ -177,6 +177,10 @@ privilege PRIV_GRAPHICS_MAP
Allows a process to perform privileged mappings through a
graphics device.
+privilege PRIV_HYPRLOFS_CONTROL
+
+ Allows a process to manage hyprlofs entries.
+
privilege PRIV_IPC_DAC_READ
Allows a process to read a System V IPC
@@ -377,6 +381,10 @@ privilege PRIV_SYS_DEVICES
Allows a process to open the real console device directly.
Allows a process to open devices that have been exclusively opened.
+privilege PRIV_SYS_FS_IMPORT
+
+ Allows a process to import a potentially untrusted file system.
+
privilege PRIV_SYS_IPC_CONFIG
Allows a process to increase the size of a System V IPC Message
diff --git a/usr/src/uts/common/os/rctl.c b/usr/src/uts/common/os/rctl.c
index 09b80323d5..e0a1126567 100644
--- a/usr/src/uts/common/os/rctl.c
+++ b/usr/src/uts/common/os/rctl.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2017, Joyent, Inc.
*/
#include <sys/atomic.h>
@@ -194,6 +195,8 @@ id_space_t *rctl_ids;
kmem_cache_t *rctl_cache; /* kmem cache for rctl structures */
kmem_cache_t *rctl_val_cache; /* kmem cache for rctl values */
+extern rctl_hndl_t rc_process_maxlockedmem;
+
kmutex_t rctl_lists_lock;
rctl_dict_entry_t *rctl_lists[RC_MAX_ENTITY + 1];
@@ -2872,12 +2875,12 @@ rctl_init(void)
* rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
* int chargeproc)
*
- * Increments the amount of locked memory on a project, and
- * zone. If proj is non-NULL the project must be held by the
- * caller; if it is NULL the proj and zone of proc_t p are used.
- * If chargeproc is non-zero, then the charged amount is cached
- * on p->p_locked_mem so that the charge can be migrated when a
- * process changes projects.
+ * Increments the amount of locked memory on a process, project, and
+ * zone. If 'proj' is non-NULL, the project must be held by the
+ * caller; if it is NULL, the project and zone of process 'p' are used.
+ * If 'chargeproc' is non-zero, then the charged amount is added
+ * to p->p_locked_mem. This is also used so that the charge can be
+ * migrated when a process changes projects.
*
* Return values
* 0 - success
@@ -2895,6 +2898,7 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
ASSERT(p != NULL);
ASSERT(MUTEX_HELD(&p->p_lock));
+
if (proj != NULL) {
projp = proj;
zonep = proj->kpj_zone;
@@ -2938,11 +2942,23 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
}
}
- zonep->zone_locked_mem += inc;
- projp->kpj_data.kpd_locked_mem += inc;
if (chargeproc != 0) {
+ /* Check for overflow */
+ if ((p->p_locked_mem + inc) < p->p_locked_mem) {
+ ret = EAGAIN;
+ goto out;
+ }
+ if (rctl_test_entity(rc_process_maxlockedmem, p->p_rctls, p,
+ &e, inc, 0) & RCT_DENY) {
+ ret = EAGAIN;
+ goto out;
+ }
+
p->p_locked_mem += inc;
}
+
+ zonep->zone_locked_mem += inc;
+ projp->kpj_data.kpd_locked_mem += inc;
out:
mutex_exit(&zonep->zone_mem_lock);
return (ret);
diff --git a/usr/src/uts/common/os/rctl_proc.c b/usr/src/uts/common/os/rctl_proc.c
index 9b7324fe7b..c62540d2b4 100644
--- a/usr/src/uts/common/os/rctl_proc.c
+++ b/usr/src/uts/common/os/rctl_proc.c
@@ -21,6 +21,7 @@
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2017 Joyent, Inc.
*/
#include <sys/types.h>
@@ -32,6 +33,7 @@
#include <sys/port_kernel.h>
#include <sys/signal.h>
#include <sys/var.h>
+#include <sys/policy.h>
#include <sys/vmparam.h>
#include <sys/machparam.h>
@@ -66,6 +68,7 @@ rctl_hndl_t rc_process_semmsl;
rctl_hndl_t rc_process_semopm;
rctl_hndl_t rc_process_portev;
rctl_hndl_t rc_process_sigqueue;
+rctl_hndl_t rc_process_maxlockedmem;
/*
* process.max-cpu-time / RLIMIT_CPU
@@ -212,6 +215,26 @@ static rctl_ops_t proc_vmem_ops = {
};
/*
+ * process.max-locked-memory
+ */
+/*ARGSUSED*/
+static int
+proc_maxlockedmem_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e,
+ struct rctl_val *rv, rctl_qty_t i, uint_t f)
+{
+ if (secpolicy_lock_memory(CRED()) == 0)
+ return (0);
+ return ((p->p_locked_mem + i) > rv->rcv_value);
+}
+
+static rctl_ops_t proc_maxlockedmem_ops = {
+ rcop_no_action,
+ rcop_no_usage,
+ rcop_no_set,
+ proc_maxlockedmem_test
+};
+
+/*
* void rctlproc_default_init()
*
* Overview
@@ -383,6 +406,11 @@ rctlproc_init(void)
rctl_add_default_limit("process.max-sigqueue-size",
_SIGQUEUE_SIZE_PRIVILEGED, RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY);
+ rc_process_maxlockedmem = rctl_register("process.max-locked-memory",
+ RCENTITY_PROCESS, RCTL_GLOBAL_LOWERABLE | RCTL_GLOBAL_DENY_ALWAYS |
+ RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_BYTES,
+ ULONG_MAX, UINT32_MAX, &proc_maxlockedmem_ops);
+
/*
* Place minimal set of controls on "sched" process for inheritance by
* processes created via newproc().
diff --git a/usr/src/uts/common/os/sched.c b/usr/src/uts/common/os/sched.c
index c1d6569f11..15e77d39f7 100644
--- a/usr/src/uts/common/os/sched.c
+++ b/usr/src/uts/common/os/sched.c
@@ -27,6 +27,10 @@
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
+/*
+ * Copyright (c) 2015, Joyent, Inc. All rights reserved.
+ */
+
#include <sys/param.h>
#include <sys/types.h>
#include <sys/sysmacros.h>
@@ -646,16 +650,17 @@ top:
klwp_t *lwp = ttolwp(tp);
/*
- * Swapout eligible lwps (specified by the scheduling
- * class) which don't have TS_DONT_SWAP set. Set the
- * "intent to swap" flag (TS_SWAPENQ) on threads
- * which have TS_DONT_SWAP set so that they can be
+ * Swapout eligible lwps (specified by the scheduling class)
+ * which don't have TS_DONT_SWAP set. Set the "intent to swap"
+ * flag (TS_SWAPENQ) on threads which have either TS_DONT_SWAP
+ * set or are currently on a split stack so that they can be
* swapped if and when they reach a safe point.
*/
thread_lock(tp);
thread_pri = CL_SWAPOUT(tp, swapflags);
if (thread_pri != -1) {
- if (tp->t_schedflag & TS_DONT_SWAP) {
+ if ((tp->t_schedflag & TS_DONT_SWAP) ||
+ (tp->t_flag & T_SPLITSTK)) {
tp->t_schedflag |= TS_SWAPENQ;
tp->t_trapret = 1;
aston(tp);
diff --git a/usr/src/uts/common/os/schedctl.c b/usr/src/uts/common/os/schedctl.c
index 5721083751..18b396a765 100644
--- a/usr/src/uts/common/os/schedctl.c
+++ b/usr/src/uts/common/os/schedctl.c
@@ -22,6 +22,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
#include <sys/types.h>
@@ -327,12 +328,17 @@ schedctl_sigblock(kthread_t *t)
/*
- * If the sc_sigblock field is set for the specified thread, set
- * its signal mask to block all maskable signals, then clear the
- * sc_sigblock field. This finishes what user-level code requested
- * to be done when it set tdp->sc_shared->sc_sigblock non-zero.
- * Called from signal-related code either by the current thread for
- * itself or by a thread that holds the process's p_lock (/proc code).
+ * If the sc_sigblock field is set for the specified thread, set its signal
+ * mask to block all maskable signals, then clear the sc_sigblock field. This
+ * accomplishes what user-level code requested to be done when it set
+ * tdp->sc_shared->sc_sigblock non-zero.
+ *
+ * This is generally called by signal-related code in the current thread. In
+ * order to call against a thread other than curthread, p_lock for the
+ * containing process must be held. Even then, the caller is not protected
+ * from races with the thread in question updating its own fields. It is the
+ * responsibility of the caller to perform additional synchronization.
+ */
void
schedctl_finish_sigblock(kthread_t *t)
diff --git a/usr/src/uts/common/os/shm.c b/usr/src/uts/common/os/shm.c
index bacc595f78..5deae96d73 100644
--- a/usr/src/uts/common/os/shm.c
+++ b/usr/src/uts/common/os/shm.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -319,6 +320,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)
size_t share_size;
struct shm_data ssd;
uintptr_t align_hint;
+ long curprot;
/*
* Pick a share pagesize to use, if (!isspt(sp)).
@@ -453,6 +455,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)
}
}
+ curprot = sp->shm_opts & SHM_PROT_MASK;
if (!isspt(sp)) {
error = sptcreate(size, &segspt, sp->shm_amp, prot,
flags, share_szc);
@@ -462,8 +465,8 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)
}
sp->shm_sptinfo->sptas = segspt->s_as;
sp->shm_sptseg = segspt;
- sp->shm_sptprot = prot;
- } else if ((prot & sp->shm_sptprot) != sp->shm_sptprot) {
+ sp->shm_opts = (sp->shm_opts & ~SHM_PROT_MASK) | prot;
+ } else if ((prot & curprot) != curprot) {
/*
* Ensure we're attaching to an ISM segment with
* fewer or equal permissions than what we're
@@ -748,6 +751,23 @@ shmctl(int shmid, int cmd, void *arg)
}
break;
+ /* Stage segment for removal, but don't remove until last detach */
+ case SHM_RMID:
+ if ((error = secpolicy_ipc_owner(cr, (kipc_perm_t *)sp)) != 0)
+ break;
+
+ /*
+		 * If attached, just mark it as a pending remove; otherwise
+ * we must perform the normal ipc_rmid now.
+ */
+ if ((sp->shm_perm.ipc_ref - 1) > 0) {
+ sp->shm_opts |= SHM_RM_PENDING;
+ } else {
+ mutex_exit(lock);
+ return (ipc_rmid(shm_svc, shmid, cr));
+ }
+ break;
+
default:
error = EINVAL;
break;
@@ -778,6 +798,23 @@ shm_detach(proc_t *pp, segacct_t *sap)
sp->shm_ismattch--;
sp->shm_dtime = gethrestime_sec();
sp->shm_lpid = pp->p_pid;
+ if ((sp->shm_opts & SHM_RM_PENDING) != 0 &&
+ sp->shm_perm.ipc_ref == 2) {
+ /*
+ * If this is the last detach of the segment across the whole
+		 * system, we can now perform the delayed IPC_RMID.
+ * The ipc_ref count has 1 for the original 'get' and one for
+ * each 'attach' (see 'stat' handling in shmctl).
+ */
+ sp->shm_opts &= ~SHM_RM_PENDING;
+ mutex_enter(&shm_svc->ipcs_lock);
+ ipc_rmsvc(shm_svc, (kipc_perm_t *)sp); /* Drops lock */
+ ASSERT(!MUTEX_HELD(&shm_svc->ipcs_lock));
+ ASSERT(((kipc_perm_t *)sp)->ipc_ref > 0);
+
+ /* Lock was dropped, need to retake it for following rele. */
+ (void) ipc_lock(shm_svc, sp->shm_perm.ipc_id);
+ }
ipc_rele(shm_svc, (kipc_perm_t *)sp); /* Drops lock */
kmem_free(sap, sizeof (segacct_t));
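
From userland, the staged removal looks like this: an SHM_RMID issued while
attachments exist only marks the segment, and the final shmdt() performs the
real IPC_RMID. A sketch of the sequence; SHM_RMID is the platform-specific
command this patch introduces (not portable System V IPC), and error handling
is elided:

    #include <sys/ipc.h>
    #include <sys/shm.h>
    #include <stddef.h>

    int
    main(void)
    {
            int id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);
            void *p = shmat(id, NULL, 0);

            (void) shmctl(id, SHM_RMID, NULL); /* staged: still attached */
            /* ... segment remains usable through 'p' ... */
            (void) shmdt(p);        /* last detach performs the removal */
            return (0);
    }
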
diff --git a/usr/src/uts/common/os/sig.c b/usr/src/uts/common/os/sig.c
index 453b1f22d4..67a93581dd 100644
--- a/usr/src/uts/common/os/sig.c
+++ b/usr/src/uts/common/os/sig.c
@@ -22,7 +22,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2017, Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -60,6 +60,7 @@
#include <sys/cyclic.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
+#include <sys/brand.h>
#include <sys/signalfd.h>
const k_sigset_t nullsmask = {0, 0, 0};
@@ -148,6 +149,21 @@ signal_is_blocked(kthread_t *t, int sig)
}
/*
+ * Return true if the signal can safely be ignored.
+ * That is, if the signal is included in the p_ignore mask and doing so is not
+ * forbidden by any process branding.
+ */
+static int
+sig_ignorable(proc_t *p, klwp_t *lwp, int sig)
+{
+ return (sigismember(&p->p_ignore, sig) && /* sig in ignore mask */
+ !(PROC_IS_BRANDED(p) && /* allowed by brand */
+ BROP(p)->b_sig_ignorable != NULL &&
+ BROP(p)->b_sig_ignorable(p, lwp, sig) == B_FALSE));
+}
+
+/*
* Return true if the signal can safely be discarded on generation.
* That is, if there is no need for the signal on the receiving end.
* The answer is true if the process is a zombie or
@@ -159,12 +175,13 @@ signal_is_blocked(kthread_t *t, int sig)
* the signal is not being accepted via sigwait()
*/
static int
-sig_discardable(proc_t *p, int sig)
+sig_discardable(proc_t *p, kthread_t *tp, int sig)
{
kthread_t *t = p->p_tlist;
+ klwp_t *lwp = (tp == NULL) ? NULL : tp->t_lwp;
return (t == NULL || /* if zombie or ... */
- (sigismember(&p->p_ignore, sig) && /* signal is ignored */
+ (sig_ignorable(p, lwp, sig) && /* signal is ignored */
t->t_forw == t && /* and single-threaded */
!tracing(p, sig) && /* and no /proc tracing */
!signal_is_blocked(t, sig) && /* and signal not blocked */
@@ -200,7 +217,7 @@ eat_signal(kthread_t *t, int sig)
!(ttoproc(t)->p_proc_flag & P_PR_LOCK)) {
ttoproc(t)->p_stopsig = 0;
t->t_dtrace_stop = 0;
- t->t_schedflag |= TS_XSTART | TS_PSTART;
+ t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART;
setrun_locked(t);
} else if (t != curthread && t->t_state == TS_ONPROC) {
aston(t); /* make it do issig promptly */
@@ -297,7 +314,7 @@ sigtoproc(proc_t *p, kthread_t *t, int sig)
}
}
- if (sig_discardable(p, sig)) {
+ if (sig_discardable(p, t, sig)) {
DTRACE_PROC3(signal__discard, kthread_t *, p->p_tlist,
proc_t *, p, int, sig);
return;
@@ -497,7 +514,7 @@ issig_justlooking(void)
if (sigismember(&set, sig) &&
(tracing(p, sig) ||
sigismember(&t->t_sigwait, sig) ||
- !sigismember(&p->p_ignore, sig))) {
+ !sig_ignorable(p, lwp, sig))) {
/*
* Don't promote a signal that will stop
* the process when lwp_nostop is set.
@@ -623,6 +640,28 @@ issig_forreal(void)
}
/*
+ * The brand hook name 'b_issig_stop' is a misnomer.
+ * Allow the brand the chance to alter (or suppress) delivery
+ * of this signal.
+ */
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_issig_stop != NULL) {
+ int r;
+
+ /*
+ * The brand hook will return 0 if it would like
+ * us to drive on, -1 if we should restart
+ * the loop to check other conditions, or 1 if we
+ * should terminate the loop.
+ */
+ r = BROP(p)->b_issig_stop(p, lwp);
+ if (r < 0) {
+ continue;
+ } else if (r > 0) {
+ break;
+ }
+ }
+
+ /*
* Honor requested stop before dealing with the
* current signal; a debugger may change it.
* Do not want to go back to loop here since this is a special
@@ -656,7 +695,7 @@ issig_forreal(void)
lwp->lwp_cursig = 0;
lwp->lwp_extsig = 0;
if (sigismember(&t->t_sigwait, sig) ||
- (!sigismember(&p->p_ignore, sig) &&
+ (!sig_ignorable(p, lwp, sig) &&
!isjobstop(sig))) {
if (p->p_flag & (SEXITLWPS|SKILLED)) {
sig = SIGKILL;
@@ -708,7 +747,7 @@ issig_forreal(void)
toproc = 0;
if (tracing(p, sig) ||
sigismember(&t->t_sigwait, sig) ||
- !sigismember(&p->p_ignore, sig)) {
+ !sig_ignorable(p, lwp, sig)) {
if (sigismember(&t->t_extsig, sig))
ext = 1;
break;
@@ -722,7 +761,7 @@ issig_forreal(void)
toproc = 1;
if (tracing(p, sig) ||
sigismember(&t->t_sigwait, sig) ||
- !sigismember(&p->p_ignore, sig)) {
+ !sig_ignorable(p, lwp, sig)) {
if (sigismember(&p->p_extsig, sig))
ext = 1;
break;
@@ -954,6 +993,16 @@ stop(int why, int what)
}
break;
+ case PR_BRAND:
+ /*
+ * We have been stopped by the brand code for a brand-private
+ * reason. This is an asynchronous stop affecting only this
+ * LWP.
+ */
+ VERIFY(PROC_IS_BRANDED(p));
+ flags &= ~TS_BSTART;
+ break;
+
default: /* /proc stop */
flags &= ~TS_PSTART;
/*
@@ -1065,7 +1114,7 @@ stop(int why, int what)
}
}
- if (why != PR_JOBCONTROL && why != PR_CHECKPOINT) {
+ if (why != PR_JOBCONTROL && why != PR_CHECKPOINT && why != PR_BRAND) {
/*
* Do process-level notification when all lwps are
* either stopped on events of interest to /proc
@@ -1171,6 +1220,13 @@ stop(int why, int what)
if (why == PR_CHECKPOINT)
del_one_utstop();
+ /*
+ * Allow the brand to post notification of this stop condition.
+ */
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_stop_notify != NULL) {
+ BROP(p)->b_stop_notify(p, lwp, why, what);
+ }
+
thread_lock(t);
ASSERT((t->t_schedflag & TS_ALLSTART) == 0);
t->t_schedflag |= flags;
@@ -1192,7 +1248,7 @@ stop(int why, int what)
(p->p_flag & (SEXITLWPS|SKILLED))) {
p->p_stopsig = 0;
thread_lock(t);
- t->t_schedflag |= TS_XSTART | TS_PSTART;
+ t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART;
setrun_locked(t);
thread_unlock_nopreempt(t);
} else if (why == PR_JOBCONTROL) {
@@ -1327,7 +1383,7 @@ psig(void)
* this signal from pending to current (we dropped p->p_lock).
* This can happen only in a multi-threaded process.
*/
- if (sigismember(&p->p_ignore, sig) ||
+ if (sig_ignorable(p, lwp, sig) ||
(func == SIG_DFL && sigismember(&stopdefault, sig))) {
lwp->lwp_cursig = 0;
lwp->lwp_extsig = 0;
@@ -1771,9 +1827,12 @@ post_sigcld(proc_t *cp, sigqueue_t *sqp)
/*
* This can only happen when the parent is init.
* (See call to sigcld(q, NULL) in exit().)
- * Use KM_NOSLEEP to avoid deadlock.
+			 * Use KM_NOSLEEP to avoid deadlock. The child proc's
+			 * initpid can be 1 for zlogin.
*/
- ASSERT(pp == proc_init);
+ ASSERT(pp->p_pidp->pid_id ==
+ cp->p_zone->zone_proc_initpid ||
+ pp->p_pidp->pid_id == 1);
winfo(cp, &info, 0);
sigaddq(pp, NULL, &info, KM_NOSLEEP);
} else {
@@ -1804,6 +1863,15 @@ sigcld_repost()
sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
mutex_enter(&pidlock);
+ if (PROC_IS_BRANDED(pp) && BROP(pp)->b_sigcld_repost != NULL) {
+ /*
+ * Allow the brand to inject synthetic SIGCLD signals.
+ */
+ if (BROP(pp)->b_sigcld_repost(pp, sqp) == 0) {
+ mutex_exit(&pidlock);
+ return;
+ }
+ }
for (cp = pp->p_child; cp; cp = cp->p_sibling) {
if (cp->p_pidflag & CLDPEND) {
post_sigcld(cp, sqp);
@@ -2115,7 +2183,7 @@ sigaddqa(proc_t *p, kthread_t *t, sigqueue_t *sigqp)
ASSERT(MUTEX_HELD(&p->p_lock));
ASSERT(sig >= 1 && sig < NSIG);
- if (sig_discardable(p, sig))
+ if (sig_discardable(p, t, sig))
siginfofree(sigqp);
else
sigaddqins(p, t, sigqp);
@@ -2141,7 +2209,7 @@ sigaddq(proc_t *p, kthread_t *t, k_siginfo_t *infop, int km_flags)
* blocking the signal (it *could* change it's mind while
* the signal is pending) then don't bother creating one.
*/
- if (!sig_discardable(p, sig) &&
+ if (!sig_discardable(p, t, sig) &&
(sigismember(&p->p_siginfo, sig) ||
(curproc->p_ct_process != p->p_ct_process) ||
(sig == SIGCLD && SI_FROMKERNEL(infop))) &&
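
Untangling the double negative in sig_ignorable(): a signal is ignorable when
it is in the ignore mask and no brand hook vetoes the ignore. As a truth
function over the four inputs (a hand-reduced rendering, not kernel code):

    #include <stdbool.h>

    /* vetoed: the b_sig_ignorable hook returned B_FALSE */
    static bool
    sig_ignorable_sketch(bool in_ignore_mask, bool branded,
        bool has_hook, bool vetoed)
    {
            return (in_ignore_mask &&
                !(branded && has_hook && vetoed));
    }
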
diff --git a/usr/src/uts/common/os/smb_subr.c b/usr/src/uts/common/os/smb_subr.c
index 6084676b17..6dc7230bed 100644
--- a/usr/src/uts/common/os/smb_subr.c
+++ b/usr/src/uts/common/os/smb_subr.c
@@ -25,7 +25,9 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
+/*
+ * Copyright (c) 2015 Joyent, Inc. All rights reserved.
+ */
#include <sys/smbios_impl.h>
#include <sys/cmn_err.h>
@@ -43,13 +45,13 @@ smb_strerror(int err)
void *
smb_alloc(size_t len)
{
- return (kmem_alloc(len, KM_SLEEP));
+ return (len > 0 ? kmem_alloc(len, KM_SLEEP) : NULL);
}
void *
smb_zalloc(size_t len)
{
- return (kmem_zalloc(len, KM_SLEEP));
+ return (len > 0 ? kmem_zalloc(len, KM_SLEEP) : NULL);
}
void
diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c
index 1786769cfb..1f9ceee188 100644
--- a/usr/src/uts/common/os/streamio.c
+++ b/usr/src/uts/common/os/streamio.c
@@ -78,6 +78,7 @@
#include <sys/policy.h>
#include <sys/dld.h>
#include <sys/zone.h>
+#include <sys/limits.h>
#include <c2/audit.h>
/*
@@ -986,12 +987,20 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,
* (registered in sd_wakeq).
*/
struiod_t uiod;
+ struct iovec buf[IOV_MAX_STACK];
+ int iovlen = 0;
if (first)
stp->sd_wakeq &= ~RSLEEP;
- (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov,
- sizeof (uiod.d_iov) / sizeof (*uiod.d_iov));
+ if (uiop->uio_iovcnt > IOV_MAX_STACK) {
+ iovlen = uiop->uio_iovcnt * sizeof (iovec_t);
+ uiod.d_iov = kmem_alloc(iovlen, KM_SLEEP);
+ } else {
+ uiod.d_iov = buf;
+ }
+
+ (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt);
uiod.d_mp = 0;
/*
* Mark that a thread is in rwnext on the read side
@@ -1030,6 +1039,8 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,
if ((bp = uiod.d_mp) != NULL) {
*errorp = 0;
ASSERT(MUTEX_HELD(&stp->sd_lock));
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
return (bp);
}
error = 0;
@@ -1049,8 +1060,14 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,
} else {
*errorp = error;
ASSERT(MUTEX_HELD(&stp->sd_lock));
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
return (NULL);
}
+
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
+
/*
* Try a getq in case a rwnext() generated mblk
* has bubbled up via strrput().
@@ -2545,6 +2562,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,
int b_flag, int pri, int flags)
{
struiod_t uiod;
+ struct iovec buf[IOV_MAX_STACK];
+ int iovlen = 0;
mblk_t *mp;
queue_t *wqp = stp->sd_wrq;
int error = 0;
@@ -2636,13 +2655,21 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,
mp->b_flag |= b_flag;
mp->b_band = (uchar_t)pri;
- (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov,
- sizeof (uiod.d_iov) / sizeof (*uiod.d_iov));
+ if (uiop->uio_iovcnt > IOV_MAX_STACK) {
+ iovlen = uiop->uio_iovcnt * sizeof (iovec_t);
+ uiod.d_iov = (struct iovec *)kmem_alloc(iovlen, KM_SLEEP);
+ } else {
+ uiod.d_iov = buf;
+ }
+
+ (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt);
uiod.d_uio.uio_offset = 0;
uiod.d_mp = mp;
error = rwnext(wqp, &uiod);
if (! uiod.d_mp) {
uioskip(uiop, *iosize);
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
return (error);
}
ASSERT(mp == uiod.d_mp);
@@ -2660,17 +2687,23 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,
error = 0;
} else {
freemsg(mp);
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
return (error);
}
/* Have to check canput before consuming data from the uio */
if (pri == 0) {
if (!canputnext(wqp) && !(flags & MSG_IGNFLOW)) {
freemsg(mp);
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
return (EWOULDBLOCK);
}
} else {
if (!bcanputnext(wqp, pri) && !(flags & MSG_IGNFLOW)) {
freemsg(mp);
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
return (EWOULDBLOCK);
}
}
@@ -2678,6 +2711,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,
/* Copyin data from the uio */
if ((error = struioget(wqp, mp, &uiod, 0)) != 0) {
freemsg(mp);
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
return (error);
}
uioskip(uiop, *iosize);
@@ -2694,6 +2729,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,
putnext(wqp, mp);
stream_runservice(stp);
}
+ if (iovlen != 0)
+ kmem_free(uiod.d_iov, iovlen);
return (0);
}
@@ -3179,6 +3216,7 @@ job_control_type(int cmd)
case JAGENT: /* Obsolete */
case JTRUN: /* Obsolete */
case JXTPROTO: /* Obsolete */
+ case TIOCSETLD:
return (JCSETP);
}
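
strget() and strput() previously duplicated at most as many iovecs as fit the
fixed d_iov array; they now size the copy to uio_iovcnt, keeping a small
on-stack array for the common case and falling back to a heap allocation that
must be freed on every return path. The same idiom in user-space form (the
IOV_MAX_STACK value is an assumption here):

    #include <stdlib.h>
    #include <string.h>
    #include <sys/uio.h>

    #define IOV_MAX_STACK 16        /* assumed value */

    static int
    dup_iovecs(const struct iovec *src, int iovcnt)
    {
            struct iovec buf[IOV_MAX_STACK];
            struct iovec *iov = buf;
            size_t iovlen = 0;

            if (iovcnt > IOV_MAX_STACK) {
                    iovlen = (size_t)iovcnt * sizeof (struct iovec);
                    if ((iov = malloc(iovlen)) == NULL)
                            return (-1);
            }
            (void) memcpy(iov, src, iovcnt * sizeof (struct iovec));
            /* ... operate on the private copy ... */
            if (iovlen != 0)
                    free(iov);      /* mirrors kmem_free() on each path */
            return (0);
    }
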
diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c
index 8cc27df4eb..959e5576f0 100644
--- a/usr/src/uts/common/os/strsubr.c
+++ b/usr/src/uts/common/os/strsubr.c
@@ -26,6 +26,7 @@
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
* Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
*/
@@ -8461,6 +8462,12 @@ mblk_copycred(mblk_t *mp, const mblk_t *src)
dbp->db_cpid = cpid;
}
+
+/*
+ * Now that NIC drivers are expected to deal only with M_DATA mblks, the
+ * hcksum_assoc and hcksum_retrieve functions are deprecated in favor of their
+ * respective mac_hcksum_set and mac_hcksum_get counterparts.
+ */
int
hcksum_assoc(mblk_t *mp, multidata_t *mmd, pdesc_t *pd,
uint32_t start, uint32_t stuff, uint32_t end, uint32_t value,
diff --git a/usr/src/uts/common/os/sunddi.c b/usr/src/uts/common/os/sunddi.c
index ede7da413b..b1727729de 100644
--- a/usr/src/uts/common/os/sunddi.c
+++ b/usr/src/uts/common/os/sunddi.c
@@ -5903,6 +5903,12 @@ ddi_ffs(long mask)
return (ffs(mask));
}
+int
+ddi_ffsll(long long mask)
+{
+ return (ffs(mask));
+}
+
/*
* Find last bit set. Take mask and clear
* all but the most significant bit, and
@@ -5914,8 +5920,14 @@ ddi_ffs(long mask)
int
ddi_fls(long mask)
{
+ return (ddi_flsll(mask));
+}
+
+int
+ddi_flsll(long long mask)
+{
while (mask) {
- long nx;
+ long long nx;
if ((nx = (mask & (mask - 1))) == 0)
break;
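
ddi_fls() relies on a classic trick: repeatedly clearing the lowest set bit
via mask & (mask - 1) until a single bit survives, which is by construction
the most significant; that bit's 1-based index is the answer. The widened
ddi_flsll() applies the same loop to long long. A self-contained rendering
with the final find-first-set expanded into a shift loop:

    #include <assert.h>

    static int
    flsll_sketch(long long mask)
    {
            long long nx;
            unsigned long long bit;
            int pos = 0;

            while (mask) {
                    if ((nx = (mask & (mask - 1))) == 0)
                            break;  /* one bit left: the MSB */
                    mask = nx;
            }
            for (bit = (unsigned long long)mask; bit != 0; bit >>= 1)
                    pos++;          /* 1-based index; 0 when no bits */
            return (pos);
    }

    int
    main(void)
    {
            assert(flsll_sketch(0) == 0);
            assert(flsll_sketch(1) == 1);
            assert(flsll_sketch(0x500) == 11);      /* bit 10, 1-based */
            return (0);
    }
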
diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c
index f1b6f2616c..554ba1b881 100644
--- a/usr/src/uts/common/os/sysent.c
+++ b/usr/src/uts/common/os/sysent.c
@@ -23,6 +23,7 @@
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012 Milan Jurik. All rights reserved.
* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
* Copyright (c) 2018, Joyent, Inc.
*/
@@ -61,8 +62,7 @@ struct mmaplf32a;
int access(char *, int);
int alarm(int);
int auditsys(struct auditcalls *, rval_t *);
-int64_t brandsys(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t,
- uintptr_t);
+int64_t brandsys(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
intptr_t brk(caddr_t);
int chdir(char *);
int chmod(char *, int);
@@ -647,7 +647,7 @@ struct sysent sysent[NSYSCALL] =
SYSENT_NOSYS(),
SYSENT_C("llseek", llseek32, 4)),
/* 176 */ SYSENT_LOADABLE(), /* inst_sync */
- /* 177 */ SYSENT_CI("brandsys", brandsys, 6),
+ /* 177 */ SYSENT_CI("brandsys", brandsys, 5),
/* 178 */ SYSENT_LOADABLE(), /* kaio */
/* 179 */ SYSENT_LOADABLE(), /* cpc */
/* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3),
@@ -1002,7 +1002,7 @@ struct sysent sysent32[NSYSCALL] =
/* 174 */ SYSENT_CI("pwrite", pwrite32, 4),
/* 175 */ SYSENT_C("llseek", llseek32, 4),
/* 176 */ SYSENT_LOADABLE32(), /* inst_sync */
- /* 177 */ SYSENT_CI("brandsys", brandsys, 6),
+ /* 177 */ SYSENT_CI("brandsys", brandsys, 5),
/* 178 */ SYSENT_LOADABLE32(), /* kaio */
/* 179 */ SYSENT_LOADABLE32(), /* cpc */
/* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3),
@@ -1094,18 +1094,20 @@ char **syscallnames;
systrace_sysent_t *systrace_sysent;
void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t,
- uintptr_t, uintptr_t, uintptr_t, uintptr_t);
+ uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
/*ARGSUSED*/
void
systrace_stub(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
- uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5)
+ uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5,
+ uintptr_t arg6, uintptr_t arg7)
{}
/*ARGSUSED*/
int64_t
dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
- uintptr_t arg3, uintptr_t arg4, uintptr_t arg5)
+ uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6,
+ uintptr_t arg7)
{
systrace_sysent_t *sy = &systrace_sysent[curthread->t_sysnum];
dtrace_id_t id;
@@ -1113,7 +1115,8 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
proc_t *p;
if ((id = sy->stsy_entry) != DTRACE_IDNONE)
- (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5);
+ (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5,
+ arg6, arg7);
/*
* We want to explicitly allow DTrace consumers to stop a process
@@ -1127,14 +1130,15 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
}
mutex_exit(&p->p_lock);
- rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5);
+ rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5,
+ arg6, arg7);
if (ttolwp(curthread)->lwp_errno != 0)
rval = -1;
if ((id = sy->stsy_return) != DTRACE_IDNONE)
(*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval,
- (uintptr_t)((int64_t)rval >> 32), 0, 0, 0);
+ (uintptr_t)((int64_t)rval >> 32), 0, 0, 0, 0, 0);
return (rval);
}
@@ -1146,7 +1150,8 @@ systrace_sysent_t *systrace_sysent32;
/*ARGSUSED*/
int64_t
dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
- uintptr_t arg3, uintptr_t arg4, uintptr_t arg5)
+ uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6,
+ uintptr_t arg7)
{
systrace_sysent_t *sy = &systrace_sysent32[curthread->t_sysnum];
dtrace_id_t id;
@@ -1154,7 +1159,8 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
proc_t *p;
if ((id = sy->stsy_entry) != DTRACE_IDNONE)
- (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5);
+ (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, arg6,
+ arg7);
/*
* We want to explicitly allow DTrace consumers to stop a process
@@ -1168,14 +1174,15 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
}
mutex_exit(&p->p_lock);
- rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5);
+ rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, arg6,
+ arg7);
if (ttolwp(curthread)->lwp_errno != 0)
rval = -1;
if ((id = sy->stsy_return) != DTRACE_IDNONE)
(*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval,
- (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0);
+ (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0, 0, 0);
return (rval);
}
@@ -1203,5 +1210,5 @@ dtrace_systrace_rtt(void)
}
if ((id = sy->stsy_return) != DTRACE_IDNONE)
- (*systrace_probe)(id, 0, 0, 0, 0, 0, 0);
+ (*systrace_probe)(id, 0, 0, 0, 0, 0, 0, 0, 0);
}
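The widening must land in the probe stub, both syscall shims, and the rtt
path together; a sketch of the shared contract (abridged from the hunks
above, not new code):

	void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t,
	    uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t,
	    uintptr_t);

	/* return probes publish rval, rval again, and its high 32 bits,
	 * then zero-fill the five unused trailing slots */
	(*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval,
	    (uintptr_t)((int64_t)rval >> 32), 0, 0, 0, 0, 0);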
diff --git a/usr/src/uts/common/os/timer.c b/usr/src/uts/common/os/timer.c
index b25a6cbcf1..5453ebf380 100644
--- a/usr/src/uts/common/os/timer.c
+++ b/usr/src/uts/common/os/timer.c
@@ -25,11 +25,12 @@
*/
/*
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
*/
#include <sys/timer.h>
#include <sys/systm.h>
+#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/debug.h>
@@ -81,6 +82,7 @@ timer_lock(proc_t *p, itimer_t *it)
* waiters. p_lock must be held on entry; it will not be dropped by
* timer_unlock().
*/
+/* ARGSUSED */
static void
timer_unlock(proc_t *p, itimer_t *it)
{
@@ -123,6 +125,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it)
timer_lock(p, it);
}
+ ASSERT(p->p_itimer_sz > tid);
ASSERT(p->p_itimer[tid] == it);
p->p_itimer[tid] = NULL;
@@ -137,7 +140,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it)
it->it_backend->clk_timer_delete(it);
- if (it->it_portev) {
+ if (it->it_flags & IT_PORT) {
mutex_enter(&it->it_mutex);
if (it->it_portev) {
port_kevent_t *pev;
@@ -199,18 +202,20 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it)
static itimer_t *
timer_grab(proc_t *p, timer_t tid)
{
- itimer_t **itp, *it;
+ itimer_t *it;
- if (tid >= timer_max || tid < 0)
+ if (tid < 0) {
return (NULL);
+ }
mutex_enter(&p->p_lock);
-
- if ((itp = p->p_itimer) == NULL || (it = itp[tid]) == NULL) {
+ if (p->p_itimer == NULL || tid >= p->p_itimer_sz ||
+ (it = p->p_itimer[tid]) == NULL) {
mutex_exit(&p->p_lock);
return (NULL);
}
+ /* This may drop p_lock temporarily. */
timer_lock(p, it);
if (it->it_lock & ITLK_REMOVE) {
@@ -232,7 +237,7 @@ timer_grab(proc_t *p, timer_t tid)
* should not be held on entry; timer_release() will acquire p_lock but
* will drop it before returning.
*/
-static void
+void
timer_release(proc_t *p, itimer_t *it)
{
mutex_enter(&p->p_lock);
@@ -245,7 +250,7 @@ timer_release(proc_t *p, itimer_t *it)
* p_lock should not be held on entry; timer_delete_grabbed() will acquire
* p_lock, but will drop it before returning.
*/
-static void
+void
timer_delete_grabbed(proc_t *p, timer_t tid, itimer_t *it)
{
mutex_enter(&p->p_lock);
@@ -258,6 +263,13 @@ clock_timer_init()
{
clock_timer_cache = kmem_cache_create("timer_cache",
sizeof (itimer_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ /*
+ * Push the timer_max limit up to at least 4 * NCPU. Due to the way
+ * NCPU is defined, proper initialization of the timer limit is
+ * performed at runtime.
+ */
+ timer_max = MAX(NCPU * 4, timer_max);
}
void
@@ -453,6 +465,9 @@ timer_fire(itimer_t *it)
it->it_pending = 1;
port_send_event((port_kevent_t *)it->it_portev);
mutex_exit(&it->it_mutex);
+ } else if (it->it_flags & IT_CALLBACK) {
+ it->it_cb_func(it);
+ ASSERT(MUTEX_NOT_HELD(&it->it_mutex));
} else if (it->it_flags & IT_SIGNAL) {
it->it_pending = 1;
mutex_exit(&it->it_mutex);
@@ -466,159 +481,175 @@ timer_fire(itimer_t *it)
mutex_exit(&p->p_lock);
}
-int
-timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)
+/*
+ * Allocate an itimer_t and find an appropriate slot for it in p_itimer.
+ * Acquires p_lock and holds it on return, regardless of success.
+ */
+static itimer_t *
+timer_alloc(proc_t *p, timer_t *id)
{
- struct sigevent ev;
- proc_t *p = curproc;
- clock_backend_t *backend;
- itimer_t *it, **itp;
- sigqueue_t *sigq;
- cred_t *cr = CRED();
- int error = 0;
- timer_t i;
- port_notify_t tim_pnevp;
- port_kevent_t *pkevp = NULL;
+ itimer_t *it, **itp = NULL;
+ uint_t i;
- if ((backend = CLOCK_BACKEND(clock)) == NULL)
- return (set_errno(EINVAL));
+ ASSERT(MUTEX_NOT_HELD(&p->p_lock));
- if (evp != NULL) {
- /*
- * short copyin() for binary compatibility
- * fetch oldsigevent to determine how much to copy in.
- */
- if (get_udatamodel() == DATAMODEL_NATIVE) {
- if (copyin(evp, &ev, sizeof (struct oldsigevent)))
- return (set_errno(EFAULT));
+ it = kmem_cache_alloc(clock_timer_cache, KM_SLEEP);
+ bzero(it, sizeof (itimer_t));
+ mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL);
- if (ev.sigev_notify == SIGEV_PORT ||
- ev.sigev_notify == SIGEV_THREAD) {
- if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp,
- sizeof (port_notify_t)))
- return (set_errno(EFAULT));
+ mutex_enter(&p->p_lock);
+retry:
+ if (p->p_itimer != NULL) {
+ for (i = 0; i < p->p_itimer_sz; i++) {
+ if (p->p_itimer[i] == NULL) {
+ itp = &(p->p_itimer[i]);
+ break;
}
-#ifdef _SYSCALL32_IMPL
- } else {
- struct sigevent32 ev32;
- port_notify32_t tim_pnevp32;
+ }
+ }
- if (copyin(evp, &ev32, sizeof (struct oldsigevent32)))
- return (set_errno(EFAULT));
- ev.sigev_notify = ev32.sigev_notify;
- ev.sigev_signo = ev32.sigev_signo;
+ /*
+ * A suitable slot was not found. If possible, allocate (or resize)
+ * the p_itimer array and try again.
+ */
+ if (itp == NULL) {
+ uint_t target_sz = _TIMER_ALLOC_INIT;
+ itimer_t **itp_new;
+
+ if (p->p_itimer != NULL) {
+ ASSERT(p->p_itimer_sz != 0);
+
+ target_sz = p->p_itimer_sz * 2;
+ }
+ /*
+ * Protect against exceeding the max or overflow
+ */
+ if (target_sz > timer_max || target_sz > INT_MAX ||
+ target_sz < p->p_itimer_sz) {
+ kmem_cache_free(clock_timer_cache, it);
+ return (NULL);
+ }
+ mutex_exit(&p->p_lock);
+ itp_new = kmem_zalloc(target_sz * sizeof (itimer_t *),
+ KM_SLEEP);
+ mutex_enter(&p->p_lock);
+ if (target_sz <= p->p_itimer_sz) {
/*
- * See comment in sigqueue32() on handling of 32-bit
- * sigvals in a 64-bit kernel.
+ * A racing thread performed the resize while we were
+ * waiting outside p_lock. Discard our now-useless
+ * allocation and retry.
*/
- ev.sigev_value.sival_int = ev32.sigev_value.sival_int;
- if (ev.sigev_notify == SIGEV_PORT ||
- ev.sigev_notify == SIGEV_THREAD) {
- if (copyin((void *)(uintptr_t)
- ev32.sigev_value.sival_ptr,
- (void *)&tim_pnevp32,
- sizeof (port_notify32_t)))
- return (set_errno(EFAULT));
- tim_pnevp.portnfy_port =
- tim_pnevp32.portnfy_port;
- tim_pnevp.portnfy_user =
- (void *)(uintptr_t)tim_pnevp32.portnfy_user;
+ kmem_free(itp_new, target_sz * sizeof (itimer_t *));
+ goto retry;
+ } else {
+ /*
+ * Instantiate the larger allocation and select the
+ * first fresh entry for use.
+ */
+ if (p->p_itimer != NULL) {
+ uint_t old_sz;
+
+ old_sz = p->p_itimer_sz;
+ bcopy(p->p_itimer, itp_new,
+ old_sz * sizeof (itimer_t *));
+ kmem_free(p->p_itimer,
+ old_sz * sizeof (itimer_t *));
+
+ /*
+ * Short circuit to use the first free entry in
+ * the new allocation. It's possible that
+ * other lower-indexed timers were freed while
+ * p_lock was dropped, but skipping over them
+ * is not harmful at all. In the common case,
+ * we skip the need to walk over an array
+ * filled with timers before arriving at the
+ * slot we know is fresh from the allocation.
+ */
+ i = old_sz;
+ } else {
+ /*
+ * For processes lacking any existing timers,
+ * we can simply select the first entry.
+ */
+ i = 0;
}
-#endif
+ p->p_itimer = itp_new;
+ p->p_itimer_sz = target_sz;
}
- switch (ev.sigev_notify) {
- case SIGEV_NONE:
- break;
- case SIGEV_SIGNAL:
- if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG)
- return (set_errno(EINVAL));
- break;
- case SIGEV_THREAD:
- case SIGEV_PORT:
- break;
- default:
- return (set_errno(EINVAL));
- }
- } else {
- /*
- * Use the clock's default sigevent (this is a structure copy).
- */
- ev = backend->clk_default;
}
+ ASSERT(i <= INT_MAX);
+ *id = (timer_t)i;
+ return (it);
+}
+
+/*
+ * Setup a timer
+ *
+ * This allocates an itimer_t (including a timer_t ID and slot in the process),
+ * wires it up according to the provided sigevent, and associates it with the
+ * desired clock backend. Upon successful completion, the timer will be
+ * locked, preventing it from being armed via timer_settime() or deleted via
+ * timer_delete(). This gives the caller a chance to perform any last minute
+ * manipulations (such as configuring the IT_CALLBACK functionality and/or
+ * copying the timer_t out to userspace) before using timer_release() to unlock
+ * it or timer_delete_grabbed() to delete it.
+ */
+int
+timer_setup(clock_backend_t *backend, struct sigevent *evp, port_notify_t *pnp,
+ itimer_t **itp, timer_t *tidp)
+{
+ proc_t *p = curproc;
+ int error = 0;
+ itimer_t *it;
+ sigqueue_t *sigq;
+ timer_t tid;
+
/*
- * We'll allocate our timer and sigqueue now, before we grab p_lock.
- * If we can't find an empty slot, we'll free them before returning.
+ * We'll allocate our sigqueue now, before we grab p_lock.
+ * If we can't find an empty slot, we'll free it before returning.
*/
- it = kmem_cache_alloc(clock_timer_cache, KM_SLEEP);
- bzero(it, sizeof (itimer_t));
- mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL);
sigq = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
- mutex_enter(&p->p_lock);
-
/*
- * If this is this process' first timer, we need to attempt to allocate
- * an array of timerstr_t pointers. We drop p_lock to perform the
- * allocation; if we return to discover that p_itimer is non-NULL,
- * we will free our allocation and drive on.
+ * Allocate a timer and choose a slot for it. This acquires p_lock.
*/
- if ((itp = p->p_itimer) == NULL) {
- mutex_exit(&p->p_lock);
- itp = kmem_zalloc(timer_max * sizeof (itimer_t *), KM_SLEEP);
- mutex_enter(&p->p_lock);
-
- if (p->p_itimer == NULL)
- p->p_itimer = itp;
- else {
- kmem_free(itp, timer_max * sizeof (itimer_t *));
- itp = p->p_itimer;
- }
- }
-
- for (i = 0; i < timer_max && itp[i] != NULL; i++)
- continue;
+ it = timer_alloc(p, &tid);
+ ASSERT(MUTEX_HELD(&p->p_lock));
- if (i == timer_max) {
- /*
- * We couldn't find a slot. Drop p_lock, free the preallocated
- * timer and sigqueue, and return an error.
- */
+ if (it == NULL) {
mutex_exit(&p->p_lock);
- kmem_cache_free(clock_timer_cache, it);
kmem_free(sigq, sizeof (sigqueue_t));
-
- return (set_errno(EAGAIN));
+ return (EAGAIN);
}
- ASSERT(i < timer_max && itp[i] == NULL);
+ ASSERT(tid < p->p_itimer_sz && p->p_itimer[tid] == NULL);
+ ASSERT(evp != NULL);
/*
* If we develop other notification mechanisms, this will need
* to call into (yet another) backend.
*/
- sigq->sq_info.si_signo = ev.sigev_signo;
- if (evp == NULL)
- sigq->sq_info.si_value.sival_int = i;
- else
- sigq->sq_info.si_value = ev.sigev_value;
+ sigq->sq_info.si_signo = evp->sigev_signo;
+ sigq->sq_info.si_value = evp->sigev_value;
sigq->sq_info.si_code = SI_TIMER;
sigq->sq_info.si_pid = p->p_pid;
sigq->sq_info.si_ctid = PRCTID(p);
sigq->sq_info.si_zoneid = getzoneid();
- sigq->sq_info.si_uid = crgetruid(cr);
+ sigq->sq_info.si_uid = crgetruid(CRED());
sigq->sq_func = timer_signal;
sigq->sq_next = NULL;
sigq->sq_backptr = it;
it->it_sigq = sigq;
it->it_backend = backend;
it->it_lock = ITLK_LOCKED;
- itp[i] = it;
-
- if (ev.sigev_notify == SIGEV_THREAD ||
- ev.sigev_notify == SIGEV_PORT) {
+ if (evp->sigev_notify == SIGEV_THREAD ||
+ evp->sigev_notify == SIGEV_PORT) {
int port;
+ port_kevent_t *pkevp = NULL;
+
+ ASSERT(pnp != NULL);
/*
* This timer is programmed to use event port notification when
@@ -638,18 +669,17 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)
*/
it->it_flags |= IT_PORT;
- port = tim_pnevp.portnfy_port;
+ port = pnp->portnfy_port;
/* associate timer as event source with the port */
error = port_associate_ksource(port, PORT_SOURCE_TIMER,
(port_source_t **)&it->it_portsrc, timer_close_port,
(void *)it, NULL);
if (error) {
- itp[i] = NULL; /* clear slot */
mutex_exit(&p->p_lock);
kmem_cache_free(clock_timer_cache, it);
kmem_free(sigq, sizeof (sigqueue_t));
- return (set_errno(error));
+ return (error);
}
/* allocate an event structure/slot */
@@ -658,23 +688,24 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)
if (error) {
(void) port_dissociate_ksource(port, PORT_SOURCE_TIMER,
(port_source_t *)it->it_portsrc);
- itp[i] = NULL; /* clear slot */
mutex_exit(&p->p_lock);
kmem_cache_free(clock_timer_cache, it);
kmem_free(sigq, sizeof (sigqueue_t));
- return (set_errno(error));
+ return (error);
}
/* initialize event data */
- port_init_event(pkevp, i, tim_pnevp.portnfy_user,
+ port_init_event(pkevp, tid, pnp->portnfy_user,
timer_port_callback, it);
it->it_portev = pkevp;
it->it_portfd = port;
} else {
- if (ev.sigev_notify == SIGEV_SIGNAL)
+ if (evp->sigev_notify == SIGEV_SIGNAL)
it->it_flags |= IT_SIGNAL;
}
+ /* Populate the slot now that the timer is prepped. */
+ p->p_itimer[tid] = it;
mutex_exit(&p->p_lock);
/*
@@ -687,17 +718,8 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)
it->it_lwp = ttolwp(curthread);
it->it_proc = p;
- if (copyout(&i, tid, sizeof (timer_t)) != 0) {
- error = EFAULT;
- goto err;
- }
-
- /*
- * If we're here, then we have successfully created the timer; we
- * just need to release the timer and return.
- */
- timer_release(p, it);
-
+ *itp = it;
+ *tidp = tid;
return (0);
err:
@@ -708,11 +730,115 @@ err:
* impossible for a removal to be pending.
*/
ASSERT(!(it->it_lock & ITLK_REMOVE));
- timer_delete_grabbed(p, i, it);
+ timer_delete_grabbed(p, tid, it);
- return (set_errno(error));
+ return (error);
}
+
+int
+timer_create(clockid_t clock, struct sigevent *evp, timer_t *tidp)
+{
+ int error = 0;
+ proc_t *p = curproc;
+ clock_backend_t *backend;
+ struct sigevent ev;
+ itimer_t *it;
+ timer_t tid;
+ port_notify_t tim_pnevp;
+
+ if ((backend = CLOCK_BACKEND(clock)) == NULL)
+ return (set_errno(EINVAL));
+
+ if (evp != NULL) {
+ /*
+ * short copyin() for binary compatibility
+ * fetch oldsigevent to determine how much to copy in.
+ */
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ if (copyin(evp, &ev, sizeof (struct oldsigevent)))
+ return (set_errno(EFAULT));
+
+ if (ev.sigev_notify == SIGEV_PORT ||
+ ev.sigev_notify == SIGEV_THREAD) {
+ if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp,
+ sizeof (port_notify_t)))
+ return (set_errno(EFAULT));
+ }
+#ifdef _SYSCALL32_IMPL
+ } else {
+ struct sigevent32 ev32;
+ port_notify32_t tim_pnevp32;
+
+ if (copyin(evp, &ev32, sizeof (struct oldsigevent32)))
+ return (set_errno(EFAULT));
+ ev.sigev_notify = ev32.sigev_notify;
+ ev.sigev_signo = ev32.sigev_signo;
+ /*
+ * See comment in sigqueue32() on handling of 32-bit
+ * sigvals in a 64-bit kernel.
+ */
+ ev.sigev_value.sival_int = ev32.sigev_value.sival_int;
+ if (ev.sigev_notify == SIGEV_PORT ||
+ ev.sigev_notify == SIGEV_THREAD) {
+ if (copyin((void *)(uintptr_t)
+ ev32.sigev_value.sival_ptr,
+ (void *)&tim_pnevp32,
+ sizeof (port_notify32_t)))
+ return (set_errno(EFAULT));
+ tim_pnevp.portnfy_port =
+ tim_pnevp32.portnfy_port;
+ tim_pnevp.portnfy_user =
+ (void *)(uintptr_t)tim_pnevp32.portnfy_user;
+ }
+#endif
+ }
+ switch (ev.sigev_notify) {
+ case SIGEV_NONE:
+ break;
+ case SIGEV_SIGNAL:
+ if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG)
+ return (set_errno(EINVAL));
+ break;
+ case SIGEV_THREAD:
+ case SIGEV_PORT:
+ break;
+ default:
+ return (set_errno(EINVAL));
+ }
+ } else {
+ /*
+ * Use the clock's default sigevent (this is a structure copy).
+ */
+ ev = backend->clk_default;
+ }
+
+ if ((error = timer_setup(backend, &ev, &tim_pnevp, &it, &tid)) != 0) {
+ return (set_errno(error));
+ }
+
+ /*
+ * Populate si_value with the timer ID if no sigevent was passed in.
+ */
+ if (evp == NULL) {
+ it->it_sigq->sq_info.si_value.sival_int = tid;
+ }
+
+ if (copyout(&tid, tidp, sizeof (timer_t)) != 0) {
+ timer_delete_grabbed(p, tid, it);
+ return (set_errno(EFAULT));
+ }
+
+ /*
+ * If we're here, then we have successfully created the timer; we
+ * just need to release the timer and return.
+ */
+ timer_release(p, it);
+
+ return (0);
+}
+
+
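A minimal in-kernel consumer sketch of the timer_setup() flow described
above (my_timer_cb is hypothetical; the IT_CALLBACK wiring is an assumption
based on the it_cb_func call in timer_fire()):

	clock_backend_t *be = CLOCK_BACKEND(CLOCK_HIGHRES);
	struct sigevent ev = be->clk_default;
	itimer_t *it;
	timer_t tid;

	if (timer_setup(be, &ev, NULL, &it, &tid) == 0) {
		/* the timer is still locked; wire up the callback */
		it->it_flags |= IT_CALLBACK;
		it->it_cb_func = my_timer_cb;	/* hypothetical */
		timer_release(curproc, it);	/* unlock; now armable */
	}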
int
timer_gettime(timer_t tid, itimerspec_t *val)
{
@@ -832,20 +958,23 @@ timer_getoverrun(timer_t tid)
void
timer_lwpexit(void)
{
- timer_t i;
+ uint_t i;
proc_t *p = curproc;
klwp_t *lwp = ttolwp(curthread);
- itimer_t *it, **itp;
+ itimer_t *it;
ASSERT(MUTEX_HELD(&p->p_lock));
- if ((itp = p->p_itimer) == NULL)
+ if (p->p_itimer == NULL) {
return;
+ }
- for (i = 0; i < timer_max; i++) {
- if ((it = itp[i]) == NULL)
+ for (i = 0; i < p->p_itimer_sz; i++) {
+ if ((it = p->p_itimer[i]) == NULL) {
continue;
+ }
+ /* This may drop p_lock temporarily. */
timer_lock(p, it);
if ((it->it_lock & ITLK_REMOVE) || it->it_lwp != lwp) {
@@ -876,20 +1005,22 @@ timer_lwpexit(void)
void
timer_lwpbind()
{
- timer_t i;
+ uint_t i;
proc_t *p = curproc;
klwp_t *lwp = ttolwp(curthread);
- itimer_t *it, **itp;
+ itimer_t *it;
ASSERT(MUTEX_HELD(&p->p_lock));
- if ((itp = p->p_itimer) == NULL)
+ if (p->p_itimer == NULL) {
return;
+ }
- for (i = 0; i < timer_max; i++) {
- if ((it = itp[i]) == NULL)
+ for (i = 0; i < p->p_itimer_sz; i++) {
+ if ((it = p->p_itimer[i]) == NULL)
continue;
+ /* This may drop p_lock temporarily. */
timer_lock(p, it);
if (!(it->it_lock & ITLK_REMOVE) && it->it_lwp == lwp) {
@@ -911,16 +1042,19 @@ timer_lwpbind()
void
timer_exit(void)
{
- timer_t i;
+ uint_t i;
proc_t *p = curproc;
ASSERT(p->p_itimer != NULL);
+ ASSERT(p->p_itimer_sz != 0);
- for (i = 0; i < timer_max; i++)
- (void) timer_delete(i);
+ for (i = 0; i < p->p_itimer_sz; i++) {
+ (void) timer_delete((timer_t)i);
+ }
- kmem_free(p->p_itimer, timer_max * sizeof (itimer_t *));
+ kmem_free(p->p_itimer, p->p_itimer_sz * sizeof (itimer_t *));
p->p_itimer = NULL;
+ p->p_itimer_sz = 0;
}
/*
@@ -977,7 +1111,7 @@ timer_close_port(void *arg, int port, pid_t pid, int lastclose)
for (tid = 0; tid < timer_max; tid++) {
if ((it = timer_grab(p, tid)) == NULL)
continue;
- if (it->it_portev) {
+ if (it->it_flags & IT_PORT) {
mutex_enter(&it->it_mutex);
if (it->it_portfd == port) {
port_kevent_t *pev;
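The p_itimer growth in timer_alloc() is a standard drop-lock, allocate,
revalidate pattern; a condensed sketch (field names from this change,
control flow abridged):

	for (;;) {
		/* ... scan for a free slot under p_lock ... */
		new_sz = (p->p_itimer == NULL) ?
		    _TIMER_ALLOC_INIT : p->p_itimer_sz * 2;
		mutex_exit(&p->p_lock);		/* KM_SLEEP may block */
		new = kmem_zalloc(new_sz * sizeof (itimer_t *), KM_SLEEP);
		mutex_enter(&p->p_lock);
		if (new_sz <= p->p_itimer_sz) {
			/* a racing thread resized first; retry */
			kmem_free(new, new_sz * sizeof (itimer_t *));
			continue;
		}
		/* bcopy the old array, free it, install the new one */
		break;
	}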
diff --git a/usr/src/uts/common/os/timers.c b/usr/src/uts/common/os/timers.c
index 61acc6cf97..53be806026 100644
--- a/usr/src/uts/common/os/timers.c
+++ b/usr/src/uts/common/os/timers.c
@@ -22,6 +22,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
/*
@@ -1172,6 +1173,14 @@ timespectohz64(timespec_t *tv)
void
hrt2ts(hrtime_t hrt, timestruc_t *tsp)
{
+#if defined(__amd64)
+ /*
+	 * The cleverness explained above is unnecessary on x86_64 CPUs where
+ * modern compilers are able to optimize down to faster operations.
+ */
+ tsp->tv_sec = hrt / NANOSEC;
+ tsp->tv_nsec = hrt % NANOSEC;
+#else
uint32_t sec, nsec, tmp;
tmp = (uint32_t)(hrt >> 30);
@@ -1193,20 +1202,28 @@ hrt2ts(hrtime_t hrt, timestruc_t *tsp)
}
tsp->tv_sec = (time_t)sec;
tsp->tv_nsec = nsec;
+#endif /* defined(__amd64) */
}
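A quick check of the amd64 fast path with an illustrative value:

	timestruc_t ts;

	hrt2ts(1500000000LL, &ts);	/* 1.5 seconds */
	ASSERT(ts.tv_sec == 1 && ts.tv_nsec == 500000000);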
/*
* Convert from timestruc_t to hrtime_t.
- *
- * The code below is equivalent to:
- *
- * hrt = tsp->tv_sec * NANOSEC + tsp->tv_nsec;
- *
- * but requires no integer multiply.
*/
hrtime_t
ts2hrt(const timestruc_t *tsp)
{
+#if defined(__amd64) || defined(__i386)
+ /*
+ * On modern x86 CPUs, the simple version is faster.
+ */
+ return ((tsp->tv_sec * NANOSEC) + tsp->tv_nsec);
+#else
+ /*
+ * The code below is equivalent to:
+ *
+ * hrt = tsp->tv_sec * NANOSEC + tsp->tv_nsec;
+ *
+ * but requires no integer multiply.
+ */
hrtime_t hrt;
hrt = tsp->tv_sec;
@@ -1215,6 +1232,7 @@ ts2hrt(const timestruc_t *tsp)
hrt = (hrt << 7) - hrt - hrt - hrt;
hrt = (hrt << 9) + tsp->tv_nsec;
return (hrt);
+#endif /* defined(__amd64) || defined(__i386) */
}
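The multiply-free path rests on 125 * 125 * 125 * 512 == 10^9; a sanity
sketch of the identity (the full routine applies the *125 step three times,
only partly visible in this hunk):

	hrtime_t s = 7, h = s;

	h = (h << 7) - h - h - h;	/* h = s * 125 */
	h = (h << 7) - h - h - h;	/* h = s * 15625 */
	h = (h << 7) - h - h - h;	/* h = s * 1953125 */
	h <<= 9;			/* h = s * 1000000000 */
	ASSERT(h == s * NANOSEC);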
/*
@@ -1246,6 +1264,13 @@ tv2hrt(struct timeval *tvp)
void
hrt2tv(hrtime_t hrt, struct timeval *tvp)
{
+#if defined(__amd64)
+ /*
+ * Like hrt2ts, the simple version is faster on x86_64.
+ */
+ tvp->tv_sec = hrt / NANOSEC;
+ tvp->tv_usec = (hrt % NANOSEC) / (NANOSEC / MICROSEC);
+#else
uint32_t sec, nsec, tmp;
uint32_t q, r, t;
@@ -1267,17 +1292,17 @@ hrt2tv(hrtime_t hrt, struct timeval *tvp)
sec++;
}
tvp->tv_sec = (time_t)sec;
-/*
- * this routine is very similar to hr2ts, but requires microseconds
- * instead of nanoseconds, so an interger divide by 1000 routine
- * completes the conversion
- */
+ /*
+	 * this routine is very similar to hrt2ts, but requires microseconds
+	 * instead of nanoseconds, so an integer divide by 1000 routine
+	 * completes the conversion
+ */
t = (nsec >> 7) + (nsec >> 8) + (nsec >> 12);
q = (nsec >> 1) + t + (nsec >> 15) + (t >> 11) + (t >> 14);
q = q >> 9;
r = nsec - q*1000;
tvp->tv_usec = q + ((r + 24) >> 10);
-
+#endif /* defined(__amd64) */
}
int
diff --git a/usr/src/uts/common/os/vm_pageout.c b/usr/src/uts/common/os/vm_pageout.c
index 608208bbca..f5ee76a2cb 100644
--- a/usr/src/uts/common/os/vm_pageout.c
+++ b/usr/src/uts/common/os/vm_pageout.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -58,6 +59,7 @@
#include <sys/tnf_probe.h>
#include <sys/mem_cage.h>
#include <sys/time.h>
+#include <sys/zone.h>
#include <vm/hat.h>
#include <vm/as.h>
@@ -73,7 +75,7 @@ static int checkpage(page_t *, int);
* algorithm. They are initialized to 0, and then computed at boot time
* based on the size of the system. If they are patched non-zero in
* a loaded vmunix they are left alone and may thus be changed per system
- * using adb on the loaded system.
+ * using mdb on the loaded system.
*/
pgcnt_t slowscan = 0;
pgcnt_t fastscan = 0;
@@ -81,6 +83,7 @@ pgcnt_t fastscan = 0;
static pgcnt_t handspreadpages = 0;
static int loopfraction = 2;
static pgcnt_t looppages;
+/* See comment below describing 4% and 80% */
static int min_percent_cpu = 4;
static int max_percent_cpu = 80;
static pgcnt_t maxfastscan = 0;
@@ -98,14 +101,34 @@ pgcnt_t deficit;
pgcnt_t nscan;
pgcnt_t desscan;
+/* kstats */
+uint64_t low_mem_scan;
+uint64_t zone_cap_scan;
+uint64_t n_throttle;
+
+clock_t zone_pageout_ticks; /* tunable to change zone pagescan ticks */
+
/*
* Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks
* are the number of ticks in each wakeup cycle that gives the
* equivalent of some underlying %CPU duty cycle.
- * When RATETOSCHEDPAGING is 4, and hz is 100, pageout_scanner is
- * awakened every 25 clock ticks. So, converting from %CPU to ticks
- * per wakeup cycle would be x% of 25, that is (x * 100) / 25.
- * So, for example, 4% == 1 tick and 80% == 20 ticks.
+ *
+ * For example, when RATETOSCHEDPAGING is 4 (the default), schedpaging()
+ * will run 4 times/sec to update the pageout scanning parameters and kick
+ * off the pageout_scanner() thread if necessary.
+ *
+ * Given hz is 100, min_pageout_ticks will be set to 1 (1% of a CPU). When
+ * pageout_ticks is set to min_pageout_ticks, then the total CPU time consumed
+ * by the scanner in a 1 second interval is 4% of a CPU (RATETOSCHEDPAGING * 1).
+ *
+ * Given hz is 100, max_pageout_ticks will be set to 20 (20% of a CPU). When
+ * pageout_ticks is set to max_pageout_ticks, then the total CPU time consumed
+ * by the scanner in a 1 second interval is 80% of a CPU
+ * (RATETOSCHEDPAGING * 20). There is no point making max_pageout_ticks >25
+ * since schedpaging() runs RATETOSCHEDPAGING (4) times/sec.
+ *
+ * If hz is 1000, then min_pageout_ticks will be 10 and max_pageout_ticks
+ * will be 200, so the CPU percentages are the same as when hz is 100.
*
* min_pageout_ticks:
* ticks/wakeup equivalent of min_percent_cpu.
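Plugging the defaults into the computation done in pageout_scanner() below
makes the budget concrete (hz = 100, RATETOSCHEDPAGING = 4; the max form is
assumed analogous to the min form shown in this change):

	min_pageout_ticks = MAX(1,
	    ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING);
	/* ((100 * 4) / 100) / 4 == 1 tick per 250ms cycle (~4%/sec) */

	max_pageout_ticks = ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING;
	/* ((100 * 80) / 100) / 4 == 20 ticks per 250ms cycle (~80%/sec) */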
@@ -117,19 +140,29 @@ pgcnt_t desscan;
* Number of clock ticks budgeted for each wakeup cycle.
* Computed each time around by schedpaging().
* Varies between min_pageout_ticks .. max_pageout_ticks,
- * depending on memory pressure.
- *
- * pageout_lbolt:
- * Timestamp of the last time pageout_scanner woke up and started
- * (or resumed) scanning for not recently referenced pages.
+ * depending on memory pressure or zones over their cap.
*/
static clock_t min_pageout_ticks;
static clock_t max_pageout_ticks;
static clock_t pageout_ticks;
-static clock_t pageout_lbolt;
-static uint_t reset_hands;
+#define MAX_PSCAN_THREADS 16
+static boolean_t reset_hands[MAX_PSCAN_THREADS];
+
+/*
+ * These can be tuned in /etc/system or set with mdb.
+ * 'des_page_scanners' is the desired number of page scanner threads. The
+ * system will bring the actual number of threads into line with the desired
+ * number. If des_page_scanners is set to an invalid value, the system will
+ * correct the setting.
+ */
+uint_t des_page_scanners;
+uint_t pageout_reset_cnt = 64; /* num. cycles for pageout_scanner hand reset */
+
+uint_t n_page_scanners;
+static pgcnt_t pscan_region_sz; /* informational only */
+
#define PAGES_POLL_MASK 1023
@@ -145,33 +178,37 @@ static uint_t reset_hands;
* pageout_sample_pages:
* The accumulated number of pages scanned during sampling.
*
- * pageout_sample_ticks:
- * The accumulated clock ticks for the sample.
+ * pageout_sample_etime:
+ * The accumulated number of nanoseconds for the sample.
*
* pageout_rate:
- * Rate in pages/nanosecond, computed at the end of sampling.
+ * Rate in pages/second, computed at the end of sampling.
*
* pageout_new_spread:
- * The new value to use for fastscan and handspreadpages.
- * Calculated after enough samples have been taken.
+ * The new value to use for maxfastscan and (perhaps) handspreadpages.
+ * Intended to be the number of pages that can be scanned per second using
+ * ~10% of a CPU: pageout_rate / 10, calculated after enough samples have
+ * been taken.
*/
typedef hrtime_t hrrate_t;
-static uint64_t pageout_sample_lim = 4;
-static uint64_t pageout_sample_cnt = 0;
+static uint_t pageout_sample_lim = 4;
+static uint_t pageout_sample_cnt = 0;
static pgcnt_t pageout_sample_pages = 0;
static hrrate_t pageout_rate = 0;
static pgcnt_t pageout_new_spread = 0;
-static clock_t pageout_cycle_ticks;
-static hrtime_t sample_start, sample_end;
static hrtime_t pageout_sample_etime = 0;
+/* True if page scanner is first starting up */
+#define PAGE_SCAN_STARTUP (pageout_sample_cnt < pageout_sample_lim)
+
/*
* Record number of times a pageout_scanner wakeup cycle finished because it
* timed out (exceeded its CPU budget), rather than because it visited
- * its budgeted number of pages.
+ * its budgeted number of pages. This is only done when scanning under low
+ * free memory conditions, not when scanning for zones over their cap.
*/
uint64_t pageout_timeouts = 0;
@@ -194,25 +231,35 @@ kcondvar_t memavail_cv;
#define LOOPPAGES total_pages
/*
- * Set up the paging constants for the clock algorithm.
- * Called after the system is initialized and the amount of memory
- * and number of paging devices is known.
+ * Local boolean to control scanning when zones are over their cap. It limits
+ * reads of the global zone_num_over_cap to schedpaging(), which only runs
+ * periodically; zone_num_over_cap is already accessed heavily during paging,
+ * and the page scanner instead consults zones_over on every page it visits.
+ * No lock is needed for zone_num_over_cap since schedpaging() doesn't modify
+ * the variable; it only cares whether the variable is 0 or non-0.
+ */
+static boolean_t zones_over = B_FALSE;
+
+/*
+ * Set up the paging constants for the page scanner clock-hand algorithm.
+ * Called at startup after the system is initialized and the amount of memory
+ * and number of paging devices is known (recalc will be 0). Called again
+ * after the scanner has collected enough samples, so that PAGE_SCAN_STARTUP
+ * is no longer true (recalc will be 1).
+ *
+ * Will also be called after a memory dynamic reconfiguration operation and
+ * recalc will be 1 in those cases too.
*
- * lotsfree is 1/64 of memory, but at least 512K.
+ * lotsfree is 1/64 of memory, but at least 512K (ha!).
* desfree is 1/2 of lotsfree.
* minfree is 1/2 of desfree.
- *
- * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set:
- *
- * lotsfree = btop(512K)
- * desfree = btop(200K)
- * minfree = btop(100K)
- * throttlefree = INT_MIN
- * max_percent_cpu = 4
*/
void
setupclock(int recalc)
{
+ uint_t i;
+ pgcnt_t sz, tmp;
static spgcnt_t init_lfree, init_dfree, init_mfree;
static spgcnt_t init_tfree, init_preserve, init_mpgio;
@@ -221,8 +268,8 @@ setupclock(int recalc)
looppages = LOOPPAGES;
/*
- * setupclock can now be called to recalculate the paging
- * parameters in the case of dynamic addition of memory.
+ * setupclock can be called to recalculate the paging
+ * parameters in the case of dynamic reconfiguration of memory.
* So to make sure we make the proper calculations, if such a
* situation should arise, we save away the initial values
* of each parameter so we can recall them when needed. This
@@ -311,105 +358,98 @@ setupclock(int recalc)
maxpgio = init_mpgio;
/*
- * The clock scan rate varies between fastscan and slowscan
- * based on the amount of free memory available. Fastscan
- * rate should be set based on the number pages that can be
- * scanned per sec using ~10% of processor time. Since this
- * value depends on the processor, MMU, Mhz etc., it is
- * difficult to determine it in a generic manner for all
- * architectures.
+ * When the system is in a low memory state, the page scan rate varies
+ * between fastscan and slowscan based on the amount of free memory
+ * available. When only zones are over their memory cap, the scan rate
+ * is always fastscan.
*
- * Instead of trying to determine the number of pages scanned
- * per sec for every processor, fastscan is set to be the smaller
- * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
- * time is limited to ~4% of processor time.
+	 * The fastscan rate should be set based on the number of pages that
+	 * can be scanned per sec using ~10% of a CPU. Since this value
+	 * depends on the processor, MMU, GHz etc., it must be determined
+	 * dynamically.
*
- * Setting fastscan to be 1/2 of memory allows pageout to scan
- * all of memory in ~2 secs. This implies that user pages not
- * accessed within 1 sec (assuming, handspreadpages == fastscan)
- * can be reclaimed when free memory is very low. Stealing pages
- * not accessed within 1 sec seems reasonable and ensures that
- * active user processes don't thrash.
+ * When the scanner first starts up, fastscan will be set to 0 and
+ * maxfastscan will be set to MAXHANDSPREADPAGES (64MB, in pages).
+ * However, once the scanner has collected enough samples, then fastscan
+ * is set to be the smaller of 1/2 of memory (looppages / loopfraction)
+ * or maxfastscan (which is set from pageout_new_spread). Thus,
+ * MAXHANDSPREADPAGES is irrelevant after the scanner is fully
+ * initialized.
*
- * Smaller values of fastscan result in scanning fewer pages
- * every second and consequently pageout may not be able to free
- * sufficient memory to maintain the minimum threshold. Larger
- * values of fastscan result in scanning a lot more pages which
- * could lead to thrashing and higher CPU usage.
+ * pageout_new_spread is calculated when the scanner first starts
+ * running. During this initial sampling period the nscan_limit
+ * is set to the total_pages of system memory. Thus, the scanner could
+ * theoretically scan all of memory in one pass. However, each sample
+ * is also limited by the %CPU budget. This is controlled by
+ * pageout_ticks which is set in schedpaging(). During the sampling
+ * period, pageout_ticks is set to max_pageout_ticks. This tick value
+ * is derived from the max_percent_cpu (80%) described above. On a
+ * system with more than a small amount of memory (~8GB), the scanner's
+ * %CPU will be the limiting factor in calculating pageout_new_spread.
*
- * Fastscan needs to be limited to a maximum value and should not
- * scale with memory to prevent pageout from consuming too much
- * time for scanning on slow CPU's and avoid thrashing, as a
- * result of scanning too many pages, on faster CPU's.
- * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
- * (the upper bound for fastscan) based on the average number
- * of pages that can potentially be scanned in ~1 sec (using ~4%
- * of the CPU) on some of the following machines that currently
- * run Solaris 2.x:
+ * At the end of the sampling period, the pageout_rate indicates how
+ * many pages could be scanned per second. The pageout_new_spread is
+ * then set to be 1/10th of that (i.e. approximating 10% of a CPU).
+ * Of course, this value could still be more than the physical memory
+ * on the system. If so, fastscan is set to 1/2 of memory, as
+ * mentioned above.
*
- * average memory scanned in ~1 sec
+ * All of this leads up to the setting of handspreadpages, which is
+ * set to fastscan. This is the distance, in pages, between the front
+ * and back hands during scanning. It will dictate which pages will
+ * be considered "hot" on the backhand and which pages will be "cold"
+	 * and reclaimed.
*
- * 25 Mhz SS1+: 23 Meg
- * LX: 37 Meg
- * 50 Mhz SC2000: 68 Meg
+ * If the scanner is limited by desscan, then at the highest rate it
+ * will scan up to fastscan/RATETOSCHEDPAGING pages per cycle. If the
+ * scanner is limited by the %CPU, then at the highest rate (20% of a
+ * CPU per cycle) the number of pages scanned could be much less.
*
- * 40 Mhz 486: 26 Meg
- * 66 Mhz 486: 42 Meg
+ * Thus, if the scanner is limited by desscan, then the handspreadpages
+ * setting means 1sec between the front and back hands, but if the
+ * scanner is limited by %CPU, it could be several seconds between the
+ * two hands.
*
- * When free memory falls just below lotsfree, the scan rate
- * goes from 0 to slowscan (i.e., pageout starts running). This
+ * The basic assumption is that at the worst case, stealing pages
+ * not accessed within 1 sec seems reasonable and ensures that active
+ * user processes don't thrash. This is especially true when the system
+ * is in a low memory state.
+ *
+ * There are some additional factors to consider for the case of
+ * scanning when zones are over their cap. In this situation it is
+ * also likely that the machine will have a large physical memory which
+ * will take many seconds to fully scan (due to the %CPU and desscan
+ * limits per cycle). It is probable that there will be few (or 0)
+ * pages attributed to these zones in any single scanning cycle. The
+ * result is that reclaiming enough pages for these zones might take
+ * several additional seconds (this is generally not a problem since
+ * the zone physical cap is just a soft cap).
+ *
+ * This is similar to the typical multi-processor situation in which
+ * pageout is often unable to maintain the minimum paging thresholds
+ * under heavy load due to the fact that user processes running on
+ * other CPU's can be dirtying memory at a much faster pace than
+ * pageout can find pages to free.
+ *
+ * One potential approach to address both of these cases is to enable
+ * more than one CPU to run the page scanner, in such a manner that the
+ * various clock hands don't overlap. However, this also makes it more
+ * difficult to determine the values for fastscan, slowscan and
+ * handspreadpages. This is left as a future enhancement, if necessary.
+ *
+ * When free memory falls just below lotsfree, the scan rate goes from
+ * 0 to slowscan (i.e., the page scanner starts running). This
* transition needs to be smooth and is achieved by ensuring that
* pageout scans a small number of pages to satisfy the transient
* memory demand. This is set to not exceed 100 pages/sec (25 per
	 * wakeup) since scanning that many pages has no noticeable impact
* on system performance.
*
- * In addition to setting fastscan and slowscan, pageout is
- * limited to using ~4% of the CPU. This results in increasing
- * the time taken to scan all of memory, which in turn means that
- * user processes have a better opportunity of preventing their
- * pages from being stolen. This has a positive effect on
- * interactive and overall system performance when memory demand
- * is high.
- *
- * Thus, the rate at which pages are scanned for replacement will
- * vary linearly between slowscan and the number of pages that
- * can be scanned using ~4% of processor time instead of varying
- * linearly between slowscan and fastscan.
- *
- * Also, the processor time used by pageout will vary from ~1%
- * at slowscan to ~4% at fastscan instead of varying between
- * ~1% at slowscan and ~10% at fastscan.
- *
- * The values chosen for the various VM parameters (fastscan,
- * handspreadpages, etc) are not universally true for all machines,
- * but appear to be a good rule of thumb for the machines we've
- * tested. They have the following ranges:
- *
- * cpu speed: 20 to 70 Mhz
- * page size: 4K to 8K
- * memory size: 16M to 5G
- * page scan rate: 4000 - 17400 4K pages per sec
- *
- * The values need to be re-examined for machines which don't
- * fall into the various ranges (e.g., slower or faster CPUs,
- * smaller or larger pagesizes etc) shown above.
- *
- * On an MP machine, pageout is often unable to maintain the
- * minimum paging thresholds under heavy load. This is due to
- * the fact that user processes running on other CPU's can be
- * dirtying memory at a much faster pace than pageout can find
- * pages to free. The memory demands could be met by enabling
- * more than one CPU to run the clock algorithm in such a manner
- * that the various clock hands don't overlap. This also makes
- * it more difficult to determine the values for fastscan, slowscan
- * and handspreadpages.
- *
- * The swapper is currently used to free up memory when pageout
- * is unable to meet memory demands by swapping out processes.
- * In addition to freeing up memory, swapping also reduces the
- * demand for memory by preventing user processes from running
- * and thereby consuming memory.
+ * The swapper is currently used to free up memory when pageout is
+ * unable to meet memory demands. It does this by swapping out entire
+ * processes. In addition to freeing up memory, swapping also reduces
+ * the demand for memory because the swapped out processes cannot
+ * run, and thereby consume memory. However, this is a pathological
+ * state and performance will generally be considered unacceptable.
*/
if (init_mfscan == 0) {
if (pageout_new_spread != 0)
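A sketch of the post-sampling recalculation described above (the exact
expression for pageout_rate is an assumption; this change states only that
it is pages/second derived from the sample):

	/* once pageout_sample_cnt reaches pageout_sample_lim: */
	pageout_rate = (hrrate_t)pageout_sample_pages * NANOSEC /
	    pageout_sample_etime;		/* pages per second */
	pageout_new_spread = pageout_rate / 10;	/* ~10% of one CPU */
	setupclock(1);				/* recalc fastscan et al. */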
@@ -419,12 +459,13 @@ setupclock(int recalc)
} else {
maxfastscan = init_mfscan;
}
- if (init_fscan == 0)
+ if (init_fscan == 0) {
fastscan = MIN(looppages / loopfraction, maxfastscan);
- else
+ } else {
fastscan = init_fscan;
- if (fastscan > looppages / loopfraction)
- fastscan = looppages / loopfraction;
+ if (fastscan > looppages / loopfraction)
+ fastscan = looppages / loopfraction;
+ }
/*
* Set slow scan time to 1/10 the fast scan time, but
@@ -444,12 +485,10 @@ setupclock(int recalc)
* decreases as the scan rate rises. It must be < the amount
* of pageable memory.
*
- * Since pageout is limited to ~4% of the CPU, setting handspreadpages
- * to be "fastscan" results in the front hand being a few secs
- * (varies based on the processor speed) ahead of the back hand
- * at fastscan rates. This distance can be further reduced, if
- * necessary, by increasing the processor time used by pageout
- * to be more than ~4% and preferrably not more than ~10%.
+ * Since pageout is limited to the %CPU per cycle, setting
+ * handspreadpages to be "fastscan" results in the front hand being
+ * a few secs (varies based on the processor speed) ahead of the back
+ * hand at fastscan rates.
*
* As a result, user processes have a much better chance of
* referencing their pages before the back hand examines them.
@@ -471,29 +510,78 @@ setupclock(int recalc)
if (handspreadpages >= looppages)
handspreadpages = looppages - 1;
+ if (recalc == 0) {
+ /*
+ * Setup basic values at initialization.
+ */
+ pscan_region_sz = total_pages;
+ des_page_scanners = n_page_scanners = 1;
+ reset_hands[0] = B_TRUE;
+ return;
+ }
+
/*
- * If we have been called to recalculate the parameters,
- * set a flag to re-evaluate the clock hand pointers.
+ * Recalculating
+ *
+ * We originally set the number of page scanners to 1. Now that we
+	 * know the handspreadpages value for a scanner, figure out how many
+	 * scanners we should run. We want to ensure that the scan regions
+	 * neither overlap nor touch.
+ *
+ * A default 64GB region size is used as the initial value to calculate
+ * how many scanner threads we should create on lower memory systems.
+ * The idea is to limit the number of threads to a practical value
+ * (e.g. a 64GB machine really only needs one scanner thread). For very
+ * large memory systems, we limit ourselves to MAX_PSCAN_THREADS
+ * threads.
+ *
+ * The scanner threads themselves are evenly spread out around the
+ * memory "clock" in pageout_scanner when we reset the hands, and each
+ * thread will scan all of memory.
*/
- if (recalc)
- reset_hands = 1;
+ sz = (btop(64ULL * 0x40000000ULL));
+ if (sz < handspreadpages) {
+ /*
+ * 64GB is smaller than the separation between the front
+ * and back hands; use double handspreadpages.
+ */
+ sz = handspreadpages << 1;
+ }
+ if (sz > total_pages) {
+ sz = total_pages;
+ }
+ /* Record region size for inspection with mdb, otherwise unused */
+ pscan_region_sz = sz;
+
+ tmp = sz;
+ for (i = 1; tmp < total_pages; i++) {
+ tmp += sz;
+ }
+
+ if (i > MAX_PSCAN_THREADS)
+ i = MAX_PSCAN_THREADS;
+
+ des_page_scanners = i;
}
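Worked examples of the sizing loop above (illustrative memory sizes,
assuming 4K pages):

	/* total_pages = 512GB, sz = 64GB: i counts 1..8    ->  8 scanners
	 * total_pages =  64GB, sz = 64GB: loop never runs  ->  1 scanner
	 * total_pages =   2TB, sz = 64GB: i would reach 32, clamped
	 *     to MAX_PSCAN_THREADS                         -> 16 scanners */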
/*
* Pageout scheduling.
*
* Schedpaging controls the rate at which the page out daemon runs by
- * setting the global variables nscan and desscan RATETOSCHEDPAGING
- * times a second. Nscan records the number of pages pageout has examined
- * in its current pass; schedpaging resets this value to zero each time
- * it runs. Desscan records the number of pages pageout should examine
- * in its next pass; schedpaging sets this value based on the amount of
- * currently available memory.
+ * setting the global variables pageout_ticks and desscan RATETOSCHEDPAGING
+ * times a second. The pageout_ticks variable controls the percent of one
+ * CPU that each page scanner thread should consume (see min_percent_cpu
+ * and max_percent_cpu descriptions). The desscan variable records the number
+ * of pages pageout should examine in its next pass; schedpaging sets this
+ * value based on the amount of currently available memory. In addition, the
+ * nscan variable records the number of pages pageout has examined in its
+ * current pass; schedpaging resets this value to zero each time it runs.
*/
-#define RATETOSCHEDPAGING 4 /* hz that is */
+#define RATETOSCHEDPAGING 4 /* times/second */
-static kmutex_t pageout_mutex; /* held while pageout or schedpaging running */
+/* held while pageout_scanner or schedpaging are modifying shared data */
+static kmutex_t pageout_mutex;
/*
* Pool of available async pageout putpage requests.
@@ -506,7 +594,7 @@ static kcondvar_t push_cv;
static int async_list_size = 256; /* number of async request structs */
-static void pageout_scanner(void);
+static void pageout_scanner(void *);
/*
* If a page is being shared more than "po_share" times
@@ -535,67 +623,153 @@ schedpaging(void *arg)
if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
kcage_cageout_wakeup();
- if (mutex_tryenter(&pageout_mutex)) {
- /* pageout() not running */
- nscan = 0;
- vavail = freemem - deficit;
- if (pageout_new_spread != 0)
- vavail -= needfree;
- if (vavail < 0)
- vavail = 0;
- if (vavail > lotsfree)
- vavail = lotsfree;
+ (void) atomic_swap_ulong(&nscan, 0);
+ vavail = freemem - deficit;
+ if (pageout_new_spread != 0)
+ vavail -= needfree;
+ if (vavail < 0)
+ vavail = 0;
+ if (vavail > lotsfree)
+ vavail = lotsfree;
+ /*
+ * Fix for 1161438 (CRS SPR# 73922). All variables
+ * in the original calculation for desscan were 32 bit signed
+ * ints. As freemem approaches 0x0 on a system with 1 Gig or
+ * more of memory, the calculation can overflow. When this
+ * happens, desscan becomes negative and pageout_scanner()
+ * stops paging out.
+ */
+ if ((needfree) && (pageout_new_spread == 0)) {
/*
- * Fix for 1161438 (CRS SPR# 73922). All variables
- * in the original calculation for desscan were 32 bit signed
- * ints. As freemem approaches 0x0 on a system with 1 Gig or
- * more of memory, the calculation can overflow. When this
- * happens, desscan becomes negative and pageout_scanner()
- * stops paging out.
+ * If we've not yet collected enough samples to
+ * calculate a spread, kick into high gear anytime
+ * needfree is non-zero. Note that desscan will not be
+ * the limiting factor for systems with larger memory;
+ * the %CPU will limit the scan. That will also be
+ * maxed out below.
*/
- if ((needfree) && (pageout_new_spread == 0)) {
- /*
- * If we've not yet collected enough samples to
- * calculate a spread, use the old logic of kicking
- * into high gear anytime needfree is non-zero.
- */
- desscan = fastscan / RATETOSCHEDPAGING;
- } else {
- /*
- * Once we've calculated a spread based on system
- * memory and usage, just treat needfree as another
- * form of deficit.
- */
- spgcnt_t faststmp, slowstmp, result;
+ desscan = fastscan / RATETOSCHEDPAGING;
+ } else {
+ /*
+ * Once we've calculated a spread based on system
+ * memory and usage, just treat needfree as another
+ * form of deficit.
+ */
+ spgcnt_t faststmp, slowstmp, result;
+
+ slowstmp = slowscan * vavail;
+ faststmp = fastscan * (lotsfree - vavail);
+ result = (slowstmp + faststmp) /
+ nz(lotsfree) / RATETOSCHEDPAGING;
+ desscan = (pgcnt_t)result;
+ }
+
+ /*
+ * If we've not yet collected enough samples to calculate a
+ * spread, also kick %CPU to the max.
+ */
+ if (pageout_new_spread == 0) {
+ pageout_ticks = max_pageout_ticks;
+ } else {
+ pageout_ticks = min_pageout_ticks +
+ (lotsfree - vavail) *
+ (max_pageout_ticks - min_pageout_ticks) /
+ nz(lotsfree);
+ }
- slowstmp = slowscan * vavail;
- faststmp = fastscan * (lotsfree - vavail);
- result = (slowstmp + faststmp) /
- nz(lotsfree) / RATETOSCHEDPAGING;
- desscan = (pgcnt_t)result;
+ if (pageout_new_spread != 0 && des_page_scanners != n_page_scanners) {
+ /*
+ * We have finished the pagescan initialization and the desired
+ * number of page scanners has changed, either because
+ * initialization just finished, because of a memory DR, or
+ * because des_page_scanners has been modified on the fly (i.e.
+ * by mdb). If we need more scanners, start them now, otherwise
+ * the excess scanners will terminate on their own when they
+ * reset their hands.
+ */
+ uint_t i;
+ uint_t curr_nscan = n_page_scanners;
+ pgcnt_t max = total_pages / handspreadpages;
+
+ if (des_page_scanners > max)
+ des_page_scanners = max;
+
+ if (des_page_scanners > MAX_PSCAN_THREADS) {
+ des_page_scanners = MAX_PSCAN_THREADS;
+ } else if (des_page_scanners == 0) {
+ des_page_scanners = 1;
}
- pageout_ticks = min_pageout_ticks + (lotsfree - vavail) *
- (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree);
+ /*
+ * Each thread has its own entry in the reset_hands array, so
+ * we don't need any locking in pageout_scanner to check the
+ * thread's reset_hands entry. Thus, we use a pre-allocated
+ * fixed size reset_hands array and upper limit on the number
+ * of pagescan threads.
+ *
+ * The reset_hands entries need to be true before we start new
+ * scanners, but if we're reducing, we don't want a race on the
+ * recalculation for the existing threads, so we set
+ * n_page_scanners first.
+ */
+ n_page_scanners = des_page_scanners;
+ for (i = 0; i < MAX_PSCAN_THREADS; i++) {
+ reset_hands[i] = B_TRUE;
+ }
- if (freemem < lotsfree + needfree ||
- pageout_sample_cnt < pageout_sample_lim) {
- TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
- "pageout_cv_signal:freemem %ld", freemem);
- cv_signal(&proc_pageout->p_cv);
- } else {
- /*
- * There are enough free pages, no need to
- * kick the scanner thread. And next time
- * around, keep more of the `highly shared'
- * pages.
- */
- cv_signal_pageout();
- if (po_share > MIN_PO_SHARE) {
- po_share >>= 1;
+ if (des_page_scanners > curr_nscan) {
+ /* Create additional pageout scanner threads. */
+ for (i = curr_nscan; i < des_page_scanners; i++) {
+ (void) lwp_kernel_create(proc_pageout,
+ pageout_scanner, (void *)(uintptr_t)i,
+ TS_RUN, curthread->t_pri);
}
}
+ }
+
+ zones_over = B_FALSE;
+
+ if (freemem < lotsfree + needfree || PAGE_SCAN_STARTUP) {
+ if (!PAGE_SCAN_STARTUP)
+ low_mem_scan++;
+ DTRACE_PROBE(schedpage__wake__low);
+ WAKE_PAGEOUT_SCANNER();
+
+ } else if (zone_num_over_cap > 0) {
+ /* One or more zones are over their cap. */
+
+ /* No page limit */
+ desscan = total_pages;
+
+ /*
+ * Increase the scanning CPU% to the max. This implies
+ * 80% of one CPU/sec if the scanner can run each
+ * opportunity. Can also be tuned via setting
+ * zone_pageout_ticks in /etc/system or with mdb.
+ */
+ pageout_ticks = (zone_pageout_ticks != 0) ?
+ zone_pageout_ticks : max_pageout_ticks;
+
+ zones_over = B_TRUE;
+ zone_cap_scan++;
+
+ DTRACE_PROBE(schedpage__wake__zone);
+ WAKE_PAGEOUT_SCANNER();
+
+ } else {
+ /*
+ * There are enough free pages, no need to
+ * kick the scanner thread. And next time
+ * around, keep more of the `highly shared'
+ * pages.
+ */
+ cv_signal_pageout();
+
+ mutex_enter(&pageout_mutex);
+ if (po_share > MIN_PO_SHARE) {
+ po_share >>= 1;
+ }
mutex_exit(&pageout_mutex);
}
@@ -617,36 +791,46 @@ ulong_t push_list_size; /* # of requests on pageout queue */
#define FRONT 1
#define BACK 2
-int dopageout = 1; /* must be non-zero to turn page stealing on */
+int dopageout = 1; /* /etc/system tunable to disable page reclamation */
/*
* The page out daemon, which runs as process 2.
*
- * As long as there are at least lotsfree pages,
- * this process is not run. When the number of free
- * pages stays in the range desfree to lotsfree,
- * this daemon runs through the pages in the loop
- * at a rate determined in schedpaging(). Pageout manages
- * two hands on the clock. The front hand moves through
- * memory, clearing the reference bit,
- * and stealing pages from procs that are over maxrss.
- * The back hand travels a distance behind the front hand,
- * freeing the pages that have not been referenced in the time
- * since the front hand passed. If modified, they are pushed to
- * swap before being freed.
+ * Page out occurs when either:
+ * a) there is less than lotsfree pages,
+ * b) there are one or more zones over their physical memory cap.
+ *
+ * The daemon treats physical memory as a circular array of pages and scans the
+ * pages using a 'two-handed clock' algorithm. The front hand moves through
+ * the pages, clearing the reference bit. The back hand travels a distance
+ * (handspreadpages) behind the front hand, freeing the pages that have not
+ * been referenced in the time since the front hand passed. If modified, they
+ * are first written to their backing store before being freed.
+ *
+ * In order to make page invalidation more responsive on machines with larger
+ * memory, multiple pageout_scanner threads may be created. In this case, the
+ * threads are evenly distributed around the memory "clock face" so that
+ * memory can be reclaimed more quickly (with a single thread, large regions
+ * of memory can go unexamined for long stretches, and the resulting lag
+ * causes undesirable behavior such as htable stealing).
+ *
+ * As long as there are at least lotsfree pages and no zones over their cap,
+ * then pageout_scanner threads are not run. When pageout_scanner threads are
+ * running for case (a), all pages are considered for pageout. For case (b),
+ * only pages belonging to a zone over its cap will be considered for pageout.
*
- * There are 2 threads that act on behalf of the pageout process.
- * One thread scans pages (pageout_scanner) and frees them up if
+ * There are multiple threads that act on behalf of the pageout process.
+ * A set of threads scans pages (pageout_scanner) and frees them up if
* they don't require any VOP_PUTPAGE operation. If a page must be
* written back to its backing store, the request is put on a list
* and the other (pageout) thread is signaled. The pageout thread
* grabs VOP_PUTPAGE requests from the list, and processes them.
* Some filesystems may require resources for the VOP_PUTPAGE
* operations (like memory) and hence can block the pageout
- * thread, but the scanner thread can still operate. There is still
+ * thread, but the pageout_scanner threads can still operate. There is still
* no guarantee that memory deadlocks cannot occur.
*
- * For now, this thing is in very rough form.
+ * The pageout_scanner parameters are determined in schedpaging().
*/
void
pageout()
@@ -684,9 +868,9 @@ pageout()
pageout_pri = curthread->t_pri;
- /* Create the pageout scanner thread. */
- (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
- pageout_pri - 1);
+ /* Create the (first) pageout scanner thread. */
+ (void) lwp_kernel_create(proc_pageout, pageout_scanner, (void *) 0,
+ TS_RUN, pageout_pri - 1);
/*
* kick off pageout scheduler.
@@ -720,6 +904,7 @@ pageout()
arg->a_next = NULL;
mutex_exit(&push_lock);
+ DTRACE_PROBE(pageout__push);
if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
pushes++;
@@ -740,32 +925,24 @@ pageout()
* Kernel thread that scans pages looking for ones to free
*/
static void
-pageout_scanner(void)
+pageout_scanner(void *a)
{
struct page *fronthand, *backhand;
- uint_t count;
+ uint_t count, iter = 0;
callb_cpr_t cprinfo;
- pgcnt_t nscan_limit;
+ pgcnt_t nscan_cnt, nscan_limit;
pgcnt_t pcount;
+ uint_t inst = (uint_t)(uintptr_t)a;
+ hrtime_t sample_start, sample_end;
+ clock_t pageout_lbolt;
+ kmutex_t pscan_mutex;
- CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
- mutex_enter(&pageout_mutex);
+ VERIFY3U(inst, <, MAX_PSCAN_THREADS);
- /*
- * The restart case does not attempt to point the hands at roughly
- * the right point on the assumption that after one circuit things
- * will have settled down - and restarts shouldn't be that often.
- */
+ mutex_init(&pscan_mutex, NULL, MUTEX_DEFAULT, NULL);
- /*
- * Set the two clock hands to be separated by a reasonable amount,
- * but no more than 360 degrees apart.
- */
- backhand = page_first();
- if (handspreadpages >= total_pages)
- fronthand = page_nextn(backhand, total_pages - 1);
- else
- fronthand = page_nextn(backhand, handspreadpages);
+ CALLB_CPR_INIT(&cprinfo, &pscan_mutex, callb_generic_cpr, "poscan");
+ mutex_enter(&pscan_mutex);
min_pageout_ticks = MAX(1,
((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING);
@@ -776,71 +953,116 @@ loop:
cv_signal_pageout();
CALLB_CPR_SAFE_BEGIN(&cprinfo);
- cv_wait(&proc_pageout->p_cv, &pageout_mutex);
- CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
+ cv_wait(&proc_pageout->p_cv, &pscan_mutex);
+ CALLB_CPR_SAFE_END(&cprinfo, &pscan_mutex);
if (!dopageout)
goto loop;
- if (reset_hands) {
- reset_hands = 0;
+ if (reset_hands[inst]) {
+ struct page *first;
+ pgcnt_t offset = total_pages / n_page_scanners;
- backhand = page_first();
- if (handspreadpages >= total_pages)
+ reset_hands[inst] = B_FALSE;
+ if (inst >= n_page_scanners) {
+ /*
+ * The desired number of page scanners has been
+ * reduced and this instance is no longer wanted.
+ * Exit the lwp.
+ */
+ VERIFY3U(inst, !=, 0);
+ mutex_exit(&pscan_mutex);
+ mutex_enter(&curproc->p_lock);
+ lwp_exit();
+ }
+
+ /*
+ * The reset case repositions the hands at the proper place
+ * on the memory clock face to prevent creep into another
+ * thread's active region or when the number of threads has
+ * changed.
+ *
+ * Set the two clock hands to be separated by a reasonable
+ * amount, but no more than 360 degrees apart.
+ *
+ * If inst == 0, backhand starts at first page, otherwise
+ * it is (inst * offset) around the memory "clock face" so that
+ * we spread out each scanner instance evenly.
+ */
+ first = page_first();
+ backhand = page_nextn(first, offset * inst);
+ if (handspreadpages >= total_pages) {
fronthand = page_nextn(backhand, total_pages - 1);
- else
+ } else {
fronthand = page_nextn(backhand, handspreadpages);
+ }
}
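/*
 * [Editor's sketch] The same hand placement expressed as page indices
 * rather than page_t pointers; scanner_hand_offsets is a hypothetical
 * helper shown only to make the arithmetic above concrete:
 */
static void
scanner_hand_offsets(pgcnt_t total_pages, pgcnt_t handspreadpages,
    uint_t n_scanners, uint_t inst, pgcnt_t *backp, pgcnt_t *frontp)
{
	pgcnt_t offset = total_pages / n_scanners;
	/* Spread the instances evenly around the memory "clock face". */
	pgcnt_t back = (offset * inst) % total_pages;
	/* Hands are at most 360 degrees (total_pages - 1 pages) apart. */
	pgcnt_t spread = (handspreadpages >= total_pages) ?
	    total_pages - 1 : handspreadpages;

	*backp = back;
	*frontp = (back + spread) % total_pages;
}
/*
 * E.g., with total_pages = 1048576 and n_scanners = 4, instances 0..3
 * start their back hands at page indices 0, 262144, 524288 and 786432.
 */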
+ /*
+ * This CPU kstat is only incremented here and we're obviously on this
+ * CPU, so no lock.
+ */
CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
count = 0;
- TRACE_4(TR_FAC_VM, TR_PAGEOUT_START,
- "pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld",
- freemem, lotsfree, nscan, desscan);
-
/* Kernel probe */
TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */,
tnf_ulong, pages_free, freemem, tnf_ulong, pages_needed, needfree);
pcount = 0;
- if (pageout_sample_cnt < pageout_sample_lim) {
+ nscan_cnt = 0;
+ if (PAGE_SCAN_STARTUP) {
nscan_limit = total_pages;
} else {
nscan_limit = desscan;
}
+
+ DTRACE_PROBE4(pageout__start, pgcnt_t, nscan_limit, uint_t, inst,
+ page_t *, backhand, page_t *, fronthand);
+
pageout_lbolt = ddi_get_lbolt();
sample_start = gethrtime();
/*
* Scan the appropriate number of pages for a single duty cycle.
- * However, stop scanning as soon as there is enough free memory.
- * For a short while, we will be sampling the performance of the
- * scanner and need to keep running just to get sample data, in
- * which case we keep going and don't pay attention to whether
- * or not there is enough free memory.
+ * Only scan while at least one of these is true:
+ * 1) one or more zones is over its cap
+ * 2) there is not enough free memory
+ * 3) during page scan startup when determining sample data
*/
-
- while (nscan < nscan_limit && (freemem < lotsfree + needfree ||
- pageout_sample_cnt < pageout_sample_lim)) {
+ while (nscan_cnt < nscan_limit &&
+ (zones_over ||
+ freemem < lotsfree + needfree ||
+ PAGE_SCAN_STARTUP)) {
int rvfront, rvback;
+ DTRACE_PROBE2(pageout__loop, pgcnt_t, pcount, uint_t, inst);
+
/*
* Check to see if we have exceeded our %CPU budget
* for this wakeup, but not on every single page visited,
* just every once in a while.
*/
if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
+ clock_t pageout_cycle_ticks;
+
pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt;
if (pageout_cycle_ticks >= pageout_ticks) {
- ++pageout_timeouts;
+ /*
+ * This is where we normally break out of the
+ * loop when scanning zones or sampling.
+ */
+ if (!zones_over) {
+ atomic_inc_64(&pageout_timeouts);
+ }
+ DTRACE_PROBE1(pageout__timeout, uint_t, inst);
break;
}
}
/*
* If checkpage manages to add a page to the free list,
- * we give ourselves another couple of trips around the loop.
+ * we give ourselves another couple of trips around memory.
*/
if ((rvfront = checkpage(fronthand, FRONT)) == 1)
count = 0;
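/*
 * [Editor's sketch] The %CPU budget check above is amortized with a
 * power-of-two mask so that ddi_get_lbolt() is called only once every
 * (mask + 1) pages visited. The idiom, with an assumed mask value:
 */
#define	EXAMPLE_POLL_MASK	1023	/* assumed (2^10 - 1) */

static void
scan_with_budget(pgcnt_t nscan_limit, clock_t budget_ticks)
{
	clock_t start_lbolt = ddi_get_lbolt();
	pgcnt_t pcount;

	for (pcount = 0; pcount < nscan_limit; pcount++) {
		/* Check the clock once every EXAMPLE_POLL_MASK + 1 pages. */
		if ((pcount & EXAMPLE_POLL_MASK) == EXAMPLE_POLL_MASK &&
		    ddi_get_lbolt() - start_lbolt >= budget_ticks)
			break;	/* CPU budget for this wakeup exhausted */
		/* ... visit one page ... */
	}
}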
@@ -850,7 +1072,8 @@ loop:
++pcount;
/*
- * protected by pageout_mutex instead of cpu_stat_lock
+ * This CPU kstat is only incremented here and we're obviously
+ * on this CPU, so no lock.
*/
CPU_STATS_ADDQ(CPU, vm, scan, 1);
@@ -858,7 +1081,7 @@ loop:
* Don't include ineligible pages in the number scanned.
*/
if (rvfront != -1 || rvback != -1)
- nscan++;
+ nscan_cnt++;
backhand = page_next(backhand);
@@ -868,56 +1091,89 @@ loop:
*/
if ((fronthand = page_next(fronthand)) == page_first()) {
- TRACE_2(TR_FAC_VM, TR_PAGEOUT_HAND_WRAP,
- "pageout_hand_wrap:freemem %ld whichhand %d",
- freemem, FRONT);
+ DTRACE_PROBE1(pageout__wrap__front, uint_t, inst);
/*
- * protected by pageout_mutex instead of cpu_stat_lock
+			 * Every pageout_reset_cnt wraps (64 by default) we
+			 * reposition our hands within our own region to
+			 * prevent creep into another thread's region.
+ */
+ if ((++iter % pageout_reset_cnt) == 0)
+ reset_hands[inst] = B_TRUE;
+
+ /*
+ * This CPU kstat is only incremented here and we're
+ * obviously on this CPU, so no lock.
*/
CPU_STATS_ADDQ(CPU, vm, rev, 1);
- if (++count > 1) {
+
+ /*
+ * If scanning because the system is low on memory,
+ * then when we wraparound memory we want to try to
+ * reclaim more pages.
+ * If scanning only because zones are over their cap,
+ * then wrapping is common and we simply keep going.
+ */
+ if (freemem < lotsfree + needfree && ++count > 1) {
/*
+ * The system is low on memory.
* Extremely unlikely, but it happens.
- * We went around the loop at least once
- * and didn't get far enough.
+ * We went around memory at least once
+ * and didn't reclaim enough.
* If we are still skipping `highly shared'
* pages, skip fewer of them. Otherwise,
* give up till the next clock tick.
*/
+ mutex_enter(&pageout_mutex);
if (po_share < MAX_PO_SHARE) {
po_share <<= 1;
+ mutex_exit(&pageout_mutex);
} else {
/*
- * Really a "goto loop", but
- * if someone is TRACing or
- * TNF_PROBE_ing, at least
- * make records to show
- * where we are.
+ * Really a "goto loop", but if someone
+ * is tracing or TNF_PROBE_ing, hit
+ * those probes first.
*/
+ mutex_exit(&pageout_mutex);
break;
}
}
}
}
+ atomic_add_long(&nscan, nscan_cnt);
+
sample_end = gethrtime();
- TRACE_5(TR_FAC_VM, TR_PAGEOUT_END,
- "pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u",
- freemem, lotsfree, nscan, desscan, count);
+ DTRACE_PROBE3(pageout__loop__end, pgcnt_t, nscan_cnt, pgcnt_t, pcount,
+ uint_t, inst);
/* Kernel probe */
TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */,
- tnf_ulong, pages_scanned, nscan, tnf_ulong, pages_free, freemem);
+ tnf_ulong, pages_scanned, nscan_cnt, tnf_ulong, pages_free,
+ freemem);
- if (pageout_sample_cnt < pageout_sample_lim) {
+ /*
+ * The following two blocks are only relevant when the scanner is
+ * first started up. After the scanner runs for a while, neither of
+ * the conditions will ever be true again.
+ *
+ * The global variables used below are only modified by this thread and
+ * only during initial scanning when there is a single page scanner
+ * thread running. Thus, we don't use any locking.
+ */
+ if (PAGE_SCAN_STARTUP) {
+ VERIFY3U(inst, ==, 0);
pageout_sample_pages += pcount;
pageout_sample_etime += sample_end - sample_start;
++pageout_sample_cnt;
- }
- if (pageout_sample_cnt >= pageout_sample_lim &&
- pageout_new_spread == 0) {
+
+ } else if (pageout_new_spread == 0) {
+ uint_t i;
+
+ /*
+		 * We have run enough samples; set the spread.
+ */
+ VERIFY3U(inst, ==, 0);
pageout_rate = (hrrate_t)pageout_sample_pages *
(hrrate_t)(NANOSEC) / pageout_sample_etime;
pageout_new_spread = pageout_rate / 10;
@@ -931,9 +1187,8 @@ loop:
* Look at the page at hand. If it is locked (e.g., for physical i/o),
* system (u., page table) or free, then leave it alone. Otherwise,
* if we are running the front hand, turn off the page's reference bit.
- * If the proc is over maxrss, we take it. If running the back hand,
- * check whether the page has been reclaimed. If not, free the page,
- * pushing it to disk first if necessary.
+ * If running the back hand, check whether the page has been reclaimed.
+ * If not, free the page, pushing it to disk first if necessary.
*
* Return values:
* -1 if the page is not a candidate at all,
@@ -947,6 +1202,7 @@ checkpage(struct page *pp, int whichhand)
int isfs = 0;
int isexec = 0;
int pagesync_flag;
+ zoneid_t zid = ALL_ZONES;
/*
* Skip pages:
@@ -989,6 +1245,21 @@ checkpage(struct page *pp, int whichhand)
return (-1);
}
+ if (zones_over) {
+		ASSERT(pp->p_zoneid == ALL_ZONES ||
+		    (pp->p_zoneid >= 0 && pp->p_zoneid <= MAX_ZONEID));
+ if (pp->p_zoneid == ALL_ZONES ||
+ zone_pdata[pp->p_zoneid].zpers_over == 0) {
+ /*
+			 * Cross-zone shared page, or zone not over its cap.
+ * Leave the page alone.
+ */
+ page_unlock(pp);
+ return (-1);
+ }
+ zid = pp->p_zoneid;
+ }
+
/*
* Maintain statistics for what we are freeing
*/
@@ -1016,31 +1287,24 @@ checkpage(struct page *pp, int whichhand)
recheck:
/*
- * If page is referenced; make unreferenced but reclaimable.
- * If this page is not referenced, then it must be reclaimable
- * and we can add it to the free list.
+	 * If the page is referenced, the front hand makes it unreferenced and
+	 * reclaimable. For the back hand, a process has referenced the page
+	 * since the front hand went by, so it's not a candidate for freeing up.
*/
if (ppattr & P_REF) {
- TRACE_2(TR_FAC_VM, TR_PAGEOUT_ISREF,
- "pageout_isref:pp %p whichhand %d", pp, whichhand);
+ DTRACE_PROBE2(pageout__isref, page_t *, pp, int, whichhand);
if (whichhand == FRONT) {
- /*
- * Checking of rss or madvise flags needed here...
- *
- * If not "well-behaved", fall through into the code
- * for not referenced.
- */
hat_clrref(pp);
}
- /*
- * Somebody referenced the page since the front
- * hand went by, so it's not a candidate for
- * freeing up.
- */
page_unlock(pp);
return (0);
}
+ /*
+ * This page is not referenced, so it must be reclaimable and we can
+ * add it to the free list. This can be done by either hand.
+ */
+
VM_STAT_ADD(pageoutvmstats.checkpage[0]);
/*
@@ -1073,8 +1337,9 @@ recheck:
u_offset_t offset = pp->p_offset;
/*
- * XXX - Test for process being swapped out or about to exit?
- * [Can't get back to process(es) using the page.]
+ * Note: There is no possibility to test for process being
+ * swapped out or about to exit since we can't get back to
+ * process(es) from the page.
*/
/*
@@ -1092,6 +1357,11 @@ recheck:
VN_RELE(vp);
return (0);
}
+ if (isfs) {
+ zone_pageout_stat(zid, ZPO_DIRTY);
+ } else {
+ zone_pageout_stat(zid, ZPO_ANONDIRTY);
+ }
return (1);
}
@@ -1102,8 +1372,7 @@ recheck:
* the pagesync but before it was unloaded we catch it
* and handle the page properly.
*/
- TRACE_2(TR_FAC_VM, TR_PAGEOUT_FREE,
- "pageout_free:pp %p whichhand %d", pp, whichhand);
+ DTRACE_PROBE2(pageout__free, page_t *, pp, int, whichhand);
(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
ppattr = hat_page_getattr(pp, P_MOD | P_REF);
if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode))
@@ -1120,8 +1389,10 @@ recheck:
} else {
CPU_STATS_ADD_K(vm, fsfree, 1);
}
+ zone_pageout_stat(zid, ZPO_FS);
} else {
CPU_STATS_ADD_K(vm, anonfree, 1);
+ zone_pageout_stat(zid, ZPO_ANON);
}
return (1); /* freed a page! */
diff --git a/usr/src/uts/common/os/vmem.c b/usr/src/uts/common/os/vmem.c
index 4664c52e77..1b027b4409 100644
--- a/usr/src/uts/common/os/vmem.c
+++ b/usr/src/uts/common/os/vmem.c
@@ -1621,7 +1621,7 @@ vmem_destroy(vmem_t *vmp)
leaked = vmem_size(vmp, VMEM_ALLOC);
if (leaked != 0)
- cmn_err(CE_WARN, "vmem_destroy('%s'): leaked %lu %s",
+ cmn_err(CE_WARN, "!vmem_destroy('%s'): leaked %lu %s",
vmp->vm_name, leaked, (vmp->vm_cflags & VMC_IDENTIFIER) ?
"identifiers" : "bytes");
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c
index e89cf2c06d..ebde0d7850 100644
--- a/usr/src/uts/common/os/zone.c
+++ b/usr/src/uts/common/os/zone.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015, Joyent Inc. All rights reserved.
+ * Copyright (c) 2019, Joyent, Inc.
* Copyright (c) 2016 by Delphix. All rights reserved.
* Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
*/
@@ -106,14 +106,16 @@
* removed from the list of active zones. zone_destroy() returns, and
* the zone can be recreated.
*
- * ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
- * callbacks are executed, and all memory associated with the zone is
- * freed.
+ * ZONE_IS_FREE (internal state): All references have been dropped and
+ * the zone_t is no longer in the zone_active nor zone_deathrow lists.
+ * The zone_t is in the process of being freed. This state exists
+ * only for publishing a sysevent to indicate that the zone by this
+ * name can be booted again.
*
- * Threads can wait for the zone to enter a requested state by using
- * zone_status_wait() or zone_status_timedwait() with the desired
- * state passed in as an argument. Zone state transitions are
- * uni-directional; it is not possible to move back to an earlier state.
+ * Threads can wait for the zone to enter a requested state (other than
+ * ZONE_IS_FREE) by using zone_status_wait() or zone_status_timedwait()
+ * with the desired state passed in as an argument. Zone state transitions
+ * are uni-directional; it is not possible to move back to an earlier state.
*
*
* Zone-Specific Data:
@@ -252,6 +254,8 @@
#include <sys/cpucaps.h>
#include <vm/seg.h>
#include <sys/mac.h>
+#include <sys/rt.h>
+#include <sys/fx.h>
/*
* This constant specifies the number of seconds that threads waiting for
@@ -312,6 +316,7 @@ static id_space_t *zoneid_space;
* 'global_zone'.
*/
zone_t zone0;
+zone_zfs_io_t zone0_zp_zfs;
zone_t *global_zone = NULL; /* Set when the global zone is initialized */
/*
@@ -327,8 +332,8 @@ static list_t zone_active;
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;
-/* number of zones is limited by virtual interface limit in IP */
-uint_t maxzones = 8192;
+/* This can be dynamically reduced if various subsystems hit internal limits. */
+uint_t maxzones = MAX_ZONES;
/* Event channel to sent zone state change notifications */
evchan_t *zone_event_chan;
@@ -350,6 +355,7 @@ const char *zone_status_table[] = {
ZONE_EVENT_SHUTTING_DOWN, /* down */
ZONE_EVENT_SHUTTING_DOWN, /* dying */
ZONE_EVENT_UNINITIALIZED, /* dead */
+ ZONE_EVENT_FREE, /* free */
};
/*
@@ -372,8 +378,12 @@ static char *zone_ref_subsys_names[] = {
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_locked_mem;
rctl_hndl_t rc_zone_max_swap;
+rctl_hndl_t rc_zone_phys_mem;
rctl_hndl_t rc_zone_max_lofi;
rctl_hndl_t rc_zone_cpu_cap;
+rctl_hndl_t rc_zone_cpu_baseline;
+rctl_hndl_t rc_zone_cpu_burst_time;
+rctl_hndl_t rc_zone_zfs_io_pri;
rctl_hndl_t rc_zone_nlwps;
rctl_hndl_t rc_zone_nprocs;
rctl_hndl_t rc_zone_shmmax;
@@ -389,6 +399,7 @@ static int zone_remove_datalink(zoneid_t, datalink_id_t);
static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
static int zone_set_network(zoneid_t, zone_net_data_t *);
static int zone_get_network(zoneid_t, zone_net_data_t *);
+static void zone_status_set(zone_t *, zone_status_t);
typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
@@ -419,8 +430,72 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
* Version 5 alters the zone_boot system call, and converts its old
* bootargs parameter to be set by the zone_setattr API instead.
* Version 6 adds the flag argument to zone_create.
+ * Version 7 adds the requested zoneid to zone_create.
*/
-static const int ZONE_SYSCALL_API_VERSION = 6;
+static const int ZONE_SYSCALL_API_VERSION = 7;
+
+/*
+ * "zone_pdata" is an array indexed by zoneid. It is used to store "persistent"
+ * data which can be referenced independently of the zone_t structure. This
+ * data falls into two categories:
+ * 1) pages and RSS data associated with processes inside a zone
+ * 2) in-flight ZFS I/O data
+ *
+ * Each member of zone_persist_t stores the zone's current page usage, its page
+ * limit, a flag indicating if the zone is over its physical memory cap and
+ * various page-related statistics. The zpers_over flag is the interface for
+ * the page scanner to use when reclaiming pages for zones that are over their
+ * cap. The zone_persist_t structure also includes a mutex and a reference to a
+ * zone_zfs_io_t structure used for tracking the zone's ZFS I/O data.
+ *
+ * All zone physical memory cap data is stored in this array instead of within
+ * the zone structure itself. This is because zone structures come and go, but
+ * paging-related work can be asynchronous to any particular zone. In,
+ * particular:
+ * 1) Page scanning to reclaim pages occurs from a kernel thread that is not
+ * associated with any zone.
+ * 2) Freeing segkp pages can occur long after the zone which first
+ * instantiated those pages has gone away.
+ * We want to be able to account for pages/zone without constantly having to
+ * take extra locks and find the relevant zone structure, particularly during
+ * page scanning.
+ *
+ * The page scanner can run when "zone_num_over_cap" is non-zero. It can
+ * do a direct lookup of a zoneid into the "zone_pdata" array to determine
+ * if that zone is over its cap.
+ *
+ * There is no locking for the page scanner to perform these two checks.
+ * We cannot have the page scanner blocking normal paging activity for
+ * running processes. Because the physical memory cap is a soft cap, it is
+ * fine for the scanner to simply read the current state of the counter and
+ * the zone's zpers_over entry in the array. The scanner should never modify
+ * either of these items. Internally the entries and the counter are managed
+ * with the "zone_physcap_lock" mutex as we add/remove mappings to pages. We
+ * take care to ensure that we only take the zone_physcap_lock mutex when a
+ * zone is transitioning over/under its physical memory cap.
+ *
+ * The "zone_incr_capped" and "zone_decr_capped" functions are used to manage
+ * the "zone_pdata" array and associated counter.
+ *
+ * The zone_persist_t structure tracks the zone's physical cap and physical
+ * usage in terms of pages. These values are currently defined as uint32. Thus,
+ * the maximum number of pages we can track is UINT_MAX - 1 (4,294,967,294),
+ * since UINT_MAX means the zone's RSS is unlimited. Assuming a 4k page size, a
+ * zone's maximum RSS is limited to 17.5 TB and twice that with an 8k page size.
+ * In the future we may need to expand these counters to 64-bit, but for now
+ * we're using 32-bit to conserve memory, since this array is statically
+ * allocated within the kernel based on the maximum number of zones supported.
+ *
+ * With respect to the zone_zfs_io_t referenced by the zone_persist_t, under
+ * a heavy I/O workload, the "zonehash_lock" would become extremely hot if we
+ * had to continuously find the zone structure associated with an I/O that has
+ * just completed. To avoid that overhead, we track the I/O data within the
+ * zone_zfs_io_t instead. We can directly access that data without having to
+ * look up the full zone_t structure.
+ */
+uint_t zone_num_over_cap;
+zone_persist_t zone_pdata[MAX_ZONES];
+static kmutex_t zone_physcap_lock;
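/*
 * [Editor's sketch] The unlocked fast path described above, as a
 * hypothetical helper (zone_should_reclaim is not part of this change):
 */
static boolean_t
zone_should_reclaim(zoneid_t zid)
{
	/* No zone is over its cap; nothing for the scanner to do. */
	if (zone_num_over_cap == 0)
		return (B_FALSE);

	/*
	 * Soft cap, so a stale read is harmless and no lock is taken;
	 * updates to zpers_over happen under zone_physcap_lock.
	 */
	return (zone_pdata[zid].zpers_over != 0);
}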
/*
* Certain filesystems (such as NFS and autofs) need to know which zone
@@ -1379,6 +1454,127 @@ static rctl_ops_t zone_cpu_cap_ops = {
/*ARGSUSED*/
static rctl_qty_t
+zone_cpu_base_get(rctl_t *rctl, struct proc *p)
+{
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ return (cpucaps_zone_get_base(p->p_zone));
+}
+
+/*
+ * The zone cpu base is used to set the baseline CPU for the zone
+ * so we can track when the zone is bursting.
+ */
+/*ARGSUSED*/
+static int
+zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ zone_t *zone = e->rcep_p.zone;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+
+ if (zone == NULL)
+ return (0);
+
+ return (cpucaps_zone_set_base(zone, nv));
+}
+
+static rctl_ops_t zone_cpu_base_ops = {
+ rcop_no_action,
+ zone_cpu_base_get,
+ zone_cpu_base_set,
+ rcop_no_test
+};
+
+/*ARGSUSED*/
+static rctl_qty_t
+zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p)
+{
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ return (cpucaps_zone_get_burst_time(p->p_zone));
+}
+
+/*
+ * The zone cpu burst time is used to set the amount of time CPU(s) can be
+ * bursting for the zone.
+ */
+/*ARGSUSED*/
+static int
+zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ zone_t *zone = e->rcep_p.zone;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+
+ if (zone == NULL)
+ return (0);
+
+ return (cpucaps_zone_set_burst_time(zone, nv));
+}
+
+static rctl_ops_t zone_cpu_burst_time_ops = {
+ rcop_no_action,
+ zone_cpu_burst_time_get,
+ zone_cpu_burst_time_set,
+ rcop_no_test
+};
+
+/*
+ * zone.zfs-io-pri resource control support (IO priority).
+ */
+/*ARGSUSED*/
+static rctl_qty_t
+zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p)
+{
+ zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id];
+ rctl_qty_t r = 0;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ mutex_enter(&zp->zpers_zfs_lock);
+ if (zp->zpers_zfsp != NULL)
+ r = (rctl_qty_t)zp->zpers_zfsp->zpers_zfs_io_pri;
+ mutex_exit(&zp->zpers_zfs_lock);
+
+ return (r);
+}
+
+/*ARGSUSED*/
+static int
+zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ zone_t *zone = e->rcep_p.zone;
+ zone_persist_t *zp;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+
+ if (zone == NULL)
+ return (0);
+
+ /*
+	 * Set the priority to the new value.
+ */
+ zp = &zone_pdata[zone->zone_id];
+ mutex_enter(&zp->zpers_zfs_lock);
+ if (zp->zpers_zfsp != NULL)
+ zp->zpers_zfsp->zpers_zfs_io_pri = (uint16_t)nv;
+ mutex_exit(&zp->zpers_zfs_lock);
+ return (0);
+}
+
+static rctl_ops_t zone_zfs_io_pri_ops = {
+ rcop_no_action,
+ zone_zfs_io_pri_get,
+ zone_zfs_io_pri_set,
+ rcop_no_test
+};
+
+/*ARGSUSED*/
+static rctl_qty_t
zone_lwps_usage(rctl_t *r, proc_t *p)
{
rctl_qty_t nlwps;
@@ -1705,6 +1901,57 @@ static rctl_ops_t zone_max_swap_ops = {
/*ARGSUSED*/
static rctl_qty_t
+zone_phys_mem_usage(rctl_t *rctl, struct proc *p)
+{
+ rctl_qty_t q;
+ zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id];
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ q = ptob(zp->zpers_pg_cnt);
+ return (q);
+}
+
+/*ARGSUSED*/
+static int
+zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ zoneid_t zid;
+ uint_t pg_val;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+ if (e->rcep_p.zone == NULL)
+ return (0);
+ zid = e->rcep_p.zone->zone_id;
+ if (nv == UINT64_MAX) {
+ pg_val = UINT32_MAX;
+ } else {
+ uint64_t pages = btop(nv);
+
+ /*
+ * Return from RCTLOP_SET is always ignored so just clamp an
+ * out-of-range value to our largest "limited" value.
+ */
+ if (pages >= UINT32_MAX) {
+ pg_val = UINT32_MAX - 1;
+ } else {
+ pg_val = (uint_t)pages;
+ }
+ }
+ zone_pdata[zid].zpers_pg_limit = pg_val;
+ return (0);
+}
+
+static rctl_ops_t zone_phys_mem_ops = {
+ rcop_no_action,
+ zone_phys_mem_usage,
+ zone_phys_mem_set,
+ rcop_no_test
+};
+
+/*ARGSUSED*/
+static rctl_qty_t
zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
{
rctl_qty_t q;
@@ -1798,6 +2045,21 @@ zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
}
static int
+zone_physmem_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_kstat_t *zk = ksp->ks_data;
+ zone_persist_t *zp = &zone_pdata[zone->zone_id];
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ zk->zk_usage.value.ui64 = ptob(zp->zpers_pg_cnt);
+ zk->zk_value.value.ui64 = ptob(zp->zpers_pg_limit);
+ return (0);
+}
+
+static int
zone_nprocs_kstat_update(kstat_t *ksp, int rw)
{
zone_t *zone = ksp->ks_private;
@@ -1826,7 +2088,7 @@ zone_swapresv_kstat_update(kstat_t *ksp, int rw)
}
static kstat_t *
-zone_kstat_create_common(zone_t *zone, char *name,
+zone_rctl_kstat_create_common(zone_t *zone, char *name,
int (*updatefunc) (kstat_t *, int))
{
kstat_t *ksp;
@@ -1851,16 +2113,200 @@ zone_kstat_create_common(zone_t *zone, char *name,
return (ksp);
}
+static int
+zone_vfs_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_vfs_kstat_t *zvp = ksp->ks_data;
+ kstat_io_t *kiop = &zone->zone_vfs_rwstats;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ /*
+ * Extract the VFS statistics from the kstat_io_t structure used by
+ * kstat_runq_enter() and related functions. Since the slow ops
+ * counters are updated directly by the VFS layer, there's no need to
+ * copy those statistics here.
+ *
+ * Note that kstat_runq_enter() and the related functions use
+ * gethrtime_unscaled(), so scale the time here.
+ */
+ zvp->zv_nread.value.ui64 = kiop->nread;
+ zvp->zv_reads.value.ui64 = kiop->reads;
+ zvp->zv_rtime.value.ui64 = kiop->rtime;
+ zvp->zv_rcnt.value.ui64 = kiop->rcnt;
+ zvp->zv_rlentime.value.ui64 = kiop->rlentime;
+ zvp->zv_nwritten.value.ui64 = kiop->nwritten;
+ zvp->zv_writes.value.ui64 = kiop->writes;
+ zvp->zv_wtime.value.ui64 = kiop->wtime;
+ zvp->zv_wcnt.value.ui64 = kiop->wcnt;
+ zvp->zv_wlentime.value.ui64 = kiop->wlentime;
+
+ scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64);
+ scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64);
+ scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64);
+ scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64);
+
+ return (0);
+}
+
+static kstat_t *
+zone_vfs_kstat_create(zone_t *zone)
+{
+ kstat_t *ksp;
+ zone_vfs_kstat_t *zvp;
+
+ if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id,
+ zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED,
+ sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+ return (NULL);
+
+ if (zone->zone_id != GLOBAL_ZONEID)
+ kstat_zone_add(ksp, GLOBAL_ZONEID);
+
+ zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP);
+ ksp->ks_data_size += strlen(zone->zone_name) + 1;
+ ksp->ks_lock = &zone->zone_vfs_lock;
+ zone->zone_vfs_stats = zvp;
+
+ /* The kstat "name" field is not large enough for a full zonename */
+ kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING);
+ kstat_named_setstr(&zvp->zv_zonename, zone->zone_name);
+ kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64);
+
+ ksp->ks_update = zone_vfs_kstat_update;
+ ksp->ks_private = zone;
+
+ kstat_install(ksp);
+ return (ksp);
+}
+
+static int
+zone_zfs_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_zfs_kstat_t *zzp = ksp->ks_data;
+ zone_persist_t *zp = &zone_pdata[zone->zone_id];
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ mutex_enter(&zp->zpers_zfs_lock);
+ if (zp->zpers_zfsp == NULL) {
+ zzp->zz_nread.value.ui64 = 0;
+ zzp->zz_reads.value.ui64 = 0;
+ zzp->zz_rtime.value.ui64 = 0;
+ zzp->zz_rlentime.value.ui64 = 0;
+ zzp->zz_nwritten.value.ui64 = 0;
+ zzp->zz_writes.value.ui64 = 0;
+ zzp->zz_waittime.value.ui64 = 0;
+ } else {
+ kstat_io_t *kiop = &zp->zpers_zfsp->zpers_zfs_rwstats;
+
+ /*
+ * Extract the ZFS statistics from the kstat_io_t structure
+ * used by kstat_runq_enter() and related functions. Since the
+ * I/O throttle counters are updated directly by the ZFS layer,
+ * there's no need to copy those statistics here.
+ *
+ * Note that kstat_runq_enter() and the related functions use
+ * gethrtime_unscaled(), so scale the time here.
+ */
+ zzp->zz_nread.value.ui64 = kiop->nread;
+ zzp->zz_reads.value.ui64 = kiop->reads;
+ zzp->zz_rtime.value.ui64 = kiop->rtime;
+ zzp->zz_rlentime.value.ui64 = kiop->rlentime;
+ zzp->zz_nwritten.value.ui64 = kiop->nwritten;
+ zzp->zz_writes.value.ui64 = kiop->writes;
+ zzp->zz_waittime.value.ui64 =
+ zp->zpers_zfsp->zpers_zfs_rd_waittime;
+ }
+ mutex_exit(&zp->zpers_zfs_lock);
+
+ scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64);
+ scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64);
+
+ return (0);
+}
+
+static kstat_t *
+zone_zfs_kstat_create(zone_t *zone)
+{
+ kstat_t *ksp;
+ zone_zfs_kstat_t *zzp;
+
+ if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id,
+ zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED,
+ sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+ return (NULL);
+
+ if (zone->zone_id != GLOBAL_ZONEID)
+ kstat_zone_add(ksp, GLOBAL_ZONEID);
+
+ zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP);
+ ksp->ks_data_size += strlen(zone->zone_name) + 1;
+ ksp->ks_lock = &zone->zone_zfs_lock;
+ zone->zone_zfs_stats = zzp;
+
+ /* The kstat "name" field is not large enough for a full zonename */
+ kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING);
+ kstat_named_setstr(&zzp->zz_zonename, zone->zone_name);
+ kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64);
+
+ ksp->ks_update = zone_zfs_kstat_update;
+ ksp->ks_private = zone;
+
+ kstat_install(ksp);
+ return (ksp);
+}
static int
zone_mcap_kstat_update(kstat_t *ksp, int rw)
{
zone_t *zone = ksp->ks_private;
zone_mcap_kstat_t *zmp = ksp->ks_data;
+ zone_persist_t *zp;
if (rw == KSTAT_WRITE)
return (EACCES);
+ zp = &zone_pdata[zone->zone_id];
+
+ zmp->zm_rss.value.ui64 = ptob(zp->zpers_pg_cnt);
+ zmp->zm_phys_cap.value.ui64 = ptob(zp->zpers_pg_limit);
+ zmp->zm_swap.value.ui64 = zone->zone_max_swap;
+ zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl;
+ zmp->zm_nover.value.ui64 = zp->zpers_nover;
+#ifndef DEBUG
+ zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_out);
+#else
+ zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_fsdirty +
+ zp->zpers_pg_fs + zp->zpers_pg_anon + zp->zpers_pg_anondirty);
+#endif
zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
@@ -1893,6 +2339,12 @@ zone_mcap_kstat_create(zone_t *zone)
/* The kstat "name" field is not large enough for a full zonename */
kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
+ kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);
kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
@@ -1942,9 +2394,12 @@ zone_misc_kstat_update(kstat_t *ksp, int rw)
zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
+ zmp->zm_mfseglim.value.ui32 = zone->zone_mfseglim;
+
zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
+ zmp->zm_init_restarts.value.ui32 = zone->zone_proc_init_restarts;
zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;
return (0);
@@ -1985,9 +2440,13 @@ zone_misc_kstat_create(zone_t *zone)
KSTAT_DATA_UINT32);
kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
+ kstat_named_init(&zmp->zm_mfseglim, "mapfail_seglim",
+ KSTAT_DATA_UINT32);
kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
KSTAT_DATA_UINT32);
kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
+ kstat_named_init(&zmp->zm_init_restarts, "init_restarts",
+ KSTAT_DATA_UINT32);
kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
ksp->ks_update = zone_misc_kstat_update;
@@ -2000,13 +2459,25 @@ zone_misc_kstat_create(zone_t *zone)
static void
zone_kstat_create(zone_t *zone)
{
- zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
+ zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone,
"lockedmem", zone_lockedmem_kstat_update);
- zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
+ zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone,
"swapresv", zone_swapresv_kstat_update);
- zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
+ zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone,
+ "physicalmem", zone_physmem_kstat_update);
+ zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone,
"nprocs", zone_nprocs_kstat_update);
+ if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) {
+ zone->zone_vfs_stats = kmem_zalloc(
+ sizeof (zone_vfs_kstat_t), KM_SLEEP);
+ }
+
+ if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) {
+ zone->zone_zfs_stats = kmem_zalloc(
+ sizeof (zone_zfs_kstat_t), KM_SLEEP);
+ }
+
if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
zone->zone_mcap_stats = kmem_zalloc(
sizeof (zone_mcap_kstat_t), KM_SLEEP);
@@ -2038,8 +2509,15 @@ zone_kstat_delete(zone_t *zone)
sizeof (zone_kstat_t));
zone_kstat_delete_common(&zone->zone_swapresv_kstat,
sizeof (zone_kstat_t));
+ zone_kstat_delete_common(&zone->zone_physmem_kstat,
+ sizeof (zone_kstat_t));
zone_kstat_delete_common(&zone->zone_nprocs_kstat,
sizeof (zone_kstat_t));
+
+ zone_kstat_delete_common(&zone->zone_vfs_ksp,
+ sizeof (zone_vfs_kstat_t));
+ zone_kstat_delete_common(&zone->zone_zfs_ksp,
+ sizeof (zone_zfs_kstat_t));
zone_kstat_delete_common(&zone->zone_mcap_ksp,
sizeof (zone_mcap_kstat_t));
zone_kstat_delete_common(&zone->zone_misc_ksp,
@@ -2101,8 +2579,12 @@ zone_zsd_init(void)
zone0.zone_initname = initname;
zone0.zone_lockedmem_kstat = NULL;
zone0.zone_swapresv_kstat = NULL;
+ zone0.zone_physmem_kstat = NULL;
zone0.zone_nprocs_kstat = NULL;
+ zone_pdata[0].zpers_zfsp = &zone0_zp_zfs;
+ zone_pdata[0].zpers_zfsp->zpers_zfs_io_pri = 1;
+
list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
offsetof(zone_ref_t, zref_linkage));
list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
@@ -2209,6 +2691,21 @@ zone_init(void)
RCTL_GLOBAL_INFINITE,
MAXCAP, MAXCAP, &zone_cpu_cap_ops);
+ rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline",
+ RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
+ RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
+ MAXCAP, MAXCAP, &zone_cpu_base_ops);
+
+ rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time",
+ RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
+ RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
+ INT_MAX, INT_MAX, &zone_cpu_burst_time_ops);
+
+ rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
+ RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
+ RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
+ 16384, 16384, &zone_zfs_io_pri_ops);
+
rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
INT_MAX, INT_MAX, &zone_lwps_ops);
@@ -2250,6 +2747,20 @@ zone_init(void)
rde = rctl_dict_lookup("zone.cpu-shares");
(void) rctl_val_list_insert(&rde->rcd_default_value, dval);
+ /*
+ * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach
+	 * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority''.
+ */
+ dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
+ bzero(dval, sizeof (rctl_val_t));
+ dval->rcv_value = 1;
+ dval->rcv_privilege = RCPRIV_PRIVILEGED;
+ dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
+ dval->rcv_action_recip_pid = -1;
+
+ rde = rctl_dict_lookup("zone.zfs-io-priority");
+ (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
+
rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
@@ -2260,6 +2771,11 @@ zone_init(void)
RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
&zone_max_swap_ops);
+ rc_zone_phys_mem = rctl_register("zone.max-physical-memory",
+ RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
+ RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
+ &zone_phys_mem_ops);
+
rc_zone_max_lofi = rctl_register("zone.max-lofi",
RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
@@ -2281,6 +2797,9 @@ zone_init(void)
zone0.zone_ntasks = 1;
mutex_exit(&p0.p_lock);
zone0.zone_restart_init = B_TRUE;
+ zone0.zone_reboot_on_init_exit = B_FALSE;
+ zone0.zone_restart_init_0 = B_FALSE;
+ zone0.zone_init_status = -1;
zone0.zone_brand = &native_brand;
rctl_prealloc_destroy(gp);
/*
@@ -2362,6 +2881,8 @@ zone_init(void)
static void
zone_free(zone_t *zone)
{
+ zone_dl_t *zdl;
+
ASSERT(zone != global_zone);
ASSERT(zone->zone_ntasks == 0);
ASSERT(zone->zone_nlwps == 0);
@@ -2377,6 +2898,9 @@ zone_free(zone_t *zone)
*/
cpucaps_zone_remove(zone);
+ /* Clear physical memory capping data. */
+ bzero(&zone_pdata[zone->zone_id], sizeof (zone_persist_t));
+
ASSERT(zone->zone_cpucap == NULL);
/* remove from deathrow list */
@@ -2390,8 +2914,30 @@ zone_free(zone_t *zone)
list_destroy(&zone->zone_ref_list);
zone_free_zsd(zone);
zone_free_datasets(zone);
+
+ /*
+ * While dlmgmtd should have removed all of these, it could have left
+	 * something behind or crashed, in which case it's not safe for us to
+	 * assume the list is empty, an assumption list_destroy() will ASSERT.
+	 * We clean up for our userland comrades, which may have crashed or,
+	 * worse, been disabled by SMF.
+ */
+ while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) {
+ if (zdl->zdl_net != NULL)
+ nvlist_free(zdl->zdl_net);
+ kmem_free(zdl, sizeof (zone_dl_t));
+ }
list_destroy(&zone->zone_dl_list);
+ /*
+ * This zone_t can no longer inhibit creation of another zone_t
+ * with the same name or debug ID. Generate a sysevent so that
+ * userspace tools know it is safe to carry on.
+ */
+ mutex_enter(&zone_status_lock);
+ zone_status_set(zone, ZONE_IS_FREE);
+ mutex_exit(&zone_status_lock);
+
cpu_uarray_free(zone->zone_ustate);
if (zone->zone_rootvp != NULL)
@@ -2436,11 +2982,17 @@ zone_free(zone_t *zone)
static void
zone_status_set(zone_t *zone, zone_status_t status)
{
+ timestruc_t now;
+ uint64_t t;
nvlist_t *nvl = NULL;
ASSERT(MUTEX_HELD(&zone_status_lock));
- ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
- status >= zone_status_get(zone));
+	ASSERT(((status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE) ||
+	    status == ZONE_IS_FREE) && status >= zone_status_get(zone));
+
+	/* Time since Jan 1 1970, converted to nanoseconds as consumers expect */
+ gethrestime(&now);
+ t = (now.tv_sec * NANOSEC) + now.tv_nsec;
if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
@@ -2449,12 +3001,14 @@ zone_status_set(zone_t *zone, zone_status_t status)
nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
zone_status_table[zone->zone_status]) ||
nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
- nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
+ nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) ||
sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
#ifdef DEBUG
(void) printf(
"Failed to allocate and send zone state change event.\n");
+#else
+ /* EMPTY */
#endif
}
nvlist_free(nvl);
@@ -2474,6 +3028,38 @@ zone_status_get(zone_t *zone)
return (zone->zone_status);
}
+/*
+ * Publish a zones-related sysevent for purposes other than zone state changes.
+ * While it is unfortunate that zone_event_chan is associated with
+ * "com.sun:zones:status" (rather than "com.sun:zones") state changes should be
+ * the only ones with class "status" and subclass "change".
+ */
+void
+zone_sysevent_publish(zone_t *zone, const char *class, const char *subclass,
+ nvlist_t *ev_nvl)
+{
+ nvlist_t *nvl = NULL;
+ timestruc_t now;
+ uint64_t t;
+
+ gethrestime(&now);
+ t = (now.tv_sec * NANOSEC) + now.tv_nsec;
+
+ if (nvlist_dup(ev_nvl, &nvl, KM_SLEEP) != 0 ||
+ nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) != 0 ||
+ nvlist_add_uint64(nvl, ZONE_CB_ZONEID, zone->zone_id) != 0 ||
+ nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) != 0 ||
+ sysevent_evc_publish(zone_event_chan, class, subclass, "sun.com",
+ "kernel", nvl, EVCH_SLEEP) != 0) {
+#ifdef DEBUG
+ (void) printf("Failed to allocate and send zone misc event.\n");
+#else
+ /* EMPTY */
+#endif
+ }
+ nvlist_free(nvl);
+}
+
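/*
 * [Editor's sketch] A plausible caller; the zone pointer, class, subclass
 * and attribute names are hypothetical. Note zone_sysevent_publish()
 * duplicates ev_nvl, so the caller still owns and must free its nvlist:
 */
	nvlist_t *nvl;

	VERIFY0(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP));
	VERIFY0(nvlist_add_uint32(nvl, "example-attr", 1));
	zone_sysevent_publish(zone, "example-class", "example-subclass", nvl);
	nvlist_free(nvl);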
static int
zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
{
@@ -2527,9 +3113,14 @@ zone_set_brand(zone_t *zone, const char *brand)
return (EINVAL);
}
- /* set up the brand specific data */
+ /*
+ * Set up the brand specific data.
+ * Note that it's possible that the hook has to drop the
+	 * zone_status_lock and reacquire it before returning so we can't
+ * assume the lock has been held the entire time.
+ */
zone->zone_brand = bp;
- ZBROP(zone)->b_init_brand_data(zone);
+ ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock);
mutex_exit(&zone_status_lock);
return (0);
@@ -2602,18 +3193,6 @@ zone_set_initname(zone_t *zone, const char *zone_initname)
}
static int
-zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
-{
- uint64_t mcap;
- int err = 0;
-
- if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
- zone->zone_phys_mcap = mcap;
-
- return (err);
-}
-
-static int
zone_set_sched_class(zone_t *zone, const char *new_class)
{
char sched_class[PC_CLNMSZ];
@@ -3020,6 +3599,12 @@ getzoneid(void)
return (curproc->p_zone->zone_id);
}
+zoneid_t
+getzonedid(void)
+{
+ return (curproc->p_zone->zone_did);
+}
+
/*
* Internal versions of zone_find_by_*(). These don't zone_hold() or
* check the validity of a zone's state.
@@ -3766,6 +4351,17 @@ zone_start_init(void)
*/
z->zone_proc_initpid = p->p_pid;
+ if (z->zone_setup_app_contract == B_TRUE) {
+ /*
+ * Normally a process cannot modify its own contract, but we're
+ * just starting the zone's init process and its contract is
+ * always initialized from the sys_process_tmpl template, so
+ * this is the simplest way to setup init's contract to kill
+ * the process if any other process in the contract exits.
+		 * this is the simplest way to set up init's contract to kill
+ p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT;
+ }
+
/*
* We maintain zone_boot_err so that we can return the cause of the
* failure back to the caller of the zone_boot syscall.
@@ -3794,9 +4390,54 @@ zone_start_init(void)
lwp_exit();
}
} else {
+ id_t cid = curthread->t_cid;
+
if (zone_status_get(z) == ZONE_IS_BOOTING)
zone_status_set(z, ZONE_IS_RUNNING);
mutex_exit(&zone_status_lock);
+
+ mutex_enter(&class_lock);
+ ASSERT(cid < loaded_classes);
+ if (strcmp(sclass[cid].cl_name, "FX") == 0 &&
+ z->zone_fixed_hipri) {
+ /*
+ * If the zone is using FX then by default all
+ * processes start at the lowest priority and stay
+ * there. We provide a mechanism for the zone to
+ * indicate that it should run at "high priority". In
+ * this case we setup init to run at the highest FX
+			 * this case we set up init to run at the highest FX
+ * non-fixed scheduling classes can use).
+ */
+ pcparms_t pcparms;
+
+ pcparms.pc_cid = cid;
+ ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI;
+ ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim =
+ FXMAXUPRI;
+ ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags =
+ FX_DOUPRILIM | FX_DOUPRI;
+
+ mutex_enter(&pidlock);
+ mutex_enter(&curproc->p_lock);
+
+ (void) parmsset(&pcparms, curthread);
+
+ mutex_exit(&curproc->p_lock);
+ mutex_exit(&pidlock);
+ } else if (strcmp(sclass[cid].cl_name, "RT") == 0) {
+ /*
+ * zsched always starts the init lwp at priority
+ * minclsyspri - 1. This priority gets set in t_pri and
+ * is invalid for RT, but RT never uses t_pri. However
+ * t_pri is used by procfs, so we always see processes
+ * within an RT zone with an invalid priority value.
+ * We fix that up now.
+ */
+ curthread->t_pri = RTGPPRIO0;
+ }
+ mutex_exit(&class_lock);
+
/* cause the process to return to userland. */
lwp_rtt();
}
@@ -4282,8 +4923,9 @@ parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
error = EINVAL;
name = nvpair_name(nvp);
- if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
- != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
+ if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 &&
+ strncmp(name, "project.", sizeof ("project.") - 1) != 0) ||
+ nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
goto out;
}
if ((hndl = rctl_hndl_lookup(name)) == -1) {
@@ -4402,7 +5044,7 @@ zone_create(const char *zone_name, const char *zone_root,
caddr_t rctlbuf, size_t rctlbufsz,
caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
int match, uint32_t doi, const bslabel_t *label,
- int flags)
+ int flags, zoneid_t zone_did)
{
struct zsched_arg zarg;
nvlist_t *rctls = NULL;
@@ -4474,6 +5116,7 @@ zone_create(const char *zone_name, const char *zone_root,
zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
zone->zone_id = zoneid;
+ zone->zone_did = zone_did;
zone->zone_status = ZONE_IS_UNINITIALIZED;
zone->zone_pool = pool_default;
zone->zone_pool_mod = gethrtime();
@@ -4481,6 +5124,9 @@ zone_create(const char *zone_name, const char *zone_root,
zone->zone_ncpus = 0;
zone->zone_ncpus_online = 0;
zone->zone_restart_init = B_TRUE;
+ zone->zone_reboot_on_init_exit = B_FALSE;
+ zone->zone_restart_init_0 = B_FALSE;
+ zone->zone_init_status = -1;
zone->zone_brand = &native_brand;
zone->zone_initname = NULL;
mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -4547,8 +5193,13 @@ zone_create(const char *zone_name, const char *zone_root,
zone->zone_max_swap_ctl = UINT64_MAX;
zone->zone_max_lofi = 0;
zone->zone_max_lofi_ctl = UINT64_MAX;
- zone0.zone_lockedmem_kstat = NULL;
- zone0.zone_swapresv_kstat = NULL;
+ zone->zone_lockedmem_kstat = NULL;
+ zone->zone_swapresv_kstat = NULL;
+ zone->zone_physmem_kstat = NULL;
+
+ zone_pdata[zoneid].zpers_zfsp =
+ kmem_zalloc(sizeof (zone_zfs_io_t), KM_SLEEP);
+ zone_pdata[zoneid].zpers_zfsp->zpers_zfs_io_pri = 1;
zone->zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP);
@@ -4557,6 +5208,13 @@ zone_create(const char *zone_name, const char *zone_root,
*/
zone->zone_rctls = NULL;
+ /*
+ * Ensure page count is 0 (in case zoneid has wrapped).
+ * Initialize physical memory cap as unlimited.
+ */
+ zone_pdata[zoneid].zpers_pg_cnt = 0;
+ zone_pdata[zoneid].zpers_pg_limit = UINT32_MAX;
+
if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
zone_free(zone);
return (zone_create_error(error, 0, extended_error));
@@ -4705,8 +5363,8 @@ zone_create(const char *zone_name, const char *zone_root,
/*
* The process, task, and project rctls are probably wrong;
* we need an interface to get the default values of all rctls,
- * and initialize zsched appropriately. I'm not sure that that
- * makes much of a difference, though.
+ * and initialize zsched appropriately. However, we allow zoneadmd
+ * to pass down both zone and project rctls for the zone's init.
*/
error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
if (error != 0) {
@@ -4845,6 +5503,7 @@ zone_boot(zoneid_t zoneid)
static int
zone_empty(zone_t *zone)
{
+ int cnt = 0;
int waitstatus;
/*
@@ -4855,7 +5514,16 @@ zone_empty(zone_t *zone)
ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
while ((waitstatus = zone_status_timedwait_sig(zone,
ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
- killall(zone->zone_id);
+ boolean_t force = B_FALSE;
+
+ /* Every 30 seconds, try harder */
+ if (cnt++ >= 30) {
+ cmn_err(CE_WARN, "attempt to force kill zone %d\n",
+ zone->zone_id);
+ force = B_TRUE;
+ cnt = 0;
+ }
+ killall(zone->zone_id, force);
}
/*
* return EINTR if we were signaled
@@ -5184,6 +5852,7 @@ zone_destroy(zoneid_t zoneid)
zone_status_t status;
clock_t wait_time;
boolean_t log_refcounts;
+ zone_persist_t *zp;
if (secpolicy_zone_config(CRED()) != 0)
return (set_errno(EPERM));
@@ -5217,6 +5886,12 @@ zone_destroy(zoneid_t zoneid)
zone_hold(zone);
mutex_exit(&zonehash_lock);
+ zp = &zone_pdata[zoneid];
+ mutex_enter(&zp->zpers_zfs_lock);
+ kmem_free(zp->zpers_zfsp, sizeof (zone_zfs_io_t));
+ zp->zpers_zfsp = NULL;
+ mutex_exit(&zp->zpers_zfs_lock);
+
/*
* wait for zsched to exit
*/
@@ -5606,14 +6281,6 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
error = EFAULT;
}
break;
- case ZONE_ATTR_PHYS_MCAP:
- size = sizeof (zone->zone_phys_mcap);
- if (bufsize > size)
- bufsize = size;
- if (buf != NULL &&
- copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
- error = EFAULT;
- break;
case ZONE_ATTR_SCHED_CLASS:
mutex_enter(&class_lock);
@@ -5677,6 +6344,23 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
}
kmem_free(zbuf, bufsize);
break;
+ case ZONE_ATTR_DID:
+ size = sizeof (zoneid_t);
+ if (bufsize > size)
+ bufsize = size;
+
+ if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0)
+ error = EFAULT;
+ break;
+ case ZONE_ATTR_SCHED_FIXEDHI:
+ size = sizeof (boolean_t);
+ if (bufsize > size)
+ bufsize = size;
+
+ if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf,
+ bufsize) != 0)
+ error = EFAULT;
+ break;
default:
if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
size = bufsize;
@@ -5708,10 +6392,9 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
return (set_errno(EPERM));
/*
- * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
- * global zone.
+ * No attributes can be set on the global zone.
*/
- if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
+ if (zoneid == GLOBAL_ZONEID) {
return (set_errno(EINVAL));
}
@@ -5724,11 +6407,11 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
mutex_exit(&zonehash_lock);
/*
- * At present most attributes can only be set on non-running,
+ * At present attributes can only be set on non-running,
* non-global zones.
*/
zone_status = zone_status_get(zone);
- if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
+ if (zone_status > ZONE_IS_READY) {
err = EINVAL;
goto done;
}
@@ -5741,6 +6424,14 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
zone->zone_restart_init = B_FALSE;
err = 0;
break;
+ case ZONE_ATTR_INITRESTART0:
+ zone->zone_restart_init_0 = B_TRUE;
+ err = 0;
+ break;
+ case ZONE_ATTR_INITREBOOT:
+ zone->zone_reboot_on_init_exit = B_TRUE;
+ err = 0;
+ break;
case ZONE_ATTR_BOOTARGS:
err = zone_set_bootargs(zone, (const char *)buf);
break;
@@ -5753,9 +6444,6 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
case ZONE_ATTR_SECFLAGS:
err = zone_set_secflags(zone, (psecflags_t *)buf);
break;
- case ZONE_ATTR_PHYS_MCAP:
- err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
- break;
case ZONE_ATTR_SCHED_CLASS:
err = zone_set_sched_class(zone, (const char *)buf);
break;
@@ -5783,6 +6471,22 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
err = zone_set_network(zoneid, zbuf);
kmem_free(zbuf, bufsize);
break;
+ case ZONE_ATTR_APP_SVC_CT:
+ if (bufsize != sizeof (boolean_t)) {
+ err = EINVAL;
+ } else {
+ zone->zone_setup_app_contract = (boolean_t)buf;
+ err = 0;
+ }
+ break;
+ case ZONE_ATTR_SCHED_FIXEDHI:
+ if (bufsize != sizeof (boolean_t)) {
+ err = EINVAL;
+ } else {
+ zone->zone_fixed_hipri = (boolean_t)buf;
+ err = 0;
+ }
+ break;
default:
if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
@@ -6486,6 +7190,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
zs.doi = zs32.doi;
zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
zs.flags = zs32.flags;
+ zs.zoneid = zs32.zoneid;
#else
panic("get_udatamodel() returned bogus result\n");
#endif
@@ -6496,7 +7201,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
(caddr_t)zs.rctlbuf, zs.rctlbufsz,
(caddr_t)zs.zfsbuf, zs.zfsbufsz,
zs.extended_error, zs.match, zs.doi,
- zs.label, zs.flags));
+ zs.label, zs.flags, zs.zoneid));
case ZONE_BOOT:
return (zone_boot((zoneid_t)(uintptr_t)arg1));
case ZONE_DESTROY:
@@ -6597,6 +7302,7 @@ zone_ki_call_zoneadmd(struct zarg *zargp)
bcopy(zone->zone_name, zone_name, zone_namelen);
zoneid = zone->zone_id;
uniqid = zone->zone_uniqid;
+ arg.status = zone->zone_init_status;
/*
* zoneadmd may be down, but at least we can empty out the zone.
* We can ignore the return value of zone_empty() since we're called
@@ -6774,7 +7480,7 @@ zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
* zone_ki_call_zoneadmd() will do a more thorough job of this
* later.
*/
- killall(zone->zone_id);
+ killall(zone->zone_id, B_FALSE);
/*
* Now, create the thread to contact zoneadmd and do the rest of the
* work. This thread can't be created in our zone otherwise
@@ -6837,16 +7543,15 @@ zone_shutdown_global(void)
}
/*
- * Returns true if the named dataset is visible in the current zone.
+ * Returns true if the named dataset is visible in the specified zone.
* The 'write' parameter is set to 1 if the dataset is also writable.
*/
int
-zone_dataset_visible(const char *dataset, int *write)
+zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write)
{
static int zfstype = -1;
zone_dataset_t *zd;
size_t len;
- zone_t *zone = curproc->p_zone;
const char *name = NULL;
vfs_t *vfsp = NULL;
@@ -6914,7 +7619,8 @@ zone_dataset_visible(const char *dataset, int *write)
vfs_list_read_lock();
vfsp = zone->zone_vfslist;
do {
- ASSERT(vfsp);
+ if (vfsp == NULL)
+ break;
if (vfsp->vfs_fstype == zfstype) {
name = refstr_value(vfsp->vfs_resource);
@@ -6951,6 +7657,18 @@ zone_dataset_visible(const char *dataset, int *write)
}
/*
+ * Returns true if the named dataset is visible in the current zone.
+ * The 'write' parameter is set to 1 if the dataset is also writable.
+ */
+int
+zone_dataset_visible(const char *dataset, int *write)
+{
+ zone_t *zone = curproc->p_zone;
+
+ return (zone_dataset_visible_inzone(zone, dataset, write));
+}
+
+/*
* zone_find_by_any_path() -
*
* kernel-private routine similar to zone_find_by_path(), but which
@@ -7052,6 +7770,27 @@ zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
zone_t *zone;
zone_t *thiszone;
+ /*
+ * Only the GZ may add a datalink to a zone's list.
+ */
+ if (getzoneid() != GLOBAL_ZONEID)
+ return (set_errno(EPERM));
+
+ /*
+ * Only a process with the datalink config priv may add a
+ * datalink to a zone's list.
+ */
+ if (secpolicy_dl_config(CRED()) != 0)
+ return (set_errno(EPERM));
+
+ /*
+ * When links exist in the GZ, they aren't added to the GZ's
+ * zone_dl_list. We must enforce this because link_activate()
+ * depends on zone_check_datalink() returning only NGZs.
+ */
+ if (zoneid == GLOBAL_ZONEID)
+ return (set_errno(EINVAL));
+
if ((thiszone = zone_find_by_id(zoneid)) == NULL)
return (set_errno(ENXIO));
@@ -7084,6 +7823,26 @@ zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
zone_t *zone;
int err = 0;
+ /*
+ * Only the GZ may remove a datalink from a zone's list.
+ */
+ if (getzoneid() != GLOBAL_ZONEID)
+ return (set_errno(EPERM));
+
+ /*
+ * Only a process with the datalink config priv may remove a
+ * datalink from a zone's list.
+ */
+ if (secpolicy_dl_config(CRED()) != 0)
+ return (set_errno(EPERM));
+
+ /*
+ * If we can't add a datalink to the GZ's zone_dl_list then we
+ * certainly can't remove them either.
+ */
+ if (zoneid == GLOBAL_ZONEID)
+ return (set_errno(EINVAL));
+
if ((zone = zone_find_by_id(zoneid)) == NULL)
return (set_errno(EINVAL));
@@ -7101,25 +7860,63 @@ zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
}
/*
- * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned
- * the linkid. Otherwise we just check if the specified zoneidp has been
- * assigned the supplied linkid.
+ *
+ * This function may be used in two ways:
+ *
+ * 1. to get the zoneid of the zone this link is under, or
+ *
+ * 2. to verify that the link is under a specific zone.
+ *
+ * The first use is achieved by passing a zoneid of ALL_ZONES. The
+ * function then iterates the datalink list of every zone on the
+ * system until it finds the linkid. If the linkid is found then the
+ * function returns 0 and zoneidp is updated. Otherwise, ENXIO is
+ * returned and zoneidp is not modified. The use of ALL_ZONES is
+ * limited to callers in the GZ to prevent leaking information to
+ * NGZs. If an NGZ passes ALL_ZONES, its query is implicitly changed
+ * to the second type in the list above.
+ *
+ * The second use is achieved by passing a specific zoneid. The GZ can
+ * use this to verify a link is under a particular zone. An NGZ can
+ * use this to verify a link is under itself. But an NGZ cannot use
+ * this to determine if a link is under some other zone as that would
+ * result in information leakage. If the link exists under the zone
+ * then 0 is returned. Otherwise, ENXIO is returned.
*/
int
zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
{
zone_t *zone;
+ zoneid_t zoneid = *zoneidp;
+ zoneid_t caller = getzoneid();
int err = ENXIO;
- if (*zoneidp != ALL_ZONES) {
- if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
- if (zone_dl_exists(zone, linkid))
+ /*
+ * Only the GZ may enquire about all zones; an NGZ may only
+ * enquire about itself.
+ */
+ if (zoneid == ALL_ZONES && caller != GLOBAL_ZONEID)
+ zoneid = caller;
+
+ if (zoneid != caller && caller != GLOBAL_ZONEID)
+ return (err);
+
+ if (zoneid != ALL_ZONES) {
+ if ((zone = zone_find_by_id(zoneid)) != NULL) {
+ if (zone_dl_exists(zone, linkid)) {
+ /*
+ * We need to set this in case an NGZ
+ * passes ALL_ZONES.
+ */
+ *zoneidp = zoneid;
err = 0;
+ }
zone_rele(zone);
}
return (err);
}
+ ASSERT(caller == GLOBAL_ZONEID);
mutex_enter(&zonehash_lock);
for (zone = list_head(&zone_active); zone != NULL;
zone = list_next(&zone_active, zone)) {
@@ -7130,6 +7927,7 @@ zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
}
}
mutex_exit(&zonehash_lock);
+
return (err);
}
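
To make the two modes concrete, a minimal sketch of a GZ caller; 'linkid' is an assumed datalink_id_t and 'some_zoneid' an assumed zoneid_t:

	/* Illustrative only; 'linkid' and 'some_zoneid' are assumed. */
	zoneid_t zid;

	/* Mode 1 (GZ only): find which zone the link is assigned to. */
	zid = ALL_ZONES;
	if (zone_check_datalink(&zid, linkid) == 0) {
		/* 'zid' now names the owning zone. */
	}

	/* Mode 2: verify the link is assigned to a specific zone. */
	zid = some_zoneid;
	if (zone_check_datalink(&zid, linkid) == 0) {
		/* 'linkid' is under 'some_zoneid'. */
	}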
@@ -7150,6 +7948,12 @@ zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
zone_dl_t *zdl;
datalink_id_t *idptr = idarray;
+ /*
+ * Only the GZ or the owning zone may look at the datalink list.
+ */
+ if ((getzoneid() != GLOBAL_ZONEID) && (getzoneid() != zoneid))
+ return (set_errno(EPERM));
+
if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
return (set_errno(EFAULT));
if ((zone = zone_find_by_id(zoneid)) == NULL)
@@ -7175,6 +7979,13 @@ zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
mutex_exit(&zone->zone_lock);
zone_rele(zone);
+ /*
+ * Prevent returning negative nump values -- we should never
+ * have this many links anyway.
+ */
+ if (num > INT_MAX)
+ return (set_errno(EOVERFLOW));
+
/* Whether the count increased or decreased, notify the caller. */
if (num != dlcount) {
if (copyout(&num, nump, sizeof (num)) != 0)
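
Given the copyin/copyout protocol above, a caller is expected to probe for the count first and then fetch the ids. A userland-style sketch, where zone_list_dls() is an assumed name for whatever wrapper reaches zone_list_datalink():

	/*
	 * Hypothetical wrapper usage: the first call learns the count,
	 * the second fetches the ids.
	 */
	int num = 0;
	datalink_id_t *ids;

	/* With a zero count, the kernel just writes back 'num'. */
	if (zone_list_dls(zoneid, &num, NULL) != 0)
		return (-1);

	ids = malloc(num * sizeof (datalink_id_t));
	if (ids == NULL)
		return (-1);

	/* 'num' is written back again; it may have changed in between. */
	if (zone_list_dls(zoneid, &num, ids) != 0) {
		free(ids);
		return (-1);
	}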
@@ -7388,3 +8199,231 @@ done:
else
return (0);
}
+
+static void
+zone_incr_capped(zoneid_t zid)
+{
+ zone_persist_t *zp = &zone_pdata[zid];
+
+ /* Return if not over the limit (unlimited is UINT32_MAX) or already marked. */
+ if (zp->zpers_pg_cnt <= zp->zpers_pg_limit || zp->zpers_over == 1) {
+ return;
+ }
+
+ mutex_enter(&zone_physcap_lock);
+ /* Recheck setting under mutex */
+ if (zp->zpers_pg_cnt > zp->zpers_pg_limit && zp->zpers_over == 0) {
+ zp->zpers_over = 1;
+ zp->zpers_nover++;
+ zone_num_over_cap++;
+ DTRACE_PROBE1(zone__over__pcap, zoneid_t, zid);
+ }
+ mutex_exit(&zone_physcap_lock);
+}
+
+/*
+ * We want some hysteresis when the zone is going under its cap so that we're
+ * not continuously toggling page scanning back and forth by a single page
+ * around the cap. Using ~1% of the zone's page limit seems to be a good
+ * quantity. This table shows some various zone memory caps and the number of
+ * pages (assuming a 4k page size). Given this, we choose to shift the page
+ * limit by 7 places to get a hysteresis that is slightly less than 1%.
+ *
+ * cap pages pages 1% shift7 shift7
+ * 128M 32768 0x0008000 327 256 0x00100
+ * 512M 131072 0x0020000 1310 1024 0x00400
+ * 1G 262144 0x0040000 2621 2048 0x00800
+ * 4G 1048576 0x0100000 10485 8192 0x02000
+ * 8G 2097152 0x0200000 20971 16384 0x04000
+ * 16G 4194304 0x0400000 41943 32768 0x08000
+ * 32G 8388608 0x0800000 83886 65536 0x10000
+ * 64G 16777216 0x1000000 167772 131072 0x20000
+ */
+static void
+zone_decr_capped(zoneid_t zid)
+{
+ zone_persist_t *zp = &zone_pdata[zid];
+ uint32_t adjusted_limit;
+
+ /*
+ * See if under, or already marked that way. There is no need to
+ * check for an unlimited cap (zpers_pg_limit == UINT32_MAX)
+ * since we'll never set zpers_over in zone_incr_capped().
+ */
+ if (zp->zpers_over == 0 || zp->zpers_pg_cnt >= zp->zpers_pg_limit) {
+ return;
+ }
+
+ adjusted_limit = zp->zpers_pg_limit - (zp->zpers_pg_limit >> 7);
+
+ /* Recheck, accounting for our hysteresis. */
+ if (zp->zpers_pg_cnt >= adjusted_limit) {
+ return;
+ }
+
+ mutex_enter(&zone_physcap_lock);
+ /* Recheck under mutex. */
+ if (zp->zpers_pg_cnt < adjusted_limit && zp->zpers_over == 1) {
+ zp->zpers_over = 0;
+ ASSERT(zone_num_over_cap > 0);
+ zone_num_over_cap--;
+ DTRACE_PROBE1(zone__under__pcap, zoneid_t, zid);
+ }
+ mutex_exit(&zone_physcap_lock);
+}
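
The worked example promised above: for a 1G cap the shift-7 hysteresis works out as follows, matching the 1G row of the table:

	/*
	 * Hysteresis arithmetic for a 1G cap with 4K pages.
	 */
	uint32_t limit = 262144;		/* 1G / 4K = 0x40000 pages */
	uint32_t hyst = limit >> 7;		/* 2048 pages (0x800), ~0.78% */
	uint32_t adjusted = limit - hyst;	/* 260096 pages */

	/*
	 * zone_incr_capped() marks the zone over once zpers_pg_cnt exceeds
	 * 262144; zone_decr_capped() clears the mark only after the count
	 * falls below 260096, so a one-page wobble around the cap cannot
	 * toggle page scanning on and off.
	 */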
+
+/*
+ * For zone_add_page() and zone_rm_page(), access to the page we're touching is
+ * controlled by our caller's locking.
+ * On x86 our callers already did: ASSERT(x86_hm_held(pp))
+ * On SPARC our callers already did: ASSERT(sfmmu_mlist_held(pp))
+ */
+void
+zone_add_page(page_t *pp)
+{
+ uint_t pcnt;
+ zone_persist_t *zp;
+ zoneid_t zid;
+
+ /* Skip pages in segkmem, etc. (KV_KVP, ...) */
+ if (PP_ISKAS(pp))
+ return;
+
+ ASSERT(!PP_ISFREE(pp));
+
+ zid = curzone->zone_id;
+ if (pp->p_zoneid == zid) {
+ /* Another mapping to this page for this zone, do nothing */
+ return;
+ }
+
+ if (pp->p_szc == 0) {
+ pcnt = 1;
+ } else {
+ /* large page */
+ pcnt = page_get_pagecnt(pp->p_szc);
+ }
+
+ if (pp->p_share == 0) {
+ /* First mapping to this page. */
+ pp->p_zoneid = zid;
+ zp = &zone_pdata[zid];
+ ASSERT(zp->zpers_pg_cnt + pcnt < UINT32_MAX);
+ atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, pcnt);
+ zone_incr_capped(zid);
+ return;
+ }
+
+ if (pp->p_zoneid != ALL_ZONES) {
+ /*
+ * The page is now being shared across a different zone.
+ * Decrement the original zone's usage.
+ */
+ zid = pp->p_zoneid;
+ pp->p_zoneid = ALL_ZONES;
+ ASSERT(zid >= 0 && zid <= MAX_ZONEID);
+ zp = &zone_pdata[zid];
+
+ if (zp->zpers_pg_cnt > 0) {
+ atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt);
+ }
+ zone_decr_capped(zid);
+ }
+}
+
+void
+zone_rm_page(page_t *pp)
+{
+ uint_t pcnt;
+ zone_persist_t *zp;
+ zoneid_t zid;
+
+ /* Skip pages in segkmem, etc. (KV_KVP, ...) */
+ if (PP_ISKAS(pp))
+ return;
+
+ zid = pp->p_zoneid;
+ if (zid == ALL_ZONES || pp->p_share != 0)
+ return;
+
+ /* This is the last mapping to the page for a zone. */
+ if (pp->p_szc == 0) {
+ pcnt = 1;
+ } else {
+ /* large page */
+ pcnt = page_get_pagecnt(pp->p_szc);
+ }
+
+ ASSERT(zid >= 0 && zid <= MAX_ZONEID);
+ zp = &zone_pdata[zid];
+ if (zp->zpers_pg_cnt > 0) {
+ atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt);
+ }
+ zone_decr_capped(zid);
+ pp->p_zoneid = ALL_ZONES;
+}
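
Taken together, zone_add_page() and zone_rm_page() maintain a simple per-page ownership state; the summary below is ours, for orientation:

	/*
	 * Per-page accounting states (summary of the logic above):
	 *
	 *   p_zoneid == zid         mapped by exactly one zone; 'pcnt'
	 *                           pages are counted in that zone's
	 *                           zpers_pg_cnt.
	 *   p_zoneid == ALL_ZONES   shared across zones, or not mapped;
	 *                           counted against no zone.
	 *
	 * zone_add_page(): the first mapping (p_share == 0) claims the
	 * page for curzone; a later mapping from a different zone moves
	 * it to ALL_ZONES and credits the pages back to the original
	 * owner.
	 *
	 * zone_rm_page(): once the last mapping is gone (p_share == 0),
	 * the owner is credited and the page returns to ALL_ZONES.
	 */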
+
+void
+zone_pageout_stat(int zid, zone_pageout_op_t op)
+{
+ zone_persist_t *zp;
+
+ if (zid == ALL_ZONES)
+ return;
+
+ ASSERT(zid >= 0 && zid <= MAX_ZONEID);
+ zp = &zone_pdata[zid];
+
+#ifndef DEBUG
+ atomic_add_64(&zp->zpers_pg_out, 1);
+#else
+ switch (op) {
+ case ZPO_DIRTY:
+ atomic_add_64(&zp->zpers_pg_fsdirty, 1);
+ break;
+ case ZPO_FS:
+ atomic_add_64(&zp->zpers_pg_fs, 1);
+ break;
+ case ZPO_ANON:
+ atomic_add_64(&zp->zpers_pg_anon, 1);
+ break;
+ case ZPO_ANONDIRTY:
+ atomic_add_64(&zp->zpers_pg_anondirty, 1);
+ break;
+ default:
+ cmn_err(CE_PANIC, "Invalid pageout operation %d", op);
+ break;
+ }
+#endif
+}
+
+/*
+ * Return the zone's physical memory cap and current free memory (in pages).
+ */
+void
+zone_get_physmem_data(int zid, pgcnt_t *memcap, pgcnt_t *free)
+{
+ zone_persist_t *zp;
+
+ ASSERT(zid >= 0 && zid <= MAX_ZONEID);
+ zp = &zone_pdata[zid];
+
+ /*
+ * If a memory limit is set on the zone, use it; otherwise use the
+ * system values. physmem and freemem are also in pages.
+ */
+ if (zp->zpers_pg_limit == UINT32_MAX) {
+ *memcap = physmem;
+ *free = freemem;
+ } else {
+ int64_t fmem;
+
+ *memcap = (pgcnt_t)zp->zpers_pg_limit;
+ fmem = zp->zpers_pg_limit - zp->zpers_pg_cnt;
+ if (fmem > 0) {
+ *free = (pgcnt_t)fmem;
+ } else {
+ *free = (pgcnt_t)0;
+ }
+ }
+}
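
Finally, a sketch of the sort of consumer this interface anticipates; the function below is ours, not from this change:

	/*
	 * Hypothetical consumer: decide whether a zone's free memory
	 * (relative to its cap, or to the system if uncapped) has
	 * dropped below a scanner's wake-up threshold.
	 */
	static boolean_t
	zone_needs_pageout(int zid, pgcnt_t threshold)
	{
		pgcnt_t cap, free;

		zone_get_physmem_data(zid, &cap, &free);
		return (free < threshold);
	}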