Diffstat (limited to 'usr/src/uts/common/os')
37 files changed, 2812 insertions, 597 deletions
diff --git a/usr/src/uts/common/os/acct.c b/usr/src/uts/common/os/acct.c index e598e0d08d..891c4e0836 100644 --- a/usr/src/uts/common/os/acct.c +++ b/usr/src/uts/common/os/acct.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -47,6 +48,7 @@ #include <sys/time.h> #include <sys/msacct.h> #include <sys/zone.h> +#include <sys/brand.h> /* * Each zone has its own accounting settings (on or off) and associated @@ -373,7 +375,7 @@ acct_compress(ulong_t t) * On exit, write a record on the accounting file. */ void -acct(char st) +acct(int st) { struct vnode *vp; struct cred *cr; @@ -402,6 +404,21 @@ acct(char st) * This only gets called from exit after all lwp's have exited so no * cred locking is needed. */ + + /* If there is a brand-specific hook, use it instead */ + if (ZONE_IS_BRANDED(curzone) && ZBROP(curzone)->b_acct_out != NULL) { + ZBROP(curzone)->b_acct_out(vp, st); + mutex_exit(&ag->aclock); + return; + } + + /* + * The 'st' status value was traditionally masked this way by our + * caller, but we now accept the unmasked value for brand handling. + * Zones not using the brand hook mask the status here. + */ + st &= 0xff; + p = curproc; ua = PTOU(p); bcopy(ua->u_comm, ag->acctbuf.ac_comm, sizeof (ag->acctbuf.ac_comm)); diff --git a/usr/src/uts/common/os/brand.c b/usr/src/uts/common/os/brand.c index 8b3177b916..fa3555a82a 100644 --- a/usr/src/uts/common/os/brand.c +++ b/usr/src/uts/common/os/brand.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #include <sys/kmem.h> @@ -45,7 +46,7 @@ struct brand_mach_ops native_mach_ops = { }; #else /* !__sparcv9 */ struct brand_mach_ops native_mach_ops = { - NULL, NULL, NULL, NULL + NULL, NULL, NULL, NULL, NULL, NULL, NULL }; #endif /* !__sparcv9 */ @@ -53,7 +54,8 @@ brand_t native_brand = { BRAND_VER_1, "native", NULL, - &native_mach_ops + &native_mach_ops, + 0 }; /* @@ -310,46 +312,115 @@ brand_unregister_zone(struct brand *bp) mutex_exit(&brand_list_lock); } -void -brand_setbrand(proc_t *p) +int +brand_setbrand(proc_t *p, boolean_t lwps_ok) { brand_t *bp = p->p_zone->zone_brand; + void *brand_data = NULL; - ASSERT(bp != NULL); - ASSERT(p->p_brand == &native_brand); + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + VERIFY(bp != NULL); /* - * We should only be called from exec(), when we know the process - * is single-threaded. + * Process branding occurs during fork() and exec(). When it happens + * during fork(), the LWP count will always be 0 since branding is + * performed as part of getproc(), before LWPs have been associated. + * The same is not true during exec(), where a multi-LWP process may + * undergo branding just prior to gexec(). This is to ensure + * exec-related brand hooks are available. While it may seem + * complicated to brand a multi-LWP process, the two possible outcomes + * simplify things: + * + * 1. The exec() succeeds: LWPs besides the caller will be killed and + * any further branding will occur in a single-LWP context. + * 2. The exec() fails: The process will be promptly unbranded since + * the hooks are no longer needed. + * + * To prevent inconsistent brand state from being encountered during + * the exec(), LWPs beyond the caller which are associated with this + * process must be held temporarily. 
They will be released either when + * they are killed in the exec() success, or when the brand is cleared + * after exec() failure. */ - ASSERT(p->p_tlist == p->p_tlist->t_forw); + if (lwps_ok) { + /* + * We've been called from a exec() context tolerating the + * existence of multiple LWPs during branding is necessary. + */ + VERIFY(p == curproc); + VERIFY(p->p_tlist != NULL); + if (p->p_tlist != p->p_tlist->t_forw) { + /* + * Multiple LWPs are present. Hold all but the caller. + */ + if (!holdlwps(SHOLDFORK1)) { + return (-1); + } + } + } else { + /* + * Processes branded during fork() should not have LWPs at all. + */ + VERIFY(p->p_tlist == NULL); + } + + if (bp->b_data_size > 0) { + brand_data = kmem_zalloc(bp->b_data_size, KM_SLEEP); + } + + mutex_enter(&p->p_lock); + ASSERT(!PROC_IS_BRANDED(p)); p->p_brand = bp; + p->p_brand_data = brand_data; ASSERT(PROC_IS_BRANDED(p)); BROP(p)->b_setbrand(p); + mutex_exit(&p->p_lock); + return (0); } void -brand_clearbrand(proc_t *p, boolean_t no_lwps) +brand_clearbrand(proc_t *p, boolean_t lwps_ok) { brand_t *bp = p->p_zone->zone_brand; - klwp_t *lwp = NULL; - ASSERT(bp != NULL); - ASSERT(!no_lwps || (p->p_tlist == NULL)); + void *brand_data; - /* - * If called from exec_common() or proc_exit(), - * we know the process is single-threaded. - * If called from fork_fail, p_tlist is NULL. - */ - if (!no_lwps) { - ASSERT(p->p_tlist == p->p_tlist->t_forw); - lwp = p->p_tlist->t_lwp; - } + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + VERIFY(bp != NULL); + VERIFY(PROC_IS_BRANDED(p)); - ASSERT(PROC_IS_BRANDED(p)); - BROP(p)->b_proc_exit(p, lwp); + if (BROP(p)->b_clearbrand != NULL) + BROP(p)->b_clearbrand(p, lwps_ok); + + mutex_enter(&p->p_lock); p->p_brand = &native_brand; + brand_data = p->p_brand_data; + p->p_brand_data = NULL; + + if (lwps_ok) { + VERIFY(p == curproc); + /* + * A process with multiple LWPs is being de-branded after + * failing an exec. The other LWPs were held as part of the + * procedure, so they must be resumed now. + */ + if (p->p_tlist != NULL && p->p_tlist != p->p_tlist->t_forw) { + continuelwps(p); + } + } else { + /* + * While clearing the brand, it's ok for one LWP to be present. + * This happens when a native binary is executed inside a + * branded zone, since the brand will be removed during the + * course of a successful exec. + */ + VERIFY(p->p_tlist == NULL || p->p_tlist == p->p_tlist->t_forw); + } + mutex_exit(&p->p_lock); + + if (brand_data != NULL) { + kmem_free(brand_data, bp->b_data_size); + } } #if defined(__sparcv9) @@ -483,7 +554,7 @@ brand_solaris_cmd(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, return (ENOSYS); /* For all other operations this must be a branded process. 
*/ - if (p->p_brand == &native_brand) + if (!PROC_IS_BRANDED(p)) return (ENOSYS); ASSERT(p->p_brand == pbrand); @@ -600,16 +671,16 @@ restoreexecenv(struct execenv *ep, stack_t *sp) /*ARGSUSED*/ int brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, - intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file, - cred_t *cred, int brand_action, struct brand *pbrand, char *bname, - char *brandlib, char *brandlib32, char *brandlinker, char *brandlinker32) + intpdata_t *idatap, int level, size_t *execsz, int setid, + caddr_t exec_file, cred_t *cred, int *brand_action, struct brand *pbrand, + char *bname, char *brandlib, char *brandlib32) { vnode_t *nvp; Ehdr ehdr; Addr uphdr_vaddr; intptr_t voffset; - int interp; + char *interp; int i, err; struct execenv env; struct execenv origenv; @@ -619,7 +690,6 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, klwp_t *lwp = ttolwp(curthread); brand_proc_data_t *spd; brand_elf_data_t sed, *sedp; - char *linker; uintptr_t lddata; /* lddata of executable's linker */ ASSERT(curproc->p_brand == pbrand); @@ -636,12 +706,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, */ if (args->to_model == DATAMODEL_NATIVE) { args->emulator = brandlib; - linker = brandlinker; } #if defined(_LP64) else { args->emulator = brandlib32; - linker = brandlinker32; } #endif /* _LP64 */ @@ -725,7 +793,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, if (args->to_model == DATAMODEL_NATIVE) { err = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr, &voffset, exec_file, &interp, &env.ex_bssbase, - &env.ex_brkbase, &env.ex_brksize, NULL); + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); } #if defined(_LP64) else { @@ -733,7 +801,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, Elf32_Addr uphdr_vaddr32; err = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32, &voffset, exec_file, &interp, &env.ex_bssbase, - &env.ex_brkbase, &env.ex_brksize, NULL); + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); Ehdr32to64(&ehdr32, &ehdr); if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -744,6 +812,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, #endif /* _LP64 */ if (err != 0) { restoreexecenv(&origenv, &orig_sigaltstack); + + if (interp != NULL) + kmem_free(interp, MAXPATHLEN); + return (err); } @@ -761,7 +833,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, sedp->sed_phent = ehdr.e_phentsize; sedp->sed_phnum = ehdr.e_phnum; - if (interp) { + if (interp != NULL) { if (ehdr.e_type == ET_DYN) { /* * This is a shared object executable, so we @@ -777,16 +849,20 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, * it in and store relevant information about it in the * aux vector, where the brand library can find it. 
*/ - if ((err = lookupname(linker, UIO_SYSSPACE, + if ((err = lookupname(interp, UIO_SYSSPACE, FOLLOW, NULLVPP, &nvp)) != 0) { - uprintf("%s: not found.", brandlinker); + uprintf("%s: not found.", interp); restoreexecenv(&origenv, &orig_sigaltstack); + kmem_free(interp, MAXPATHLEN); return (err); } + + kmem_free(interp, MAXPATHLEN); + if (args->to_model == DATAMODEL_NATIVE) { err = mapexec_brand(nvp, args, &ehdr, &uphdr_vaddr, &voffset, exec_file, &interp, - NULL, NULL, NULL, &lddata); + NULL, NULL, NULL, &lddata, NULL); } #if defined(_LP64) else { @@ -794,7 +870,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, Elf32_Addr uphdr_vaddr32; err = mapexec32_brand(nvp, args, &ehdr32, &uphdr_vaddr32, &voffset, exec_file, &interp, - NULL, NULL, NULL, &lddata); + NULL, NULL, NULL, &lddata, NULL); Ehdr32to64(&ehdr32, &ehdr); if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -934,9 +1010,9 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, /* * Third, the /proc aux vectors set up by elfexec() point to - * brand emulation library and it's linker. Copy these to the + * brand emulation library and its linker. Copy these to the * /proc brand specific aux vector, and update the regular - * /proc aux vectors to point to the executable (and it's + * /proc aux vectors to point to the executable (and its * linker). This will enable debuggers to access the * executable via the usual /proc or elf notes aux vectors. * @@ -1078,55 +1154,31 @@ brand_solaris_freelwp(klwp_t *l, struct brand *pbrand) } /*ARGSUSED*/ -int +void brand_solaris_initlwp(klwp_t *l, struct brand *pbrand) { ASSERT(l->lwp_procp->p_brand == pbrand); ASSERT(l->lwp_procp->p_brand_data != NULL); ASSERT(l->lwp_brand == NULL); l->lwp_brand = (void *)-1; - return (0); } /*ARGSUSED*/ void brand_solaris_lwpexit(klwp_t *l, struct brand *pbrand) { - proc_t *p = l->lwp_procp; - ASSERT(l->lwp_procp->p_brand == pbrand); ASSERT(l->lwp_procp->p_brand_data != NULL); ASSERT(l->lwp_brand != NULL); - - /* - * We should never be called for the last thread in a process. - * (That case is handled by brand_solaris_proc_exit().) - * Therefore this lwp must be exiting from a multi-threaded - * process. - */ - ASSERT(p->p_tlist != p->p_tlist->t_forw); - - l->lwp_brand = NULL; } /*ARGSUSED*/ void -brand_solaris_proc_exit(struct proc *p, klwp_t *l, struct brand *pbrand) +brand_solaris_proc_exit(struct proc *p, struct brand *pbrand) { ASSERT(p->p_brand == pbrand); ASSERT(p->p_brand_data != NULL); - /* - * When called from proc_exit(), we know that process is - * single-threaded and free our lwp brand data. - * otherwise just free p_brand_data and return. - */ - if (l != NULL) { - ASSERT(p->p_tlist == p->p_tlist->t_forw); - ASSERT(p->p_tlist->t_lwp == l); - (void) brand_solaris_freelwp(l, pbrand); - } - /* upon exit, free our proc brand data */ kmem_free(p->p_brand_data, sizeof (brand_proc_data_t)); p->p_brand_data = NULL; @@ -1145,5 +1197,4 @@ brand_solaris_setbrand(proc_t *p, struct brand *pbrand) ASSERT(p->p_tlist == p->p_tlist->t_forw); p->p_brand_data = kmem_zalloc(sizeof (brand_proc_data_t), KM_SLEEP); - (void) brand_solaris_initlwp(p->p_tlist->t_lwp, pbrand); } diff --git a/usr/src/uts/common/os/contract.c b/usr/src/uts/common/os/contract.c index 9e498dc1c7..e4b1db84e1 100644 --- a/usr/src/uts/common/os/contract.c +++ b/usr/src/uts/common/os/contract.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ /* * Copyright (c) 2017 by Delphix. 
All rights reserved. @@ -290,7 +291,10 @@ contract_ctor(contract_t *ct, ct_type_t *type, ct_template_t *tmpl, void *data, avl_index_t where; klwp_t *curlwp = ttolwp(curthread); - ASSERT(author == curproc); + /* + * It's possible that author is not curproc if the zone is creating + * a new process as a child of zsched. + */ mutex_init(&ct->ct_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ct->ct_reflock, NULL, MUTEX_DEFAULT, NULL); diff --git a/usr/src/uts/common/os/core.c b/usr/src/uts/common/os/core.c index d5e272c16a..a147b1cf0f 100644 --- a/usr/src/uts/common/os/core.c +++ b/usr/src/uts/common/os/core.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent Inc. All rights reserved. + * Copyright 2019 Joyent Inc. * Copyright (c) 2016 by Delphix. All rights reserved. */ @@ -125,6 +125,7 @@ remove_core_file(char *fp, enum core_types core_type) /* * Determine what rootvp to use. */ + mutex_enter(&curproc->p_lock); if (core_type == CORE_PROC) { rootvp = (PTOU(curproc)->u_rdir == NULL ? curproc->p_zone->zone_rootvp : PTOU(curproc)->u_rdir); @@ -140,6 +141,7 @@ remove_core_file(char *fp, enum core_types core_type) VN_HOLD(startvp); if (rootvp != rootdir) VN_HOLD(rootvp); + mutex_exit(&curproc->p_lock); if ((error = lookuppnvp(&pn, NULL, NO_FOLLOW, &dvp, &vp, rootvp, startvp, CRED())) != 0) { pn_free(&pn); @@ -793,7 +795,7 @@ clock_t core_delay_usec = 10000; * using core_write() below, and so it has the same failure semantics. */ int -core_seg(proc_t *p, vnode_t *vp, offset_t offset, caddr_t addr, size_t size, +core_seg(proc_t *p, vnode_t *vp, u_offset_t offset, caddr_t addr, size_t size, rlim64_t rlimit, cred_t *credp) { caddr_t eaddr; @@ -801,6 +803,11 @@ core_seg(proc_t *p, vnode_t *vp, offset_t offset, caddr_t addr, size_t size, size_t len; int err = 0; + if (offset > OFF_MAX || offset + size > OFF_MAX || + offset + size < offset) { + return (EOVERFLOW); + } + eaddr = addr + size; for (base = addr; base < eaddr; base += len) { len = eaddr - base; @@ -841,15 +848,20 @@ core_seg(proc_t *p, vnode_t *vp, offset_t offset, caddr_t addr, size_t size, * unexpectedly returns zero but no progress has been made, we return ENOSPC. 
*/ int -core_write(vnode_t *vp, enum uio_seg segflg, offset_t offset, +core_write(vnode_t *vp, enum uio_seg segflg, u_offset_t offset, const void *buf, size_t len, rlim64_t rlimit, cred_t *credp) { ssize_t resid = len; int error = 0; + if (offset > OFF_MAX || offset + len > OFF_MAX || + offset + len < offset) { + return (EOVERFLOW); + } + while (len != 0) { - error = vn_rdwr(UIO_WRITE, vp, (caddr_t)buf, len, offset, - segflg, 0, rlimit, credp, &resid); + error = vn_rdwr(UIO_WRITE, vp, (caddr_t)buf, len, + (offset_t)offset, segflg, 0, rlimit, credp, &resid); if (error != 0) break; diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c index 075bb6e70a..6a86dbb8cb 100644 --- a/usr/src/uts/common/os/cpu.c +++ b/usr/src/uts/common/os/cpu.c @@ -112,7 +112,7 @@ cpu_t *cpu_list; /* list of all CPUs */ cpu_t *clock_cpu_list; /* used by clock to walk CPUs */ cpu_t *cpu_active; /* list of active CPUs */ cpuset_t cpu_active_set; /* cached set of active CPUs */ -static cpuset_t cpu_available; /* set of available CPUs */ +cpuset_t cpu_available; /* set of available CPUs */ cpuset_t cpu_seqid_inuse; /* which cpu_seqids are in use */ cpu_t **cpu_seq; /* ptrs to CPUs, indexed by seq_id */ diff --git a/usr/src/uts/common/os/cred.c b/usr/src/uts/common/os/cred.c index 3e1df330b7..5e909667de 100644 --- a/usr/src/uts/common/os/cred.c +++ b/usr/src/uts/common/os/cred.c @@ -730,6 +730,14 @@ crgetzoneid(const cred_t *cr) cr->cr_zone->zone_id); } +zoneid_t +crgetzonedid(const cred_t *cr) +{ + return (cr->cr_zone == NULL ? + (cr->cr_uid == -1 ? (zoneid_t)-1 : GLOBAL_ZONEID) : + cr->cr_zone->zone_did); +} + projid_t crgetprojid(const cred_t *cr) { diff --git a/usr/src/uts/common/os/ddi_intr_irm.c b/usr/src/uts/common/os/ddi_intr_irm.c index 8faa8fea8c..2433c504fc 100644 --- a/usr/src/uts/common/os/ddi_intr_irm.c +++ b/usr/src/uts/common/os/ddi_intr_irm.c @@ -1320,7 +1320,7 @@ i_ddi_irm_notify(ddi_irm_pool_t *pool_p, ddi_irm_req_t *req_p) /* Log callback errors */ if (ret != DDI_SUCCESS) { - cmn_err(CE_WARN, "%s%d: failed callback (action=%d, ret=%d)\n", + cmn_err(CE_WARN, "!%s%d: failed callback (action=%d, ret=%d)\n", ddi_driver_name(req_p->ireq_dip), ddi_get_instance(req_p->ireq_dip), (int)action, ret); } diff --git a/usr/src/uts/common/os/dumpsubr.c b/usr/src/uts/common/os/dumpsubr.c index 484b2042e2..868ed9e5c4 100644 --- a/usr/src/uts/common/os/dumpsubr.c +++ b/usr/src/uts/common/os/dumpsubr.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright 2018 Nexenta Systems, Inc. All rights reserved. 
*/ @@ -75,6 +75,7 @@ #include <sys/cpu.h> #include <bzip2/bzlib.h> +#include <crypto/chacha/chacha.h> #define ONE_GIG (1024 * 1024 * 1024UL) @@ -112,6 +113,8 @@ int dump_timeout = 120; /* timeout for dumping pages */ int dump_timeleft; /* portion of dump_timeout remaining */ int dump_ioerr; /* dump i/o error */ int dump_check_used; /* enable check for used pages */ +uint8_t dump_crypt_key[DUMP_CRYPT_KEYLEN]; /* dump encryption key */ +uint8_t dump_crypt_nonce[DUMP_CRYPT_NONCELEN]; /* dump nonce */ char *dump_stack_scratch; /* scratch area for saving stack summary */ /* @@ -357,6 +360,7 @@ typedef struct dumpsync { hrtime_t iotime; /* time spent writing nwrite bytes */ hrtime_t iowait; /* time spent waiting for output */ hrtime_t iowaitts; /* iowait timestamp */ + hrtime_t crypt; /* time spent encrypting */ perpage_t perpage; /* metrics */ perpage_t perpagets; int dumpcpu; /* master cpu */ @@ -435,6 +439,7 @@ typedef struct dumpbuf { char *cur; /* dump write pointer */ char *start; /* dump buffer address */ char *end; /* dump buffer end */ + char *scratch; /* scratch buffer */ size_t size; /* size of dumpbuf in bytes */ size_t iosize; /* best transfer size for device */ } dumpbuf_t; @@ -493,11 +498,16 @@ dumpbuf_resize(void) if (new_size <= old_size) return; /* no need to reallocate buffer */ - new_buf = kmem_alloc(new_size, KM_SLEEP); + /* + * Allocate thrice the size of buffer to allow for space for the stream + * and its ciphertext should encryption be enabled (or become so). + */ + new_buf = kmem_alloc(new_size * 3, KM_SLEEP); dumpbuf.size = new_size; dumpbuf.start = new_buf; dumpbuf.end = new_buf + new_size; - kmem_free(old_buf, old_size); + dumpbuf.scratch = dumpbuf.end + new_size; + kmem_free(old_buf, old_size * 3); } /* @@ -1125,9 +1135,16 @@ dumphdr_init(void) dumphdr->dump_pagesize = PAGESIZE; dumphdr->dump_utsname = utsname; (void) strcpy(dumphdr->dump_platform, platform); + + /* + * Allocate our buffer, assuring enough room for encryption + * should it become configured. + */ dumpbuf.size = dumpbuf_iosize(maxphys); - dumpbuf.start = kmem_alloc(dumpbuf.size, KM_SLEEP); + dumpbuf.start = kmem_alloc(dumpbuf.size * 3, KM_SLEEP); dumpbuf.end = dumpbuf.start + dumpbuf.size; + dumpbuf.scratch = dumpbuf.end + dumpbuf.size; + dumpcfg.pids = kmem_alloc(v.v_proc * sizeof (pid_t), KM_SLEEP); dumpcfg.helpermap = kmem_zalloc(BT_SIZEOFMAP(NCPU), KM_SLEEP); LOCK_INIT_HELD(&dumpcfg.helper_lock); @@ -1317,6 +1334,41 @@ dumpfini(void) dumppath = NULL; } +static void +dumpvp_encrypt(size_t size) +{ + size_t nelems = size / sizeof (uint64_t), i; + uint64_t *start = (uint64_t *)dumpbuf.start; + uint64_t *stream = (uint64_t *)dumpbuf.end; + uint64_t *crypt = (uint64_t *)dumpbuf.scratch; + uint64_t ctr = dumpbuf.vp_off >> DUMP_CRYPT_BLOCKSHIFT; + hrtime_t ts = gethrtime(); + offset_t dumpoff = dumpbuf.vp_off; + chacha_ctx_t ctx; + + /* + * Our size should be 64-bit aligned and our offset must be aligned + * to our crypto blocksize. 
+ */ + ASSERT(!(size & (sizeof (uint64_t) - 1))); + ASSERT(!(dumpbuf.vp_off & ((1 << DUMP_CRYPT_BLOCKSHIFT) - 1))); + + chacha_keysetup(&ctx, dump_crypt_key, DUMP_CRYPT_KEYLEN * 8, 0); + chacha_ivsetup(&ctx, dump_crypt_nonce, (uint8_t *)&ctr); + + for (i = 0; i < nelems; i++) { + stream[i] = dumpoff; + dumpoff += sizeof (uint64_t); + } + + chacha_encrypt_bytes(&ctx, (uint8_t *)stream, (uint8_t *)crypt, size); + + for (i = 0; i < nelems; i++) + start[i] ^= crypt[i]; + + dumpsync.crypt += gethrtime() - ts; +} + static offset_t dumpvp_flush(void) { @@ -1328,6 +1380,17 @@ dumpvp_flush(void) dump_ioerr = ENOSPC; dumpbuf.vp_off = dumpbuf.vp_limit; } else if (size != 0) { + /* + * If our dump is encrypted and this is neither the initial + * dump header nor the terminal dump header and metrics, + * encrypt the buffer before writing it. + */ + if ((dump_conflags & DUMP_ENCRYPT) && + dumpbuf.vp_off > dumphdr->dump_start && + dumpbuf.vp_off < dumpbuf.vp_limit - DUMP_OFFSET) { + dumpvp_encrypt(size); + } + iotime = gethrtime(); dumpsync.iowait += iotime - dumpsync.iowaitts; if (panicstr) @@ -2618,6 +2681,7 @@ dumpsys_metrics(dumpsync_t *ds, char *buf, size_t size) P("Dump I/O rate MBS,%d.%02d\n", iorate / 100, iorate % 100); P("..total bytes,%lld\n", (u_longlong_t)ds->nwrite); P("..total nsec,%lld\n", (u_longlong_t)ds->iotime); + P("..crypt nsec,%lld\n", (u_longlong_t)ds->crypt); P("dumpbuf.iosize,%ld\n", dumpbuf.iosize); P("dumpbuf.size,%ld\n", dumpbuf.size); @@ -2658,6 +2722,29 @@ dumpsys_metrics(dumpsync_t *ds, char *buf, size_t size) } #endif /* COLLECT_METRICS */ +CTASSERT(DUMP_CRYPT_HMACLEN <= sizeof (struct utsname)); + +/* + * Mark the dump as encrypted and calculate our (crude) HMAC based on the + * dump_utsname. (The purpose of the HMAC is to merely allow for incorrect + * keys to be quickly rejected.) + */ +void +dumpsys_crypt(dumphdr_t *dumphdr, dump_crypt_t *dcrypt) +{ + chacha_ctx_t ctx; + + dumphdr->dump_flags |= DF_ENCRYPTED; + bcopy(dump_crypt_nonce, dcrypt->dump_crypt_nonce, DUMP_CRYPT_NONCELEN); + dcrypt->dump_crypt_algo = DUMP_CRYPT_ALGO_CHACHA20; + + chacha_keysetup(&ctx, dump_crypt_key, DUMP_CRYPT_KEYLEN * 8, 0); + chacha_ivsetup(&ctx, dump_crypt_nonce, NULL); + + chacha_encrypt_bytes(&ctx, (uint8_t *)&dumphdr->dump_utsname, + (uint8_t *)&dcrypt->dump_crypt_hmac, DUMP_CRYPT_HMACLEN); +} + /* * Dump the system. */ @@ -2679,6 +2766,7 @@ dumpsys(void) dumpmlw_t mlw; dumpcsize_t datatag; dumpdatahdr_t datahdr; + dump_crypt_t dcrypt; if (dumpvp == NULL || dumphdr == NULL) { uprintf("skipping system dump - no dump device configured\n"); @@ -2733,6 +2821,9 @@ dumpsys(void) /* Make sure nodename is current */ bcopy(utsname.nodename, dumphdr->dump_utsname.nodename, SYS_NMLN); + if (dump_conflags & DUMP_ENCRYPT) + dumpsys_crypt(dumphdr, &dcrypt); + /* * If this is a live dump, try to open a VCHR vnode for better * performance. 
We must take care to flush the buffer cache @@ -2999,11 +3090,19 @@ dumpsys(void) */ dumpbuf.vp_off = dumphdr->dump_start; dumpvp_write(dumphdr, sizeof (dumphdr_t)); + + if (dump_conflags & DUMP_ENCRYPT) + dumpvp_write(&dcrypt, sizeof (dump_crypt_t)); + (void) dumpvp_flush(); dumpbuf.vp_limit = dumpvp_size; dumpbuf.vp_off = dumpbuf.vp_limit - DUMP_OFFSET; dumpvp_write(dumphdr, sizeof (dumphdr_t)); + + if (dump_conflags & DUMP_ENCRYPT) + dumpvp_write(&dcrypt, sizeof (dump_crypt_t)); + dumpvp_write(&datahdr, sizeof (dumpdatahdr_t)); dumpvp_write(dumpcfg.cbuf[0].buf, datahdr.dump_metrics); diff --git a/usr/src/uts/common/os/exec.c b/usr/src/uts/common/os/exec.c index f51e2c5ca1..62d1e298dd 100644 --- a/usr/src/uts/common/os/exec.c +++ b/usr/src/uts/common/os/exec.c @@ -24,9 +24,9 @@ */ /* Copyright (c) 1988 AT&T */ -/* All Rights Reserved */ +/* All Rights Reserved */ /* - * Copyright 2017 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #include <sys/types.h> @@ -99,6 +99,7 @@ uint_t auxv_hwcap32_2 = 0; /* 32-bit version of auxv_hwcap2 */ #endif #define PSUIDFLAGS (SNOCD|SUGID) +#define RANDOM_LEN 16 /* 16 bytes for AT_RANDOM aux entry */ /* * These are consumed within the specific exec modules, but are defined here @@ -143,7 +144,7 @@ exec_common(const char *fname, const char **argp, const char **envp, proc_t *p = ttoproc(curthread); klwp_t *lwp = ttolwp(curthread); struct user *up = PTOU(p); - long execsz; /* temporary count of exec size */ + size_t execsz; /* temporary count of exec size */ int i; int error; char exec_file[MAXCOMLEN+1]; @@ -265,8 +266,10 @@ exec_common(const char *fname, const char **argp, const char **envp, * only if the pathname does not contain a "/" the resolved path * points to a file in the current working (attribute) directory. */ - if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 && + mutex_enter(&p->p_lock); + if ((PTOU(p)->u_cdir->v_flag & V_XATTRDIR) != 0 && strchr(resolvepn.pn_path, '/') == NULL) { + mutex_exit(&p->p_lock); if (dir != NULL) VN_RELE(dir); error = EACCES; @@ -275,6 +278,7 @@ exec_common(const char *fname, const char **argp, const char **envp, VN_RELE(vp); goto out; } + mutex_exit(&p->p_lock); bzero(exec_file, MAXCOMLEN+1); (void) strncpy(exec_file, pn.pn_path, MAXCOMLEN); @@ -322,14 +326,43 @@ exec_common(const char *fname, const char **argp, const char **envp, ua.argp = argp; ua.envp = envp; - /* If necessary, brand this process before we start the exec. */ - if (brandme) - brand_setbrand(p); + /* If necessary, brand this process/lwp before we start the exec. */ + if (brandme) { + void *brand_data = NULL; + + /* + * Process branding may fail if multiple LWPs are present and + * holdlwps() cannot complete successfully. 
+ */ + error = brand_setbrand(p, B_TRUE); + + if (error == 0 && BROP(p)->b_lwpdata_alloc != NULL) { + brand_data = BROP(p)->b_lwpdata_alloc(p); + if (brand_data == NULL) { + error = 1; + } + } + + if (error == 0) { + mutex_enter(&p->p_lock); + BROP(p)->b_initlwp(lwp, brand_data); + mutex_exit(&p->p_lock); + } else { + VN_RELE(vp); + if (dir != NULL) { + VN_RELE(dir); + } + pn_free(&resolvepn); + goto fail; + } + } if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz, - exec_file, p->p_cred, brand_action)) != 0) { - if (brandme) - brand_clearbrand(p, B_FALSE); + exec_file, p->p_cred, &brand_action)) != 0) { + if (brandme) { + BROP(p)->b_freelwp(lwp); + brand_clearbrand(p, B_TRUE); + } VN_RELE(vp); if (dir != NULL) VN_RELE(dir); @@ -361,7 +394,7 @@ exec_common(const char *fname, const char **argp, const char **envp, /* * Clear contract template state */ - lwp_ctmpl_clear(lwp); + lwp_ctmpl_clear(lwp, B_TRUE); /* * Save the directory in which we found the executable for expanding @@ -385,6 +418,8 @@ exec_common(const char *fname, const char **argp, const char **envp, * pending held signals remain held, so don't clear t_hold. */ mutex_enter(&p->p_lock); + DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp, + uintptr_t, lwp->lwp_oldcontext, uintptr_t, 0); lwp->lwp_oldcontext = 0; lwp->lwp_ustack = 0; lwp->lwp_old_stk_ctl = 0; @@ -444,8 +479,10 @@ exec_common(const char *fname, const char **argp, const char **envp, TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up); /* Unbrand ourself if necessary. */ - if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) + if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) { + BROP(p)->b_freelwp(lwp); brand_clearbrand(p, B_FALSE); + } setregs(&args); @@ -566,10 +603,10 @@ gexec( struct uarg *args, struct intpdata *idatap, int level, - long *execsz, + size_t *execsz, caddr_t exec_file, struct cred *cred, - int brand_action) + int *brand_action) { struct vnode *vp, *execvp = NULL; proc_t *pp = ttoproc(curthread); @@ -890,8 +927,14 @@ gexec( if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE)) args->traceinval = 1; } - if (pp->p_proc_flag & P_PR_PTRACE) + + /* + * If legacy ptrace is enabled, generate the SIGTRAP. + */ + if (pp->p_proc_flag & P_PR_PTRACE) { psignal(pp, SIGTRAP); + } + if (args->traceinval) prinvalidate(&pp->p_user); } @@ -1448,7 +1491,7 @@ noexec( struct uarg *args, struct intpdata *idatap, int level, - long *execsz, + size_t *execsz, int setid, caddr_t exec_file, struct cred *cred) @@ -1555,6 +1598,27 @@ stk_add(uarg_t *args, const char *sp, enum uio_seg segflg) return (0); } +/* + * Add a fixed size byte array to the stack (only from kernel space). + */ +static int +stk_byte_add(uarg_t *args, const uint8_t *sp, size_t len) +{ + int error; + + if (STK_AVAIL(args) < sizeof (int)) + return (E2BIG); + *--args->stk_offp = args->stk_strp - args->stk_base; + + if (len > STK_AVAIL(args)) + return (E2BIG); + bcopy(sp, args->stk_strp, len); + + args->stk_strp += len; + + return (0); +} + static int stk_getptr(uarg_t *args, char *src, char **dst) { @@ -1591,6 +1655,7 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) size_t size, pad; char *argv = (char *)uap->argp; char *envp = (char *)uap->envp; + uint8_t rdata[RANDOM_LEN]; /* * Copy interpreter's name and argument to argv[0] and argv[1]. 
@@ -1647,7 +1712,9 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) } } argc = (int *)(args->stk_base + args->stk_size) - args->stk_offp; - args->arglen = args->stk_strp - args->stk_base; + args->argstrlen = args->stk_strp - args->stk_base; + + const char *envstr = args->stk_strp; /* * Add environ[] strings to the stack. @@ -1669,12 +1736,15 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) envp += ptrsize; } } + + args->envstrlen = args->stk_strp - envstr; args->na = (int *)(args->stk_base + args->stk_size) - args->stk_offp; args->ne = args->na - argc; /* - * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and - * AT_SUN_EMULATOR strings to the stack. + * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, + * AT_SUN_BRAND_NROOT, and AT_SUN_EMULATOR strings, as well as AT_RANDOM + * array, to the stack. */ if (auxvpp != NULL && *auxvpp != NULL) { if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0) @@ -1687,6 +1757,20 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) if (args->emulator != NULL && (error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0) return (error); + + /* + * For the AT_RANDOM aux vector we provide 16 bytes of random + * data. + */ + (void) random_get_pseudo_bytes(rdata, sizeof (rdata)); + + if ((error = stk_byte_add(args, rdata, sizeof (rdata))) != 0) + return (error); + + if (args->brand_nroot != NULL && + (error = stk_add(args, args->brand_nroot, + UIO_SYSSPACE)) != 0) + return (error); } /* @@ -1743,46 +1827,53 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) */ if (stk_putptr(args, usp, (char *)(uintptr_t)argc)) return (-1); + usp += ptrsize; /* - * Add argc space (ptrsize) to usp and record argv for /proc. + * For the benefit of /proc, record the user address of the argv[] array + * as well as the start of the argv string space (argv[0]). */ - up->u_argv = (uintptr_t)(usp += ptrsize); + up->u_argv = (uintptr_t)usp; + up->u_argvstrs = (uintptr_t)(&ustrp[*(offp - 1)]); + up->u_argvstrsize = args->argstrlen; /* - * Put the argv[] pointers on the stack. + * Put the argv[] pointers on the stack, including a NULL terminator. */ for (i = 0; i < argc; i++, usp += ptrsize) if (stk_putptr(args, usp, &ustrp[*--offp])) return (-1); + usp += ptrsize; /* * Copy arguments to u_psargs. */ - pslen = MIN(args->arglen, PSARGSZ) - 1; + pslen = MIN(args->argstrlen, PSARGSZ) - 1; for (i = 0; i < pslen; i++) up->u_psargs[i] = (kstrp[i] == '\0' ? ' ' : kstrp[i]); while (i < PSARGSZ) up->u_psargs[i++] = '\0'; /* - * Add space for argv[]'s NULL terminator (ptrsize) to usp and - * record envp for /proc. + * For the benefit of /proc, record the user address of the envp[] array + * as well as the start of the envp string space (envp[0]). */ - up->u_envp = (uintptr_t)(usp += ptrsize); + up->u_envp = (uintptr_t)usp; + up->u_envstrs = (uintptr_t)(&ustrp[*(offp - 1)]); + up->u_envstrsize = args->envstrlen; /* - * Put the envp[] pointers on the stack. + * Put the envp[] pointers on the stack, including a NULL terminator. */ for (i = 0; i < envc; i++, usp += ptrsize) if (stk_putptr(args, usp, &ustrp[*--offp])) return (-1); + usp += ptrsize; /* - * Add space for envp[]'s NULL terminator (ptrsize) to usp and - * remember where the stack ends, which is also where auxv begins. + * Remember where the stack ends, which is also where auxv begins. */ - args->stackend = usp += ptrsize; + args->stackend = usp; /* * Put all the argv[], envp[], and auxv strings on the stack. 
@@ -1793,7 +1884,7 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) /* * Fill in the aux vector now that we know the user stack addresses * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and - * AT_SUN_EMULATOR strings. + * AT_SUN_EMULATOR strings, as well as the AT_RANDOM array. */ if (auxvpp != NULL && *auxvpp != NULL) { if (args->to_model == DATAMODEL_NATIVE) { @@ -1806,6 +1897,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) if (args->emulator != NULL) ADDAUX(*a, AT_SUN_EMULATOR, (long)&ustrp[*--offp]) + ADDAUX(*a, AT_RANDOM, (long)&ustrp[*--offp]) + if (args->brand_nroot != NULL) { + ADDAUX(*a, + AT_SUN_BRAND_NROOT, (long)&ustrp[*--offp]) + } } else { auxv32_t **a = (auxv32_t **)auxvpp; ADDAUX(*a, @@ -1818,6 +1914,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) if (args->emulator != NULL) ADDAUX(*a, AT_SUN_EMULATOR, (int)(uintptr_t)&ustrp[*--offp]) + ADDAUX(*a, AT_RANDOM, (int)(uintptr_t)&ustrp[*--offp]) + if (args->brand_nroot != NULL) { + ADDAUX(*a, AT_SUN_BRAND_NROOT, + (int)(uintptr_t)&ustrp[*--offp]) + } } } @@ -1961,6 +2062,9 @@ exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) usrstack = (char *)USRSTACK32; } + if (args->maxstack != 0 && (uintptr_t)usrstack > args->maxstack) + usrstack = (char *)args->maxstack; + ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0); #if defined(__sparc) @@ -2056,7 +2160,7 @@ exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) delete_itimer_realprof(); if (AU_AUDITING()) - audit_exec(args->stk_base, args->stk_base + args->arglen, + audit_exec(args->stk_base, args->stk_base + args->argstrlen, args->na - args->ne, args->ne, args->pfcred); /* diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c index 5a9355ae9f..7ccf9b3221 100644 --- a/usr/src/uts/common/os/exit.c +++ b/usr/src/uts/common/os/exit.c @@ -141,11 +141,32 @@ rexit(int rval) } /* + * Bump the init_restarts kstat and let interested parties know about the + * restart. + */ +static void +restart_init_notify(zone_t *zone) +{ + nvlist_t *nvl = NULL; + + zone->zone_proc_init_restarts++; + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0 && + nvlist_add_uint32(nvl, ZONE_CB_RESTARTS, + zone->zone_proc_init_restarts) == 0) { + zone_sysevent_publish(zone, ZONE_EVENT_INIT_CLASS, + ZONE_EVENT_INIT_RESTART_SC, nvl); + } + + nvlist_free(nvl); +} + +/* * Called by proc_exit() when a zone's init exits, presumably because * it failed. As long as the given zone is still in the "running" * state, we will re-exec() init, but first we need to reset things * which are usually inherited across exec() but will break init's - * assumption that it is being exec()'d from a virgin process. Most + * assumption that it is being exec()'d from a virgin process. Most * importantly this includes closing all file descriptors (exec only * closes those marked close-on-exec) and resetting signals (exec only * resets handled signals, and we need to clear any signals which @@ -234,7 +255,7 @@ restart_init(int what, int why) siginfofree(lwp->lwp_curinfo); lwp->lwp_curinfo = NULL; } - lwp_ctmpl_clear(lwp); + lwp_ctmpl_clear(lwp, B_FALSE); /* * Reset both the process root directory and the current working @@ -286,6 +307,8 @@ restart_init(int what, int why) ASSERT(p == curproc); (void) freectty(B_TRUE); + restart_init_notify(p->p_zone); + /* * Now exec() the new init(8) on top of the current process. 
If we * succeed, the caller will treat this like a successful system call. @@ -320,7 +343,7 @@ exit(int why, int what) /* * If proc_exit() fails, then some other lwp in the process * got there first. We just have to call lwp_exit() to allow - * the other lwp to finish exiting the process. Otherwise we're + * the other lwp to finish exiting the process. Otherwise we're * restarting init, and should return. */ if (proc_exit(why, what) != 0) { @@ -333,7 +356,7 @@ exit(int why, int what) /* * Set the SEXITING flag on the process, after making sure /proc does - * not have it locked. This is done in more places than proc_exit(), + * not have it locked. This is done in more places than proc_exit(), * so it is a separate function. */ void @@ -380,8 +403,9 @@ zone_init_exit(zone_t *z, int why, int what) */ if (!z->zone_restart_init) { /* - * The zone has been set up to halt when init exits. + * The zone has been setup to halt when init exits. */ + z->zone_init_status = wstat(why, what); (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, zone_kcred()); z->zone_proc_initpid = -1; return (B_FALSE); @@ -421,6 +445,7 @@ zone_init_exit(zone_t *z, int why, int what) (void) zone_kadmin(A_REBOOT, 0, NULL, zone_kcred()); } + z->zone_init_status = wstat(why, what); z->zone_proc_initpid = -1; return (B_FALSE); } @@ -441,14 +466,16 @@ zone_init_exit(zone_t *z, int why, int what) /* * No restart modifiers on the zone, attempt to restart init. */ - if (restart_init(what, why) == 0) + if (restart_init(what, why) == 0) { return (B_TRUE); + } } /* * The restart failed, or the criteria for a restart are not met; * the zone will shut down. */ + z->zone_init_status = wstat(why, what); (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, zone_kcred()); z->zone_proc_initpid = -1; return (B_FALSE); @@ -483,7 +510,7 @@ proc_exit(int why, int what) /* * Stop and discard the process's lwps except for the current one, - * unless some other lwp beat us to it. If exitlwps() fails then + * unless some other lwp beat us to it. If exitlwps() fails then * return and the calling lwp will call (or continue in) lwp_exit(). */ proc_is_exiting(p); @@ -501,19 +528,6 @@ proc_exit(int why, int what) } mutex_exit(&p->p_lock); - DTRACE_PROC(lwp__exit); - DTRACE_PROC1(exit, int, why); - - /* - * Will perform any brand specific proc exit processing, since this - * is always the last lwp, will also perform lwp_exit and free brand - * data - */ - if (PROC_IS_BRANDED(p)) { - lwp_detach_brand_hdlrs(lwp); - brand_clearbrand(p, B_FALSE); - } - /* * Don't let init exit unless zone_start_init() failed its exec, or * we are shutting down the zone or the machine. @@ -527,6 +541,32 @@ proc_exit(int why, int what) return (0); } + /* + * Delay firing probes (and performing brand cleanup) until after the + * zone_proc_initpid check. Cases which result in zone shutdown or + * restart via zone_kadmin eventually result in a call back to + * proc_exit. + */ + DTRACE_PROC(lwp__exit); + DTRACE_PROC1(exit, int, why); + + /* + * Will perform any brand specific proc exit processing. Since this + * is always the last lwp, will also perform lwp exit/free and proc + * exit. Brand data will be freed when the process is reaped. + */ + if (PROC_IS_BRANDED(p)) { + BROP(p)->b_lwpexit(lwp); + BROP(p)->b_proc_exit(p); + /* + * To ensure that b_proc_exit has access to brand-specific data + * contained by the one remaining lwp, call the freelwp hook as + * the last part of this clean-up process. 
+ */ + BROP(p)->b_freelwp(lwp); + lwp_detach_brand_hdlrs(lwp); + } + lwp_pcb_exit(); /* @@ -693,7 +733,7 @@ proc_exit(int why, int what) semexit(p); rv = wstat(why, what); - acct(rv & 0xff); + acct(rv); exacct_commit_proc(p, rv); /* @@ -786,10 +826,22 @@ proc_exit(int why, int what) if ((q = p->p_child) != NULL && p != proc_init) { struct proc *np; struct proc *initp = proc_init; + pid_t zone_initpid = 1; + struct proc *zoneinitp = NULL; boolean_t setzonetop = B_FALSE; - if (!INGLOBALZONE(curproc)) - setzonetop = B_TRUE; + if (!INGLOBALZONE(curproc)) { + zone_initpid = curproc->p_zone->zone_proc_initpid; + + ASSERT(MUTEX_HELD(&pidlock)); + zoneinitp = prfind(zone_initpid); + if (zoneinitp != NULL) { + initp = zoneinitp; + } else { + zone_initpid = 1; + setzonetop = B_TRUE; + } + } pgdetach(p); @@ -801,7 +853,8 @@ proc_exit(int why, int what) */ delete_ns(q->p_parent, q); - q->p_ppid = 1; + q->p_ppid = zone_initpid; + q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID); if (setzonetop) { mutex_enter(&q->p_lock); @@ -959,7 +1012,7 @@ proc_exit(int why, int what) * curthread's proc pointer is changed to point to the 'sched' * process for the corresponding zone, except in the case when * the exiting process is in fact a zsched instance, in which - * case the proc pointer is set to p0. We do so, so that the + * case the proc pointer is set to p0. We do so, so that the * process still points at the right zone when we call the VN_RELE() * below. * @@ -975,8 +1028,50 @@ proc_exit(int why, int what) mutex_exit(&p->p_lock); if (!evaporate) { - p->p_pidflag &= ~CLDPEND; - sigcld(p, sqp); + /* + * The brand specific code only happens when the brand has a + * function to call in place of sigcld and the parent of the + * exiting process is not the global zone init. If the parent + * is the global zone init, then the process was reparented, + * and we don't want brand code delivering possibly strange + * signals to init. Also, init is not branded, so any brand + * specific exit data will not be picked up by init anyway. + */ + if (PROC_IS_BRANDED(p) && + BROP(p)->b_exit_with_sig != NULL && + p->p_ppid != 1) { + /* + * The code for _fini that could unload the brand_t + * blocks until the count of zones using the module + * reaches zero. Zones decrement the refcount on their + * brands only after all user tasks in that zone have + * exited and been waited on. The decrement on the + * brand's refcount happen in zone_destroy(). That + * depends on zone_shutdown() having been completed. + * zone_shutdown() includes a call to zone_empty(), + * where the zone waits for itself to reach the state + * ZONE_IS_EMPTY. This state is only set in either + * zone_shutdown(), when there are no user processes as + * the zone enters this function, or in + * zone_task_rele(). zone_task_rele() is called from + * code triggered by waiting on processes, not by the + * processes exiting through proc_exit(). This means + * all the branded processes that could exist for a + * specific brand_t must exit and get reaped before the + * refcount on the brand_t can reach 0. _fini will + * never unload the corresponding brand module before + * proc_exit finishes execution for all processes + * branded with a particular brand_t, which makes the + * operation below safe to do. Brands that wish to use + * this mechanism must wait in _fini as described + * above. 
+ */ + BROP(p)->b_exit_with_sig(p, sqp); + } else { + p->p_pidflag &= ~CLDPEND; + sigcld(p, sqp); + } + } else { /* * Do what sigcld() would do if the disposition @@ -1001,7 +1096,7 @@ proc_exit(int why, int what) /* * task_rele() may ultimately cause the zone to go away (or * may cause the last user process in a zone to go away, which - * signals zsched to go away). So prior to this call, we must + * signals zsched to go away). So prior to this call, we must * no longer point at zsched. */ t->t_procp = &p0; @@ -1055,10 +1150,9 @@ winfo(proc_t *pp, k_siginfo_t *ip, int waitflag) int waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) { - int found; proc_t *cp, *pp; - int proc_gone; int waitflag = !(options & WNOWAIT); + boolean_t have_brand_helper = B_FALSE; /* * Obsolete flag, defined here only for binary compatibility @@ -1086,7 +1180,8 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) pp = ttoproc(curthread); /* - * lock parent mutex so that sibling chain can be searched. + * Anytime you are looking for a process, you take pidlock to prevent + * things from changing as you look. */ mutex_enter(&pidlock); @@ -1106,10 +1201,37 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) return (ECHILD); } - while (pp->p_child != NULL) { + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_waitid_helper != NULL) { + have_brand_helper = B_TRUE; + } + + while (pp->p_child != NULL || have_brand_helper) { + boolean_t brand_wants_wait = B_FALSE; + int proc_gone = 0; + int found = 0; - proc_gone = 0; + /* + * Give the brand a chance to return synthetic results from + * this waitid() call before we do the real thing. + */ + if (have_brand_helper) { + int ret; + if (BROP(pp)->b_waitid_helper(idtype, id, ip, options, + &brand_wants_wait, &ret) == 0) { + mutex_exit(&pidlock); + return (ret); + } + + if (pp->p_child == NULL) { + goto no_real_children; + } + } + + /* + * Look for interesting children in the newstate list. + */ + VERIFY(pp->p_child != NULL); for (cp = pp->p_child_ns; cp != NULL; cp = cp->p_sibling_ns) { if (idtype != P_PID && (cp->p_pidflag & CLDWAITPID)) continue; @@ -1117,6 +1239,11 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) continue; if (idtype == P_PGID && id != cp->p_pgrp) continue; + if (PROC_IS_BRANDED(pp)) { + if (BROP(pp)->b_wait_filter != NULL && + BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) + continue; + } switch (cp->p_wcode) { @@ -1161,12 +1288,16 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) * Wow! None of the threads on the p_sibling_ns list were * interesting threads. Check all the kids! */ - found = 0; for (cp = pp->p_child; cp != NULL; cp = cp->p_sibling) { if (idtype == P_PID && id != cp->p_pid) continue; if (idtype == P_PGID && id != cp->p_pgrp) continue; + if (PROC_IS_BRANDED(pp)) { + if (BROP(pp)->b_wait_filter != NULL && + BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) + continue; + } switch (cp->p_wcode) { case CLD_TRAPPED: @@ -1235,11 +1366,12 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) break; } +no_real_children: /* * If we found no interesting processes at all, * break out and return ECHILD. */ - if (found + proc_gone == 0) + if (!brand_wants_wait && (found + proc_gone == 0)) break; if (options & WNOHANG) { @@ -1258,7 +1390,7 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) * change state while we wait, we don't wait at all. * Get out with ECHILD according to SVID. 
*/ - if (found == proc_gone) + if (!brand_wants_wait && (found == proc_gone)) break; if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) { @@ -1354,6 +1486,12 @@ freeproc(proc_t *p) p->p_killsqp = NULL; } + /* Clear any remaining brand data */ + if (PROC_IS_BRANDED(p)) { + brand_clearbrand(p, B_FALSE); + } + + prfree(p); /* inform /proc */ /* diff --git a/usr/src/uts/common/os/fio.c b/usr/src/uts/common/os/fio.c index c25564d85f..f6179cf301 100644 --- a/usr/src/uts/common/os/fio.c +++ b/usr/src/uts/common/os/fio.c @@ -21,7 +21,8 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015, Joyent Inc. + * Copyright 2017, Joyent Inc. + * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -488,7 +489,7 @@ free_afd(afd_t *afd) /* called below and from thread_free() */ afd->a_fd[i] = -1; } -static void +void set_active_fd(int fd) { afd_t *afd = &curthread->t_activefd; @@ -958,7 +959,22 @@ closef(file_t *fp) vp = fp->f_vnode; - error = VOP_CLOSE(vp, flag, count, offset, fp->f_cred, NULL); + /* + * The __FLXPATH flag is a private interface for use by the lx + * brand in order to emulate open(O_NOFOLLOW|O_PATH) which, + * when a symbolic link is encountered, returns a file + * descriptor which references it. + * See uts/common/brand/lx/syscall/lx_open.c + * + * When this flag is set, VOP_OPEN() will not have been called when + * this file descriptor was opened, and VOP_CLOSE() should not be + * called here (for a symlink, most filesystems would return ENOSYS + * anyway) + */ + if (fp->f_flag2 & (__FLXPATH >> 16)) + error = 0; + else + error = VOP_CLOSE(vp, flag, count, offset, fp->f_cred, NULL); if (count > 1) { mutex_exit(&fp->f_tlock); @@ -1118,7 +1134,7 @@ falloc(vnode_t *vp, int flag, file_t **fpp, int *fdp) mutex_enter(&fp->f_tlock); fp->f_count = 1; fp->f_flag = (ushort_t)flag; - fp->f_flag2 = (flag & (FSEARCH|FEXEC)) >> 16; + fp->f_flag2 = (flag & (FSEARCH|FEXEC|__FLXPATH)) >> 16; fp->f_vnode = vp; fp->f_offset = 0; fp->f_audit_data = 0; diff --git a/usr/src/uts/common/os/fork.c b/usr/src/uts/common/os/fork.c index 1caa0b9b7b..183e1f4333 100644 --- a/usr/src/uts/common/os/fork.c +++ b/usr/src/uts/common/os/fork.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright 2016, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -84,6 +84,7 @@ static int64_t cfork(int, int, int); static int getproc(proc_t **, pid_t, uint_t); #define GETPROC_USER 0x0 #define GETPROC_KERNEL 0x1 +#define GETPROC_ZSCHED 0x2 static void fork_fail(proc_t *); static void forklwp_fail(proc_t *); @@ -706,7 +707,7 @@ fork_fail(proc_t *cp) if (PTOU(curproc)->u_cwd) refstr_rele(PTOU(curproc)->u_cwd); if (PROC_IS_BRANDED(cp)) { - brand_clearbrand(cp, B_TRUE); + brand_clearbrand(cp, B_FALSE); } } @@ -755,7 +756,7 @@ forklwp_fail(proc_t *p) kmem_free(t->t_door, sizeof (door_data_t)); t->t_door = NULL; } - lwp_ctmpl_clear(ttolwp(t)); + lwp_ctmpl_clear(ttolwp(t), B_FALSE); /* * Remove the thread from the all threads list. @@ -792,6 +793,9 @@ extern struct as kas; /* * fork a kernel process. + * + * Passing a pid argument of -1 indicates that the new process should be + * launched as a child of 'zsched' within the zone. 
*/ int newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, @@ -810,6 +814,7 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, rctl_set_t *init_set; ASSERT(pid != 1); + ASSERT(pid >= 0); if (getproc(&p, pid, GETPROC_KERNEL) < 0) return (EAGAIN); @@ -853,8 +858,18 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, rctl_set_t *init_set; task_t *tk, *tk_old; klwp_t *lwp; + boolean_t pzsched = B_FALSE; + int flag = GETPROC_USER; + + /* Handle a new user-level thread as child of zsched. */ + if (pid < 0) { + VERIFY(curzone != global_zone); + flag = GETPROC_ZSCHED; + pzsched = B_TRUE; + pid = 0; + } - if (getproc(&p, pid, GETPROC_USER) < 0) + if (getproc(&p, pid, flag) < 0) return (EAGAIN); /* * init creates a new task, distinct from the task @@ -915,7 +930,8 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, } t = lwptot(lwp); - ctp = contract_process_fork(sys_process_tmpl, p, curproc, + ctp = contract_process_fork(sys_process_tmpl, p, + (pzsched ? curproc->p_zone->zone_zsched : curproc), B_FALSE); ASSERT(ctp != NULL); if (ct != NULL) @@ -956,7 +972,11 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN) return (-1); /* no point in starting new processes */ - pp = (flags & GETPROC_KERNEL) ? &p0 : curproc; + if (flags & GETPROC_ZSCHED) { + pp = curproc->p_zone->zone_zsched; + } else { + pp = (flags & GETPROC_KERNEL) ? &p0 : curproc; + } task = pp->p_task; proj = task->tk_proj; zone = pp->p_zone; @@ -1017,6 +1037,9 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) cp->p_t1_lgrpid = LGRP_NONE; cp->p_tr_lgrpid = LGRP_NONE; + /* Default to native brand initially */ + cp->p_brand = &native_brand; + if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) { if (nproc == v.v_proc) { CPU_STATS_ADDQ(CPU, sys, procovf, 1); @@ -1084,9 +1107,6 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD); cp->p_sessp = pp->p_sessp; sess_hold(pp); - cp->p_brand = pp->p_brand; - if (PROC_IS_BRANDED(pp)) - BROP(pp)->b_copy_procdata(cp, pp); cp->p_bssbase = pp->p_bssbase; cp->p_brkbase = pp->p_brkbase; cp->p_brksize = pp->p_brksize; @@ -1171,6 +1191,18 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) mutex_exit(&cp->p_lock); mutex_exit(&pidlock); + if (PROC_IS_BRANDED(pp)) { + /* + * The only reason why process branding should fail is when + * the procedure is complicated by multiple LWPs on the scene. + * With an LWP count of 0, this newly allocated process has no + * reason to fail branding. + */ + VERIFY0(brand_setbrand(cp, B_FALSE)); + + BROP(pp)->b_copy_procdata(cp, pp); + } + avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t), offsetof(contract_t, ct_ctlist)); @@ -1188,6 +1220,7 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) */ fcnt_add(P_FINFO(pp), 1); + mutex_enter(&pp->p_lock); if (PTOU(pp)->u_cdir) { VN_HOLD(PTOU(pp)->u_cdir); } else { @@ -1201,6 +1234,7 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) VN_HOLD(PTOU(pp)->u_rdir); if (PTOU(pp)->u_cwd) refstr_hold(PTOU(pp)->u_cwd); + mutex_exit(&pp->p_lock); /* * copy the parent's uarea. diff --git a/usr/src/uts/common/os/grow.c b/usr/src/uts/common/os/grow.c index da53bce24e..6e2d3c403c 100644 --- a/usr/src/uts/common/os/grow.c +++ b/usr/src/uts/common/os/grow.c @@ -21,7 +21,7 @@ /* * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. 
*/ /* @@ -55,6 +55,7 @@ #include <sys/fcntl.h> #include <sys/lwpchan_impl.h> #include <sys/nbmlock.h> +#include <sys/brand.h> #include <vm/hat.h> #include <vm/as.h> @@ -570,6 +571,20 @@ choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off, return (0); } +caddr_t +map_userlimit(proc_t *pp, struct as *as, int flags) +{ + if (flags & _MAP_LOW32) { + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_map32limit != NULL) { + return ((caddr_t)(uintptr_t)BROP(pp)->b_map32limit(pp)); + } else { + return ((caddr_t)_userlimit32); + } + } + + return (as->a_userlimit); +} + /* * Used for MAP_ANON - fast way to get anonymous pages @@ -585,8 +600,6 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags, return (EACCES); if ((flags & MAP_FIXED) != 0) { - caddr_t userlimit; - /* * Use the user address. First verify that * the address to be used is page aligned. @@ -595,9 +608,8 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags, if (((uintptr_t)*addrp & PAGEOFFSET) != 0) return (EINVAL); - userlimit = flags & _MAP_LOW32 ? - (caddr_t)USERLIMIT32 : as->a_userlimit; - switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { + switch (valid_usr_range(*addrp, len, uprot, as, + map_userlimit(as->a_proc, as, flags))) { case RANGE_OKAY: break; case RANGE_BADPROT: @@ -638,7 +650,7 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags, #define RANDOMIZABLE_MAPPING(addr, flags) (((flags & MAP_FIXED) == 0) && \ !(((flags & MAP_ALIGN) == 0) && (addr != 0) && aslr_respect_mmap_hint)) -static int +int smmap_common(caddr_t *addrp, size_t len, int prot, int flags, struct file *fp, offset_t pos) { @@ -771,8 +783,6 @@ smmap_common(caddr_t *addrp, size_t len, * If the user specified an address, do some simple checks here */ if ((flags & MAP_FIXED) != 0) { - caddr_t userlimit; - /* * Use the user address. First verify that * the address to be used is page aligned. @@ -780,10 +790,8 @@ smmap_common(caddr_t *addrp, size_t len, */ if (((uintptr_t)*addrp & PAGEOFFSET) != 0) return (EINVAL); - - userlimit = flags & _MAP_LOW32 ? - (caddr_t)USERLIMIT32 : as->a_userlimit; - switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { + switch (valid_usr_range(*addrp, len, uprot, as, + map_userlimit(curproc, as, flags))) { case RANGE_OKAY: break; case RANGE_BADPROT: diff --git a/usr/src/uts/common/os/ipc.c b/usr/src/uts/common/os/ipc.c index 86cb867da8..bf917ef716 100644 --- a/usr/src/uts/common/os/ipc.c +++ b/usr/src/uts/common/os/ipc.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -1217,6 +1218,23 @@ ipc_remove(ipc_service_t *service, kipc_perm_t *perm) (IPC_ZONE_USAGE(perm, service) == 0))); } +/* + * Perform actual IPC_RMID, either via ipc_rmid or due to a delayed *_RMID. + */ +void +ipc_rmsvc(ipc_service_t *service, kipc_perm_t *perm) +{ + ASSERT(service->ipcs_count > 0); + ASSERT(MUTEX_HELD(&service->ipcs_lock)); + + ipc_remove(service, perm); + mutex_exit(&service->ipcs_lock); + + /* perform any per-service removal actions */ + service->ipcs_rmid(perm); + + ipc_rele(service, perm); +} /* * Common code to perform an IPC_RMID. Returns an errno value on @@ -1247,13 +1265,7 @@ ipc_rmid(ipc_service_t *service, int id, cred_t *cr) /* * Nothing can fail from this point on. 
*/ - ipc_remove(service, perm); - mutex_exit(&service->ipcs_lock); - - /* perform any per-service removal actions */ - service->ipcs_rmid(perm); - - ipc_rele(service, perm); + ipc_rmsvc(service, perm); return (0); } diff --git a/usr/src/uts/common/os/kmem.c b/usr/src/uts/common/os/kmem.c index 394235f26c..4d2c1e6c10 100644 --- a/usr/src/uts/common/os/kmem.c +++ b/usr/src/uts/common/os/kmem.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2017, Joyent, Inc. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright 2018, Joyent, Inc. diff --git a/usr/src/uts/common/os/kstat_fr.c b/usr/src/uts/common/os/kstat_fr.c index 93c04cff8d..b09b2d3558 100644 --- a/usr/src/uts/common/os/kstat_fr.c +++ b/usr/src/uts/common/os/kstat_fr.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2014, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -198,6 +198,9 @@ struct { kstat_named_t pagesfree; kstat_named_t pageslocked; kstat_named_t pagestotal; + kstat_named_t lowmemscan; + kstat_named_t zonecapscan; + kstat_named_t nthrottle; } system_pages_kstat = { { "physmem", KSTAT_DATA_ULONG }, { "nalloc", KSTAT_DATA_ULONG }, @@ -219,6 +222,9 @@ struct { { "pagesfree", KSTAT_DATA_ULONG }, { "pageslocked", KSTAT_DATA_ULONG }, { "pagestotal", KSTAT_DATA_ULONG }, + { "low_mem_scan", KSTAT_DATA_ULONG }, + { "zone_cap_scan", KSTAT_DATA_ULONG }, + { "n_throttle", KSTAT_DATA_ULONG }, }; static int header_kstat_update(kstat_t *, int); @@ -912,6 +918,9 @@ system_pages_kstat_update(kstat_t *ksp, int rw) system_pages_kstat.pageslocked.value.ul = (ulong_t)(availrmem_initial - availrmem); system_pages_kstat.pagestotal.value.ul = (ulong_t)total_pages; + system_pages_kstat.lowmemscan.value.ul = (ulong_t)low_mem_scan; + system_pages_kstat.zonecapscan.value.ul = (ulong_t)zone_cap_scan; + system_pages_kstat.nthrottle.value.ul = (ulong_t)n_throttle; /* * pp_kernel represents total pages used by the kernel since the * startup. This formula takes into account the boottime kernel diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c index b5f41d93f9..6a922343e7 100644 --- a/usr/src/uts/common/os/logsubr.c +++ b/usr/src/uts/common/os/logsubr.c @@ -23,6 +23,8 @@ * Copyright 2020 Oxide Computer Company * Copyright (c) 2013 Gary Mills * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2022 Joyent, Inc. + * Copyright 2022 MNX Cloud, Inc. */ #include <sys/types.h> @@ -260,8 +262,11 @@ log_init(void) #ifdef LEGACY_BANNER printf("\rSunOS Release %s Version %s %u-bit\n", utsname.release, utsname.version, NBBY * (uint_t)sizeof (void *)); - printf("Copyright (c) 1983, 2010, Oracle and/or its affiliates. " - "All rights reserved.\n"); + /* + * Note: In the future this should be 2022-20XX, and delete this + * comment when we don't need it anymore + */ + printf("Copyright 2022 MNX Cloud, Inc.\n"); #else bootbanner_print(log_bootbanner_print, KM_SLEEP); #endif diff --git a/usr/src/uts/common/os/lwp.c b/usr/src/uts/common/os/lwp.c index b2adae570f..341e4ae356 100644 --- a/usr/src/uts/common/os/lwp.c +++ b/usr/src/uts/common/os/lwp.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc. 
*/ #include <sys/param.h> @@ -57,6 +57,8 @@ #include <sys/lgrp.h> #include <sys/rctl.h> #include <sys/contract_impl.h> +#include <sys/contract/process.h> +#include <sys/contract/process_impl.h> #include <sys/cpc_impl.h> #include <sys/sdt.h> #include <sys/cmn_err.h> @@ -115,7 +117,7 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p, ret_tidhash_t *ret_tidhash = NULL; int i; int rctlfail = 0; - boolean_t branded = 0; + void *brand_data = NULL; struct ctxop *ctx = NULL; ASSERT(cid != sysdccid); /* system threads must start in SYS */ @@ -283,6 +285,19 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p, */ lep = kmem_zalloc(sizeof (*lep), KM_SLEEP); + /* + * If necessary, speculatively allocate lwp brand data. This is done + * ahead of time so p_lock need not be dropped during lwp branding. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_lwpdata_alloc != NULL) { + if ((brand_data = BROP(p)->b_lwpdata_alloc(p)) == NULL) { + mutex_enter(&p->p_lock); + err = 1; + atomic_inc_32(&p->p_zone->zone_ffmisc); + goto error; + } + } + mutex_enter(&p->p_lock); grow: /* @@ -630,18 +645,6 @@ grow: } while (lwp_hash_lookup(p, t->t_tid) != NULL); } - /* - * If this is a branded process, let the brand do any necessary lwp - * initialization. - */ - if (PROC_IS_BRANDED(p)) { - if (BROP(p)->b_initlwp(lwp)) { - err = 1; - atomic_inc_32(&p->p_zone->zone_ffmisc); - goto error; - } - branded = 1; - } if (t->t_tid == 1) { kpreempt_disable(); @@ -654,7 +657,6 @@ grow: } } - p->p_lwpcnt++; t->t_waitfor = -1; /* @@ -696,8 +698,27 @@ grow: t->t_post_sys = 1; /* + * Perform lwp branding + * + * The b_initlwp hook is _not_ allowed to drop p->p_lock as it must be + * continuously held between when the tidhash is sized and when the lwp + * is inserted into it. Operations requiring p->p_lock to be + * temporarily dropped can be performed in b_initlwp_post. + */ + if (PROC_IS_BRANDED(p)) { + BROP(p)->b_initlwp(lwp, brand_data); + /* + * The b_initlwp hook is expected to consume any preallocated + * brand_data in a way that prepares it for deallocation by the + * b_freelwp hook. + */ + brand_data = NULL; + } + + /* * Insert the new thread into the list of all threads. */ + p->p_lwpcnt++; if ((tx = p->p_tlist) == NULL) { t->t_back = t; t->t_forw = t; @@ -718,6 +739,13 @@ grow: lep->le_start = t->t_start; lwp_hash_in(p, lep, p->p_tidhash, p->p_tidhash_sz, 1); + /* + * Complete lwp branding + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_initlwp_post != NULL) { + BROP(p)->b_initlwp_post(lwp); + } + lwp_fp_init(lwp); if (state == TS_RUN) { @@ -755,8 +783,9 @@ error: if (cid != NOCLASS && bufp != NULL) CL_FREE(cid, bufp); - if (branded) - BROP(p)->b_freelwp(lwp); + if (brand_data != NULL) { + BROP(p)->b_lwpdata_free(brand_data); + } mutex_exit(&p->p_lock); t->t_state = TS_FREE; @@ -829,8 +858,27 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src) int i; for (i = 0; i < ct_ntypes; i++) { - dst->lwp_ct_active[i] = ctmpl_dup(src->lwp_ct_active[i]); + ct_template_t *tmpl = src->lwp_ct_active[i]; + + /* + * If the process contract template is setup to be preserved + * across exec, then if we're forking, perform an implicit + * template_clear now. This ensures that future children of + * this child will remain in the same contract unless they're + * explicitly setup differently. We know we're forking if the + * two LWPs belong to different processes. 
+ */ + if (i == CTT_PROCESS && tmpl != NULL) { + ctmpl_process_t *ctp = tmpl->ctmpl_data; + + if (dst->lwp_procp != src->lwp_procp && + (ctp->ctp_params & CT_PR_KEEP_EXEC) != 0) + tmpl = NULL; + } + + dst->lwp_ct_active[i] = ctmpl_dup(tmpl); dst->lwp_ct_latest[i] = NULL; + } } @@ -838,21 +886,33 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src) * Clear an LWP's contract template state. */ void -lwp_ctmpl_clear(klwp_t *lwp) +lwp_ctmpl_clear(klwp_t *lwp, boolean_t is_exec) { ct_template_t *tmpl; int i; for (i = 0; i < ct_ntypes; i++) { - if ((tmpl = lwp->lwp_ct_active[i]) != NULL) { - ctmpl_free(tmpl); - lwp->lwp_ct_active[i] = NULL; - } - if (lwp->lwp_ct_latest[i] != NULL) { contract_rele(lwp->lwp_ct_latest[i]); lwp->lwp_ct_latest[i] = NULL; } + + if ((tmpl = lwp->lwp_ct_active[i]) != NULL) { + /* + * If we're exec-ing a new program and the process + * contract template is setup to be preserved across + * exec, then don't clear it. + */ + if (is_exec && i == CTT_PROCESS) { + ctmpl_process_t *ctp = tmpl->ctmpl_data; + + if ((ctp->ctp_params & CT_PR_KEEP_EXEC) != 0) + continue; + } + + ctmpl_free(tmpl); + lwp->lwp_ct_active[i] = NULL; + } } } @@ -893,13 +953,6 @@ lwp_exit(void) if (t->t_upimutex != NULL) upimutex_cleanup(); - /* - * Perform any brand specific exit processing, then release any - * brand data associated with the lwp - */ - if (PROC_IS_BRANDED(p)) - BROP(p)->b_lwpexit(lwp); - lwp_pcb_exit(); mutex_enter(&p->p_lock); @@ -943,6 +996,18 @@ lwp_exit(void) DTRACE_PROC(lwp__exit); /* + * Perform any brand specific exit processing, then release any + * brand data associated with the lwp + */ + if (PROC_IS_BRANDED(p)) { + mutex_exit(&p->p_lock); + BROP(p)->b_lwpexit(lwp); + BROP(p)->b_freelwp(lwp); + mutex_enter(&p->p_lock); + prbarrier(p); + } + + /* * If the lwp is a detached lwp or if the process is exiting, * remove (lwp_hash_out()) the lwp from the lwp directory. * Otherwise null out the lwp's le_thread pointer in the lwp @@ -1103,7 +1168,7 @@ lwp_cleanup(void) } kpreempt_enable(); - lwp_ctmpl_clear(ttolwp(t)); + lwp_ctmpl_clear(ttolwp(t), B_FALSE); } int diff --git a/usr/src/uts/common/os/main.c b/usr/src/uts/common/os/main.c index 148916d4d8..c57f8a7d2c 100644 --- a/usr/src/uts/common/os/main.c +++ b/usr/src/uts/common/os/main.c @@ -159,7 +159,7 @@ exec_init(const char *initpath, const char *args) int error = 0, count = 0; proc_t *p = ttoproc(curthread); klwp_t *lwp = ttolwp(curthread); - int brand_action; + int brand_action = EBA_NONE; if (args == NULL) args = ""; @@ -289,7 +289,15 @@ exec_init(const char *initpath, const char *args) */ sigemptyset(&curthread->t_hold); - brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE; + /* + * Only instruct exec_common to brand the process if necessary. It is + * possible that the init process is already properly branded due to the + * proc_exit -> restart_init -> exec_init call chain. + */ + if (ZONE_IS_BRANDED(p->p_zone) && + p->p_brand != p->p_zone->zone_brand) { + brand_action = EBA_BRAND; + } again: error = exec_common((const char *)exec_fnamep, (const char **)uap, NULL, brand_action); diff --git a/usr/src/uts/common/os/mem_config.c b/usr/src/uts/common/os/mem_config.c index 4c4e78578b..fd74dd3092 100644 --- a/usr/src/uts/common/os/mem_config.c +++ b/usr/src/uts/common/os/mem_config.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. 
*/ #include <sys/types.h> @@ -1638,7 +1639,7 @@ delthr_get_freemem(struct mem_handle *mhp) * Put pressure on pageout. */ page_needfree(free_get); - cv_signal(&proc_pageout->p_cv); + WAKE_PAGEOUT_SCANNER(); mutex_enter(&mhp->mh_mutex); (void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex, diff --git a/usr/src/uts/common/os/mmapobj.c b/usr/src/uts/common/os/mmapobj.c index d85df39a62..819d32116d 100644 --- a/usr/src/uts/common/os/mmapobj.c +++ b/usr/src/uts/common/os/mmapobj.c @@ -1367,10 +1367,15 @@ calc_loadable(Ehdr *ehdrp, caddr_t phdrbase, int nphdrs, size_t *len, } if (num_segs++ == 0) { /* - * The p_vaddr of the first PT_LOAD segment - * must either be NULL or within the first - * page in order to be interpreted. - * Otherwise, its an invalid file. + * While ELF doesn't specify the meaning of + * p_vaddr for PT_LOAD segments in ET_DYN + * objects, we mandate that is either NULL or + * (to accommodate some historical binaries) + * within the first page. (Note that there + * exist non-native ET_DYN objects that violate + * this constraint that we nonetheless must be + * able to execute; see the ET_DYN handling in + * mapelfexec() for details.) */ if (e_type == ET_DYN && ((caddr_t)((uintptr_t)vaddr & diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c index 37389a6e4d..d48be19c71 100644 --- a/usr/src/uts/common/os/pid.c +++ b/usr/src/uts/common/os/pid.c @@ -113,6 +113,18 @@ pid_lookup(pid_t pid) return (pidp); } +struct pid * +pid_find(pid_t pid) +{ + struct pid *pidp; + + mutex_enter(&pidlinklock); + pidp = pid_lookup(pid); + mutex_exit(&pidlinklock); + + return (pidp); +} + void pid_setmin(void) { @@ -521,6 +533,19 @@ sprunlock(proc_t *p) mutex_exit(&p->p_lock); } +/* + * Undo effects of sprlock but without dropping p->p_lock + */ +void +sprunprlock(proc_t *p) +{ + ASSERT(p->p_proc_flag & P_PR_LOCK); + ASSERT(MUTEX_HELD(&p->p_lock)); + + cv_signal(&pr_pid_cv[p->p_slot]); + p->p_proc_flag &= ~P_PR_LOCK; +} + void pid_init(void) { diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c index 0e4bd2c73d..b3f01cfab2 100644 --- a/usr/src/uts/common/os/policy.c +++ b/usr/src/uts/common/os/policy.c @@ -57,6 +57,7 @@ #include <sys/mntent.h> #include <sys/contract_impl.h> #include <sys/dld_ioc.h> +#include <sys/brand.h> /* * There are two possible layers of privilege routines and two possible @@ -1275,6 +1276,22 @@ secpolicy_vnode_owner(const cred_t *cr, uid_t owner) void secpolicy_setid_clear(vattr_t *vap, cred_t *cr) { + proc_t *p = curproc; + + /* + * Allow the brand to override this behaviour. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_setid_clear != NULL) { + /* + * This brand hook will return 0 if handling is complete, or + * some other value if the brand would like us to fall back to + * the usual behaviour. 
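As a concrete illustration of the hook convention above, this is a minimal sketch of what a brand's b_setid_clear implementation might look like. The prototype is inferred from the call site below, the name mybrand_setid_clear is hypothetical, and the policy check simply reuses secpolicy_vnode_setid_retain() from this file; none of it is part of the change itself.

/*
 * Hypothetical brand hook for secpolicy_setid_clear().  Returning 0 tells
 * the caller that the brand has finished handling the attributes; any other
 * value falls back to the native clearing behaviour.
 */
static int
mybrand_setid_clear(vattr_t *vap, cred_t *cr)
{
	/* No setid bits present, so there is nothing to clear. */
	if ((vap->va_mode & (S_ISUID | S_ISGID)) == 0)
		return (0);

	/* Privileged callers may keep the setid bits in this brand. */
	if (secpolicy_vnode_setid_retain(cr,
	    (vap->va_mode & S_ISUID) != 0) == 0)
		return (0);	/* handled: leave va_mode untouched */

	return (-1);		/* not handled: use the default behaviour */
}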
+ */ + if (BROP(p)->b_setid_clear(vap, cr) == 0) { + return; + } + } + if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(cr, (vap->va_mode & S_ISUID) != 0 && @@ -2123,6 +2140,13 @@ secpolicy_meminfo(const cred_t *cr) } int +secpolicy_fs_import(const cred_t *cr) +{ + return (PRIV_POLICY(cr, PRIV_SYS_FS_IMPORT, B_FALSE, EPERM, NULL)); +} + + +int secpolicy_pfexec_register(const cred_t *cr) { return (PRIV_POLICY(cr, PRIV_SYS_ADMIN, B_TRUE, EPERM, NULL)); @@ -2639,3 +2663,11 @@ secpolicy_ppp_config(const cred_t *cr) return (secpolicy_net_config(cr, B_FALSE)); return (PRIV_POLICY(cr, PRIV_SYS_PPP_CONFIG, B_FALSE, EPERM, NULL)); } + +int +secpolicy_hyprlofs_control(const cred_t *cr) +{ + if (PRIV_POLICY(cr, PRIV_HYPRLOFS_CONTROL, B_FALSE, EPERM, NULL)) + return (EPERM); + return (0); +} diff --git a/usr/src/uts/common/os/priv_defs b/usr/src/uts/common/os/priv_defs index 186aafc460..05979dd236 100644 --- a/usr/src/uts/common/os/priv_defs +++ b/usr/src/uts/common/os/priv_defs @@ -177,6 +177,10 @@ privilege PRIV_GRAPHICS_MAP Allows a process to perform privileged mappings through a graphics device. +privilege PRIV_HYPRLOFS_CONTROL + + Allows a process to manage hyprlofs entries. + privilege PRIV_IPC_DAC_READ Allows a process to read a System V IPC @@ -377,6 +381,10 @@ privilege PRIV_SYS_DEVICES Allows a process to open the real console device directly. Allows a process to open devices that have been exclusively opened. +privilege PRIV_SYS_FS_IMPORT + + Allows a process to import a potentially untrusted file system. + privilege PRIV_SYS_IPC_CONFIG Allows a process to increase the size of a System V IPC Message diff --git a/usr/src/uts/common/os/rctl.c b/usr/src/uts/common/os/rctl.c index 81a1b5454a..8f52f4ef3a 100644 --- a/usr/src/uts/common/os/rctl.c +++ b/usr/src/uts/common/os/rctl.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017, Joyent, Inc. */ #include <sys/atomic.h> @@ -194,6 +195,8 @@ id_space_t *rctl_ids; kmem_cache_t *rctl_cache; /* kmem cache for rctl structures */ kmem_cache_t *rctl_val_cache; /* kmem cache for rctl values */ +extern rctl_hndl_t rc_process_maxlockedmem; + kmutex_t rctl_lists_lock; rctl_dict_entry_t *rctl_lists[RC_MAX_ENTITY + 1]; @@ -2870,12 +2873,12 @@ rctl_init(void) * rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, * int chargeproc) * - * Increments the amount of locked memory on a project, and - * zone. If proj is non-NULL the project must be held by the - * caller; if it is NULL the proj and zone of proc_t p are used. - * If chargeproc is non-zero, then the charged amount is cached - * on p->p_locked_mem so that the charge can be migrated when a - * process changes projects. + * Increments the amount of locked memory on a process, project, and + * zone. If 'proj' is non-NULL, the project must be held by the + * caller; if it is NULL, the project and zone of process 'p' are used. + * If 'chargeproc' is non-zero, then the charged amount is added + * to p->p_locked_mem. This is also used so that the charge can be + * migrated when a process changes projects. 
* * Return values * 0 - success @@ -2893,6 +2896,7 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, ASSERT(p != NULL); ASSERT(MUTEX_HELD(&p->p_lock)); + if (proj != NULL) { projp = proj; zonep = proj->kpj_zone; @@ -2936,11 +2940,23 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, } } - zonep->zone_locked_mem += inc; - projp->kpj_data.kpd_locked_mem += inc; if (chargeproc != 0) { + /* Check for overflow */ + if ((p->p_locked_mem + inc) < p->p_locked_mem) { + ret = EAGAIN; + goto out; + } + if (rctl_test_entity(rc_process_maxlockedmem, p->p_rctls, p, + &e, inc, 0) & RCT_DENY) { + ret = EAGAIN; + goto out; + } + p->p_locked_mem += inc; } + + zonep->zone_locked_mem += inc; + projp->kpj_data.kpd_locked_mem += inc; out: mutex_exit(&zonep->zone_mem_lock); return (ret); diff --git a/usr/src/uts/common/os/rctl_proc.c b/usr/src/uts/common/os/rctl_proc.c index 9b7324fe7b..c62540d2b4 100644 --- a/usr/src/uts/common/os/rctl_proc.c +++ b/usr/src/uts/common/os/rctl_proc.c @@ -21,6 +21,7 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. */ #include <sys/types.h> @@ -32,6 +33,7 @@ #include <sys/port_kernel.h> #include <sys/signal.h> #include <sys/var.h> +#include <sys/policy.h> #include <sys/vmparam.h> #include <sys/machparam.h> @@ -66,6 +68,7 @@ rctl_hndl_t rc_process_semmsl; rctl_hndl_t rc_process_semopm; rctl_hndl_t rc_process_portev; rctl_hndl_t rc_process_sigqueue; +rctl_hndl_t rc_process_maxlockedmem; /* * process.max-cpu-time / RLIMIT_CPU @@ -212,6 +215,26 @@ static rctl_ops_t proc_vmem_ops = { }; /* + * process.max-locked-memory + */ +/*ARGSUSED*/ +static int +proc_maxlockedmem_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e, + struct rctl_val *rv, rctl_qty_t i, uint_t f) +{ + if (secpolicy_lock_memory(CRED()) == 0) + return (0); + return ((p->p_locked_mem + i) > rv->rcv_value); +} + +static rctl_ops_t proc_maxlockedmem_ops = { + rcop_no_action, + rcop_no_usage, + rcop_no_set, + proc_maxlockedmem_test +}; + +/* * void rctlproc_default_init() * * Overview @@ -383,6 +406,11 @@ rctlproc_init(void) rctl_add_default_limit("process.max-sigqueue-size", _SIGQUEUE_SIZE_PRIVILEGED, RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY); + rc_process_maxlockedmem = rctl_register("process.max-locked-memory", + RCENTITY_PROCESS, RCTL_GLOBAL_LOWERABLE | RCTL_GLOBAL_DENY_ALWAYS | + RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_BYTES, + ULONG_MAX, UINT32_MAX, &proc_maxlockedmem_ops); + /* * Place minimal set of controls on "sched" process for inheritance by * processes created via newproc(). diff --git a/usr/src/uts/common/os/sched.c b/usr/src/uts/common/os/sched.c index bc6df6afba..6eb1194af3 100644 --- a/usr/src/uts/common/os/sched.c +++ b/usr/src/uts/common/os/sched.c @@ -27,6 +27,10 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ +/* + * Copyright (c) 2015, Joyent, Inc. All rights reserved. + */ + #include <sys/param.h> #include <sys/types.h> #include <sys/sysmacros.h> @@ -649,16 +653,17 @@ top: klwp_t *lwp = ttolwp(tp); /* - * Swapout eligible lwps (specified by the scheduling - * class) which don't have TS_DONT_SWAP set. Set the - * "intent to swap" flag (TS_SWAPENQ) on threads - * which have TS_DONT_SWAP set so that they can be + * Swapout eligible lwps (specified by the scheduling class) + * which don't have TS_DONT_SWAP set. 
Set the "intent to swap" + * flag (TS_SWAPENQ) on threads which have either TS_DONT_SWAP + * set or are currently on a split stack so that they can be * swapped if and when they reach a safe point. */ thread_lock(tp); thread_pri = CL_SWAPOUT(tp, swapflags); if (thread_pri != -1) { - if (tp->t_schedflag & TS_DONT_SWAP) { + if ((tp->t_schedflag & TS_DONT_SWAP) || + (tp->t_flag & T_SPLITSTK)) { tp->t_schedflag |= TS_SWAPENQ; tp->t_trapret = 1; aston(tp); diff --git a/usr/src/uts/common/os/shm.c b/usr/src/uts/common/os/shm.c index 8f98fcb3f0..d0611eb9bb 100644 --- a/usr/src/uts/common/os/shm.c +++ b/usr/src/uts/common/os/shm.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -319,6 +320,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) size_t share_size; struct shm_data ssd; uintptr_t align_hint; + long curprot; /* * Pick a share pagesize to use, if (!isspt(sp)). @@ -453,6 +455,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) } } + curprot = sp->shm_opts & SHM_PROT_MASK; if (!isspt(sp)) { error = sptcreate(size, &segspt, sp->shm_amp, prot, flags, share_szc); @@ -462,8 +465,8 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) } sp->shm_sptinfo->sptas = segspt->s_as; sp->shm_sptseg = segspt; - sp->shm_sptprot = prot; - } else if ((prot & sp->shm_sptprot) != sp->shm_sptprot) { + sp->shm_opts = (sp->shm_opts & ~SHM_PROT_MASK) | prot; + } else if ((prot & curprot) != curprot) { /* * Ensure we're attaching to an ISM segment with * fewer or equal permissions than what we're @@ -748,6 +751,23 @@ shmctl(int shmid, int cmd, void *arg) } break; + /* Stage segment for removal, but don't remove until last detach */ + case SHM_RMID: + if ((error = secpolicy_ipc_owner(cr, (kipc_perm_t *)sp)) != 0) + break; + + /* + * If attached, just mark it as a pending remove, otherwise + * we must perform the normal ipc_rmid now. + */ + if ((sp->shm_perm.ipc_ref - 1) > 0) { + sp->shm_opts |= SHM_RM_PENDING; + } else { + mutex_exit(lock); + return (ipc_rmid(shm_svc, shmid, cr)); + } + break; + default: error = EINVAL; break; @@ -778,6 +798,23 @@ shm_detach(proc_t *pp, segacct_t *sap) sp->shm_ismattch--; sp->shm_dtime = gethrestime_sec(); sp->shm_lpid = pp->p_pid; + if ((sp->shm_opts & SHM_RM_PENDING) != 0 && + sp->shm_perm.ipc_ref == 2) { + /* + * If this is the last detach of the segment across the whole + * system then now we can perform the delayed IPC_RMID. + * The ipc_ref count has 1 for the original 'get' and one for + * each 'attach' (see 'stat' handling in shmctl). + */ + sp->shm_opts &= ~SHM_RM_PENDING; + mutex_enter(&shm_svc->ipcs_lock); + ipc_rmsvc(shm_svc, (kipc_perm_t *)sp); /* Drops lock */ + ASSERT(!MUTEX_HELD(&shm_svc->ipcs_lock)); + ASSERT(((kipc_perm_t *)sp)->ipc_ref > 0); + + /* Lock was dropped, need to retake it for following rele. */ + (void) ipc_lock(shm_svc, sp->shm_perm.ipc_id); + } ipc_rele(shm_svc, (kipc_perm_t *)sp); /* Drops lock */ kmem_free(sap, sizeof (segacct_t)); diff --git a/usr/src/uts/common/os/sig.c b/usr/src/uts/common/os/sig.c index 453b1f22d4..67a93581dd 100644 --- a/usr/src/uts/common/os/sig.c +++ b/usr/src/uts/common/os/sig.c @@ -22,7 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -60,6 +60,7 @@ #include <sys/cyclic.h> #include <sys/dtrace.h> #include <sys/sdt.h> +#include <sys/brand.h> #include <sys/signalfd.h> const k_sigset_t nullsmask = {0, 0, 0}; @@ -148,6 +149,21 @@ signal_is_blocked(kthread_t *t, int sig) } /* + * Return true if the signal can safely be ignored. + * That is, if the signal is included in the p_ignore mask and doing so is not + * forbidden by any process branding. + */ +static int +sig_ignorable(proc_t *p, klwp_t *lwp, int sig) +{ + return (sigismember(&p->p_ignore, sig) && /* sig in ignore mask */ + !(PROC_IS_BRANDED(p) && /* allowed by brand */ + BROP(p)->b_sig_ignorable != NULL && + BROP(p)->b_sig_ignorable(p, lwp, sig) == B_FALSE)); + +} + +/* * Return true if the signal can safely be discarded on generation. * That is, if there is no need for the signal on the receiving end. * The answer is true if the process is a zombie or @@ -159,12 +175,13 @@ signal_is_blocked(kthread_t *t, int sig) * the signal is not being accepted via sigwait() */ static int -sig_discardable(proc_t *p, int sig) +sig_discardable(proc_t *p, kthread_t *tp, int sig) { kthread_t *t = p->p_tlist; + klwp_t *lwp = (tp == NULL) ? NULL : tp->t_lwp; return (t == NULL || /* if zombie or ... */ - (sigismember(&p->p_ignore, sig) && /* signal is ignored */ + (sig_ignorable(p, lwp, sig) && /* signal is ignored */ t->t_forw == t && /* and single-threaded */ !tracing(p, sig) && /* and no /proc tracing */ !signal_is_blocked(t, sig) && /* and signal not blocked */ @@ -200,7 +217,7 @@ eat_signal(kthread_t *t, int sig) !(ttoproc(t)->p_proc_flag & P_PR_LOCK)) { ttoproc(t)->p_stopsig = 0; t->t_dtrace_stop = 0; - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; setrun_locked(t); } else if (t != curthread && t->t_state == TS_ONPROC) { aston(t); /* make it do issig promptly */ @@ -297,7 +314,7 @@ sigtoproc(proc_t *p, kthread_t *t, int sig) } } - if (sig_discardable(p, sig)) { + if (sig_discardable(p, t, sig)) { DTRACE_PROC3(signal__discard, kthread_t *, p->p_tlist, proc_t *, p, int, sig); return; @@ -497,7 +514,7 @@ issig_justlooking(void) if (sigismember(&set, sig) && (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig))) { + !sig_ignorable(p, lwp, sig))) { /* * Don't promote a signal that will stop * the process when lwp_nostop is set. @@ -623,6 +640,28 @@ issig_forreal(void) } /* + * The brand hook name 'b_issig_stop' is a misnomer. + * Allow the brand the chance to alter (or suppress) delivery + * of this signal. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_issig_stop != NULL) { + int r; + + /* + * The brand hook will return 0 if it would like + * us to drive on, -1 if we should restart + * the loop to check other conditions, or 1 if we + * should terminate the loop. + */ + r = BROP(p)->b_issig_stop(p, lwp); + if (r < 0) { + continue; + } else if (r > 0) { + break; + } + } + + /* * Honor requested stop before dealing with the * current signal; a debugger may change it. 
* Do not want to go back to loop here since this is a special @@ -656,7 +695,7 @@ issig_forreal(void) lwp->lwp_cursig = 0; lwp->lwp_extsig = 0; if (sigismember(&t->t_sigwait, sig) || - (!sigismember(&p->p_ignore, sig) && + (!sig_ignorable(p, lwp, sig) && !isjobstop(sig))) { if (p->p_flag & (SEXITLWPS|SKILLED)) { sig = SIGKILL; @@ -708,7 +747,7 @@ issig_forreal(void) toproc = 0; if (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig)) { + !sig_ignorable(p, lwp, sig)) { if (sigismember(&t->t_extsig, sig)) ext = 1; break; @@ -722,7 +761,7 @@ issig_forreal(void) toproc = 1; if (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig)) { + !sig_ignorable(p, lwp, sig)) { if (sigismember(&p->p_extsig, sig)) ext = 1; break; @@ -954,6 +993,16 @@ stop(int why, int what) } break; + case PR_BRAND: + /* + * We have been stopped by the brand code for a brand-private + * reason. This is an asynchronous stop affecting only this + * LWP. + */ + VERIFY(PROC_IS_BRANDED(p)); + flags &= ~TS_BSTART; + break; + default: /* /proc stop */ flags &= ~TS_PSTART; /* @@ -1065,7 +1114,7 @@ stop(int why, int what) } } - if (why != PR_JOBCONTROL && why != PR_CHECKPOINT) { + if (why != PR_JOBCONTROL && why != PR_CHECKPOINT && why != PR_BRAND) { /* * Do process-level notification when all lwps are * either stopped on events of interest to /proc @@ -1171,6 +1220,13 @@ stop(int why, int what) if (why == PR_CHECKPOINT) del_one_utstop(); + /* + * Allow the brand to post notification of this stop condition. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_stop_notify != NULL) { + BROP(p)->b_stop_notify(p, lwp, why, what); + } + thread_lock(t); ASSERT((t->t_schedflag & TS_ALLSTART) == 0); t->t_schedflag |= flags; @@ -1192,7 +1248,7 @@ stop(int why, int what) (p->p_flag & (SEXITLWPS|SKILLED))) { p->p_stopsig = 0; thread_lock(t); - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; setrun_locked(t); thread_unlock_nopreempt(t); } else if (why == PR_JOBCONTROL) { @@ -1327,7 +1383,7 @@ psig(void) * this signal from pending to current (we dropped p->p_lock). * This can happen only in a multi-threaded process. */ - if (sigismember(&p->p_ignore, sig) || + if (sig_ignorable(p, lwp, sig) || (func == SIG_DFL && sigismember(&stopdefault, sig))) { lwp->lwp_cursig = 0; lwp->lwp_extsig = 0; @@ -1771,9 +1827,12 @@ post_sigcld(proc_t *cp, sigqueue_t *sqp) /* * This can only happen when the parent is init. * (See call to sigcld(q, NULL) in exit().) - * Use KM_NOSLEEP to avoid deadlock. + * Use KM_NOSLEEP to avoid deadlock. The child procs + * initpid can be 1 for zlogin. */ - ASSERT(pp == proc_init); + ASSERT(pp->p_pidp->pid_id == + cp->p_zone->zone_proc_initpid || + pp->p_pidp->pid_id == 1); winfo(cp, &info, 0); sigaddq(pp, NULL, &info, KM_NOSLEEP); } else { @@ -1804,6 +1863,15 @@ sigcld_repost() sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); mutex_enter(&pidlock); + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_sigcld_repost != NULL) { + /* + * Allow the brand to inject synthetic SIGCLD signals. 
+ */ + if (BROP(pp)->b_sigcld_repost(pp, sqp) == 0) { + mutex_exit(&pidlock); + return; + } + } for (cp = pp->p_child; cp; cp = cp->p_sibling) { if (cp->p_pidflag & CLDPEND) { post_sigcld(cp, sqp); @@ -2115,7 +2183,7 @@ sigaddqa(proc_t *p, kthread_t *t, sigqueue_t *sigqp) ASSERT(MUTEX_HELD(&p->p_lock)); ASSERT(sig >= 1 && sig < NSIG); - if (sig_discardable(p, sig)) + if (sig_discardable(p, t, sig)) siginfofree(sigqp); else sigaddqins(p, t, sigqp); @@ -2141,7 +2209,7 @@ sigaddq(proc_t *p, kthread_t *t, k_siginfo_t *infop, int km_flags) * blocking the signal (it *could* change it's mind while * the signal is pending) then don't bother creating one. */ - if (!sig_discardable(p, sig) && + if (!sig_discardable(p, t, sig) && (sigismember(&p->p_siginfo, sig) || (curproc->p_ct_process != p->p_ct_process) || (sig == SIGCLD && SI_FROMKERNEL(infop))) && diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c index c137a498d1..90a9ea6f0f 100644 --- a/usr/src/uts/common/os/streamio.c +++ b/usr/src/uts/common/os/streamio.c @@ -78,6 +78,7 @@ #include <sys/policy.h> #include <sys/dld.h> #include <sys/zone.h> +#include <sys/limits.h> #include <sys/ptms.h> #include <sys/limits.h> #include <c2/audit.h> @@ -3267,6 +3268,7 @@ job_control_type(int cmd) case JAGENT: /* Obsolete */ case JTRUN: /* Obsolete */ case JXTPROTO: /* Obsolete */ + case TIOCSETLD: return (JCSETP); } diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c index fdd0c06aee..f2b91365d9 100644 --- a/usr/src/uts/common/os/strsubr.c +++ b/usr/src/uts/common/os/strsubr.c @@ -26,6 +26,7 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2018 Joyent, Inc. * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. * Copyright 2018 Joyent, Inc. * Copyright 2022 Garrett D'Amore diff --git a/usr/src/uts/common/os/sunddi.c b/usr/src/uts/common/os/sunddi.c index 30cc5744c2..7c094a0f20 100644 --- a/usr/src/uts/common/os/sunddi.c +++ b/usr/src/uts/common/os/sunddi.c @@ -5822,6 +5822,12 @@ ddi_ffs(long mask) return (ffs(mask)); } +int +ddi_ffsll(long long mask) +{ + return (ffs(mask)); +} + /* * Find last bit set. Take mask and clear * all but the most significant bit, and @@ -5833,8 +5839,14 @@ ddi_ffs(long mask) int ddi_fls(long mask) { + return (ddi_flsll(mask)); +} + +int +ddi_flsll(long long mask) +{ while (mask) { - long nx; + long long nx; if ((nx = (mask & (mask - 1))) == 0) break; diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c index e87e6d8d29..dca168b642 100644 --- a/usr/src/uts/common/os/sysent.c +++ b/usr/src/uts/common/os/sysent.c @@ -23,6 +23,7 @@ * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2012 Milan Jurik. All rights reserved. * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright (c) 2018, Joyent, Inc. 
* Copyright 2020 Oxide Computer Company */ @@ -62,8 +63,7 @@ struct mmaplf32a; int access(char *, int); int alarm(int); int auditsys(struct auditcalls *, rval_t *); -int64_t brandsys(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, - uintptr_t); +int64_t brandsys(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t); intptr_t brk(caddr_t); int chdir(char *); int chmod(char *, int); @@ -645,7 +645,7 @@ struct sysent sysent[NSYSCALL] = SYSENT_NOSYS(), SYSENT_C("llseek", llseek32, 4)), /* 176 */ SYSENT_LOADABLE(), /* inst_sync */ - /* 177 */ SYSENT_CI("brandsys", brandsys, 6), + /* 177 */ SYSENT_CI("brandsys", brandsys, 5), /* 178 */ SYSENT_LOADABLE(), /* kaio */ /* 179 */ SYSENT_LOADABLE(), /* cpc */ /* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3), @@ -1000,7 +1000,7 @@ struct sysent sysent32[NSYSCALL] = /* 174 */ SYSENT_CI("pwrite", pwrite32, 4), /* 175 */ SYSENT_C("llseek", llseek32, 4), /* 176 */ SYSENT_LOADABLE32(), /* inst_sync */ - /* 177 */ SYSENT_CI("brandsys", brandsys, 6), + /* 177 */ SYSENT_CI("brandsys", brandsys, 5), /* 178 */ SYSENT_LOADABLE32(), /* kaio */ /* 179 */ SYSENT_LOADABLE32(), /* cpc */ /* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3), @@ -1092,18 +1092,20 @@ char **syscallnames; systrace_sysent_t *systrace_sysent; void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); /*ARGSUSED*/ void systrace_stub(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, - uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, + uintptr_t arg6, uintptr_t arg7) {} /*ARGSUSED*/ int64_t dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, + uintptr_t arg7) { systrace_sysent_t *sy = &systrace_sysent[curthread->t_sysnum]; dtrace_id_t id; @@ -1111,7 +1113,8 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, proc_t *p; if ((id = sy->stsy_entry) != DTRACE_IDNONE) - (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5); + (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, + arg6, arg7); /* * We want to explicitly allow DTrace consumers to stop a process @@ -1125,14 +1128,15 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, } mutex_exit(&p->p_lock); - rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); + rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, + arg6, arg7); if (ttolwp(curthread)->lwp_errno != 0) rval = -1; if ((id = sy->stsy_return) != DTRACE_IDNONE) (*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, - (uintptr_t)((int64_t)rval >> 32), 0, 0, 0); + (uintptr_t)((int64_t)rval >> 32), 0, 0, 0, 0, 0); return (rval); } @@ -1144,7 +1148,8 @@ systrace_sysent_t *systrace_sysent32; /*ARGSUSED*/ int64_t dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, + uintptr_t arg7) { systrace_sysent_t *sy = &systrace_sysent32[curthread->t_sysnum]; dtrace_id_t id; @@ -1152,7 +1157,8 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, proc_t *p; if ((id = sy->stsy_entry) != DTRACE_IDNONE) - (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5); + (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, arg6, + arg7); /* * We want to 
explicitly allow DTrace consumers to stop a process @@ -1166,14 +1172,15 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, } mutex_exit(&p->p_lock); - rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); + rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, arg6, + arg7); if (ttolwp(curthread)->lwp_errno != 0) rval = -1; if ((id = sy->stsy_return) != DTRACE_IDNONE) (*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, - (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0); + (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0, 0, 0); return (rval); } @@ -1201,5 +1208,5 @@ dtrace_systrace_rtt(void) } if ((id = sy->stsy_return) != DTRACE_IDNONE) - (*systrace_probe)(id, 0, 0, 0, 0, 0, 0); + (*systrace_probe)(id, 0, 0, 0, 0, 0, 0, 0, 0); } diff --git a/usr/src/uts/common/os/timer.c b/usr/src/uts/common/os/timer.c index c78a545360..f587430625 100644 --- a/usr/src/uts/common/os/timer.c +++ b/usr/src/uts/common/os/timer.c @@ -82,6 +82,7 @@ timer_lock(proc_t *p, itimer_t *it) * waiters. p_lock must be held on entry; it will not be dropped by * timer_unlock(). */ +/* ARGSUSED */ static void timer_unlock(proc_t *p, itimer_t *it) { @@ -139,7 +140,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it) it->it_backend->clk_timer_delete(it); - if (it->it_portev) { + if (it->it_flags & IT_PORT) { mutex_enter(&it->it_mutex); if (it->it_portev) { port_kevent_t *pev; @@ -201,20 +202,20 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it) static itimer_t * timer_grab(proc_t *p, timer_t tid) { - itimer_t **itp, *it; + itimer_t *it; if (tid < 0) { return (NULL); } mutex_enter(&p->p_lock); - - if ((itp = p->p_itimer) == NULL || tid >= p->p_itimer_sz || - (it = itp[tid]) == NULL) { + if (p->p_itimer == NULL || tid >= p->p_itimer_sz || + (it = p->p_itimer[tid]) == NULL) { mutex_exit(&p->p_lock); return (NULL); } + /* This may drop p_lock temporarily. */ timer_lock(p, it); if (it->it_lock & ITLK_REMOVE) { @@ -236,7 +237,7 @@ timer_grab(proc_t *p, timer_t tid) * should not be held on entry; timer_release() will acquire p_lock but * will drop it before returning. */ -static void +void timer_release(proc_t *p, itimer_t *it) { mutex_enter(&p->p_lock); @@ -249,7 +250,7 @@ timer_release(proc_t *p, itimer_t *it) * p_lock should not be held on entry; timer_delete_grabbed() will acquire * p_lock, but will drop it before returning. */ -static void +void timer_delete_grabbed(proc_t *p, timer_t tid, itimer_t *it) { mutex_enter(&p->p_lock); @@ -464,6 +465,9 @@ timer_fire(itimer_t *it) it->it_pending = 1; port_send_event((port_kevent_t *)it->it_portev); mutex_exit(&it->it_mutex); + } else if (it->it_flags & IT_CALLBACK) { + it->it_cb_func(it); + ASSERT(MUTEX_NOT_HELD(&it->it_mutex)); } else if (it->it_flags & IT_SIGNAL) { it->it_pending = 1; mutex_exit(&it->it_mutex); @@ -580,85 +584,27 @@ done: return (B_TRUE); } +/* + * Setup a timer + * + * This allocates an itimer_t (including a timer_t ID and slot in the process), + * wires it up according to the provided sigevent, and associates it with the + * desired clock backend. Upon successful completion, the timer will be + * locked, preventing it from being armed via timer_settime() or deleted via + * timer_delete(). This gives the caller a chance to perform any last minute + * manipulations (such as configuring the IT_CALLBACK functionality and/or + * copying the timer_t out to userspace) before using timer_release() to unlock + * it or timer_delete_grabbed() to delete it. 
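The timer_setup()/timer_release() flow described above can be illustrated with a short usage sketch. The consumer below is hypothetical (the my_timer_* names are not part of this change); it assumes the IT_CALLBACK notification added in timer_fire(), where the callback is invoked with it_mutex held and is responsible for dropping it.

/*
 * Hypothetical in-kernel consumer: create a timer on an arbitrary clock
 * backend, wire it up for IT_CALLBACK notification while it is still
 * locked, then unlock it with timer_release().
 */
static void
my_timer_cb(itimer_t *it)
{
	/* timer_fire() calls us with it_mutex held; we must drop it. */
	mutex_exit(&it->it_mutex);
}

static int
my_timer_setup(timer_t *tidp)
{
	clock_backend_t *be = CLOCK_BACKEND(CLOCK_REALTIME);
	struct sigevent ev = be->clk_default;
	itimer_t *it;
	int err;

	ev.sigev_notify = SIGEV_NONE;
	if ((err = timer_setup(be, &ev, NULL, &it, tidp)) != 0)
		return (err);

	/*
	 * The timer is still locked, so it cannot be armed by
	 * timer_settime() or deleted by timer_delete() while we finish
	 * configuring it.
	 */
	it->it_flags |= IT_CALLBACK;
	it->it_cb_func = my_timer_cb;

	timer_release(curproc, it);
	return (0);
}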
+ */ int -timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) +timer_setup(clock_backend_t *backend, struct sigevent *evp, port_notify_t *pnp, + itimer_t **itp, timer_t *tidp) { - struct sigevent ev; proc_t *p = curproc; - clock_backend_t *backend; + int error = 0; itimer_t *it; sigqueue_t *sigq; - cred_t *cr = CRED(); - int error = 0; - timer_t i; - port_notify_t tim_pnevp; - port_kevent_t *pkevp = NULL; - - if ((backend = CLOCK_BACKEND(clock)) == NULL) - return (set_errno(EINVAL)); - - if (evp != NULL) { - /* - * short copyin() for binary compatibility - * fetch oldsigevent to determine how much to copy in. - */ - if (get_udatamodel() == DATAMODEL_NATIVE) { - if (copyin(evp, &ev, sizeof (struct oldsigevent))) - return (set_errno(EFAULT)); - - if (ev.sigev_notify == SIGEV_PORT || - ev.sigev_notify == SIGEV_THREAD) { - if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp, - sizeof (port_notify_t))) - return (set_errno(EFAULT)); - } -#ifdef _SYSCALL32_IMPL - } else { - struct sigevent32 ev32; - port_notify32_t tim_pnevp32; - - if (copyin(evp, &ev32, sizeof (struct oldsigevent32))) - return (set_errno(EFAULT)); - ev.sigev_notify = ev32.sigev_notify; - ev.sigev_signo = ev32.sigev_signo; - /* - * See comment in sigqueue32() on handling of 32-bit - * sigvals in a 64-bit kernel. - */ - ev.sigev_value.sival_int = ev32.sigev_value.sival_int; - if (ev.sigev_notify == SIGEV_PORT || - ev.sigev_notify == SIGEV_THREAD) { - if (copyin((void *)(uintptr_t) - ev32.sigev_value.sival_ptr, - (void *)&tim_pnevp32, - sizeof (port_notify32_t))) - return (set_errno(EFAULT)); - tim_pnevp.portnfy_port = - tim_pnevp32.portnfy_port; - tim_pnevp.portnfy_user = - (void *)(uintptr_t)tim_pnevp32.portnfy_user; - } -#endif - } - switch (ev.sigev_notify) { - case SIGEV_NONE: - break; - case SIGEV_SIGNAL: - if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG) - return (set_errno(EINVAL)); - break; - case SIGEV_THREAD: - case SIGEV_PORT: - break; - default: - return (set_errno(EINVAL)); - } - } else { - /* - * Use the clock's default sigevent (this is a structure copy). - */ - ev = backend->clk_default; - } + timer_t tid; /* * We'll allocate our sigqueue now, before we grab p_lock. @@ -674,29 +620,25 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL); mutex_enter(&p->p_lock); - if (!timer_get_id(p, &i)) { + if (!timer_get_id(p, &tid)) { mutex_exit(&p->p_lock); - kmem_cache_free(clock_timer_cache, it); kmem_free(sigq, sizeof (sigqueue_t)); return (set_errno(EAGAIN)); } - ASSERT(i < p->p_itimer_sz && p->p_itimer[i] == NULL); + ASSERT(tid < p->p_itimer_sz && p->p_itimer[tid] == NULL); /* * If we develop other notification mechanisms, this will need * to call into (yet another) backend. 
*/ - sigq->sq_info.si_signo = ev.sigev_signo; - if (evp == NULL) - sigq->sq_info.si_value.sival_int = i; - else - sigq->sq_info.si_value = ev.sigev_value; + sigq->sq_info.si_signo = evp->sigev_signo; + sigq->sq_info.si_value = evp->sigev_value; sigq->sq_info.si_code = SI_TIMER; sigq->sq_info.si_pid = p->p_pid; sigq->sq_info.si_ctid = PRCTID(p); sigq->sq_info.si_zoneid = getzoneid(); - sigq->sq_info.si_uid = crgetruid(cr); + sigq->sq_info.si_uid = crgetruid(CRED()); sigq->sq_func = timer_signal; sigq->sq_next = NULL; sigq->sq_backptr = it; @@ -704,9 +646,12 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) it->it_backend = backend; it->it_lock = ITLK_LOCKED; - if (ev.sigev_notify == SIGEV_THREAD || - ev.sigev_notify == SIGEV_PORT) { + if (evp->sigev_notify == SIGEV_THREAD || + evp->sigev_notify == SIGEV_PORT) { int port; + port_kevent_t *pkevp = NULL; + + ASSERT(pnp != NULL); /* * This timer is programmed to use event port notification when @@ -726,7 +671,7 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) */ it->it_flags |= IT_PORT; - port = tim_pnevp.portnfy_port; + port = pnp->portnfy_port; /* associate timer as event source with the port */ error = port_associate_ksource(port, PORT_SOURCE_TIMER, @@ -736,7 +681,7 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) mutex_exit(&p->p_lock); kmem_cache_free(clock_timer_cache, it); kmem_free(sigq, sizeof (sigqueue_t)); - return (set_errno(error)); + return (error); } /* allocate an event structure/slot */ @@ -748,21 +693,21 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) mutex_exit(&p->p_lock); kmem_cache_free(clock_timer_cache, it); kmem_free(sigq, sizeof (sigqueue_t)); - return (set_errno(error)); + return (error); } /* initialize event data */ - port_init_event(pkevp, i, tim_pnevp.portnfy_user, + port_init_event(pkevp, tid, pnp->portnfy_user, timer_port_callback, it); it->it_portev = pkevp; it->it_portfd = port; } else { - if (ev.sigev_notify == SIGEV_SIGNAL) + if (evp->sigev_notify == SIGEV_SIGNAL) it->it_flags |= IT_SIGNAL; } /* Populate the slot now that the timer is prepped. */ - p->p_itimer[i] = it; + p->p_itimer[tid] = it; mutex_exit(&p->p_lock); /* @@ -775,17 +720,8 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) it->it_lwp = ttolwp(curthread); it->it_proc = p; - if (copyout(&i, tid, sizeof (timer_t)) != 0) { - error = EFAULT; - goto err; - } - - /* - * If we're here, then we have successfully created the timer; we - * just need to release the timer and return. - */ - timer_release(p, it); - + *itp = it; + *tidp = tid; return (0); err: @@ -796,11 +732,115 @@ err: * impossible for a removal to be pending. */ ASSERT(!(it->it_lock & ITLK_REMOVE)); - timer_delete_grabbed(p, i, it); + timer_delete_grabbed(p, tid, it); + + return (error); +} + + +int +timer_create(clockid_t clock, struct sigevent *evp, timer_t *tidp) +{ + int error = 0; + proc_t *p = curproc; + clock_backend_t *backend; + struct sigevent ev; + itimer_t *it; + timer_t tid; + port_notify_t tim_pnevp; + + if ((backend = CLOCK_BACKEND(clock)) == NULL) + return (set_errno(EINVAL)); + + if (evp != NULL) { + /* + * short copyin() for binary compatibility + * fetch oldsigevent to determine how much to copy in. 
+ */ + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(evp, &ev, sizeof (struct oldsigevent))) + return (set_errno(EFAULT)); + + if (ev.sigev_notify == SIGEV_PORT || + ev.sigev_notify == SIGEV_THREAD) { + if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp, + sizeof (port_notify_t))) + return (set_errno(EFAULT)); + } +#ifdef _SYSCALL32_IMPL + } else { + struct sigevent32 ev32; + port_notify32_t tim_pnevp32; - return (set_errno(error)); + if (copyin(evp, &ev32, sizeof (struct oldsigevent32))) + return (set_errno(EFAULT)); + ev.sigev_notify = ev32.sigev_notify; + ev.sigev_signo = ev32.sigev_signo; + /* + * See comment in sigqueue32() on handling of 32-bit + * sigvals in a 64-bit kernel. + */ + ev.sigev_value.sival_int = ev32.sigev_value.sival_int; + if (ev.sigev_notify == SIGEV_PORT || + ev.sigev_notify == SIGEV_THREAD) { + if (copyin((void *)(uintptr_t) + ev32.sigev_value.sival_ptr, + (void *)&tim_pnevp32, + sizeof (port_notify32_t))) + return (set_errno(EFAULT)); + tim_pnevp.portnfy_port = + tim_pnevp32.portnfy_port; + tim_pnevp.portnfy_user = + (void *)(uintptr_t)tim_pnevp32.portnfy_user; + } +#endif + } + switch (ev.sigev_notify) { + case SIGEV_NONE: + break; + case SIGEV_SIGNAL: + if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG) + return (set_errno(EINVAL)); + break; + case SIGEV_THREAD: + case SIGEV_PORT: + break; + default: + return (set_errno(EINVAL)); + } + } else { + /* + * Use the clock's default sigevent (this is a structure copy). + */ + ev = backend->clk_default; + } + + if ((error = timer_setup(backend, &ev, &tim_pnevp, &it, &tid)) != 0) { + return (set_errno(error)); + } + + /* + * Populate si_value with the timer ID if no sigevent was passed in. + */ + if (evp == NULL) { + it->it_sigq->sq_info.si_value.sival_int = tid; + } + + if (copyout(&tid, tidp, sizeof (timer_t)) != 0) { + timer_delete_grabbed(p, tid, it); + return (set_errno(EFAULT)); + } + + /* + * If we're here, then we have successfully created the timer; we + * just need to release the timer and return. + */ + timer_release(p, it); + + return (0); } + int timer_gettime(timer_t tid, itimerspec_t *val) { @@ -923,17 +963,20 @@ timer_lwpexit(void) uint_t i; proc_t *p = curproc; klwp_t *lwp = ttolwp(curthread); - itimer_t *it, **itp; + itimer_t *it; ASSERT(MUTEX_HELD(&p->p_lock)); - if ((itp = p->p_itimer) == NULL) + if (p->p_itimer == NULL) { return; + } for (i = 0; i < p->p_itimer_sz; i++) { - if ((it = itp[i]) == NULL) + if ((it = p->p_itimer[i]) == NULL) { continue; + } + /* This may drop p_lock temporarily. */ timer_lock(p, it); if ((it->it_lock & ITLK_REMOVE) || it->it_lwp != lwp) { @@ -967,17 +1010,19 @@ timer_lwpbind() uint_t i; proc_t *p = curproc; klwp_t *lwp = ttolwp(curthread); - itimer_t *it, **itp; + itimer_t *it; ASSERT(MUTEX_HELD(&p->p_lock)); - if ((itp = p->p_itimer) == NULL) + if (p->p_itimer == NULL) { return; + } for (i = 0; i < p->p_itimer_sz; i++) { - if ((it = itp[i]) == NULL) + if ((it = p->p_itimer[i]) == NULL) continue; + /* This may drop p_lock temporarily. 
*/ timer_lock(p, it); if (!(it->it_lock & ITLK_REMOVE) && it->it_lwp == lwp) { @@ -1068,7 +1113,7 @@ timer_close_port(void *arg, int port, pid_t pid, int lastclose) for (tid = 0; tid < timer_max; tid++) { if ((it = timer_grab(p, tid)) == NULL) continue; - if (it->it_portev) { + if (it->it_flags & IT_PORT) { mutex_enter(&it->it_mutex); if (it->it_portfd == port) { port_kevent_t *pev; diff --git a/usr/src/uts/common/os/vm_pageout.c b/usr/src/uts/common/os/vm_pageout.c index c3fd3658d6..1df2f479a5 100644 --- a/usr/src/uts/common/os/vm_pageout.c +++ b/usr/src/uts/common/os/vm_pageout.c @@ -27,6 +27,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -64,6 +65,7 @@ #include <sys/tnf_probe.h> #include <sys/mem_cage.h> #include <sys/time.h> +#include <sys/zone.h> #include <sys/stdbool.h> #include <vm/hat.h> @@ -240,15 +242,22 @@ pgcnt_t lotsfree = 0; pgcnt_t needfree = 0; pgcnt_t throttlefree = 0; pgcnt_t pageout_reserve = 0; +pri_t pageout_pri; pgcnt_t deficit; pgcnt_t nscan; pgcnt_t desscan; +/* kstats */ +uint64_t low_mem_scan; +uint64_t zone_cap_scan; + +#define MAX_PSCAN_THREADS 16 + /* - * Values for min_pageout_nsec, max_pageout_nsec and pageout_nsec are the - * number of nanoseconds in each wakeup cycle that gives the equivalent of some - * underlying %CPU duty cycle. + * Values for min_pageout_nsec, max_pageout_nsec, pageout_nsec and + * zone_pageout_nsec are the number of nanoseconds in each wakeup cycle + * that gives the equivalent of some underlying %CPU duty cycle. * * min_pageout_nsec: * nanoseconds/wakeup equivalent of min_percent_cpu. @@ -260,15 +269,31 @@ pgcnt_t desscan; * Number of nanoseconds budgeted for each wakeup cycle. * Computed each time around by schedpaging(). * Varies between min_pageout_nsec and max_pageout_nsec, - * depending on memory pressure. + * depending on memory pressure or zones over their cap. + * + * zone_pageout_nsec: + * Number of nanoseconds budget for each cycle when a zone + * is over its memory cap. If this is zero, then the value + * of max_pageout_nsec is used instead. */ static hrtime_t min_pageout_nsec; static hrtime_t max_pageout_nsec; static hrtime_t pageout_nsec; +static hrtime_t zone_pageout_nsec; -static uint_t reset_hands; +static boolean_t reset_hands[MAX_PSCAN_THREADS]; #define PAGES_POLL_MASK 1023 +#define SCHEDPAGING_HZ 4 + +/* + * despagescanners: + * The desired number of page scanner threads. The value can be set in + * /etc/system or tuned directly with 'mdb -kw'. The system will bring + * the actual number of threads into line with the desired number. If set + * to an invalid value, the system will correct the setting. + */ +uint_t despagescanners = 0; /* * pageout_sample_lim: @@ -294,26 +319,29 @@ static uint_t reset_hands; * pageout_scanner(), which then sets this value once per system boot after * enough samples have been recorded (pageout_sample_cnt). Once set, this * new value is used for fastscan and handspreadpages. - * - * sample_start, sample_end: - * The hrtime at which the last pageout_scanner() sample began and ended. 
*/ typedef hrtime_t hrrate_t; static uint64_t pageout_sample_lim = 4; static uint64_t pageout_sample_cnt = 0; static pgcnt_t pageout_sample_pages = 0; +static hrtime_t pageout_sample_etime = 0; static hrrate_t pageout_rate = 0; static pgcnt_t pageout_new_spread = 0; -static hrtime_t pageout_cycle_nsec; -static hrtime_t sample_start, sample_end; -static hrtime_t pageout_sample_etime = 0; +/* True if the page scanner is first starting up */ +#define PAGE_SCAN_STARTUP (pageout_sample_cnt < pageout_sample_lim) + +/* The current number of page scanner threads */ +static uint_t n_page_scanners = 1; +/* The number of page scanner threads that are actively scanning. */ +static uint_t pageouts_running; /* * Record number of times a pageout_scanner() wakeup cycle finished because it * timed out (exceeded its CPU budget), rather than because it visited - * its budgeted number of pages. + * its budgeted number of pages. This is only done when scanning under low + * free memory conditions, not when scanning for zones over their cap. */ uint64_t pageout_timeouts = 0; @@ -357,9 +385,10 @@ static struct clockinit { pgcnt_t ci_fastscan; pgcnt_t ci_slowscan; pgcnt_t ci_handspreadpages; + uint_t ci_despagescanners; } clockinit = { .ci_init = false }; -static pgcnt_t +static inline pgcnt_t clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum) { if (value < minimum) { @@ -382,6 +411,83 @@ tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval) } /* + * Local boolean to control scanning when zones are over their cap. Avoids + * accessing the zone_num_over_cap variable except within schedpaging(), which + * only runs periodically. This is here only to reduce our access to + * zone_num_over_cap, since it is already accessed a lot during paging, and + * the page scanner accesses the zones_over variable on each page during a + * scan. There is no lock needed for zone_num_over_cap since schedpaging() + * doesn't modify the variable, it only cares if the variable is 0 or non-0. + */ +static boolean_t zones_over = B_FALSE; + +/* + * On large memory systems, multiple instances of the page scanner are run, + * each responsible for a separate region of memory. This speeds up page + * invalidation under low memory conditions. + * + * despagescanners can be set in /etc/system or via mdb and it will + * be used as a guide for how many page scanners to create; the value + * will be adjusted if it is not sensible. Otherwise, the number of + * page scanners is determined dynamically based on handspreadpages. + */ +static void +recalc_pagescanners(void) +{ + pgcnt_t sz; + uint_t des; + + /* If the initial calibration has not been done, take no action. */ + if (pageout_new_spread == 0) + return; + + /* + * If the desired number of scanners is set in /etc/system + * then try to use it. + */ + if (despagescanners == 0 && clockinit.ci_despagescanners != 0) + despagescanners = clockinit.ci_despagescanners; + + if (despagescanners != 0) { + /* + * We have a desired number of page scanners, either from + * /etc/system or set via mdb. Try and use it (it will be + * clamped below). + */ + des = despagescanners; + } else { + /* + * Calculate the number of desired scanners based on the + * system's memory size. + * + * A 64GiB region size is used as the basis for calculating how + * many scanner threads should be created. For systems with up + * to 64GiB of RAM, a single thread is used; for very large + * memory systems the threads are limited to MAX_PSCAN_THREADS. 
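To make the sizing rule concrete, consider a hypothetical system with 256 GiB of physical memory (the figures are illustrative, not from this change): sz = btop(64 GiB), and the loop adds sz to tmp while tmp < looppages, so tmp steps through 64, 128, 192 and 256 GiB, leaving des = 4 scanner threads. The result is then clamped so that looppages / des is at least handspreadpages plus a 10% margin, and finally to no more than MAX_PSCAN_THREADS (16).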
+ */ + sz = btop(64ULL << 30); + + if (sz > looppages) { + des = 1; + } else { + pgcnt_t tmp = sz; + + for (des = 1; tmp < looppages; des++) + tmp += sz; + } + } + + /* + * clamp the number of scanners so that we are under MAX_PSCAN_THREADS + * and so that each scanner covers at least 10% more than + * handspreadpages. + */ + des = clamp(des, 1, + looppages / (handspreadpages + handspreadpages / 10)); + despagescanners = clamp(des, 1, MAX_PSCAN_THREADS); +} + +/* * Set up the paging constants for the clock algorithm used by * pageout_scanner(), and by the virtual memory system overall. See the * comments at the top of this file for more information about the threshold @@ -395,7 +501,6 @@ tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval) void setupclock(void) { - pgcnt_t defval; bool half = (pageout_threshold_style == 1); bool recalc = true; @@ -424,6 +529,7 @@ setupclock(void) clockinit.ci_fastscan = fastscan; clockinit.ci_slowscan = slowscan; clockinit.ci_handspreadpages = handspreadpages; + clockinit.ci_despagescanners = despagescanners; /* * The first call does not trigger a recalculation, only @@ -605,7 +711,7 @@ setupclock(void) } /* - * Handspreadpages is distance (in pages) between front and back + * Handspreadpages is the distance (in pages) between front and back * pageout daemon hands. The amount of time to reclaim a page * once pageout examines it increases with this distance and * decreases as the scan rate rises. It must be < the amount @@ -641,12 +747,31 @@ setupclock(void) } /* - * If we have been called to recalculate the parameters, set a flag to - * re-evaluate the clock hand pointers. + * Establish the minimum and maximum length of time to be spent + * scanning pages per wakeup, limiting the scanner duty cycle. The + * input percentage values (0-100) must be converted to a fraction of + * the number of nanoseconds in a second of wall time, then further + * scaled down by the number of scanner wakeups in a second. */ - if (recalc) { - reset_hands = 1; - } + min_pageout_nsec = MAX(1, + NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ); + max_pageout_nsec = MAX(min_pageout_nsec, + NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ); + + /* + * If not called for recalculation, return and skip the remaining + * steps. + */ + if (!recalc) + return; + + /* + * Set a flag to re-evaluate the clock hand positions. + */ + for (uint_t i = 0; i < MAX_PSCAN_THREADS; i++) + reset_hands[i] = B_TRUE; + + recalc_pagescanners(); } /* @@ -660,9 +785,8 @@ setupclock(void) * in its next pass; schedpaging() sets this value based on the amount of * currently available memory. */ -#define SCHEDPAGING_HZ 4 -static kmutex_t pageout_mutex; /* held while pageout or schedpaging running */ +static kmutex_t pageout_mutex; /* * Pool of available async pageout putpage requests. @@ -690,9 +814,9 @@ static bool pageout_pushing = false; static uint64_t pageout_pushcount = 0; static uint64_t pageout_pushcount_seen = 0; -static int async_list_size = 256; /* number of async request structs */ +static int async_list_size = 8192; /* number of async request structs */ -static void pageout_scanner(void); +static void pageout_scanner(void *); /* * If a page is being shared more than "po_share" times @@ -722,24 +846,17 @@ schedpaging(void *arg) kcage_cageout_wakeup(); if (mutex_tryenter(&pageout_mutex)) { - /* pageout() not running */ + + if (pageouts_running != 0) + goto out; + + /* No pageout scanner threads running. 
*/ nscan = 0; vavail = freemem - deficit; if (pageout_new_spread != 0) vavail -= needfree; - if (vavail < 0) - vavail = 0; - if (vavail > lotsfree) - vavail = lotsfree; + vavail = clamp(vavail, 0, lotsfree); - /* - * Fix for 1161438 (CRS SPR# 73922). All variables - * in the original calculation for desscan were 32 bit signed - * ints. As freemem approaches 0x0 on a system with 1 Gig or - * more of memory, the calculation can overflow. When this - * happens, desscan becomes negative and pageout_scanner() - * stops paging out. - */ if (needfree > 0 && pageout_new_spread == 0) { /* * If we've not yet collected enough samples to @@ -765,14 +882,92 @@ pageout_nsec = min_pageout_nsec + (lotsfree - vavail) * (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree); - if (freemem < lotsfree + needfree || - pageout_sample_cnt < pageout_sample_lim) { + DTRACE_PROBE2(schedpage__calc, pgcnt_t, desscan, hrtime_t, + pageout_nsec); + + if (pageout_new_spread != 0 && despagescanners != 0 && + despagescanners != n_page_scanners) { + /* + * We have finished the pagescan initialisation and the + * desired number of page scanners has changed, either + * because initialisation just finished, because of a + * memory DR, or because despagescanners has been + * modified on the fly (i.e. by mdb). + */ + uint_t i, curr_nscan = n_page_scanners; + + /* Re-validate despagescanners */ + recalc_pagescanners(); + + n_page_scanners = despagescanners; + + for (i = 0; i < MAX_PSCAN_THREADS; i++) + reset_hands[i] = B_TRUE; + + /* If we need more scanners, start them now. */ + if (n_page_scanners > curr_nscan) { + for (i = curr_nscan; i < n_page_scanners; i++) { + (void) lwp_kernel_create(proc_pageout, + pageout_scanner, + (void *)(uintptr_t)i, TS_RUN, + pageout_pri); + } + } + + /* + * If the number of scanners has decreased, trigger a + * wakeup so that the excess threads will terminate. + */ + if (n_page_scanners < curr_nscan) { + WAKE_PAGEOUT_SCANNER(); + } + } + + zones_over = B_FALSE; + + if (PAGE_SCAN_STARTUP) { /* - * Either we need more memory, or we still need to - * measure the average scan rate. Wake the scanner. + * We still need to measure the rate at which the + * system is able to scan pages of memory. Each of + * these initial samples is a scan of as much system + * memory as practical, regardless of whether or not we + * are experiencing memory pressure. */ - DTRACE_PROBE(pageout__cv__signal); - cv_signal(&proc_pageout->p_cv); + desscan = total_pages; + pageout_nsec = max_pageout_nsec; + + DTRACE_PROBE(schedpage__wake__sample); + WAKE_PAGEOUT_SCANNER(); + } else if (freemem < lotsfree + needfree) { + /* + * We need more memory. + */ + low_mem_scan++; + + DTRACE_PROBE(schedpage__wake__low); + WAKE_PAGEOUT_SCANNER(); + } else if (zone_num_over_cap > 0) { + /* + * One or more zones are over their cap. + */ + + /* No page limit */ + desscan = total_pages; + + /* + * Increase the scanning CPU% to the max. This implies + * 80% of one CPU/sec if the scanner can run each + * opportunity. Can also be tuned via setting + * zone_pageout_nsec in /etc/system or with mdb. + */ + pageout_nsec = (zone_pageout_nsec != 0) ? 
+ zone_pageout_nsec : max_pageout_nsec; + + zones_over = B_TRUE; + zone_cap_scan++; + + DTRACE_PROBE(schedpage__wake__zone); + WAKE_PAGEOUT_SCANNER(); } else { /* * There are enough free pages, no need to @@ -785,6 +980,7 @@ schedpaging(void *arg) po_share >>= 1; } } +out: mutex_exit(&pageout_mutex); } @@ -813,37 +1009,39 @@ uint_t dopageout = 1; /* * The page out daemon, which runs as process 2. * - * As long as there are at least lotsfree pages, - * this process is not run. When the number of free - * pages stays in the range desfree to lotsfree, - * this daemon runs through the pages in the loop - * at a rate determined in schedpaging(). Pageout manages - * two hands on the clock. The front hand moves through - * memory, clearing the reference bit, - * and stealing pages from procs that are over maxrss. - * The back hand travels a distance behind the front hand, - * freeing the pages that have not been referenced in the time - * since the front hand passed. If modified, they are pushed to - * swap before being freed. + * The daemon treats physical memory as a circular array of pages and scans + * the pages using a 'two-handed clock' algorithm. The front hand moves + * through the pages, clearing the reference bit. The back hand travels a + * distance (handspreadpages) behind the front hand, freeing the pages that + * have not been referenced in the time since the front hand passed. If + * modified, they are first written to their backing store before being + * freed. + * + * In order to make page invalidation more responsive on machines with + * larger memory, multiple pageout_scanner threads may be created. In this + * case, each thread is given a segment of the memory "clock face" so that + * memory can be reclaimed more quickly. * - * There are 2 threads that act on behalf of the pageout process. - * One thread scans pages (pageout_scanner) and frees them up if - * they don't require any VOP_PUTPAGE operation. If a page must be - * written back to its backing store, the request is put on a list - * and the other (pageout) thread is signaled. The pageout thread - * grabs VOP_PUTPAGE requests from the list, and processes them. - * Some filesystems may require resources for the VOP_PUTPAGE - * operations (like memory) and hence can block the pageout - * thread, but the scanner thread can still operate. There is still - * no guarantee that memory deadlocks cannot occur. + * The pageout_scanner threads are not run as long as there are at least + * lotsfree pages and no zones are over their physical memory cap. When the + * threads are scanning because memory is low (case (a)), all pages are + * considered for pageout. When they are scanning only because one or more + * zones are over their cap (case (b)), only pages belonging to those zones + * are considered for pageout. * - * For now, this thing is in very rough form. + * There are multiple threads that act on behalf of the pageout process. A + * set of threads (pageout_scanner) scans pages and frees them up if they + * don't require any VOP_PUTPAGE operation. If a page must be written back + * to its backing store, the request is put on a list and the other + * (pageout) thread is signaled. The pageout thread grabs VOP_PUTPAGE + * requests from the list, and processes them. Some filesystems may require + * resources for the VOP_PUTPAGE operations (like memory) and hence can + * block the pageout thread, but the scanner thread can still operate. + * There is still no guarantee that memory deadlocks cannot occur. 
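The pageout_nsec assignment in schedpaging() above scales the scan budget linearly with memory pressure: as vavail falls from lotsfree toward zero, the budget grows from min_pageout_nsec to max_pageout_nsec. A small user-space model of that interpolation; the page counts are made-up inputs and nz() is modeled as "substitute 1 for 0".

#include <stdio.h>
#include <stdint.h>

#define nz(x)   ((x) != 0 ? (x) : 1)

int
main(void)
{
        int64_t min_pageout_nsec = 10000000;    /* 10ms, see setupclock() */
        int64_t max_pageout_nsec = 200000000;   /* 200ms */
        int64_t lotsfree = 131072;              /* example page count */

        for (int64_t vavail = lotsfree; vavail >= 0; vavail -= lotsfree / 4) {
                int64_t pageout_nsec = min_pageout_nsec +
                    (lotsfree - vavail) *
                    (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree);

                printf("vavail=%7lld -> budget %3lld ms\n",
                    (long long)vavail, (long long)(pageout_nsec / 1000000));
        }
        return (0);
}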
*/ void pageout() { struct async_reqs *arg; - pri_t pageout_pri; int i; pgcnt_t max_pushes; callb_cpr_t cprinfo; @@ -874,11 +1072,12 @@ pageout() push_req[i].a_next = &push_req[i + 1]; } - pageout_pri = curthread->t_pri; + pageout_pri = curthread->t_pri - 1; - /* Create the pageout scanner thread. */ - (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN, - pageout_pri - 1); + /* Create the first pageout scanner thread. */ + (void) lwp_kernel_create(proc_pageout, pageout_scanner, + (void *)0, /* this is instance 0, not NULL */ + TS_RUN, pageout_pri); /* * kick off pageout scheduler. @@ -913,6 +1112,8 @@ pageout() pageout_pushing = true; mutex_exit(&push_lock); + DTRACE_PROBE(pageout__push); + if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off, arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) { pushes++; @@ -935,14 +1136,19 @@ pageout() * Kernel thread that scans pages looking for ones to free */ static void -pageout_scanner(void) +pageout_scanner(void *a) { - struct page *fronthand, *backhand; + struct page *fronthand, *backhand, *fronthandstart; + struct page *regionstart, *regionend; uint_t laps; callb_cpr_t cprinfo; - pgcnt_t nscan_limit; + pgcnt_t nscan_cnt, tick; pgcnt_t pcount; - bool sampling; + bool bhwrapping, fhwrapping; + hrtime_t sample_start, sample_end; + uint_t inst = (uint_t)(uintptr_t)a; + + VERIFY3U(inst, <, MAX_PSCAN_THREADS); CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan"); mutex_enter(&pageout_mutex); @@ -952,113 +1158,153 @@ pageout_scanner(void) * the right point on the assumption that after one circuit things * will have settled down, and restarts shouldn't be that often. */ + reset_hands[inst] = B_TRUE; - /* - * Set the two clock hands to be separated by a reasonable amount, - * but no more than 360 degrees apart. - */ - backhand = page_first(); - if (handspreadpages >= total_pages) { - fronthand = page_nextn(backhand, total_pages - 1); - } else { - fronthand = page_nextn(backhand, handspreadpages); - } - - /* - * Establish the minimum and maximum length of time to be spent - * scanning pages per wakeup, limiting the scanner duty cycle. The - * input percentage values (0-100) must be converted to a fraction of - * the number of nanoseconds in a second of wall time, then further - * scaled down by the number of scanner wakeups in a second: - */ - min_pageout_nsec = MAX(1, - NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ); - max_pageout_nsec = MAX(min_pageout_nsec, - NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ); + pageouts_running++; + mutex_exit(&pageout_mutex); loop: cv_signal_pageout(); + mutex_enter(&pageout_mutex); + pageouts_running--; CALLB_CPR_SAFE_BEGIN(&cprinfo); cv_wait(&proc_pageout->p_cv, &pageout_mutex); CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex); + pageouts_running++; + mutex_exit(&pageout_mutex); /* - * Check if pageout has been disabled for debugging purposes: + * Check if pageout has been disabled for debugging purposes. */ if (!dopageout) { goto loop; } /* - * One may reset the clock hands for debugging purposes. Hands will - * also be reset if memory is added to or removed from the system. + * One may reset the clock hands and scanned region for debugging + * purposes. Hands will also be reset on first thread startup, if + * the number of scanning threads (n_page_scanners) changes, or if + * memory is added to, or removed from, the system. 
*/ - if (reset_hands) { - reset_hands = 0; + if (reset_hands[inst]) { + struct page *first; + + reset_hands[inst] = B_FALSE; + + if (inst >= n_page_scanners) { + /* + * The desired number of page scanners has been + * reduced and this instance is no longer wanted. + * Exit the lwp. + */ + VERIFY3U(inst, !=, 0); + DTRACE_PROBE1(pageout__exit, uint_t, inst); + mutex_enter(&pageout_mutex); + pageouts_running--; + mutex_exit(&pageout_mutex); + mutex_enter(&curproc->p_lock); + lwp_exit(); + /* NOTREACHED */ + } + + first = page_first(); + + /* + * Each scanner thread gets its own sector of the memory + * clock face. + */ + pgcnt_t span, offset; - backhand = page_first(); - if (handspreadpages >= total_pages) { - fronthand = page_nextn(backhand, total_pages - 1); + span = looppages / n_page_scanners; + VERIFY3U(span, >, handspreadpages); + + offset = inst * span; + regionstart = page_nextn(first, offset); + if (inst == n_page_scanners - 1) { + /* The last instance goes up to the last page */ + regionend = page_nextn(first, looppages - 1); } else { - fronthand = page_nextn(backhand, handspreadpages); + regionend = page_nextn(regionstart, span - 1); } + + backhand = regionstart; + fronthand = page_nextn(backhand, handspreadpages); + tick = 1; + + bhwrapping = fhwrapping = B_FALSE; + + DTRACE_PROBE4(pageout__reset, uint_t, inst, + pgcnt_t, regionstart, pgcnt_t, regionend, + pgcnt_t, fronthand); } + /* + * This CPU kstat is only incremented here and we're obviously + * on this CPU, so no lock. + */ CPU_STATS_ADDQ(CPU, vm, pgrrun, 1); /* * Keep track of the number of times we have scanned all the way around - * the loop: + * the loop on this wakeup. */ laps = 0; - DTRACE_PROBE(pageout__start); - /* * Track the number of pages visited during this scan so that we can * periodically measure our duty cycle. */ + nscan_cnt = 0; pcount = 0; - if (pageout_sample_cnt < pageout_sample_lim) { - /* - * We need to measure the rate at which the system is able to - * scan pages of memory. Each of these initial samples is a - * scan of all system memory, regardless of whether or not we - * are experiencing memory pressure. - */ - nscan_limit = total_pages; - sampling = true; - } else { - nscan_limit = desscan; - sampling = false; - } + DTRACE_PROBE5(pageout__start, uint_t, inst, pgcnt_t, desscan, + hrtime_t, pageout_nsec, page_t *, backhand, page_t *, fronthand); + + /* + * Record the initial position of the front hand for this cycle so + * that we can detect when the hand wraps around. + */ + fronthandstart = fronthand; sample_start = gethrtime(); /* * Scan the appropriate number of pages for a single duty cycle. */ - while (nscan < nscan_limit) { + while (nscan_cnt < desscan) { checkpage_result_t rvfront, rvback; - if (!sampling && freemem >= lotsfree + needfree) { + /* + * Only scan while at least one of these is true: + * 1) one or more zones is over its cap + * 2) there is not enough free memory + * 3) during page scan startup when determining sample data + */ + if (!PAGE_SCAN_STARTUP && freemem >= lotsfree + needfree && + !zones_over) { /* * We are not sampling and enough memory has become * available that scanning is no longer required. */ + DTRACE_PROBE1(pageout__memfree, uint_t, inst); break; } + DTRACE_PROBE2(pageout__loop, uint_t, inst, pgcnt_t, pcount); + /* * Periodically check to see if we have exceeded the CPU duty * cycle for a single wakeup. 
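The reset_hands block above carves the memory "clock face" into one region per scanner instance. The sketch below models that arithmetic with arbitrary example values; the real code walks page_t structures with page_nextn() rather than raw indices.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        uint64_t looppages = 1000000;           /* example page count */
        uint32_t n_page_scanners = 4;           /* example scanner count */
        uint64_t span = looppages / n_page_scanners;

        for (uint32_t inst = 0; inst < n_page_scanners; inst++) {
                uint64_t start = inst * span;
                uint64_t end;

                /* The last instance runs up to the final page. */
                if (inst == n_page_scanners - 1)
                        end = looppages - 1;
                else
                        end = start + span - 1;

                printf("scanner %u: pages [%llu, %llu]\n", inst,
                    (unsigned long long)start, (unsigned long long)end);
        }
        return (0);
}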
*/ if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) { + hrtime_t pageout_cycle_nsec; + pageout_cycle_nsec = gethrtime() - sample_start; if (pageout_cycle_nsec >= pageout_nsec) { - ++pageout_timeouts; + if (!zones_over) + atomic_inc_64(&pageout_timeouts); + DTRACE_PROBE1(pageout__timeout, uint_t, inst); break; } } @@ -1077,7 +1323,8 @@ loop: ++pcount; /* - * Protected by pageout_mutex instead of cpu_stat_lock: + * This CPU kstat is only incremented here and we're obviously + * on this CPU, so no lock. */ CPU_STATS_ADDQ(CPU, vm, scan, 1); @@ -1085,26 +1332,48 @@ loop: * Don't include ineligible pages in the number scanned. */ if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) { - nscan++; + nscan_cnt++; } - backhand = page_next(backhand); - fronthand = page_next(fronthand); + if (bhwrapping) { + backhand = regionstart; + bhwrapping = B_FALSE; + } else { + backhand = page_nextn(backhand, tick); + if (backhand == regionend) + bhwrapping = B_TRUE; + } + + if (fhwrapping) { + fronthand = regionstart; + fhwrapping = B_FALSE; + } else { + fronthand = page_nextn(fronthand, tick); + if (fronthand == regionend) + fhwrapping = B_TRUE; + } /* - * The front hand has wrapped around to the first page in the - * loop. + * The front hand has wrapped around during this wakeup. */ - if (fronthand == page_first()) { + if (fronthand == fronthandstart) { laps++; - DTRACE_PROBE1(pageout__hand__wrap, uint_t, laps); + DTRACE_PROBE2(pageout__hand__wrap, uint_t, inst, + uint_t, laps); /* - * Protected by pageout_mutex instead of cpu_stat_lock: + * This CPU kstat is only incremented here and we're + * obviously on this CPU, so no lock. */ CPU_STATS_ADDQ(CPU, vm, rev, 1); - if (laps > 1) { + /* + * If we are scanning due to low memory then, when we + * wrap around memory, we want to try to reclaim more + * pages. + * If scanning only because zones are over their cap, + * then wrapping is common and we simply keep going. + */ + if (laps > 1 && freemem < lotsfree + needfree) { /* * Extremely unlikely, but it happens. * We went around the loop at least once @@ -1123,21 +1392,30 @@ loop: } sample_end = gethrtime(); + atomic_add_long(&nscan, nscan_cnt); - DTRACE_PROBE1(pageout__end, uint_t, laps); + DTRACE_PROBE4(pageout__end, uint_t, inst, uint_t, laps, + pgcnt_t, nscan_cnt, pgcnt_t, pcount) + /* + * The global variables used below are only modified by this thread and + * only during initial scanning when there is a single page scanner + * thread running. + */ if (pageout_new_spread == 0) { - if (pageout_sample_cnt < pageout_sample_lim) { + VERIFY3U(inst, ==, 0); + + if (PAGE_SCAN_STARTUP) { /* * Continue accumulating samples until we have enough - * to get a reasonable value for average scan rate: + * to get a reasonable value for average scan rate. */ pageout_sample_pages += pcount; pageout_sample_etime += sample_end - sample_start; ++pageout_sample_cnt; } - if (pageout_sample_cnt >= pageout_sample_lim) { + if (!PAGE_SCAN_STARTUP) { /* * We have enough samples, set the spread. */ @@ -1223,6 +1501,7 @@ checkpage(struct page *pp, pageout_hand_t whichhand) int isfs = 0; int isexec = 0; int pagesync_flag; + zoneid_t zid = ALL_ZONES; /* * Skip pages: @@ -1265,6 +1544,21 @@ checkpage(struct page *pp, pageout_hand_t whichhand) return (CKP_INELIGIBLE); } + if (zones_over) { + ASSERT(pp->p_zoneid == ALL_ZONES || + pp->p_zoneid >= 0 && pp->p_zoneid <= MAX_ZONEID); + if (pp->p_zoneid == ALL_ZONES || + zone_pdata[pp->p_zoneid].zpers_over == 0) { + /* + * Cross-zone shared page, or zone not over its cap. + * Leave the page alone. 
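The hand-advancement logic above steps each hand forward by tick pages and, once a hand lands on its region's final page, wraps it back to the region start on the following step. A simplified single-hand model with illustrative values (indices stand in for page_t pointers):

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

int
main(void)
{
        uint64_t regionstart = 0, regionend = 9;        /* ten-page region */
        uint64_t tick = 1;
        uint64_t hand = regionstart;
        bool wrapping = false;

        for (int step = 0; step < 15; step++) {
                if (wrapping) {
                        hand = regionstart;
                        wrapping = false;
                } else {
                        hand += tick;           /* page_nextn(hand, tick) */
                        if (hand == regionend)
                                wrapping = true;
                }
                printf("step %2d: hand at page %llu\n", step,
                    (unsigned long long)hand);
        }
        return (0);
}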
+ */ + page_unlock(pp); + return (CKP_INELIGIBLE); + } + zid = pp->p_zoneid; + } + /* * Maintain statistics for what we are freeing */ @@ -1372,6 +1666,11 @@ recheck: VN_RELE(vp); return (CKP_NOT_FREED); } + if (isfs) { + zone_pageout_stat(zid, ZPO_DIRTY); + } else { + zone_pageout_stat(zid, ZPO_ANONDIRTY); + } return (CKP_FREED); } @@ -1398,8 +1697,10 @@ recheck: } else { CPU_STATS_ADD_K(vm, fsfree, 1); } + zone_pageout_stat(zid, ZPO_FS); } else { CPU_STATS_ADD_K(vm, anonfree, 1); + zone_pageout_stat(zid, ZPO_ANON); } return (CKP_FREED); diff --git a/usr/src/uts/common/os/vmem.c b/usr/src/uts/common/os/vmem.c index 7d2b89408a..933834aee9 100644 --- a/usr/src/uts/common/os/vmem.c +++ b/usr/src/uts/common/os/vmem.c @@ -1629,7 +1629,7 @@ vmem_destroy(vmem_t *vmp) leaked = vmem_size(vmp, VMEM_ALLOC); if (leaked != 0) - cmn_err(CE_WARN, "vmem_destroy('%s'): leaked %lu %s", + cmn_err(CE_WARN, "!vmem_destroy('%s'): leaked %lu %s", vmp->vm_name, leaked, (vmp->vm_cflags & VMC_IDENTIFIER) ? "identifiers" : "bytes"); diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index a398830833..fa841df9ff 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015, Joyent Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. */ @@ -106,14 +106,16 @@ * removed from the list of active zones. zone_destroy() returns, and * the zone can be recreated. * - * ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor - * callbacks are executed, and all memory associated with the zone is - * freed. + * ZONE_IS_FREE (internal state): All references have been dropped and + * the zone_t is no longer in the zone_active nor zone_deathrow lists. + * The zone_t is in the process of being freed. This state exists + * only for publishing a sysevent to indicate that the zone by this + * name can be booted again. * - * Threads can wait for the zone to enter a requested state by using - * zone_status_wait() or zone_status_timedwait() with the desired - * state passed in as an argument. Zone state transitions are - * uni-directional; it is not possible to move back to an earlier state. + * Threads can wait for the zone to enter a requested state (other than + * ZONE_IS_FREE) by using zone_status_wait() or zone_status_timedwait() + * with the desired state passed in as an argument. Zone state transitions + * are uni-directional; it is not possible to move back to an earlier state. * * * Zone-Specific Data: @@ -252,6 +254,8 @@ #include <sys/cpucaps.h> #include <vm/seg.h> #include <sys/mac.h> +#include <sys/rt.h> +#include <sys/fx.h> /* * This constant specifies the number of seconds that threads waiting for @@ -312,6 +316,7 @@ static id_space_t *zoneid_space; * 'global_zone'. */ zone_t zone0; +zone_zfs_io_t zone0_zp_zfs; zone_t *global_zone = NULL; /* Set when the global zone is initialized */ /* @@ -327,8 +332,8 @@ static list_t zone_active; static list_t zone_deathrow; static kmutex_t zone_deathrow_lock; -/* number of zones is limited by virtual interface limit in IP */ -uint_t maxzones = 8192; +/* This can be dynamically reduced if various subsystems hit internal limits. 
*/ +uint_t maxzones = MAX_ZONES; /* Event channel to sent zone state change notifications */ evchan_t *zone_event_chan; @@ -350,6 +355,7 @@ const char *zone_status_table[] = { ZONE_EVENT_SHUTTING_DOWN, /* down */ ZONE_EVENT_SHUTTING_DOWN, /* dying */ ZONE_EVENT_UNINITIALIZED, /* dead */ + ZONE_EVENT_FREE, /* free */ }; /* @@ -372,8 +378,12 @@ static char *zone_ref_subsys_names[] = { rctl_hndl_t rc_zone_cpu_shares; rctl_hndl_t rc_zone_locked_mem; rctl_hndl_t rc_zone_max_swap; +rctl_hndl_t rc_zone_phys_mem; rctl_hndl_t rc_zone_max_lofi; rctl_hndl_t rc_zone_cpu_cap; +rctl_hndl_t rc_zone_cpu_baseline; +rctl_hndl_t rc_zone_cpu_burst_time; +rctl_hndl_t rc_zone_zfs_io_pri; rctl_hndl_t rc_zone_nlwps; rctl_hndl_t rc_zone_nprocs; rctl_hndl_t rc_zone_shmmax; @@ -389,6 +399,7 @@ static int zone_remove_datalink(zoneid_t, datalink_id_t); static int zone_list_datalink(zoneid_t, int *, datalink_id_t *); static int zone_set_network(zoneid_t, zone_net_data_t *); static int zone_get_network(zoneid_t, zone_net_data_t *); +static void zone_status_set(zone_t *, zone_status_t); typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t); @@ -419,8 +430,72 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *, * Version 5 alters the zone_boot system call, and converts its old * bootargs parameter to be set by the zone_setattr API instead. * Version 6 adds the flag argument to zone_create. + * Version 7 adds the requested zoneid to zone_create. */ -static const int ZONE_SYSCALL_API_VERSION = 6; +static const int ZONE_SYSCALL_API_VERSION = 7; + +/* + * "zone_pdata" is an array indexed by zoneid. It is used to store "persistent" + * data which can be referenced independently of the zone_t structure. This + * data falls into two categories: + * 1) pages and RSS data associated with processes inside a zone + * 2) in-flight ZFS I/O data + * + * Each member of zone_persist_t stores the zone's current page usage, its page + * limit, a flag indicating if the zone is over its physical memory cap and + * various page-related statistics. The zpers_over flag is the interface for + * the page scanner to use when reclaiming pages for zones that are over their + * cap. The zone_persist_t structure also includes a mutex and a reference to a + * zone_zfs_io_t structure used for tracking the zone's ZFS I/O data. + * + * All zone physical memory cap data is stored in this array instead of within + * the zone structure itself. This is because zone structures come and go, but + * paging-related work can be asynchronous to any particular zone. In + * particular: + * 1) Page scanning to reclaim pages occurs from a kernel thread that is not + * associated with any zone. + * 2) Freeing segkp pages can occur long after the zone which first + * instantiated those pages has gone away. + * We want to be able to account for pages/zone without constantly having to + * take extra locks and find the relevant zone structure, particularly during + * page scanning. + * + * The page scanner can run when "zone_num_over_cap" is non-zero. It can + * do a direct lookup of a zoneid into the "zone_pdata" array to determine + * if that zone is over its cap. + * + * There is no locking for the page scanner to perform these two checks. + * We cannot have the page scanner blocking normal paging activity for + * running processes. Because the physical memory cap is a soft cap, it is + * fine for the scanner to simply read the current state of the counter and + * the zone's zpers_over entry in the array. 
The scanner should never modify + * either of these items. Internally the entries and the counter are managed + * with the "zone_physcap_lock" mutex as we add/remove mappings to pages. We + * take care to ensure that we only take the zone_physcap_lock mutex when a + * zone is transitioning over/under its physical memory cap. + * + * The "zone_incr_capped" and "zone_decr_capped" functions are used to manage + * the "zone_pdata" array and associated counter. + * + * The zone_persist_t structure tracks the zone's physical cap and physical + * usage in terms of pages. These values are currently defined as uint32. Thus, + * the maximum number of pages we can track is UINT_MAX - 1 (4,294,967,294) + * since UINT_MAX means the zone's RSS is unlimited. Assuming a 4k page size, a + * zone's maximum RSS is limited to 17.5 TB and twice that with an 8k page size. + * In the future we may need to expand these counters to 64-bit, but for now + * we're using 32-bit to conserve memory, since this array is statically + * allocated within the kernel based on the maximum number of zones supported. + * + * With respect to the zone_zfs_io_t referenced by the zone_persist_t, under + * a heavy I/O workload, the "zonehash_lock" would become extremely hot if we + * had to continuously find the zone structure associated with an I/O that has + * just completed. To avoid that overhead, we track the I/O data within the + * zone_zfs_io_t instead. We can directly access that data without having to + * lookup the full zone_t structure. + */ +uint_t zone_num_over_cap; +zone_persist_t zone_pdata[MAX_ZONES]; +static kmutex_t zone_physcap_lock; /* * Certain filesystems (such as NFS and autofs) need to know which zone @@ -1379,6 +1454,127 @@ static rctl_qty_t +zone_cpu_base_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_base(p->p_zone)); +} + +/* + * The zone cpu base is used to set the baseline CPU for the zone + * so we can track when the zone is bursting. + */ +/*ARGSUSED*/ +static int +zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_base(zone, nv)); +} + +static rctl_ops_t zone_cpu_base_ops = { + rcop_no_action, + zone_cpu_base_get, + zone_cpu_base_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t +zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_burst_time(p->p_zone)); +} + +/* + * The zone cpu burst time is used to set the amount of time CPU(s) can be + * bursting for the zone. + */ +/*ARGSUSED*/ +static int +zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_burst_time(zone, nv)); +} + +static rctl_ops_t zone_cpu_burst_time_ops = { + rcop_no_action, + zone_cpu_burst_time_get, + zone_cpu_burst_time_set, + rcop_no_test +}; + +/* + * zone.zfs-io-pri resource control support (IO priority). 
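A user-space model of the zone_pdata bookkeeping described above. The real zone_persist_t layout and the zone_incr_capped()/zone_decr_capped() helpers are not shown in this diff, so the structure, the field names beyond those mentioned in the comment, and the transition logic below are an illustrative reconstruction only.

#include <stdio.h>
#include <stdint.h>
#include <pthread.h>

#define MAX_ZONES       64              /* stand-in for the real constant */

typedef struct {
        uint32_t zpers_pg_cnt;          /* current resident pages */
        uint32_t zpers_pg_limit;        /* cap in pages; UINT32_MAX = none */
        uint8_t  zpers_over;            /* read lock-free by the scanner */
} zone_persist_model_t;

static zone_persist_model_t zone_pdata[MAX_ZONES];
static unsigned int zone_num_over_cap;
static pthread_mutex_t zone_physcap_lock = PTHREAD_MUTEX_INITIALIZER;

/* Account for pages added to a zone; take the lock only on a transition. */
static void
zone_pages_added(int zid, uint32_t pages)
{
        zone_persist_model_t *zp = &zone_pdata[zid];

        zp->zpers_pg_cnt += pages;
        if (zp->zpers_over == 0 && zp->zpers_pg_cnt > zp->zpers_pg_limit) {
                pthread_mutex_lock(&zone_physcap_lock);
                zp->zpers_over = 1;
                zone_num_over_cap++;
                pthread_mutex_unlock(&zone_physcap_lock);
        }
}

int
main(void)
{
        zone_pdata[3].zpers_pg_limit = 1000;    /* example cap for zone 3 */
        zone_pages_added(3, 1500);

        /* The page scanner only needs these two lock-free reads. */
        printf("zones over cap: %u, zone 3 over: %u\n",
            zone_num_over_cap, (unsigned)zone_pdata[3].zpers_over);
        return (0);
}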
+ */ +/*ARGSUSED*/ +static rctl_qty_t +zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p) +{ + zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id]; + rctl_qty_t r = 0; + + ASSERT(MUTEX_HELD(&p->p_lock)); + mutex_enter(&zp->zpers_zfs_lock); + if (zp->zpers_zfsp != NULL) + r = (rctl_qty_t)zp->zpers_zfsp->zpers_zfs_io_pri; + mutex_exit(&zp->zpers_zfs_lock); + + return (r); +} + +/*ARGSUSED*/ +static int +zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + zone_persist_t *zp; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + /* + * set priority to the new value. + */ + zp = &zone_pdata[zone->zone_id]; + mutex_enter(&zp->zpers_zfs_lock); + if (zp->zpers_zfsp != NULL) + zp->zpers_zfsp->zpers_zfs_io_pri = (uint16_t)nv; + mutex_exit(&zp->zpers_zfs_lock); + return (0); +} + +static rctl_ops_t zone_zfs_io_pri_ops = { + rcop_no_action, + zone_zfs_io_pri_get, + zone_zfs_io_pri_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_lwps_usage(rctl_t *r, proc_t *p) { rctl_qty_t nlwps; @@ -1705,6 +1901,57 @@ static rctl_ops_t zone_max_swap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_phys_mem_usage(rctl_t *rctl, struct proc *p) +{ + rctl_qty_t q; + zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id]; + + ASSERT(MUTEX_HELD(&p->p_lock)); + q = ptob(zp->zpers_pg_cnt); + return (q); +} + +/*ARGSUSED*/ +static int +zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zoneid_t zid; + uint_t pg_val; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + if (e->rcep_p.zone == NULL) + return (0); + zid = e->rcep_p.zone->zone_id; + if (nv == UINT64_MAX) { + pg_val = UINT32_MAX; + } else { + uint64_t pages = btop(nv); + + /* + * Return from RCTLOP_SET is always ignored so just clamp an + * out-of-range value to our largest "limited" value. + */ + if (pages >= UINT32_MAX) { + pg_val = UINT32_MAX - 1; + } else { + pg_val = (uint_t)pages; + } + } + zone_pdata[zid].zpers_pg_limit = pg_val; + return (0); +} + +static rctl_ops_t zone_phys_mem_ops = { + rcop_no_action, + zone_phys_mem_usage, + zone_phys_mem_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_max_lofi_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; @@ -1798,6 +2045,21 @@ zone_lockedmem_kstat_update(kstat_t *ksp, int rw) } static int +zone_physmem_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_kstat_t *zk = ksp->ks_data; + zone_persist_t *zp = &zone_pdata[zone->zone_id]; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zk->zk_usage.value.ui64 = ptob(zp->zpers_pg_cnt); + zk->zk_value.value.ui64 = ptob(zp->zpers_pg_limit); + return (0); +} + +static int zone_nprocs_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; @@ -1826,7 +2088,7 @@ zone_swapresv_kstat_update(kstat_t *ksp, int rw) } static kstat_t * -zone_kstat_create_common(zone_t *zone, char *name, +zone_rctl_kstat_create_common(zone_t *zone, char *name, int (*updatefunc) (kstat_t *, int)) { kstat_t *ksp; @@ -1851,16 +2113,200 @@ zone_kstat_create_common(zone_t *zone, char *name, return (ksp); } +static int +zone_vfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_vfs_kstat_t *zvp = ksp->ks_data; + kstat_io_t *kiop = &zone->zone_vfs_rwstats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Extract the VFS statistics from the kstat_io_t structure used by + * kstat_runq_enter() and related functions. 
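The byte-to-page clamping in zone_phys_mem_set() above can be checked with a small worked example; a 4 KiB page size is assumed here, with btop(nv) modeled as nv / 4096.

#include <stdio.h>
#include <stdint.h>

static uint32_t
phys_mem_to_pg_limit(uint64_t nv)
{
        if (nv == UINT64_MAX)
                return (UINT32_MAX);            /* unlimited */

        uint64_t pages = nv / 4096;             /* btop(nv), assuming 4 KiB */

        /* Clamp an out-of-range cap to the largest "limited" value. */
        if (pages >= UINT32_MAX)
                return (UINT32_MAX - 1);
        return ((uint32_t)pages);
}

int
main(void)
{
        printf("2 GiB cap -> %u pages\n", phys_mem_to_pg_limit(2ULL << 30));
        printf("unlimited -> %u\n", phys_mem_to_pg_limit(UINT64_MAX));
        printf("huge cap  -> %u pages\n",
            phys_mem_to_pg_limit(UINT64_MAX - 1));
        return (0);
}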
Since the slow ops + * counters are updated directly by the VFS layer, there's no need to + * copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. + */ + zvp->zv_nread.value.ui64 = kiop->nread; + zvp->zv_reads.value.ui64 = kiop->reads; + zvp->zv_rtime.value.ui64 = kiop->rtime; + zvp->zv_rcnt.value.ui64 = kiop->rcnt; + zvp->zv_rlentime.value.ui64 = kiop->rlentime; + zvp->zv_nwritten.value.ui64 = kiop->nwritten; + zvp->zv_writes.value.ui64 = kiop->writes; + zvp->zv_wtime.value.ui64 = kiop->wtime; + zvp->zv_wcnt.value.ui64 = kiop->wcnt; + zvp->zv_wlentime.value.ui64 = kiop->wlentime; + + scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_vfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_vfs_kstat_t *zvp; + + if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id, + zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED, + sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_vfs_lock; + zone->zone_vfs_stats = zvp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zvp->zv_zonename, zone->zone_name); + kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_vfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int +zone_zfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_zfs_kstat_t *zzp = ksp->ks_data; + zone_persist_t *zp = &zone_pdata[zone->zone_id]; + + if (rw == KSTAT_WRITE) + return (EACCES); + + mutex_enter(&zp->zpers_zfs_lock); + if (zp->zpers_zfsp == NULL) { + zzp->zz_nread.value.ui64 = 0; + zzp->zz_reads.value.ui64 = 0; + zzp->zz_rtime.value.ui64 = 0; + zzp->zz_rlentime.value.ui64 = 0; + zzp->zz_nwritten.value.ui64 = 0; + zzp->zz_writes.value.ui64 = 0; + zzp->zz_waittime.value.ui64 = 0; + } else { + kstat_io_t *kiop = &zp->zpers_zfsp->zpers_zfs_rwstats; + + /* + * Extract the ZFS 
statistics from the kstat_io_t structure + * used by kstat_runq_enter() and related functions. Since the + * I/O throttle counters are updated directly by the ZFS layer, + * there's no need to copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. + */ + zzp->zz_nread.value.ui64 = kiop->nread; + zzp->zz_reads.value.ui64 = kiop->reads; + zzp->zz_rtime.value.ui64 = kiop->rtime; + zzp->zz_rlentime.value.ui64 = kiop->rlentime; + zzp->zz_nwritten.value.ui64 = kiop->nwritten; + zzp->zz_writes.value.ui64 = kiop->writes; + zzp->zz_waittime.value.ui64 = + zp->zpers_zfsp->zpers_zfs_rd_waittime; + } + mutex_exit(&zp->zpers_zfs_lock); + + scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64); + scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_zfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_zfs_kstat_t *zzp; + + if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id, + zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED, + sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_zfs_lock; + zone->zone_zfs_stats = zzp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zzp->zz_zonename, zone->zone_name); + kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_zfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} static int zone_mcap_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; zone_mcap_kstat_t *zmp = ksp->ks_data; + zone_persist_t *zp; if (rw == KSTAT_WRITE) return (EACCES); + zp = &zone_pdata[zone->zone_id]; + + zmp->zm_rss.value.ui64 = ptob(zp->zpers_pg_cnt); + zmp->zm_phys_cap.value.ui64 = ptob(zp->zpers_pg_limit); + zmp->zm_swap.value.ui64 = zone->zone_max_swap; + zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl; + zmp->zm_nover.value.ui64 = zp->zpers_nover; +#ifndef DEBUG + zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_out); +#else + zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_fsdirty + + zp->zpers_pg_fs + zp->zpers_pg_anon + zp->zpers_pg_anondirty); +#endif zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin; zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin; zmp->zm_execpgin.value.ui64 = zone->zone_execpgin; @@ -1893,6 +2339,12 @@ zone_mcap_kstat_create(zone_t *zone) /* The kstat "name" field is not large enough for a full zonename */ kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING); kstat_named_setstr(&zmp->zm_zonename, zone->zone_name); + kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64); + 
kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64); @@ -1942,9 +2394,12 @@ zone_misc_kstat_update(kstat_t *ksp, int rw) zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem; zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc; + zmp->zm_mfseglim.value.ui32 = zone->zone_mfseglim; + zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp; zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid; + zmp->zm_init_restarts.value.ui32 = zone->zone_proc_init_restarts; zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time; return (0); @@ -1985,9 +2440,13 @@ zone_misc_kstat_create(zone_t *zone) KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32); + kstat_named_init(&zmp->zm_mfseglim, "mapfail_seglim", + KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_nested_intp, "nested_interp", KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32); + kstat_named_init(&zmp->zm_init_restarts, "init_restarts", + KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64); ksp->ks_update = zone_misc_kstat_update; @@ -2000,13 +2459,25 @@ zone_misc_kstat_create(zone_t *zone) static void zone_kstat_create(zone_t *zone) { - zone->zone_lockedmem_kstat = zone_kstat_create_common(zone, + zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone, "lockedmem", zone_lockedmem_kstat_update); - zone->zone_swapresv_kstat = zone_kstat_create_common(zone, + zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone, "swapresv", zone_swapresv_kstat_update); - zone->zone_nprocs_kstat = zone_kstat_create_common(zone, + zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone, + "physicalmem", zone_physmem_kstat_update); + zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone, "nprocs", zone_nprocs_kstat_update); + if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) { + zone->zone_vfs_stats = kmem_zalloc( + sizeof (zone_vfs_kstat_t), KM_SLEEP); + } + + if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) { + zone->zone_zfs_stats = kmem_zalloc( + sizeof (zone_zfs_kstat_t), KM_SLEEP); + } + if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) { zone->zone_mcap_stats = kmem_zalloc( sizeof (zone_mcap_kstat_t), KM_SLEEP); @@ -2038,8 +2509,15 @@ zone_kstat_delete(zone_t *zone) sizeof (zone_kstat_t)); zone_kstat_delete_common(&zone->zone_swapresv_kstat, sizeof (zone_kstat_t)); + zone_kstat_delete_common(&zone->zone_physmem_kstat, + sizeof (zone_kstat_t)); zone_kstat_delete_common(&zone->zone_nprocs_kstat, sizeof (zone_kstat_t)); + + zone_kstat_delete_common(&zone->zone_vfs_ksp, + sizeof (zone_vfs_kstat_t)); + zone_kstat_delete_common(&zone->zone_zfs_ksp, + sizeof (zone_zfs_kstat_t)); zone_kstat_delete_common(&zone->zone_mcap_ksp, sizeof (zone_mcap_kstat_t)); zone_kstat_delete_common(&zone->zone_misc_ksp, @@ -2101,8 +2579,12 @@ zone_zsd_init(void) zone0.zone_initname = initname; zone0.zone_lockedmem_kstat = NULL; zone0.zone_swapresv_kstat = NULL; + zone0.zone_physmem_kstat = NULL; zone0.zone_nprocs_kstat = NULL; + zone_pdata[0].zpers_zfsp = &zone0_zp_zfs; + 
zone_pdata[0].zpers_zfsp->zpers_zfs_io_pri = 1; + list_create(&zone0.zone_ref_list, sizeof (zone_ref_t), offsetof(zone_ref_t, zref_linkage)); list_create(&zone0.zone_zsd, sizeof (struct zsd_entry), @@ -2209,6 +2691,21 @@ zone_init(void) RCTL_GLOBAL_INFINITE, MAXCAP, MAXCAP, &zone_cpu_cap_ops); + rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + MAXCAP, MAXCAP, &zone_cpu_base_ops); + + rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + INT_MAX, INT_MAX, &zone_cpu_burst_time_ops); + + rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + 16384, 16384, &zone_zfs_io_pri_ops); + rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE, RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT, INT_MAX, INT_MAX, &zone_lwps_ops); @@ -2250,6 +2747,20 @@ zone_init(void) rde = rctl_dict_lookup("zone.cpu-shares"); (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + /* + * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach + * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority'. + */ + dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); + bzero(dval, sizeof (rctl_val_t)); + dval->rcv_value = 1; + dval->rcv_privilege = RCPRIV_PRIVILEGED; + dval->rcv_flagaction = RCTL_LOCAL_NOACTION; + dval->rcv_action_recip_pid = -1; + + rde = rctl_dict_lookup("zone.zfs-io-priority"); + (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + rc_zone_locked_mem = rctl_register("zone.max-locked-memory", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2260,6 +2771,11 @@ zone_init(void) RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, &zone_max_swap_ops); + rc_zone_phys_mem = rctl_register("zone.max-physical-memory", + RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | + RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, + &zone_phys_mem_ops); + rc_zone_max_lofi = rctl_register("zone.max-lofi", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2283,6 +2799,7 @@ zone_init(void) zone0.zone_restart_init = B_TRUE; zone0.zone_reboot_on_init_exit = B_FALSE; zone0.zone_restart_init_0 = B_FALSE; + zone0.zone_init_status = -1; zone0.zone_brand = &native_brand; rctl_prealloc_destroy(gp); /* @@ -2364,6 +2881,8 @@ zone_init(void) static void zone_free(zone_t *zone) { + zone_dl_t *zdl; + ASSERT(zone != global_zone); ASSERT(zone->zone_ntasks == 0); ASSERT(zone->zone_nlwps == 0); @@ -2379,6 +2898,9 @@ zone_free(zone_t *zone) */ cpucaps_zone_remove(zone); + /* Clear physical memory capping data. */ + bzero(&zone_pdata[zone->zone_id], sizeof (zone_persist_t)); + ASSERT(zone->zone_cpucap == NULL); /* remove from deathrow list */ @@ -2392,8 +2914,30 @@ zone_free(zone_t *zone) list_destroy(&zone->zone_ref_list); zone_free_zsd(zone); zone_free_datasets(zone); + + /* + * While dlmgmtd should have removed all of these, it could have left + * something behind or crashed. In which case it's not safe for us to + * assume that the list is empty which list_destroy() will ASSERT. 
We + * clean up for our userland comrades which may have crashed, or worse, + * been disabled by SMF. + */ + while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) { + if (zdl->zdl_net != NULL) + nvlist_free(zdl->zdl_net); + kmem_free(zdl, sizeof (zone_dl_t)); + } list_destroy(&zone->zone_dl_list); + /* + * This zone_t can no longer inhibit creation of another zone_t + * with the same name or debug ID. Generate a sysevent so that + * userspace tools know it is safe to carry on. + */ + mutex_enter(&zone_status_lock); + zone_status_set(zone, ZONE_IS_FREE); + mutex_exit(&zone_status_lock); + cpu_uarray_free(zone->zone_ustate); if (zone->zone_rootvp != NULL) @@ -2438,11 +2982,17 @@ zone_free(zone_t *zone) static void zone_status_set(zone_t *zone, zone_status_t status) { + timestruc_t now; + uint64_t t; nvlist_t *nvl = NULL; ASSERT(MUTEX_HELD(&zone_status_lock)); - ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE && - status >= zone_status_get(zone)); + ASSERT((status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE || + status == ZONE_IS_FREE) && status >= zone_status_get(zone)); + + /* Current time since Jan 1 1970 but consumers expect NS */ + gethrestime(&now); + t = (now.tv_sec * NANOSEC) + now.tv_nsec; if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) || nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) || @@ -2451,12 +3001,14 @@ zone_status_set(zone_t *zone, zone_status_t status) nvlist_add_string(nvl, ZONE_CB_OLDSTATE, zone_status_table[zone->zone_status]) || nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) || - nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) || + nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) || sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS, ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) { #ifdef DEBUG (void) printf( "Failed to allocate and send zone state change event.\n"); +#else + /* EMPTY */ #endif } nvlist_free(nvl); @@ -2476,6 +3028,38 @@ zone_status_get(zone_t *zone) return (zone->zone_status); } +/* + * Publish a zones-related sysevent for purposes other than zone state changes. + * While it is unfortunate that zone_event_chan is associated with + * "com.sun:zones:status" (rather than "com.sun:zones") state changes should be + * the only ones with class "status" and subclass "change". + */ +void +zone_sysevent_publish(zone_t *zone, const char *class, const char *subclass, + nvlist_t *ev_nvl) +{ + nvlist_t *nvl = NULL; + timestruc_t now; + uint64_t t; + + gethrestime(&now); + t = (now.tv_sec * NANOSEC) + now.tv_nsec; + + if (nvlist_dup(ev_nvl, &nvl, KM_SLEEP) != 0 || + nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) != 0 || + nvlist_add_uint64(nvl, ZONE_CB_ZONEID, zone->zone_id) != 0 || + nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) != 0 || + sysevent_evc_publish(zone_event_chan, class, subclass, "sun.com", + "kernel", nvl, EVCH_SLEEP) != 0) { +#ifdef DEBUG + (void) printf("Failed to allocate and send zone misc event.\n"); +#else + /* EMPTY */ +#endif + } + nvlist_free(nvl); +} + static int zone_set_bootargs(zone_t *zone, const char *zone_bootargs) { @@ -2529,9 +3113,14 @@ zone_set_brand(zone_t *zone, const char *brand) return (EINVAL); } - /* set up the brand specific data */ + /* + * Set up the brand specific data. + * Note that it's possible that the hook has to drop the + * zone_status_lock and reaquire it before returning so we can't + * assume the lock has been held the entire time. 
+ */ zone->zone_brand = bp; - ZBROP(zone)->b_init_brand_data(zone); + ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock); mutex_exit(&zone_status_lock); return (0); @@ -2604,18 +3193,6 @@ zone_set_initname(zone_t *zone, const char *zone_initname) } static int -zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap) -{ - uint64_t mcap; - int err = 0; - - if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0) - zone->zone_phys_mcap = mcap; - - return (err); -} - -static int zone_set_sched_class(zone_t *zone, const char *new_class) { char sched_class[PC_CLNMSZ]; @@ -3022,6 +3599,12 @@ getzoneid(void) return (curproc->p_zone->zone_id); } +zoneid_t +getzonedid(void) +{ + return (curproc->p_zone->zone_did); +} + /* * Internal versions of zone_find_by_*(). These don't zone_hold() or * check the validity of a zone's state. @@ -3768,6 +4351,17 @@ zone_start_init(void) */ z->zone_proc_initpid = p->p_pid; + if (z->zone_setup_app_contract == B_TRUE) { + /* + * Normally a process cannot modify its own contract, but we're + * just starting the zone's init process and its contract is + * always initialized from the sys_process_tmpl template, so + * this is the simplest way to setup init's contract to kill + * the process if any other process in the contract exits. + */ + p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT; + } + /* * We maintain zone_boot_err so that we can return the cause of the * failure back to the caller of the zone_boot syscall. @@ -3796,9 +4390,54 @@ zone_start_init(void) lwp_exit(); } } else { + id_t cid = curthread->t_cid; + if (zone_status_get(z) == ZONE_IS_BOOTING) zone_status_set(z, ZONE_IS_RUNNING); mutex_exit(&zone_status_lock); + + mutex_enter(&class_lock); + ASSERT(cid < loaded_classes); + if (strcmp(sclass[cid].cl_name, "FX") == 0 && + z->zone_fixed_hipri) { + /* + * If the zone is using FX then by default all + * processes start at the lowest priority and stay + * there. We provide a mechanism for the zone to + * indicate that it should run at "high priority". In + * this case we setup init to run at the highest FX + * priority (which is one level higher than the + * non-fixed scheduling classes can use). + */ + pcparms_t pcparms; + + pcparms.pc_cid = cid; + ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim = + FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags = + FX_DOUPRILIM | FX_DOUPRI; + + mutex_enter(&pidlock); + mutex_enter(&curproc->p_lock); + + (void) parmsset(&pcparms, curthread); + + mutex_exit(&curproc->p_lock); + mutex_exit(&pidlock); + } else if (strcmp(sclass[cid].cl_name, "RT") == 0) { + /* + * zsched always starts the init lwp at priority + * minclsyspri - 1. This priority gets set in t_pri and + * is invalid for RT, but RT never uses t_pri. However + * t_pri is used by procfs, so we always see processes + * within an RT zone with an invalid priority value. + * We fix that up now. + */ + curthread->t_pri = RTGPPRIO0; + } + mutex_exit(&class_lock); + /* cause the process to return to userland. 
*/ lwp_rtt(); } @@ -3839,7 +4478,11 @@ zsched(void *arg) bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched")); PTOU(pp)->u_argc = 0; PTOU(pp)->u_argv = 0; + PTOU(pp)->u_argvstrs = 0; + PTOU(pp)->u_argvstrsize = 0; PTOU(pp)->u_envp = 0; + PTOU(pp)->u_envstrs = 0; + PTOU(pp)->u_envstrsize = 0; PTOU(pp)->u_commpagep = 0; closeall(P_FINFO(pp)); @@ -4284,8 +4927,9 @@ parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp) error = EINVAL; name = nvpair_name(nvp); - if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1) - != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { + if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 && + strncmp(name, "project.", sizeof ("project.") - 1) != 0) || + nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { goto out; } if ((hndl = rctl_hndl_lookup(name)) == -1) { @@ -4404,7 +5048,7 @@ zone_create(const char *zone_name, const char *zone_root, caddr_t rctlbuf, size_t rctlbufsz, caddr_t zfsbuf, size_t zfsbufsz, int *extended_error, int match, uint32_t doi, const bslabel_t *label, - int flags) + int flags, zoneid_t zone_did) { struct zsched_arg zarg; nvlist_t *rctls = NULL; @@ -4476,6 +5120,7 @@ zone_create(const char *zone_name, const char *zone_root, zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP); zone->zone_id = zoneid; + zone->zone_did = zone_did; zone->zone_status = ZONE_IS_UNINITIALIZED; zone->zone_pool = pool_default; zone->zone_pool_mod = gethrtime(); @@ -4485,6 +5130,7 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_restart_init = B_TRUE; zone->zone_reboot_on_init_exit = B_FALSE; zone->zone_restart_init_0 = B_FALSE; + zone->zone_init_status = -1; zone->zone_brand = &native_brand; zone->zone_initname = NULL; mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); @@ -4551,8 +5197,13 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_max_swap_ctl = UINT64_MAX; zone->zone_max_lofi = 0; zone->zone_max_lofi_ctl = UINT64_MAX; - zone0.zone_lockedmem_kstat = NULL; - zone0.zone_swapresv_kstat = NULL; + zone->zone_lockedmem_kstat = NULL; + zone->zone_swapresv_kstat = NULL; + zone->zone_physmem_kstat = NULL; + + zone_pdata[zoneid].zpers_zfsp = + kmem_zalloc(sizeof (zone_zfs_io_t), KM_SLEEP); + zone_pdata[zoneid].zpers_zfsp->zpers_zfs_io_pri = 1; zone->zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP); @@ -4561,6 +5212,13 @@ zone_create(const char *zone_name, const char *zone_root, */ zone->zone_rctls = NULL; + /* + * Ensure page count is 0 (in case zoneid has wrapped). + * Initialize physical memory cap as unlimited. + */ + zone_pdata[zoneid].zpers_pg_cnt = 0; + zone_pdata[zoneid].zpers_pg_limit = UINT32_MAX; + if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) { zone_free(zone); return (zone_create_error(error, 0, extended_error)); @@ -4709,8 +5367,8 @@ zone_create(const char *zone_name, const char *zone_root, /* * The process, task, and project rctls are probably wrong; * we need an interface to get the default values of all rctls, - * and initialize zsched appropriately. I'm not sure that that - * makes much of a difference, though. + * and initialize zsched appropriately. However, we allow zoneadmd + * to pass down both zone and project rctls for the zone's init. 
*/ error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0); if (error != 0) { @@ -4849,6 +5507,7 @@ zone_boot(zoneid_t zoneid) static int zone_empty(zone_t *zone) { + int cnt = 0; int waitstatus; /* @@ -4859,7 +5518,16 @@ zone_empty(zone_t *zone) ASSERT(MUTEX_NOT_HELD(&zonehash_lock)); while ((waitstatus = zone_status_timedwait_sig(zone, ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) { - killall(zone->zone_id); + boolean_t force = B_FALSE; + + /* Every 30 seconds, try harder */ + if (cnt++ >= 30) { + cmn_err(CE_WARN, "attempt to force kill zone %d\n", + zone->zone_id); + force = B_TRUE; + cnt = 0; + } + killall(zone->zone_id, force); } /* * return EINTR if we were signaled @@ -5188,6 +5856,7 @@ zone_destroy(zoneid_t zoneid) zone_status_t status; clock_t wait_time; boolean_t log_refcounts; + zone_persist_t *zp; if (secpolicy_zone_config(CRED()) != 0) return (set_errno(EPERM)); @@ -5221,6 +5890,12 @@ zone_destroy(zoneid_t zoneid) zone_hold(zone); mutex_exit(&zonehash_lock); + zp = &zone_pdata[zoneid]; + mutex_enter(&zp->zpers_zfs_lock); + kmem_free(zp->zpers_zfsp, sizeof (zone_zfs_io_t)); + zp->zpers_zfsp = NULL; + mutex_exit(&zp->zpers_zfs_lock); + /* * wait for zsched to exit */ @@ -5610,14 +6285,6 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) error = EFAULT; } break; - case ZONE_ATTR_PHYS_MCAP: - size = sizeof (zone->zone_phys_mcap); - if (bufsize > size) - bufsize = size; - if (buf != NULL && - copyout(&zone->zone_phys_mcap, buf, bufsize) != 0) - error = EFAULT; - break; case ZONE_ATTR_SCHED_CLASS: mutex_enter(&class_lock); @@ -5681,6 +6348,23 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) } kmem_free(zbuf, bufsize); break; + case ZONE_ATTR_DID: + size = sizeof (zoneid_t); + if (bufsize > size) + bufsize = size; + + if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0) + error = EFAULT; + break; + case ZONE_ATTR_SCHED_FIXEDHI: + size = sizeof (boolean_t); + if (bufsize > size) + bufsize = size; + + if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf, + bufsize) != 0) + error = EFAULT; + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { size = bufsize; @@ -5712,10 +6396,9 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) return (set_errno(EPERM)); /* - * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the - * global zone. + * No attributes can be set on the global zone. */ - if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) { + if (zoneid == GLOBAL_ZONEID) { return (set_errno(EINVAL)); } @@ -5728,11 +6411,11 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) mutex_exit(&zonehash_lock); /* - * At present most attributes can only be set on non-running, + * At present attributes can only be set on non-running, * non-global zones. 
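The retry loop in zone_empty() above waits roughly one second per attempt for the zone to empty and escalates to a forced kill after about 30 attempts. A bounded user-space sketch of that shape; zone_is_empty() and kill_zone_procs() are hypothetical stand-ins for the kernel primitives.

#include <stdbool.h>
#include <stdio.h>

static bool
zone_is_empty(void)
{
        return (false);                 /* stub: pretend processes remain */
}

static void
kill_zone_procs(bool force)
{
        printf("killall(zone, force=%d)\n", force);
}

int
main(void)
{
        int cnt = 0;

        for (int attempt = 1; attempt <= 35 && !zone_is_empty(); attempt++) {
                bool force = false;

                /* Stand-in for the one-second zone_status_timedwait_sig(). */
                if (cnt++ >= 30) {      /* "try harder" roughly every 30s */
                        force = true;
                        cnt = 0;
                }
                kill_zone_procs(force);
        }
        return (0);
}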
*/ zone_status = zone_status_get(zone); - if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) { + if (zone_status > ZONE_IS_READY) { err = EINVAL; goto done; } @@ -5765,9 +6448,6 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) case ZONE_ATTR_SECFLAGS: err = zone_set_secflags(zone, (psecflags_t *)buf); break; - case ZONE_ATTR_PHYS_MCAP: - err = zone_set_phys_mcap(zone, (const uint64_t *)buf); - break; case ZONE_ATTR_SCHED_CLASS: err = zone_set_sched_class(zone, (const char *)buf); break; @@ -5795,6 +6475,22 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) err = zone_set_network(zoneid, zbuf); kmem_free(zbuf, bufsize); break; + case ZONE_ATTR_APP_SVC_CT: + if (bufsize != sizeof (boolean_t)) { + err = EINVAL; + } else { + zone->zone_setup_app_contract = (boolean_t)buf; + err = 0; + } + break; + case ZONE_ATTR_SCHED_FIXEDHI: + if (bufsize != sizeof (boolean_t)) { + err = EINVAL; + } else { + zone->zone_fixed_hipri = (boolean_t)buf; + err = 0; + } + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize); @@ -6493,6 +7189,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) zs.doi = zs32.doi; zs.label = (const bslabel_t *)(uintptr_t)zs32.label; zs.flags = zs32.flags; + zs.zoneid = zs32.zoneid; #else panic("get_udatamodel() returned bogus result\n"); #endif @@ -6503,7 +7200,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) (caddr_t)zs.rctlbuf, zs.rctlbufsz, (caddr_t)zs.zfsbuf, zs.zfsbufsz, zs.extended_error, zs.match, zs.doi, - zs.label, zs.flags)); + zs.label, zs.flags, zs.zoneid)); case ZONE_BOOT: return (zone_boot((zoneid_t)(uintptr_t)arg1)); case ZONE_DESTROY: @@ -6604,6 +7301,7 @@ zone_ki_call_zoneadmd(struct zarg *zargp) bcopy(zone->zone_name, zone_name, zone_namelen); zoneid = zone->zone_id; uniqid = zone->zone_uniqid; + arg.status = zone->zone_init_status; /* * zoneadmd may be down, but at least we can empty out the zone. * We can ignore the return value of zone_empty() since we're called @@ -6781,7 +7479,7 @@ zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp) * zone_ki_call_zoneadmd() will do a more thorough job of this * later. */ - killall(zone->zone_id); + killall(zone->zone_id, B_FALSE); /* * Now, create the thread to contact zoneadmd and do the rest of the * work. This thread can't be created in our zone otherwise @@ -6844,16 +7542,15 @@ zone_shutdown_global(void) } /* - * Returns true if the named dataset is visible in the current zone. + * Returns true if the named dataset is visible in the specified zone. * The 'write' parameter is set to 1 if the dataset is also writable. */ int -zone_dataset_visible(const char *dataset, int *write) +zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write) { static int zfstype = -1; zone_dataset_t *zd; size_t len; - zone_t *zone = curproc->p_zone; const char *name = NULL; vfs_t *vfsp = NULL; @@ -6921,7 +7618,8 @@ zone_dataset_visible(const char *dataset, int *write) vfs_list_read_lock(); vfsp = zone->zone_vfslist; do { - ASSERT(vfsp); + if (vfsp == NULL) + break; if (vfsp->vfs_fstype == zfstype) { name = refstr_value(vfsp->vfs_resource); @@ -6958,6 +7656,18 @@ zone_dataset_visible(const char *dataset, int *write) } /* + * Returns true if the named dataset is visible in the current zone. + * The 'write' parameter is set to 1 if the dataset is also writable. 
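zone_dataset_visible_inzone() decides visibility largely by prefix-matching the requested dataset against the zone's delegated datasets: the delegated dataset and its children are writable, while ancestors are visible read-only. The sketch below models only that prefix logic in userland; the delegated[] list, dataset_visible() helper and sample names are hypothetical, and the real routine additionally walks the zone's mounted ZFS filesystems and handles snapshot names.

/* Userland sketch; names and data are invented for illustration. */
#include <stdio.h>
#include <string.h>

static const char *delegated[] = { "tank/zones/web/data", NULL };

/* Returns 1 if visible; *write is set to 1 only if also writable. */
static int
dataset_visible(const char *ds, int *write)
{
	size_t qlen = strlen(ds);
	size_t dlen;
	int i;

	for (i = 0; delegated[i] != NULL; i++) {
		const char *d = delegated[i];

		dlen = strlen(d);

		/* The delegated dataset itself, or any child of it: rw. */
		if (strncmp(ds, d, dlen) == 0 &&
		    (ds[dlen] == '\0' || ds[dlen] == '/')) {
			*write = 1;
			return (1);
		}

		/* An ancestor of the delegated dataset: visible, ro. */
		if (strncmp(d, ds, qlen) == 0 && d[qlen] == '/') {
			*write = 0;
			return (1);
		}
	}
	return (0);
}

int
main(void)
{
	const char *q[] = { "tank", "tank/zones/web/data/db", "tank/other" };
	int i, w;

	for (i = 0; i < 3; i++) {
		if (dataset_visible(q[i], &w))
			printf("%-24s visible, %s\n", q[i], w ? "rw" : "ro");
		else
			printf("%-24s not visible\n", q[i]);
	}
	return (0);
}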
+ */ +int +zone_dataset_visible(const char *dataset, int *write) +{ + zone_t *zone = curproc->p_zone; + + return (zone_dataset_visible_inzone(zone, dataset, write)); +} + +/* * zone_find_by_any_path() - * * kernel-private routine similar to zone_find_by_path(), but which @@ -7059,6 +7769,27 @@ zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid) zone_t *zone; zone_t *thiszone; + /* + * Only the GZ may add a datalink to a zone's list. + */ + if (getzoneid() != GLOBAL_ZONEID) + return (set_errno(EPERM)); + + /* + * Only a process with the datalink config priv may add a + * datalink to a zone's list. + */ + if (secpolicy_dl_config(CRED()) != 0) + return (set_errno(EPERM)); + + /* + * When links exist in the GZ, they aren't added to the GZ's + * zone_dl_list. We must enforce this because link_activate() + * depends on zone_check_datalink() returning only NGZs. + */ + if (zoneid == GLOBAL_ZONEID) + return (set_errno(EINVAL)); + if ((thiszone = zone_find_by_id(zoneid)) == NULL) return (set_errno(ENXIO)); @@ -7091,6 +7822,26 @@ zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid) zone_t *zone; int err = 0; + /* + * Only the GZ may remove a datalink from a zone's list. + */ + if (getzoneid() != GLOBAL_ZONEID) + return (set_errno(EPERM)); + + /* + * Only a process with the datalink config priv may remove a + * datalink from a zone's list. + */ + if (secpolicy_dl_config(CRED()) != 0) + return (set_errno(EPERM)); + + /* + * If we can't add a datalink to the GZ's zone_dl_list then we + * certainly can't remove one either. + */ + if (zoneid == GLOBAL_ZONEID) + return (set_errno(EINVAL)); + if ((zone = zone_find_by_id(zoneid)) == NULL) return (set_errno(EINVAL)); @@ -7108,25 +7859,63 @@ zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid) } /* - * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned - * the linkid. Otherwise we just check if the specified zoneidp has been - * assigned the supplied linkid. + * + * This function may be used in two ways: + * + * 1. to get the zoneid of the zone this link is under, or + * + * 2. to verify that the link is under a specific zone. + * + * The first use is achieved by passing a zoneid of ALL_ZONES. The + * function then iterates the datalink list of every zone on the + * system until it finds the linkid. If the linkid is found then the + * function returns 0 and zoneidp is updated. Otherwise, ENXIO is + * returned and zoneidp is not modified. The use of ALL_ZONES is + * limited to callers in the GZ to prevent leaking information to + * NGZs. If an NGZ passes ALL_ZONES its query is implicitly changed + * to the second type in the list above. + * + * The second use is achieved by passing a specific zoneid. The GZ can + * use this to verify a link is under a particular zone. An NGZ can + * use this to verify a link is under itself. But an NGZ cannot use + * this to determine if a link is under some other zone as that would + * result in information leakage. If the link exists under the zone + * then 0 is returned. Otherwise, ENXIO is returned. */ int zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid) { zone_t *zone; + zoneid_t zoneid = *zoneidp; + zoneid_t caller = getzoneid(); int err = ENXIO; - if (*zoneidp != ALL_ZONES) { - if ((zone = zone_find_by_id(*zoneidp)) != NULL) { - if (zone_dl_exists(zone, linkid)) + /* + * Only the GZ may enquire about all zones; an NGZ may only + * enquire about itself.
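The two calling conventions described in the comment above can be illustrated with a toy model. Nothing below is kernel code; check_datalink(), the zone table and the link ids are made up, but the contract follows the description: ALL_ZONES (global-zone callers) performs a lookup and reports the owning zone, while a specific zoneid only verifies ownership.

/* Userland sketch; the table and helper are hypothetical. */
#include <stdio.h>
#include <errno.h>

#define	ALL_ZONES	(-1)
typedef int zoneid_t;
typedef unsigned int datalink_id_t;

/* A made-up ownership table: one link per zone for brevity. */
static struct {
	zoneid_t	z_id;
	datalink_id_t	z_link;
} zones[] = { { 3, 100 }, { 7, 200 } };

static int
check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
{
	size_t i;

	for (i = 0; i < sizeof (zones) / sizeof (zones[0]); i++) {
		if (zones[i].z_link != linkid)
			continue;
		if (*zoneidp == ALL_ZONES || *zoneidp == zones[i].z_id) {
			*zoneidp = zones[i].z_id;	/* mode 1: report owner */
			return (0);
		}
	}
	return (ENXIO);		/* not found, or not under the given zone */
}

int
main(void)
{
	zoneid_t zid;

	zid = ALL_ZONES;	/* mode 1: which zone is link 200 under? */
	if (check_datalink(&zid, 200) == 0)
		printf("link 200 is under zone %d\n", zid);

	zid = 3;		/* mode 2: is link 200 under zone 3? */
	printf("link 200 under zone 3: %s\n",
	    check_datalink(&zid, 200) == 0 ? "yes" : "no");
	return (0);
}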
+ */ + if (zoneid == ALL_ZONES && caller != GLOBAL_ZONEID) + zoneid = caller; + + if (zoneid != caller && caller != GLOBAL_ZONEID) + return (err); + + if (zoneid != ALL_ZONES) { + if ((zone = zone_find_by_id(zoneid)) != NULL) { + if (zone_dl_exists(zone, linkid)) { + /* + * We need to set this in case an NGZ + * passes ALL_ZONES. + */ + *zoneidp = zoneid; err = 0; + } zone_rele(zone); } return (err); } + ASSERT(caller == GLOBAL_ZONEID); mutex_enter(&zonehash_lock); for (zone = list_head(&zone_active); zone != NULL; zone = list_next(&zone_active, zone)) { @@ -7137,6 +7926,7 @@ zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid) } } mutex_exit(&zonehash_lock); + return (err); } @@ -7157,6 +7947,12 @@ zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray) zone_dl_t *zdl; datalink_id_t *idptr = idarray; + /* + * Only the GZ or the owning zone may look at the datalink list. + */ + if ((getzoneid() != GLOBAL_ZONEID) && (getzoneid() != zoneid)) + return (set_errno(EPERM)); + if (copyin(nump, &dlcount, sizeof (dlcount)) != 0) return (set_errno(EFAULT)); if ((zone = zone_find_by_id(zoneid)) == NULL) @@ -7182,6 +7978,13 @@ mutex_exit(&zone->zone_lock); zone_rele(zone); + /* + * Prevent returning negative nump values -- we should never + * have this many links anyway. + */ + if (num > INT_MAX) + return (set_errno(EOVERFLOW)); + /* Increased or decreased, caller should be notified. */ if (num != dlcount) { if (copyout(&num, nump, sizeof (num)) != 0) @@ -7395,3 +8198,231 @@ done: else return (0); } + +static void +zone_incr_capped(zoneid_t zid) +{ + zone_persist_t *zp = &zone_pdata[zid]; + + /* See if over (unlimited is UINT32_MAX), or already marked that way. */ + if (zp->zpers_pg_cnt <= zp->zpers_pg_limit || zp->zpers_over == 1) { + return; + } + + mutex_enter(&zone_physcap_lock); + /* Recheck setting under mutex */ + if (zp->zpers_pg_cnt > zp->zpers_pg_limit && zp->zpers_over == 0) { + zp->zpers_over = 1; + zp->zpers_nover++; + zone_num_over_cap++; + DTRACE_PROBE1(zone__over__pcap, zoneid_t, zid); + } + mutex_exit(&zone_physcap_lock); +} + +/* + * We want some hysteresis when the zone is going under its cap so that we're + * not continuously toggling page scanning back and forth by a single page + * around the cap. Using ~1% of the zone's page limit seems to be a good + * quantity. This table shows various zone memory caps and the number of + * pages (assuming a 4k page size). Given this, we choose to shift the page + * limit by 7 places to get a hysteresis that is slightly less than 1%. + * + * cap pages pages 1% shift7 shift7 + * 128M 32768 0x0008000 327 256 0x00100 + * 512M 131072 0x0020000 1310 1024 0x00400 + * 1G 262144 0x0040000 2621 2048 0x00800 + * 4G 1048576 0x0100000 10485 8192 0x02000 + * 8G 2097152 0x0200000 20971 16384 0x04000 + * 16G 4194304 0x0400000 41943 32768 0x08000 + * 32G 8388608 0x0800000 83886 65536 0x10000 + * 64G 16777216 0x1000000 167772 131072 0x20000 + */ +static void +zone_decr_capped(zoneid_t zid) +{ + zone_persist_t *zp = &zone_pdata[zid]; + uint32_t adjusted_limit; + + /* + * See if under, or already marked that way. There is no need to + * check for an unlimited cap (zpers_pg_limit == UINT32_MAX) + * since we'll never set zpers_over in zone_incr_capped(). + */ + if (zp->zpers_over == 0 || zp->zpers_pg_cnt >= zp->zpers_pg_limit) { + return; + } + + adjusted_limit = zp->zpers_pg_limit - (zp->zpers_pg_limit >> 7); + + /* Recheck, accounting for our hysteresis.
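The shift-by-7 hysteresis is easy to verify with a few lines of arithmetic. The following userland sketch reproduces a few rows of the table above, assuming 4 KiB pages; it is illustrative only.

/* Userland sketch reproducing the hysteresis table rows. */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Caps in MiB; with 4 KiB pages, pages = MiB * 256. */
	const uint32_t caps_mb[] = { 128, 1024, 8192 };
	size_t i;

	for (i = 0; i < sizeof (caps_mb) / sizeof (caps_mb[0]); i++) {
		uint32_t limit = caps_mb[i] * 256;	/* cap in pages */
		uint32_t slack = limit >> 7;		/* just under 1% */
		uint32_t adjusted = limit - slack;

		printf("%5u MiB cap: %8u pages, hysteresis %6u, "
		    "considered under the cap below %8u pages\n",
		    caps_mb[i], limit, slack, adjusted);
	}
	return (0);
}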
*/ + if (zp->zpers_pg_cnt >= adjusted_limit) { + return; + } + + mutex_enter(&zone_physcap_lock); + /* Recheck under mutex. */ + if (zp->zpers_pg_cnt < adjusted_limit && zp->zpers_over == 1) { + zp->zpers_over = 0; + ASSERT(zone_num_over_cap > 0); + zone_num_over_cap--; + DTRACE_PROBE1(zone__under__pcap, zoneid_t, zid); + } + mutex_exit(&zone_physcap_lock); +} + +/* + * For zone_add_page() and zone_rm_page(), access to the page we're touching is + * controlled by our caller's locking. + * On x86 our callers already did: ASSERT(x86_hm_held(pp)) + * On SPARC our callers already did: ASSERT(sfmmu_mlist_held(pp)) + */ +void +zone_add_page(page_t *pp) +{ + uint_t pcnt; + zone_persist_t *zp; + zoneid_t zid; + + /* Skip pages in segkmem, etc. (KV_KVP, ...) */ + if (PP_ISKAS(pp)) + return; + + ASSERT(!PP_ISFREE(pp)); + + zid = curzone->zone_id; + if (pp->p_zoneid == zid) { + /* Another mapping to this page for this zone, do nothing */ + return; + } + + if (pp->p_szc == 0) { + pcnt = 1; + } else { + /* large page */ + pcnt = page_get_pagecnt(pp->p_szc); + } + + if (pp->p_share == 0) { + /* First mapping to this page. */ + pp->p_zoneid = zid; + zp = &zone_pdata[zid]; + ASSERT(zp->zpers_pg_cnt + pcnt < UINT32_MAX); + atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, pcnt); + zone_incr_capped(zid); + return; + } + + if (pp->p_zoneid != ALL_ZONES) { + /* + * The page is now being shared across a different zone. + * Decrement the original zone's usage. + */ + zid = pp->p_zoneid; + pp->p_zoneid = ALL_ZONES; + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pdata[zid]; + + if (zp->zpers_pg_cnt > 0) { + atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt); + } + zone_decr_capped(zid); + } +} + +void +zone_rm_page(page_t *pp) +{ + uint_t pcnt; + zone_persist_t *zp; + zoneid_t zid; + + /* Skip pages in segkmem, etc. (KV_KVP, ...) */ + if (PP_ISKAS(pp)) + return; + + zid = pp->p_zoneid; + if (zid == ALL_ZONES || pp->p_share != 0) + return; + + /* This is the last mapping to the page for a zone. */ + if (pp->p_szc == 0) { + pcnt = 1; + } else { + /* large page */ + pcnt = (int64_t)page_get_pagecnt(pp->p_szc); + } + + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pdata[zid]; + if (zp->zpers_pg_cnt > 0) { + atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt); + } + zone_decr_capped(zid); + pp->p_zoneid = ALL_ZONES; +} + +void +zone_pageout_stat(int zid, zone_pageout_op_t op) +{ + zone_persist_t *zp; + + if (zid == ALL_ZONES) + return; + + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pdata[zid]; + +#ifndef DEBUG + atomic_add_64(&zp->zpers_pg_out, 1); +#else + switch (op) { + case ZPO_DIRTY: + atomic_add_64(&zp->zpers_pg_fsdirty, 1); + break; + case ZPO_FS: + atomic_add_64(&zp->zpers_pg_fs, 1); + break; + case ZPO_ANON: + atomic_add_64(&zp->zpers_pg_anon, 1); + break; + case ZPO_ANONDIRTY: + atomic_add_64(&zp->zpers_pg_anondirty, 1); + break; + default: + cmn_err(CE_PANIC, "Invalid pageout operator %d", op); + break; + } +#endif +} + +/* + * Return the zone's physical memory cap and current free memory (in pages). + */ +void +zone_get_physmem_data(int zid, pgcnt_t *memcap, pgcnt_t *free) +{ + zone_persist_t *zp; + + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pdata[zid]; + + /* + * If memory or swap limits are set on the zone, use those, otherwise + * use the system values. physmem and freemem are also in pages. 
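The net effect of zone_add_page() and zone_rm_page() above is that a page is charged to a zone only while exactly one zone maps it; once a second zone maps the page, the original owner is credited back and the page is attributed to ALL_ZONES. The userland sketch below models that policy with a toy page structure; add_mapping(), rm_mapping() and the counters are invented, and locking, large pages and kernel (kas) pages are omitted.

/* Userland sketch of the single-owner accounting policy; all names invented. */
#include <stdio.h>

#define	ALL_ZONES	(-1)

struct toy_page {
	int	p_zoneid;	/* owning zone, or ALL_ZONES */
	int	p_share;	/* number of existing mappings */
};

static unsigned int zone_pages[8];	/* per-zone resident page counts */

static void
add_mapping(struct toy_page *pp, int zid)
{
	if (pp->p_zoneid == zid) {
		/* Another mapping from the same zone: nothing to do. */
	} else if (pp->p_share == 0) {
		/* First mapping to the page: charge this zone. */
		pp->p_zoneid = zid;
		zone_pages[zid]++;
	} else if (pp->p_zoneid != ALL_ZONES) {
		/* Now shared across zones: credit the original owner. */
		zone_pages[pp->p_zoneid]--;
		pp->p_zoneid = ALL_ZONES;
	}
	pp->p_share++;
}

static void
rm_mapping(struct toy_page *pp)
{
	pp->p_share--;
	if (pp->p_share == 0 && pp->p_zoneid != ALL_ZONES) {
		/* Last mapping went away: credit the owning zone. */
		zone_pages[pp->p_zoneid]--;
		pp->p_zoneid = ALL_ZONES;
	}
}

int
main(void)
{
	struct toy_page pg = { ALL_ZONES, 0 };

	add_mapping(&pg, 3);	/* first mapping: zone 3 charged */
	add_mapping(&pg, 3);	/* same zone again: no change */
	printf("zone 3 maps twice: zone3=%u zone5=%u\n",
	    zone_pages[3], zone_pages[5]);
	add_mapping(&pg, 5);	/* shared: zone 3 credited back */
	printf("zone 5 also maps:  zone3=%u zone5=%u\n",
	    zone_pages[3], zone_pages[5]);
	rm_mapping(&pg);
	rm_mapping(&pg);
	rm_mapping(&pg);
	return (0);
}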
+ */ + if (zp->zpers_pg_limit == UINT32_MAX) { + *memcap = physmem; + *free = freemem; + } else { + int64_t freemem; + + *memcap = (pgcnt_t)zp->zpers_pg_limit; + freemem = zp->zpers_pg_limit - zp->zpers_pg_cnt; + if (freemem > 0) { + *free = (pgcnt_t)freemem; + } else { + *free = (pgcnt_t)0; + } + } +}
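For reference, the free-memory calculation that zone_get_physmem_data() performs can be modeled in userland as follows. system_physmem and system_freemem are stand-ins for the kernel's physmem and freemem globals; as in the code above, a page limit of UINT32_MAX means the zone is uncapped and the system-wide values are reported, otherwise the remaining headroom under the cap is clamped at zero.

/* Userland sketch; the globals and helper name are invented. */
#include <stdio.h>
#include <stdint.h>

static uint64_t system_physmem = 4194304;	/* 16 GiB of 4 KiB pages */
static uint64_t system_freemem = 1048576;	/* 4 GiB of 4 KiB pages */

static void
get_physmem_data(uint32_t pg_limit, uint32_t pg_cnt,
    uint64_t *memcap, uint64_t *freep)
{
	if (pg_limit == UINT32_MAX) {
		/* Uncapped zone: report the system-wide values. */
		*memcap = system_physmem;
		*freep = system_freemem;
	} else {
		int64_t avail = (int64_t)pg_limit - (int64_t)pg_cnt;

		*memcap = pg_limit;
		*freep = (avail > 0) ? (uint64_t)avail : 0;	/* clamp */
	}
}

int
main(void)
{
	uint64_t cap, avail;

	/* A zone with a 1 GiB cap that is currently over it. */
	get_physmem_data(262144, 270000, &cap, &avail);
	printf("cap %llu pages, free %llu pages\n",
	    (unsigned long long)cap, (unsigned long long)avail);
	return (0);
}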