Diffstat (limited to 'usr/src/uts/common/os')
45 files changed, 3484 insertions, 1046 deletions
| diff --git a/usr/src/uts/common/os/acct.c b/usr/src/uts/common/os/acct.c index e598e0d08d..891c4e0836 100644 --- a/usr/src/uts/common/os/acct.c +++ b/usr/src/uts/common/os/acct.c @@ -20,6 +20,7 @@   */  /*   * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017, Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -47,6 +48,7 @@  #include <sys/time.h>  #include <sys/msacct.h>  #include <sys/zone.h> +#include <sys/brand.h>  /*   * Each zone has its own accounting settings (on or off) and associated @@ -373,7 +375,7 @@ acct_compress(ulong_t t)   * On exit, write a record on the accounting file.   */  void -acct(char st) +acct(int st)  {  	struct vnode *vp;  	struct cred *cr; @@ -402,6 +404,21 @@ acct(char st)  	 * This only gets called from exit after all lwp's have exited so no  	 * cred locking is needed.  	 */ + +	/* If there is a brand-specific hook, use it instead */ +	if (ZONE_IS_BRANDED(curzone) && ZBROP(curzone)->b_acct_out != NULL) { +		ZBROP(curzone)->b_acct_out(vp, st); +		mutex_exit(&ag->aclock); +		return; +	} + +	/* +	 * The 'st' status value was traditionally masked this way by our +	 * caller, but we now accept the unmasked value for brand handling. +	 * Zones not using the brand hook mask the status here. +	 */ +	st &= 0xff; +  	p = curproc;  	ua = PTOU(p);  	bcopy(ua->u_comm, ag->acctbuf.ac_comm, sizeof (ag->acctbuf.ac_comm)); diff --git a/usr/src/uts/common/os/brand.c b/usr/src/uts/common/os/brand.c index 0af67f5d98..60e8150a0d 100644 --- a/usr/src/uts/common/os/brand.c +++ b/usr/src/uts/common/os/brand.c @@ -20,6 +20,7 @@   */  /*   * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019 Joyent, Inc.   */  #include <sys/kmem.h> @@ -45,7 +46,7 @@ struct brand_mach_ops native_mach_ops  = {  };  #else /* !__sparcv9 */  struct brand_mach_ops native_mach_ops  = { -		NULL, NULL, NULL, NULL +		NULL, NULL, NULL, NULL, NULL, NULL, NULL  };  #endif /* !__sparcv9 */ @@ -53,7 +54,8 @@ brand_t native_brand = {  		BRAND_VER_1,  		"native",  		NULL, -		&native_mach_ops +		&native_mach_ops, +		0  };  /* @@ -310,46 +312,115 @@ brand_unregister_zone(struct brand *bp)  	mutex_exit(&brand_list_lock);  } -void -brand_setbrand(proc_t *p) +int +brand_setbrand(proc_t *p, boolean_t lwps_ok)  {  	brand_t *bp = p->p_zone->zone_brand; +	void *brand_data = NULL; -	ASSERT(bp != NULL); -	ASSERT(p->p_brand == &native_brand); +	VERIFY(MUTEX_NOT_HELD(&p->p_lock)); +	VERIFY(bp != NULL);  	/* -	 * We should only be called from exec(), when we know the process -	 * is single-threaded. +	 * Process branding occurs during fork() and exec().  When it happens +	 * during fork(), the LWP count will always be 0 since branding is +	 * performed as part of getproc(), before LWPs have been associated. +	 * The same is not true during exec(), where a multi-LWP process may +	 * undergo branding just prior to gexec(). This is to ensure +	 * exec-related brand hooks are available.  While it may seem +	 * complicated to brand a multi-LWP process, the two possible outcomes +	 * simplify things: +	 * +	 * 1. The exec() succeeds:  LWPs besides the caller will be killed and +	 *    any further branding will occur in a single-LWP context. +	 * 2. The exec() fails: The process will be promptly unbranded since +	 *    the hooks are no longer needed. 
+	 * +	 * To prevent inconsistent brand state from being encountered during +	 * the exec(), LWPs beyond the caller which are associated with this +	 * process must be held temporarily.  They will be released either when +	 * they are killed in the exec() success, or when the brand is cleared +	 * after exec() failure.  	 */ -	ASSERT(p->p_tlist == p->p_tlist->t_forw); +	if (lwps_ok) { +		/* +		 * We've been called from a exec() context tolerating the +		 * existence of multiple LWPs during branding is necessary. +		 */ +		VERIFY(p == curproc); +		VERIFY(p->p_tlist != NULL); +		if (p->p_tlist != p->p_tlist->t_forw) { +			/* +			 * Multiple LWPs are present.  Hold all but the caller. +			 */ +			if (!holdlwps(SHOLDFORK1)) { +				return (-1); +			} +		} +	} else { +		/* +		 * Processes branded during fork() should not have LWPs at all. +		 */ +		VERIFY(p->p_tlist == NULL); +	} + +	if (bp->b_data_size > 0) { +		brand_data = kmem_zalloc(bp->b_data_size, KM_SLEEP); +	} + +	mutex_enter(&p->p_lock); +	ASSERT(!PROC_IS_BRANDED(p));  	p->p_brand = bp; +	p->p_brand_data = brand_data;  	ASSERT(PROC_IS_BRANDED(p));  	BROP(p)->b_setbrand(p); +	mutex_exit(&p->p_lock); +	return (0);  }  void -brand_clearbrand(proc_t *p, boolean_t no_lwps) +brand_clearbrand(proc_t *p, boolean_t lwps_ok)  {  	brand_t *bp = p->p_zone->zone_brand; -	klwp_t *lwp = NULL; -	ASSERT(bp != NULL); -	ASSERT(!no_lwps || (p->p_tlist == NULL)); +	void *brand_data; -	/* -	 * If called from exec_common() or proc_exit(), -	 * we know the process is single-threaded. -	 * If called from fork_fail, p_tlist is NULL. -	 */ -	if (!no_lwps) { -		ASSERT(p->p_tlist == p->p_tlist->t_forw); -		lwp = p->p_tlist->t_lwp; -	} +	VERIFY(MUTEX_NOT_HELD(&p->p_lock)); +	VERIFY(bp != NULL); +	VERIFY(PROC_IS_BRANDED(p)); -	ASSERT(PROC_IS_BRANDED(p)); -	BROP(p)->b_proc_exit(p, lwp); +	if (BROP(p)->b_clearbrand != NULL) +		BROP(p)->b_clearbrand(p, lwps_ok); + +	mutex_enter(&p->p_lock);  	p->p_brand = &native_brand; +	brand_data = p->p_brand_data; +	p->p_brand_data = NULL; + +	if (lwps_ok) { +		VERIFY(p == curproc); +		/* +		 * A process with multiple LWPs is being de-branded after +		 * failing an exec.  The other LWPs were held as part of the +		 * procedure, so they must be resumed now. +		 */ +		if (p->p_tlist != NULL && p->p_tlist != p->p_tlist->t_forw) { +			continuelwps(p); +		} +	} else { +		/* +		 * While clearing the brand, it's ok for one LWP to be present. +		 * This happens when a native binary is executed inside a +		 * branded zone, since the brand will be removed during the +		 * course of a successful exec. +		 */ +		VERIFY(p->p_tlist == NULL || p->p_tlist == p->p_tlist->t_forw); +	} +	mutex_exit(&p->p_lock); + +	if (brand_data != NULL) { +		kmem_free(brand_data, bp->b_data_size); +	}  }  #if defined(__sparcv9) @@ -483,7 +554,7 @@ brand_solaris_cmd(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3,  		return (ENOSYS);  	/* For all other operations this must be a branded process. 
*/ -	if (p->p_brand == &native_brand) +	if (!PROC_IS_BRANDED(p))  		return (ENOSYS);  	ASSERT(p->p_brand == pbrand); @@ -600,16 +671,16 @@ restoreexecenv(struct execenv *ep, stack_t *sp)  /*ARGSUSED*/  int  brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, -    intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file, -    cred_t *cred, int brand_action, struct brand *pbrand, char *bname, -    char *brandlib, char *brandlib32, char *brandlinker, char *brandlinker32) +    intpdata_t *idatap, int level, size_t *execsz, int setid, +    caddr_t exec_file, cred_t *cred, int *brand_action, struct brand *pbrand, +    char *bname, char *brandlib, char *brandlib32)  {  	vnode_t		*nvp;  	Ehdr		ehdr;  	Addr		uphdr_vaddr;  	intptr_t	voffset; -	int		interp; +	char		*interp;  	int		i, err;  	struct execenv	env;  	struct execenv	origenv; @@ -619,7 +690,6 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  	klwp_t		*lwp = ttolwp(curthread);  	brand_proc_data_t	*spd;  	brand_elf_data_t sed, *sedp; -	char		*linker;  	uintptr_t	lddata; /* lddata of executable's linker */  	ASSERT(curproc->p_brand == pbrand); @@ -636,12 +706,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  	 */  	if (args->to_model == DATAMODEL_NATIVE) {  		args->emulator = brandlib; -		linker = brandlinker;  	}  #if defined(_LP64)  	else {  		args->emulator = brandlib32; -		linker = brandlinker32;  	}  #endif  /* _LP64 */ @@ -725,7 +793,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  	if (args->to_model == DATAMODEL_NATIVE) {  		err = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr,  		    &voffset, exec_file, &interp, &env.ex_bssbase, -		    &env.ex_brkbase, &env.ex_brksize, NULL); +		    &env.ex_brkbase, &env.ex_brksize, NULL, NULL);  	}  #if defined(_LP64)  	else { @@ -733,7 +801,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  		Elf32_Addr uphdr_vaddr32;  		err = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32,  		    &voffset, exec_file, &interp, &env.ex_bssbase, -		    &env.ex_brkbase, &env.ex_brksize, NULL); +		    &env.ex_brkbase, &env.ex_brksize, NULL, NULL);  		Ehdr32to64(&ehdr32, &ehdr);  		if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -744,6 +812,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  #endif  /* _LP64 */  	if (err != 0) {  		restoreexecenv(&origenv, &orig_sigaltstack); + +		if (interp != NULL) +			kmem_free(interp, MAXPATHLEN); +  		return (err);  	} @@ -761,7 +833,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  	sedp->sed_phent = ehdr.e_phentsize;  	sedp->sed_phnum = ehdr.e_phnum; -	if (interp) { +	if (interp != NULL) {  		if (ehdr.e_type == ET_DYN) {  			/*  			 * This is a shared object executable, so we @@ -777,16 +849,20 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  		 * it in and store relevant information about it in the  		 * aux vector, where the brand library can find it.  		 
*/ -		if ((err = lookupname(linker, UIO_SYSSPACE, +		if ((err = lookupname(interp, UIO_SYSSPACE,  		    FOLLOW, NULLVPP, &nvp)) != 0) { -			uprintf("%s: not found.", brandlinker); +			uprintf("%s: not found.", interp);  			restoreexecenv(&origenv, &orig_sigaltstack); +			kmem_free(interp, MAXPATHLEN);  			return (err);  		} + +		kmem_free(interp, MAXPATHLEN); +  		if (args->to_model == DATAMODEL_NATIVE) {  			err = mapexec_brand(nvp, args, &ehdr,  			    &uphdr_vaddr, &voffset, exec_file, &interp, -			    NULL, NULL, NULL, &lddata); +			    NULL, NULL, NULL, &lddata, NULL);  		}  #if defined(_LP64)  		else { @@ -794,7 +870,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  			Elf32_Addr uphdr_vaddr32;  			err = mapexec32_brand(nvp, args, &ehdr32,  			    &uphdr_vaddr32, &voffset, exec_file, &interp, -			    NULL, NULL, NULL, &lddata); +			    NULL, NULL, NULL, &lddata, NULL);  			Ehdr32to64(&ehdr32, &ehdr);  			if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -934,9 +1010,9 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args,  	/*  	 * Third, the /proc aux vectors set up by elfexec() point to -	 * brand emulation library and it's linker.  Copy these to the +	 * brand emulation library and its linker.  Copy these to the  	 * /proc brand specific aux vector, and update the regular -	 * /proc aux vectors to point to the executable (and it's +	 * /proc aux vectors to point to the executable (and its  	 * linker).  This will enable debuggers to access the  	 * executable via the usual /proc or elf notes aux vectors.  	 * @@ -1078,55 +1154,31 @@ brand_solaris_freelwp(klwp_t *l, struct brand *pbrand)  }  /*ARGSUSED*/ -int +void  brand_solaris_initlwp(klwp_t *l, struct brand *pbrand)  {  	ASSERT(l->lwp_procp->p_brand == pbrand);  	ASSERT(l->lwp_procp->p_brand_data != NULL);  	ASSERT(l->lwp_brand == NULL);  	l->lwp_brand = (void *)-1; -	return (0);  }  /*ARGSUSED*/  void  brand_solaris_lwpexit(klwp_t *l, struct brand *pbrand)  { -	proc_t  *p = l->lwp_procp; -  	ASSERT(l->lwp_procp->p_brand == pbrand);  	ASSERT(l->lwp_procp->p_brand_data != NULL);  	ASSERT(l->lwp_brand != NULL); - -	/* -	 * We should never be called for the last thread in a process. -	 * (That case is handled by brand_solaris_proc_exit().) -	 * Therefore this lwp must be exiting from a multi-threaded -	 * process. -	 */ -	ASSERT(p->p_tlist != p->p_tlist->t_forw); - -	l->lwp_brand = NULL;  }  /*ARGSUSED*/  void -brand_solaris_proc_exit(struct proc *p, klwp_t *l, struct brand *pbrand) +brand_solaris_proc_exit(struct proc *p, struct brand *pbrand)  {  	ASSERT(p->p_brand == pbrand);  	ASSERT(p->p_brand_data != NULL); -	/* -	 * When called from proc_exit(), we know that process is -	 * single-threaded and free our lwp brand data. -	 * otherwise just free p_brand_data and return. 
-	 */ -	if (l != NULL) { -		ASSERT(p->p_tlist == p->p_tlist->t_forw); -		ASSERT(p->p_tlist->t_lwp == l); -		(void) brand_solaris_freelwp(l, pbrand); -	} -  	/* upon exit, free our proc brand data */  	kmem_free(p->p_brand_data, sizeof (brand_proc_data_t));  	p->p_brand_data = NULL; @@ -1145,5 +1197,4 @@ brand_solaris_setbrand(proc_t *p, struct brand *pbrand)  	ASSERT(p->p_tlist == p->p_tlist->t_forw);  	p->p_brand_data = kmem_zalloc(sizeof (brand_proc_data_t), KM_SLEEP); -	(void) brand_solaris_initlwp(p->p_tlist->t_lwp, pbrand);  } diff --git a/usr/src/uts/common/os/clock_highres.c b/usr/src/uts/common/os/clock_highres.c index 805813037d..1280c8a1b6 100644 --- a/usr/src/uts/common/os/clock_highres.c +++ b/usr/src/uts/common/os/clock_highres.c @@ -25,7 +25,7 @@   */  /* - * Copyright (c) 2015, Joyent Inc. All rights reserved. + * Copyright 2016, Joyent Inc.   */  #include <sys/timer.h> @@ -41,6 +41,9 @@  static clock_backend_t clock_highres; +/* minimum non-privileged interval (200us) */ +long clock_highres_interval_min = 200000; +  /*ARGSUSED*/  static int  clock_highres_settime(timespec_t *ts) @@ -68,17 +71,6 @@ clock_highres_getres(timespec_t *ts)  static int  clock_highres_timer_create(itimer_t *it, void (*fire)(itimer_t *))  { -	/* -	 * CLOCK_HIGHRES timers of sufficiently high resolution can deny -	 * service; only allow privileged users to create such timers. -	 * Sites that do not wish to have this restriction should -	 * give users the "proc_clock_highres" privilege. -	 */ -	if (secpolicy_clock_highres(CRED()) != 0) { -		it->it_arg = NULL; -		return (EPERM); -	} -  	it->it_arg = kmem_zalloc(sizeof (cyclic_id_t), KM_SLEEP);  	it->it_fire = fire; @@ -111,6 +103,49 @@ clock_highres_timer_settime(itimer_t *it, int flags,  	cpu_t *cpu;  	cpupart_t *cpupart;  	int pset; +	boolean_t value_need_clamp = B_FALSE; +	boolean_t intval_need_clamp = B_FALSE; +	cred_t *cr = CRED(); +	struct itimerspec clamped; + +	/* +	 * CLOCK_HIGHRES timers of sufficiently high resolution can deny +	 * service; only allow privileged users to create such timers. +	 * Non-privileged users (those without the "proc_clock_highres" +	 * privilege) can create timers with lower resolution but if they +	 * attempt to use a very low time value (< 200us) then their +	 * timer will be clamped at 200us. 
+	 */ +	if (when->it_value.tv_sec == 0 && +	    when->it_value.tv_nsec > 0 && +	    when->it_value.tv_nsec < clock_highres_interval_min) +		value_need_clamp = B_TRUE; + +	if (when->it_interval.tv_sec == 0 && +	    when->it_interval.tv_nsec > 0 && +	    when->it_interval.tv_nsec < clock_highres_interval_min) +		intval_need_clamp = B_TRUE; + +	if ((value_need_clamp || intval_need_clamp) && +	    secpolicy_clock_highres(cr) != 0) { +		clamped.it_value.tv_sec = when->it_value.tv_sec; +		clamped.it_interval.tv_sec = when->it_interval.tv_sec; + +		if (value_need_clamp) { +			clamped.it_value.tv_nsec = clock_highres_interval_min; +		} else { +			clamped.it_value.tv_nsec = when->it_value.tv_nsec; +		} + +		if (intval_need_clamp) { +			clamped.it_interval.tv_nsec = +			    clock_highres_interval_min; +		} else { +			clamped.it_interval.tv_nsec = when->it_interval.tv_nsec; +		} + +		when = &clamped; +	}  	cyctime.cyt_when = ts2hrt(&when->it_value);  	cyctime.cyt_interval = ts2hrt(&when->it_interval); diff --git a/usr/src/uts/common/os/contract.c b/usr/src/uts/common/os/contract.c index 909a6c2860..1a3502a710 100644 --- a/usr/src/uts/common/os/contract.c +++ b/usr/src/uts/common/os/contract.c @@ -21,6 +21,7 @@  /*   * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2016 Joyent, Inc.   */  /*   * Copyright (c) 2017 by Delphix. All rights reserved. @@ -290,7 +291,10 @@ contract_ctor(contract_t *ct, ct_type_t *type, ct_template_t *tmpl, void *data,  	avl_index_t where;  	klwp_t *curlwp = ttolwp(curthread); -	ASSERT(author == curproc); +	/* +	 * It's possible that author is not curproc if the zone is creating +	 * a new process as a child of zsched. +	 */  	mutex_init(&ct->ct_lock, NULL, MUTEX_DEFAULT, NULL);  	mutex_init(&ct->ct_reflock, NULL, MUTEX_DEFAULT, NULL); diff --git a/usr/src/uts/common/os/core.c b/usr/src/uts/common/os/core.c index d5e272c16a..a147b1cf0f 100644 --- a/usr/src/uts/common/os/core.c +++ b/usr/src/uts/common/os/core.c @@ -21,7 +21,7 @@  /*   * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent Inc. All rights reserved. + * Copyright 2019 Joyent Inc.   * Copyright (c) 2016 by Delphix. All rights reserved.   */ @@ -125,6 +125,7 @@ remove_core_file(char *fp, enum core_types core_type)  	/*  	 * Determine what rootvp to use.  	 */ +	mutex_enter(&curproc->p_lock);  	if (core_type == CORE_PROC) {  		rootvp = (PTOU(curproc)->u_rdir == NULL ?  		    curproc->p_zone->zone_rootvp : PTOU(curproc)->u_rdir); @@ -140,6 +141,7 @@ remove_core_file(char *fp, enum core_types core_type)  	VN_HOLD(startvp);  	if (rootvp != rootdir)  		VN_HOLD(rootvp); +	mutex_exit(&curproc->p_lock);  	if ((error = lookuppnvp(&pn, NULL, NO_FOLLOW, &dvp, &vp, rootvp,  	    startvp, CRED())) != 0) {  		pn_free(&pn); @@ -793,7 +795,7 @@ clock_t	core_delay_usec = 10000;   * using core_write() below, and so it has the same failure semantics.   
*/  int -core_seg(proc_t *p, vnode_t *vp, offset_t offset, caddr_t addr, size_t size, +core_seg(proc_t *p, vnode_t *vp, u_offset_t offset, caddr_t addr, size_t size,      rlim64_t rlimit, cred_t *credp)  {  	caddr_t eaddr; @@ -801,6 +803,11 @@ core_seg(proc_t *p, vnode_t *vp, offset_t offset, caddr_t addr, size_t size,  	size_t len;  	int err = 0; +	if (offset > OFF_MAX || offset + size > OFF_MAX || +	    offset + size < offset) { +		return (EOVERFLOW); +	} +  	eaddr = addr + size;  	for (base = addr; base < eaddr; base += len) {  		len = eaddr - base; @@ -841,15 +848,20 @@ core_seg(proc_t *p, vnode_t *vp, offset_t offset, caddr_t addr, size_t size,   * unexpectedly returns zero but no progress has been made, we return ENOSPC.   */  int -core_write(vnode_t *vp, enum uio_seg segflg, offset_t offset, +core_write(vnode_t *vp, enum uio_seg segflg, u_offset_t offset,      const void *buf, size_t len, rlim64_t rlimit, cred_t *credp)  {  	ssize_t resid = len;  	int error = 0; +	if (offset > OFF_MAX || offset + len > OFF_MAX || +	    offset + len < offset) { +		return (EOVERFLOW); +	} +  	while (len != 0) { -		error = vn_rdwr(UIO_WRITE, vp, (caddr_t)buf, len, offset, -		    segflg, 0, rlimit, credp, &resid); +		error = vn_rdwr(UIO_WRITE, vp, (caddr_t)buf, len, +		    (offset_t)offset, segflg, 0, rlimit, credp, &resid);  		if (error != 0)  			break; diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c index 87c0896814..4648dae9dd 100644 --- a/usr/src/uts/common/os/cpu.c +++ b/usr/src/uts/common/os/cpu.c @@ -21,6 +21,7 @@  /*   * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.   * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2018 Joyent, Inc.   */  /* @@ -108,7 +109,8 @@ kmutex_t	cpu_lock;  cpu_t		*cpu_list;		/* list of all CPUs */  cpu_t		*clock_cpu_list;	/* used by clock to walk CPUs */  cpu_t		*cpu_active;		/* list of active CPUs */ -static cpuset_t	cpu_available;		/* set of available CPUs */ +cpuset_t	cpu_active_set;		/* cached set of active CPUs */ +cpuset_t	cpu_available;		/* set of available CPUs */  cpuset_t	cpu_seqid_inuse;	/* which cpu_seqids are in use */  cpu_t		**cpu_seq;		/* ptrs to CPUs, indexed by seq_id */ @@ -386,36 +388,56 @@ force_thread_migrate(kthread_id_t tp)  /*   * Set affinity for a specified CPU. - * A reference count is incremented and the affinity is held until the - * reference count is decremented to zero by thread_affinity_clear(). - * This is so regions of code requiring affinity can be nested. - * Caller needs to ensure that cpu_id remains valid, which can be - * done by holding cpu_lock across this call, unless the caller - * specifies CPU_CURRENT in which case the cpu_lock will be acquired - * by thread_affinity_set and CPU->cpu_id will be the target CPU. + * + * Specifying a cpu_id of CPU_CURRENT, allowed _only_ when setting affinity for + * curthread, will set affinity to the CPU on which the thread is currently + * running.  For other cpu_id values, the caller must ensure that the + * referenced CPU remains valid, which can be done by holding cpu_lock across + * this call. + * + * CPU affinity is guaranteed after return of thread_affinity_set().  If a + * caller setting affinity to CPU_CURRENT requires that its thread not migrate + * CPUs prior to a successful return, it should take extra precautions (such as + * their own call to kpreempt_disable) to ensure that safety. + * + * CPU_BEST can be used to pick a "best" CPU to migrate to, including + * potentially the current CPU. 
+ * + * A CPU affinity reference count is maintained by thread_affinity_set and + * thread_affinity_clear (incrementing and decrementing it, respectively), + * maintaining CPU affinity while the count is non-zero, and allowing regions + * of code which require affinity to be nested.   */  void  thread_affinity_set(kthread_id_t t, int cpu_id)  { -	cpu_t		*cp; -	int		c; +	cpu_t *cp;  	ASSERT(!(t == curthread && t->t_weakbound_cpu != NULL)); -	if ((c = cpu_id) == CPU_CURRENT) { -		mutex_enter(&cpu_lock); -		cpu_id = CPU->cpu_id; +	if (cpu_id == CPU_CURRENT) { +		VERIFY3P(t, ==, curthread); +		kpreempt_disable(); +		cp = CPU; +	} else if (cpu_id == CPU_BEST) { +		VERIFY3P(t, ==, curthread); +		kpreempt_disable(); +		cp = disp_choose_best_cpu(); +	} else { +		/* +		 * We should be asserting that cpu_lock is held here, but +		 * the NCA code doesn't acquire it.  The following assert +		 * should be uncommented when the NCA code is fixed. +		 * +		 * ASSERT(MUTEX_HELD(&cpu_lock)); +		 */ +		VERIFY((cpu_id >= 0) && (cpu_id < NCPU)); +		cp = cpu[cpu_id]; + +		/* user must provide a good cpu_id */ +		VERIFY(cp != NULL);  	} -	/* -	 * We should be asserting that cpu_lock is held here, but -	 * the NCA code doesn't acquire it.  The following assert -	 * should be uncommented when the NCA code is fixed. -	 * -	 * ASSERT(MUTEX_HELD(&cpu_lock)); -	 */ -	ASSERT((cpu_id >= 0) && (cpu_id < NCPU)); -	cp = cpu[cpu_id]; -	ASSERT(cp != NULL);		/* user must provide a good cpu_id */ +  	/*  	 * If there is already a hard affinity requested, and this affinity  	 * conflicts with that, panic. @@ -432,13 +454,14 @@ thread_affinity_set(kthread_id_t t, int cpu_id)  	 * Make sure we're running on the right CPU.  	 */  	if (cp != t->t_cpu || t != curthread) { +		ASSERT(cpu_id != CPU_CURRENT);  		force_thread_migrate(t);	/* drops thread lock */  	} else {  		thread_unlock(t);  	} -	if (c == CPU_CURRENT) -		mutex_exit(&cpu_lock); +	if (cpu_id == CPU_CURRENT || cpu_id == CPU_BEST) +		kpreempt_enable();  }  /* @@ -1473,8 +1496,8 @@ again:	for (loop_count = 0; (*bound_func)(cp, 0); loop_count++) {  				 * Update CPU last ran on if it was this CPU  				 */  				if (t->t_cpu == cp && t->t_bound_cpu != cp) -					t->t_cpu = disp_lowpri_cpu(ncp, -					    t->t_lpl, t->t_pri, NULL); +					t->t_cpu = disp_lowpri_cpu(ncp, t, +					    t->t_pri);  				ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp ||  				    t->t_weakbound_cpu == cp); @@ -1516,10 +1539,9 @@ again:	for (loop_count = 0; (*bound_func)(cp, 0); loop_count++) {  			 * Update CPU last ran on if it was this CPU  			 */ -			if (t->t_cpu == cp && t->t_bound_cpu != cp) { -				t->t_cpu = disp_lowpri_cpu(ncp, -				    t->t_lpl, t->t_pri, NULL); -			} +			if (t->t_cpu == cp && t->t_bound_cpu != cp) +				t->t_cpu = disp_lowpri_cpu(ncp, t, t->t_pri); +  			ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp ||  			    t->t_weakbound_cpu == cp);  			t = t->t_next; @@ -1724,6 +1746,7 @@ cpu_list_init(cpu_t *cp)  	cp->cpu_part = &cp_default;  	CPUSET_ADD(cpu_available, cp->cpu_id); +	CPUSET_ADD(cpu_active_set, cp->cpu_id);  }  /* @@ -1895,6 +1918,7 @@ cpu_add_active_internal(cpu_t *cp)  	cp->cpu_prev_onln = cpu_active->cpu_prev_onln;  	cpu_active->cpu_prev_onln->cpu_next_onln = cp;  	cpu_active->cpu_prev_onln = cp; +	CPUSET_ADD(cpu_active_set, cp->cpu_id);  	if (pp->cp_cpulist) {  		cp->cpu_next_part = pp->cp_cpulist; @@ -1965,6 +1989,7 @@ cpu_remove_active(cpu_t *cp)  	}  	cp->cpu_next_onln = cp;  	cp->cpu_prev_onln = cp; +	CPUSET_DEL(cpu_active_set, cp->cpu_id);  	
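/*
 * Illustrative sketch of the CPU_CURRENT usage pattern described in the new
 * thread_affinity_set() comment above; this is not part of the change itself.
 * Affinity is only guaranteed once the call returns, so a caller that must not
 * migrate beforehand brackets the call with its own kpreempt_disable() and
 * kpreempt_enable() pair, then drops the hold with thread_affinity_clear().
 */
	kpreempt_disable();
	thread_affinity_set(curthread, CPU_CURRENT);	/* take an affinity hold */
	kpreempt_enable();
	/* ... CPU-local work runs here under the affinity reference ... */
	thread_affinity_clear(curthread);		/* release the hold */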
cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;  	cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part; @@ -2704,13 +2729,18 @@ cpu_bind_thread(kthread_id_t tp, processorid_t bind, processorid_t *obind,  	return (0);  } -#if CPUSET_WORDS > 1 -/* - * Functions for implementing cpuset operations when a cpuset is more - * than one word.  On platforms where a cpuset is a single word these - * are implemented as macros in cpuvar.h. - */ +cpuset_t * +cpuset_alloc(int kmflags) +{ +	return (kmem_alloc(sizeof (cpuset_t), kmflags)); +} + +void +cpuset_free(cpuset_t *s) +{ +	kmem_free(s, sizeof (cpuset_t)); +}  void  cpuset_all(cpuset_t *s) @@ -2722,38 +2752,61 @@ cpuset_all(cpuset_t *s)  }  void -cpuset_all_but(cpuset_t *s, uint_t cpu) +cpuset_all_but(cpuset_t *s, const uint_t cpu)  {  	cpuset_all(s);  	CPUSET_DEL(*s, cpu);  }  void -cpuset_only(cpuset_t *s, uint_t cpu) +cpuset_only(cpuset_t *s, const uint_t cpu)  {  	CPUSET_ZERO(*s);  	CPUSET_ADD(*s, cpu);  } +long +cpu_in_set(cpuset_t *s, const uint_t cpu) +{ +	VERIFY(cpu < NCPU); +	return (BT_TEST(s->cpub, cpu)); +} + +void +cpuset_add(cpuset_t *s, const uint_t cpu) +{ +	VERIFY(cpu < NCPU); +	BT_SET(s->cpub, cpu); +} + +void +cpuset_del(cpuset_t *s, const uint_t cpu) +{ +	VERIFY(cpu < NCPU); +	BT_CLEAR(s->cpub, cpu); +} +  int  cpuset_isnull(cpuset_t *s)  {  	int i; -	for (i = 0; i < CPUSET_WORDS; i++) +	for (i = 0; i < CPUSET_WORDS; i++) {  		if (s->cpub[i] != 0)  			return (0); +	}  	return (1);  }  int -cpuset_cmp(cpuset_t *s1, cpuset_t *s2) +cpuset_isequal(cpuset_t *s1, cpuset_t *s2)  {  	int i; -	for (i = 0; i < CPUSET_WORDS; i++) +	for (i = 0; i < CPUSET_WORDS; i++) {  		if (s1->cpub[i] != s2->cpub[i])  			return (0); +	}  	return (1);  } @@ -2822,7 +2875,72 @@ cpuset_bounds(cpuset_t *s, uint_t *smallestid, uint_t *largestid)  	*smallestid = *largestid = CPUSET_NOTINSET;  } -#endif	/* CPUSET_WORDS */ +void +cpuset_atomic_del(cpuset_t *s, const uint_t cpu) +{ +	VERIFY(cpu < NCPU); +	BT_ATOMIC_CLEAR(s->cpub, (cpu)) +} + +void +cpuset_atomic_add(cpuset_t *s, const uint_t cpu) +{ +	VERIFY(cpu < NCPU); +	BT_ATOMIC_SET(s->cpub, (cpu)) +} + +long +cpuset_atomic_xadd(cpuset_t *s, const uint_t cpu) +{ +	long res; + +	VERIFY(cpu < NCPU); +	BT_ATOMIC_SET_EXCL(s->cpub, cpu, res); +	return (res); +} + +long +cpuset_atomic_xdel(cpuset_t *s, const uint_t cpu) +{ +	long res; + +	VERIFY(cpu < NCPU); +	BT_ATOMIC_CLEAR_EXCL(s->cpub, cpu, res); +	return (res); +} + +void +cpuset_or(cpuset_t *dst, cpuset_t *src) +{ +	for (int i = 0; i < CPUSET_WORDS; i++) { +		dst->cpub[i] |= src->cpub[i]; +	} +} + +void +cpuset_xor(cpuset_t *dst, cpuset_t *src) +{ +	for (int i = 0; i < CPUSET_WORDS; i++) { +		dst->cpub[i] ^= src->cpub[i]; +	} +} + +void +cpuset_and(cpuset_t *dst, cpuset_t *src) +{ +	for (int i = 0; i < CPUSET_WORDS; i++) { +		dst->cpub[i] &= src->cpub[i]; +	} +} + +void +cpuset_zero(cpuset_t *dst) +{ +	for (int i = 0; i < CPUSET_WORDS; i++) { +		dst->cpub[i] = 0; +	} +} +  /*   * Unbind threads bound to specified CPU. diff --git a/usr/src/uts/common/os/cred.c b/usr/src/uts/common/os/cred.c index 25727d54c5..0bd6cfd44f 100644 --- a/usr/src/uts/common/os/cred.c +++ b/usr/src/uts/common/os/cred.c @@ -729,6 +729,14 @@ crgetzoneid(const cred_t *cr)  	    cr->cr_zone->zone_id);  } +zoneid_t +crgetzonedid(const cred_t *cr) +{ +	return (cr->cr_zone == NULL ? +	    (cr->cr_uid == -1 ? 
(zoneid_t)-1 : GLOBAL_ZONEID) : +	    cr->cr_zone->zone_did); +} +  projid_t  crgetprojid(const cred_t *cr)  { diff --git a/usr/src/uts/common/os/cyclic.c b/usr/src/uts/common/os/cyclic.c index 0aa54eeaee..316dffc326 100644 --- a/usr/src/uts/common/os/cyclic.c +++ b/usr/src/uts/common/os/cyclic.c @@ -24,7 +24,7 @@   */  /* - * Copyright (c) 2012, Joyent Inc. All rights reserved. + * Copyright 2018 Joyent Inc.   */  /* @@ -112,6 +112,7 @@   *      cyclic_remove()      <-- Removes a cyclic   *      cyclic_bind()        <-- Change a cyclic's CPU or partition binding   *      cyclic_reprogram()   <-- Reprogram a cyclic's expiration + *      cyclic_move_here()   <-- Shuffle cyclic to current CPU   *   *  Inter-subsystem Interfaces   * @@ -3111,6 +3112,61 @@ cyclic_reprogram(cyclic_id_t id, hrtime_t expiration)  	return (1);  } +/* + *  void cyclic_move_here(cyclic_id_t) + * + *  Overview + * + *    cyclic_move_here() attempts to shuffle a cyclic onto the current CPU. + * + *  Arguments and notes + * + *    The first argument is a cyclic_id returned from cyclic_add(). + *    cyclic_move_here() may _not_ be called on a cyclic_id returned from + *    cyclic_add_omni() or one bound to a CPU or partition via cyclic_bind(). + * + *    This cyclic shuffling is performed on a best-effort basis.  If for some + *    reason the current CPU is unsuitable or the thread migrates between CPUs + *    during the call, the function may return with the cyclic residing on some + *    other CPU. + * + *  Return value + * + *    None; cyclic_move_here() always reports success. + * + *  Caller's context + * + *    cpu_lock must be held by the caller, and the caller must not be in + *    interrupt context.  The caller may not hold any locks which are also + *    grabbed by any cyclic handler. + */ +void +cyclic_move_here(cyclic_id_t id) +{ +	cyc_id_t *idp = (cyc_id_t *)id; +	cyc_cpu_t *cc = idp->cyi_cpu; +	cpu_t *dest = CPU; + +	ASSERT(MUTEX_HELD(&cpu_lock)); +	CYC_PTRACE("move_here", idp, dest); +	VERIFY3P(cc, !=, NULL); +	VERIFY3U(cc->cyp_cyclics[idp->cyi_ndx].cy_flags & +	    (CYF_CPU_BOUND|CYF_PART_BOUND), ==, 0); + +	if (cc->cyp_cpu == dest) { +		return; +	} + +	/* Is the destination CPU suitable for a migration target? */ +	if (dest->cpu_cyclic == NULL || +	    dest->cpu_cyclic->cyp_state == CYS_OFFLINE || +	    (dest->cpu_flags & CPU_ENABLE) == 0) { +		return; +	} + +	cyclic_juggle_one_to(idp, dest->cpu_cyclic); +} +  hrtime_t  cyclic_getres()  { diff --git a/usr/src/uts/common/os/ddi_intr_irm.c b/usr/src/uts/common/os/ddi_intr_irm.c index c3c0481e7f..a4b35dcb5b 100644 --- a/usr/src/uts/common/os/ddi_intr_irm.c +++ b/usr/src/uts/common/os/ddi_intr_irm.c @@ -1320,7 +1320,7 @@ i_ddi_irm_notify(ddi_irm_pool_t *pool_p, ddi_irm_req_t *req_p)  	/* Log callback errors */  	if (ret != DDI_SUCCESS) { -		cmn_err(CE_WARN, "%s%d: failed callback (action=%d, ret=%d)\n", +		cmn_err(CE_WARN, "!%s%d: failed callback (action=%d, ret=%d)\n",  		    ddi_driver_name(req_p->ireq_dip),  		    ddi_get_instance(req_p->ireq_dip), (int)action, ret);  	} diff --git a/usr/src/uts/common/os/exec.c b/usr/src/uts/common/os/exec.c index f51e2c5ca1..24b6f0e2eb 100644 --- a/usr/src/uts/common/os/exec.c +++ b/usr/src/uts/common/os/exec.c @@ -26,7 +26,7 @@  /*	Copyright (c) 1988 AT&T	*/  /*	  All Rights Reserved  	*/  /* - * Copyright 2017 Joyent, Inc. + * Copyright 2019 Joyent, Inc.   
*/  #include <sys/types.h> @@ -99,6 +99,7 @@ uint_t auxv_hwcap32_2 = 0;	/* 32-bit version of auxv_hwcap2 */  #endif  #define	PSUIDFLAGS		(SNOCD|SUGID) +#define	RANDOM_LEN	16	/* 16 bytes for AT_RANDOM aux entry */  /*   * These are consumed within the specific exec modules, but are defined here @@ -143,7 +144,7 @@ exec_common(const char *fname, const char **argp, const char **envp,  	proc_t *p = ttoproc(curthread);  	klwp_t *lwp = ttolwp(curthread);  	struct user *up = PTOU(p); -	long execsz;		/* temporary count of exec size */ +	size_t execsz;		/* temporary count of exec size */  	int i;  	int error;  	char exec_file[MAXCOMLEN+1]; @@ -265,8 +266,10 @@ exec_common(const char *fname, const char **argp, const char **envp,  	 * only if the pathname does not contain a "/" the resolved path  	 * points to a file in the current working (attribute) directory.  	 */ -	if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 && +	mutex_enter(&p->p_lock); +	if ((PTOU(p)->u_cdir->v_flag & V_XATTRDIR) != 0 &&  	    strchr(resolvepn.pn_path, '/') == NULL) { +		mutex_exit(&p->p_lock);  		if (dir != NULL)  			VN_RELE(dir);  		error = EACCES; @@ -275,6 +278,7 @@ exec_common(const char *fname, const char **argp, const char **envp,  		VN_RELE(vp);  		goto out;  	} +	mutex_exit(&p->p_lock);  	bzero(exec_file, MAXCOMLEN+1);  	(void) strncpy(exec_file, pn.pn_path, MAXCOMLEN); @@ -322,14 +326,43 @@ exec_common(const char *fname, const char **argp, const char **envp,  	ua.argp = argp;  	ua.envp = envp; -	/* If necessary, brand this process before we start the exec. */ -	if (brandme) -		brand_setbrand(p); +	/* If necessary, brand this process/lwp before we start the exec. */ +	if (brandme) { +		void *brand_data = NULL; + +		/* +		 * Process branding may fail if multiple LWPs are present and +		 * holdlwps() cannot complete successfully. +		 */ +		error = brand_setbrand(p, B_TRUE); + +		if (error == 0 && BROP(p)->b_lwpdata_alloc != NULL) { +			brand_data = BROP(p)->b_lwpdata_alloc(p); +			if (brand_data == NULL) { +				error = 1; +			} +		} + +		if (error == 0) { +			mutex_enter(&p->p_lock); +			BROP(p)->b_initlwp(lwp, brand_data); +			mutex_exit(&p->p_lock); +		} else { +			VN_RELE(vp); +			if (dir != NULL) { +				VN_RELE(dir); +			} +			pn_free(&resolvepn); +			goto fail; +		} +	}  	if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz, -	    exec_file, p->p_cred, brand_action)) != 0) { -		if (brandme) -			brand_clearbrand(p, B_FALSE); +	    exec_file, p->p_cred, &brand_action)) != 0) { +		if (brandme) { +			BROP(p)->b_freelwp(lwp); +			brand_clearbrand(p, B_TRUE); +		}  		VN_RELE(vp);  		if (dir != NULL)  			VN_RELE(dir); @@ -361,7 +394,7 @@ exec_common(const char *fname, const char **argp, const char **envp,  	/*  	 * Clear contract template state  	 */ -	lwp_ctmpl_clear(lwp); +	lwp_ctmpl_clear(lwp, B_TRUE);  	/*  	 * Save the directory in which we found the executable for expanding @@ -385,6 +418,8 @@ exec_common(const char *fname, const char **argp, const char **envp,  	 * pending held signals remain held, so don't clear t_hold.  	 */  	mutex_enter(&p->p_lock); +	DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp, +	    uintptr_t, lwp->lwp_oldcontext, uintptr_t, 0);  	lwp->lwp_oldcontext = 0;  	lwp->lwp_ustack = 0;  	lwp->lwp_old_stk_ctl = 0; @@ -444,8 +479,10 @@ exec_common(const char *fname, const char **argp, const char **envp,  	TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up);  	/* Unbrand ourself if necessary. 
*/ -	if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) +	if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) { +		BROP(p)->b_freelwp(lwp);  		brand_clearbrand(p, B_FALSE); +	}  	setregs(&args); @@ -566,10 +603,10 @@ gexec(  	struct uarg *args,  	struct intpdata *idatap,  	int level, -	long *execsz, +	size_t *execsz,  	caddr_t exec_file,  	struct cred *cred, -	int brand_action) +	int *brand_action)  {  	struct vnode *vp, *execvp = NULL;  	proc_t *pp = ttoproc(curthread); @@ -890,8 +927,14 @@ gexec(  			if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE))  				args->traceinval = 1;  		} -		if (pp->p_proc_flag & P_PR_PTRACE) + +		/* +		 * If legacy ptrace is enabled, generate the SIGTRAP. +		 */ +		if (pp->p_proc_flag & P_PR_PTRACE) {  			psignal(pp, SIGTRAP); +		} +  		if (args->traceinval)  			prinvalidate(&pp->p_user);  	} @@ -1448,7 +1491,7 @@ noexec(      struct uarg *args,      struct intpdata *idatap,      int level, -    long *execsz, +    size_t *execsz,      int setid,      caddr_t exec_file,      struct cred *cred) @@ -1555,6 +1598,27 @@ stk_add(uarg_t *args, const char *sp, enum uio_seg segflg)  	return (0);  } +/* + * Add a fixed size byte array to the stack (only from kernel space). + */ +static int +stk_byte_add(uarg_t *args, const uint8_t *sp, size_t len) +{ +	int error; + +	if (STK_AVAIL(args) < sizeof (int)) +		return (E2BIG); +	*--args->stk_offp = args->stk_strp - args->stk_base; + +	if (len > STK_AVAIL(args)) +		return (E2BIG); +	bcopy(sp, args->stk_strp, len); + +	args->stk_strp += len; + +	return (0); +} +  static int  stk_getptr(uarg_t *args, char *src, char **dst)  { @@ -1591,6 +1655,7 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)  	size_t size, pad;  	char *argv = (char *)uap->argp;  	char *envp = (char *)uap->envp; +	uint8_t rdata[RANDOM_LEN];  	/*  	 * Copy interpreter's name and argument to argv[0] and argv[1]. @@ -1673,8 +1738,9 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)  	args->ne = args->na - argc;  	/* -	 * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and -	 * AT_SUN_EMULATOR strings to the stack. +	 * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, +	 * AT_SUN_BRAND_NROOT, and AT_SUN_EMULATOR strings, as well as AT_RANDOM +	 * array, to the stack.  	 */  	if (auxvpp != NULL && *auxvpp != NULL) {  		if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0) @@ -1687,6 +1753,20 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)  		if (args->emulator != NULL &&  		    (error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0)  			return (error); + +		/* +		 * For the AT_RANDOM aux vector we provide 16 bytes of random +		 * data. +		 */ +		(void) random_get_pseudo_bytes(rdata, sizeof (rdata)); + +		if ((error = stk_byte_add(args, rdata, sizeof (rdata))) != 0) +			return (error); + +		if (args->brand_nroot != NULL && +		    (error = stk_add(args, args->brand_nroot, +		    UIO_SYSSPACE)) != 0) +			return (error);  	}  	/* @@ -1793,7 +1873,7 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)  	/*  	 * Fill in the aux vector now that we know the user stack addresses  	 * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and -	 * AT_SUN_EMULATOR strings. +	 * AT_SUN_EMULATOR strings, as well as the AT_RANDOM array.  	 
*/  	if (auxvpp != NULL && *auxvpp != NULL) {  		if (args->to_model == DATAMODEL_NATIVE) { @@ -1806,6 +1886,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)  			if (args->emulator != NULL)  				ADDAUX(*a,  				    AT_SUN_EMULATOR, (long)&ustrp[*--offp]) +			ADDAUX(*a, AT_RANDOM, (long)&ustrp[*--offp]) +			if (args->brand_nroot != NULL) { +				ADDAUX(*a, +				    AT_SUN_BRAND_NROOT, (long)&ustrp[*--offp]) +			}  		} else {  			auxv32_t **a = (auxv32_t **)auxvpp;  			ADDAUX(*a, @@ -1818,6 +1903,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)  			if (args->emulator != NULL)  				ADDAUX(*a, AT_SUN_EMULATOR,  				    (int)(uintptr_t)&ustrp[*--offp]) +			ADDAUX(*a, AT_RANDOM, (int)(uintptr_t)&ustrp[*--offp]) +			if (args->brand_nroot != NULL) { +				ADDAUX(*a, AT_SUN_BRAND_NROOT, +				    (int)(uintptr_t)&ustrp[*--offp]) +			}  		}  	} @@ -1961,6 +2051,9 @@ exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)  		usrstack = (char *)USRSTACK32;  	} +	if (args->maxstack != 0 && (uintptr_t)usrstack > args->maxstack) +		usrstack = (char *)args->maxstack; +  	ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0);  #if defined(__sparc) diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c index 1b9359da47..06e0117cd6 100644 --- a/usr/src/uts/common/os/exit.c +++ b/usr/src/uts/common/os/exit.c @@ -21,7 +21,7 @@  /*   * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -138,6 +138,27 @@ rexit(int rval)  }  /* + * Bump the init_restarts kstat and let interested parties know about the + * restart. + */ +static void +restart_init_notify(zone_t *zone) +{ +	nvlist_t *nvl = NULL; + +	zone->zone_proc_init_restarts++; + +	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0 && +	    nvlist_add_uint32(nvl, ZONE_CB_RESTARTS, +	    zone->zone_proc_init_restarts) == 0) { +		zone_sysevent_publish(zone, ZONE_EVENT_INIT_CLASS, +		    ZONE_EVENT_INIT_RESTART_SC, nvl); +	} + +	nvlist_free(nvl); +} + +/*   * Called by proc_exit() when a zone's init exits, presumably because   * it failed.  As long as the given zone is still in the "running"   * state, we will re-exec() init, but first we need to reset things @@ -230,7 +251,7 @@ restart_init(int what, int why)  		siginfofree(lwp->lwp_curinfo);  		lwp->lwp_curinfo = NULL;  	} -	lwp_ctmpl_clear(lwp); +	lwp_ctmpl_clear(lwp, B_FALSE);  	/*  	 * Reset both the process root directory and the current working @@ -260,6 +281,8 @@ restart_init(int what, int why)  	ASSERT(p == curproc);  	(void) freectty(B_TRUE); +	restart_init_notify(p->p_zone); +  	/*  	 * Now exec() the new init(1M) on top of the current process.  If we  	 * succeed, the caller will treat this like a successful system call. @@ -320,6 +343,119 @@ proc_is_exiting(proc_t *p)  }  /* + * Return true if zone's init is restarted, false if exit processing should + * proceeed. + */ +static boolean_t +zone_init_exit(zone_t *z, int why, int what) +{ +	/* +	 * Typically we don't let the zone's init exit unless zone_start_init() +	 * failed its exec, or we are shutting down the zone or the machine, +	 * although the various flags handled within this function will control +	 * the behavior. +	 * +	 * Since we are single threaded, we don't need to lock the following +	 * accesses to zone_proc_initpid. 
+	 */ +	if (z->zone_boot_err != 0 || +	    zone_status_get(z) >= ZONE_IS_SHUTTING_DOWN || +	    zone_status_get(global_zone) >= ZONE_IS_SHUTTING_DOWN) { +		/* +		 * Clear the zone's init pid and proceed with exit processing. +		 */ +		z->zone_proc_initpid = -1; +		return (B_FALSE); +	} + +	/* +	 * There are a variety of configuration flags on the zone to control +	 * init exit behavior. +	 * +	 * If the init process should be restarted, the "zone_restart_init" +	 * member will be set. +	 */ +	if (!z->zone_restart_init) { +		/* +		 * The zone has been setup to halt when init exits. +		 */ +		z->zone_init_status = wstat(why, what); +		(void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, zone_kcred()); +		z->zone_proc_initpid = -1; +		return (B_FALSE); +	} + +	/* +	 * At this point we know we're configured to restart init, but there +	 * are various modifiers to that behavior. +	 */ + +	if (z->zone_reboot_on_init_exit) { +		/* +		 * Some init programs in branded zones do not tolerate a +		 * restart in the traditional manner; setting +		 * "zone_reboot_on_init_exit" will cause the entire zone to be +		 * rebooted instead. +		 */ + +		if (z->zone_restart_init_0) { +			/* +			 * Some init programs in branded zones only want to +			 * restart if they exit 0, otherwise the zone should +			 * shutdown. Setting the "zone_restart_init_0" member +			 * controls this behavior. +			 */ +			if (why == CLD_EXITED && what == 0) { +				/* Trigger a zone reboot */ +				(void) zone_kadmin(A_REBOOT, 0, NULL, +				    zone_kcred()); +			} else { +				/* Shutdown instead of reboot */ +				(void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, +				    zone_kcred()); +			} +		} else { +			/* Trigger a zone reboot */ +			(void) zone_kadmin(A_REBOOT, 0, NULL, zone_kcred()); +		} + +		z->zone_init_status = wstat(why, what); +		z->zone_proc_initpid = -1; +		return (B_FALSE); +	} + +	if (z->zone_restart_init_0) { +		/* +		 * Some init programs in branded zones only want to restart if +		 * they exit 0, otherwise the zone should shutdown. Setting the +		 * "zone_restart_init_0" member controls this behavior. +		 * +		 * In this case we only restart init if it exited successfully. +		 */ +		if (why == CLD_EXITED && what == 0 && +		    restart_init(what, why) == 0) { +			return (B_TRUE); +		} +	} else { +		/* +		 * No restart modifiers on the zone, attempt to restart init. +		 */ +		if (restart_init(what, why) == 0) { +			return (B_TRUE); +		} +	} + + +	/* +	 * The restart failed, the zone will shut down. +	 */ +	z->zone_init_status = wstat(why, what); +	(void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, zone_kcred()); +	z->zone_proc_initpid = -1; +	return (B_FALSE); +} + +/*   * Return value:   *   1 - exitlwps() failed, call (or continue) lwp_exit()   *   0 - restarting init.  Return through system call path @@ -366,45 +502,36 @@ proc_exit(int why, int what)  	}  	mutex_exit(&p->p_lock); -	DTRACE_PROC(lwp__exit); -	DTRACE_PROC1(exit, int, why); +	if (p->p_pid == z->zone_proc_initpid) { +		/* If zone's init restarts, we're done here. */ +		if (zone_init_exit(z, why, what)) +			return (0); +	}  	/* -	 * Will perform any brand specific proc exit processing, since this -	 * is always the last lwp, will also perform lwp_exit and free brand -	 * data +	 * Delay firing probes (and performing brand cleanup) until after the +	 * zone_proc_initpid check. Cases which result in zone shutdown or +	 * restart via zone_kadmin eventually result in a call back to +	 * proc_exit.  	 
*/ -	if (PROC_IS_BRANDED(p)) { -		lwp_detach_brand_hdlrs(lwp); -		brand_clearbrand(p, B_FALSE); -	} +	DTRACE_PROC(lwp__exit); +	DTRACE_PROC1(exit, int, why);  	/* -	 * Don't let init exit unless zone_start_init() failed its exec, or -	 * we are shutting down the zone or the machine. -	 * -	 * Since we are single threaded, we don't need to lock the -	 * following accesses to zone_proc_initpid. +	 * Will perform any brand specific proc exit processing. Since this +	 * is always the last lwp, will also perform lwp exit/free and proc +	 * exit. Brand data will be freed when the process is reaped.  	 */ -	if (p->p_pid == z->zone_proc_initpid) { -		if (z->zone_boot_err == 0 && -		    zone_status_get(z) < ZONE_IS_SHUTTING_DOWN && -		    zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN) { -			if (z->zone_restart_init == B_TRUE) { -				if (restart_init(what, why) == 0) -					return (0); -			} else { -				(void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, -				    CRED()); -			} -		} - +	if (PROC_IS_BRANDED(p)) { +		BROP(p)->b_lwpexit(lwp); +		BROP(p)->b_proc_exit(p);  		/* -		 * Since we didn't or couldn't restart init, we clear -		 * the zone's init state and proceed with exit -		 * processing. +		 * To ensure that b_proc_exit has access to brand-specific data +		 * contained by the one remaining lwp, call the freelwp hook as +		 * the last part of this clean-up process.  		 */ -		z->zone_proc_initpid = -1; +		BROP(p)->b_freelwp(lwp); +		lwp_detach_brand_hdlrs(lwp);  	}  	lwp_pcb_exit(); @@ -565,7 +692,7 @@ proc_exit(int why, int what)  		semexit(p);  	rv = wstat(why, what); -	acct(rv & 0xff); +	acct(rv);  	exacct_commit_proc(p, rv);  	/* @@ -658,10 +785,22 @@ proc_exit(int why, int what)  	if ((q = p->p_child) != NULL && p != proc_init) {  		struct proc	*np;  		struct proc	*initp = proc_init; +		pid_t		zone_initpid = 1; +		struct proc	*zoneinitp = NULL;  		boolean_t	setzonetop = B_FALSE; -		if (!INGLOBALZONE(curproc)) -			setzonetop = B_TRUE; +		if (!INGLOBALZONE(curproc)) { +			zone_initpid = curproc->p_zone->zone_proc_initpid; + +			ASSERT(MUTEX_HELD(&pidlock)); +			zoneinitp = prfind(zone_initpid); +			if (zoneinitp != NULL) { +				initp = zoneinitp; +			} else { +				zone_initpid = 1; +				setzonetop = B_TRUE; +			} +		}  		pgdetach(p); @@ -673,7 +812,8 @@ proc_exit(int why, int what)  			 */  			delete_ns(q->p_parent, q); -			q->p_ppid = 1; +			q->p_ppid = zone_initpid; +  			q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID);  			if (setzonetop) {  				mutex_enter(&q->p_lock); @@ -847,8 +987,50 @@ proc_exit(int why, int what)  	mutex_exit(&p->p_lock);  	if (!evaporate) { -		p->p_pidflag &= ~CLDPEND; -		sigcld(p, sqp); +		/* +		 * The brand specific code only happens when the brand has a +		 * function to call in place of sigcld and the parent of the +		 * exiting process is not the global zone init. If the parent +		 * is the global zone init, then the process was reparented, +		 * and we don't want brand code delivering possibly strange +		 * signals to init. Also, init is not branded, so any brand +		 * specific exit data will not be picked up by init anyway. +		 */ +		if (PROC_IS_BRANDED(p) && +		    BROP(p)->b_exit_with_sig != NULL && +		    p->p_ppid != 1) { +			/* +			 * The code for _fini that could unload the brand_t +			 * blocks until the count of zones using the module +			 * reaches zero. Zones decrement the refcount on their +			 * brands only after all user tasks in that zone have +			 * exited and been waited on. The decrement on the +			 * brand's refcount happen in zone_destroy(). 
That +			 * depends on zone_shutdown() having been completed. +			 * zone_shutdown() includes a call to zone_empty(), +			 * where the zone waits for itself to reach the state +			 * ZONE_IS_EMPTY. This state is only set in either +			 * zone_shutdown(), when there are no user processes as +			 * the zone enters this function, or in +			 * zone_task_rele(). zone_task_rele() is called from +			 * code triggered by waiting on processes, not by the +			 * processes exiting through proc_exit().  This means +			 * all the branded processes that could exist for a +			 * specific brand_t must exit and get reaped before the +			 * refcount on the brand_t can reach 0. _fini will +			 * never unload the corresponding brand module before +			 * proc_exit finishes execution for all processes +			 * branded with a particular brand_t, which makes the +			 * operation below safe to do. Brands that wish to use +			 * this mechanism must wait in _fini as described +			 * above. +			 */ +			BROP(p)->b_exit_with_sig(p, sqp); +		} else { +			p->p_pidflag &= ~CLDPEND; +			sigcld(p, sqp); +		} +  	} else {  		/*  		 * Do what sigcld() would do if the disposition @@ -927,10 +1109,9 @@ winfo(proc_t *pp, k_siginfo_t *ip, int waitflag)  int  waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)  { -	int found;  	proc_t *cp, *pp; -	int proc_gone;  	int waitflag = !(options & WNOWAIT); +	boolean_t have_brand_helper = B_FALSE;  	/*  	 * Obsolete flag, defined here only for binary compatibility @@ -958,7 +1139,8 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)  	pp = ttoproc(curthread);  	/* -	 * lock parent mutex so that sibling chain can be searched. +	 * Anytime you are looking for a process, you take pidlock to prevent +	 * things from changing as you look.  	 */  	mutex_enter(&pidlock); @@ -978,10 +1160,37 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)  		return (ECHILD);  	} -	while (pp->p_child != NULL) { +	if (PROC_IS_BRANDED(pp) && BROP(pp)->b_waitid_helper != NULL) { +		have_brand_helper = B_TRUE; +	} + +	while (pp->p_child != NULL || have_brand_helper) { +		boolean_t brand_wants_wait = B_FALSE; +		int proc_gone = 0; +		int found = 0; + +		/* +		 * Give the brand a chance to return synthetic results from +		 * this waitid() call before we do the real thing. +		 */ +		if (have_brand_helper) { +			int ret; + +			if (BROP(pp)->b_waitid_helper(idtype, id, ip, options, +			    &brand_wants_wait, &ret) == 0) { +				mutex_exit(&pidlock); +				return (ret); +			} -		proc_gone = 0; +			if (pp->p_child == NULL) { +				goto no_real_children; +			} +		} +		/* +		 * Look for interesting children in the newstate list. +		 */ +		VERIFY(pp->p_child != NULL);  		for (cp = pp->p_child_ns; cp != NULL; cp = cp->p_sibling_ns) {  			if (idtype != P_PID && (cp->p_pidflag & CLDWAITPID))  				continue; @@ -989,6 +1198,11 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)  				continue;  			if (idtype == P_PGID && id != cp->p_pgrp)  				continue; +			if (PROC_IS_BRANDED(pp)) { +				if (BROP(pp)->b_wait_filter != NULL && +				    BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) +					continue; +			}  			switch (cp->p_wcode) { @@ -1033,12 +1247,16 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)  		 * Wow! None of the threads on the p_sibling_ns list were  		 * interesting threads. Check all the kids!  		 
*/ -		found = 0;  		for (cp = pp->p_child; cp != NULL; cp = cp->p_sibling) {  			if (idtype == P_PID && id != cp->p_pid)  				continue;  			if (idtype == P_PGID && id != cp->p_pgrp)  				continue; +			if (PROC_IS_BRANDED(pp)) { +				if (BROP(pp)->b_wait_filter != NULL && +				    BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) +					continue; +			}  			switch (cp->p_wcode) {  			case CLD_TRAPPED: @@ -1107,11 +1325,12 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)  				break;  		} +no_real_children:  		/*  		 * If we found no interesting processes at all,  		 * break out and return ECHILD.  		 */ -		if (found + proc_gone == 0) +		if (!brand_wants_wait && (found + proc_gone == 0))  			break;  		if (options & WNOHANG) { @@ -1130,7 +1349,7 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)  		 * change state while we wait, we don't wait at all.  		 * Get out with ECHILD according to SVID.  		 */ -		if (found == proc_gone) +		if (!brand_wants_wait && (found == proc_gone))  			break;  		if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) { @@ -1226,6 +1445,12 @@ freeproc(proc_t *p)  		p->p_killsqp = NULL;  	} +	/* Clear any remaining brand data */ +	if (PROC_IS_BRANDED(p)) { +		brand_clearbrand(p, B_FALSE); +	} + +  	prfree(p);	/* inform /proc */  	/* diff --git a/usr/src/uts/common/os/fio.c b/usr/src/uts/common/os/fio.c index 76eddd4e50..41e7e63d2b 100644 --- a/usr/src/uts/common/os/fio.c +++ b/usr/src/uts/common/os/fio.c @@ -21,7 +21,7 @@  /*   * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015, Joyent Inc. + * Copyright 2017, Joyent Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -386,6 +386,7 @@ flist_grow(int maxfd)  		dst->uf_flag = src->uf_flag;  		dst->uf_busy = src->uf_busy;  		dst->uf_portfd = src->uf_portfd; +		dst->uf_gen = src->uf_gen;  	}  	/* @@ -487,7 +488,7 @@ free_afd(afd_t *afd)		/* called below and from thread_free() */  		afd->a_fd[i] = -1;  } -static void +void  set_active_fd(int fd)  {  	afd_t *afd = &curthread->t_activefd; @@ -575,13 +576,12 @@ is_active_fd(kthread_t *t, int fd)  }  /* - * Convert a user supplied file descriptor into a pointer to a file - * structure.  Only task is to check range of the descriptor (soft - * resource limit was enforced at open time and shouldn't be checked - * here). + * Convert a user supplied file descriptor into a pointer to a file structure. + * Only task is to check range of the descriptor (soft resource limit was + * enforced at open time and shouldn't be checked here).   */  file_t * -getf(int fd) +getf_gen(int fd, uf_entry_gen_t *genp)  {  	uf_info_t *fip = P_FINFO(curproc);  	uf_entry_t *ufp; @@ -607,6 +607,9 @@ getf(int fd)  		return (NULL);  	}  	ufp->uf_refcnt++; +	if (genp != NULL) { +		*genp = ufp->uf_gen; +	}  	set_active_fd(fd);	/* record the active file descriptor */ @@ -615,6 +618,12 @@ getf(int fd)  	return (fp);  } +file_t * +getf(int fd) +{ +	return (getf_gen(fd, NULL)); +} +  /*   * Close whatever file currently occupies the file descriptor slot   * and install the new file, usually NULL, in the file descriptor slot. 
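The uf_gen counter introduced above is bumped whenever a new file_t is installed in a descriptor slot, and getf_gen() hands it back alongside the hold. A minimal sketch of how a caller might use it to notice that a descriptor was closed and reused between two lookups; the revalidation pattern and error handling below are illustrative assumptions, not code from this change:

	uf_entry_gen_t gen, now;
	file_t *fp;

	if ((fp = getf_gen(fd, &gen)) == NULL)
		return (EBADF);
	/* ... use fp, then drop the hold ... */
	releasef(fd);

	/* Later: confirm that fd still refers to the same open file. */
	if ((fp = getf_gen(fd, &now)) == NULL)
		return (EBADF);
	if (now != gen) {
		/* The slot was recycled for a different file; treat as stale. */
		releasef(fd);
		return (EBADF);
	}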
@@ -667,6 +676,7 @@ closeandsetf(int fd, file_t *newfp)  			ASSERT(ufp->uf_flag == 0);  			fd_reserve(fip, fd, 1);  			ufp->uf_file = newfp; +			ufp->uf_gen++;  			UF_EXIT(ufp);  			mutex_exit(&fip->fi_lock);  			return (0); @@ -852,7 +862,8 @@ flist_fork(uf_info_t *pfip, uf_info_t *cfip)  	 */  	cfip->fi_nfiles = nfiles = flist_minsize(pfip); -	cfip->fi_list = kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP); +	cfip->fi_list = nfiles == 0 ? NULL : +	    kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP);  	for (fd = 0, pufp = pfip->fi_list, cufp = cfip->fi_list; fd < nfiles;  	    fd++, pufp++, cufp++) { @@ -860,6 +871,7 @@ flist_fork(uf_info_t *pfip, uf_info_t *cfip)  		cufp->uf_alloc = pufp->uf_alloc;  		cufp->uf_flag = pufp->uf_flag;  		cufp->uf_busy = pufp->uf_busy; +		cufp->uf_gen = pufp->uf_gen;  		if (pufp->uf_file == NULL) {  			ASSERT(pufp->uf_flag == 0);  			if (pufp->uf_busy) { @@ -1028,6 +1040,9 @@ ufalloc_file(int start, file_t *fp)  	fd_reserve(fip, fd, 1);  	ASSERT(ufp->uf_file == NULL);  	ufp->uf_file = fp; +	if (fp != NULL) { +		ufp->uf_gen++; +	}  	UF_EXIT(ufp);  	mutex_exit(&fip->fi_lock);  	return (fd); @@ -1183,6 +1198,7 @@ setf(int fd, file_t *fp)  	} else {  		UF_ENTER(ufp, fip, fd);  		ASSERT(ufp->uf_busy); +		ufp->uf_gen++;  	}  	ASSERT(ufp->uf_fpollinfo == NULL);  	ASSERT(ufp->uf_flag == 0); @@ -1212,8 +1228,7 @@ f_getfl(int fd, int *flagp)  			error = EBADF;  		else {  			vnode_t *vp = fp->f_vnode; -			int flag = fp->f_flag | -			    ((fp->f_flag2 & ~FEPOLLED) << 16); +			int flag = fp->f_flag | (fp->f_flag2 << 16);  			/*  			 * BSD fcntl() FASYNC compatibility. diff --git a/usr/src/uts/common/os/fork.c b/usr/src/uts/common/os/fork.c index a63931459f..7e198910b4 100644 --- a/usr/src/uts/common/os/fork.c +++ b/usr/src/uts/common/os/fork.c @@ -21,7 +21,7 @@  /*   * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright 2016, Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -84,6 +84,7 @@ static int64_t cfork(int, int, int);  static int getproc(proc_t **, pid_t, uint_t);  #define	GETPROC_USER	0x0  #define	GETPROC_KERNEL	0x1 +#define	GETPROC_ZSCHED	0x2  static void fork_fail(proc_t *);  static void forklwp_fail(proc_t *); @@ -705,7 +706,7 @@ fork_fail(proc_t *cp)  	if (PTOU(curproc)->u_cwd)  		refstr_rele(PTOU(curproc)->u_cwd);  	if (PROC_IS_BRANDED(cp)) { -		brand_clearbrand(cp, B_TRUE); +		brand_clearbrand(cp, B_FALSE);  	}  } @@ -754,7 +755,7 @@ forklwp_fail(proc_t *p)  			kmem_free(t->t_door, sizeof (door_data_t));  			t->t_door = NULL;  		} -		lwp_ctmpl_clear(ttolwp(t)); +		lwp_ctmpl_clear(ttolwp(t), B_FALSE);  		/*  		 * Remove the thread from the all threads list. @@ -791,6 +792,9 @@ extern struct as kas;  /*   * fork a kernel process. + * + * Passing a pid argument of -1 indicates that the new process should be + * launched as a child of 'zsched' within the zone.   
*/  int  newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, @@ -809,6 +813,7 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,  		rctl_set_t *init_set;  		ASSERT(pid != 1); +		ASSERT(pid >= 0);  		if (getproc(&p, pid, GETPROC_KERNEL) < 0)  			return (EAGAIN); @@ -852,8 +857,18 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,  		rctl_set_t *init_set;  		task_t *tk, *tk_old;  		klwp_t *lwp; +		boolean_t pzsched = B_FALSE; +		int flag = GETPROC_USER; + +		/* Handle a new user-level thread as child of zsched. */ +		if (pid < 0) { +			VERIFY(curzone != global_zone); +			flag = GETPROC_ZSCHED; +			pzsched = B_TRUE; +			pid = 0; +		} -		if (getproc(&p, pid, GETPROC_USER) < 0) +		if (getproc(&p, pid, flag) < 0)  			return (EAGAIN);  		/*  		 * init creates a new task, distinct from the task @@ -914,7 +929,8 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,  		}  		t = lwptot(lwp); -		ctp = contract_process_fork(sys_process_tmpl, p, curproc, +		ctp = contract_process_fork(sys_process_tmpl, p, +		    (pzsched ? curproc->p_zone->zone_zsched : curproc),  		    B_FALSE);  		ASSERT(ctp != NULL);  		if (ct != NULL) @@ -955,7 +971,11 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)  	if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)  		return (-1);	/* no point in starting new processes */ -	pp = (flags & GETPROC_KERNEL) ? &p0 : curproc; +	if (flags & GETPROC_ZSCHED) { +		pp = curproc->p_zone->zone_zsched; +	} else { +		pp = (flags & GETPROC_KERNEL) ? &p0 : curproc; +	}  	task = pp->p_task;  	proj = task->tk_proj;  	zone = pp->p_zone; @@ -1016,6 +1036,9 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)  	cp->p_t1_lgrpid = LGRP_NONE;  	cp->p_tr_lgrpid = LGRP_NONE; +	/* Default to native brand initially */ +	cp->p_brand = &native_brand; +  	if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) {  		if (nproc == v.v_proc) {  			CPU_STATS_ADDQ(CPU, sys, procovf, 1); @@ -1083,9 +1106,6 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)  	cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD);  	cp->p_sessp = pp->p_sessp;  	sess_hold(pp); -	cp->p_brand = pp->p_brand; -	if (PROC_IS_BRANDED(pp)) -		BROP(pp)->b_copy_procdata(cp, pp);  	cp->p_bssbase = pp->p_bssbase;  	cp->p_brkbase = pp->p_brkbase;  	cp->p_brksize = pp->p_brksize; @@ -1170,6 +1190,18 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)  	mutex_exit(&cp->p_lock);  	mutex_exit(&pidlock); +	if (PROC_IS_BRANDED(pp)) { +		/* +		 * The only reason why process branding should fail is when +		 * the procedure is complicated by multiple LWPs on the scene. +		 * With an LWP count of 0, this newly allocated process has no +		 * reason to fail branding. +		 */ +		VERIFY0(brand_setbrand(cp, B_FALSE)); + +		BROP(pp)->b_copy_procdata(cp, pp); +	} +  	avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t),  	    offsetof(contract_t, ct_ctlist)); @@ -1187,6 +1219,7 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)  	 */  	fcnt_add(P_FINFO(pp), 1); +	mutex_enter(&pp->p_lock);  	if (PTOU(pp)->u_cdir) {  		VN_HOLD(PTOU(pp)->u_cdir);  	} else { @@ -1200,6 +1233,7 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags)  		VN_HOLD(PTOU(pp)->u_rdir);  	if (PTOU(pp)->u_cwd)  		refstr_hold(PTOU(pp)->u_cwd); +	mutex_exit(&pp->p_lock);  	/*  	 * copy the parent's uarea. 
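The fork.c changes above extend newproc() and getproc() so that a pid argument of -1 creates a user-level process that is parented to the zone's zsched process (the GETPROC_ZSCHED path) rather than to the caller. A hedged sketch of a kernel consumer follows; the function names are hypothetical and the scheduling class and priority are placeholder choices, not taken from this diff.

#include <sys/proc.h>
#include <sys/zone.h>
#include <sys/debug.h>

static void
zsched_child_main(void)
{
	/* Set up and exec the desired user program for this zone here. */
}

static int
spawn_under_zsched(void)
{
	/* Only meaningful from inside a non-global zone. */
	VERIFY(curzone != global_zone);

	/* A pid argument of -1 selects the GETPROC_ZSCHED path in getproc(). */
	return (newproc(zsched_child_main, NULL, defaultcid, minclsyspri,
	    NULL, -1));
}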
diff --git a/usr/src/uts/common/os/grow.c b/usr/src/uts/common/os/grow.c index de2a4f26c4..07fd623a95 100644 --- a/usr/src/uts/common/os/grow.c +++ b/usr/src/uts/common/os/grow.c @@ -21,7 +21,7 @@  /*   * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc.   */  /* @@ -55,6 +55,7 @@  #include <sys/fcntl.h>  #include <sys/lwpchan_impl.h>  #include <sys/nbmlock.h> +#include <sys/brand.h>  #include <vm/hat.h>  #include <vm/as.h> @@ -570,6 +571,20 @@ choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,  	return (0);  } +caddr_t +map_userlimit(proc_t *pp, struct as *as, int flags) +{ +	if (flags & _MAP_LOW32) { +		if (PROC_IS_BRANDED(pp) && BROP(pp)->b_map32limit != NULL) { +			return ((caddr_t)(uintptr_t)BROP(pp)->b_map32limit(pp)); +		} else { +			return ((caddr_t)_userlimit32); +		} +	} + +	return (as->a_userlimit); +} +  /*   * Used for MAP_ANON - fast way to get anonymous pages @@ -585,8 +600,6 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,  		return (EACCES);  	if ((flags & MAP_FIXED) != 0) { -		caddr_t userlimit; -  		/*  		 * Use the user address.  First verify that  		 * the address to be used is page aligned. @@ -595,9 +608,8 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,  		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)  			return (EINVAL); -		userlimit = flags & _MAP_LOW32 ? -		    (caddr_t)USERLIMIT32 : as->a_userlimit; -		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { +		switch (valid_usr_range(*addrp, len, uprot, as, +		    map_userlimit(as->a_proc, as, flags))) {  		case RANGE_OKAY:  			break;  		case RANGE_BADPROT: @@ -638,7 +650,7 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,  #define	RANDOMIZABLE_MAPPING(addr, flags) (((flags & MAP_FIXED) == 0) && \  	!(((flags & MAP_ALIGN) == 0) && (addr != 0) && aslr_respect_mmap_hint)) -static int +int  smmap_common(caddr_t *addrp, size_t len,      int prot, int flags, struct file *fp, offset_t pos)  { @@ -780,8 +792,6 @@ smmap_common(caddr_t *addrp, size_t len,  	 * If the user specified an address, do some simple checks here  	 */  	if ((flags & MAP_FIXED) != 0) { -		caddr_t userlimit; -  		/*  		 * Use the user address.  First verify that  		 * the address to be used is page aligned. @@ -789,10 +799,8 @@ smmap_common(caddr_t *addrp, size_t len,  		 */  		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)  			return (EINVAL); - -		userlimit = flags & _MAP_LOW32 ? -		    (caddr_t)USERLIMIT32 : as->a_userlimit; -		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { +		switch (valid_usr_range(*addrp, len, uprot, as, +		    map_userlimit(curproc, as, flags))) {  		case RANGE_OKAY:  			break;  		case RANGE_BADPROT: diff --git a/usr/src/uts/common/os/id_space.c b/usr/src/uts/common/os/id_space.c deleted file mode 100644 index 2dad0cb940..0000000000 --- a/usr/src/uts/common/os/id_space.c +++ /dev/null @@ -1,159 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#include <sys/types.h> -#include <sys/id_space.h> -#include <sys/debug.h> - -/* - * ID Spaces - * - *   The id_space_t provides a simple implementation of a managed range of - *   integer identifiers using a vmem arena.  An ID space guarantees that the - *   next identifer returned by an allocation is larger than the previous one, - *   unless there are no larger slots remaining in the range.  In this case, - *   the ID space will return the first available slot in the lower part of the - *   range (viewing the previous identifier as a partitioning element).  If no - *   slots are available, id_alloc()/id_allocff() will sleep until an - *   identifier becomes available.  Accordingly, id_space allocations must be - *   initiated from contexts where sleeping is acceptable.  id_alloc_nosleep()/ - *   id_allocff_nosleep() will return -1 if no slots are available or if the - *   system is low on memory.  If id_alloc_nosleep() fails, callers should - *   not try to extend the ID space.  This is to avoid making a possible - *   low-memory situation worse. - * - *   As an ID space is designed for representing a range of id_t's, there - *   is a preexisting maximal range: [0, MAXUID].  ID space requests outside - *   that range will fail on a DEBUG kernel.  The id_allocff*() functions - *   return the first available id, and should be used when there is benefit - *   to having a compact allocated range. - * - *   (Presently, the id_space_t abstraction supports only direct allocations; ID - *   reservation, in which an ID is allocated but placed in a internal - *   dictionary for later use, should be added when a consuming subsystem - *   arrives.) - */ - -#define	ID_TO_ADDR(id) ((void *)(uintptr_t)(id + 1)) -#define	ADDR_TO_ID(addr) ((id_t)((uintptr_t)addr - 1)) - -/* - * Create an arena to represent the range [low, high). - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_space_t * -id_space_create(const char *name, id_t low, id_t high) -{ -	ASSERT(low >= 0); -	ASSERT(low < high); - -	return (vmem_create(name, ID_TO_ADDR(low), high - low, 1, -	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER)); -} - -/* - * Destroy a previously created ID space. - * No restrictions on caller's context. - */ -void -id_space_destroy(id_space_t *isp) -{ -	vmem_destroy(isp); -} - -void -id_space_extend(id_space_t *isp, id_t low, id_t high) -{ -	(void) vmem_add(isp, ID_TO_ADDR(low), high - low, VM_SLEEP); -} - -/* - * Allocate an id_t from specified ID space. - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_t -id_alloc(id_space_t *isp) -{ -	return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_NEXTFIT))); -} - -/* - * Allocate an id_t from specified ID space. - * Returns -1 on failure (see module block comments for more information on - * failure modes). - */ -id_t -id_alloc_nosleep(id_space_t *isp) -{ -	return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_NEXTFIT))); -} - -/* - * Allocate an id_t from specified ID space using FIRSTFIT. - * Caller must be in a context in which VM_SLEEP is legal. 
- */ -id_t -id_allocff(id_space_t *isp) -{ -	return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_FIRSTFIT))); -} - -/* - * Allocate an id_t from specified ID space using FIRSTFIT - * Returns -1 on failure (see module block comments for more information on - * failure modes). - */ -id_t -id_allocff_nosleep(id_space_t *isp) -{ -	return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_FIRSTFIT))); -} - -/* - * Allocate a specific identifier if possible, returning the id if - * successful, or -1 on failure. - */ -id_t -id_alloc_specific_nosleep(id_space_t *isp, id_t id) -{ -	void *minaddr = ID_TO_ADDR(id); -	void *maxaddr = ID_TO_ADDR(id + 1); - -	/* -	 * Note that even though we're vmem_free()ing this later, it -	 * should be OK, since there's no quantum cache. -	 */ -	return (ADDR_TO_ID(vmem_xalloc(isp, 1, 1, 0, 0, -	    minaddr, maxaddr, VM_NOSLEEP))); -} - -/* - * Free a previously allocated ID. - * No restrictions on caller's context. - */ -void -id_free(id_space_t *isp, id_t id) -{ -	vmem_free(isp, ID_TO_ADDR(id), 1); -} diff --git a/usr/src/uts/common/os/ipc.c b/usr/src/uts/common/os/ipc.c index 9381019cd1..6a6f5d84ef 100644 --- a/usr/src/uts/common/os/ipc.c +++ b/usr/src/uts/common/os/ipc.c @@ -20,6 +20,7 @@   */  /*   * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T		*/ @@ -1217,6 +1218,23 @@ ipc_remove(ipc_service_t *service, kipc_perm_t *perm)  	    (IPC_ZONE_USAGE(perm, service) == 0)));  } +/* + * Perform actual IPC_RMID, either via ipc_rmid or due to a delayed *_RMID. + */ +void +ipc_rmsvc(ipc_service_t *service, kipc_perm_t *perm) +{ +	ASSERT(service->ipcs_count > 0); +	ASSERT(MUTEX_HELD(&service->ipcs_lock)); + +	ipc_remove(service, perm); +	mutex_exit(&service->ipcs_lock); + +	/* perform any per-service removal actions */ +	service->ipcs_rmid(perm); + +	ipc_rele(service, perm); +}  /*   * Common code to perform an IPC_RMID.  Returns an errno value on @@ -1247,13 +1265,7 @@ ipc_rmid(ipc_service_t *service, int id, cred_t *cr)  	/*  	 * Nothing can fail from this point on.  	 */ -	ipc_remove(service, perm); -	mutex_exit(&service->ipcs_lock); - -	/* perform any per-service removal actions */ -	service->ipcs_rmid(perm); - -	ipc_rele(service, perm); +	ipc_rmsvc(service, perm);  	return (0);  } diff --git a/usr/src/uts/common/os/kmem.c b/usr/src/uts/common/os/kmem.c index bc0cda418b..ed2c7fc346 100644 --- a/usr/src/uts/common/os/kmem.c +++ b/usr/src/uts/common/os/kmem.c @@ -20,6 +20,7 @@   */  /*   * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2017, Joyent, Inc.   * Copyright (c) 2012, 2017 by Delphix. All rights reserved.   * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.   * Copyright 2018, Joyent, Inc. 
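The ipc.c hunk above factors the tail of IPC_RMID processing into ipc_rmsvc(), so the same removal path can also be driven from a deferred context (the shm.c changes later in this diff use it to destroy a staged segment on last detach). A hedged sketch of the calling contract follows; the wrapper name and its arguments are illustrative.

#include <sys/ipc_impl.h>
#include <sys/debug.h>

static void
remove_staged_object(ipc_service_t *svc, kipc_perm_t *perm)
{
	/*
	 * The caller arrives holding the object's lock and a reference.
	 * ipc_rmsvc() unlinks the object, exits ipcs_lock, runs the
	 * per-service rmid callback, and ipc_rele() then drops one
	 * reference together with the object's lock.
	 */
	mutex_enter(&svc->ipcs_lock);
	ipc_rmsvc(svc, perm);
	ASSERT(!MUTEX_HELD(&svc->ipcs_lock));
}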
@@ -1011,6 +1012,7 @@ size_t kmem_transaction_log_size; /* transaction log size [2% of memory] */  size_t kmem_content_log_size;	/* content log size [2% of memory] */  size_t kmem_failure_log_size;	/* failure log [4 pages per CPU] */  size_t kmem_slab_log_size;	/* slab create log [4 pages per CPU] */ +size_t kmem_zerosized_log_size;	/* zero-sized log [4 pages per CPU] */  size_t kmem_content_maxsave = 256; /* KMF_CONTENTS max bytes to log */  size_t kmem_lite_minsize = 0;	/* minimum buffer size for KMF_LITE */  size_t kmem_lite_maxalign = 1024; /* maximum buffer alignment for KMF_LITE */ @@ -1018,6 +1020,14 @@ int kmem_lite_pcs = 4;		/* number of PCs to store in KMF_LITE mode */  size_t kmem_maxverify;		/* maximum bytes to inspect in debug routines */  size_t kmem_minfirewall;	/* hardware-enforced redzone threshold */ +#ifdef DEBUG +int kmem_warn_zerosized = 1;	/* whether to warn on zero-sized KM_SLEEP */ +#else +int kmem_warn_zerosized = 0;	/* whether to warn on zero-sized KM_SLEEP */ +#endif + +int kmem_panic_zerosized = 0;	/* whether to panic on zero-sized KM_SLEEP */ +  #ifdef _LP64  size_t	kmem_max_cached = KMEM_BIG_MAXBUF;	/* maximum kmem_alloc cache */  #else @@ -1098,6 +1108,7 @@ kmem_log_header_t	*kmem_transaction_log;  kmem_log_header_t	*kmem_content_log;  kmem_log_header_t	*kmem_failure_log;  kmem_log_header_t	*kmem_slab_log; +kmem_log_header_t	*kmem_zerosized_log;  static int		kmem_lite_count; /* # of PCs in kmem_buftag_lite_t */ @@ -2853,8 +2864,33 @@ kmem_alloc(size_t size, int kmflag)  		/* fall through to kmem_cache_alloc() */  	} else { -		if (size == 0) +		if (size == 0) { +			if (kmflag != KM_SLEEP && !(kmflag & KM_PANIC)) +				return (NULL); + +			/* +			 * If this is a sleeping allocation or one that has +			 * been specified to panic on allocation failure, we +			 * consider it to be deprecated behavior to allocate +			 * 0 bytes.  If we have been configured to panic under +			 * this condition, we panic; if to warn, we warn -- and +			 * regardless, we log to the kmem_zerosized_log that +			 * that this condition has occurred (which gives us +			 * enough information to be able to debug it). +			 */ +			if (kmem_panic && kmem_panic_zerosized) +				panic("attempted to kmem_alloc() size of 0"); + +			if (kmem_warn_zerosized) { +				cmn_err(CE_WARN, "kmem_alloc(): sleeping " +				    "allocation with size of 0; " +				    "see kmem_zerosized_log for details"); +			} + +			kmem_log_event(kmem_zerosized_log, NULL, NULL, NULL); +  			return (NULL); +		}  		buf = vmem_alloc(kmem_oversize_arena, size,  		    kmflag & KM_VMFLAGS); @@ -4397,8 +4433,8 @@ kmem_init(void)  	}  	kmem_failure_log = kmem_log_init(kmem_failure_log_size); -  	kmem_slab_log = kmem_log_init(kmem_slab_log_size); +	kmem_zerosized_log = kmem_log_init(kmem_zerosized_log_size);  	/*  	 * Initialize STREAMS message caches so allocb() is available. diff --git a/usr/src/uts/common/os/kstat_fr.c b/usr/src/uts/common/os/kstat_fr.c index 93c04cff8d..b09b2d3558 100644 --- a/usr/src/uts/common/os/kstat_fr.c +++ b/usr/src/uts/common/os/kstat_fr.c @@ -20,7 +20,7 @@   */  /*   * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2014, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc. All rights reserved.   * Copyright 2015 Nexenta Systems, Inc. All rights reserved.   
*/ @@ -198,6 +198,9 @@ struct {  	kstat_named_t pagesfree;  	kstat_named_t pageslocked;  	kstat_named_t pagestotal; +	kstat_named_t lowmemscan; +	kstat_named_t zonecapscan; +	kstat_named_t nthrottle;  } system_pages_kstat = {  	{ "physmem",		KSTAT_DATA_ULONG },  	{ "nalloc",		KSTAT_DATA_ULONG }, @@ -219,6 +222,9 @@ struct {  	{ "pagesfree", 		KSTAT_DATA_ULONG },  	{ "pageslocked", 	KSTAT_DATA_ULONG },  	{ "pagestotal",		KSTAT_DATA_ULONG }, +	{ "low_mem_scan",	KSTAT_DATA_ULONG }, +	{ "zone_cap_scan",	KSTAT_DATA_ULONG }, +	{ "n_throttle",		KSTAT_DATA_ULONG },  };  static int header_kstat_update(kstat_t *, int); @@ -912,6 +918,9 @@ system_pages_kstat_update(kstat_t *ksp, int rw)  	system_pages_kstat.pageslocked.value.ul	= (ulong_t)(availrmem_initial -  	    availrmem);  	system_pages_kstat.pagestotal.value.ul	= (ulong_t)total_pages; +	system_pages_kstat.lowmemscan.value.ul	= (ulong_t)low_mem_scan; +	system_pages_kstat.zonecapscan.value.ul	= (ulong_t)zone_cap_scan; +	system_pages_kstat.nthrottle.value.ul	= (ulong_t)n_throttle;  	/*  	 * pp_kernel represents total pages used by the kernel since the  	 * startup. This formula takes into account the boottime kernel diff --git a/usr/src/uts/common/os/lgrp.c b/usr/src/uts/common/os/lgrp.c index 6288f47bed..6f6aced619 100644 --- a/usr/src/uts/common/os/lgrp.c +++ b/usr/src/uts/common/os/lgrp.c @@ -21,6 +21,7 @@  /*   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2018 Joyent, Inc.   */  /* @@ -90,6 +91,7 @@  #include <sys/pg.h>  #include <sys/promif.h>  #include <sys/sdt.h> +#include <sys/ht.h>  lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */  lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */ @@ -520,6 +522,8 @@ lgrp_main_mp_init(void)  {  	klgrpset_t changed; +	ht_init(); +  	/*  	 * Update lgroup topology (if necessary)  	 */ diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c index 149f5f8a88..06c03dd38e 100644 --- a/usr/src/uts/common/os/logsubr.c +++ b/usr/src/uts/common/os/logsubr.c @@ -22,6 +22,7 @@  /*   * Copyright (c) 2013 Gary Mills   * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019 Joyent, Inc.   */  #include <sys/types.h> @@ -249,8 +250,7 @@ log_init(void)  	 */  	printf("\rSunOS Release %s Version %s %u-bit\n",  	    utsname.release, utsname.version, NBBY * (uint_t)sizeof (void *)); -	printf("Copyright (c) 1983, 2010, Oracle and/or its affiliates. " -	    "All rights reserved.\n"); +	printf("Copyright (c) 2010-2019, Joyent Inc. All rights reserved.\n");  #ifdef DEBUG  	printf("DEBUG enabled\n");  #endif @@ -491,7 +491,7 @@ log_console(log_t *lp, log_ctl_t *lc)  mblk_t *  log_makemsg(int mid, int sid, int level, int sl, int pri, void *msg, -	size_t size, int on_intr) +    size_t size, int on_intr)  {  	mblk_t *mp = NULL;  	mblk_t *mp2; diff --git a/usr/src/uts/common/os/lwp.c b/usr/src/uts/common/os/lwp.c index b2adae570f..341e4ae356 100644 --- a/usr/src/uts/common/os/lwp.c +++ b/usr/src/uts/common/os/lwp.c @@ -25,7 +25,7 @@   */  /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc.   
*/  #include <sys/param.h> @@ -57,6 +57,8 @@  #include <sys/lgrp.h>  #include <sys/rctl.h>  #include <sys/contract_impl.h> +#include <sys/contract/process.h> +#include <sys/contract/process_impl.h>  #include <sys/cpc_impl.h>  #include <sys/sdt.h>  #include <sys/cmn_err.h> @@ -115,7 +117,7 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p,  	ret_tidhash_t *ret_tidhash = NULL;  	int i;  	int rctlfail = 0; -	boolean_t branded = 0; +	void *brand_data = NULL;  	struct ctxop *ctx = NULL;  	ASSERT(cid != sysdccid);	/* system threads must start in SYS */ @@ -283,6 +285,19 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p,  	 */  	lep = kmem_zalloc(sizeof (*lep), KM_SLEEP); +	/* +	 * If necessary, speculatively allocate lwp brand data.  This is done +	 * ahead of time so p_lock need not be dropped during lwp branding. +	 */ +	if (PROC_IS_BRANDED(p) && BROP(p)->b_lwpdata_alloc != NULL) { +		if ((brand_data = BROP(p)->b_lwpdata_alloc(p)) == NULL) { +			mutex_enter(&p->p_lock); +			err = 1; +			atomic_inc_32(&p->p_zone->zone_ffmisc); +			goto error; +		} +	} +  	mutex_enter(&p->p_lock);  grow:  	/* @@ -630,18 +645,6 @@ grow:  		} while (lwp_hash_lookup(p, t->t_tid) != NULL);  	} -	/* -	 * If this is a branded process, let the brand do any necessary lwp -	 * initialization. -	 */ -	if (PROC_IS_BRANDED(p)) { -		if (BROP(p)->b_initlwp(lwp)) { -			err = 1; -			atomic_inc_32(&p->p_zone->zone_ffmisc); -			goto error; -		} -		branded = 1; -	}  	if (t->t_tid == 1) {  		kpreempt_disable(); @@ -654,7 +657,6 @@ grow:  		}  	} -	p->p_lwpcnt++;  	t->t_waitfor = -1;  	/* @@ -696,8 +698,27 @@ grow:  	t->t_post_sys = 1;  	/* +	 * Perform lwp branding +	 * +	 * The b_initlwp hook is _not_ allowed to drop p->p_lock as it must be +	 * continuously held between when the tidhash is sized and when the lwp +	 * is inserted into it.  Operations requiring p->p_lock to be +	 * temporarily dropped can be performed in b_initlwp_post. +	 */ +	if (PROC_IS_BRANDED(p)) { +		BROP(p)->b_initlwp(lwp, brand_data); +		/* +		 * The b_initlwp hook is expected to consume any preallocated +		 * brand_data in a way that prepares it for deallocation by the +		 * b_freelwp hook. +		 */ +		brand_data = NULL; +	} + +	/*  	 * Insert the new thread into the list of all threads.  	 */ +	p->p_lwpcnt++;  	if ((tx = p->p_tlist) == NULL) {  		t->t_back = t;  		t->t_forw = t; @@ -718,6 +739,13 @@ grow:  	lep->le_start = t->t_start;  	lwp_hash_in(p, lep, p->p_tidhash, p->p_tidhash_sz, 1); +	/* +	 * Complete lwp branding +	 */ +	if (PROC_IS_BRANDED(p) && BROP(p)->b_initlwp_post != NULL) { +		BROP(p)->b_initlwp_post(lwp); +	} +  	lwp_fp_init(lwp);  	if (state == TS_RUN) { @@ -755,8 +783,9 @@ error:  		if (cid != NOCLASS && bufp != NULL)  			CL_FREE(cid, bufp); -		if (branded) -			BROP(p)->b_freelwp(lwp); +		if (brand_data != NULL) { +			BROP(p)->b_lwpdata_free(brand_data); +		}  		mutex_exit(&p->p_lock);  		t->t_state = TS_FREE; @@ -829,8 +858,27 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src)  	int i;  	for (i = 0; i < ct_ntypes; i++) { -		dst->lwp_ct_active[i] = ctmpl_dup(src->lwp_ct_active[i]); +		ct_template_t *tmpl = src->lwp_ct_active[i]; + +		/* +		 * If the process contract template is setup to be preserved +		 * across exec, then if we're forking, perform an implicit +		 * template_clear now. This ensures that future children of +		 * this child will remain in the same contract unless they're +		 * explicitly setup differently. We know we're forking if the +		 * two LWPs belong to different processes. 
+		 */ +		if (i == CTT_PROCESS && tmpl != NULL) { +			ctmpl_process_t *ctp = tmpl->ctmpl_data; + +			if (dst->lwp_procp != src->lwp_procp && +			    (ctp->ctp_params & CT_PR_KEEP_EXEC) != 0) +				tmpl = NULL; +		} + +		dst->lwp_ct_active[i] = ctmpl_dup(tmpl);  		dst->lwp_ct_latest[i] = NULL; +  	}  } @@ -838,21 +886,33 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src)   * Clear an LWP's contract template state.   */  void -lwp_ctmpl_clear(klwp_t *lwp) +lwp_ctmpl_clear(klwp_t *lwp, boolean_t is_exec)  {  	ct_template_t *tmpl;  	int i;  	for (i = 0; i < ct_ntypes; i++) { -		if ((tmpl = lwp->lwp_ct_active[i]) != NULL) { -			ctmpl_free(tmpl); -			lwp->lwp_ct_active[i] = NULL; -		} -  		if (lwp->lwp_ct_latest[i] != NULL) {  			contract_rele(lwp->lwp_ct_latest[i]);  			lwp->lwp_ct_latest[i] = NULL;  		} + +		if ((tmpl = lwp->lwp_ct_active[i]) != NULL) { +			/* +			 * If we're exec-ing a new program and the process +			 * contract template is setup to be preserved across +			 * exec, then don't clear it. +			 */ +			if (is_exec && i == CTT_PROCESS) { +				ctmpl_process_t *ctp = tmpl->ctmpl_data; + +				if ((ctp->ctp_params & CT_PR_KEEP_EXEC) != 0) +					continue; +			} + +			ctmpl_free(tmpl); +			lwp->lwp_ct_active[i] = NULL; +		}  	}  } @@ -893,13 +953,6 @@ lwp_exit(void)  	if (t->t_upimutex != NULL)  		upimutex_cleanup(); -	/* -	 * Perform any brand specific exit processing, then release any -	 * brand data associated with the lwp -	 */ -	if (PROC_IS_BRANDED(p)) -		BROP(p)->b_lwpexit(lwp); -  	lwp_pcb_exit();  	mutex_enter(&p->p_lock); @@ -943,6 +996,18 @@ lwp_exit(void)  	DTRACE_PROC(lwp__exit);  	/* +	 * Perform any brand specific exit processing, then release any +	 * brand data associated with the lwp +	 */ +	if (PROC_IS_BRANDED(p)) { +		mutex_exit(&p->p_lock); +		BROP(p)->b_lwpexit(lwp); +		BROP(p)->b_freelwp(lwp); +		mutex_enter(&p->p_lock); +		prbarrier(p); +	} + +	/*  	 * If the lwp is a detached lwp or if the process is exiting,  	 * remove (lwp_hash_out()) the lwp from the lwp directory.  	 * Otherwise null out the lwp's le_thread pointer in the lwp @@ -1103,7 +1168,7 @@ lwp_cleanup(void)  	}  	kpreempt_enable(); -	lwp_ctmpl_clear(ttolwp(t)); +	lwp_ctmpl_clear(ttolwp(t), B_FALSE);  }  int diff --git a/usr/src/uts/common/os/main.c b/usr/src/uts/common/os/main.c index 7bc41b6954..3364d1e523 100644 --- a/usr/src/uts/common/os/main.c +++ b/usr/src/uts/common/os/main.c @@ -158,7 +158,7 @@ exec_init(const char *initpath, const char *args)  	int error = 0, count = 0;  	proc_t *p = ttoproc(curthread);  	klwp_t *lwp = ttolwp(curthread); -	int brand_action; +	int brand_action = EBA_NONE;  	if (args == NULL)  		args = ""; @@ -288,7 +288,15 @@ exec_init(const char *initpath, const char *args)  	 */  	sigemptyset(&curthread->t_hold); -	brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE; +	/* +	 * Only instruct exec_common to brand the process if necessary.  It is +	 * possible that the init process is already properly branded due to the +	 * proc_exit -> restart_init -> exec_init call chain. +	 */ +	if (ZONE_IS_BRANDED(p->p_zone) && +	    p->p_brand != p->p_zone->zone_brand) { +		brand_action = EBA_BRAND; +	}  again:  	error = exec_common((const char *)exec_fnamep,  	    (const char **)uap, NULL, brand_action); diff --git a/usr/src/uts/common/os/mem_config.c b/usr/src/uts/common/os/mem_config.c index 3571747e9c..6be46fa422 100644 --- a/usr/src/uts/common/os/mem_config.c +++ b/usr/src/uts/common/os/mem_config.c @@ -21,6 +21,7 @@  /*   * Copyright 2010 Sun Microsystems, Inc.  
All rights reserved.   * Use is subject to license terms. + * Copyright 2017 Joyent, Inc.   */  #include <sys/types.h> @@ -1638,7 +1639,7 @@ delthr_get_freemem(struct mem_handle *mhp)  		 * Put pressure on pageout.  		 */  		page_needfree(free_get); -		cv_signal(&proc_pageout->p_cv); +		WAKE_PAGEOUT_SCANNER();  		mutex_enter(&mhp->mh_mutex);  		(void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex, diff --git a/usr/src/uts/common/os/mmapobj.c b/usr/src/uts/common/os/mmapobj.c index 142c10754e..0410e6f47b 100644 --- a/usr/src/uts/common/os/mmapobj.c +++ b/usr/src/uts/common/os/mmapobj.c @@ -1381,10 +1381,15 @@ calc_loadable(Ehdr *ehdrp, caddr_t phdrbase, int nphdrs, size_t *len,  			}  			if (num_segs++ == 0) {  				/* -				 * The p_vaddr of the first PT_LOAD segment -				 * must either be NULL or within the first -				 * page in order to be interpreted. -				 * Otherwise, its an invalid file. +				 * While ELF doesn't specify the meaning of +				 * p_vaddr for PT_LOAD segments in ET_DYN +				 * objects, we mandate that is either NULL or +				 * (to accommodate some historical binaries) +				 * within the first page.  (Note that there +				 * exist non-native ET_DYN objects that violate +				 * this constraint that we nonetheless must be +				 * able to execute; see the ET_DYN handling in +				 * mapelfexec() for details.)  				 */  				if (e_type == ET_DYN &&  				    ((caddr_t)((uintptr_t)vaddr & diff --git a/usr/src/uts/common/os/modctl.c b/usr/src/uts/common/os/modctl.c index e2a3335eb4..f1003f7834 100644 --- a/usr/src/uts/common/os/modctl.c +++ b/usr/src/uts/common/os/modctl.c @@ -21,6 +21,7 @@  /*   * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2017 Joyent, Inc.   */  /* @@ -3470,6 +3471,11 @@ mod_load(struct modctl *mp, int usepath)  		retval = install_stubs_by_name(mp, mp->mod_modname);  		/* +		 * Perform hotinlines before module is started. +		 */ +		do_hotinlines(mp->mod_mp); + +		/*  		 * Now that the module is loaded, we need to give DTrace  		 * a chance to notify its providers.  This is done via  		 * the dtrace_modload function pointer. diff --git a/usr/src/uts/common/os/modsysfile.c b/usr/src/uts/common/os/modsysfile.c index 3605104ae7..a04294eed5 100644 --- a/usr/src/uts/common/os/modsysfile.c +++ b/usr/src/uts/common/os/modsysfile.c @@ -22,6 +22,7 @@  /*   * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2018 Joyent, Inc.   * Copyright 2017 Nexenta Systems, Inc.   
*/ @@ -57,10 +58,12 @@ struct hwc_class *hcl_head;	/* head of list of classes */  static kmutex_t hcl_lock;	/* for accessing list of classes */  #define	DAFILE		"/etc/driver_aliases" +#define	PPTFILE		"/etc/ppt_aliases"  #define	CLASSFILE	"/etc/driver_classes"  #define	DACFFILE	"/etc/dacf.conf"  static char class_file[] = CLASSFILE; +static char pptfile[] = PPTFILE;  static char dafile[] = DAFILE;  static char dacffile[] = DACFFILE; @@ -2150,14 +2153,13 @@ hwc_parse_now(char *fname, struct par_list **pl, ddi_prop_t **props)  	return (0);	/* always return success */  } -void -make_aliases(struct bind **bhash) +static void +parse_aliases(struct bind **bhash, struct _buf *file)  {  	enum {  		AL_NEW, AL_DRVNAME, AL_DRVNAME_COMMA, AL_ALIAS, AL_ALIAS_COMMA  	} state; -	struct _buf *file;  	char tokbuf[MAXPATHLEN];  	char drvbuf[MAXPATHLEN];  	token_t token; @@ -2166,9 +2168,6 @@ make_aliases(struct bind **bhash)  	static char dupwarn[] = "!Driver alias \"%s\" conflicts with "  	    "an existing driver name or alias."; -	if ((file = kobj_open_file(dafile)) == (struct _buf *)-1) -		return; -  	state = AL_NEW;  	major = DDI_MAJOR_T_NONE;  	while (!done) { @@ -2253,8 +2252,22 @@ make_aliases(struct bind **bhash)  			kobj_file_err(CE_WARN, file, tok_err, tokbuf);  		}  	} +} -	kobj_close_file(file); +void +make_aliases(struct bind **bhash) +{ +	struct _buf *file; + +	if ((file = kobj_open_file(pptfile)) != (struct _buf *)-1) { +		parse_aliases(bhash, file); +		kobj_close_file(file); +	} + +	if ((file = kobj_open_file(dafile)) != (struct _buf *)-1) { +		parse_aliases(bhash, file); +		kobj_close_file(file); +	}  } diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c index b555bb82b7..eba6147fab 100644 --- a/usr/src/uts/common/os/pid.c +++ b/usr/src/uts/common/os/pid.c @@ -21,6 +21,7 @@  /*   * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -112,6 +113,18 @@ pid_lookup(pid_t pid)  	return (pidp);  } +struct pid * +pid_find(pid_t pid) +{ +	struct pid *pidp; + +	mutex_enter(&pidlinklock); +	pidp = pid_lookup(pid); +	mutex_exit(&pidlinklock); + +	return (pidp); +} +  void  pid_setmin(void)  { @@ -522,6 +535,20 @@ sprunlock(proc_t *p)  	THREAD_KPRI_RELEASE();  } +/* + * Undo effects of sprlock but without dropping p->p_lock + */ +void +sprunprlock(proc_t *p) +{ +	ASSERT(p->p_proc_flag & P_PR_LOCK); +	ASSERT(MUTEX_HELD(&p->p_lock)); + +	cv_signal(&pr_pid_cv[p->p_slot]); +	p->p_proc_flag &= ~P_PR_LOCK; +	THREAD_KPRI_RELEASE(); +} +  void  pid_init(void)  { diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c index d3d362a8a7..861c748cff 100644 --- a/usr/src/uts/common/os/policy.c +++ b/usr/src/uts/common/os/policy.c @@ -56,6 +56,7 @@  #include <sys/mntent.h>  #include <sys/contract_impl.h>  #include <sys/dld_ioc.h> +#include <sys/brand.h>  /*   * There are two possible layers of privilege routines and two possible @@ -1244,6 +1245,22 @@ secpolicy_vnode_owner(const cred_t *cr, uid_t owner)  void  secpolicy_setid_clear(vattr_t *vap, cred_t *cr)  { +	proc_t *p = curproc; + +	/* +	 * Allow the brand to override this behaviour. +	 */ +	if (PROC_IS_BRANDED(p) && BROP(p)->b_setid_clear != NULL) { +		/* +		 * This brand hook will return 0 if handling is complete, or +		 * some other value if the brand would like us to fall back to +		 * the usual behaviour. 
+		 */ +		if (BROP(p)->b_setid_clear(vap, cr) == 0) { +			return; +		} +	} +  	if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 &&  	    secpolicy_vnode_setid_retain(cr,  	    (vap->va_mode & S_ISUID) != 0 && @@ -2092,6 +2109,13 @@ secpolicy_meminfo(const cred_t *cr)  }  int +secpolicy_fs_import(const cred_t *cr) +{ +	return (PRIV_POLICY(cr, PRIV_SYS_FS_IMPORT, B_FALSE, EPERM, NULL)); +} + + +int  secpolicy_pfexec_register(const cred_t *cr)  {  	return (PRIV_POLICY(cr, PRIV_SYS_ADMIN, B_TRUE, EPERM, NULL)); @@ -2607,3 +2631,11 @@ secpolicy_ppp_config(const cred_t *cr)  		return (secpolicy_net_config(cr, B_FALSE));  	return (PRIV_POLICY(cr, PRIV_SYS_PPP_CONFIG, B_FALSE, EPERM, NULL));  } + +int +secpolicy_hyprlofs_control(const cred_t *cr) +{ +	if (PRIV_POLICY(cr, PRIV_HYPRLOFS_CONTROL, B_FALSE, EPERM, NULL)) +		return (EPERM); +	return (0); +} diff --git a/usr/src/uts/common/os/priv_defs b/usr/src/uts/common/os/priv_defs index bc1787c9ca..854fb602da 100644 --- a/usr/src/uts/common/os/priv_defs +++ b/usr/src/uts/common/os/priv_defs @@ -177,6 +177,10 @@ privilege PRIV_GRAPHICS_MAP  	Allows a process to perform privileged mappings through a  	graphics device. +privilege PRIV_HYPRLOFS_CONTROL + +	Allows a process to manage hyprlofs entries. +  privilege PRIV_IPC_DAC_READ  	Allows a process to read a System V IPC @@ -377,6 +381,10 @@ privilege PRIV_SYS_DEVICES  	Allows a process to open the real console device directly.  	Allows a process to open devices that have been exclusively opened. +privilege PRIV_SYS_FS_IMPORT + +	Allows a process to import a potentially untrusted file system. +  privilege PRIV_SYS_IPC_CONFIG  	Allows a process to increase the size of a System V IPC Message diff --git a/usr/src/uts/common/os/rctl.c b/usr/src/uts/common/os/rctl.c index 09b80323d5..e0a1126567 100644 --- a/usr/src/uts/common/os/rctl.c +++ b/usr/src/uts/common/os/rctl.c @@ -20,6 +20,7 @@   */  /*   * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017, Joyent, Inc.   */  #include <sys/atomic.h> @@ -194,6 +195,8 @@ id_space_t *rctl_ids;  kmem_cache_t *rctl_cache;	/* kmem cache for rctl structures */  kmem_cache_t *rctl_val_cache;	/* kmem cache for rctl values */ +extern rctl_hndl_t rc_process_maxlockedmem; +  kmutex_t rctl_lists_lock;  rctl_dict_entry_t *rctl_lists[RC_MAX_ENTITY + 1]; @@ -2872,12 +2875,12 @@ rctl_init(void)   * rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,   *     int chargeproc)   * - * Increments the amount of locked memory on a project, and - * zone. If proj is non-NULL the project must be held by the - * caller; if it is NULL the proj and zone of proc_t p are used. - * If chargeproc is non-zero, then the charged amount is cached - * on p->p_locked_mem so that the charge can be migrated when a - * process changes projects. + * Increments the amount of locked memory on a process, project, and + * zone. If 'proj' is non-NULL, the project must be held by the + * caller; if it is NULL, the project and zone of process 'p' are used. + * If 'chargeproc' is non-zero, then the charged amount is added + * to p->p_locked_mem. This is also used so that the charge can be + * migrated when a process changes projects.   
*   * Return values   *    0 - success @@ -2895,6 +2898,7 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,  	ASSERT(p != NULL);  	ASSERT(MUTEX_HELD(&p->p_lock)); +  	if (proj != NULL) {  		projp = proj;  		zonep = proj->kpj_zone; @@ -2938,11 +2942,23 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,  		}  	} -	zonep->zone_locked_mem += inc; -	projp->kpj_data.kpd_locked_mem += inc;  	if (chargeproc != 0) { +		/* Check for overflow */ +		if ((p->p_locked_mem + inc) < p->p_locked_mem) { +			ret = EAGAIN; +			goto out; +		} +		if (rctl_test_entity(rc_process_maxlockedmem, p->p_rctls, p, +		    &e, inc, 0) & RCT_DENY) { +			ret = EAGAIN; +			goto out; +		} +  		p->p_locked_mem += inc;  	} + +	zonep->zone_locked_mem += inc; +	projp->kpj_data.kpd_locked_mem += inc;  out:  	mutex_exit(&zonep->zone_mem_lock);  	return (ret); diff --git a/usr/src/uts/common/os/rctl_proc.c b/usr/src/uts/common/os/rctl_proc.c index 9b7324fe7b..c62540d2b4 100644 --- a/usr/src/uts/common/os/rctl_proc.c +++ b/usr/src/uts/common/os/rctl_proc.c @@ -21,6 +21,7 @@  /*   * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2017 Joyent, Inc.   */  #include <sys/types.h> @@ -32,6 +33,7 @@  #include <sys/port_kernel.h>  #include <sys/signal.h>  #include <sys/var.h> +#include <sys/policy.h>  #include <sys/vmparam.h>  #include <sys/machparam.h> @@ -66,6 +68,7 @@ rctl_hndl_t rc_process_semmsl;  rctl_hndl_t rc_process_semopm;  rctl_hndl_t rc_process_portev;  rctl_hndl_t rc_process_sigqueue; +rctl_hndl_t rc_process_maxlockedmem;  /*   * process.max-cpu-time / RLIMIT_CPU @@ -212,6 +215,26 @@ static rctl_ops_t proc_vmem_ops = {  };  /* + * process.max-locked-memory + */ +/*ARGSUSED*/ +static int +proc_maxlockedmem_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e, +    struct rctl_val *rv, rctl_qty_t i, uint_t f) +{ +	if (secpolicy_lock_memory(CRED()) == 0) +		return (0); +	return ((p->p_locked_mem + i) > rv->rcv_value); +} + +static rctl_ops_t proc_maxlockedmem_ops = { +	rcop_no_action, +	rcop_no_usage, +	rcop_no_set, +	proc_maxlockedmem_test +}; + +/*   * void rctlproc_default_init()   *   * Overview @@ -383,6 +406,11 @@ rctlproc_init(void)  	rctl_add_default_limit("process.max-sigqueue-size",  	    _SIGQUEUE_SIZE_PRIVILEGED, RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY); +	rc_process_maxlockedmem = rctl_register("process.max-locked-memory", +	    RCENTITY_PROCESS, RCTL_GLOBAL_LOWERABLE | RCTL_GLOBAL_DENY_ALWAYS | +	    RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_BYTES, +	    ULONG_MAX, UINT32_MAX, &proc_maxlockedmem_ops); +  	/*  	 * Place minimal set of controls on "sched" process for inheritance by  	 * processes created via newproc(). diff --git a/usr/src/uts/common/os/sched.c b/usr/src/uts/common/os/sched.c index c1d6569f11..15e77d39f7 100644 --- a/usr/src/uts/common/os/sched.c +++ b/usr/src/uts/common/os/sched.c @@ -27,6 +27,10 @@  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/  /*	  All Rights Reserved	*/ +/* + * Copyright (c) 2015, Joyent, Inc.  All rights reserved. + */ +  #include <sys/param.h>  #include <sys/types.h>  #include <sys/sysmacros.h> @@ -646,16 +650,17 @@ top:  		klwp_t *lwp = ttolwp(tp);  		/* -		 * Swapout eligible lwps (specified by the scheduling -		 * class) which don't have TS_DONT_SWAP set.  Set the -		 * "intent to swap" flag (TS_SWAPENQ) on threads -		 * which have TS_DONT_SWAP set so that they can be +		 * Swapout eligible lwps (specified by the scheduling class) +		 * which don't have TS_DONT_SWAP set.  
Set the "intent to swap" +		 * flag (TS_SWAPENQ) on threads which have either TS_DONT_SWAP +		 * set or are currently on a split stack so that they can be  		 * swapped if and when they reach a safe point.  		 */  		thread_lock(tp);  		thread_pri = CL_SWAPOUT(tp, swapflags);  		if (thread_pri != -1) { -			if (tp->t_schedflag & TS_DONT_SWAP) { +			if ((tp->t_schedflag & TS_DONT_SWAP) || +			    (tp->t_flag & T_SPLITSTK)) {  				tp->t_schedflag |= TS_SWAPENQ;  				tp->t_trapret = 1;  				aston(tp); diff --git a/usr/src/uts/common/os/schedctl.c b/usr/src/uts/common/os/schedctl.c index 5721083751..18b396a765 100644 --- a/usr/src/uts/common/os/schedctl.c +++ b/usr/src/uts/common/os/schedctl.c @@ -22,6 +22,7 @@  /*   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2016 Joyent, Inc.   */  #include <sys/types.h> @@ -327,12 +328,17 @@ schedctl_sigblock(kthread_t *t)  /* - * If the sc_sigblock field is set for the specified thread, set - * its signal mask to block all maskable signals, then clear the - * sc_sigblock field.  This finishes what user-level code requested - * to be done when it set tdp->sc_shared->sc_sigblock non-zero. - * Called from signal-related code either by the current thread for - * itself or by a thread that holds the process's p_lock (/proc code). + * If the sc_sigblock field is set for the specified thread, set its signal + * mask to block all maskable signals, then clear the sc_sigblock field.  This + * accomplishes what user-level code requested to be done when it set + * tdp->sc_shared->sc_sigblock non-zero. + * + * This is generally called by signal-related code in the current thread.  In + * order to call against a thread other than curthread, p_lock for the + * containing process must be held.  Even then, the caller is not protected + * from races with the thread in question updating its own fields.  It is the + * responsibility of the caller to perform additional synchronization. + *   */  void  schedctl_finish_sigblock(kthread_t *t) diff --git a/usr/src/uts/common/os/shm.c b/usr/src/uts/common/os/shm.c index bacc595f78..5deae96d73 100644 --- a/usr/src/uts/common/os/shm.c +++ b/usr/src/uts/common/os/shm.c @@ -21,6 +21,7 @@  /*   * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -319,6 +320,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)  		size_t	share_size;  		struct	shm_data ssd;  		uintptr_t align_hint; +		long	curprot;  		/*  		 * Pick a share pagesize to use, if (!isspt(sp)). 
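Stepping back to the process.max-locked-memory resource control registered in rctl_proc.c above: it is a per-process, byte-valued control enforced by rctl_incr_locked_mem(), and it is manageable through the standard resource-control interfaces. A hedged userland sketch follows; the 64MB value is arbitrary, and inserting a privileged value like this requires the appropriate resource-management privilege.

#include <rctl.h>
#include <stdio.h>
#include <stdlib.h>

int
limit_locked_memory(void)
{
	rctlblk_t *blk;

	if ((blk = calloc(1, rctlblk_size())) == NULL)
		return (-1);

	/* Deny locking more than 64MB of memory in this process. */
	rctlblk_set_value(blk, 64ULL * 1024 * 1024);
	rctlblk_set_privilege(blk, RCPRIV_PRIVILEGED);
	rctlblk_set_local_action(blk, RCTL_LOCAL_DENY, 0);

	if (setrctl("process.max-locked-memory", NULL, blk,
	    RCTL_INSERT) != 0) {
		perror("setrctl");
		free(blk);
		return (-1);
	}

	free(blk);
	return (0);
}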
@@ -453,6 +455,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)  			}  		} +		curprot = sp->shm_opts & SHM_PROT_MASK;  		if (!isspt(sp)) {  			error = sptcreate(size, &segspt, sp->shm_amp, prot,  			    flags, share_szc); @@ -462,8 +465,8 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)  			}  			sp->shm_sptinfo->sptas = segspt->s_as;  			sp->shm_sptseg = segspt; -			sp->shm_sptprot = prot; -		} else if ((prot & sp->shm_sptprot) != sp->shm_sptprot) { +			sp->shm_opts = (sp->shm_opts & ~SHM_PROT_MASK) | prot; +		} else if ((prot & curprot) != curprot) {  			/*  			 * Ensure we're attaching to an ISM segment with  			 * fewer or equal permissions than what we're @@ -748,6 +751,23 @@ shmctl(int shmid, int cmd, void *arg)  		}  		break; +	/* Stage segment for removal, but don't remove until last detach */ +	case SHM_RMID: +		if ((error = secpolicy_ipc_owner(cr, (kipc_perm_t *)sp)) != 0) +			break; + +		/* +		 * If attached, just mark it as a pending remove, otherwise +		 * we must perform the normal ipc_rmid now. +		 */ +		if ((sp->shm_perm.ipc_ref - 1) > 0) { +			sp->shm_opts |= SHM_RM_PENDING; +		} else { +			mutex_exit(lock); +			return (ipc_rmid(shm_svc, shmid, cr)); +		} +		break; +  	default:  		error = EINVAL;  		break; @@ -778,6 +798,23 @@ shm_detach(proc_t *pp, segacct_t *sap)  		sp->shm_ismattch--;  	sp->shm_dtime = gethrestime_sec();  	sp->shm_lpid = pp->p_pid; +	if ((sp->shm_opts & SHM_RM_PENDING) != 0 && +	    sp->shm_perm.ipc_ref == 2) { +		/* +		 * If this is the last detach of the segment across the whole +		 * system then now we can perform the delayed IPC_RMID. +		 * The ipc_ref count has 1 for the original 'get' and one for +		 * each 'attach' (see 'stat' handling in shmctl). +		 */ +		sp->shm_opts &= ~SHM_RM_PENDING; +		mutex_enter(&shm_svc->ipcs_lock); +		ipc_rmsvc(shm_svc, (kipc_perm_t *)sp);	/* Drops lock */ +		ASSERT(!MUTEX_HELD(&shm_svc->ipcs_lock)); +		ASSERT(((kipc_perm_t *)sp)->ipc_ref > 0); + +		/* Lock was dropped, need to retake it for following rele. */ +		(void) ipc_lock(shm_svc, sp->shm_perm.ipc_id); +	}  	ipc_rele(shm_svc, (kipc_perm_t *)sp);	/* Drops lock */  	kmem_free(sap, sizeof (segacct_t)); diff --git a/usr/src/uts/common/os/sig.c b/usr/src/uts/common/os/sig.c index 453b1f22d4..67a93581dd 100644 --- a/usr/src/uts/common/os/sig.c +++ b/usr/src/uts/common/os/sig.c @@ -22,7 +22,7 @@  /*   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. - * Copyright (c) 2014, Joyent, Inc.  All rights reserved. + * Copyright 2017, Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -60,6 +60,7 @@  #include <sys/cyclic.h>  #include <sys/dtrace.h>  #include <sys/sdt.h> +#include <sys/brand.h>  #include <sys/signalfd.h>  const k_sigset_t nullsmask = {0, 0, 0}; @@ -148,6 +149,21 @@ signal_is_blocked(kthread_t *t, int sig)  }  /* + * Return true if the signal can safely be ignored. + * That is, if the signal is included in the p_ignore mask and doing so is not + * forbidden by any process branding. + */ +static int +sig_ignorable(proc_t *p, klwp_t *lwp, int sig) +{ +	return (sigismember(&p->p_ignore, sig) &&	/* sig in ignore mask */ +	    !(PROC_IS_BRANDED(p) &&			/* allowed by brand */ +	    BROP(p)->b_sig_ignorable != NULL && +	    BROP(p)->b_sig_ignorable(p, lwp, sig) == B_FALSE)); + +} + +/*   * Return true if the signal can safely be discarded on generation.   * That is, if there is no need for the signal on the receiving end.   
* The answer is true if the process is a zombie or @@ -159,12 +175,13 @@ signal_is_blocked(kthread_t *t, int sig)   *	the signal is not being accepted via sigwait()   */  static int -sig_discardable(proc_t *p, int sig) +sig_discardable(proc_t *p, kthread_t *tp, int sig)  {  	kthread_t *t = p->p_tlist; +	klwp_t *lwp = (tp == NULL) ? NULL : tp->t_lwp;  	return (t == NULL ||		/* if zombie or ... */ -	    (sigismember(&p->p_ignore, sig) &&	/* signal is ignored */ +	    (sig_ignorable(p, lwp, sig) &&		/* signal is ignored */  	    t->t_forw == t &&			/* and single-threaded */  	    !tracing(p, sig) &&			/* and no /proc tracing */  	    !signal_is_blocked(t, sig) &&	/* and signal not blocked */ @@ -200,7 +217,7 @@ eat_signal(kthread_t *t, int sig)  		    !(ttoproc(t)->p_proc_flag & P_PR_LOCK)) {  			ttoproc(t)->p_stopsig = 0;  			t->t_dtrace_stop = 0; -			t->t_schedflag |= TS_XSTART | TS_PSTART; +			t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART;  			setrun_locked(t);  		} else if (t != curthread && t->t_state == TS_ONPROC) {  			aston(t);	/* make it do issig promptly */ @@ -297,7 +314,7 @@ sigtoproc(proc_t *p, kthread_t *t, int sig)  		}  	} -	if (sig_discardable(p, sig)) { +	if (sig_discardable(p, t, sig)) {  		DTRACE_PROC3(signal__discard, kthread_t *, p->p_tlist,  		    proc_t *, p, int, sig);  		return; @@ -497,7 +514,7 @@ issig_justlooking(void)  			if (sigismember(&set, sig) &&  			    (tracing(p, sig) ||  			    sigismember(&t->t_sigwait, sig) || -			    !sigismember(&p->p_ignore, sig))) { +			    !sig_ignorable(p, lwp, sig))) {  				/*  				 * Don't promote a signal that will stop  				 * the process when lwp_nostop is set. @@ -623,6 +640,28 @@ issig_forreal(void)  		}  		/* +		 * The brand hook name 'b_issig_stop' is a misnomer. +		 * Allow the brand the chance to alter (or suppress) delivery +		 * of this signal. +		 */ +		if (PROC_IS_BRANDED(p) && BROP(p)->b_issig_stop != NULL) { +			int r; + +			/* +			 * The brand hook will return 0 if it would like +			 * us to drive on, -1 if we should restart +			 * the loop to check other conditions, or 1 if we +			 * should terminate the loop. +			 */ +			r = BROP(p)->b_issig_stop(p, lwp); +			if (r < 0) { +				continue; +			} else if (r > 0) { +				break; +			} +		} + +		/*  		 * Honor requested stop before dealing with the  		 * current signal; a debugger may change it.  		 * Do not want to go back to loop here since this is a special @@ -656,7 +695,7 @@ issig_forreal(void)  			lwp->lwp_cursig = 0;  			lwp->lwp_extsig = 0;  			if (sigismember(&t->t_sigwait, sig) || -			    (!sigismember(&p->p_ignore, sig) && +			    (!sig_ignorable(p, lwp, sig) &&  			    !isjobstop(sig))) {  				if (p->p_flag & (SEXITLWPS|SKILLED)) {  					sig = SIGKILL; @@ -708,7 +747,7 @@ issig_forreal(void)  				toproc = 0;  				if (tracing(p, sig) ||  				    sigismember(&t->t_sigwait, sig) || -				    !sigismember(&p->p_ignore, sig)) { +				    !sig_ignorable(p, lwp, sig)) {  					if (sigismember(&t->t_extsig, sig))  						ext = 1;  					break; @@ -722,7 +761,7 @@ issig_forreal(void)  				toproc = 1;  				if (tracing(p, sig) ||  				    sigismember(&t->t_sigwait, sig) || -				    !sigismember(&p->p_ignore, sig)) { +				    !sig_ignorable(p, lwp, sig)) {  					if (sigismember(&p->p_extsig, sig))  						ext = 1;  					break; @@ -954,6 +993,16 @@ stop(int why, int what)  		}  		break; +	case PR_BRAND: +		/* +		 * We have been stopped by the brand code for a brand-private +		 * reason.  This is an asynchronous stop affecting only this +		 * LWP. 
+		 */ +		VERIFY(PROC_IS_BRANDED(p)); +		flags &= ~TS_BSTART; +		break; +  	default:	/* /proc stop */  		flags &= ~TS_PSTART;  		/* @@ -1065,7 +1114,7 @@ stop(int why, int what)  		}  	} -	if (why != PR_JOBCONTROL && why != PR_CHECKPOINT) { +	if (why != PR_JOBCONTROL && why != PR_CHECKPOINT && why != PR_BRAND) {  		/*  		 * Do process-level notification when all lwps are  		 * either stopped on events of interest to /proc @@ -1171,6 +1220,13 @@ stop(int why, int what)  	if (why == PR_CHECKPOINT)  		del_one_utstop(); +	/* +	 * Allow the brand to post notification of this stop condition. +	 */ +	if (PROC_IS_BRANDED(p) && BROP(p)->b_stop_notify != NULL) { +		BROP(p)->b_stop_notify(p, lwp, why, what); +	} +  	thread_lock(t);  	ASSERT((t->t_schedflag & TS_ALLSTART) == 0);  	t->t_schedflag |= flags; @@ -1192,7 +1248,7 @@ stop(int why, int what)  		    (p->p_flag & (SEXITLWPS|SKILLED))) {  			p->p_stopsig = 0;  			thread_lock(t); -			t->t_schedflag |= TS_XSTART | TS_PSTART; +			t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART;  			setrun_locked(t);  			thread_unlock_nopreempt(t);  		} else if (why == PR_JOBCONTROL) { @@ -1327,7 +1383,7 @@ psig(void)  	 * this signal from pending to current (we dropped p->p_lock).  	 * This can happen only in a multi-threaded process.  	 */ -	if (sigismember(&p->p_ignore, sig) || +	if (sig_ignorable(p, lwp, sig) ||  	    (func == SIG_DFL && sigismember(&stopdefault, sig))) {  		lwp->lwp_cursig = 0;  		lwp->lwp_extsig = 0; @@ -1771,9 +1827,12 @@ post_sigcld(proc_t *cp, sigqueue_t *sqp)  			/*  			 * This can only happen when the parent is init.  			 * (See call to sigcld(q, NULL) in exit().) -			 * Use KM_NOSLEEP to avoid deadlock. +			 * Use KM_NOSLEEP to avoid deadlock. The child procs +			 * initpid can be 1 for zlogin.  			 */ -			ASSERT(pp == proc_init); +			ASSERT(pp->p_pidp->pid_id == +			    cp->p_zone->zone_proc_initpid || +			    pp->p_pidp->pid_id == 1);  			winfo(cp, &info, 0);  			sigaddq(pp, NULL, &info, KM_NOSLEEP);  		} else { @@ -1804,6 +1863,15 @@ sigcld_repost()  	sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);  	mutex_enter(&pidlock); +	if (PROC_IS_BRANDED(pp) && BROP(pp)->b_sigcld_repost != NULL) { +		/* +		 * Allow the brand to inject synthetic SIGCLD signals. +		 */ +		if (BROP(pp)->b_sigcld_repost(pp, sqp) == 0) { +			mutex_exit(&pidlock); +			return; +		} +	}  	for (cp = pp->p_child; cp; cp = cp->p_sibling) {  		if (cp->p_pidflag & CLDPEND) {  			post_sigcld(cp, sqp); @@ -2115,7 +2183,7 @@ sigaddqa(proc_t *p, kthread_t *t, sigqueue_t *sigqp)  	ASSERT(MUTEX_HELD(&p->p_lock));  	ASSERT(sig >= 1 && sig < NSIG); -	if (sig_discardable(p, sig)) +	if (sig_discardable(p, t, sig))  		siginfofree(sigqp);  	else  		sigaddqins(p, t, sigqp); @@ -2141,7 +2209,7 @@ sigaddq(proc_t *p, kthread_t *t, k_siginfo_t *infop, int km_flags)  	 * blocking the signal (it *could* change it's mind while  	 * the signal is pending) then don't bother creating one.  	 */ -	if (!sig_discardable(p, sig) && +	if (!sig_discardable(p, t, sig) &&  	    (sigismember(&p->p_siginfo, sig) ||  	    (curproc->p_ct_process != p->p_ct_process) ||  	    (sig == SIGCLD && SI_FROMKERNEL(infop))) && diff --git a/usr/src/uts/common/os/smb_subr.c b/usr/src/uts/common/os/smb_subr.c index 6084676b17..6dc7230bed 100644 --- a/usr/src/uts/common/os/smb_subr.c +++ b/usr/src/uts/common/os/smb_subr.c @@ -25,7 +25,9 @@   * Use is subject to license terms.   */ -#pragma ident	"%Z%%M%	%I%	%E% SMI" +/* + * Copyright (c) 2015 Joyent, Inc.  All rights reserved. 
+ */  #include <sys/smbios_impl.h>  #include <sys/cmn_err.h> @@ -43,13 +45,13 @@ smb_strerror(int err)  void *  smb_alloc(size_t len)  { -	return (kmem_alloc(len, KM_SLEEP)); +	return (len > 0 ? kmem_alloc(len, KM_SLEEP) : NULL);  }  void *  smb_zalloc(size_t len)  { -	return (kmem_zalloc(len, KM_SLEEP)); +	return (len > 0 ? kmem_zalloc(len, KM_SLEEP) : NULL);  }  void diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c index d4c2f7023d..68afeef013 100644 --- a/usr/src/uts/common/os/streamio.c +++ b/usr/src/uts/common/os/streamio.c @@ -78,6 +78,7 @@  #include <sys/policy.h>  #include <sys/dld.h>  #include <sys/zone.h> +#include <sys/limits.h>  #include <c2/audit.h>  /* @@ -986,12 +987,20 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,  		 * (registered in sd_wakeq).  		 */  		struiod_t uiod; +		struct iovec buf[IOV_MAX_STACK]; +		int iovlen = 0;  		if (first)  			stp->sd_wakeq &= ~RSLEEP; -		(void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, -		    sizeof (uiod.d_iov) / sizeof (*uiod.d_iov)); +		if (uiop->uio_iovcnt > IOV_MAX_STACK) { +			iovlen = uiop->uio_iovcnt * sizeof (iovec_t); +			uiod.d_iov = kmem_alloc(iovlen, KM_SLEEP); +		} else { +			uiod.d_iov = buf; +		} + +		(void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt);  		uiod.d_mp = 0;  		/*  		 * Mark that a thread is in rwnext on the read side @@ -1030,6 +1039,8 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,  			if ((bp = uiod.d_mp) != NULL) {  				*errorp = 0;  				ASSERT(MUTEX_HELD(&stp->sd_lock)); +				if (iovlen != 0) +					kmem_free(uiod.d_iov, iovlen);  				return (bp);  			}  			error = 0; @@ -1049,8 +1060,14 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,  		} else {  			*errorp = error;  			ASSERT(MUTEX_HELD(&stp->sd_lock)); +			if (iovlen != 0) +				kmem_free(uiod.d_iov, iovlen);  			return (NULL);  		} + +		if (iovlen != 0) +			kmem_free(uiod.d_iov, iovlen); +  		/*  		 * Try a getq in case a rwnext() generated mblk  		 * has bubbled up via strrput(). @@ -2545,6 +2562,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,      int b_flag, int pri, int flags)  {  	struiod_t uiod; +	struct iovec buf[IOV_MAX_STACK]; +	int iovlen = 0;  	mblk_t *mp;  	queue_t *wqp = stp->sd_wrq;  	int error = 0; @@ -2636,13 +2655,21 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,  	mp->b_flag |= b_flag;  	mp->b_band = (uchar_t)pri; -	(void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, -	    sizeof (uiod.d_iov) / sizeof (*uiod.d_iov)); +	if (uiop->uio_iovcnt > IOV_MAX_STACK) { +		iovlen = uiop->uio_iovcnt * sizeof (iovec_t); +		uiod.d_iov = (struct iovec *)kmem_alloc(iovlen, KM_SLEEP); +	} else { +		uiod.d_iov = buf; +	} + +	(void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt);  	uiod.d_uio.uio_offset = 0;  	uiod.d_mp = mp;  	error = rwnext(wqp, &uiod);  	if (! 
uiod.d_mp) {  		uioskip(uiop, *iosize); +		if (iovlen != 0) +			kmem_free(uiod.d_iov, iovlen);  		return (error);  	}  	ASSERT(mp == uiod.d_mp); @@ -2660,17 +2687,23 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,  		error = 0;  	} else {  		freemsg(mp); +		if (iovlen != 0) +			kmem_free(uiod.d_iov, iovlen);  		return (error);  	}  	/* Have to check canput before consuming data from the uio */  	if (pri == 0) {  		if (!canputnext(wqp) && !(flags & MSG_IGNFLOW)) {  			freemsg(mp); +			if (iovlen != 0) +				kmem_free(uiod.d_iov, iovlen);  			return (EWOULDBLOCK);  		}  	} else {  		if (!bcanputnext(wqp, pri) && !(flags & MSG_IGNFLOW)) {  			freemsg(mp); +			if (iovlen != 0) +				kmem_free(uiod.d_iov, iovlen);  			return (EWOULDBLOCK);  		}  	} @@ -2678,6 +2711,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,  	/* Copyin data from the uio */  	if ((error = struioget(wqp, mp, &uiod, 0)) != 0) {  		freemsg(mp); +		if (iovlen != 0) +			kmem_free(uiod.d_iov, iovlen);  		return (error);  	}  	uioskip(uiop, *iosize); @@ -2694,6 +2729,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,  		putnext(wqp, mp);  		stream_runservice(stp);  	} +	if (iovlen != 0) +		kmem_free(uiod.d_iov, iovlen);  	return (0);  } @@ -3179,6 +3216,7 @@ job_control_type(int cmd)  	case JAGENT:	/* Obsolete */  	case JTRUN:	/* Obsolete */  	case JXTPROTO:	/* Obsolete */ +	case TIOCSETLD:  		return (JCSETP);  	} diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c index 1ffb561428..ac1ee2d1ce 100644 --- a/usr/src/uts/common/os/strsubr.c +++ b/usr/src/uts/common/os/strsubr.c @@ -26,6 +26,7 @@   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2018 Joyent, Inc.   * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.   */ @@ -8470,6 +8471,12 @@ mblk_copycred(mblk_t *mp, const mblk_t *src)  		dbp->db_cpid = cpid;  } + +/* + * Now that NIC drivers are expected to deal only with M_DATA mblks, the + * hcksum_assoc and hcksum_retrieve functions are deprecated in favor of their + * respective mac_hcksum_set and mac_hcksum_get counterparts. + */  int  hcksum_assoc(mblk_t *mp,  multidata_t *mmd, pdesc_t *pd,      uint32_t start, uint32_t stuff, uint32_t end, uint32_t value, diff --git a/usr/src/uts/common/os/sunddi.c b/usr/src/uts/common/os/sunddi.c index c39819156d..e0cc20fa45 100644 --- a/usr/src/uts/common/os/sunddi.c +++ b/usr/src/uts/common/os/sunddi.c @@ -5903,6 +5903,12 @@ ddi_ffs(long mask)  	return (ffs(mask));  } +int +ddi_ffsll(long long mask) +{ +	return (ffs(mask)); +} +  /*   * Find last bit set. Take mask and clear   * all but the most significant bit, and @@ -5914,8 +5920,14 @@ ddi_ffs(long mask)  int  ddi_fls(long mask)  { +	return (ddi_flsll(mask)); +} + +int +ddi_flsll(long long mask) +{  	while (mask) { -		long nx; +		long long nx;  		if ((nx = (mask & (mask - 1))) == 0)  			break; diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c index fb8bf07077..fb64000e4d 100644 --- a/usr/src/uts/common/os/sysent.c +++ b/usr/src/uts/common/os/sysent.c @@ -23,6 +23,7 @@   * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.   * Copyright 2012 Milan Jurik. All rights reserved.   * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc.   * Copyright (c) 2018, Joyent, Inc.   
*/ @@ -61,8 +62,7 @@ struct mmaplf32a;  int	access(char *, int);  int	alarm(int);  int	auditsys(struct auditcalls *, rval_t *); -int64_t	brandsys(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, -    uintptr_t); +int64_t	brandsys(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t);  intptr_t	brk(caddr_t);  int	chdir(char *);  int	chmod(char *, int); @@ -647,7 +647,7 @@ struct sysent sysent[NSYSCALL] =  			SYSENT_NOSYS(),  			SYSENT_C("llseek",	llseek32,	4)),  	/* 176 */ SYSENT_LOADABLE(),		/* inst_sync */ -	/* 177 */ SYSENT_CI("brandsys",		brandsys,	6), +	/* 177 */ SYSENT_CI("brandsys",		brandsys,	5),  	/* 178 */ SYSENT_LOADABLE(),		/* kaio */  	/* 179 */ SYSENT_LOADABLE(),		/* cpc */  	/* 180 */ SYSENT_CI("lgrpsys",		lgrpsys,	3), @@ -1002,7 +1002,7 @@ struct sysent sysent32[NSYSCALL] =  	/* 174 */ SYSENT_CI("pwrite",		pwrite32,		4),  	/* 175 */ SYSENT_C("llseek",		llseek32,	4),  	/* 176 */ SYSENT_LOADABLE32(),		/* inst_sync */ -	/* 177 */ SYSENT_CI("brandsys",		brandsys,	6), +	/* 177 */ SYSENT_CI("brandsys",		brandsys,	5),  	/* 178 */ SYSENT_LOADABLE32(),		/* kaio */  	/* 179 */ SYSENT_LOADABLE32(),		/* cpc */  	/* 180 */ SYSENT_CI("lgrpsys",		lgrpsys,	3), @@ -1094,18 +1094,20 @@ char **syscallnames;  systrace_sysent_t *systrace_sysent;  void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t, -    uintptr_t, uintptr_t, uintptr_t, uintptr_t); +    uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);  /*ARGSUSED*/  void  systrace_stub(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, -    uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) +    uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, +    uintptr_t arg6, uintptr_t arg7)  {}  /*ARGSUSED*/  int64_t  dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, -    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) +    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, +    uintptr_t arg7)  {  	systrace_sysent_t *sy = &systrace_sysent[curthread->t_sysnum];  	dtrace_id_t id; @@ -1113,7 +1115,8 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,  	proc_t *p;  	if ((id = sy->stsy_entry) != DTRACE_IDNONE) -		(*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5); +		(*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, +		    arg6, arg7);  	/*  	 * We want to explicitly allow DTrace consumers to stop a process @@ -1127,14 +1130,15 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,  	}  	mutex_exit(&p->p_lock); -	rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); +	rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, +	    arg6, arg7);  	if (ttolwp(curthread)->lwp_errno != 0)  		rval = -1;  	if ((id = sy->stsy_return) != DTRACE_IDNONE)  		(*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, -		    (uintptr_t)((int64_t)rval >> 32), 0, 0, 0); +		    (uintptr_t)((int64_t)rval >> 32), 0, 0, 0, 0, 0);  	return (rval);  } @@ -1146,7 +1150,8 @@ systrace_sysent_t *systrace_sysent32;  /*ARGSUSED*/  int64_t  dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, -    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) +    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, +    uintptr_t arg7)  {  	systrace_sysent_t *sy = &systrace_sysent32[curthread->t_sysnum];  	dtrace_id_t id; @@ -1154,7 +1159,8 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,  	proc_t *p;  	if ((id = sy->stsy_entry) != DTRACE_IDNONE) -		(*systrace_probe)(id, arg0, arg1, 
arg2, arg3, arg4, arg5); +		(*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, arg6, +		    arg7);  	/*  	 * We want to explicitly allow DTrace consumers to stop a process @@ -1168,14 +1174,15 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,  	}  	mutex_exit(&p->p_lock); -	rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); +	rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, arg6, +	    arg7);  	if (ttolwp(curthread)->lwp_errno != 0)  		rval = -1;  	if ((id = sy->stsy_return) != DTRACE_IDNONE)  		(*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, -		    (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0); +		    (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0, 0, 0);  	return (rval);  } @@ -1203,5 +1210,5 @@ dtrace_systrace_rtt(void)  	}  	if ((id = sy->stsy_return) != DTRACE_IDNONE) -		(*systrace_probe)(id, 0, 0, 0, 0, 0, 0); +		(*systrace_probe)(id, 0, 0, 0, 0, 0, 0, 0, 0);  } diff --git a/usr/src/uts/common/os/timer.c b/usr/src/uts/common/os/timer.c index b25a6cbcf1..5453ebf380 100644 --- a/usr/src/uts/common/os/timer.c +++ b/usr/src/uts/common/os/timer.c @@ -25,11 +25,12 @@   */  /* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright 2017 Joyent, Inc.   */  #include <sys/timer.h>  #include <sys/systm.h> +#include <sys/sysmacros.h>  #include <sys/param.h>  #include <sys/kmem.h>  #include <sys/debug.h> @@ -81,6 +82,7 @@ timer_lock(proc_t *p, itimer_t *it)   * waiters.  p_lock must be held on entry; it will not be dropped by   * timer_unlock().   */ +/* ARGSUSED */  static void  timer_unlock(proc_t *p, itimer_t *it)  { @@ -123,6 +125,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it)  		timer_lock(p, it);  	} +	ASSERT(p->p_itimer_sz > tid);  	ASSERT(p->p_itimer[tid] == it);  	p->p_itimer[tid] = NULL; @@ -137,7 +140,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it)  	it->it_backend->clk_timer_delete(it); -	if (it->it_portev) { +	if (it->it_flags & IT_PORT) {  		mutex_enter(&it->it_mutex);  		if (it->it_portev) {  			port_kevent_t	*pev; @@ -199,18 +202,20 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it)  static itimer_t *  timer_grab(proc_t *p, timer_t tid)  { -	itimer_t **itp, *it; +	itimer_t *it; -	if (tid >= timer_max || tid < 0) +	if (tid < 0) {  		return (NULL); +	}  	mutex_enter(&p->p_lock); - -	if ((itp = p->p_itimer) == NULL || (it = itp[tid]) == NULL) { +	if (p->p_itimer == NULL || tid >= p->p_itimer_sz || +	    (it = p->p_itimer[tid]) == NULL) {  		mutex_exit(&p->p_lock);  		return (NULL);  	} +	/* This may drop p_lock temporarily. */  	timer_lock(p, it);  	if (it->it_lock & ITLK_REMOVE) { @@ -232,7 +237,7 @@ timer_grab(proc_t *p, timer_t tid)   * should not be held on entry; timer_release() will acquire p_lock but   * will drop it before returning.   */ -static void +void  timer_release(proc_t *p, itimer_t *it)  {  	mutex_enter(&p->p_lock); @@ -245,7 +250,7 @@ timer_release(proc_t *p, itimer_t *it)   * p_lock should not be held on entry; timer_delete_grabbed() will acquire   * p_lock, but will drop it before returning.   */ -static void +void  timer_delete_grabbed(proc_t *p, timer_t tid, itimer_t *it)  {  	mutex_enter(&p->p_lock); @@ -258,6 +263,13 @@ clock_timer_init()  {  	clock_timer_cache = kmem_cache_create("timer_cache",  	    sizeof (itimer_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + +	/* +	 * Push the timer_max limit up to at least 4 * NCPU.  Due to the way +	 * NCPU is defined, proper initialization of the timer limit is +	 * performed at runtime. 
+	 */ +	timer_max = MAX(NCPU * 4, timer_max);  }  void @@ -453,6 +465,9 @@ timer_fire(itimer_t *it)  			it->it_pending = 1;  			port_send_event((port_kevent_t *)it->it_portev);  			mutex_exit(&it->it_mutex); +		} else if (it->it_flags & IT_CALLBACK) { +			it->it_cb_func(it); +			ASSERT(MUTEX_NOT_HELD(&it->it_mutex));  		} else if (it->it_flags & IT_SIGNAL) {  			it->it_pending = 1;  			mutex_exit(&it->it_mutex); @@ -466,159 +481,175 @@ timer_fire(itimer_t *it)  		mutex_exit(&p->p_lock);  } -int -timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) +/* + * Allocate an itimer_t and find and appropriate slot for it in p_itimer. + * Acquires p_lock and holds it on return, regardless of success. + */ +static itimer_t * +timer_alloc(proc_t *p, timer_t *id)  { -	struct sigevent ev; -	proc_t *p = curproc; -	clock_backend_t *backend; -	itimer_t *it, **itp; -	sigqueue_t *sigq; -	cred_t *cr = CRED(); -	int error = 0; -	timer_t i; -	port_notify_t tim_pnevp; -	port_kevent_t *pkevp = NULL; +	itimer_t *it, **itp = NULL; +	uint_t i; -	if ((backend = CLOCK_BACKEND(clock)) == NULL) -		return (set_errno(EINVAL)); +	ASSERT(MUTEX_NOT_HELD(&p->p_lock)); -	if (evp != NULL) { -		/* -		 * short copyin() for binary compatibility -		 * fetch oldsigevent to determine how much to copy in. -		 */ -		if (get_udatamodel() == DATAMODEL_NATIVE) { -			if (copyin(evp, &ev, sizeof (struct oldsigevent))) -				return (set_errno(EFAULT)); +	it = kmem_cache_alloc(clock_timer_cache, KM_SLEEP); +	bzero(it, sizeof (itimer_t)); +	mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL); -			if (ev.sigev_notify == SIGEV_PORT || -			    ev.sigev_notify == SIGEV_THREAD) { -				if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp, -				    sizeof (port_notify_t))) -					return (set_errno(EFAULT)); +	mutex_enter(&p->p_lock); +retry: +	if (p->p_itimer != NULL) { +		for (i = 0; i < p->p_itimer_sz; i++) { +			if (p->p_itimer[i] == NULL) { +				itp = &(p->p_itimer[i]); +				break;  			} -#ifdef	_SYSCALL32_IMPL -		} else { -			struct sigevent32 ev32; -			port_notify32_t tim_pnevp32; +		} +	} -			if (copyin(evp, &ev32, sizeof (struct oldsigevent32))) -				return (set_errno(EFAULT)); -			ev.sigev_notify = ev32.sigev_notify; -			ev.sigev_signo = ev32.sigev_signo; +	/* +	 * A suitable slot was not found.  If possible, allocate (or resize) +	 * the p_itimer array and try again. +	 */ +	if (itp == NULL) { +		uint_t target_sz = _TIMER_ALLOC_INIT; +		itimer_t **itp_new; + +		if (p->p_itimer != NULL) { +			ASSERT(p->p_itimer_sz != 0); + +			target_sz = p->p_itimer_sz * 2; +		} +		/* +		 * Protect against exceeding the max or overflow +		 */ +		if (target_sz > timer_max || target_sz > INT_MAX || +		    target_sz < p->p_itimer_sz) { +			kmem_cache_free(clock_timer_cache, it); +			return (NULL); +		} +		mutex_exit(&p->p_lock); +		itp_new = kmem_zalloc(target_sz * sizeof (itimer_t *), +		    KM_SLEEP); +		mutex_enter(&p->p_lock); +		if (target_sz <= p->p_itimer_sz) {  			/* -			 * See comment in sigqueue32() on handling of 32-bit -			 * sigvals in a 64-bit kernel. +			 * A racing thread performed the resize while we were +			 * waiting outside p_lock.  Discard our now-useless +			 * allocation and retry.  			 
*/ -			ev.sigev_value.sival_int = ev32.sigev_value.sival_int; -			if (ev.sigev_notify == SIGEV_PORT || -			    ev.sigev_notify == SIGEV_THREAD) { -				if (copyin((void *)(uintptr_t) -				    ev32.sigev_value.sival_ptr, -				    (void *)&tim_pnevp32, -				    sizeof (port_notify32_t))) -					return (set_errno(EFAULT)); -				tim_pnevp.portnfy_port = -				    tim_pnevp32.portnfy_port; -				tim_pnevp.portnfy_user = -				    (void *)(uintptr_t)tim_pnevp32.portnfy_user; +			kmem_free(itp_new, target_sz * sizeof (itimer_t *)); +			goto retry; +		} else { +			/* +			 * Instantiate the larger allocation and select the +			 * first fresh entry for use. +			 */ +			if (p->p_itimer != NULL) { +				uint_t old_sz; + +				old_sz = p->p_itimer_sz; +				bcopy(p->p_itimer, itp_new, +				    old_sz * sizeof (itimer_t *)); +				kmem_free(p->p_itimer, +				    old_sz * sizeof (itimer_t *)); + +				/* +				 * Short circuit to use the first free entry in +				 * the new allocation.  It's possible that +				 * other lower-indexed timers were freed while +				 * p_lock was dropped, but skipping over them +				 * is not harmful at all.  In the common case, +				 * we skip the need to walk over an array +				 * filled with timers before arriving at the +				 * slot we know is fresh from the allocation. +				 */ +				i = old_sz; +			} else { +				/* +				 * For processes lacking any existing timers, +				 * we can simply select the first entry. +				 */ +				i = 0;  			} -#endif +			p->p_itimer = itp_new; +			p->p_itimer_sz = target_sz;  		} -		switch (ev.sigev_notify) { -		case SIGEV_NONE: -			break; -		case SIGEV_SIGNAL: -			if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG) -				return (set_errno(EINVAL)); -			break; -		case SIGEV_THREAD: -		case SIGEV_PORT: -			break; -		default: -			return (set_errno(EINVAL)); -		} -	} else { -		/* -		 * Use the clock's default sigevent (this is a structure copy). -		 */ -		ev = backend->clk_default;  	} +	ASSERT(i <= INT_MAX); +	*id = (timer_t)i; +	return (it); +} + +/* + * Setup a timer + * + * This allocates an itimer_t (including a timer_t ID and slot in the process), + * wires it up according to the provided sigevent, and associates it with the + * desired clock backend.  Upon successful completion, the timer will be + * locked, preventing it from being armed via timer_settime() or deleted via + * timer_delete().  This gives the caller a chance to perform any last minute + * manipulations (such as configuring the IT_CALLBACK functionality and/or + * copying the timer_t out to userspace) before using timer_release() to unlock + * it or timer_delete_grabbed() to delete it. + */ +int +timer_setup(clock_backend_t *backend, struct sigevent *evp, port_notify_t *pnp, +    itimer_t **itp, timer_t *tidp) +{ +	proc_t *p = curproc; +	int error = 0; +	itimer_t *it; +	sigqueue_t *sigq; +	timer_t tid; +  	/* -	 * We'll allocate our timer and sigqueue now, before we grab p_lock. -	 * If we can't find an empty slot, we'll free them before returning. +	 * We'll allocate our sigqueue now, before we grab p_lock. +	 * If we can't find an empty slot, we'll free it before returning.  	 */ -	it = kmem_cache_alloc(clock_timer_cache, KM_SLEEP); -	bzero(it, sizeof (itimer_t)); -	mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL);  	sigq = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); -	mutex_enter(&p->p_lock); -  	/* -	 * If this is this process' first timer, we need to attempt to allocate -	 * an array of timerstr_t pointers.  
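timer_alloc() above grows the per-process p_itimer array on demand: it drops p_lock around the kmem_zalloc() of the larger array and, after reacquiring the lock, discards its own allocation and retries if another thread resized first. A minimal userland analogue of that grow-and-retry pattern, with a pthread mutex in place of p_lock and an assumed initial size standing in for _TIMER_ALLOC_INIT:

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

#define	TABLE_INIT_SZ	4	/* assumed analogue of _TIMER_ALLOC_INIT */

static pthread_mutex_t tbl_lock = PTHREAD_MUTEX_INITIALIZER;
static void **tbl;
static unsigned tbl_sz;

/* Find a free slot for item, growing the table if needed; -1 on failure. */
static int
slot_alloc(void *item, unsigned max_sz)
{
	unsigned i, new_sz;
	void **new_tbl;

	pthread_mutex_lock(&tbl_lock);
retry:
	for (i = 0; i < tbl_sz; i++) {
		if (tbl[i] == NULL) {
			tbl[i] = item;
			pthread_mutex_unlock(&tbl_lock);
			return ((int)i);
		}
	}

	new_sz = (tbl_sz == 0) ? TABLE_INIT_SZ : tbl_sz * 2;
	if (new_sz > max_sz) {
		pthread_mutex_unlock(&tbl_lock);
		return (-1);
	}

	/* Drop the lock around the allocation, as timer_alloc() drops p_lock. */
	pthread_mutex_unlock(&tbl_lock);
	new_tbl = calloc(new_sz, sizeof (void *));
	pthread_mutex_lock(&tbl_lock);

	if (new_tbl == NULL) {
		pthread_mutex_unlock(&tbl_lock);
		return (-1);
	}
	if (new_sz <= tbl_sz) {
		/* Lost the race: another thread already grew the table. */
		free(new_tbl);
		goto retry;
	}
	if (tbl_sz > 0)
		memcpy(new_tbl, tbl, tbl_sz * sizeof (void *));
	free(tbl);
	tbl = new_tbl;
	tbl_sz = new_sz;
	goto retry;
}

int
main(void)
{
	int a, b;

	return (slot_alloc(&a, 64) == 0 && slot_alloc(&b, 64) == 1 ? 0 : 1);
}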
We drop p_lock to perform the -	 * allocation; if we return to discover that p_itimer is non-NULL, -	 * we will free our allocation and drive on. +	 * Allocate a timer and choose a slot for it. This acquires p_lock.  	 */ -	if ((itp = p->p_itimer) == NULL) { -		mutex_exit(&p->p_lock); -		itp = kmem_zalloc(timer_max * sizeof (itimer_t *), KM_SLEEP); -		mutex_enter(&p->p_lock); - -		if (p->p_itimer == NULL) -			p->p_itimer = itp; -		else { -			kmem_free(itp, timer_max * sizeof (itimer_t *)); -			itp = p->p_itimer; -		} -	} - -	for (i = 0; i < timer_max && itp[i] != NULL; i++) -		continue; +	it = timer_alloc(p, &tid); +	ASSERT(MUTEX_HELD(&p->p_lock)); -	if (i == timer_max) { -		/* -		 * We couldn't find a slot.  Drop p_lock, free the preallocated -		 * timer and sigqueue, and return an error. -		 */ +	if (it == NULL) {  		mutex_exit(&p->p_lock); -		kmem_cache_free(clock_timer_cache, it);  		kmem_free(sigq, sizeof (sigqueue_t)); - -		return (set_errno(EAGAIN)); +		return (EAGAIN);  	} -	ASSERT(i < timer_max && itp[i] == NULL); +	ASSERT(tid < p->p_itimer_sz && p->p_itimer[tid] == NULL); +	ASSERT(evp != NULL);  	/*  	 * If we develop other notification mechanisms, this will need  	 * to call into (yet another) backend.  	 */ -	sigq->sq_info.si_signo = ev.sigev_signo; -	if (evp == NULL) -		sigq->sq_info.si_value.sival_int = i; -	else -		sigq->sq_info.si_value = ev.sigev_value; +	sigq->sq_info.si_signo = evp->sigev_signo; +	sigq->sq_info.si_value = evp->sigev_value;  	sigq->sq_info.si_code = SI_TIMER;  	sigq->sq_info.si_pid = p->p_pid;  	sigq->sq_info.si_ctid = PRCTID(p);  	sigq->sq_info.si_zoneid = getzoneid(); -	sigq->sq_info.si_uid = crgetruid(cr); +	sigq->sq_info.si_uid = crgetruid(CRED());  	sigq->sq_func = timer_signal;  	sigq->sq_next = NULL;  	sigq->sq_backptr = it;  	it->it_sigq = sigq;  	it->it_backend = backend;  	it->it_lock = ITLK_LOCKED; -	itp[i] = it; - -	if (ev.sigev_notify == SIGEV_THREAD || -	    ev.sigev_notify == SIGEV_PORT) { +	if (evp->sigev_notify == SIGEV_THREAD || +	    evp->sigev_notify == SIGEV_PORT) {  		int port; +		port_kevent_t *pkevp = NULL; + +		ASSERT(pnp != NULL);  		/*  		 * This timer is programmed to use event port notification when @@ -638,18 +669,17 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)  		 */  		it->it_flags |= IT_PORT; -		port = tim_pnevp.portnfy_port; +		port = pnp->portnfy_port;  		/* associate timer as event source with the port */  		error = port_associate_ksource(port, PORT_SOURCE_TIMER,  		    (port_source_t **)&it->it_portsrc, timer_close_port,  		    (void *)it, NULL);  		if (error) { -			itp[i] = NULL;		/* clear slot */  			mutex_exit(&p->p_lock);  			kmem_cache_free(clock_timer_cache, it);  			kmem_free(sigq, sizeof (sigqueue_t)); -			return (set_errno(error)); +			return (error);  		}  		/* allocate an event structure/slot */ @@ -658,23 +688,24 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)  		if (error) {  			(void) port_dissociate_ksource(port, PORT_SOURCE_TIMER,  			    (port_source_t *)it->it_portsrc); -			itp[i] = NULL;		/* clear slot */  			mutex_exit(&p->p_lock);  			kmem_cache_free(clock_timer_cache, it);  			kmem_free(sigq, sizeof (sigqueue_t)); -			return (set_errno(error)); +			return (error);  		}  		/* initialize event data */ -		port_init_event(pkevp, i, tim_pnevp.portnfy_user, +		port_init_event(pkevp, tid, pnp->portnfy_user,  		    timer_port_callback, it);  		it->it_portev = pkevp;  		it->it_portfd = port;  	} else { -		if (ev.sigev_notify == SIGEV_SIGNAL) +		if 
(evp->sigev_notify == SIGEV_SIGNAL)  			it->it_flags |= IT_SIGNAL;  	} +	/* Populate the slot now that the timer is prepped. */ +	p->p_itimer[tid] = it;  	mutex_exit(&p->p_lock);  	/* @@ -687,17 +718,8 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)  	it->it_lwp = ttolwp(curthread);  	it->it_proc = p; -	if (copyout(&i, tid, sizeof (timer_t)) != 0) { -		error = EFAULT; -		goto err; -	} - -	/* -	 * If we're here, then we have successfully created the timer; we -	 * just need to release the timer and return. -	 */ -	timer_release(p, it); - +	*itp = it; +	*tidp = tid;  	return (0);  err: @@ -708,11 +730,115 @@ err:  	 * impossible for a removal to be pending.  	 */  	ASSERT(!(it->it_lock & ITLK_REMOVE)); -	timer_delete_grabbed(p, i, it); +	timer_delete_grabbed(p, tid, it); -	return (set_errno(error)); +	return (error);  } + +int +timer_create(clockid_t clock, struct sigevent *evp, timer_t *tidp) +{ +	int error = 0; +	proc_t *p = curproc; +	clock_backend_t *backend; +	struct sigevent ev; +	itimer_t *it; +	timer_t tid; +	port_notify_t tim_pnevp; + +	if ((backend = CLOCK_BACKEND(clock)) == NULL) +		return (set_errno(EINVAL)); + +	if (evp != NULL) { +		/* +		 * short copyin() for binary compatibility +		 * fetch oldsigevent to determine how much to copy in. +		 */ +		if (get_udatamodel() == DATAMODEL_NATIVE) { +			if (copyin(evp, &ev, sizeof (struct oldsigevent))) +				return (set_errno(EFAULT)); + +			if (ev.sigev_notify == SIGEV_PORT || +			    ev.sigev_notify == SIGEV_THREAD) { +				if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp, +				    sizeof (port_notify_t))) +					return (set_errno(EFAULT)); +			} +#ifdef	_SYSCALL32_IMPL +		} else { +			struct sigevent32 ev32; +			port_notify32_t tim_pnevp32; + +			if (copyin(evp, &ev32, sizeof (struct oldsigevent32))) +				return (set_errno(EFAULT)); +			ev.sigev_notify = ev32.sigev_notify; +			ev.sigev_signo = ev32.sigev_signo; +			/* +			 * See comment in sigqueue32() on handling of 32-bit +			 * sigvals in a 64-bit kernel. +			 */ +			ev.sigev_value.sival_int = ev32.sigev_value.sival_int; +			if (ev.sigev_notify == SIGEV_PORT || +			    ev.sigev_notify == SIGEV_THREAD) { +				if (copyin((void *)(uintptr_t) +				    ev32.sigev_value.sival_ptr, +				    (void *)&tim_pnevp32, +				    sizeof (port_notify32_t))) +					return (set_errno(EFAULT)); +				tim_pnevp.portnfy_port = +				    tim_pnevp32.portnfy_port; +				tim_pnevp.portnfy_user = +				    (void *)(uintptr_t)tim_pnevp32.portnfy_user; +			} +#endif +		} +		switch (ev.sigev_notify) { +		case SIGEV_NONE: +			break; +		case SIGEV_SIGNAL: +			if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG) +				return (set_errno(EINVAL)); +			break; +		case SIGEV_THREAD: +		case SIGEV_PORT: +			break; +		default: +			return (set_errno(EINVAL)); +		} +	} else { +		/* +		 * Use the clock's default sigevent (this is a structure copy). +		 */ +		ev = backend->clk_default; +	} + +	if ((error = timer_setup(backend, &ev, &tim_pnevp, &it, &tid)) != 0) { +		return (set_errno(error)); +	} + +	/* +	 * Populate si_value with the timer ID if no sigevent was passed in. +	 */ +	if (evp == NULL) { +		it->it_sigq->sq_info.si_value.sival_int = tid; +	} + +	if (copyout(&tid, tidp, sizeof (timer_t)) != 0) { +		timer_delete_grabbed(p, tid, it); +		return (set_errno(EFAULT)); +	} + +	/* +	 * If we're here, then we have successfully created the timer; we +	 * just need to release the timer and return. 
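timer_setup() hands the new timer back still locked (ITLK_LOCKED), so timer_create() can finish its last-minute work, here the copyout() of the timer ID, before either releasing the timer or deleting it on failure. A hedged sketch of that construct-locked, then publish-or-destroy pattern in plain C; the object and function names are hypothetical, with a pthread mutex playing the role of the timer lock:

#include <pthread.h>
#include <stdlib.h>

/* Hypothetical object that stays "locked" until fully published. */
typedef struct obj {
	pthread_mutex_t o_lock;
	int o_id;
	int o_ready;
} obj_t;

/* Analogue of timer_setup(): allocate and return the object still locked. */
static obj_t *
obj_setup(int id)
{
	obj_t *o = calloc(1, sizeof (*o));

	if (o == NULL)
		return (NULL);
	pthread_mutex_init(&o->o_lock, NULL);
	pthread_mutex_lock(&o->o_lock);	/* caller finishes setup, then unlocks */
	o->o_id = id;
	return (o);
}

/* Analogue of timer_create(): finish setup, then release or tear down. */
static int
obj_create(int id, int fail_publish)
{
	obj_t *o = obj_setup(id);

	if (o == NULL)
		return (-1);
	if (fail_publish) {
		/* e.g. the copyout() to userspace failed: destroy instead */
		pthread_mutex_unlock(&o->o_lock);
		pthread_mutex_destroy(&o->o_lock);
		free(o);
		return (-1);
	}
	o->o_ready = 1;			/* last-minute work while still locked */
	pthread_mutex_unlock(&o->o_lock);	/* analogue of timer_release() */
	return (0);			/* object stays published (leaked here) */
}

int
main(void)
{
	return (obj_create(1, 0) == 0 && obj_create(2, 1) == -1 ? 0 : 1);
}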
+	 */ +	timer_release(p, it); + +	return (0); +} + +  int  timer_gettime(timer_t tid, itimerspec_t *val)  { @@ -832,20 +958,23 @@ timer_getoverrun(timer_t tid)  void  timer_lwpexit(void)  { -	timer_t i; +	uint_t i;  	proc_t *p = curproc;  	klwp_t *lwp = ttolwp(curthread); -	itimer_t *it, **itp; +	itimer_t *it;  	ASSERT(MUTEX_HELD(&p->p_lock)); -	if ((itp = p->p_itimer) == NULL) +	if (p->p_itimer == NULL) {  		return; +	} -	for (i = 0; i < timer_max; i++) { -		if ((it = itp[i]) == NULL) +	for (i = 0; i < p->p_itimer_sz; i++) { +		if ((it = p->p_itimer[i]) == NULL) {  			continue; +		} +		/* This may drop p_lock temporarily. */  		timer_lock(p, it);  		if ((it->it_lock & ITLK_REMOVE) || it->it_lwp != lwp) { @@ -876,20 +1005,22 @@ timer_lwpexit(void)  void  timer_lwpbind()  { -	timer_t i; +	uint_t i;  	proc_t *p = curproc;  	klwp_t *lwp = ttolwp(curthread); -	itimer_t *it, **itp; +	itimer_t *it;  	ASSERT(MUTEX_HELD(&p->p_lock)); -	if ((itp = p->p_itimer) == NULL) +	if (p->p_itimer == NULL) {  		return; +	} -	for (i = 0; i < timer_max; i++) { -		if ((it = itp[i]) == NULL) +	for (i = 0; i < p->p_itimer_sz; i++) { +		if ((it = p->p_itimer[i]) == NULL)  			continue; +		/* This may drop p_lock temporarily. */  		timer_lock(p, it);  		if (!(it->it_lock & ITLK_REMOVE) && it->it_lwp == lwp) { @@ -911,16 +1042,19 @@ timer_lwpbind()  void  timer_exit(void)  { -	timer_t i; +	uint_t i;  	proc_t *p = curproc;  	ASSERT(p->p_itimer != NULL); +	ASSERT(p->p_itimer_sz != 0); -	for (i = 0; i < timer_max; i++) -		(void) timer_delete(i); +	for (i = 0; i < p->p_itimer_sz; i++) { +		(void) timer_delete((timer_t)i); +	} -	kmem_free(p->p_itimer, timer_max * sizeof (itimer_t *)); +	kmem_free(p->p_itimer, p->p_itimer_sz * sizeof (itimer_t *));  	p->p_itimer = NULL; +	p->p_itimer_sz = 0;  }  /* @@ -977,7 +1111,7 @@ timer_close_port(void *arg, int port, pid_t pid, int lastclose)  	for (tid = 0; tid < timer_max; tid++) {  		if ((it = timer_grab(p, tid)) == NULL)  			continue; -		if (it->it_portev) { +		if (it->it_flags & IT_PORT) {  			mutex_enter(&it->it_mutex);  			if (it->it_portfd == port) {  				port_kevent_t *pev; diff --git a/usr/src/uts/common/os/timers.c b/usr/src/uts/common/os/timers.c index 61acc6cf97..53be806026 100644 --- a/usr/src/uts/common/os/timers.c +++ b/usr/src/uts/common/os/timers.c @@ -22,6 +22,7 @@  /*   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2016 Joyent, Inc.   */  /* @@ -1172,6 +1173,14 @@ timespectohz64(timespec_t *tv)  void  hrt2ts(hrtime_t hrt, timestruc_t *tsp)  { +#if defined(__amd64) +	/* +	 * The cleverness explained above is unecessary on x86_64 CPUs where +	 * modern compilers are able to optimize down to faster operations. +	 */ +	tsp->tv_sec = hrt / NANOSEC; +	tsp->tv_nsec = hrt % NANOSEC; +#else  	uint32_t sec, nsec, tmp;  	tmp = (uint32_t)(hrt >> 30); @@ -1193,20 +1202,28 @@ hrt2ts(hrtime_t hrt, timestruc_t *tsp)  	}  	tsp->tv_sec = (time_t)sec;  	tsp->tv_nsec = nsec; +#endif /* defined(__amd64) */  }  /*   * Convert from timestruc_t to hrtime_t. - * - * The code below is equivalent to: - * - *	hrt = tsp->tv_sec * NANOSEC + tsp->tv_nsec; - * - * but requires no integer multiply.   */  hrtime_t  ts2hrt(const timestruc_t *tsp)  { +#if defined(__amd64) || defined(__i386) +	/* +	 * On modern x86 CPUs, the simple version is faster. 
+	 */ +	return ((tsp->tv_sec * NANOSEC) + tsp->tv_nsec); +#else +	/* +	 * The code below is equivalent to: +	 * +	 *	hrt = tsp->tv_sec * NANOSEC + tsp->tv_nsec; +	 * +	 * but requires no integer multiply. +	 */  	hrtime_t hrt;  	hrt = tsp->tv_sec; @@ -1215,6 +1232,7 @@ ts2hrt(const timestruc_t *tsp)  	hrt = (hrt << 7) - hrt - hrt - hrt;  	hrt = (hrt << 9) + tsp->tv_nsec;  	return (hrt); +#endif /* defined(__amd64) || defined(__i386) */  }  /* @@ -1246,6 +1264,13 @@ tv2hrt(struct timeval *tvp)  void  hrt2tv(hrtime_t hrt, struct timeval *tvp)  { +#if defined(__amd64) +	/* +	 * Like hrt2ts, the simple version is faster on x86_64. +	 */ +	tvp->tv_sec = hrt / NANOSEC; +	tvp->tv_usec = (hrt % NANOSEC) / (NANOSEC / MICROSEC); +#else  	uint32_t sec, nsec, tmp;  	uint32_t q, r, t; @@ -1267,17 +1292,17 @@ hrt2tv(hrtime_t hrt, struct timeval *tvp)  		sec++;  	}  	tvp->tv_sec = (time_t)sec; -/* - * this routine is very similar to hr2ts, but requires microseconds - * instead of nanoseconds, so an interger divide by 1000 routine - * completes the conversion - */ +	/* +	 * this routine is very similar to hr2ts, but requires microseconds +	 * instead of nanoseconds, so an interger divide by 1000 routine +	 * completes the conversion +	 */  	t = (nsec >> 7) + (nsec >> 8) + (nsec >> 12);  	q = (nsec >> 1) + t + (nsec >> 15) + (t >> 11) + (t >> 14);  	q = q >> 9;  	r = nsec - q*1000;  	tvp->tv_usec = q + ((r + 24) >> 10); - +#endif /* defined(__amd64) */  }  int diff --git a/usr/src/uts/common/os/vm_pageout.c b/usr/src/uts/common/os/vm_pageout.c index 608208bbca..f5ee76a2cb 100644 --- a/usr/src/uts/common/os/vm_pageout.c +++ b/usr/src/uts/common/os/vm_pageout.c @@ -21,6 +21,7 @@  /*   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2018 Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -58,6 +59,7 @@  #include <sys/tnf_probe.h>  #include <sys/mem_cage.h>  #include <sys/time.h> +#include <sys/zone.h>  #include <vm/hat.h>  #include <vm/as.h> @@ -73,7 +75,7 @@ static int checkpage(page_t *, int);   * algorithm.  They are initialized to 0, and then computed at boot time   * based on the size of the system.  If they are patched non-zero in   * a loaded vmunix they are left alone and may thus be changed per system - * using adb on the loaded system. + * using mdb on the loaded system.   */  pgcnt_t		slowscan = 0;  pgcnt_t		fastscan = 0; @@ -81,6 +83,7 @@ pgcnt_t		fastscan = 0;  static pgcnt_t	handspreadpages = 0;  static int	loopfraction = 2;  static pgcnt_t	looppages; +/* See comment below describing 4% and 80% */  static int	min_percent_cpu = 4;  static int	max_percent_cpu = 80;  static pgcnt_t	maxfastscan = 0; @@ -98,14 +101,34 @@ pgcnt_t	deficit;  pgcnt_t	nscan;  pgcnt_t	desscan; +/* kstats */ +uint64_t low_mem_scan; +uint64_t zone_cap_scan; +uint64_t n_throttle; + +clock_t	zone_pageout_ticks;	/* tunable to change zone pagescan ticks */ +  /*   * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks   * are the number of ticks in each wakeup cycle that gives the   * equivalent of some underlying %CPU duty cycle. - * When RATETOSCHEDPAGING is 4,  and hz is 100, pageout_scanner is - * awakened every 25 clock ticks.  So, converting from %CPU to ticks - * per wakeup cycle would be x% of 25, that is (x * 100) / 25. - * So, for example, 4% == 1 tick and 80% == 20 ticks. 
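On amd64 the patch replaces the shift-and-add conversions in hrt2ts(), ts2hrt() and hrt2tv() with plain division and modulo by NANOSEC. A standalone sketch of those simple forms, using userland struct timespec/struct timeval and locally defined constants, with a round-trip check:

#include <stdio.h>
#include <time.h>
#include <sys/time.h>

#define	NANOSEC		1000000000LL
#define	MICROSEC	1000000LL

/* hrt2ts() analogue: split nanoseconds into seconds and nanoseconds. */
static void
my_hrt2ts(long long hrt, struct timespec *tsp)
{
	tsp->tv_sec = (time_t)(hrt / NANOSEC);
	tsp->tv_nsec = (long)(hrt % NANOSEC);
}

/* ts2hrt() analogue: the straightforward multiply-and-add form. */
static long long
my_ts2hrt(const struct timespec *tsp)
{
	return ((long long)tsp->tv_sec * NANOSEC + tsp->tv_nsec);
}

/* hrt2tv() analogue: as above, but the fraction is in microseconds. */
static void
my_hrt2tv(long long hrt, struct timeval *tvp)
{
	tvp->tv_sec = (time_t)(hrt / NANOSEC);
	tvp->tv_usec = (suseconds_t)((hrt % NANOSEC) / (NANOSEC / MICROSEC));
}

int
main(void)
{
	long long hrt = 3LL * NANOSEC + 250000000LL;	/* 3.25 seconds */
	struct timespec ts;
	struct timeval tv;

	my_hrt2ts(hrt, &ts);
	my_hrt2tv(hrt, &tv);
	printf("%lld ns -> %lld s + %ld ns; round trip %lld ns; %ld us\n",
	    hrt, (long long)ts.tv_sec, ts.tv_nsec, my_ts2hrt(&ts),
	    (long)tv.tv_usec);
	return (0);
}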
+ * + * For example, when RATETOSCHEDPAGING is 4 (the default), then schedpaging() + * will run 4 times/sec to update pageout scanning parameters and kickoff + * the pageout_scanner() thread if necessary. + * + * Given hz is 100, min_pageout_ticks will be set to 1 (1% of a CPU). When + * pageout_ticks is set to min_pageout_ticks, then the total CPU time consumed + * by the scanner in a 1 second interval is 4% of a CPU (RATETOSCHEDPAGING * 1). + * + * Given hz is 100, max_pageout_ticks will be set to 20 (20% of a CPU). When + * pageout_ticks is set to max_pageout_ticks, then the total CPU time consumed + * by the scanner in a 1 second interval is 80% of a CPU + * (RATETOSCHEDPAGING * 20). There is no point making max_pageout_ticks >25 + * since schedpaging() runs RATETOSCHEDPAGING (4) times/sec. + * + * If hz is 1000, then min_pageout_ticks will be 10 and max_pageout_ticks + * will be 200, so the CPU percentages are the same as when hz is 100.   *   * min_pageout_ticks:   *     ticks/wakeup equivalent of min_percent_cpu. @@ -117,19 +140,29 @@ pgcnt_t	desscan;   *     Number of clock ticks budgeted for each wakeup cycle.   *     Computed each time around by schedpaging().   *     Varies between min_pageout_ticks .. max_pageout_ticks, - *     depending on memory pressure. - * - * pageout_lbolt: - *     Timestamp of the last time pageout_scanner woke up and started - *     (or resumed) scanning for not recently referenced pages. + *     depending on memory pressure or zones over their cap.   */  static clock_t	min_pageout_ticks;  static clock_t	max_pageout_ticks;  static clock_t	pageout_ticks; -static clock_t	pageout_lbolt; -static uint_t	reset_hands; +#define	MAX_PSCAN_THREADS	16 +static boolean_t reset_hands[MAX_PSCAN_THREADS]; + +/* + * These can be tuned in /etc/system or set with mdb. + * 'des_page_scanners' is the desired number of page scanner threads. The + * system will bring the actual number of threads into line with the desired + * number. If des_page_scanners is set to an invalid value, the system will + * correct the setting. + */ +uint_t des_page_scanners; +uint_t pageout_reset_cnt = 64;	/* num. cycles for pageout_scanner hand reset */ + +uint_t n_page_scanners; +static pgcnt_t	pscan_region_sz; /* informational only */ +  #define	PAGES_POLL_MASK	1023 @@ -145,33 +178,37 @@ static uint_t	reset_hands;   * pageout_sample_pages:   *     The accumulated number of pages scanned during sampling.   * - * pageout_sample_ticks: - *     The accumulated clock ticks for the sample. + * pageout_sample_etime: + *     The accumulated number of nanoseconds for the sample.   *   * pageout_rate: - *     Rate in pages/nanosecond, computed at the end of sampling. + *     Rate in pages/second, computed at the end of sampling.   *   * pageout_new_spread: - *     The new value to use for fastscan and handspreadpages. - *     Calculated after enough samples have been taken. + *     The new value to use for maxfastscan and (perhaps) handspreadpages. + *     Intended to be the number pages that can be scanned per sec using ~10% + *     of a CPU. Calculated after enough samples have been taken. 
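The comment above derives min_pageout_ticks and max_pageout_ticks from hz, the CPU-percentage tunables and RATETOSCHEDPAGING. A small arithmetic sketch of that derivation (constants copied from the text, helper name illustrative) reproduces the 1 and 20 tick figures for hz=100 and 10 and 200 for hz=1000:

#include <stdio.h>

#define	RATETOSCHEDPAGING	4	/* schedpaging() wakeups per second */
#define	MAX(a, b)		((a) > (b) ? (a) : (b))

/* Ticks-per-wakeup budget equivalent to pct% of one CPU. */
static long
pct_to_ticks(int hz, int pct)
{
	return (MAX(1, ((hz * pct) / 100) / RATETOSCHEDPAGING));
}

int
main(void)
{
	int hzvals[] = { 100, 1000 };

	for (int i = 0; i < 2; i++) {
		int hz = hzvals[i];
		printf("hz=%d: min(4%%)=%ld ticks/wakeup, max(80%%)=%ld\n",
		    hz, pct_to_ticks(hz, 4), pct_to_ticks(hz, 80));
	}
	return (0);
}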
+ *     pageout_rate / 10   */  typedef hrtime_t hrrate_t; -static uint64_t	pageout_sample_lim = 4; -static uint64_t	pageout_sample_cnt = 0; +static uint_t	pageout_sample_lim = 4; +static uint_t	pageout_sample_cnt = 0;  static pgcnt_t	pageout_sample_pages = 0;  static hrrate_t	pageout_rate = 0;  static pgcnt_t	pageout_new_spread = 0; -static clock_t	pageout_cycle_ticks; -static hrtime_t	sample_start, sample_end;  static hrtime_t	pageout_sample_etime = 0; +/* True if page scanner is first starting up */ +#define	PAGE_SCAN_STARTUP	(pageout_sample_cnt < pageout_sample_lim) +  /*   * Record number of times a pageout_scanner wakeup cycle finished because it   * timed out (exceeded its CPU budget), rather than because it visited - * its budgeted number of pages. + * its budgeted number of pages. This is only done when scanning under low + * free memory conditions, not when scanning for zones over their cap.   */  uint64_t pageout_timeouts = 0; @@ -194,25 +231,35 @@ kcondvar_t	memavail_cv;  #define	LOOPPAGES	total_pages  /* - * Set up the paging constants for the clock algorithm. - * Called after the system is initialized and the amount of memory - * and number of paging devices is known. + * Local boolean to control scanning when zones are over their cap. Avoids + * accessing the zone_num_over_cap variable except within schedpaging(), which + * only runs periodically. This is here only to reduce our access to + * zone_num_over_cap, since it is already accessed a lot during paging, and + * the page scanner accesses the zones_over variable on each page during a + * scan. There is no lock needed for zone_num_over_cap since schedpaging() + * doesn't modify the variable, it only cares if the variable is 0 or non-0. + */ +static boolean_t zones_over = B_FALSE; + +/* + * Set up the paging constants for the page scanner clock-hand algorithm. + * Called at startup after the system is initialized and the amount of memory + * and number of paging devices is known (recalc will be 0). Called again once + * PAGE_SCAN_STARTUP is true after the scanner has collected enough samples + * (recalc will be 1). + * + * Will also be called after a memory dynamic reconfiguration operation and + * recalc will be 1 in those cases too.   * - * lotsfree is 1/64 of memory, but at least 512K. + * lotsfree is 1/64 of memory, but at least 512K (ha!).   * desfree is 1/2 of lotsfree.   * minfree is 1/2 of desfree. - * - * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set: - * - *	lotsfree = btop(512K) - *	desfree = btop(200K) - *	minfree = btop(100K) - *	throttlefree = INT_MIN - *	max_percent_cpu = 4   */  void  setupclock(int recalc)  { +	uint_t i; +	pgcnt_t sz, tmp;  	static spgcnt_t init_lfree, init_dfree, init_mfree;  	static spgcnt_t init_tfree, init_preserve, init_mpgio; @@ -221,8 +268,8 @@ setupclock(int recalc)  	looppages = LOOPPAGES;  	/* -	 * setupclock can now be called to recalculate the paging -	 * parameters in the case of dynamic addition of memory. +	 * setupclock can be called to recalculate the paging +	 * parameters in the case of dynamic reconfiguration of memory.  	 * So to make sure we make the proper calculations, if such a  	 * situation should arise, we save away the initial values  	 * of each parameter so we can recall them when needed. This @@ -311,105 +358,98 @@ setupclock(int recalc)  		maxpgio = init_mpgio;  	/* -	 * The clock scan rate varies between fastscan and slowscan -	 * based on the amount of free memory available.  
Fastscan -	 * rate should be set based on the number pages that can be -	 * scanned per sec using ~10% of processor time.  Since this -	 * value depends on the processor, MMU, Mhz etc., it is -	 * difficult to determine it in a generic manner for all -	 * architectures. +	 * When the system is in a low memory state, the page scan rate varies +	 * between fastscan and slowscan based on the amount of free memory +	 * available. When only zones are over their memory cap, the scan rate +	 * is always fastscan.  	 * -	 * Instead of trying to determine the number of pages scanned -	 * per sec for every processor, fastscan is set to be the smaller -	 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling -	 * time is limited to ~4% of processor time. +	 * The fastscan rate should be set based on the number pages that can +	 * be scanned per sec using ~10% of a CPU. Since this value depends on +	 * the processor, MMU, Ghz etc., it must be determined dynamically.  	 * -	 * Setting fastscan to be 1/2 of memory allows pageout to scan -	 * all of memory in ~2 secs.  This implies that user pages not -	 * accessed within 1 sec (assuming, handspreadpages == fastscan) -	 * can be reclaimed when free memory is very low.  Stealing pages -	 * not accessed within 1 sec seems reasonable and ensures that -	 * active user processes don't thrash. +	 * When the scanner first starts up, fastscan will be set to 0 and +	 * maxfastscan will be set to MAXHANDSPREADPAGES (64MB, in pages). +	 * However, once the scanner has collected enough samples, then fastscan +	 * is set to be the smaller of 1/2 of memory (looppages / loopfraction) +	 * or maxfastscan (which is set from pageout_new_spread). Thus, +	 * MAXHANDSPREADPAGES is irrelevant after the scanner is fully +	 * initialized.  	 * -	 * Smaller values of fastscan result in scanning fewer pages -	 * every second and consequently pageout may not be able to free -	 * sufficient memory to maintain the minimum threshold.  Larger -	 * values of fastscan result in scanning a lot more pages which -	 * could lead to thrashing and higher CPU usage. +	 * pageout_new_spread is calculated when the scanner first starts +	 * running. During this initial sampling period the nscan_limit +	 * is set to the total_pages of system memory. Thus, the scanner could +	 * theoretically scan all of memory in one pass. However, each sample +	 * is also limited by the %CPU budget. This is controlled by +	 * pageout_ticks which is set in schedpaging(). During the sampling +	 * period, pageout_ticks is set to max_pageout_ticks. This tick value +	 * is derived from the max_percent_cpu (80%) described above. On a +	 * system with more than a small amount of memory (~8GB), the scanner's +	 * %CPU will be the limiting factor in calculating pageout_new_spread.  	 * -	 * Fastscan needs to be limited to a maximum value and should not -	 * scale with memory to prevent pageout from consuming too much -	 * time for scanning on slow CPU's and avoid thrashing, as a -	 * result of scanning too many pages, on faster CPU's. -	 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES -	 * (the upper bound for fastscan) based on the average number -	 * of pages that can potentially be scanned in ~1 sec (using ~4% -	 * of the CPU) on some of the following machines that currently -	 * run Solaris 2.x: +	 * At the end of the sampling period, the pageout_rate indicates how +	 * many pages could be scanned per second. The pageout_new_spread is +	 * then set to be 1/10th of that (i.e. approximating 10% of a CPU). 
+	 * Of course, this value could still be more than the physical memory +	 * on the system. If so, fastscan is set to 1/2 of memory, as +	 * mentioned above.  	 * -	 *			average memory scanned in ~1 sec +	 * All of this leads up to the setting of handspreadpages, which is +	 * set to fastscan. This is the distance, in pages, between the front +	 * and back hands during scanning. It will dictate which pages will +	 * be considered "hot" on the backhand and which pages will be "cold" +	 * and reclaimed  	 * -	 *	25 Mhz SS1+:		23 Meg -	 *	LX:			37 Meg -	 *	50 Mhz SC2000:		68 Meg +	 * If the scanner is limited by desscan, then at the highest rate it +	 * will scan up to fastscan/RATETOSCHEDPAGING pages per cycle. If the +	 * scanner is limited by the %CPU, then at the highest rate (20% of a +	 * CPU per cycle) the number of pages scanned could be much less.  	 * -	 *	40 Mhz 486:		26 Meg -	 *	66 Mhz 486:		42 Meg +	 * Thus, if the scanner is limited by desscan, then the handspreadpages +	 * setting means 1sec between the front and back hands, but if the +	 * scanner is limited by %CPU, it could be several seconds between the +	 * two hands.  	 * -	 * When free memory falls just below lotsfree, the scan rate -	 * goes from 0 to slowscan (i.e., pageout starts running).  This +	 * The basic assumption is that at the worst case, stealing pages +	 * not accessed within 1 sec seems reasonable and ensures that active +	 * user processes don't thrash. This is especially true when the system +	 * is in a low memory state. +	 * +	 * There are some additional factors to consider for the case of +	 * scanning when zones are over their cap. In this situation it is +	 * also likely that the machine will have a large physical memory which +	 * will take many seconds to fully scan (due to the %CPU and desscan +	 * limits per cycle). It is probable that there will be few (or 0) +	 * pages attributed to these zones in any single scanning cycle. The +	 * result is that reclaiming enough pages for these zones might take +	 * several additional seconds (this is generally not a problem since +	 * the zone physical cap is just a soft cap). +	 * +	 * This is similar to the typical multi-processor situation in which +	 * pageout is often unable to maintain the minimum paging thresholds +	 * under heavy load due to the fact that user processes running on +	 * other CPU's can be dirtying memory at a much faster pace than +	 * pageout can find pages to free. +	 * +	 * One potential approach to address both of these cases is to enable +	 * more than one CPU to run the page scanner, in such a manner that the +	 * various clock hands don't overlap. However, this also makes it more +	 * difficult to determine the values for fastscan, slowscan and +	 * handspreadpages. This is left as a future enhancement, if necessary. +	 * +	 * When free memory falls just below lotsfree, the scan rate goes from +	 * 0 to slowscan (i.e., the page scanner starts running).  This  	 * transition needs to be smooth and is achieved by ensuring that  	 * pageout scans a small number of pages to satisfy the transient  	 * memory demand.  This is set to not exceed 100 pages/sec (25 per  	 * wakeup) since scanning that many pages has no noticible impact  	 * on system performance.  	 * -	 * In addition to setting fastscan and slowscan, pageout is -	 * limited to using ~4% of the CPU.  
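Per the rewritten comment, pageout_rate is the sampled scan rate in pages per second, pageout_new_spread is roughly a tenth of it (about 10% of one CPU), and fastscan becomes the smaller of half of memory and that spread. A hedged numeric sketch with invented sample figures:

#include <stdio.h>
#include <stdint.h>

#define	NANOSEC	1000000000ULL

int
main(void)
{
	/* Invented sample results: pages visited and elapsed time. */
	uint64_t sample_pages = 2000000;		/* pages scanned */
	uint64_t sample_etime = 500000000;		/* 0.5 s, in ns */
	uint64_t total_pages = 4ULL * 1024 * 1024;	/* 16 GB of 4K pages */
	int loopfraction = 2;

	uint64_t rate = sample_pages * NANOSEC / sample_etime;	/* pages/sec */
	uint64_t spread = rate / 10;		/* ~10% of one CPU */
	uint64_t maxfastscan = spread;
	uint64_t fastscan = total_pages / loopfraction;

	if (fastscan > maxfastscan)
		fastscan = maxfastscan;

	printf("rate=%llu pages/s, spread=%llu, fastscan=%llu pages/s\n",
	    (unsigned long long)rate, (unsigned long long)spread,
	    (unsigned long long)fastscan);
	return (0);
}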
This results in increasing -	 * the time taken to scan all of memory, which in turn means that -	 * user processes have a better opportunity of preventing their -	 * pages from being stolen.  This has a positive effect on -	 * interactive and overall system performance when memory demand -	 * is high. -	 * -	 * Thus, the rate at which pages are scanned for replacement will -	 * vary linearly between slowscan and the number of pages that -	 * can be scanned using ~4% of processor time instead of varying -	 * linearly between slowscan and fastscan. -	 * -	 * Also, the processor time used by pageout will vary from ~1% -	 * at slowscan to ~4% at fastscan instead of varying between -	 * ~1% at slowscan and ~10% at fastscan. -	 * -	 * The values chosen for the various VM parameters (fastscan, -	 * handspreadpages, etc) are not universally true for all machines, -	 * but appear to be a good rule of thumb for the machines we've -	 * tested.  They have the following ranges: -	 * -	 *	cpu speed:	20 to 70 Mhz -	 *	page size:	4K to 8K -	 *	memory size:	16M to 5G -	 *	page scan rate:	4000 - 17400 4K pages per sec -	 * -	 * The values need to be re-examined for machines which don't -	 * fall into the various ranges (e.g., slower or faster CPUs, -	 * smaller or larger pagesizes etc) shown above. -	 * -	 * On an MP machine, pageout is often unable to maintain the -	 * minimum paging thresholds under heavy load.  This is due to -	 * the fact that user processes running on other CPU's can be -	 * dirtying memory at a much faster pace than pageout can find -	 * pages to free.  The memory demands could be met by enabling -	 * more than one CPU to run the clock algorithm in such a manner -	 * that the various clock hands don't overlap.  This also makes -	 * it more difficult to determine the values for fastscan, slowscan -	 * and handspreadpages. -	 * -	 * The swapper is currently used to free up memory when pageout -	 * is unable to meet memory demands by swapping out processes. -	 * In addition to freeing up memory, swapping also reduces the -	 * demand for memory by preventing user processes from running -	 * and thereby consuming memory. +	 * The swapper is currently used to free up memory when pageout is +	 * unable to meet memory demands. It does this by swapping out entire +	 * processes. In addition to freeing up memory, swapping also reduces +	 * the demand for memory because the swapped out processes cannot +	 * run, and thereby consume memory. However, this is a pathological +	 * state and performance will generally be considered unacceptable.  	 */  	if (init_mfscan == 0) {  		if (pageout_new_spread != 0) @@ -419,12 +459,13 @@ setupclock(int recalc)  	} else {  		maxfastscan = init_mfscan;  	} -	if (init_fscan == 0) +	if (init_fscan == 0) {  		fastscan = MIN(looppages / loopfraction, maxfastscan); -	else +	} else {  		fastscan = init_fscan; -	if (fastscan > looppages / loopfraction) -		fastscan = looppages / loopfraction; +		if (fastscan > looppages / loopfraction) +			fastscan = looppages / loopfraction; +	}  	/*  	 * Set slow scan time to 1/10 the fast scan time, but @@ -444,12 +485,10 @@ setupclock(int recalc)  	 * decreases as the scan rate rises. It must be < the amount  	 * of pageable memory.  	 * -	 * Since pageout is limited to ~4% of the CPU, setting handspreadpages -	 * to be "fastscan" results in the front hand being a few secs -	 * (varies based on the processor speed) ahead of the back hand -	 * at fastscan rates.  
This distance can be further reduced, if -	 * necessary, by increasing the processor time used by pageout -	 * to be more than ~4% and preferrably not more than ~10%. +	 * Since pageout is limited to the %CPU per cycle, setting +	 * handspreadpages to be "fastscan" results in the front hand being +	 * a few secs (varies based on the processor speed) ahead of the back +	 * hand at fastscan rates.  	 *  	 * As a result, user processes have a much better chance of  	 * referencing their pages before the back hand examines them. @@ -471,29 +510,78 @@ setupclock(int recalc)  	if (handspreadpages >= looppages)  		handspreadpages = looppages - 1; +	if (recalc == 0) { +		/* +		 * Setup basic values at initialization. +		 */ +		pscan_region_sz = total_pages; +		des_page_scanners = n_page_scanners = 1; +		reset_hands[0] = B_TRUE; +		return; +	} +  	/* -	 * If we have been called to recalculate the parameters, -	 * set a flag to re-evaluate the clock hand pointers. +	 * Recalculating +	 * +	 * We originally set the number of page scanners to 1. Now that we +	 * know what the handspreadpages is for a scanner, figure out how many +	 * scanners we should run. We want to ensure that the regions don't +	 * overlap and that they are not touching. +	 * +	 * A default 64GB region size is used as the initial value to calculate +	 * how many scanner threads we should create on lower memory systems. +	 * The idea is to limit the number of threads to a practical value +	 * (e.g. a 64GB machine really only needs one scanner thread). For very +	 * large memory systems, we limit ourselves to MAX_PSCAN_THREADS +	 * threads. +	 * +	 * The scanner threads themselves are evenly spread out around the +	 * memory "clock" in pageout_scanner when we reset the hands, and each +	 * thread will scan all of memory.  	 */ -	if (recalc) -		reset_hands = 1; +	sz = (btop(64ULL * 0x40000000ULL)); +	if (sz < handspreadpages) { +		/* +		 * 64GB is smaller than the separation between the front +		 * and back hands; use double handspreadpages. +		 */ +		sz = handspreadpages << 1; +	} +	if (sz > total_pages) { +		sz = total_pages; +	} +	/* Record region size for inspection with mdb, otherwise unused */ +	pscan_region_sz = sz; + +	tmp = sz; +	for (i = 1; tmp < total_pages; i++) { +		tmp += sz; +	} + +	if (i > MAX_PSCAN_THREADS) +		i = MAX_PSCAN_THREADS; + +	des_page_scanners = i;  }  /*   * Pageout scheduling.   *   * Schedpaging controls the rate at which the page out daemon runs by - * setting the global variables nscan and desscan RATETOSCHEDPAGING - * times a second.  Nscan records the number of pages pageout has examined - * in its current pass; schedpaging resets this value to zero each time - * it runs.  Desscan records the number of pages pageout should examine - * in its next pass; schedpaging sets this value based on the amount of - * currently available memory. + * setting the global variables pageout_ticks and desscan RATETOSCHEDPAGING + * times a second. The pageout_ticks variable controls the percent of one + * CPU that each page scanner thread should consume (see min_percent_cpu + * and max_percent_cpu descriptions). The desscan variable records the number + * of pages pageout should examine in its next pass; schedpaging sets this + * value based on the amount of currently available memory. In addtition, the + * nscan variable records the number of pages pageout has examined in its + * current pass; schedpaging resets this value to zero each time it runs.   
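The recalculation path above sizes scan regions at 64GB (or twice handspreadpages if that is larger), counts how many such regions cover total_pages, and caps des_page_scanners at MAX_PSCAN_THREADS. A small sketch of that sizing, assuming 4K pages and illustrative names:

#include <stdio.h>
#include <stdint.h>

#define	MAX_PSCAN_THREADS	16
#define	PAGESIZE		4096ULL
#define	btop(bytes)		((bytes) / PAGESIZE)

static unsigned
desired_scanners(uint64_t total_pages, uint64_t handspreadpages)
{
	uint64_t sz = btop(64ULL * 0x40000000ULL);	/* 64GB, in pages */
	uint64_t tmp;
	unsigned i;

	if (sz < handspreadpages)
		sz = handspreadpages << 1;	/* keep regions non-overlapping */
	if (sz > total_pages)
		sz = total_pages;

	for (i = 1, tmp = sz; tmp < total_pages; i++)
		tmp += sz;

	return (i > MAX_PSCAN_THREADS ? MAX_PSCAN_THREADS : i);
}

int
main(void)
{
	/* e.g. 256 GB of 4K pages with a 2 GB hand spread -> 4 threads */
	uint64_t total = btop(256ULL * 0x40000000ULL);
	uint64_t spread = btop(2ULL * 0x40000000ULL);

	printf("%u scanner threads\n", desired_scanners(total, spread));
	return (0);
}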
*/ -#define	RATETOSCHEDPAGING	4		/* hz that is */ +#define	RATETOSCHEDPAGING	4		/* times/second */ -static kmutex_t	pageout_mutex;	/* held while pageout or schedpaging running */ +/* held while pageout_scanner or schedpaging are modifying shared data */ +static kmutex_t	pageout_mutex;  /*   * Pool of available async pageout putpage requests. @@ -506,7 +594,7 @@ static kcondvar_t push_cv;  static int async_list_size = 256;	/* number of async request structs */ -static void pageout_scanner(void); +static void pageout_scanner(void *);  /*   * If a page is being shared more than "po_share" times @@ -535,67 +623,153 @@ schedpaging(void *arg)  	if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))  		kcage_cageout_wakeup(); -	if (mutex_tryenter(&pageout_mutex)) { -		/* pageout() not running */ -		nscan = 0; -		vavail = freemem - deficit; -		if (pageout_new_spread != 0) -			vavail -= needfree; -		if (vavail < 0) -			vavail = 0; -		if (vavail > lotsfree) -			vavail = lotsfree; +	(void) atomic_swap_ulong(&nscan, 0); +	vavail = freemem - deficit; +	if (pageout_new_spread != 0) +		vavail -= needfree; +	if (vavail < 0) +		vavail = 0; +	if (vavail > lotsfree) +		vavail = lotsfree; +	/* +	 * Fix for 1161438 (CRS SPR# 73922).  All variables +	 * in the original calculation for desscan were 32 bit signed +	 * ints.  As freemem approaches 0x0 on a system with 1 Gig or +	 * more of memory, the calculation can overflow.  When this +	 * happens, desscan becomes negative and pageout_scanner() +	 * stops paging out. +	 */ +	if ((needfree) && (pageout_new_spread == 0)) {  		/* -		 * Fix for 1161438 (CRS SPR# 73922).  All variables -		 * in the original calculation for desscan were 32 bit signed -		 * ints.  As freemem approaches 0x0 on a system with 1 Gig or -		 * more of memory, the calculation can overflow.  When this -		 * happens, desscan becomes negative and pageout_scanner() -		 * stops paging out. +		 * If we've not yet collected enough samples to +		 * calculate a spread, kick into high gear anytime +		 * needfree is non-zero. Note that desscan will not be +		 * the limiting factor for systems with larger memory; +		 * the %CPU will limit the scan. That will also be +		 * maxed out below.  		 */ -		if ((needfree) && (pageout_new_spread == 0)) { -			/* -			 * If we've not yet collected enough samples to -			 * calculate a spread, use the old logic of kicking -			 * into high gear anytime needfree is non-zero. -			 */ -			desscan = fastscan / RATETOSCHEDPAGING; -		} else { -			/* -			 * Once we've calculated a spread based on system -			 * memory and usage, just treat needfree as another -			 * form of deficit. -			 */ -			spgcnt_t faststmp, slowstmp, result; +		desscan = fastscan / RATETOSCHEDPAGING; +	} else { +		/* +		 * Once we've calculated a spread based on system +		 * memory and usage, just treat needfree as another +		 * form of deficit. +		 */ +		spgcnt_t faststmp, slowstmp, result; + +		slowstmp = slowscan * vavail; +		faststmp = fastscan * (lotsfree - vavail); +		result = (slowstmp + faststmp) / +		    nz(lotsfree) / RATETOSCHEDPAGING; +		desscan = (pgcnt_t)result; +	} + +	/* +	 * If we've not yet collected enough samples to calculate a +	 * spread, also kick %CPU to the max. 
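schedpaging() interpolates desscan between slowscan and fastscan according to vavail (free memory relative to lotsfree) and divides by RATETOSCHEDPAGING to get a per-wakeup page budget; at vavail == lotsfree this lands on the 25-pages-per-wakeup figure mentioned in the comment. A minimal arithmetic sketch with illustrative values:

#include <stdio.h>

#define	RATETOSCHEDPAGING	4
#define	nz(x)	((x) != 0 ? (x) : 1)	/* guard against divide-by-zero */

/* Interpolate the per-wakeup page budget between slowscan and fastscan. */
static long long
calc_desscan(long long slowscan, long long fastscan, long long lotsfree,
    long long vavail)
{
	long long slowstmp, faststmp;

	if (vavail < 0)
		vavail = 0;
	if (vavail > lotsfree)
		vavail = lotsfree;

	slowstmp = slowscan * vavail;
	faststmp = fastscan * (lotsfree - vavail);
	return ((slowstmp + faststmp) / nz(lotsfree) / RATETOSCHEDPAGING);
}

int
main(void)
{
	long long slowscan = 100, fastscan = 400000, lotsfree = 65536;

	printf("plenty free:  %lld pages/wakeup\n",
	    calc_desscan(slowscan, fastscan, lotsfree, lotsfree));
	printf("nothing free: %lld pages/wakeup\n",
	    calc_desscan(slowscan, fastscan, lotsfree, 0));
	return (0);
}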
+	 */ +	if (pageout_new_spread == 0) { +		pageout_ticks = max_pageout_ticks; +	} else { +		pageout_ticks = min_pageout_ticks + +		    (lotsfree - vavail) * +		    (max_pageout_ticks - min_pageout_ticks) / +		    nz(lotsfree); +	} -			slowstmp = slowscan * vavail; -			faststmp = fastscan * (lotsfree - vavail); -			result = (slowstmp + faststmp) / -			    nz(lotsfree) / RATETOSCHEDPAGING; -			desscan = (pgcnt_t)result; +	if (pageout_new_spread != 0 && des_page_scanners != n_page_scanners) { +		/* +		 * We have finished the pagescan initialization and the desired +		 * number of page scanners has changed, either because +		 * initialization just finished, because of a memory DR, or +		 * because des_page_scanners has been modified on the fly (i.e. +		 * by mdb). If we need more scanners, start them now, otherwise +		 * the excess scanners will terminate on their own when they +		 * reset their hands. +		 */ +		uint_t i; +		uint_t curr_nscan = n_page_scanners; +		pgcnt_t max = total_pages / handspreadpages; + +		if (des_page_scanners > max) +			des_page_scanners = max; + +		if (des_page_scanners > MAX_PSCAN_THREADS) { +			des_page_scanners = MAX_PSCAN_THREADS; +		} else if (des_page_scanners == 0) { +			des_page_scanners = 1;  		} -		pageout_ticks = min_pageout_ticks + (lotsfree - vavail) * -		    (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree); +		/* +		 * Each thread has its own entry in the reset_hands array, so +		 * we don't need any locking in pageout_scanner to check the +		 * thread's reset_hands entry. Thus, we use a pre-allocated +		 * fixed size reset_hands array and upper limit on the number +		 * of pagescan threads. +		 * +		 * The reset_hands entries need to be true before we start new +		 * scanners, but if we're reducing, we don't want a race on the +		 * recalculation for the existing threads, so we set +		 * n_page_scanners first. +		 */ +		n_page_scanners = des_page_scanners; +		for (i = 0; i < MAX_PSCAN_THREADS; i++) { +			reset_hands[i] = B_TRUE; +		} -		if (freemem < lotsfree + needfree || -		    pageout_sample_cnt < pageout_sample_lim) { -			TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, -			    "pageout_cv_signal:freemem %ld", freemem); -			cv_signal(&proc_pageout->p_cv); -		} else { -			/* -			 * There are enough free pages, no need to -			 * kick the scanner thread.  And next time -			 * around, keep more of the `highly shared' -			 * pages. -			 */ -			cv_signal_pageout(); -			if (po_share > MIN_PO_SHARE) { -				po_share >>= 1; +		if (des_page_scanners > curr_nscan) { +			/* Create additional pageout scanner threads. */ +			for (i = curr_nscan; i < des_page_scanners; i++) { +				(void) lwp_kernel_create(proc_pageout, +				    pageout_scanner, (void *)(uintptr_t)i, +				    TS_RUN, curthread->t_pri);  			}  		} +	} + +	zones_over = B_FALSE; + +	if (freemem < lotsfree + needfree || PAGE_SCAN_STARTUP) { +		if (!PAGE_SCAN_STARTUP) +			low_mem_scan++; +		DTRACE_PROBE(schedpage__wake__low); +		WAKE_PAGEOUT_SCANNER(); + +	} else if (zone_num_over_cap > 0) { +		/* One or more zones are over their cap. */ + +		/* No page limit */ +		desscan = total_pages; + +		/* +		 * Increase the scanning CPU% to the max. This implies +		 * 80% of one CPU/sec if the scanner can run each +		 * opportunity. Can also be tuned via setting +		 * zone_pageout_ticks in /etc/system or with mdb. +		 */ +		pageout_ticks = (zone_pageout_ticks != 0) ? 
+		    zone_pageout_ticks : max_pageout_ticks; + +		zones_over = B_TRUE; +		zone_cap_scan++; + +		DTRACE_PROBE(schedpage__wake__zone); +		WAKE_PAGEOUT_SCANNER(); + +	} else { +		/* +		 * There are enough free pages, no need to +		 * kick the scanner thread.  And next time +		 * around, keep more of the `highly shared' +		 * pages. +		 */ +		cv_signal_pageout(); + +		mutex_enter(&pageout_mutex); +		if (po_share > MIN_PO_SHARE) { +			po_share >>= 1; +		}  		mutex_exit(&pageout_mutex);  	} @@ -617,36 +791,46 @@ ulong_t		push_list_size;		/* # of requests on pageout queue */  #define	FRONT	1  #define	BACK	2 -int dopageout = 1;	/* must be non-zero to turn page stealing on */ +int dopageout = 1;	/* /etc/system tunable to disable page reclamation */  /*   * The page out daemon, which runs as process 2.   * - * As long as there are at least lotsfree pages, - * this process is not run.  When the number of free - * pages stays in the range desfree to lotsfree, - * this daemon runs through the pages in the loop - * at a rate determined in schedpaging().  Pageout manages - * two hands on the clock.  The front hand moves through - * memory, clearing the reference bit, - * and stealing pages from procs that are over maxrss. - * The back hand travels a distance behind the front hand, - * freeing the pages that have not been referenced in the time - * since the front hand passed.  If modified, they are pushed to - * swap before being freed. + * Page out occurs when either: + * a) there is less than lotsfree pages, + * b) there are one or more zones over their physical memory cap. + * + * The daemon treats physical memory as a circular array of pages and scans the + * pages using a 'two-handed clock' algorithm. The front hand moves through + * the pages, clearing the reference bit. The back hand travels a distance + * (handspreadpages) behind the front hand, freeing the pages that have not + * been referenced in the time since the front hand passed. If modified, they + * are first written to their backing store before being freed. + * + * In order to make page invalidation more responsive on machines with larger + * memory, multiple pageout_scanner threads may be created. In this case, the + * threads are evenly distributed around the the memory "clock face" so that + * memory can be reclaimed more quickly (that is, there can be large regions in + * which no pages can be reclaimed by a single thread, leading to lag which + * causes undesirable behavior such as htable stealing). + * + * As long as there are at least lotsfree pages, or no zones over their cap, + * then pageout_scanner threads are not run. When pageout_scanner threads are + * running for case (a), all pages are considered for pageout. For case (b), + * only pages belonging to a zone over its cap will be considered for pageout.   * - * There are 2 threads that act on behalf of the pageout process. - * One thread scans pages (pageout_scanner) and frees them up if + * There are multiple threads that act on behalf of the pageout process. + * A set of threads scan pages (pageout_scanner) and frees them up if   * they don't require any VOP_PUTPAGE operation. If a page must be   * written back to its backing store, the request is put on a list   * and the other (pageout) thread is signaled. The pageout thread   * grabs VOP_PUTPAGE requests from the list, and processes them.   * Some filesystems may require resources for the VOP_PUTPAGE   * operations (like memory) and hence can block the pageout - * thread, but the scanner thread can still operate. 
There is still + * thread, but the pageout_scanner threads can still operate. There is still   * no guarantee that memory deadlocks cannot occur.   * - * For now, this thing is in very rough form. + * The pageout_scanner parameters are determined in schedpaging().   */  void  pageout() @@ -684,9 +868,9 @@ pageout()  	pageout_pri = curthread->t_pri; -	/* Create the pageout scanner thread. */ -	(void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN, -	    pageout_pri - 1); +	/* Create the (first) pageout scanner thread. */ +	(void) lwp_kernel_create(proc_pageout, pageout_scanner, (void *) 0, +	    TS_RUN, pageout_pri - 1);  	/*  	 * kick off pageout scheduler. @@ -720,6 +904,7 @@ pageout()  		arg->a_next = NULL;  		mutex_exit(&push_lock); +		DTRACE_PROBE(pageout__push);  		if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,  		    arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {  			pushes++; @@ -740,32 +925,24 @@ pageout()   * Kernel thread that scans pages looking for ones to free   */  static void -pageout_scanner(void) +pageout_scanner(void *a)  {  	struct page *fronthand, *backhand; -	uint_t count; +	uint_t count, iter = 0;  	callb_cpr_t cprinfo; -	pgcnt_t	nscan_limit; +	pgcnt_t	nscan_cnt, nscan_limit;  	pgcnt_t	pcount; +	uint_t inst = (uint_t)(uintptr_t)a; +	hrtime_t sample_start, sample_end; +	clock_t pageout_lbolt; +	kmutex_t pscan_mutex; -	CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan"); -	mutex_enter(&pageout_mutex); +	VERIFY3U(inst, <, MAX_PSCAN_THREADS); -	/* -	 * The restart case does not attempt to point the hands at roughly -	 * the right point on the assumption that after one circuit things -	 * will have settled down - and restarts shouldn't be that often. -	 */ +	mutex_init(&pscan_mutex, NULL, MUTEX_DEFAULT, NULL); -	/* -	 * Set the two clock hands to be separated by a reasonable amount, -	 * but no more than 360 degrees apart. -	 */ -	backhand = page_first(); -	if (handspreadpages >= total_pages) -		fronthand = page_nextn(backhand, total_pages - 1); -	else -		fronthand = page_nextn(backhand, handspreadpages); +	CALLB_CPR_INIT(&cprinfo, &pscan_mutex, callb_generic_cpr, "poscan"); +	mutex_enter(&pscan_mutex);  	min_pageout_ticks = MAX(1,  	    ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING); @@ -776,71 +953,116 @@ loop:  	cv_signal_pageout();  	CALLB_CPR_SAFE_BEGIN(&cprinfo); -	cv_wait(&proc_pageout->p_cv, &pageout_mutex); -	CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex); +	cv_wait(&proc_pageout->p_cv, &pscan_mutex); +	CALLB_CPR_SAFE_END(&cprinfo, &pscan_mutex);  	if (!dopageout)  		goto loop; -	if (reset_hands) { -		reset_hands = 0; +	if (reset_hands[inst]) { +		struct page *first; +		pgcnt_t offset = total_pages / n_page_scanners; -		backhand = page_first(); -		if (handspreadpages >= total_pages) +		reset_hands[inst] = B_FALSE; +		if (inst >= n_page_scanners) { +			/* +			 * The desired number of page scanners has been +			 * reduced and this instance is no longer wanted. +			 * Exit the lwp. +			 */ +			VERIFY3U(inst, !=, 0); +			mutex_exit(&pscan_mutex); +			mutex_enter(&curproc->p_lock); +			lwp_exit(); +		} + +		/* +		 * The reset case repositions the hands at the proper place +		 * on the memory clock face to prevent creep into another +		 * thread's active region or when the number of threads has +		 * changed. +		 * +		 * Set the two clock hands to be separated by a reasonable +		 * amount, but no more than 360 degrees apart. 
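A minimal user-space sketch of the hand placement this comment describes and the hunk just below implements. Pages are modeled as plain indices and page_first()/page_nextn() as modular arithmetic, so the sizes and the output are illustrative assumptions, not kernel values.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Example values only; the kernel derives these at boot. */
	uint64_t total_pages = 1048576;		/* 4 GB of 4K pages */
	uint64_t handspreadpages = 131072;
	uint32_t n_page_scanners = 4;
	uint64_t offset = total_pages / n_page_scanners;

	for (uint32_t inst = 0; inst < n_page_scanners; inst++) {
		/* Spread each instance's backhand evenly around the face. */
		uint64_t backhand = (offset * inst) % total_pages;
		/* Front hand leads by handspreadpages, capped at one lap. */
		uint64_t spread = (handspreadpages >= total_pages) ?
		    total_pages - 1 : handspreadpages;
		uint64_t fronthand = (backhand + spread) % total_pages;

		(void) printf("scanner %u: backhand=%llu fronthand=%llu\n",
		    inst, (unsigned long long)backhand,
		    (unsigned long long)fronthand);
	}
	return (0);
}

With four scanners over 1,048,576 pages the backhands land at 0, 262144, 524288 and 786432, which is the even spacing around the "clock face" that the comment calls for.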
+		 * +		 * If inst == 0, backhand starts at first page, otherwise +		 * it is (inst * offset) around the memory "clock face" so that +		 * we spread out each scanner instance evenly. +		 */ +		first = page_first(); +		backhand = page_nextn(first, offset * inst); +		if (handspreadpages >= total_pages) {  			fronthand = page_nextn(backhand, total_pages - 1); -		else +		} else {  			fronthand = page_nextn(backhand, handspreadpages); +		}  	} +	/* +	 * This CPU kstat is only incremented here and we're obviously on this +	 * CPU, so no lock. +	 */  	CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);  	count = 0; -	TRACE_4(TR_FAC_VM, TR_PAGEOUT_START, -	    "pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld", -	    freemem, lotsfree, nscan, desscan); -  	/* Kernel probe */  	TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */,  	    tnf_ulong, pages_free, freemem, tnf_ulong, pages_needed, needfree);  	pcount = 0; -	if (pageout_sample_cnt < pageout_sample_lim) { +	nscan_cnt = 0; +	if (PAGE_SCAN_STARTUP) {  		nscan_limit = total_pages;  	} else {  		nscan_limit = desscan;  	} + +	DTRACE_PROBE4(pageout__start, pgcnt_t, nscan_limit, uint_t, inst, +	    page_t *, backhand, page_t *, fronthand); +  	pageout_lbolt = ddi_get_lbolt();  	sample_start = gethrtime();  	/*  	 * Scan the appropriate number of pages for a single duty cycle. -	 * However, stop scanning as soon as there is enough free memory. -	 * For a short while, we will be sampling the performance of the -	 * scanner and need to keep running just to get sample data, in -	 * which case we keep going and don't pay attention to whether -	 * or not there is enough free memory. +	 * Only scan while at least one of these is true: +	 * 1) one or more zones is over its cap +	 * 2) there is not enough free memory +	 * 3) during page scan startup when determining sample data  	 */ - -	while (nscan < nscan_limit && (freemem < lotsfree + needfree || -	    pageout_sample_cnt < pageout_sample_lim)) { +	while (nscan_cnt < nscan_limit && +	    (zones_over || +	    freemem < lotsfree + needfree || +	    PAGE_SCAN_STARTUP)) {  		int rvfront, rvback; +		DTRACE_PROBE2(pageout__loop, pgcnt_t, pcount, uint_t, inst); +  		/*  		 * Check to see if we have exceeded our %CPU budget  		 * for this wakeup, but not on every single page visited,  		 * just every once in a while.  		 */  		if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) { +			clock_t pageout_cycle_ticks; +  			pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt;  			if (pageout_cycle_ticks >= pageout_ticks) { -				++pageout_timeouts; +				/* +				 * This is where we normally break out of the +				 * loop when scanning zones or sampling. +				 */ +				if (!zones_over) { +					atomic_inc_64(&pageout_timeouts); +				} +				DTRACE_PROBE1(pageout__timeout, uint_t, inst);  				break;  			}  		}  		/*  		 * If checkpage manages to add a page to the free list, -		 * we give ourselves another couple of trips around the loop. +		 * we give ourselves another couple of trips around memory.  		 */  		if ((rvfront = checkpage(fronthand, FRONT)) == 1)  			count = 0; @@ -850,7 +1072,8 @@ loop:  		++pcount;  		/* -		 * protected by pageout_mutex instead of cpu_stat_lock +		 * This CPU kstat is only incremented here and we're obviously +		 * on this CPU, so no lock.  		 */  		CPU_STATS_ADDQ(CPU, vm, scan, 1); @@ -858,7 +1081,7 @@ loop:  		 * Don't include ineligible pages in the number scanned.  		 
*/  		if (rvfront != -1 || rvback != -1) -			nscan++; +			nscan_cnt++;  		backhand = page_next(backhand); @@ -868,56 +1091,89 @@ loop:  		 */  		if ((fronthand = page_next(fronthand)) == page_first())	{ -			TRACE_2(TR_FAC_VM, TR_PAGEOUT_HAND_WRAP, -			    "pageout_hand_wrap:freemem %ld whichhand %d", -			    freemem, FRONT); +			DTRACE_PROBE1(pageout__wrap__front, uint_t, inst);  			/* -			 * protected by pageout_mutex instead of cpu_stat_lock +			 * Every 64 wraps we reposition our hands within our +			 * region to prevent creep into another thread. +			 */ +			if ((++iter % pageout_reset_cnt) == 0) +				reset_hands[inst] = B_TRUE; + +			/* +			 * This CPU kstat is only incremented here and we're +			 * obviously on this CPU, so no lock.  			 */  			CPU_STATS_ADDQ(CPU, vm, rev, 1); -			if (++count > 1) { + +			/* +			 * If scanning because the system is low on memory, +			 * then when we wraparound memory we want to try to +			 * reclaim more pages. +			 * If scanning only because zones are over their cap, +			 * then wrapping is common and we simply keep going. +			 */ +			if (freemem < lotsfree + needfree && ++count > 1) {  				/* +				 * The system is low on memory.  				 * Extremely unlikely, but it happens. -				 * We went around the loop at least once -				 * and didn't get far enough. +				 * We went around memory at least once +				 * and didn't reclaim enough.  				 * If we are still skipping `highly shared'  				 * pages, skip fewer of them.  Otherwise,  				 * give up till the next clock tick.  				 */ +				mutex_enter(&pageout_mutex);  				if (po_share < MAX_PO_SHARE) {  					po_share <<= 1; +					mutex_exit(&pageout_mutex);  				} else {  					/* -					 * Really a "goto loop", but -					 * if someone is TRACing or -					 * TNF_PROBE_ing, at least -					 * make records to show -					 * where we are. +					 * Really a "goto loop", but if someone +					 * is tracing or TNF_PROBE_ing, hit +					 * those probes first.  					 */ +					mutex_exit(&pageout_mutex);  					break;  				}  			}  		}  	} +	atomic_add_long(&nscan, nscan_cnt); +  	sample_end = gethrtime(); -	TRACE_5(TR_FAC_VM, TR_PAGEOUT_END, -	    "pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u", -	    freemem, lotsfree, nscan, desscan, count); +	DTRACE_PROBE3(pageout__loop__end, pgcnt_t, nscan_cnt, pgcnt_t, pcount, +	    uint_t, inst);  	/* Kernel probe */  	TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */, -	    tnf_ulong, pages_scanned, nscan, tnf_ulong, pages_free, freemem); +	    tnf_ulong, pages_scanned, nscan_cnt, tnf_ulong, pages_free, +	    freemem); -	if (pageout_sample_cnt < pageout_sample_lim) { +	/* +	 * The following two blocks are only relevant when the scanner is +	 * first started up. After the scanner runs for a while, neither of +	 * the conditions will ever be true again. +	 * +	 * The global variables used below are only modified by this thread and +	 * only during initial scanning when there is a single page scanner +	 * thread running. Thus, we don't use any locking. +	 */ +	if (PAGE_SCAN_STARTUP) { +		VERIFY3U(inst, ==, 0);  		pageout_sample_pages += pcount;  		pageout_sample_etime += sample_end - sample_start;  		++pageout_sample_cnt; -	} -	if (pageout_sample_cnt >= pageout_sample_lim && -	    pageout_new_spread == 0) { + +	} else if (pageout_new_spread == 0) { +		uint_t i; + +		/* +		 * We have run enough samples, set the spread. 
+		 */ +		VERIFY3U(inst, ==, 0);  		pageout_rate = (hrrate_t)pageout_sample_pages *  		    (hrrate_t)(NANOSEC) / pageout_sample_etime;  		pageout_new_spread = pageout_rate / 10; @@ -931,9 +1187,8 @@ loop:   * Look at the page at hand.  If it is locked (e.g., for physical i/o),   * system (u., page table) or free, then leave it alone.  Otherwise,   * if we are running the front hand, turn off the page's reference bit. - * If the proc is over maxrss, we take it.  If running the back hand, - * check whether the page has been reclaimed.  If not, free the page, - * pushing it to disk first if necessary. + * If running the back hand, check whether the page has been reclaimed. + * If not, free the page, pushing it to disk first if necessary.   *   * Return values:   *	-1 if the page is not a candidate at all, @@ -947,6 +1202,7 @@ checkpage(struct page *pp, int whichhand)  	int isfs = 0;  	int isexec = 0;  	int pagesync_flag; +	zoneid_t zid = ALL_ZONES;  	/*  	 * Skip pages: @@ -989,6 +1245,21 @@ checkpage(struct page *pp, int whichhand)  		return (-1);  	} +	if (zones_over) { +		ASSERT(pp->p_zoneid == ALL_ZONES || +		    pp->p_zoneid >= 0 && pp->p_zoneid <= MAX_ZONEID); +		if (pp->p_zoneid == ALL_ZONES || +		    zone_pdata[pp->p_zoneid].zpers_over == 0) { +			/* +			 * Cross-zone shared page, or zone not over it's cap. +			 * Leave the page alone. +			 */ +			page_unlock(pp); +			return (-1); +		} +		zid = pp->p_zoneid; +	} +  	/*  	 * Maintain statistics for what we are freeing  	 */ @@ -1016,31 +1287,24 @@ checkpage(struct page *pp, int whichhand)  recheck:  	/* -	 * If page is referenced; make unreferenced but reclaimable. -	 * If this page is not referenced, then it must be reclaimable -	 * and we can add it to the free list. +	 * If page is referenced; fronthand makes unreferenced and reclaimable. +	 * For the backhand, a process referenced the page since the front hand +	 * went by, so it's not a candidate for freeing up.  	 */  	if (ppattr & P_REF) { -		TRACE_2(TR_FAC_VM, TR_PAGEOUT_ISREF, -		    "pageout_isref:pp %p whichhand %d", pp, whichhand); +		DTRACE_PROBE2(pageout__isref, page_t *, pp, int, whichhand);  		if (whichhand == FRONT) { -			/* -			 * Checking of rss or madvise flags needed here... -			 * -			 * If not "well-behaved", fall through into the code -			 * for not referenced. -			 */  			hat_clrref(pp);  		} -		/* -		 * Somebody referenced the page since the front -		 * hand went by, so it's not a candidate for -		 * freeing up. -		 */  		page_unlock(pp);  		return (0);  	} +	/* +	 * This page is not referenced, so it must be reclaimable and we can +	 * add it to the free list. This can be done by either hand. +	 */ +  	VM_STAT_ADD(pageoutvmstats.checkpage[0]);  	/* @@ -1073,8 +1337,9 @@ recheck:  		u_offset_t offset = pp->p_offset;  		/* -		 * XXX - Test for process being swapped out or about to exit? -		 * [Can't get back to process(es) using the page.] +		 * Note: There is no possibility to test for process being +		 * swapped out or about to exit since we can't get back to +		 * process(es) from the page.  		 */  		/* @@ -1092,6 +1357,11 @@ recheck:  			VN_RELE(vp);  			return (0);  		} +		if (isfs) { +			zone_pageout_stat(zid, ZPO_DIRTY); +		} else { +			zone_pageout_stat(zid, ZPO_ANONDIRTY); +		}  		return (1);  	} @@ -1102,8 +1372,7 @@ recheck:  	 * the pagesync but before it was unloaded we catch it  	 * and handle the page properly.  	 
*/ -	TRACE_2(TR_FAC_VM, TR_PAGEOUT_FREE, -	    "pageout_free:pp %p whichhand %d", pp, whichhand); +	DTRACE_PROBE2(pageout__free, page_t *, pp, int, whichhand);  	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);  	ppattr = hat_page_getattr(pp, P_MOD | P_REF);  	if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode)) @@ -1120,8 +1389,10 @@ recheck:  		} else {  			CPU_STATS_ADD_K(vm, fsfree, 1);  		} +		zone_pageout_stat(zid, ZPO_FS);  	} else {  		CPU_STATS_ADD_K(vm, anonfree, 1); +		zone_pageout_stat(zid, ZPO_ANON);  	}  	return (1);		/* freed a page! */ diff --git a/usr/src/uts/common/os/vmem.c b/usr/src/uts/common/os/vmem.c index c177ecfd75..ad35fd7187 100644 --- a/usr/src/uts/common/os/vmem.c +++ b/usr/src/uts/common/os/vmem.c @@ -1627,7 +1627,7 @@ vmem_destroy(vmem_t *vmp)  	leaked = vmem_size(vmp, VMEM_ALLOC);  	if (leaked != 0) -		cmn_err(CE_WARN, "vmem_destroy('%s'): leaked %lu %s", +		cmn_err(CE_WARN, "!vmem_destroy('%s'): leaked %lu %s",  		    vmp->vm_name, leaked, (vmp->vm_cflags & VMC_IDENTIFIER) ?  		    "identifiers" : "bytes"); diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index c759f7e010..1db130797c 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -21,7 +21,7 @@  /*   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015, Joyent Inc. All rights reserved. + * Copyright (c) 2019, Joyent, Inc.   * Copyright (c) 2016 by Delphix. All rights reserved.   * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.   */ @@ -106,14 +106,16 @@   *   removed from the list of active zones.  zone_destroy() returns, and   *   the zone can be recreated.   * - *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor - *   callbacks are executed, and all memory associated with the zone is - *   freed. + *   ZONE_IS_FREE (internal state): All references have been dropped and + *   the zone_t is no longer in the zone_active nor zone_deathrow lists. + *   The zone_t is in the process of being freed.  This state exists + *   only for publishing a sysevent to indicate that the zone by this + *   name can be booted again.   * - *   Threads can wait for the zone to enter a requested state by using - *   zone_status_wait() or zone_status_timedwait() with the desired - *   state passed in as an argument.  Zone state transitions are - *   uni-directional; it is not possible to move back to an earlier state. + *   Threads can wait for the zone to enter a requested state (other than + *   ZONE_IS_FREE) by using zone_status_wait() or zone_status_timedwait() + *   with the desired state passed in as an argument.  Zone state transitions + *   are uni-directional; it is not possible to move back to an earlier state.   *   *   *   Zone-Specific Data: @@ -252,6 +254,8 @@  #include <sys/cpucaps.h>  #include <vm/seg.h>  #include <sys/mac.h> +#include <sys/rt.h> +#include <sys/fx.h>  /*   * This constant specifies the number of seconds that threads waiting for @@ -312,6 +316,7 @@ static id_space_t *zoneid_space;   * 'global_zone'.   */  zone_t zone0; +zone_zfs_io_t zone0_zp_zfs;  zone_t *global_zone = NULL;	/* Set when the global zone is initialized */  /* @@ -327,8 +332,8 @@ static list_t zone_active;  static list_t zone_deathrow;  static kmutex_t zone_deathrow_lock; -/* number of zones is limited by virtual interface limit in IP */ -uint_t maxzones = 8192; +/* This can be dynamically reduced if various subsystems hit internal limits. 
*/ +uint_t maxzones = MAX_ZONES;  /* Event channel to sent zone state change notifications */  evchan_t *zone_event_chan; @@ -350,6 +355,7 @@ const char  *zone_status_table[] = {  	ZONE_EVENT_SHUTTING_DOWN,	/* down */  	ZONE_EVENT_SHUTTING_DOWN,	/* dying */  	ZONE_EVENT_UNINITIALIZED,	/* dead */ +	ZONE_EVENT_FREE,		/* free */  };  /* @@ -372,8 +378,12 @@ static char *zone_ref_subsys_names[] = {  rctl_hndl_t rc_zone_cpu_shares;  rctl_hndl_t rc_zone_locked_mem;  rctl_hndl_t rc_zone_max_swap; +rctl_hndl_t rc_zone_phys_mem;  rctl_hndl_t rc_zone_max_lofi;  rctl_hndl_t rc_zone_cpu_cap; +rctl_hndl_t rc_zone_cpu_baseline; +rctl_hndl_t rc_zone_cpu_burst_time; +rctl_hndl_t rc_zone_zfs_io_pri;  rctl_hndl_t rc_zone_nlwps;  rctl_hndl_t rc_zone_nprocs;  rctl_hndl_t rc_zone_shmmax; @@ -389,6 +399,7 @@ static int zone_remove_datalink(zoneid_t, datalink_id_t);  static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);  static int zone_set_network(zoneid_t, zone_net_data_t *);  static int zone_get_network(zoneid_t, zone_net_data_t *); +static void zone_status_set(zone_t *, zone_status_t);  typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t); @@ -419,8 +430,72 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,   * Version 5 alters the zone_boot system call, and converts its old   *     bootargs parameter to be set by the zone_setattr API instead.   * Version 6 adds the flag argument to zone_create. + * Version 7 adds the requested zoneid to zone_create.   */ -static const int ZONE_SYSCALL_API_VERSION = 6; +static const int ZONE_SYSCALL_API_VERSION = 7; + +/* + * "zone_pdata" is an array indexed by zoneid. It is used to store "persistent" + * data which can be referenced independently of the zone_t structure. This + * data falls into two categories; + *   1) pages and RSS data associated with processes inside a zone + *   2) in-flight ZFS I/O data + * + * Each member of zone_persist_t stores the zone's current page usage, its page + * limit, a flag indicating if the zone is over its physical memory cap and + * various page-related statistics. The zpers_over flag is the interface for + * the page scanner to use when reclaiming pages for zones that are over their + * cap. The zone_persist_t structure also includes a mutex and a reference to a + * zone_zfs_io_t structure used for tracking the zone's ZFS I/O data. + * + * All zone physical memory cap data is stored in this array instead of within + * the zone structure itself. This is because zone structures come and go, but + * paging-related work can be asynchronous to any particular zone. In, + * particular: + * 1) Page scanning to reclaim pages occurs from a kernel thread that is not + *    associated with any zone. + * 2) Freeing segkp pages can occur long after the zone which first + *    instantiated those pages has gone away. + * We want to be able to account for pages/zone without constantly having to + * take extra locks and finding the relevant zone structure, particularly during + * page scanning. + * + * The page scanner can run when "zone_num_over_cap" is non-zero. It can + * do a direct lookup of a zoneid into the "zone_pdata" array to determine + * if that zone is over its cap. + * + * There is no locking for the page scanner to perform these two checks. + * We cannot have the page scanner blocking normal paging activity for + * running processes. 
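A simplified user-space model of the accounting just described: cap crossings are detected with a recheck under a mutex, while the page scanner's query is a plain unlocked read of the global counter and the per-zone flag. The structure, names and sizes below are sketch assumptions that loosely mirror the patch; they are not the kernel implementation.

#include <pthread.h>
#include <stdint.h>
#include <stdbool.h>

#define	N_ZONES	64			/* sketch-sized, not MAX_ZONES */

typedef struct {
	uint32_t pg_cnt;		/* resident pages */
	uint32_t pg_limit;		/* UINT32_MAX means unlimited */
	uint8_t  over;			/* read without locks by the scanner */
} zone_mem_sketch_t;

static zone_mem_sketch_t zmem[N_ZONES];
static uint32_t num_over_cap;
static pthread_mutex_t physcap_lock = PTHREAD_MUTEX_INITIALIZER;

/* Called after pages are added to a zone; cheap unless a cap is crossed. */
static void
mark_over_if_capped(int zid)
{
	zone_mem_sketch_t *z = &zmem[zid];

	if (z->pg_cnt <= z->pg_limit || z->over == 1)
		return;

	pthread_mutex_lock(&physcap_lock);
	/* Recheck under the lock so the transition is counted exactly once. */
	if (z->pg_cnt > z->pg_limit && z->over == 0) {
		z->over = 1;
		num_over_cap++;
	}
	pthread_mutex_unlock(&physcap_lock);
}

/* The scanner's soft-cap test: no locks, stale reads are acceptable. */
static bool
scanner_should_reclaim(int zid)
{
	return (num_over_cap > 0 && zmem[zid].over == 1);
}

int
main(void)
{
	zmem[3].pg_limit = 100;
	zmem[3].pg_cnt = 150;		/* pretend 150 pages are resident */
	mark_over_if_capped(3);
	return (scanner_should_reclaim(3) ? 0 : 1);
}

Because the cap is soft, a momentarily stale read in scanner_should_reclaim() only delays or extends scanning by one pass, which is exactly the tradeoff the comment above is making.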
Because the physical memory cap is a soft cap, it is + * fine for the scanner to simply read the current state of the counter and + * the zone's zpers_over entry in the array. The scanner should never modify + * either of these items. Internally the entries and the counter are managed + * with the "zone_physcap_lock" mutex as we add/remove mappings to pages. We + * take care to ensure that we only take the zone_physcap_lock mutex when a + * zone is transitioning over/under its physical memory cap. + * + * The "zone_incr_capped" and "zone_decr_capped" functions are used to manage + * the "zone_pdata" array and associated counter. + * + * The zone_persist_t structure tracks the zone's physical cap and phyiscal + * usage in terms of pages. These values are currently defined as uint32. Thus, + * the maximum number of pages we can track is a UINT_MAX-1 (4,294,967,295) + * since UINT_MAX means the zone's RSS is unlimited. Assuming a 4k page size, a + * zone's maximum RSS is limited to 17.5 TB and twice that with an 8k page size. + * In the future we may need to expand these counters to 64-bit, but for now + * we're using 32-bit to conserve memory, since this array is statically + * allocated within the kernel based on the maximum number of zones supported. + * + * With respect to the zone_zfs_io_t referenced by the zone_persist_t, under + * a heavy I/O workload, the "zonehash_lock" would become extremely hot if we + * had to continuously find the zone structure associated with an I/O that has + * just completed. To avoid that overhead, we track the I/O data within the + * zone_zfs_io_t instead. We can directly access that data without having to + * lookup the full zone_t structure. + */ +uint_t zone_num_over_cap; +zone_persist_t zone_pdata[MAX_ZONES]; +static kmutex_t zone_physcap_lock;  /*   * Certain filesystems (such as NFS and autofs) need to know which zone @@ -1379,6 +1454,127 @@ static rctl_ops_t zone_cpu_cap_ops = {  /*ARGSUSED*/  static rctl_qty_t +zone_cpu_base_get(rctl_t *rctl, struct proc *p) +{ +	ASSERT(MUTEX_HELD(&p->p_lock)); +	return (cpucaps_zone_get_base(p->p_zone)); +} + +/* + * The zone cpu base is used to set the baseline CPU for the zone + * so we can track when the zone is bursting. + */ +/*ARGSUSED*/ +static int +zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, +    rctl_qty_t nv) +{ +	zone_t *zone = e->rcep_p.zone; + +	ASSERT(MUTEX_HELD(&p->p_lock)); +	ASSERT(e->rcep_t == RCENTITY_ZONE); + +	if (zone == NULL) +		return (0); + +	return (cpucaps_zone_set_base(zone, nv)); +} + +static rctl_ops_t zone_cpu_base_ops = { +	rcop_no_action, +	zone_cpu_base_get, +	zone_cpu_base_set, +	rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t +zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p) +{ +	ASSERT(MUTEX_HELD(&p->p_lock)); +	return (cpucaps_zone_get_burst_time(p->p_zone)); +} + +/* + * The zone cpu burst time is used to set the amount of time CPU(s) can be + * bursting for the zone. + */ +/*ARGSUSED*/ +static int +zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, +    rctl_qty_t nv) +{ +	zone_t *zone = e->rcep_p.zone; + +	ASSERT(MUTEX_HELD(&p->p_lock)); +	ASSERT(e->rcep_t == RCENTITY_ZONE); + +	if (zone == NULL) +		return (0); + +	return (cpucaps_zone_set_burst_time(zone, nv)); +} + +static rctl_ops_t zone_cpu_burst_time_ops = { +	rcop_no_action, +	zone_cpu_burst_time_get, +	zone_cpu_burst_time_set, +	rcop_no_test +}; + +/* + * zone.zfs-io-pri resource control support (IO priority). 
+ */ +/*ARGSUSED*/ +static rctl_qty_t +zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p) +{ +	zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id]; +	rctl_qty_t r = 0; + +	ASSERT(MUTEX_HELD(&p->p_lock)); +	mutex_enter(&zp->zpers_zfs_lock); +	if (zp->zpers_zfsp != NULL) +		r = (rctl_qty_t)zp->zpers_zfsp->zpers_zfs_io_pri; +	mutex_exit(&zp->zpers_zfs_lock); + +	return (r); +} + +/*ARGSUSED*/ +static int +zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, +    rctl_qty_t nv) +{ +	zone_t *zone = e->rcep_p.zone; +	zone_persist_t *zp; + +	ASSERT(MUTEX_HELD(&p->p_lock)); +	ASSERT(e->rcep_t == RCENTITY_ZONE); + +	if (zone == NULL) +		return (0); + +	/* +	 * set priority to the new value. +	 */ +	zp = &zone_pdata[zone->zone_id]; +	mutex_enter(&zp->zpers_zfs_lock); +	if (zp->zpers_zfsp != NULL) +		zp->zpers_zfsp->zpers_zfs_io_pri = (uint16_t)nv; +	mutex_exit(&zp->zpers_zfs_lock); +	return (0); +} + +static rctl_ops_t zone_zfs_io_pri_ops = { +	rcop_no_action, +	zone_zfs_io_pri_get, +	zone_zfs_io_pri_set, +	rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t  zone_lwps_usage(rctl_t *r, proc_t *p)  {  	rctl_qty_t nlwps; @@ -1705,6 +1901,57 @@ static rctl_ops_t zone_max_swap_ops = {  /*ARGSUSED*/  static rctl_qty_t +zone_phys_mem_usage(rctl_t *rctl, struct proc *p) +{ +	rctl_qty_t q; +	zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id]; + +	ASSERT(MUTEX_HELD(&p->p_lock)); +	q = ptob(zp->zpers_pg_cnt); +	return (q); +} + +/*ARGSUSED*/ +static int +zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, +    rctl_qty_t nv) +{ +	zoneid_t zid; +	uint_t pg_val; + +	ASSERT(MUTEX_HELD(&p->p_lock)); +	ASSERT(e->rcep_t == RCENTITY_ZONE); +	if (e->rcep_p.zone == NULL) +		return (0); +	zid = e->rcep_p.zone->zone_id; +	if (nv == UINT64_MAX) { +		pg_val = UINT32_MAX; +	} else { +		uint64_t pages = btop(nv); + +		/* +		 * Return from RCTLOP_SET is always ignored so just clamp an +		 * out-of-range value to our largest "limited" value. 
+		 */ +		if (pages >= UINT32_MAX) { +			pg_val = UINT32_MAX - 1; +		} else { +			pg_val = (uint_t)pages; +		} +	} +	zone_pdata[zid].zpers_pg_limit = pg_val; +	return (0); +} + +static rctl_ops_t zone_phys_mem_ops = { +	rcop_no_action, +	zone_phys_mem_usage, +	zone_phys_mem_set, +	rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t  zone_max_lofi_usage(rctl_t *rctl, struct proc *p)  {  	rctl_qty_t q; @@ -1798,6 +2045,21 @@ zone_lockedmem_kstat_update(kstat_t *ksp, int rw)  }  static int +zone_physmem_kstat_update(kstat_t *ksp, int rw) +{ +	zone_t *zone = ksp->ks_private; +	zone_kstat_t *zk = ksp->ks_data; +	zone_persist_t *zp = &zone_pdata[zone->zone_id]; + +	if (rw == KSTAT_WRITE) +		return (EACCES); + +	zk->zk_usage.value.ui64 = ptob(zp->zpers_pg_cnt); +	zk->zk_value.value.ui64 = ptob(zp->zpers_pg_limit); +	return (0); +} + +static int  zone_nprocs_kstat_update(kstat_t *ksp, int rw)  {  	zone_t *zone = ksp->ks_private; @@ -1826,7 +2088,7 @@ zone_swapresv_kstat_update(kstat_t *ksp, int rw)  }  static kstat_t * -zone_kstat_create_common(zone_t *zone, char *name, +zone_rctl_kstat_create_common(zone_t *zone, char *name,      int (*updatefunc) (kstat_t *, int))  {  	kstat_t *ksp; @@ -1851,16 +2113,200 @@ zone_kstat_create_common(zone_t *zone, char *name,  	return (ksp);  } +static int +zone_vfs_kstat_update(kstat_t *ksp, int rw) +{ +	zone_t *zone = ksp->ks_private; +	zone_vfs_kstat_t *zvp = ksp->ks_data; +	kstat_io_t *kiop = &zone->zone_vfs_rwstats; + +	if (rw == KSTAT_WRITE) +		return (EACCES); + +	/* +	 * Extract the VFS statistics from the kstat_io_t structure used by +	 * kstat_runq_enter() and related functions.  Since the slow ops +	 * counters are updated directly by the VFS layer, there's no need to +	 * copy those statistics here. +	 * +	 * Note that kstat_runq_enter() and the related functions use +	 * gethrtime_unscaled(), so scale the time here. 
+	 */ +	zvp->zv_nread.value.ui64 = kiop->nread; +	zvp->zv_reads.value.ui64 = kiop->reads; +	zvp->zv_rtime.value.ui64 = kiop->rtime; +	zvp->zv_rcnt.value.ui64 = kiop->rcnt; +	zvp->zv_rlentime.value.ui64 = kiop->rlentime; +	zvp->zv_nwritten.value.ui64 = kiop->nwritten; +	zvp->zv_writes.value.ui64 = kiop->writes; +	zvp->zv_wtime.value.ui64 = kiop->wtime; +	zvp->zv_wcnt.value.ui64 = kiop->wcnt; +	zvp->zv_wlentime.value.ui64 = kiop->wlentime; + +	scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64); +	scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64); +	scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64); +	scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64); + +	return (0); +} + +static kstat_t * +zone_vfs_kstat_create(zone_t *zone) +{ +	kstat_t *ksp; +	zone_vfs_kstat_t *zvp; + +	if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id, +	    zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED, +	    sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t), +	    KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) +		return (NULL); + +	if (zone->zone_id != GLOBAL_ZONEID) +		kstat_zone_add(ksp, GLOBAL_ZONEID); + +	zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP); +	ksp->ks_data_size += strlen(zone->zone_name) + 1; +	ksp->ks_lock = &zone->zone_vfs_lock; +	zone->zone_vfs_stats = zvp; + +	/* The kstat "name" field is not large enough for a full zonename */ +	kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING); +	kstat_named_setstr(&zvp->zv_zonename, zone->zone_name); +	kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64); +	kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64); + +	ksp->ks_update = zone_vfs_kstat_update; +	ksp->ks_private = zone; + +	kstat_install(ksp); +	return (ksp); +} + +static int +zone_zfs_kstat_update(kstat_t *ksp, int rw) +{ +	zone_t *zone = ksp->ks_private; +	zone_zfs_kstat_t *zzp = ksp->ks_data; +	zone_persist_t *zp = &zone_pdata[zone->zone_id]; + +	if (rw == KSTAT_WRITE) +		return (EACCES); + +	mutex_enter(&zp->zpers_zfs_lock); +	if (zp->zpers_zfsp == NULL) { +		zzp->zz_nread.value.ui64 = 0; +		zzp->zz_reads.value.ui64 = 0; +		zzp->zz_rtime.value.ui64 = 0; +		zzp->zz_rlentime.value.ui64 = 0; +		zzp->zz_nwritten.value.ui64 = 0; +		zzp->zz_writes.value.ui64 = 0; +		zzp->zz_waittime.value.ui64 = 0; +	} else { +		kstat_io_t *kiop = &zp->zpers_zfsp->zpers_zfs_rwstats; + +		/* +		 * Extract the ZFS statistics from the kstat_io_t structure +		 * used by kstat_runq_enter() and related functions. 
Since the +		 * I/O throttle counters are updated directly by the ZFS layer, +		 * there's no need to copy those statistics here. +		 * +		 * Note that kstat_runq_enter() and the related functions use +		 * gethrtime_unscaled(), so scale the time here. +		 */ +		zzp->zz_nread.value.ui64 = kiop->nread; +		zzp->zz_reads.value.ui64 = kiop->reads; +		zzp->zz_rtime.value.ui64 = kiop->rtime; +		zzp->zz_rlentime.value.ui64 = kiop->rlentime; +		zzp->zz_nwritten.value.ui64 = kiop->nwritten; +		zzp->zz_writes.value.ui64 = kiop->writes; +		zzp->zz_waittime.value.ui64 = +		    zp->zpers_zfsp->zpers_zfs_rd_waittime; +	} +	mutex_exit(&zp->zpers_zfs_lock); + +	scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64); +	scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64); + +	return (0); +} + +static kstat_t * +zone_zfs_kstat_create(zone_t *zone) +{ +	kstat_t *ksp; +	zone_zfs_kstat_t *zzp; + +	if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id, +	    zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED, +	    sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t), +	    KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) +		return (NULL); + +	if (zone->zone_id != GLOBAL_ZONEID) +		kstat_zone_add(ksp, GLOBAL_ZONEID); + +	zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP); +	ksp->ks_data_size += strlen(zone->zone_name) + 1; +	ksp->ks_lock = &zone->zone_zfs_lock; +	zone->zone_zfs_stats = zzp; + +	/* The kstat "name" field is not large enough for a full zonename */ +	kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING); +	kstat_named_setstr(&zzp->zz_zonename, zone->zone_name); +	kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64); +	kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64); +	kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64); +	kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64); +	kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64); +	kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64); +	kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64); + +	ksp->ks_update = zone_zfs_kstat_update; +	ksp->ks_private = zone; + +	kstat_install(ksp); +	return (ksp); +}  static int  zone_mcap_kstat_update(kstat_t *ksp, int rw)  {  	zone_t *zone = ksp->ks_private;  	zone_mcap_kstat_t *zmp = ksp->ks_data; +	zone_persist_t *zp;  	if (rw == KSTAT_WRITE)  		return (EACCES); +	zp = &zone_pdata[zone->zone_id]; + +	zmp->zm_rss.value.ui64 = ptob(zp->zpers_pg_cnt); +	zmp->zm_phys_cap.value.ui64 = ptob(zp->zpers_pg_limit); +	zmp->zm_swap.value.ui64 = zone->zone_max_swap; +	zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl; +	zmp->zm_nover.value.ui64 = zp->zpers_nover; +#ifndef DEBUG +	zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_out); +#else +	zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_fsdirty + +	    zp->zpers_pg_fs + zp->zpers_pg_anon + zp->zpers_pg_anondirty); +#endif  	zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;  	zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;  	zmp->zm_execpgin.value.ui64 = zone->zone_execpgin; @@ -1893,6 +2339,12 @@ zone_mcap_kstat_create(zone_t *zone)  	/* The kstat "name" field is not large enough for a full zonename */  	kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);  	kstat_named_setstr(&zmp->zm_zonename, zone->zone_name); +	kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64); +	kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64); +	kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64); +	kstat_named_init(&zmp->zm_swap_cap, "swapcap", 
KSTAT_DATA_UINT64); +	kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64); +	kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);  	kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);  	kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);  	kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64); @@ -1942,9 +2394,12 @@ zone_misc_kstat_update(kstat_t *ksp, int rw)  	zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;  	zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc; +	zmp->zm_mfseglim.value.ui32 = zone->zone_mfseglim; +  	zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;  	zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid; +	zmp->zm_init_restarts.value.ui32 = zone->zone_proc_init_restarts;  	zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;  	return (0); @@ -1985,9 +2440,13 @@ zone_misc_kstat_create(zone_t *zone)  	    KSTAT_DATA_UINT32);  	kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);  	kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32); +	kstat_named_init(&zmp->zm_mfseglim, "mapfail_seglim", +	    KSTAT_DATA_UINT32);  	kstat_named_init(&zmp->zm_nested_intp, "nested_interp",  	    KSTAT_DATA_UINT32);  	kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32); +	kstat_named_init(&zmp->zm_init_restarts, "init_restarts", +	    KSTAT_DATA_UINT32);  	kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);  	ksp->ks_update = zone_misc_kstat_update; @@ -2000,13 +2459,25 @@ zone_misc_kstat_create(zone_t *zone)  static void  zone_kstat_create(zone_t *zone)  { -	zone->zone_lockedmem_kstat = zone_kstat_create_common(zone, +	zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone,  	    "lockedmem", zone_lockedmem_kstat_update); -	zone->zone_swapresv_kstat = zone_kstat_create_common(zone, +	zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone,  	    "swapresv", zone_swapresv_kstat_update); -	zone->zone_nprocs_kstat = zone_kstat_create_common(zone, +	zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone, +	    "physicalmem", zone_physmem_kstat_update); +	zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone,  	    "nprocs", zone_nprocs_kstat_update); +	if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) { +		zone->zone_vfs_stats = kmem_zalloc( +		    sizeof (zone_vfs_kstat_t), KM_SLEEP); +	} + +	if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) { +		zone->zone_zfs_stats = kmem_zalloc( +		    sizeof (zone_zfs_kstat_t), KM_SLEEP); +	} +  	if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {  		zone->zone_mcap_stats = kmem_zalloc(  		    sizeof (zone_mcap_kstat_t), KM_SLEEP); @@ -2038,8 +2509,15 @@ zone_kstat_delete(zone_t *zone)  	    sizeof (zone_kstat_t));  	zone_kstat_delete_common(&zone->zone_swapresv_kstat,  	    sizeof (zone_kstat_t)); +	zone_kstat_delete_common(&zone->zone_physmem_kstat, +	    sizeof (zone_kstat_t));  	zone_kstat_delete_common(&zone->zone_nprocs_kstat,  	    sizeof (zone_kstat_t)); + +	zone_kstat_delete_common(&zone->zone_vfs_ksp, +	    sizeof (zone_vfs_kstat_t)); +	zone_kstat_delete_common(&zone->zone_zfs_ksp, +	    sizeof (zone_zfs_kstat_t));  	zone_kstat_delete_common(&zone->zone_mcap_ksp,  	    sizeof (zone_mcap_kstat_t));  	zone_kstat_delete_common(&zone->zone_misc_ksp, @@ -2101,8 +2579,12 @@ zone_zsd_init(void)  	zone0.zone_initname = initname;  	zone0.zone_lockedmem_kstat = NULL;  	zone0.zone_swapresv_kstat = NULL; +	zone0.zone_physmem_kstat = NULL;  	
zone0.zone_nprocs_kstat = NULL; +	zone_pdata[0].zpers_zfsp = &zone0_zp_zfs; +	zone_pdata[0].zpers_zfsp->zpers_zfs_io_pri = 1; +  	list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),  	    offsetof(zone_ref_t, zref_linkage));  	list_create(&zone0.zone_zsd, sizeof (struct zsd_entry), @@ -2209,6 +2691,21 @@ zone_init(void)  	    RCTL_GLOBAL_INFINITE,  	    MAXCAP, MAXCAP, &zone_cpu_cap_ops); +	rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline", +	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | +	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, +	    MAXCAP, MAXCAP, &zone_cpu_base_ops); + +	rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time", +	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | +	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, +	    INT_MAX, INT_MAX, &zone_cpu_burst_time_ops); + +	rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority", +	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | +	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, +	    16384, 16384, &zone_zfs_io_pri_ops); +  	rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,  	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,  	    INT_MAX, INT_MAX, &zone_lwps_ops); @@ -2250,6 +2747,20 @@ zone_init(void)  	rde = rctl_dict_lookup("zone.cpu-shares");  	(void) rctl_val_list_insert(&rde->rcd_default_value, dval); +	/* +	 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach +	 * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority'. +	 */ +	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); +	bzero(dval, sizeof (rctl_val_t)); +	dval->rcv_value = 1; +	dval->rcv_privilege = RCPRIV_PRIVILEGED; +	dval->rcv_flagaction = RCTL_LOCAL_NOACTION; +	dval->rcv_action_recip_pid = -1; + +	rde = rctl_dict_lookup("zone.zfs-io-priority"); +	(void) rctl_val_list_insert(&rde->rcd_default_value, dval); +  	rc_zone_locked_mem = rctl_register("zone.max-locked-memory",  	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |  	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2260,6 +2771,11 @@ zone_init(void)  	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,  	    &zone_max_swap_ops); +	rc_zone_phys_mem = rctl_register("zone.max-physical-memory", +	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | +	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, +	    &zone_phys_mem_ops); +  	rc_zone_max_lofi = rctl_register("zone.max-lofi",  	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |  	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2281,6 +2797,9 @@ zone_init(void)  	zone0.zone_ntasks = 1;  	mutex_exit(&p0.p_lock);  	zone0.zone_restart_init = B_TRUE; +	zone0.zone_reboot_on_init_exit = B_FALSE; +	zone0.zone_restart_init_0 = B_FALSE; +	zone0.zone_init_status = -1;  	zone0.zone_brand = &native_brand;  	rctl_prealloc_destroy(gp);  	/* @@ -2362,6 +2881,8 @@ zone_init(void)  static void  zone_free(zone_t *zone)  { +	zone_dl_t *zdl; +  	ASSERT(zone != global_zone);  	ASSERT(zone->zone_ntasks == 0);  	ASSERT(zone->zone_nlwps == 0); @@ -2377,6 +2898,9 @@ zone_free(zone_t *zone)  	 */  	cpucaps_zone_remove(zone); +	/* Clear physical memory capping data. 
*/ +	bzero(&zone_pdata[zone->zone_id], sizeof (zone_persist_t)); +  	ASSERT(zone->zone_cpucap == NULL);  	/* remove from deathrow list */ @@ -2390,8 +2914,30 @@ zone_free(zone_t *zone)  	list_destroy(&zone->zone_ref_list);  	zone_free_zsd(zone);  	zone_free_datasets(zone); + +	/* +	 * While dlmgmtd should have removed all of these, it could have left +	 * something behind or crashed. In which case it's not safe for us to +	 * assume that the list is empty which list_destroy() will ASSERT. We +	 * clean up for our userland comrades which may have crashed, or worse, +	 * been disabled by SMF. +	 */ +	while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) { +		if (zdl->zdl_net != NULL) +			nvlist_free(zdl->zdl_net); +		kmem_free(zdl, sizeof (zone_dl_t)); +	}  	list_destroy(&zone->zone_dl_list); +	/* +	 * This zone_t can no longer inhibit creation of another zone_t +	 * with the same name or debug ID.  Generate a sysevent so that +	 * userspace tools know it is safe to carry on. +	 */ +	mutex_enter(&zone_status_lock); +	zone_status_set(zone, ZONE_IS_FREE); +	mutex_exit(&zone_status_lock); +  	cpu_uarray_free(zone->zone_ustate);  	if (zone->zone_rootvp != NULL) @@ -2436,11 +2982,17 @@ zone_free(zone_t *zone)  static void  zone_status_set(zone_t *zone, zone_status_t status)  { +	timestruc_t now; +	uint64_t t;  	nvlist_t *nvl = NULL;  	ASSERT(MUTEX_HELD(&zone_status_lock)); -	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE && -	    status >= zone_status_get(zone)); +	ASSERT((status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE || +	    status == ZONE_IS_FREE) && status >= zone_status_get(zone)); + +	/* Current time since Jan 1 1970 but consumers expect NS */ +	gethrestime(&now); +	t = (now.tv_sec * NANOSEC) + now.tv_nsec;  	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||  	    nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) || @@ -2449,12 +3001,14 @@ zone_status_set(zone_t *zone, zone_status_t status)  	    nvlist_add_string(nvl, ZONE_CB_OLDSTATE,  	    zone_status_table[zone->zone_status]) ||  	    nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) || -	    nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) || +	    nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) ||  	    sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,  	    ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {  #ifdef DEBUG  		(void) printf(  		    "Failed to allocate and send zone state change event.\n"); +#else +		/* EMPTY */  #endif  	}  	nvlist_free(nvl); @@ -2474,6 +3028,38 @@ zone_status_get(zone_t *zone)  	return (zone->zone_status);  } +/* + * Publish a zones-related sysevent for purposes other than zone state changes. + * While it is unfortunate that zone_event_chan is associated with + * "com.sun:zones:status" (rather than "com.sun:zones") state changes should be + * the only ones with class "status" and subclass "change". 
+ */ +void +zone_sysevent_publish(zone_t *zone, const char *class, const char *subclass, +    nvlist_t *ev_nvl) +{ +	nvlist_t *nvl = NULL; +	timestruc_t now; +	uint64_t t; + +	gethrestime(&now); +	t = (now.tv_sec * NANOSEC) + now.tv_nsec; + +	if (nvlist_dup(ev_nvl, &nvl, KM_SLEEP) != 0 || +	    nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) != 0 || +	    nvlist_add_uint64(nvl, ZONE_CB_ZONEID, zone->zone_id) != 0 || +	    nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) != 0 || +	    sysevent_evc_publish(zone_event_chan, class, subclass, "sun.com", +	    "kernel", nvl, EVCH_SLEEP) != 0) { +#ifdef DEBUG +		(void) printf("Failed to allocate and send zone misc event.\n"); +#else +		/* EMPTY */ +#endif +	} +	nvlist_free(nvl); +} +  static int  zone_set_bootargs(zone_t *zone, const char *zone_bootargs)  { @@ -2527,9 +3113,14 @@ zone_set_brand(zone_t *zone, const char *brand)  		return (EINVAL);  	} -	/* set up the brand specific data */ +	/* +	 * Set up the brand specific data. +	 * Note that it's possible that the hook has to drop the +	 * zone_status_lock and reaquire it before returning so we can't +	 * assume the lock has been held the entire time. +	 */  	zone->zone_brand = bp; -	ZBROP(zone)->b_init_brand_data(zone); +	ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock);  	mutex_exit(&zone_status_lock);  	return (0); @@ -2602,18 +3193,6 @@ zone_set_initname(zone_t *zone, const char *zone_initname)  }  static int -zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap) -{ -	uint64_t mcap; -	int err = 0; - -	if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0) -		zone->zone_phys_mcap = mcap; - -	return (err); -} - -static int  zone_set_sched_class(zone_t *zone, const char *new_class)  {  	char sched_class[PC_CLNMSZ]; @@ -3020,6 +3599,12 @@ getzoneid(void)  	return (curproc->p_zone->zone_id);  } +zoneid_t +getzonedid(void) +{ +	return (curproc->p_zone->zone_did); +} +  /*   * Internal versions of zone_find_by_*().  These don't zone_hold() or   * check the validity of a zone's state. @@ -3766,6 +4351,17 @@ zone_start_init(void)  	 */  	z->zone_proc_initpid = p->p_pid; +	if (z->zone_setup_app_contract == B_TRUE) { +		/* +		 * Normally a process cannot modify its own contract, but we're +		 * just starting the zone's init process and its contract is +		 * always initialized from the sys_process_tmpl template, so +		 * this is the simplest way to setup init's contract to kill +		 * the process if any other process in the contract exits. +		 */ +		p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT; +	} +  	/*  	 * We maintain zone_boot_err so that we can return the cause of the  	 * failure back to the caller of the zone_boot syscall. @@ -3794,9 +4390,54 @@ zone_start_init(void)  			lwp_exit();  		}  	} else { +		id_t cid = curthread->t_cid; +  		if (zone_status_get(z) == ZONE_IS_BOOTING)  			zone_status_set(z, ZONE_IS_RUNNING);  		mutex_exit(&zone_status_lock); + +		mutex_enter(&class_lock); +		ASSERT(cid < loaded_classes); +		if (strcmp(sclass[cid].cl_name, "FX") == 0 && +		    z->zone_fixed_hipri) { +			/* +			 * If the zone is using FX then by default all +			 * processes start at the lowest priority and stay +			 * there. We provide a mechanism for the zone to +			 * indicate that it should run at "high priority". In +			 * this case we setup init to run at the highest FX +			 * priority (which is one level higher than the +			 * non-fixed scheduling classes can use). 
+			 */ +			pcparms_t pcparms; + +			pcparms.pc_cid = cid; +			((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI; +			((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim = +			    FXMAXUPRI; +			((fxkparms_t *)pcparms.pc_clparms)->fx_cflags = +			    FX_DOUPRILIM | FX_DOUPRI; + +			mutex_enter(&pidlock); +			mutex_enter(&curproc->p_lock); + +			(void) parmsset(&pcparms, curthread); + +			mutex_exit(&curproc->p_lock); +			mutex_exit(&pidlock); +		} else if (strcmp(sclass[cid].cl_name, "RT") == 0) { +			/* +			 * zsched always starts the init lwp at priority +			 * minclsyspri - 1. This priority gets set in t_pri and +			 * is invalid for RT, but RT never uses t_pri. However +			 * t_pri is used by procfs, so we always see processes +			 * within an RT zone with an invalid priority value. +			 * We fix that up now. +			 */ +			curthread->t_pri = RTGPPRIO0; +		} +		mutex_exit(&class_lock); +  		/* cause the process to return to userland. */  		lwp_rtt();  	} @@ -4282,8 +4923,9 @@ parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)  		error = EINVAL;  		name = nvpair_name(nvp); -		if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1) -		    != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { +		if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 && +		    strncmp(name, "project.", sizeof ("project.") - 1) != 0) || +		    nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {  			goto out;  		}  		if ((hndl = rctl_hndl_lookup(name)) == -1) { @@ -4402,7 +5044,7 @@ zone_create(const char *zone_name, const char *zone_root,      caddr_t rctlbuf, size_t rctlbufsz,      caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,      int match, uint32_t doi, const bslabel_t *label, -    int flags) +    int flags, zoneid_t zone_did)  {  	struct zsched_arg zarg;  	nvlist_t *rctls = NULL; @@ -4474,6 +5116,7 @@ zone_create(const char *zone_name, const char *zone_root,  	zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);  	zone->zone_id = zoneid; +	zone->zone_did = zone_did;  	zone->zone_status = ZONE_IS_UNINITIALIZED;  	zone->zone_pool = pool_default;  	zone->zone_pool_mod = gethrtime(); @@ -4481,6 +5124,9 @@ zone_create(const char *zone_name, const char *zone_root,  	zone->zone_ncpus = 0;  	zone->zone_ncpus_online = 0;  	zone->zone_restart_init = B_TRUE; +	zone->zone_reboot_on_init_exit = B_FALSE; +	zone->zone_restart_init_0 = B_FALSE; +	zone->zone_init_status = -1;  	zone->zone_brand = &native_brand;  	zone->zone_initname = NULL;  	mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); @@ -4547,8 +5193,13 @@ zone_create(const char *zone_name, const char *zone_root,  	zone->zone_max_swap_ctl = UINT64_MAX;  	zone->zone_max_lofi = 0;  	zone->zone_max_lofi_ctl = UINT64_MAX; -	zone0.zone_lockedmem_kstat = NULL; -	zone0.zone_swapresv_kstat = NULL; +	zone->zone_lockedmem_kstat = NULL; +	zone->zone_swapresv_kstat = NULL; +	zone->zone_physmem_kstat = NULL; + +	zone_pdata[zoneid].zpers_zfsp = +	    kmem_zalloc(sizeof (zone_zfs_io_t), KM_SLEEP); +	zone_pdata[zoneid].zpers_zfsp->zpers_zfs_io_pri = 1;  	zone->zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP); @@ -4557,6 +5208,13 @@ zone_create(const char *zone_name, const char *zone_root,  	 */  	zone->zone_rctls = NULL; +	/* +	 * Ensure page count is 0 (in case zoneid has wrapped). +	 * Initialize physical memory cap as unlimited. 
+	 */ +	zone_pdata[zoneid].zpers_pg_cnt = 0; +	zone_pdata[zoneid].zpers_pg_limit = UINT32_MAX; +  	if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {  		zone_free(zone);  		return (zone_create_error(error, 0, extended_error)); @@ -4705,8 +5363,8 @@ zone_create(const char *zone_name, const char *zone_root,  	/*  	 * The process, task, and project rctls are probably wrong;  	 * we need an interface to get the default values of all rctls, -	 * and initialize zsched appropriately.  I'm not sure that that -	 * makes much of a difference, though. +	 * and initialize zsched appropriately. However, we allow zoneadmd +	 * to pass down both zone and project rctls for the zone's init.  	 */  	error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);  	if (error != 0) { @@ -4845,6 +5503,7 @@ zone_boot(zoneid_t zoneid)  static int  zone_empty(zone_t *zone)  { +	int cnt = 0;  	int waitstatus;  	/* @@ -4855,7 +5514,16 @@ zone_empty(zone_t *zone)  	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));  	while ((waitstatus = zone_status_timedwait_sig(zone,  	    ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) { -		killall(zone->zone_id); +		boolean_t force = B_FALSE; + +		/* Every 30 seconds, try harder */ +		if (cnt++ >= 30) { +			cmn_err(CE_WARN, "attempt to force kill zone %d\n", +			    zone->zone_id); +			force = B_TRUE; +			cnt = 0; +		} +		killall(zone->zone_id, force);  	}  	/*  	 * return EINTR if we were signaled @@ -5184,6 +5852,7 @@ zone_destroy(zoneid_t zoneid)  	zone_status_t status;  	clock_t wait_time;  	boolean_t log_refcounts; +	zone_persist_t *zp;  	if (secpolicy_zone_config(CRED()) != 0)  		return (set_errno(EPERM)); @@ -5217,6 +5886,12 @@ zone_destroy(zoneid_t zoneid)  	zone_hold(zone);  	mutex_exit(&zonehash_lock); +	zp = &zone_pdata[zoneid]; +	mutex_enter(&zp->zpers_zfs_lock); +	kmem_free(zp->zpers_zfsp, sizeof (zone_zfs_io_t)); +	zp->zpers_zfsp = NULL; +	mutex_exit(&zp->zpers_zfs_lock); +  	/*  	 * wait for zsched to exit  	 */ @@ -5606,14 +6281,6 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)  				error = EFAULT;  		}  		break; -	case ZONE_ATTR_PHYS_MCAP: -		size = sizeof (zone->zone_phys_mcap); -		if (bufsize > size) -			bufsize = size; -		if (buf != NULL && -		    copyout(&zone->zone_phys_mcap, buf, bufsize) != 0) -			error = EFAULT; -		break;  	case ZONE_ATTR_SCHED_CLASS:  		mutex_enter(&class_lock); @@ -5677,6 +6344,23 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)  		}  		kmem_free(zbuf, bufsize);  		break; +	case ZONE_ATTR_DID: +		size = sizeof (zoneid_t); +		if (bufsize > size) +			bufsize = size; + +		if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0) +			error = EFAULT; +		break; +	case ZONE_ATTR_SCHED_FIXEDHI: +		size = sizeof (boolean_t); +		if (bufsize > size) +			bufsize = size; + +		if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf, +		    bufsize) != 0) +			error = EFAULT; +		break;  	default:  		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {  			size = bufsize; @@ -5708,10 +6392,9 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)  		return (set_errno(EPERM));  	/* -	 * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the -	 * global zone. +	 * No attributes can be set on the global zone.  	 
*/ -	if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) { +	if (zoneid == GLOBAL_ZONEID) {  		return (set_errno(EINVAL));  	} @@ -5724,11 +6407,11 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)  	mutex_exit(&zonehash_lock);  	/* -	 * At present most attributes can only be set on non-running, +	 * At present attributes can only be set on non-running,  	 * non-global zones.  	 */  	zone_status = zone_status_get(zone); -	if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) { +	if (zone_status > ZONE_IS_READY) {  		err = EINVAL;  		goto done;  	} @@ -5741,6 +6424,14 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)  		zone->zone_restart_init = B_FALSE;  		err = 0;  		break; +	case ZONE_ATTR_INITRESTART0: +		zone->zone_restart_init_0 = B_TRUE; +		err = 0; +		break; +	case ZONE_ATTR_INITREBOOT: +		zone->zone_reboot_on_init_exit = B_TRUE; +		err = 0; +		break;  	case ZONE_ATTR_BOOTARGS:  		err = zone_set_bootargs(zone, (const char *)buf);  		break; @@ -5753,9 +6444,6 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)  	case ZONE_ATTR_SECFLAGS:  		err = zone_set_secflags(zone, (psecflags_t *)buf);  		break; -	case ZONE_ATTR_PHYS_MCAP: -		err = zone_set_phys_mcap(zone, (const uint64_t *)buf); -		break;  	case ZONE_ATTR_SCHED_CLASS:  		err = zone_set_sched_class(zone, (const char *)buf);  		break; @@ -5783,6 +6471,22 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)  		err = zone_set_network(zoneid, zbuf);  		kmem_free(zbuf, bufsize);  		break; +	case ZONE_ATTR_APP_SVC_CT: +		if (bufsize != sizeof (boolean_t)) { +			err = EINVAL; +		} else { +			zone->zone_setup_app_contract = (boolean_t)buf; +			err = 0; +		} +		break; +	case ZONE_ATTR_SCHED_FIXEDHI: +		if (bufsize != sizeof (boolean_t)) { +			err = EINVAL; +		} else { +			zone->zone_fixed_hipri = (boolean_t)buf; +			err = 0; +		} +		break;  	default:  		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))  			err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize); @@ -6486,6 +7190,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)  			zs.doi = zs32.doi;  			zs.label = (const bslabel_t *)(uintptr_t)zs32.label;  			zs.flags = zs32.flags; +			zs.zoneid = zs32.zoneid;  #else  			panic("get_udatamodel() returned bogus result\n");  #endif @@ -6496,7 +7201,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)  		    (caddr_t)zs.rctlbuf, zs.rctlbufsz,  		    (caddr_t)zs.zfsbuf, zs.zfsbufsz,  		    zs.extended_error, zs.match, zs.doi, -		    zs.label, zs.flags)); +		    zs.label, zs.flags, zs.zoneid));  	case ZONE_BOOT:  		return (zone_boot((zoneid_t)(uintptr_t)arg1));  	case ZONE_DESTROY: @@ -6597,6 +7302,7 @@ zone_ki_call_zoneadmd(struct zarg *zargp)  	bcopy(zone->zone_name, zone_name, zone_namelen);  	zoneid = zone->zone_id;  	uniqid = zone->zone_uniqid; +	arg.status = zone->zone_init_status;  	/*  	 * zoneadmd may be down, but at least we can empty out the zone.  	 * We can ignore the return value of zone_empty() since we're called @@ -6774,7 +7480,7 @@ zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)  	 * zone_ki_call_zoneadmd() will do a more thorough job of this  	 * later.  	 */ -	killall(zone->zone_id); +	killall(zone->zone_id, B_FALSE);  	/*  	 * Now, create the thread to contact zoneadmd and do the rest of the  	 * work.  
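The new boolean attributes handled in the zone_setattr() hunk above (ZONE_ATTR_APP_SVC_CT and ZONE_ATTR_SCHED_FIXEDHI) carry their value in the buffer pointer itself, with the size argument used only as a sanity check. A plausible caller, inferred from that kernel-side handling; the helper name is hypothetical and the attribute constant is assumed to come from the headers accompanying this change.

#include <sys/types.h>
#include <stdint.h>
#include <zone.h>

/*
 * Hypothetical helper: request the fixed high FX priority behavior for a
 * zone. The value rides in the pointer argument, since the kernel case
 * above casts buf straight back to a boolean_t.
 */
static int
set_fixed_hipri(zoneid_t zid, boolean_t on)
{
	return (zone_setattr(zid, ZONE_ATTR_SCHED_FIXEDHI,
	    (void *)(uintptr_t)on, sizeof (boolean_t)));
}

ZONE_ATTR_APP_SVC_CT would be set the same way with its own constant. Per the checks above, such a call can only succeed against a non-global zone that has not progressed past ZONE_IS_READY.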
This thread can't be created in our zone otherwise @@ -6837,16 +7543,15 @@ zone_shutdown_global(void)  }  /* - * Returns true if the named dataset is visible in the current zone. + * Returns true if the named dataset is visible in the specified zone.   * The 'write' parameter is set to 1 if the dataset is also writable.   */  int -zone_dataset_visible(const char *dataset, int *write) +zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write)  {  	static int zfstype = -1;  	zone_dataset_t *zd;  	size_t len; -	zone_t *zone = curproc->p_zone;  	const char *name = NULL;  	vfs_t *vfsp = NULL; @@ -6914,7 +7619,8 @@ zone_dataset_visible(const char *dataset, int *write)  	vfs_list_read_lock();  	vfsp = zone->zone_vfslist;  	do { -		ASSERT(vfsp); +		if (vfsp == NULL) +			break;  		if (vfsp->vfs_fstype == zfstype) {  			name = refstr_value(vfsp->vfs_resource); @@ -6951,6 +7657,18 @@ zone_dataset_visible(const char *dataset, int *write)  }  /* + * Returns true if the named dataset is visible in the current zone. + * The 'write' parameter is set to 1 if the dataset is also writable. + */ +int +zone_dataset_visible(const char *dataset, int *write) +{ +	zone_t *zone = curproc->p_zone; + +	return (zone_dataset_visible_inzone(zone, dataset, write)); +} + +/*   * zone_find_by_any_path() -   *   * kernel-private routine similar to zone_find_by_path(), but which @@ -7052,6 +7770,27 @@ zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)  	zone_t *zone;  	zone_t *thiszone; +	/* +	 * Only the GZ may add a datalink to a zone's list. +	 */ +	if (getzoneid() != GLOBAL_ZONEID) +		return (set_errno(EPERM)); + +	/* +	 * Only a process with the datalink config priv may add a +	 * datalink to a zone's list. +	 */ +	if (secpolicy_dl_config(CRED()) != 0) +		return (set_errno(EPERM)); + +	/* +	 * When links exist in the GZ, they aren't added to the GZ's +	 * zone_dl_list. We must enforce this because link_activate() +	 * depends on zone_check_datalink() returning only NGZs. +	 */ +	if (zoneid == GLOBAL_ZONEID) +		return (set_errno(EINVAL)); +  	if ((thiszone = zone_find_by_id(zoneid)) == NULL)  		return (set_errno(ENXIO)); @@ -7084,6 +7823,26 @@ zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)  	zone_t *zone;  	int err = 0; +	/* +	 * Only the GZ may remove a datalink from a zone's list. +	 */ +	if (getzoneid() != GLOBAL_ZONEID) +		return (set_errno(EPERM)); + +	/* +	 * Only a process with the datalink config priv may remove a +	 * datalink from a zone's list. +	 */ +	if (secpolicy_dl_config(CRED()) != 0) +		return (set_errno(EPERM)); + +	/* +	 * If we can't add a datalink to the GZ's zone_dl_list then we +	 * certainly can't remove them either. +	 */ +	if (zoneid == GLOBAL_ZONEID) +		return (set_errno(EINVAL)); +  	if ((zone = zone_find_by_id(zoneid)) == NULL)  		return (set_errno(EINVAL)); @@ -7101,25 +7860,63 @@ zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)  }  /* - * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned - * the linkid.  Otherwise we just check if the specified zoneidp has been - * assigned the supplied linkid. + * + * This function may be used in two ways: + * + * 1. to get the zoneid of the zone this link is under, or + * + * 2. to verify that the link is under a specific zone. + * + * The first use is achieved by passing a zoneid of ALL_ZONES. The + * function then iterates the datalink list of every zone on the + * system until it finds the linkid. If the linkid is found then the + * function returns 0 and zoneidp is updated. 
Otherwise, ENXIO is
+ * returned and zoneidp is not modified. The use of ALL_ZONES is
+ * limited to callers in the GZ to prevent leaking information to
+ * NGZs. If an NGZ passes ALL_ZONES its query is implicitly changed
+ * to the second type in the list above.
+ *
+ * The second use is achieved by passing a specific zoneid. The GZ can
+ * use this to verify a link is under a particular zone. An NGZ can
+ * use this to verify a link is under itself. But an NGZ cannot use
+ * this to determine if a link is under some other zone as that would
+ * result in information leakage. If the link exists under the zone
+ * then 0 is returned. Otherwise, ENXIO is returned.
  */
 int
 zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
 {
 	zone_t *zone;
+	zoneid_t zoneid = *zoneidp;
+	zoneid_t caller = getzoneid();
 	int err = ENXIO;
-	if (*zoneidp != ALL_ZONES) {
-		if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
-			if (zone_dl_exists(zone, linkid))
+	/*
+	 * Only the GZ may enquire about all zones; an NGZ may only
+	 * enquire about itself.
+	 */
+	if (zoneid == ALL_ZONES && caller != GLOBAL_ZONEID)
+		zoneid = caller;
+
+	if (zoneid != caller && caller != GLOBAL_ZONEID)
+		return (err);
+
+	if (zoneid != ALL_ZONES) {
+		if ((zone = zone_find_by_id(zoneid)) != NULL) {
+			if (zone_dl_exists(zone, linkid)) {
+				/*
+				 * We need to set this in case an NGZ
+				 * passes ALL_ZONES.
+				 */
+				*zoneidp = zoneid;
 				err = 0;
+			}
 			zone_rele(zone);
 		}
 		return (err);
 	}
+	ASSERT(caller == GLOBAL_ZONEID);
 	mutex_enter(&zonehash_lock);
 	for (zone = list_head(&zone_active); zone != NULL;
 	    zone = list_next(&zone_active, zone)) {
@@ -7130,6 +7927,7 @@ zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
 		}
 	}
 	mutex_exit(&zonehash_lock);
+
 	return (err);
 }
@@ -7150,6 +7948,12 @@ zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
 	zone_dl_t *zdl;
 	datalink_id_t *idptr = idarray;
+	/*
+	 * Only the GZ or the owning zone may look at the datalink list.
+	 */
+	if ((getzoneid() != GLOBAL_ZONEID) && (getzoneid() != zoneid))
+		return (set_errno(EPERM));
+
 	if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
 		return (set_errno(EFAULT));
 	if ((zone = zone_find_by_id(zoneid)) == NULL)
@@ -7175,6 +7979,13 @@ zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
 	mutex_exit(&zone->zone_lock);
 	zone_rele(zone);
+	/*
+	 * Prevent returning negative nump values -- we should never
+	 * have this many links anyway.
+	 */
+	if (num > INT_MAX)
+		return (set_errno(EOVERFLOW));
+
 	/* Increased or decreased, caller should be notified. */
 	if (num != dlcount) {
 		if (copyout(&num, nump, sizeof (num)) != 0)
@@ -7388,3 +8199,231 @@ done:
 	else
 		return (0);
 }
+
+static void
+zone_incr_capped(zoneid_t zid)
+{
+	zone_persist_t *zp = &zone_pdata[zid];
+
+	/* See if over (unlimited is UINT32_MAX), or already marked that way. */
+	if (zp->zpers_pg_cnt <= zp->zpers_pg_limit || zp->zpers_over == 1) {
+		return;
+	}
+
+	mutex_enter(&zone_physcap_lock);
+	/* Recheck setting under mutex */
+	if (zp->zpers_pg_cnt > zp->zpers_pg_limit && zp->zpers_over == 0) {
+		zp->zpers_over = 1;
+		zp->zpers_nover++;
+		zone_num_over_cap++;
+		DTRACE_PROBE1(zone__over__pcap, zoneid_t, zid);
+	}
+	mutex_exit(&zone_physcap_lock);
+}
+
+/*
+ * We want some hysteresis when the zone is going under its cap so that we're
+ * not continuously toggling page scanning back and forth by a single page
+ * around the cap.
Using ~1% of the zone's page limit seems to be a good + * quantity. This table shows some various zone memory caps and the number of + * pages (assuming a 4k page size). Given this, we choose to shift the page + * limit by 7 places to get a hysteresis that is slightly less than 1%. + * + *   cap    pages     pages     1% shift7  shift7 + *  128M    32768 0x0008000    327    256 0x00100 + *  512M   131072 0x0020000   1310   1024 0x00400 + *    1G   262144 0x0040000   2621   2048 0x00800 + *    4G  1048576 0x0100000  10485   8192 0x02000 + *    8G  2097152 0x0200000  20971  16384 0x04000 + *   16G  4194304 0x0400000  41943  32768 0x08000 + *   32G  8388608 0x0800000  83886  65536 0x10000 + *   64G 16777216 0x1000000 167772 131072 0x20000 + */ +static void +zone_decr_capped(zoneid_t zid) +{ +	zone_persist_t *zp = &zone_pdata[zid]; +	uint32_t adjusted_limit; + +	/* +	 * See if under, or already marked that way. There is no need to +	 * check for an unlimited cap (zpers_pg_limit == UINT32_MAX) +	 * since we'll never set zpers_over in zone_incr_capped(). +	 */ +	if (zp->zpers_over == 0 || zp->zpers_pg_cnt >= zp->zpers_pg_limit) { +		return; +	} + +	adjusted_limit = zp->zpers_pg_limit - (zp->zpers_pg_limit >> 7); + +	/* Recheck, accounting for our hysteresis. */ +	if (zp->zpers_pg_cnt >= adjusted_limit) { +		return; +	} + +	mutex_enter(&zone_physcap_lock); +	/* Recheck under mutex. */ +	if (zp->zpers_pg_cnt < adjusted_limit && zp->zpers_over == 1) { +		zp->zpers_over = 0; +		ASSERT(zone_num_over_cap > 0); +		zone_num_over_cap--; +		DTRACE_PROBE1(zone__under__pcap, zoneid_t, zid); +	} +	mutex_exit(&zone_physcap_lock); +} + +/* + * For zone_add_page() and zone_rm_page(), access to the page we're touching is + * controlled by our caller's locking. + * On x86 our callers already did: ASSERT(x86_hm_held(pp)) + * On SPARC our callers already did: ASSERT(sfmmu_mlist_held(pp)) + */ +void +zone_add_page(page_t *pp) +{ +	uint_t pcnt; +	zone_persist_t *zp; +	zoneid_t zid; + +	/* Skip pages in segkmem, etc. (KV_KVP, ...) */ +	if (PP_ISKAS(pp)) +		return; + +	ASSERT(!PP_ISFREE(pp)); + +	zid = curzone->zone_id; +	if (pp->p_zoneid == zid) { +		/* Another mapping to this page for this zone, do nothing */ +		return; +	} + +	if (pp->p_szc == 0) { +		pcnt = 1; +	} else { +		/* large page */ +		pcnt = page_get_pagecnt(pp->p_szc); +	} + +	if (pp->p_share == 0) { +		/* First mapping to this page. */ +		pp->p_zoneid = zid; +		zp = &zone_pdata[zid]; +		ASSERT(zp->zpers_pg_cnt + pcnt < UINT32_MAX); +		atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, pcnt); +		zone_incr_capped(zid); +		return; +	} + +	if (pp->p_zoneid != ALL_ZONES) { +		/* +		 * The page is now being shared across a different zone. +		 * Decrement the original zone's usage. +		 */ +		zid = pp->p_zoneid; +		pp->p_zoneid = ALL_ZONES; +		ASSERT(zid >= 0 && zid <= MAX_ZONEID); +		zp = &zone_pdata[zid]; + +		if (zp->zpers_pg_cnt > 0) { +			atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt); +		} +		zone_decr_capped(zid); +	} +} + +void +zone_rm_page(page_t *pp) +{ +	uint_t pcnt; +	zone_persist_t *zp; +	zoneid_t zid; + +	/* Skip pages in segkmem, etc. (KV_KVP, ...) */ +	if (PP_ISKAS(pp)) +		return; + +	zid = pp->p_zoneid; +	if (zid == ALL_ZONES || pp->p_share != 0) +		return; + +	/* This is the last mapping to the page for a zone. 
*/ +	if (pp->p_szc == 0) { +		pcnt = 1; +	} else { +		/* large page */ +		pcnt = (int64_t)page_get_pagecnt(pp->p_szc); +	} + +	ASSERT(zid >= 0 && zid <= MAX_ZONEID); +	zp = &zone_pdata[zid]; +	if (zp->zpers_pg_cnt > 0) { +		atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt); +	} +	zone_decr_capped(zid); +	pp->p_zoneid = ALL_ZONES; +} + +void +zone_pageout_stat(int zid, zone_pageout_op_t op) +{ +	zone_persist_t *zp; + +	if (zid == ALL_ZONES) +		return; + +	ASSERT(zid >= 0 && zid <= MAX_ZONEID); +	zp = &zone_pdata[zid]; + +#ifndef DEBUG +	atomic_add_64(&zp->zpers_pg_out, 1); +#else +	switch (op) { +	case ZPO_DIRTY: +		atomic_add_64(&zp->zpers_pg_fsdirty, 1); +		break; +	case ZPO_FS: +		atomic_add_64(&zp->zpers_pg_fs, 1); +		break; +	case ZPO_ANON: +		atomic_add_64(&zp->zpers_pg_anon, 1); +		break; +	case ZPO_ANONDIRTY: +		atomic_add_64(&zp->zpers_pg_anondirty, 1); +		break; +	default: +		cmn_err(CE_PANIC, "Invalid pageout operator %d", op); +		break; +	} +#endif +} + +/* + * Return the zone's physical memory cap and current free memory (in pages). + */ +void +zone_get_physmem_data(int zid, pgcnt_t *memcap, pgcnt_t *free) +{ +	zone_persist_t *zp; + +	ASSERT(zid >= 0 && zid <= MAX_ZONEID); +	zp = &zone_pdata[zid]; + +	/* +	 * If memory or swap limits are set on the zone, use those, otherwise +	 * use the system values. physmem and freemem are also in pages. +	 */ +	if (zp->zpers_pg_limit == UINT32_MAX) { +		*memcap = physmem; +		*free = freemem; +	} else { +		int64_t freemem; + +		*memcap = (pgcnt_t)zp->zpers_pg_limit; +		freemem = zp->zpers_pg_limit - zp->zpers_pg_cnt; +		if (freemem > 0) { +			*free = (pgcnt_t)freemem; +		} else { +			*free = (pgcnt_t)0; +		} +	} +} | 
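The access-control rules described in the zone_check_datalink() comment above boil down to a choice of which zoneid actually gets searched for a given caller. A minimal user-space sketch of just that decision; GZ_ID, the sample zone numbers, and the -2 rejection value are illustrative stand-ins (for GLOBAL_ZONEID and the ENXIO failure path in the real code), not part of the patch:

#include <stdio.h>

#define	ALL_ZONES	(-1)	/* stand-in value for this sketch */
#define	GZ_ID		0	/* stand-in for GLOBAL_ZONEID */

/*
 * Hypothetical model of the zoneid selection in zone_check_datalink():
 * returns the zoneid whose datalink list would be searched (ALL_ZONES
 * meaning "every zone"), or -2 if the request is rejected outright.
 */
static int
effective_zoneid(int requested, int caller)
{
	if (requested == ALL_ZONES && caller != GZ_ID)
		requested = caller;	/* an NGZ may only ask about itself */
	if (requested != caller && caller != GZ_ID)
		return (-2);		/* reject cross-zone queries from an NGZ */
	return (requested);
}

int
main(void)
{
	printf("GZ asks ALL_ZONES  -> %d\n", effective_zoneid(ALL_ZONES, GZ_ID));
	printf("GZ asks zone 5     -> %d\n", effective_zoneid(5, GZ_ID));
	printf("zone 5 asks ALL    -> %d\n", effective_zoneid(ALL_ZONES, 5));
	printf("zone 5 asks zone 7 -> %d\n", effective_zoneid(7, 5));
	return (0);
}

Only the first case ends up iterating every zone's datalink list; the NGZ cases either collapse to the caller's own zone or are rejected, which is what keeps cross-zone link information from leaking.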
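The shift-by-7 hysteresis in zone_decr_capped() subtracts 1/128, roughly 0.78%, of the page limit before zpers_over is cleared. A small stand-alone sketch of that arithmetic; the helper name and the sample caps are illustrative only:

#include <stdio.h>
#include <stdint.h>

/* Mirrors adjusted_limit in zone_decr_capped(): limit minus ~0.78%. */
static uint32_t
hysteresis_limit(uint32_t pg_limit)
{
	return (pg_limit - (pg_limit >> 7));
}

int
main(void)
{
	/* Page limits for 128M, 1G and 32G caps, assuming 4k pages. */
	uint32_t caps[] = { 0x8000, 0x40000, 0x800000 };
	unsigned int i;

	for (i = 0; i < sizeof (caps) / sizeof (caps[0]); i++) {
		printf("limit %8u pages -> clear zpers_over below %8u\n",
		    (unsigned int)caps[i],
		    (unsigned int)hysteresis_limit(caps[i]));
	}
	return (0);
}

The printed values correspond to the shift7 column of the table above: 32768 - 256, 262144 - 2048 and 8388608 - 65536 pages respectively.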
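Finally, zone_get_physmem_data() falls back to the system-wide totals when a zone has no cap. A hypothetical user-space model of that behaviour, with made-up page counts standing in for the kernel's physmem and freemem globals:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define	PG_UNLIMITED	UINT32_MAX

/* Hypothetical model of zone_get_physmem_data(); all values in pages. */
static void
model_physmem(uint32_t pg_limit, uint32_t pg_cnt, uint64_t sys_physmem,
    uint64_t sys_freemem, uint64_t *memcap, uint64_t *freep)
{
	if (pg_limit == PG_UNLIMITED) {
		/* No cap set: report the system-wide totals. */
		*memcap = sys_physmem;
		*freep = sys_freemem;
	} else {
		int64_t f = (int64_t)pg_limit - pg_cnt;

		/* Cap set: free pages are clamped at zero when over the cap. */
		*memcap = pg_limit;
		*freep = (f > 0) ? (uint64_t)f : 0;
	}
}

int
main(void)
{
	uint64_t memcap, freep;

	/* 512M cap (131072 pages) with 131500 pages charged: over the cap. */
	model_physmem(0x20000, 131500, 4194304, 1048576, &memcap, &freep);
	printf("capped:   memcap=%" PRIu64 " free=%" PRIu64 "\n", memcap, freep);

	/* Uncapped zone: falls back to physmem/freemem. */
	model_physmem(PG_UNLIMITED, 0, 4194304, 1048576, &memcap, &freep);
	printf("uncapped: memcap=%" PRIu64 " free=%" PRIu64 "\n", memcap, freep);
	return (0);
}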
