diff options
Diffstat (limited to 'usr/src/uts/common/os')
26 files changed, 1582 insertions, 671 deletions
diff --git a/usr/src/uts/common/os/bio.c b/usr/src/uts/common/os/bio.c index 96502b8230..b8d2e29058 100644 --- a/usr/src/uts/common/os/bio.c +++ b/usr/src/uts/common/os/bio.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -1320,6 +1321,9 @@ pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags) cpup = CPU; /* get pointer AFTER preemption is disabled */ CPU_STATS_ADDQ(cpup, vm, pgin, 1); CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len)); + + atomic_add_64(&curzone->zone_pgpgin, btopr(len)); + if ((flags & B_ASYNC) == 0) { klwp_t *lwp = ttolwp(curthread); if (lwp != NULL) @@ -1336,13 +1340,19 @@ pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags) if (pp != NULL && pp->p_vnode != NULL) { if (IS_SWAPFSVP(pp->p_vnode)) { CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len)); + atomic_add_64(&curzone->zone_anonpgin, + btopr(len)); } else { if (pp->p_vnode->v_flag & VVMEXEC) { CPU_STATS_ADDQ(cpup, vm, execpgin, btopr(len)); + atomic_add_64(&curzone->zone_execpgin, + btopr(len)); } else { CPU_STATS_ADDQ(cpup, vm, fspgin, btopr(len)); + atomic_add_64(&curzone->zone_fspgin, + btopr(len)); } } } diff --git a/usr/src/uts/common/os/brand.c b/usr/src/uts/common/os/brand.c index eb8c6e730a..02901d023d 100644 --- a/usr/src/uts/common/os/brand.c +++ b/usr/src/uts/common/os/brand.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. 
*/ #include <sys/kmem.h> @@ -45,7 +46,7 @@ struct brand_mach_ops native_mach_ops = { }; #else /* !__sparcv9 */ struct brand_mach_ops native_mach_ops = { - NULL, NULL, NULL, NULL + NULL, NULL, NULL, NULL, NULL, NULL, NULL }; #endif /* !__sparcv9 */ @@ -53,7 +54,8 @@ brand_t native_brand = { BRAND_VER_1, "native", NULL, - &native_mach_ops + &native_mach_ops, + 0 }; /* @@ -310,46 +312,112 @@ brand_unregister_zone(struct brand *bp) mutex_exit(&brand_list_lock); } -void -brand_setbrand(proc_t *p) +int +brand_setbrand(proc_t *p, boolean_t lwps_ok) { brand_t *bp = p->p_zone->zone_brand; + void *brand_data = NULL; - ASSERT(bp != NULL); - ASSERT(p->p_brand == &native_brand); + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + VERIFY(bp != NULL); /* - * We should only be called from exec(), when we know the process - * is single-threaded. + * Process branding occurs during fork() and exec(). When it happens + * during fork(), the LWP count will always be 0 since branding is + * performed as part of getproc(), before LWPs have been associated. + * The same is not true during exec(), where a multi-LWP process may + * undergo branding just prior to gexec(). This is to ensure + * exec-related brand hooks are available. While it may seem + * complicated to brand a multi-LWP process, the two possible outcomes + * simplify things: + * + * 1. The exec() succeeds: LWPs besides the caller will be killed and + * any further branding will occur in a single-LWP context. + * 2. The exec() fails: The process will be promptly unbranded since + * the hooks are no longer needed. + * + * To prevent inconsistent brand state from being encountered during + * the exec(), LWPs beyond the caller which are associated with this + * process must be held temporarily. They will be released either when + * they are killed in the exec() success, or when the brand is cleared + * after exec() failure. 
*/ - ASSERT(p->p_tlist == p->p_tlist->t_forw); + if (lwps_ok) { + /* + * We've been called from a exec() context tolerating the + * existence of multiple LWPs during branding is necessary. + */ + VERIFY(p == curproc); + VERIFY(p->p_tlist != NULL); + if (p->p_tlist != p->p_tlist->t_forw) { + /* + * Multiple LWPs are present. Hold all but the caller. + */ + if (!holdlwps(SHOLDFORK1)) { + return (-1); + } + } + } else { + /* + * Processes branded during fork() should not have LWPs at all. + */ + VERIFY(p->p_tlist == NULL); + } + + if (bp->b_data_size > 0) { + brand_data = kmem_zalloc(bp->b_data_size, KM_SLEEP); + } + + mutex_enter(&p->p_lock); + ASSERT(!PROC_IS_BRANDED(p)); p->p_brand = bp; + p->p_brand_data = brand_data; ASSERT(PROC_IS_BRANDED(p)); BROP(p)->b_setbrand(p); + mutex_exit(&p->p_lock); + return (0); } void -brand_clearbrand(proc_t *p, boolean_t no_lwps) +brand_clearbrand(proc_t *p, boolean_t lwps_ok) { brand_t *bp = p->p_zone->zone_brand; - klwp_t *lwp = NULL; - ASSERT(bp != NULL); - ASSERT(!no_lwps || (p->p_tlist == NULL)); + void *brand_data; - /* - * If called from exec_common() or proc_exit(), - * we know the process is single-threaded. - * If called from fork_fail, p_tlist is NULL. - */ - if (!no_lwps) { - ASSERT(p->p_tlist == p->p_tlist->t_forw); - lwp = p->p_tlist->t_lwp; - } + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + VERIFY(bp != NULL); + VERIFY(PROC_IS_BRANDED(p)); - ASSERT(PROC_IS_BRANDED(p)); - BROP(p)->b_proc_exit(p, lwp); + mutex_enter(&p->p_lock); p->p_brand = &native_brand; + brand_data = p->p_brand_data; + p->p_brand_data = NULL; + + if (lwps_ok) { + VERIFY(p == curproc); + /* + * A process with multiple LWPs is being de-branded after + * failing an exec. The other LWPs were held as part of the + * procedure, so they must be resumed now. + */ + if (p->p_tlist != NULL && p->p_tlist != p->p_tlist->t_forw) { + continuelwps(p); + } + } else { + /* + * While clearing the brand, it's ok for one LWP to be present. 
+ * This happens when a native binary is executed inside a + * branded zone, since the brand will be removed during the + * course of a successful exec. + */ + VERIFY(p->p_tlist == NULL || p->p_tlist == p->p_tlist->t_forw); + } + mutex_exit(&p->p_lock); + + if (brand_data != NULL) { + kmem_free(brand_data, bp->b_data_size); + } } #if defined(__sparcv9) @@ -483,7 +551,7 @@ brand_solaris_cmd(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, return (ENOSYS); /* For all other operations this must be a branded process. */ - if (p->p_brand == &native_brand) + if (!PROC_IS_BRANDED(p)) return (ENOSYS); ASSERT(p->p_brand == pbrand); @@ -601,15 +669,15 @@ restoreexecenv(struct execenv *ep, stack_t *sp) int brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file, - cred_t *cred, int brand_action, struct brand *pbrand, char *bname, - char *brandlib, char *brandlib32, char *brandlinker, char *brandlinker32) + cred_t *cred, int *brand_action, struct brand *pbrand, char *bname, + char *brandlib, char *brandlib32) { vnode_t *nvp; Ehdr ehdr; Addr uphdr_vaddr; intptr_t voffset; - int interp; + char *interp; int i, err; struct execenv env; struct execenv origenv; @@ -619,7 +687,6 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, klwp_t *lwp = ttolwp(curthread); brand_proc_data_t *spd; brand_elf_data_t sed, *sedp; - char *linker; uintptr_t lddata; /* lddata of executable's linker */ ASSERT(curproc->p_brand == pbrand); @@ -636,12 +703,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, */ if (args->to_model == DATAMODEL_NATIVE) { args->emulator = brandlib; - linker = brandlinker; } #if defined(_LP64) else { args->emulator = brandlib32; - linker = brandlinker32; } #endif /* _LP64 */ @@ -672,13 +737,13 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, orig_sigaltstack.ss_flags = lwp->lwp_sigaltstack.ss_flags; if (args->to_model == DATAMODEL_NATIVE) { - 
err = elfexec(nvp, uap, args, idatap, level + 1, execsz, + err = elfexec(nvp, uap, args, idatap, INTP_MAXDEPTH + 1, execsz, setid, exec_file, cred, brand_action); } #if defined(_LP64) else { - err = elf32exec(nvp, uap, args, idatap, level + 1, execsz, - setid, exec_file, cred, brand_action); + err = elf32exec(nvp, uap, args, idatap, INTP_MAXDEPTH + 1, + execsz, setid, exec_file, cred, brand_action); } #endif /* _LP64 */ VN_RELE(nvp); @@ -725,7 +790,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, if (args->to_model == DATAMODEL_NATIVE) { err = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr, &voffset, exec_file, &interp, &env.ex_bssbase, - &env.ex_brkbase, &env.ex_brksize, NULL); + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); } #if defined(_LP64) else { @@ -733,7 +798,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, Elf32_Addr uphdr_vaddr32; err = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32, &voffset, exec_file, &interp, &env.ex_bssbase, - &env.ex_brkbase, &env.ex_brksize, NULL); + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); Ehdr32to64(&ehdr32, &ehdr); if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -744,6 +809,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, #endif /* _LP64 */ if (err != 0) { restoreexecenv(&origenv, &orig_sigaltstack); + + if (interp != NULL) + kmem_free(interp, MAXPATHLEN); + return (err); } @@ -761,7 +830,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, sedp->sed_phent = ehdr.e_phentsize; sedp->sed_phnum = ehdr.e_phnum; - if (interp) { + if (interp != NULL) { if (ehdr.e_type == ET_DYN) { /* * This is a shared object executable, so we @@ -777,16 +846,20 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, * it in and store relevant information about it in the * aux vector, where the brand library can find it. 
*/ - if ((err = lookupname(linker, UIO_SYSSPACE, + if ((err = lookupname(interp, UIO_SYSSPACE, FOLLOW, NULLVPP, &nvp)) != 0) { - uprintf("%s: not found.", brandlinker); + uprintf("%s: not found.", interp); restoreexecenv(&origenv, &orig_sigaltstack); + kmem_free(interp, MAXPATHLEN); return (err); } + + kmem_free(interp, MAXPATHLEN); + if (args->to_model == DATAMODEL_NATIVE) { err = mapexec_brand(nvp, args, &ehdr, &uphdr_vaddr, &voffset, exec_file, &interp, - NULL, NULL, NULL, &lddata); + NULL, NULL, NULL, &lddata, NULL); } #if defined(_LP64) else { @@ -794,7 +867,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, Elf32_Addr uphdr_vaddr32; err = mapexec32_brand(nvp, args, &ehdr32, &uphdr_vaddr32, &voffset, exec_file, &interp, - NULL, NULL, NULL, &lddata); + NULL, NULL, NULL, &lddata, NULL); Ehdr32to64(&ehdr32, &ehdr); if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -934,9 +1007,9 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, /* * Third, the /proc aux vectors set up by elfexec() point to - * brand emulation library and it's linker. Copy these to the + * brand emulation library and its linker. Copy these to the * /proc brand specific aux vector, and update the regular - * /proc aux vectors to point to the executable (and it's + * /proc aux vectors to point to the executable (and its * linker). This will enable debuggers to access the * executable via the usual /proc or elf notes aux vectors. 
* @@ -1078,55 +1151,31 @@ brand_solaris_freelwp(klwp_t *l, struct brand *pbrand) } /*ARGSUSED*/ -int +void brand_solaris_initlwp(klwp_t *l, struct brand *pbrand) { ASSERT(l->lwp_procp->p_brand == pbrand); ASSERT(l->lwp_procp->p_brand_data != NULL); ASSERT(l->lwp_brand == NULL); l->lwp_brand = (void *)-1; - return (0); } /*ARGSUSED*/ void brand_solaris_lwpexit(klwp_t *l, struct brand *pbrand) { - proc_t *p = l->lwp_procp; - ASSERT(l->lwp_procp->p_brand == pbrand); ASSERT(l->lwp_procp->p_brand_data != NULL); ASSERT(l->lwp_brand != NULL); - - /* - * We should never be called for the last thread in a process. - * (That case is handled by brand_solaris_proc_exit().) - * Therefore this lwp must be exiting from a multi-threaded - * process. - */ - ASSERT(p->p_tlist != p->p_tlist->t_forw); - - l->lwp_brand = NULL; } /*ARGSUSED*/ void -brand_solaris_proc_exit(struct proc *p, klwp_t *l, struct brand *pbrand) +brand_solaris_proc_exit(struct proc *p, struct brand *pbrand) { ASSERT(p->p_brand == pbrand); ASSERT(p->p_brand_data != NULL); - /* - * When called from proc_exit(), we know that process is - * single-threaded and free our lwp brand data. - * otherwise just free p_brand_data and return. 
- */ - if (l != NULL) { - ASSERT(p->p_tlist == p->p_tlist->t_forw); - ASSERT(p->p_tlist->t_lwp == l); - (void) brand_solaris_freelwp(l, pbrand); - } - /* upon exit, free our proc brand data */ kmem_free(p->p_brand_data, sizeof (brand_proc_data_t)); p->p_brand_data = NULL; @@ -1145,5 +1194,4 @@ brand_solaris_setbrand(proc_t *p, struct brand *pbrand) ASSERT(p->p_tlist == p->p_tlist->t_forw); p->p_brand_data = kmem_zalloc(sizeof (brand_proc_data_t), KM_SLEEP); - (void) brand_solaris_initlwp(p->p_tlist->t_lwp, pbrand); } diff --git a/usr/src/uts/common/os/core.c b/usr/src/uts/common/os/core.c index d4dddbe477..3ca17e1f17 100644 --- a/usr/src/uts/common/os/core.c +++ b/usr/src/uts/common/os/core.c @@ -64,6 +64,7 @@ #include <sys/contract/process_impl.h> #include <sys/ddi.h> +extern int yield(void); /* * Processes running within a zone potentially dump core in 3 locations, * based on the per-process, per-zone, and the global zone's core settings. diff --git a/usr/src/uts/common/os/cred.c b/usr/src/uts/common/os/cred.c index 733fd03a92..b0098946b3 100644 --- a/usr/src/uts/common/os/cred.c +++ b/usr/src/uts/common/os/cred.c @@ -727,6 +727,14 @@ crgetzoneid(const cred_t *cr) cr->cr_zone->zone_id); } +zoneid_t +crgetzonedid(const cred_t *cr) +{ + return (cr->cr_zone == NULL ? + (cr->cr_uid == -1 ? 
(zoneid_t)-1 : GLOBAL_ZONEID) : + cr->cr_zone->zone_did); +} + projid_t crgetprojid(const cred_t *cr) { diff --git a/usr/src/uts/common/os/ddi_intr_irm.c b/usr/src/uts/common/os/ddi_intr_irm.c index c3c0481e7f..a4b35dcb5b 100644 --- a/usr/src/uts/common/os/ddi_intr_irm.c +++ b/usr/src/uts/common/os/ddi_intr_irm.c @@ -1320,7 +1320,7 @@ i_ddi_irm_notify(ddi_irm_pool_t *pool_p, ddi_irm_req_t *req_p) /* Log callback errors */ if (ret != DDI_SUCCESS) { - cmn_err(CE_WARN, "%s%d: failed callback (action=%d, ret=%d)\n", + cmn_err(CE_WARN, "!%s%d: failed callback (action=%d, ret=%d)\n", ddi_driver_name(req_p->ireq_dip), ddi_get_instance(req_p->ireq_dip), (int)action, ret); } diff --git a/usr/src/uts/common/os/exec.c b/usr/src/uts/common/os/exec.c index f7c565e546..d46b8538a9 100644 --- a/usr/src/uts/common/os/exec.c +++ b/usr/src/uts/common/os/exec.c @@ -26,7 +26,7 @@ /* Copyright (c) 1988 AT&T */ /* All Rights Reserved */ /* - * Copyright 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -69,6 +69,7 @@ #include <sys/sdt.h> #include <sys/brand.h> #include <sys/klpd.h> +#include <sys/random.h> #include <c2/audit.h> @@ -97,6 +98,7 @@ uint_t auxv_hwcap32_2 = 0; /* 32-bit version of auxv_hwcap2 */ #endif #define PSUIDFLAGS (SNOCD|SUGID) +#define RANDOM_LEN 16 /* 16 bytes for AT_RANDOM aux entry */ /* * exece() - system call wrapper around exec_common() @@ -297,14 +299,43 @@ exec_common(const char *fname, const char **argp, const char **envp, ua.argp = argp; ua.envp = envp; - /* If necessary, brand this process before we start the exec. */ - if (brandme) - brand_setbrand(p); + /* If necessary, brand this process/lwp before we start the exec. */ + if (brandme) { + void *brand_data = NULL; + + /* + * Process branding may fail if multiple LWPs are present and + * holdlwps() cannot complete successfully. 
+ */ + error = brand_setbrand(p, B_TRUE); + + if (error == 0 && BROP(p)->b_lwpdata_alloc != NULL) { + brand_data = BROP(p)->b_lwpdata_alloc(p); + if (brand_data == NULL) { + error = 1; + } + } + + if (error == 0) { + mutex_enter(&p->p_lock); + BROP(p)->b_initlwp(lwp, brand_data); + mutex_exit(&p->p_lock); + } else { + VN_RELE(vp); + if (dir != NULL) { + VN_RELE(dir); + } + pn_free(&resolvepn); + goto fail; + } + } if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz, - exec_file, p->p_cred, brand_action)) != 0) { - if (brandme) - brand_clearbrand(p, B_FALSE); + exec_file, p->p_cred, &brand_action)) != 0) { + if (brandme) { + BROP(p)->b_freelwp(lwp); + brand_clearbrand(p, B_TRUE); + } VN_RELE(vp); if (dir != NULL) VN_RELE(dir); @@ -336,7 +367,7 @@ exec_common(const char *fname, const char **argp, const char **envp, /* * Clear contract template state */ - lwp_ctmpl_clear(lwp); + lwp_ctmpl_clear(lwp, B_TRUE); /* * Save the directory in which we found the executable for expanding @@ -360,6 +391,8 @@ exec_common(const char *fname, const char **argp, const char **envp, * pending held signals remain held, so don't clear t_hold. */ mutex_enter(&p->p_lock); + DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp, + uintptr_t, lwp->lwp_oldcontext, uintptr_t, 0); lwp->lwp_oldcontext = 0; lwp->lwp_ustack = 0; lwp->lwp_old_stk_ctl = 0; @@ -419,8 +452,10 @@ exec_common(const char *fname, const char **argp, const char **envp, TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up); /* Unbrand ourself if necessary. 
*/ - if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) + if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) { + BROP(p)->b_freelwp(lwp); brand_clearbrand(p, B_FALSE); + } setregs(&args); @@ -544,7 +579,7 @@ gexec( long *execsz, caddr_t exec_file, struct cred *cred, - int brand_action) + int *brand_action) { struct vnode *vp, *execvp = NULL; proc_t *pp = ttoproc(curthread); @@ -858,8 +893,14 @@ gexec( if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE)) args->traceinval = 1; } - if (pp->p_proc_flag & P_PR_PTRACE) + + /* + * If legacy ptrace is enabled, generate the SIGTRAP. + */ + if (pp->p_proc_flag & P_PR_PTRACE) { psignal(pp, SIGTRAP); + } + if (args->traceinval) prinvalidate(&pp->p_user); } @@ -1517,6 +1558,27 @@ stk_add(uarg_t *args, const char *sp, enum uio_seg segflg) return (0); } +/* + * Add a fixed size byte array to the stack (only from kernel space). + */ +static int +stk_byte_add(uarg_t *args, const uint8_t *sp, size_t len) +{ + int error; + + if (STK_AVAIL(args) < sizeof (int)) + return (E2BIG); + *--args->stk_offp = args->stk_strp - args->stk_base; + + if (len > STK_AVAIL(args)) + return (E2BIG); + bcopy(sp, args->stk_strp, len); + + args->stk_strp += len; + + return (0); +} + static int stk_getptr(uarg_t *args, char *src, char **dst) { @@ -1553,16 +1615,30 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) size_t size, pad; char *argv = (char *)uap->argp; char *envp = (char *)uap->envp; + uint8_t rdata[RANDOM_LEN]; /* * Copy interpreter's name and argument to argv[0] and argv[1]. + * In the rare case that we have nested interpreters then those names + * and arguments are also copied to the subsequent slots in argv. 
*/ - if (intp != NULL && intp->intp_name != NULL) { - if ((error = stk_add(args, intp->intp_name, UIO_SYSSPACE)) != 0) - return (error); - if (intp->intp_arg != NULL && - (error = stk_add(args, intp->intp_arg, UIO_SYSSPACE)) != 0) - return (error); + if (intp != NULL && intp->intp_name[0] != NULL) { + int i; + + for (i = 0; i < INTP_MAXDEPTH; i++) { + if (intp->intp_name[i] == NULL) + break; + error = stk_add(args, intp->intp_name[i], UIO_SYSSPACE); + if (error != 0) + return (error); + if (intp->intp_arg[i] != NULL) { + error = stk_add(args, intp->intp_arg[i], + UIO_SYSSPACE); + if (error != 0) + return (error); + } + } + if (args->fname != NULL) error = stk_add(args, args->fname, UIO_SYSSPACE); else @@ -1622,8 +1698,9 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) args->ne = args->na - argc; /* - * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and - * AT_SUN_EMULATOR strings to the stack. + * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, + * AT_SUN_BRAND_NROOT, and AT_SUN_EMULATOR strings, as well as AT_RANDOM + * array, to the stack. */ if (auxvpp != NULL && *auxvpp != NULL) { if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0) @@ -1636,6 +1713,20 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) if (args->emulator != NULL && (error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0) return (error); + + /* + * For the AT_RANDOM aux vector we provide 16 bytes of random + * data. 
+ */ + (void) random_get_pseudo_bytes(rdata, sizeof (rdata)); + + if ((error = stk_byte_add(args, rdata, sizeof (rdata))) != 0) + return (error); + + if (args->brand_nroot != NULL && + (error = stk_add(args, args->brand_nroot, + UIO_SYSSPACE)) != 0) + return (error); } /* @@ -1742,7 +1833,7 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) /* * Fill in the aux vector now that we know the user stack addresses * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and - * AT_SUN_EMULATOR strings. + * AT_SUN_EMULATOR strings, as well as the AT_RANDOM array. */ if (auxvpp != NULL && *auxvpp != NULL) { if (args->to_model == DATAMODEL_NATIVE) { @@ -1755,6 +1846,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) if (args->emulator != NULL) ADDAUX(*a, AT_SUN_EMULATOR, (long)&ustrp[*--offp]) + ADDAUX(*a, AT_RANDOM, (long)&ustrp[*--offp]) + if (args->brand_nroot != NULL) { + ADDAUX(*a, + AT_SUN_BRAND_NROOT, (long)&ustrp[*--offp]) + } } else { auxv32_t **a = (auxv32_t **)auxvpp; ADDAUX(*a, @@ -1767,6 +1863,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) if (args->emulator != NULL) ADDAUX(*a, AT_SUN_EMULATOR, (int)(uintptr_t)&ustrp[*--offp]) + ADDAUX(*a, AT_RANDOM, (int)(uintptr_t)&ustrp[*--offp]) + if (args->brand_nroot != NULL) { + ADDAUX(*a, AT_SUN_BRAND_NROOT, + (int)(uintptr_t)&ustrp[*--offp]) + } } } @@ -1855,6 +1956,9 @@ exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) usrstack = (char *)USRSTACK32; } + if (args->maxstack != 0 && (uintptr_t)usrstack > args->maxstack) + usrstack = (char *)args->maxstack; + ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0); #if defined(__sparc) diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c index f0c0983a3a..0e213deb21 100644 --- a/usr/src/uts/common/os/exit.c +++ b/usr/src/uts/common/os/exit.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. 
- * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -230,7 +230,7 @@ restart_init(int what, int why) siginfofree(lwp->lwp_curinfo); lwp->lwp_curinfo = NULL; } - lwp_ctmpl_clear(lwp); + lwp_ctmpl_clear(lwp, B_FALSE); /* * Reset both the process root directory and the current working @@ -366,19 +366,6 @@ proc_exit(int why, int what) } mutex_exit(&p->p_lock); - DTRACE_PROC(lwp__exit); - DTRACE_PROC1(exit, int, why); - - /* - * Will perform any brand specific proc exit processing, since this - * is always the last lwp, will also perform lwp_exit and free brand - * data - */ - if (PROC_IS_BRANDED(p)) { - lwp_detach_brand_hdlrs(lwp); - brand_clearbrand(p, B_FALSE); - } - /* * Don't let init exit unless zone_start_init() failed its exec, or * we are shutting down the zone or the machine. @@ -390,12 +377,35 @@ proc_exit(int why, int what) if (z->zone_boot_err == 0 && zone_status_get(z) < ZONE_IS_SHUTTING_DOWN && zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN) { - if (z->zone_restart_init == B_TRUE) { - if (restart_init(what, why) == 0) - return (0); + + /* + * If the init process should be restarted, the + * "zone_restart_init" member will be set. Some init + * programs in branded zones do not tolerate a restart + * in the traditional manner; setting the + * "zone_reboot_on_init_exit" member will cause the + * entire zone to be rebooted instead. If neither of + * these flags is set the zone will shut down. + */ + if (z->zone_reboot_on_init_exit == B_TRUE && + z->zone_restart_init == B_TRUE) { + /* + * Trigger a zone reboot and continue + * with exit processing. 
+ */ + z->zone_init_status = wstat(why, what); + (void) zone_kadmin(A_REBOOT, 0, NULL, + zone_kcred()); + } else { + if (z->zone_restart_init == B_TRUE) { + if (restart_init(what, why) == 0) + return (0); + } + + z->zone_init_status = wstat(why, what); (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, - CRED()); + zone_kcred()); } } @@ -407,6 +417,32 @@ proc_exit(int why, int what) z->zone_proc_initpid = -1; } + /* + * Delay firing probes (and performing brand cleanup) until after the + * zone_proc_initpid check. Cases which result in zone shutdown or + * restart via zone_kadmin eventually result in a call back to + * proc_exit. + */ + DTRACE_PROC(lwp__exit); + DTRACE_PROC1(exit, int, why); + + /* + * Will perform any brand specific proc exit processing. Since this + * is always the last lwp, will also perform lwp exit/free and proc + * exit. Brand data will be freed when the process is reaped. + */ + if (PROC_IS_BRANDED(p)) { + BROP(p)->b_lwpexit(lwp); + BROP(p)->b_proc_exit(p); + /* + * To ensure that b_proc_exit has access to brand-specific data + * contained by the one remaining lwp, call the freelwp hook as + * the last part of this clean-up process. 
+ */ + BROP(p)->b_freelwp(lwp); + lwp_detach_brand_hdlrs(lwp); + } + lwp_pcb_exit(); /* @@ -658,10 +694,22 @@ proc_exit(int why, int what) if ((q = p->p_child) != NULL && p != proc_init) { struct proc *np; struct proc *initp = proc_init; + pid_t zone_initpid = 1; + struct proc *zoneinitp = NULL; boolean_t setzonetop = B_FALSE; - if (!INGLOBALZONE(curproc)) - setzonetop = B_TRUE; + if (!INGLOBALZONE(curproc)) { + zone_initpid = curproc->p_zone->zone_proc_initpid; + + ASSERT(MUTEX_HELD(&pidlock)); + zoneinitp = prfind(zone_initpid); + if (zoneinitp != NULL) { + initp = zoneinitp; + } else { + zone_initpid = 1; + setzonetop = B_TRUE; + } + } pgdetach(p); @@ -673,7 +721,8 @@ proc_exit(int why, int what) */ delete_ns(q->p_parent, q); - q->p_ppid = 1; + q->p_ppid = zone_initpid; + q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID); if (setzonetop) { mutex_enter(&q->p_lock); @@ -847,8 +896,50 @@ proc_exit(int why, int what) mutex_exit(&p->p_lock); if (!evaporate) { - p->p_pidflag &= ~CLDPEND; - sigcld(p, sqp); + /* + * The brand specific code only happens when the brand has a + * function to call in place of sigcld and the parent of the + * exiting process is not the global zone init. If the parent + * is the global zone init, then the process was reparented, + * and we don't want brand code delivering possibly strange + * signals to init. Also, init is not branded, so any brand + * specific exit data will not be picked up by init anyway. + */ + if (PROC_IS_BRANDED(p) && + BROP(p)->b_exit_with_sig != NULL && + p->p_ppid != 1) { + /* + * The code for _fini that could unload the brand_t + * blocks until the count of zones using the module + * reaches zero. Zones decrement the refcount on their + * brands only after all user tasks in that zone have + * exited and been waited on. The decrement on the + * brand's refcount happen in zone_destroy(). That + * depends on zone_shutdown() having been completed. 
+ * zone_shutdown() includes a call to zone_empty(), + * where the zone waits for itself to reach the state + * ZONE_IS_EMPTY. This state is only set in either + * zone_shutdown(), when there are no user processes as + * the zone enters this function, or in + * zone_task_rele(). zone_task_rele() is called from + * code triggered by waiting on processes, not by the + * processes exiting through proc_exit(). This means + * all the branded processes that could exist for a + * specific brand_t must exit and get reaped before the + * refcount on the brand_t can reach 0. _fini will + * never unload the corresponding brand module before + * proc_exit finishes execution for all processes + * branded with a particular brand_t, which makes the + * operation below safe to do. Brands that wish to use + * this mechanism must wait in _fini as described + * above. + */ + BROP(p)->b_exit_with_sig(p, sqp); + } else { + p->p_pidflag &= ~CLDPEND; + sigcld(p, sqp); + } + } else { /* * Do what sigcld() would do if the disposition @@ -927,10 +1018,9 @@ winfo(proc_t *pp, k_siginfo_t *ip, int waitflag) int waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) { - int found; proc_t *cp, *pp; - int proc_gone; int waitflag = !(options & WNOWAIT); + boolean_t have_brand_helper = B_FALSE; /* * Obsolete flag, defined here only for binary compatibility @@ -958,7 +1048,8 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) pp = ttoproc(curthread); /* - * lock parent mutex so that sibling chain can be searched. + * Anytime you are looking for a process, you take pidlock to prevent + * things from changing as you look. 
*/ mutex_enter(&pidlock); @@ -978,10 +1069,37 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) return (ECHILD); } - while (pp->p_child != NULL) { + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_waitid_helper != NULL) { + have_brand_helper = B_TRUE; + } + + while (pp->p_child != NULL || have_brand_helper) { + boolean_t brand_wants_wait = B_FALSE; + int proc_gone = 0; + int found = 0; - proc_gone = 0; + /* + * Give the brand a chance to return synthetic results from + * this waitid() call before we do the real thing. + */ + if (have_brand_helper) { + int ret; + if (BROP(pp)->b_waitid_helper(idtype, id, ip, options, + &brand_wants_wait, &ret) == 0) { + mutex_exit(&pidlock); + return (ret); + } + + if (pp->p_child == NULL) { + goto no_real_children; + } + } + + /* + * Look for interesting children in the newstate list. + */ + VERIFY(pp->p_child != NULL); for (cp = pp->p_child_ns; cp != NULL; cp = cp->p_sibling_ns) { if (idtype != P_PID && (cp->p_pidflag & CLDWAITPID)) continue; @@ -989,6 +1107,11 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) continue; if (idtype == P_PGID && id != cp->p_pgrp) continue; + if (PROC_IS_BRANDED(pp)) { + if (BROP(pp)->b_wait_filter != NULL && + BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) + continue; + } switch (cp->p_wcode) { @@ -1033,12 +1156,16 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) * Wow! None of the threads on the p_sibling_ns list were * interesting threads. Check all the kids! 
*/ - found = 0; for (cp = pp->p_child; cp != NULL; cp = cp->p_sibling) { if (idtype == P_PID && id != cp->p_pid) continue; if (idtype == P_PGID && id != cp->p_pgrp) continue; + if (PROC_IS_BRANDED(pp)) { + if (BROP(pp)->b_wait_filter != NULL && + BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) + continue; + } switch (cp->p_wcode) { case CLD_TRAPPED: @@ -1107,11 +1234,12 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) break; } +no_real_children: /* * If we found no interesting processes at all, * break out and return ECHILD. */ - if (found + proc_gone == 0) + if (!brand_wants_wait && (found + proc_gone == 0)) break; if (options & WNOHANG) { @@ -1130,7 +1258,7 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) * change state while we wait, we don't wait at all. * Get out with ECHILD according to SVID. */ - if (found == proc_gone) + if (!brand_wants_wait && (found == proc_gone)) break; if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) { @@ -1226,6 +1354,12 @@ freeproc(proc_t *p) p->p_killsqp = NULL; } + /* Clear any remaining brand data */ + if (PROC_IS_BRANDED(p)) { + brand_clearbrand(p, B_FALSE); + } + + prfree(p); /* inform /proc */ /* diff --git a/usr/src/uts/common/os/fio.c b/usr/src/uts/common/os/fio.c index 76eddd4e50..bfee77130d 100644 --- a/usr/src/uts/common/os/fio.c +++ b/usr/src/uts/common/os/fio.c @@ -852,7 +852,8 @@ flist_fork(uf_info_t *pfip, uf_info_t *cfip) */ cfip->fi_nfiles = nfiles = flist_minsize(pfip); - cfip->fi_list = kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP); + cfip->fi_list = nfiles == 0 ? NULL : + kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP); for (fd = 0, pufp = pfip->fi_list, cufp = cfip->fi_list; fd < nfiles; fd++, pufp++, cufp++) { diff --git a/usr/src/uts/common/os/fork.c b/usr/src/uts/common/os/fork.c index fe3a362fa7..4a9c82d2db 100644 --- a/usr/src/uts/common/os/fork.c +++ b/usr/src/uts/common/os/fork.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. 
All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -696,7 +696,7 @@ fork_fail(proc_t *cp) if (PTOU(curproc)->u_cwd) refstr_rele(PTOU(curproc)->u_cwd); if (PROC_IS_BRANDED(cp)) { - brand_clearbrand(cp, B_TRUE); + brand_clearbrand(cp, B_FALSE); } } @@ -745,7 +745,7 @@ forklwp_fail(proc_t *p) kmem_free(t->t_door, sizeof (door_data_t)); t->t_door = NULL; } - lwp_ctmpl_clear(ttolwp(t)); + lwp_ctmpl_clear(ttolwp(t), B_FALSE); /* * Remove the thread from the all threads list. @@ -1004,6 +1004,9 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) cp->p_t1_lgrpid = LGRP_NONE; cp->p_tr_lgrpid = LGRP_NONE; + /* Default to native brand initially */ + cp->p_brand = &native_brand; + if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) { if (nproc == v.v_proc) { CPU_STATS_ADDQ(CPU, sys, procovf, 1); @@ -1071,9 +1074,6 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD); cp->p_sessp = pp->p_sessp; sess_hold(pp); - cp->p_brand = pp->p_brand; - if (PROC_IS_BRANDED(pp)) - BROP(pp)->b_copy_procdata(cp, pp); cp->p_bssbase = pp->p_bssbase; cp->p_brkbase = pp->p_brkbase; cp->p_brksize = pp->p_brksize; @@ -1153,6 +1153,18 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) mutex_exit(&cp->p_lock); mutex_exit(&pidlock); + if (PROC_IS_BRANDED(pp)) { + /* + * The only reason why process branding should fail is when + * the procedure is complicated by multiple LWPs on the scene. + * With an LWP count of 0, this newly allocated process has no + * reason to fail branding. 
+ */ + VERIFY0(brand_setbrand(cp, B_FALSE)); + + BROP(pp)->b_copy_procdata(cp, pp); + } + avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t), offsetof(contract_t, ct_ctlist)); diff --git a/usr/src/uts/common/os/grow.c b/usr/src/uts/common/os/grow.c index f5e92cfd94..0c4c0bcad6 100644 --- a/usr/src/uts/common/os/grow.c +++ b/usr/src/uts/common/os/grow.c @@ -19,7 +19,10 @@ * CDDL HEADER END */ -/* Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. */ +/* + * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. @@ -52,6 +55,7 @@ #include <sys/fcntl.h> #include <sys/lwpchan_impl.h> #include <sys/nbmlock.h> +#include <sys/brand.h> #include <vm/hat.h> #include <vm/as.h> @@ -522,6 +526,20 @@ choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off, return (0); } +caddr_t +map_userlimit(proc_t *pp, struct as *as, int flags) +{ + if (flags & _MAP_LOW32) { + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_map32limit != NULL) { + return ((caddr_t)(uintptr_t)BROP(pp)->b_map32limit(pp)); + } else { + return ((caddr_t)_userlimit32); + } + } + + return (as->a_userlimit); +} + /* * Used for MAP_ANON - fast way to get anonymous pages @@ -537,8 +555,6 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags, return (EACCES); if ((flags & MAP_FIXED) != 0) { - caddr_t userlimit; - /* * Use the user address. First verify that * the address to be used is page aligned. @@ -547,9 +563,8 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags, if (((uintptr_t)*addrp & PAGEOFFSET) != 0) return (EINVAL); - userlimit = flags & _MAP_LOW32 ? 
- (caddr_t)USERLIMIT32 : as->a_userlimit; - switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { + switch (valid_usr_range(*addrp, len, uprot, as, + map_userlimit(as->a_proc, as, flags))) { case RANGE_OKAY: break; case RANGE_BADPROT: @@ -717,8 +732,6 @@ smmap_common(caddr_t *addrp, size_t len, * If the user specified an address, do some simple checks here */ if ((flags & MAP_FIXED) != 0) { - caddr_t userlimit; - /* * Use the user address. First verify that * the address to be used is page aligned. @@ -726,10 +739,8 @@ smmap_common(caddr_t *addrp, size_t len, */ if (((uintptr_t)*addrp & PAGEOFFSET) != 0) return (EINVAL); - - userlimit = flags & _MAP_LOW32 ? - (caddr_t)USERLIMIT32 : as->a_userlimit; - switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { + switch (valid_usr_range(*addrp, len, uprot, as, + map_userlimit(curproc, as, flags))) { case RANGE_OKAY: break; case RANGE_BADPROT: diff --git a/usr/src/uts/common/os/id_space.c b/usr/src/uts/common/os/id_space.c deleted file mode 100644 index 2dad0cb940..0000000000 --- a/usr/src/uts/common/os/id_space.c +++ /dev/null @@ -1,159 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#include <sys/types.h> -#include <sys/id_space.h> -#include <sys/debug.h> - -/* - * ID Spaces - * - * The id_space_t provides a simple implementation of a managed range of - * integer identifiers using a vmem arena. An ID space guarantees that the - * next identifer returned by an allocation is larger than the previous one, - * unless there are no larger slots remaining in the range. In this case, - * the ID space will return the first available slot in the lower part of the - * range (viewing the previous identifier as a partitioning element). If no - * slots are available, id_alloc()/id_allocff() will sleep until an - * identifier becomes available. Accordingly, id_space allocations must be - * initiated from contexts where sleeping is acceptable. id_alloc_nosleep()/ - * id_allocff_nosleep() will return -1 if no slots are available or if the - * system is low on memory. If id_alloc_nosleep() fails, callers should - * not try to extend the ID space. This is to avoid making a possible - * low-memory situation worse. - * - * As an ID space is designed for representing a range of id_t's, there - * is a preexisting maximal range: [0, MAXUID]. ID space requests outside - * that range will fail on a DEBUG kernel. The id_allocff*() functions - * return the first available id, and should be used when there is benefit - * to having a compact allocated range. - * - * (Presently, the id_space_t abstraction supports only direct allocations; ID - * reservation, in which an ID is allocated but placed in a internal - * dictionary for later use, should be added when a consuming subsystem - * arrives.) 
- */ - -#define ID_TO_ADDR(id) ((void *)(uintptr_t)(id + 1)) -#define ADDR_TO_ID(addr) ((id_t)((uintptr_t)addr - 1)) - -/* - * Create an arena to represent the range [low, high). - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_space_t * -id_space_create(const char *name, id_t low, id_t high) -{ - ASSERT(low >= 0); - ASSERT(low < high); - - return (vmem_create(name, ID_TO_ADDR(low), high - low, 1, - NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER)); -} - -/* - * Destroy a previously created ID space. - * No restrictions on caller's context. - */ -void -id_space_destroy(id_space_t *isp) -{ - vmem_destroy(isp); -} - -void -id_space_extend(id_space_t *isp, id_t low, id_t high) -{ - (void) vmem_add(isp, ID_TO_ADDR(low), high - low, VM_SLEEP); -} - -/* - * Allocate an id_t from specified ID space. - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_t -id_alloc(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_NEXTFIT))); -} - -/* - * Allocate an id_t from specified ID space. - * Returns -1 on failure (see module block comments for more information on - * failure modes). - */ -id_t -id_alloc_nosleep(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_NEXTFIT))); -} - -/* - * Allocate an id_t from specified ID space using FIRSTFIT. - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_t -id_allocff(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_FIRSTFIT))); -} - -/* - * Allocate an id_t from specified ID space using FIRSTFIT - * Returns -1 on failure (see module block comments for more information on - * failure modes). - */ -id_t -id_allocff_nosleep(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_FIRSTFIT))); -} - -/* - * Allocate a specific identifier if possible, returning the id if - * successful, or -1 on failure. 
- */ -id_t -id_alloc_specific_nosleep(id_space_t *isp, id_t id) -{ - void *minaddr = ID_TO_ADDR(id); - void *maxaddr = ID_TO_ADDR(id + 1); - - /* - * Note that even though we're vmem_free()ing this later, it - * should be OK, since there's no quantum cache. - */ - return (ADDR_TO_ID(vmem_xalloc(isp, 1, 1, 0, 0, - minaddr, maxaddr, VM_NOSLEEP))); -} - -/* - * Free a previously allocated ID. - * No restrictions on caller's context. - */ -void -id_free(id_space_t *isp, id_t id) -{ - vmem_free(isp, ID_TO_ADDR(id), 1); -} diff --git a/usr/src/uts/common/os/kmem.c b/usr/src/uts/common/os/kmem.c index cc53c2fb76..734fa910e4 100644 --- a/usr/src/uts/common/os/kmem.c +++ b/usr/src/uts/common/os/kmem.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -158,10 +159,22 @@ * find known objects and is about to free it, or * c) the client has freed the object. * In all these cases (a, b, and c) kmem frees the new object (the - * unused copy destination) and searches for the old object in the - * magazine layer. If found, the object is removed from the magazine - * layer and freed to the slab layer so it will no longer hold the - * slab hostage. + * unused copy destination). In the first case, the object is in + * use and the correct action is that for LATER; in the latter two + * cases, we know that the object is either freed or about to be + * freed, in which case it is either already in a magazine or about + * to be in one. In these cases, we know that the object will either + * be reallocated and reused, or it will end up in a full magazine + * that will be reaped (thereby liberating the slab). 
Because it + * is prohibitively expensive to differentiate these cases, and + * because the defrag code is executed when we're low on memory + * (thereby biasing the system to reclaim full magazines) we treat + * all DONT_KNOW cases as LATER and rely on cache reaping to + * generally clean up full magazines. While we take the same action + * for these cases, we maintain their semantic distinction: if + * defragmentation is not occurring, it is useful to know if this + * is due to objects in use (LATER) or objects in an unknown state + * of transition (DONT_KNOW). * * 2.3 Object States * @@ -284,10 +297,10 @@ * view of the slab layer, making it a candidate for the move callback. Most * objects unrecognized by the client in the move callback fall into this * category and are cheaply distinguished from known objects by the test - * described earlier. Since recognition is cheap for the client, and searching - * magazines is expensive for kmem, kmem defers searching until the client first - * returns KMEM_CBRC_DONT_KNOW. As long as the needed effort is reasonable, kmem - * elsewhere does what it can to avoid bothering the client unnecessarily. + * described earlier. Because searching magazines is prohibitively expensive + * for kmem, clients that do not mark freed objects (and therefore return + * KMEM_CBRC_DONT_KNOW for large numbers of objects) may find defragmentation + * efficacy reduced. 
* * Invalidating the designated pointer member before freeing the object marks * the object to be avoided in the callback, and conversely, assigning a valid @@ -997,6 +1010,7 @@ size_t kmem_transaction_log_size; /* transaction log size [2% of memory] */ size_t kmem_content_log_size; /* content log size [2% of memory] */ size_t kmem_failure_log_size; /* failure log [4 pages per CPU] */ size_t kmem_slab_log_size; /* slab create log [4 pages per CPU] */ +size_t kmem_zerosized_log_size; /* zero-sized log [4 pages per CPU] */ size_t kmem_content_maxsave = 256; /* KMF_CONTENTS max bytes to log */ size_t kmem_lite_minsize = 0; /* minimum buffer size for KMF_LITE */ size_t kmem_lite_maxalign = 1024; /* maximum buffer alignment for KMF_LITE */ @@ -1004,6 +1018,14 @@ int kmem_lite_pcs = 4; /* number of PCs to store in KMF_LITE mode */ size_t kmem_maxverify; /* maximum bytes to inspect in debug routines */ size_t kmem_minfirewall; /* hardware-enforced redzone threshold */ +#ifdef DEBUG +int kmem_warn_zerosized = 1; /* whether to warn on zero-sized KM_SLEEP */ +#else +int kmem_warn_zerosized = 0; /* whether to warn on zero-sized KM_SLEEP */ +#endif + +int kmem_panic_zerosized = 0; /* whether to panic on zero-sized KM_SLEEP */ + #ifdef _LP64 size_t kmem_max_cached = KMEM_BIG_MAXBUF; /* maximum kmem_alloc cache */ #else @@ -1037,21 +1059,7 @@ static vmem_t *kmem_default_arena; static vmem_t *kmem_firewall_va_arena; static vmem_t *kmem_firewall_arena; -/* - * Define KMEM_STATS to turn on statistic gathering. By default, it is only - * turned on when DEBUG is also defined. 
- */ -#ifdef DEBUG -#define KMEM_STATS -#endif /* DEBUG */ - -#ifdef KMEM_STATS -#define KMEM_STAT_ADD(stat) ((stat)++) -#define KMEM_STAT_COND_ADD(cond, stat) ((void) (!(cond) || (stat)++)) -#else -#define KMEM_STAT_ADD(stat) /* nothing */ -#define KMEM_STAT_COND_ADD(cond, stat) /* nothing */ -#endif /* KMEM_STATS */ +static int kmem_zerosized; /* # of zero-sized allocs */ /* * kmem slab consolidator thresholds (tunables) @@ -1070,47 +1078,6 @@ size_t kmem_reclaim_max_slabs = 1; */ size_t kmem_reclaim_scan_range = 12; -#ifdef KMEM_STATS -static struct { - uint64_t kms_callbacks; - uint64_t kms_yes; - uint64_t kms_no; - uint64_t kms_later; - uint64_t kms_dont_need; - uint64_t kms_dont_know; - uint64_t kms_hunt_found_mag; - uint64_t kms_hunt_found_slab; - uint64_t kms_hunt_alloc_fail; - uint64_t kms_hunt_lucky; - uint64_t kms_notify; - uint64_t kms_notify_callbacks; - uint64_t kms_disbelief; - uint64_t kms_already_pending; - uint64_t kms_callback_alloc_fail; - uint64_t kms_callback_taskq_fail; - uint64_t kms_endscan_slab_dead; - uint64_t kms_endscan_slab_destroyed; - uint64_t kms_endscan_nomem; - uint64_t kms_endscan_refcnt_changed; - uint64_t kms_endscan_nomove_changed; - uint64_t kms_endscan_freelist; - uint64_t kms_avl_update; - uint64_t kms_avl_noupdate; - uint64_t kms_no_longer_reclaimable; - uint64_t kms_notify_no_longer_reclaimable; - uint64_t kms_notify_slab_dead; - uint64_t kms_notify_slab_destroyed; - uint64_t kms_alloc_fail; - uint64_t kms_constructor_fail; - uint64_t kms_dead_slabs_freed; - uint64_t kms_defrags; - uint64_t kms_scans; - uint64_t kms_scan_depot_ws_reaps; - uint64_t kms_debug_reaps; - uint64_t kms_debug_scans; -} kmem_move_stats; -#endif /* KMEM_STATS */ - /* consolidator knobs */ static boolean_t kmem_move_noreap; static boolean_t kmem_move_blocked; @@ -1141,6 +1108,7 @@ kmem_log_header_t *kmem_transaction_log; kmem_log_header_t *kmem_content_log; kmem_log_header_t *kmem_failure_log; kmem_log_header_t *kmem_slab_log; +kmem_log_header_t 
*kmem_zerosized_log; static int kmem_lite_count; /* # of PCs in kmem_buftag_lite_t */ @@ -1921,15 +1889,7 @@ kmem_slab_free(kmem_cache_t *cp, void *buf) cp->cache_complete_slab_count--; avl_add(&cp->cache_partial_slabs, sp); } else { -#ifdef DEBUG - if (avl_update_gt(&cp->cache_partial_slabs, sp)) { - KMEM_STAT_ADD(kmem_move_stats.kms_avl_update); - } else { - KMEM_STAT_ADD(kmem_move_stats.kms_avl_noupdate); - } -#else (void) avl_update_gt(&cp->cache_partial_slabs, sp); -#endif } ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) == @@ -2941,8 +2901,33 @@ kmem_alloc(size_t size, int kmflag) /* fall through to kmem_cache_alloc() */ } else { - if (size == 0) + if (size == 0) { + if (kmflag != KM_SLEEP && !(kmflag & KM_PANIC)) + return (NULL); + + /* + * If this is a sleeping allocation or one that has + * been specified to panic on allocation failure, we + * consider it to be deprecated behavior to allocate + * 0 bytes. If we have been configured to panic under + * this condition, we panic; if to warn, we warn -- and + * regardless, we log to the kmem_zerosized_log that + * that this condition has occurred (which gives us + * enough information to be able to debug it). 
+ */ + if (kmem_panic && kmem_panic_zerosized) + panic("attempted to kmem_alloc() size of 0"); + + if (kmem_warn_zerosized) { + cmn_err(CE_WARN, "kmem_alloc(): sleeping " + "allocation with size of 0; " + "see kmem_zerosized_log for details"); + } + + kmem_log_event(kmem_zerosized_log, NULL, NULL, NULL); + return (NULL); + } buf = vmem_alloc(kmem_oversize_arena, size, kmflag & KM_VMFLAGS); @@ -3556,7 +3541,7 @@ kmem_cache_kstat_update(kstat_t *ksp, int rw) kmcp->kmc_move_later.value.ui64 = kd->kmd_later; kmcp->kmc_move_dont_need.value.ui64 = kd->kmd_dont_need; kmcp->kmc_move_dont_know.value.ui64 = kd->kmd_dont_know; - kmcp->kmc_move_hunt_found.value.ui64 = kd->kmd_hunt_found; + kmcp->kmc_move_hunt_found.value.ui64 = 0; kmcp->kmc_move_slabs_freed.value.ui64 = kd->kmd_slabs_freed; kmcp->kmc_defrag.value.ui64 = kd->kmd_defrags; kmcp->kmc_scan.value.ui64 = kd->kmd_scans; @@ -4127,7 +4112,8 @@ kmem_cache_destroy(kmem_cache_t *cp) if (kmem_taskq != NULL) taskq_wait(kmem_taskq); - if (kmem_move_taskq != NULL) + + if (kmem_move_taskq != NULL && cp->cache_defrag != NULL) taskq_wait(kmem_move_taskq); kmem_cache_magazine_purge(cp); @@ -4465,8 +4451,8 @@ kmem_init(void) } kmem_failure_log = kmem_log_init(kmem_failure_log_size); - kmem_slab_log = kmem_log_init(kmem_slab_log_size); + kmem_zerosized_log = kmem_log_init(kmem_zerosized_log_size); /* * Initialize STREAMS message caches so allocb() is available. @@ -4654,94 +4640,6 @@ kmem_slab_is_reclaimable(kmem_cache_t *cp, kmem_slab_t *sp, int flags) (sp->slab_chunks * cp->cache_defrag->kmd_reclaim_numer)); } -static void * -kmem_hunt_mag(kmem_cache_t *cp, kmem_magazine_t *m, int n, void *buf, - void *tbuf) -{ - int i; /* magazine round index */ - - for (i = 0; i < n; i++) { - if (buf == m->mag_round[i]) { - if (cp->cache_flags & KMF_BUFTAG) { - (void) kmem_cache_free_debug(cp, tbuf, - caller()); - } - m->mag_round[i] = tbuf; - return (buf); - } - } - - return (NULL); -} - -/* - * Hunt the magazine layer for the given buffer. 
If found, the buffer is - * removed from the magazine layer and returned, otherwise NULL is returned. - * The state of the returned buffer is freed and constructed. - */ -static void * -kmem_hunt_mags(kmem_cache_t *cp, void *buf) -{ - kmem_cpu_cache_t *ccp; - kmem_magazine_t *m; - int cpu_seqid; - int n; /* magazine rounds */ - void *tbuf; /* temporary swap buffer */ - - ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); - - /* - * Allocated a buffer to swap with the one we hope to pull out of a - * magazine when found. - */ - tbuf = kmem_cache_alloc(cp, KM_NOSLEEP); - if (tbuf == NULL) { - KMEM_STAT_ADD(kmem_move_stats.kms_hunt_alloc_fail); - return (NULL); - } - if (tbuf == buf) { - KMEM_STAT_ADD(kmem_move_stats.kms_hunt_lucky); - if (cp->cache_flags & KMF_BUFTAG) { - (void) kmem_cache_free_debug(cp, buf, caller()); - } - return (buf); - } - - /* Hunt the depot. */ - mutex_enter(&cp->cache_depot_lock); - n = cp->cache_magtype->mt_magsize; - for (m = cp->cache_full.ml_list; m != NULL; m = m->mag_next) { - if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) { - mutex_exit(&cp->cache_depot_lock); - return (buf); - } - } - mutex_exit(&cp->cache_depot_lock); - - /* Hunt the per-CPU magazines. */ - for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) { - ccp = &cp->cache_cpu[cpu_seqid]; - - mutex_enter(&ccp->cc_lock); - m = ccp->cc_loaded; - n = ccp->cc_rounds; - if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) { - mutex_exit(&ccp->cc_lock); - return (buf); - } - m = ccp->cc_ploaded; - n = ccp->cc_prounds; - if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) { - mutex_exit(&ccp->cc_lock); - return (buf); - } - mutex_exit(&ccp->cc_lock); - } - - kmem_cache_free(cp, tbuf); - return (NULL); -} - /* * May be called from the kmem_move_taskq, from kmem_cache_move_notify_task(), * or when the buffer is freed. 
@@ -4805,7 +4703,7 @@ static void kmem_move_end(kmem_cache_t *, kmem_move_t *); * NO kmem frees the new buffer, marks the slab of the old buffer * non-reclaimable to avoid bothering the client again * LATER kmem frees the new buffer, increments slab_later_count - * DONT_KNOW kmem frees the new buffer, searches mags for the old buffer + * DONT_KNOW kmem frees the new buffer * DONT_NEED kmem frees both the old buffer and the new buffer * * The pending callback argument now being processed contains both of the @@ -4839,19 +4737,14 @@ kmem_move_buffer(kmem_move_t *callback) * another buffer on the same slab. */ if (!kmem_slab_is_reclaimable(cp, sp, callback->kmm_flags)) { - KMEM_STAT_ADD(kmem_move_stats.kms_no_longer_reclaimable); - KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY), - kmem_move_stats.kms_notify_no_longer_reclaimable); kmem_slab_free(cp, callback->kmm_to_buf); kmem_move_end(cp, callback); return; } /* - * Hunting magazines is expensive, so we'll wait to do that until the - * client responds KMEM_CBRC_DONT_KNOW. However, checking the slab layer - * is cheap, so we might as well do that here in case we can avoid - * bothering the client. + * Checking the slab layer is easy, so we might as well do that here + * in case we can avoid bothering the client. 
*/ mutex_enter(&cp->cache_lock); free_on_slab = (kmem_slab_allocated(cp, sp, @@ -4859,7 +4752,6 @@ kmem_move_buffer(kmem_move_t *callback) mutex_exit(&cp->cache_lock); if (free_on_slab) { - KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_slab); kmem_slab_free(cp, callback->kmm_to_buf); kmem_move_end(cp, callback); return; @@ -4871,7 +4763,6 @@ kmem_move_buffer(kmem_move_t *callback) */ if (kmem_cache_alloc_debug(cp, callback->kmm_to_buf, KM_NOSLEEP, 1, caller()) != 0) { - KMEM_STAT_ADD(kmem_move_stats.kms_alloc_fail); kmem_move_end(cp, callback); return; } @@ -4879,15 +4770,11 @@ kmem_move_buffer(kmem_move_t *callback) cp->cache_constructor(callback->kmm_to_buf, cp->cache_private, KM_NOSLEEP) != 0) { atomic_inc_64(&cp->cache_alloc_fail); - KMEM_STAT_ADD(kmem_move_stats.kms_constructor_fail); kmem_slab_free(cp, callback->kmm_to_buf); kmem_move_end(cp, callback); return; } - KMEM_STAT_ADD(kmem_move_stats.kms_callbacks); - KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY), - kmem_move_stats.kms_notify_callbacks); cp->cache_defrag->kmd_callbacks++; cp->cache_defrag->kmd_thread = curthread; cp->cache_defrag->kmd_from_buf = callback->kmm_from_buf; @@ -4905,7 +4792,6 @@ kmem_move_buffer(kmem_move_t *callback) cp->cache_defrag->kmd_to_buf = NULL; if (response == KMEM_CBRC_YES) { - KMEM_STAT_ADD(kmem_move_stats.kms_yes); cp->cache_defrag->kmd_yes++; kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE); /* slab safe to access until kmem_move_end() */ @@ -4920,14 +4806,12 @@ kmem_move_buffer(kmem_move_t *callback) switch (response) { case KMEM_CBRC_NO: - KMEM_STAT_ADD(kmem_move_stats.kms_no); cp->cache_defrag->kmd_no++; mutex_enter(&cp->cache_lock); kmem_slab_move_no(cp, sp, callback->kmm_from_buf); mutex_exit(&cp->cache_lock); break; case KMEM_CBRC_LATER: - KMEM_STAT_ADD(kmem_move_stats.kms_later); cp->cache_defrag->kmd_later++; mutex_enter(&cp->cache_lock); if (!KMEM_SLAB_IS_PARTIAL(sp)) { @@ -4936,7 +4820,6 @@ kmem_move_buffer(kmem_move_t *callback) } if 
(++sp->slab_later_count >= KMEM_DISBELIEF) { - KMEM_STAT_ADD(kmem_move_stats.kms_disbelief); kmem_slab_move_no(cp, sp, callback->kmm_from_buf); } else if (!(sp->slab_flags & KMEM_SLAB_NOMOVE)) { sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp, @@ -4945,7 +4828,6 @@ kmem_move_buffer(kmem_move_t *callback) mutex_exit(&cp->cache_lock); break; case KMEM_CBRC_DONT_NEED: - KMEM_STAT_ADD(kmem_move_stats.kms_dont_need); cp->cache_defrag->kmd_dont_need++; kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE); if (sp->slab_refcnt == 0) @@ -4955,19 +4837,21 @@ kmem_move_buffer(kmem_move_t *callback) mutex_exit(&cp->cache_lock); break; case KMEM_CBRC_DONT_KNOW: - KMEM_STAT_ADD(kmem_move_stats.kms_dont_know); + /* + * If we don't know if we can move this buffer or not, we'll + * just assume that we can't: if the buffer is in fact free, + * then it is sitting in one of the per-CPU magazines or in + * a full magazine in the depot layer. Either way, because + * defrag is induced in the same logic that reaps a cache, + * it's likely that full magazines will be returned to the + * system soon (thereby accomplishing what we're trying to + * accomplish here: return those magazines to their slabs). + * Given this, any work that we might do now to locate a buffer + * in a magazine is wasted (and expensive!) work; we bump + * a counter in this case and otherwise assume that we can't + * move it. 
+ */ cp->cache_defrag->kmd_dont_know++; - if (kmem_hunt_mags(cp, callback->kmm_from_buf) != NULL) { - KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_mag); - cp->cache_defrag->kmd_hunt_found++; - kmem_slab_free_constructed(cp, callback->kmm_from_buf, - B_TRUE); - if (sp->slab_refcnt == 0) - cp->cache_defrag->kmd_slabs_freed++; - mutex_enter(&cp->cache_lock); - kmem_slab_move_yes(cp, sp, callback->kmm_from_buf); - mutex_exit(&cp->cache_lock); - } break; default: panic("'%s' (%p) unexpected move callback response %d\n", @@ -4992,10 +4876,9 @@ kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags) ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING); callback = kmem_cache_alloc(kmem_move_cache, KM_NOSLEEP); - if (callback == NULL) { - KMEM_STAT_ADD(kmem_move_stats.kms_callback_alloc_fail); + + if (callback == NULL) return (B_FALSE); - } callback->kmm_from_slab = sp; callback->kmm_from_buf = buf; @@ -5020,7 +4903,6 @@ kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags) pending->kmm_flags |= KMM_DESPERATE; } mutex_exit(&cp->cache_lock); - KMEM_STAT_ADD(kmem_move_stats.kms_already_pending); kmem_cache_free(kmem_move_cache, callback); return (B_TRUE); } @@ -5034,7 +4916,6 @@ kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags) if (!taskq_dispatch(kmem_move_taskq, (task_func_t *)kmem_move_buffer, callback, TQ_NOSLEEP)) { - KMEM_STAT_ADD(kmem_move_stats.kms_callback_taskq_fail); mutex_enter(&cp->cache_lock); avl_remove(&cp->cache_defrag->kmd_moves_pending, callback); mutex_exit(&cp->cache_lock); @@ -5080,7 +4961,6 @@ kmem_move_end(kmem_cache_t *cp, kmem_move_t *callback) cp->cache_slab_destroy++; mutex_exit(&cp->cache_lock); kmem_slab_destroy(cp, sp); - KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed); mutex_enter(&cp->cache_lock); } } @@ -5225,8 +5105,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs, * pending move completes. */ list_insert_head(deadlist, sp); - KMEM_STAT_ADD(kmem_move_stats. 
- kms_endscan_slab_dead); return (-1); } @@ -5241,10 +5119,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs, cp->cache_slab_destroy++; mutex_exit(&cp->cache_lock); kmem_slab_destroy(cp, sp); - KMEM_STAT_ADD(kmem_move_stats. - kms_dead_slabs_freed); - KMEM_STAT_ADD(kmem_move_stats. - kms_endscan_slab_destroyed); mutex_enter(&cp->cache_lock); /* * Since we can't pick up the scan where we left @@ -5260,8 +5134,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs, * for the request and say nothing about the * number of reclaimable slabs. */ - KMEM_STAT_COND_ADD(s < max_slabs, - kmem_move_stats.kms_endscan_nomem); return (-1); } @@ -5277,16 +5149,10 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs, * destination buffer on the same slab. In that * case, we're not interested in counting it. */ - KMEM_STAT_COND_ADD(!(flags & KMM_DEBUG) && - (s < max_slabs), - kmem_move_stats.kms_endscan_refcnt_changed); return (-1); } - if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove) { - KMEM_STAT_COND_ADD(s < max_slabs, - kmem_move_stats.kms_endscan_nomove_changed); + if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove) return (-1); - } /* * Generating a move request allocates a destination @@ -5313,11 +5179,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs, } end_scan: - KMEM_STAT_COND_ADD(!(flags & KMM_DEBUG) && - (s < max_slabs) && - (sp == avl_first(&cp->cache_partial_slabs)), - kmem_move_stats.kms_endscan_freelist); - return (s); } @@ -5377,8 +5238,6 @@ kmem_cache_move_notify_task(void *arg) &cp->cache_defrag->kmd_moves_pending)) { list_insert_head(deadlist, sp); mutex_exit(&cp->cache_lock); - KMEM_STAT_ADD(kmem_move_stats. - kms_notify_slab_dead); return; } @@ -5386,9 +5245,6 @@ kmem_cache_move_notify_task(void *arg) cp->cache_slab_destroy++; mutex_exit(&cp->cache_lock); kmem_slab_destroy(cp, sp); - KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed); - KMEM_STAT_ADD(kmem_move_stats. 
- kms_notify_slab_destroyed); return; } } else { @@ -5402,7 +5258,6 @@ kmem_cache_move_notify(kmem_cache_t *cp, void *buf) { kmem_move_notify_args_t *args; - KMEM_STAT_ADD(kmem_move_stats.kms_notify); args = kmem_alloc(sizeof (kmem_move_notify_args_t), KM_NOSLEEP); if (args != NULL) { args->kmna_cache = cp; @@ -5425,7 +5280,6 @@ kmem_cache_defrag(kmem_cache_t *cp) n = avl_numnodes(&cp->cache_partial_slabs); if (n > 1) { /* kmem_move_buffers() drops and reacquires cache_lock */ - KMEM_STAT_ADD(kmem_move_stats.kms_defrags); cp->cache_defrag->kmd_defrags++; (void) kmem_move_buffers(cp, n, 0, KMM_DESPERATE); } @@ -5524,7 +5378,6 @@ kmem_cache_scan(kmem_cache_t *cp) * * kmem_move_buffers() drops and reacquires cache_lock. */ - KMEM_STAT_ADD(kmem_move_stats.kms_scans); kmd->kmd_scans++; slabs_found = kmem_move_buffers(cp, kmem_reclaim_scan_range, kmem_reclaim_max_slabs, 0); @@ -5565,12 +5418,9 @@ kmem_cache_scan(kmem_cache_t *cp) if (!kmem_move_noreap && ((debug_rand % kmem_mtb_reap) == 0)) { mutex_exit(&cp->cache_lock); - KMEM_STAT_ADD(kmem_move_stats.kms_debug_reaps); kmem_cache_reap(cp); return; } else if ((debug_rand % kmem_mtb_move) == 0) { - KMEM_STAT_ADD(kmem_move_stats.kms_scans); - KMEM_STAT_ADD(kmem_move_stats.kms_debug_scans); kmd->kmd_scans++; (void) kmem_move_buffers(cp, kmem_reclaim_scan_range, 1, KMM_DEBUG); @@ -5581,8 +5431,6 @@ kmem_cache_scan(kmem_cache_t *cp) mutex_exit(&cp->cache_lock); - if (reap) { - KMEM_STAT_ADD(kmem_move_stats.kms_scan_depot_ws_reaps); + if (reap) kmem_depot_ws_reap(cp); - } } diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c index 149f5f8a88..cbc4fa0000 100644 --- a/usr/src/uts/common/os/logsubr.c +++ b/usr/src/uts/common/os/logsubr.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2013 Gary Mills * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016, Joyent, Inc. 
*/ #include <sys/types.h> @@ -249,8 +250,7 @@ log_init(void) */ printf("\rSunOS Release %s Version %s %u-bit\n", utsname.release, utsname.version, NBBY * (uint_t)sizeof (void *)); - printf("Copyright (c) 1983, 2010, Oracle and/or its affiliates. " - "All rights reserved.\n"); + printf("Copyright (c) 2010-2016, Joyent Inc. All rights reserved.\n"); #ifdef DEBUG printf("DEBUG enabled\n"); #endif diff --git a/usr/src/uts/common/os/lwp.c b/usr/src/uts/common/os/lwp.c index feb8e76c42..3aaf2c746c 100644 --- a/usr/src/uts/common/os/lwp.c +++ b/usr/src/uts/common/os/lwp.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #include <sys/param.h> @@ -57,6 +57,8 @@ #include <sys/lgrp.h> #include <sys/rctl.h> #include <sys/contract_impl.h> +#include <sys/contract/process.h> +#include <sys/contract/process_impl.h> #include <sys/cpc_impl.h> #include <sys/sdt.h> #include <sys/cmn_err.h> @@ -115,7 +117,7 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p, ret_tidhash_t *ret_tidhash = NULL; int i; int rctlfail = 0; - boolean_t branded = 0; + void *brand_data = NULL; struct ctxop *ctx = NULL; ASSERT(cid != sysdccid); /* system threads must start in SYS */ @@ -283,6 +285,19 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p, */ lep = kmem_zalloc(sizeof (*lep), KM_SLEEP); + /* + * If necessary, speculatively allocate lwp brand data. This is done + * ahead of time so p_lock need not be dropped during lwp branding. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_lwpdata_alloc != NULL) { + if ((brand_data = BROP(p)->b_lwpdata_alloc(p)) == NULL) { + mutex_enter(&p->p_lock); + err = 1; + atomic_inc_32(&p->p_zone->zone_ffmisc); + goto error; + } + } + mutex_enter(&p->p_lock); grow: /* @@ -630,18 +645,6 @@ grow: } while (lwp_hash_lookup(p, t->t_tid) != NULL); } - /* - * If this is a branded process, let the brand do any necessary lwp - * initialization. 
- */ - if (PROC_IS_BRANDED(p)) { - if (BROP(p)->b_initlwp(lwp)) { - err = 1; - atomic_inc_32(&p->p_zone->zone_ffmisc); - goto error; - } - branded = 1; - } if (t->t_tid == 1) { kpreempt_disable(); @@ -654,7 +657,6 @@ grow: } } - p->p_lwpcnt++; t->t_waitfor = -1; /* @@ -696,8 +698,27 @@ grow: t->t_post_sys = 1; /* + * Perform lwp branding + * + * The b_initlwp hook is _not_ allowed to drop p->p_lock as it must be + * continuously held between when the tidhash is sized and when the lwp + * is inserted into it. Operations requiring p->p_lock to be + * temporarily dropped can be performed in b_initlwp_post. + */ + if (PROC_IS_BRANDED(p)) { + BROP(p)->b_initlwp(lwp, brand_data); + /* + * The b_initlwp hook is expected to consume any preallocated + * brand_data in a way that prepares it for deallocation by the + * b_freelwp hook. + */ + brand_data = NULL; + } + + /* * Insert the new thread into the list of all threads. */ + p->p_lwpcnt++; if ((tx = p->p_tlist) == NULL) { t->t_back = t; t->t_forw = t; @@ -718,6 +739,13 @@ grow: lep->le_start = t->t_start; lwp_hash_in(p, lep, p->p_tidhash, p->p_tidhash_sz, 1); + /* + * Complete lwp branding + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_initlwp_post != NULL) { + BROP(p)->b_initlwp_post(lwp); + } + if (state == TS_RUN) { /* * We set the new lwp running immediately. @@ -753,8 +781,9 @@ error: if (cid != NOCLASS && bufp != NULL) CL_FREE(cid, bufp); - if (branded) - BROP(p)->b_freelwp(lwp); + if (brand_data != NULL) { + BROP(p)->b_lwpdata_free(brand_data); + } mutex_exit(&p->p_lock); t->t_state = TS_FREE; @@ -827,8 +856,25 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src) int i; for (i = 0; i < ct_ntypes; i++) { - dst->lwp_ct_active[i] = ctmpl_dup(src->lwp_ct_active[i]); + ct_template_t *tmpl = src->lwp_ct_active[i]; + + /* + * If the process contract template is setup to be preserved + * across exec, then perform an implicit template_clear now + * since we're forking. 
This ensures that future children of + * this child will remain in the same contract unless they're + * explicitly setup differently. + */ + if (i == CTT_PROCESS && tmpl != NULL) { + ctmpl_process_t *ctp = tmpl->ctmpl_data; + + if ((ctp->ctp_params & CT_PR_KEEP_EXEC) != 0) + tmpl = NULL; + } + + dst->lwp_ct_active[i] = ctmpl_dup(tmpl); dst->lwp_ct_latest[i] = NULL; + } } @@ -836,21 +882,33 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src) * Clear an LWP's contract template state. */ void -lwp_ctmpl_clear(klwp_t *lwp) +lwp_ctmpl_clear(klwp_t *lwp, boolean_t is_exec) { ct_template_t *tmpl; int i; for (i = 0; i < ct_ntypes; i++) { - if ((tmpl = lwp->lwp_ct_active[i]) != NULL) { - ctmpl_free(tmpl); - lwp->lwp_ct_active[i] = NULL; - } - if (lwp->lwp_ct_latest[i] != NULL) { contract_rele(lwp->lwp_ct_latest[i]); lwp->lwp_ct_latest[i] = NULL; } + + if ((tmpl = lwp->lwp_ct_active[i]) != NULL) { + /* + * If we're exec-ing a new program and the process + * contract template is setup to be preserved across + * exec, then don't clear it. + */ + if (is_exec && i == CTT_PROCESS) { + ctmpl_process_t *ctp = tmpl->ctmpl_data; + + if ((ctp->ctp_params & CT_PR_KEEP_EXEC) != 0) + continue; + } + + ctmpl_free(tmpl); + lwp->lwp_ct_active[i] = NULL; + } } } @@ -891,13 +949,6 @@ lwp_exit(void) if (t->t_upimutex != NULL) upimutex_cleanup(); - /* - * Perform any brand specific exit processing, then release any - * brand data associated with the lwp - */ - if (PROC_IS_BRANDED(p)) - BROP(p)->b_lwpexit(lwp); - lwp_pcb_exit(); mutex_enter(&p->p_lock); @@ -941,6 +992,18 @@ lwp_exit(void) DTRACE_PROC(lwp__exit); /* + * Perform any brand specific exit processing, then release any + * brand data associated with the lwp + */ + if (PROC_IS_BRANDED(p)) { + mutex_exit(&p->p_lock); + BROP(p)->b_lwpexit(lwp); + BROP(p)->b_freelwp(lwp); + mutex_enter(&p->p_lock); + prbarrier(p); + } + + /* * If the lwp is a detached lwp or if the process is exiting, * remove (lwp_hash_out()) the lwp from the lwp directory. 
* Otherwise null out the lwp's le_thread pointer in the lwp @@ -1101,7 +1164,7 @@ lwp_cleanup(void) } kpreempt_enable(); - lwp_ctmpl_clear(ttolwp(t)); + lwp_ctmpl_clear(ttolwp(t), B_FALSE); } int diff --git a/usr/src/uts/common/os/main.c b/usr/src/uts/common/os/main.c index 7afc1cfe00..dda0b3e4a6 100644 --- a/usr/src/uts/common/os/main.c +++ b/usr/src/uts/common/os/main.c @@ -27,7 +27,7 @@ /* All Rights Reserved */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #include <sys/types.h> @@ -157,7 +157,7 @@ exec_init(const char *initpath, const char *args) int error = 0, count = 0; proc_t *p = ttoproc(curthread); klwp_t *lwp = ttolwp(curthread); - int brand_action; + int brand_action = EBA_NONE; if (args == NULL) args = ""; @@ -268,7 +268,15 @@ exec_init(const char *initpath, const char *args) */ sigemptyset(&curthread->t_hold); - brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE; + /* + * Only instruct exec_common to brand the process if necessary. It is + * possible that the init process is already properly branded due to the + * proc_exit -> restart_init -> exec_init call chain. + */ + if (ZONE_IS_BRANDED(p->p_zone) && + p->p_brand != p->p_zone->zone_brand) { + brand_action = EBA_BRAND; + } again: error = exec_common((const char *)(uintptr_t)exec_fnamep, (const char **)(uintptr_t)uap, NULL, brand_action); diff --git a/usr/src/uts/common/os/msacct.c b/usr/src/uts/common/os/msacct.c index 928c6b3bb4..66994321f7 100644 --- a/usr/src/uts/common/os/msacct.c +++ b/usr/src/uts/common/os/msacct.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. 
*/ #include <sys/types.h> @@ -566,27 +567,18 @@ cpu_update_pct(kthread_t *t, hrtime_t newtime) */ do { - if (T_ONPROC(t) && t->t_waitrq == 0) { - hrlb = t->t_hrtime; + pctcpu = t->t_pctcpu; + hrlb = t->t_hrtime; + delta = newtime - hrlb; + if (delta < 0) { + newtime = gethrtime_unscaled(); delta = newtime - hrlb; - if (delta < 0) { - newtime = gethrtime_unscaled(); - delta = newtime - hrlb; - } - t->t_hrtime = newtime; - scalehrtime(&delta); - pctcpu = t->t_pctcpu; + } + t->t_hrtime = newtime; + scalehrtime(&delta); + if (T_ONPROC(t) && t->t_waitrq == 0) { npctcpu = cpu_grow(pctcpu, delta); } else { - hrlb = t->t_hrtime; - delta = newtime - hrlb; - if (delta < 0) { - newtime = gethrtime_unscaled(); - delta = newtime - hrlb; - } - t->t_hrtime = newtime; - scalehrtime(&delta); - pctcpu = t->t_pctcpu; npctcpu = cpu_decay(pctcpu, delta); } } while (atomic_cas_32(&t->t_pctcpu, pctcpu, npctcpu) != pctcpu); diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c index b555bb82b7..eba6147fab 100644 --- a/usr/src/uts/common/os/pid.c +++ b/usr/src/uts/common/os/pid.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -112,6 +113,18 @@ pid_lookup(pid_t pid) return (pidp); } +struct pid * +pid_find(pid_t pid) +{ + struct pid *pidp; + + mutex_enter(&pidlinklock); + pidp = pid_lookup(pid); + mutex_exit(&pidlinklock); + + return (pidp); +} + void pid_setmin(void) { @@ -522,6 +535,20 @@ sprunlock(proc_t *p) THREAD_KPRI_RELEASE(); } +/* + * Undo effects of sprlock but without dropping p->p_lock + */ +void +sprunprlock(proc_t *p) +{ + ASSERT(p->p_proc_flag & P_PR_LOCK); + ASSERT(MUTEX_HELD(&p->p_lock)); + + cv_signal(&pr_pid_cv[p->p_slot]); + p->p_proc_flag &= ~P_PR_LOCK; + THREAD_KPRI_RELEASE(); +} + void pid_init(void) { diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c index 07bc2920da..d2bdb4ce37 100644 --- a/usr/src/uts/common/os/policy.c +++ b/usr/src/uts/common/os/policy.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/types.h> @@ -55,6 +55,7 @@ #include <sys/mntent.h> #include <sys/contract_impl.h> #include <sys/dld_ioc.h> +#include <sys/brand.h> /* * There are two possible layers of privilege routines and two possible @@ -1243,6 +1244,22 @@ secpolicy_vnode_owner(const cred_t *cr, uid_t owner) void secpolicy_setid_clear(vattr_t *vap, cred_t *cr) { + proc_t *p = curproc; + + /* + * Allow the brand to override this behaviour. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_setid_clear != NULL) { + /* + * This brand hook will return 0 if handling is complete, or + * some other value if the brand would like us to fall back to + * the usual behaviour. 
+ */ + if (BROP(p)->b_setid_clear(vap, cr) == 0) { + return; + } + } + if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(cr, (vap->va_mode & S_ISUID) != 0 && @@ -2078,6 +2095,13 @@ secpolicy_meminfo(const cred_t *cr) } int +secpolicy_fs_import(const cred_t *cr) +{ + return (PRIV_POLICY(cr, PRIV_SYS_FS_IMPORT, B_FALSE, EPERM, NULL)); +} + + +int secpolicy_pfexec_register(const cred_t *cr) { return (PRIV_POLICY(cr, PRIV_SYS_ADMIN, B_TRUE, EPERM, NULL)); @@ -2581,3 +2605,11 @@ secpolicy_ppp_config(const cred_t *cr) return (secpolicy_net_config(cr, B_FALSE)); return (PRIV_POLICY(cr, PRIV_SYS_PPP_CONFIG, B_FALSE, EPERM, NULL)); } + +int +secpolicy_hyprlofs_control(const cred_t *cr) +{ + if (PRIV_POLICY(cr, PRIV_HYPRLOFS_CONTROL, B_FALSE, EPERM, NULL)) + return (EPERM); + return (0); +} diff --git a/usr/src/uts/common/os/priv_defs b/usr/src/uts/common/os/priv_defs index a3cdaccc2a..cc1c5e03a6 100644 --- a/usr/src/uts/common/os/priv_defs +++ b/usr/src/uts/common/os/priv_defs @@ -177,6 +177,10 @@ privilege PRIV_GRAPHICS_MAP Allows a process to perform privileged mappings through a graphics device. +privilege PRIV_HYPRLOFS_CONTROL + + Allows a process to manage hyprlofs entries. + privilege PRIV_IPC_DAC_READ Allows a process to read a System V IPC @@ -372,6 +376,10 @@ privilege PRIV_SYS_DEVICES Allows a process to open the real console device directly. Allows a process to open devices that have been exclusively opened. +privilege PRIV_SYS_FS_IMPORT + + Allows a process to import a potentially untrusted file system. + privilege PRIV_SYS_IPC_CONFIG Allows a process to increase the size of a System V IPC Message diff --git a/usr/src/uts/common/os/sched.c b/usr/src/uts/common/os/sched.c index c1d6569f11..15e77d39f7 100644 --- a/usr/src/uts/common/os/sched.c +++ b/usr/src/uts/common/os/sched.c @@ -27,6 +27,10 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ +/* + * Copyright (c) 2015, Joyent, Inc. All rights reserved. 
+ */ + #include <sys/param.h> #include <sys/types.h> #include <sys/sysmacros.h> @@ -646,16 +650,17 @@ top: klwp_t *lwp = ttolwp(tp); /* - * Swapout eligible lwps (specified by the scheduling - * class) which don't have TS_DONT_SWAP set. Set the - * "intent to swap" flag (TS_SWAPENQ) on threads - * which have TS_DONT_SWAP set so that they can be + * Swapout eligible lwps (specified by the scheduling class) + * which don't have TS_DONT_SWAP set. Set the "intent to swap" + * flag (TS_SWAPENQ) on threads which have either TS_DONT_SWAP + * set or are currently on a split stack so that they can be * swapped if and when they reach a safe point. */ thread_lock(tp); thread_pri = CL_SWAPOUT(tp, swapflags); if (thread_pri != -1) { - if (tp->t_schedflag & TS_DONT_SWAP) { + if ((tp->t_schedflag & TS_DONT_SWAP) || + (tp->t_flag & T_SPLITSTK)) { tp->t_schedflag |= TS_SWAPENQ; tp->t_trapret = 1; aston(tp); diff --git a/usr/src/uts/common/os/sig.c b/usr/src/uts/common/os/sig.c index 453b1f22d4..5ef12f3ae4 100644 --- a/usr/src/uts/common/os/sig.c +++ b/usr/src/uts/common/os/sig.c @@ -22,7 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -60,6 +60,7 @@ #include <sys/cyclic.h> #include <sys/dtrace.h> #include <sys/sdt.h> +#include <sys/brand.h> #include <sys/signalfd.h> const k_sigset_t nullsmask = {0, 0, 0}; @@ -148,6 +149,21 @@ signal_is_blocked(kthread_t *t, int sig) } /* + * Return true if the signal can safely be ignored. + * That is, if the signal is included in the p_ignore mask and doing so is not + * forbidden by any process branding. 
+ */ +static int +sig_ignorable(proc_t *p, klwp_t *lwp, int sig) +{ + return (sigismember(&p->p_ignore, sig) && /* sig in ignore mask */ + !(PROC_IS_BRANDED(p) && /* allowed by brand */ + BROP(p)->b_sig_ignorable != NULL && + BROP(p)->b_sig_ignorable(p, lwp, sig) == B_FALSE)); + +} + +/* * Return true if the signal can safely be discarded on generation. * That is, if there is no need for the signal on the receiving end. * The answer is true if the process is a zombie or @@ -159,12 +175,13 @@ signal_is_blocked(kthread_t *t, int sig) * the signal is not being accepted via sigwait() */ static int -sig_discardable(proc_t *p, int sig) +sig_discardable(proc_t *p, kthread_t *tp, int sig) { kthread_t *t = p->p_tlist; + klwp_t *lwp = (tp == NULL) ? NULL : tp->t_lwp; return (t == NULL || /* if zombie or ... */ - (sigismember(&p->p_ignore, sig) && /* signal is ignored */ + (sig_ignorable(p, lwp, sig) && /* signal is ignored */ t->t_forw == t && /* and single-threaded */ !tracing(p, sig) && /* and no /proc tracing */ !signal_is_blocked(t, sig) && /* and signal not blocked */ @@ -200,7 +217,7 @@ eat_signal(kthread_t *t, int sig) !(ttoproc(t)->p_proc_flag & P_PR_LOCK)) { ttoproc(t)->p_stopsig = 0; t->t_dtrace_stop = 0; - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; setrun_locked(t); } else if (t != curthread && t->t_state == TS_ONPROC) { aston(t); /* make it do issig promptly */ @@ -297,7 +314,7 @@ sigtoproc(proc_t *p, kthread_t *t, int sig) } } - if (sig_discardable(p, sig)) { + if (sig_discardable(p, t, sig)) { DTRACE_PROC3(signal__discard, kthread_t *, p->p_tlist, proc_t *, p, int, sig); return; @@ -497,7 +514,7 @@ issig_justlooking(void) if (sigismember(&set, sig) && (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig))) { + !sig_ignorable(p, lwp, sig))) { /* * Don't promote a signal that will stop * the process when lwp_nostop is set. 
@@ -623,6 +640,21 @@ issig_forreal(void) } /* + * Allow the brand the chance to alter (or suppress) delivery + * of this signal. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_issig_stop != NULL) { + /* + * The brand hook will return 0 if it would like + * us to drive on, or -1 if we should restart + * the loop to check other conditions. + */ + if (BROP(p)->b_issig_stop(p, lwp) != 0) { + continue; + } + } + + /* * Honor requested stop before dealing with the * current signal; a debugger may change it. * Do not want to go back to loop here since this is a special @@ -656,7 +688,7 @@ issig_forreal(void) lwp->lwp_cursig = 0; lwp->lwp_extsig = 0; if (sigismember(&t->t_sigwait, sig) || - (!sigismember(&p->p_ignore, sig) && + (!sig_ignorable(p, lwp, sig) && !isjobstop(sig))) { if (p->p_flag & (SEXITLWPS|SKILLED)) { sig = SIGKILL; @@ -708,7 +740,7 @@ issig_forreal(void) toproc = 0; if (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig)) { + !sig_ignorable(p, lwp, sig)) { if (sigismember(&t->t_extsig, sig)) ext = 1; break; @@ -722,7 +754,7 @@ issig_forreal(void) toproc = 1; if (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig)) { + !sig_ignorable(p, lwp, sig)) { if (sigismember(&p->p_extsig, sig)) ext = 1; break; @@ -954,6 +986,16 @@ stop(int why, int what) } break; + case PR_BRAND: + /* + * We have been stopped by the brand code for a brand-private + * reason. This is an asynchronous stop affecting only this + * LWP. 
+ */ + VERIFY(PROC_IS_BRANDED(p)); + flags &= ~TS_BSTART; + break; + default: /* /proc stop */ flags &= ~TS_PSTART; /* @@ -1065,7 +1107,7 @@ stop(int why, int what) } } - if (why != PR_JOBCONTROL && why != PR_CHECKPOINT) { + if (why != PR_JOBCONTROL && why != PR_CHECKPOINT && why != PR_BRAND) { /* * Do process-level notification when all lwps are * either stopped on events of interest to /proc @@ -1171,6 +1213,13 @@ stop(int why, int what) if (why == PR_CHECKPOINT) del_one_utstop(); + /* + * Allow the brand to post notification of this stop condition. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_stop_notify != NULL) { + BROP(p)->b_stop_notify(p, lwp, why, what); + } + thread_lock(t); ASSERT((t->t_schedflag & TS_ALLSTART) == 0); t->t_schedflag |= flags; @@ -1192,7 +1241,7 @@ stop(int why, int what) (p->p_flag & (SEXITLWPS|SKILLED))) { p->p_stopsig = 0; thread_lock(t); - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; setrun_locked(t); thread_unlock_nopreempt(t); } else if (why == PR_JOBCONTROL) { @@ -1327,7 +1376,7 @@ psig(void) * this signal from pending to current (we dropped p->p_lock). * This can happen only in a multi-threaded process. */ - if (sigismember(&p->p_ignore, sig) || + if (sig_ignorable(p, lwp, sig) || (func == SIG_DFL && sigismember(&stopdefault, sig))) { lwp->lwp_cursig = 0; lwp->lwp_extsig = 0; @@ -1771,9 +1820,12 @@ post_sigcld(proc_t *cp, sigqueue_t *sqp) /* * This can only happen when the parent is init. * (See call to sigcld(q, NULL) in exit().) - * Use KM_NOSLEEP to avoid deadlock. + * Use KM_NOSLEEP to avoid deadlock. The child procs + * initpid can be 1 for zlogin. 
*/ - ASSERT(pp == proc_init); + ASSERT(pp->p_pidp->pid_id == + cp->p_zone->zone_proc_initpid || + pp->p_pidp->pid_id == 1); winfo(cp, &info, 0); sigaddq(pp, NULL, &info, KM_NOSLEEP); } else { @@ -1804,6 +1856,15 @@ sigcld_repost() sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); mutex_enter(&pidlock); + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_sigcld_repost != NULL) { + /* + * Allow the brand to inject synthetic SIGCLD signals. + */ + if (BROP(pp)->b_sigcld_repost(pp, sqp) == 0) { + mutex_exit(&pidlock); + return; + } + } for (cp = pp->p_child; cp; cp = cp->p_sibling) { if (cp->p_pidflag & CLDPEND) { post_sigcld(cp, sqp); @@ -2115,7 +2176,7 @@ sigaddqa(proc_t *p, kthread_t *t, sigqueue_t *sigqp) ASSERT(MUTEX_HELD(&p->p_lock)); ASSERT(sig >= 1 && sig < NSIG); - if (sig_discardable(p, sig)) + if (sig_discardable(p, t, sig)) siginfofree(sigqp); else sigaddqins(p, t, sigqp); @@ -2141,7 +2202,7 @@ sigaddq(proc_t *p, kthread_t *t, k_siginfo_t *infop, int km_flags) * blocking the signal (it *could* change it's mind while * the signal is pending) then don't bother creating one. */ - if (!sig_discardable(p, sig) && + if (!sig_discardable(p, t, sig) && (sigismember(&p->p_siginfo, sig) || (curproc->p_ct_process != p->p_ct_process) || (sig == SIGCLD && SI_FROMKERNEL(infop))) && diff --git a/usr/src/uts/common/os/smb_subr.c b/usr/src/uts/common/os/smb_subr.c index 6084676b17..6dc7230bed 100644 --- a/usr/src/uts/common/os/smb_subr.c +++ b/usr/src/uts/common/os/smb_subr.c @@ -25,7 +25,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2015 Joyent, Inc. All rights reserved. + */ #include <sys/smbios_impl.h> #include <sys/cmn_err.h> @@ -43,13 +45,13 @@ smb_strerror(int err) void * smb_alloc(size_t len) { - return (kmem_alloc(len, KM_SLEEP)); + return (len > 0 ? kmem_alloc(len, KM_SLEEP) : NULL); } void * smb_zalloc(size_t len) { - return (kmem_zalloc(len, KM_SLEEP)); + return (len > 0 ? 
kmem_zalloc(len, KM_SLEEP) : NULL); } void diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c index 62f94729cf..0a1406e0cd 100644 --- a/usr/src/uts/common/os/streamio.c +++ b/usr/src/uts/common/os/streamio.c @@ -24,7 +24,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -77,6 +77,7 @@ #include <sys/policy.h> #include <sys/dld.h> #include <sys/zone.h> +#include <sys/limits.h> #include <c2/audit.h> /* @@ -985,12 +986,20 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, * (registered in sd_wakeq). */ struiod_t uiod; + struct iovec buf[IOV_MAX_STACK]; + int iovlen = 0; if (first) stp->sd_wakeq &= ~RSLEEP; - (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, - sizeof (uiod.d_iov) / sizeof (*uiod.d_iov)); + if (uiop->uio_iovcnt > IOV_MAX_STACK) { + iovlen = uiop->uio_iovcnt * sizeof (iovec_t); + uiod.d_iov = kmem_alloc(iovlen, KM_SLEEP); + } else { + uiod.d_iov = buf; + } + + (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt); uiod.d_mp = 0; /* * Mark that a thread is in rwnext on the read side @@ -1029,6 +1038,8 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, if ((bp = uiod.d_mp) != NULL) { *errorp = 0; ASSERT(MUTEX_HELD(&stp->sd_lock)); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (bp); } error = 0; @@ -1048,8 +1059,14 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, } else { *errorp = error; ASSERT(MUTEX_HELD(&stp->sd_lock)); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (NULL); } + + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); + /* * Try a getq in case a rwnext() generated mblk * has bubbled up via strrput(). 
@@ -2544,6 +2561,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, int b_flag, int pri, int flags) { struiod_t uiod; + struct iovec buf[IOV_MAX_STACK]; + int iovlen = 0; mblk_t *mp; queue_t *wqp = stp->sd_wrq; int error = 0; @@ -2635,13 +2654,21 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, mp->b_flag |= b_flag; mp->b_band = (uchar_t)pri; - (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, - sizeof (uiod.d_iov) / sizeof (*uiod.d_iov)); + if (uiop->uio_iovcnt > IOV_MAX_STACK) { + iovlen = uiop->uio_iovcnt * sizeof (iovec_t); + uiod.d_iov = (struct iovec *)kmem_alloc(iovlen, KM_SLEEP); + } else { + uiod.d_iov = buf; + } + + (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt); uiod.d_uio.uio_offset = 0; uiod.d_mp = mp; error = rwnext(wqp, &uiod); if (! uiod.d_mp) { uioskip(uiop, *iosize); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (error); } ASSERT(mp == uiod.d_mp); @@ -2659,17 +2686,23 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, error = 0; } else { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (error); } /* Have to check canput before consuming data from the uio */ if (pri == 0) { if (!canputnext(wqp) && !(flags & MSG_IGNFLOW)) { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (EWOULDBLOCK); } } else { if (!bcanputnext(wqp, pri) && !(flags & MSG_IGNFLOW)) { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (EWOULDBLOCK); } } @@ -2677,6 +2710,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, /* Copyin data from the uio */ if ((error = struioget(wqp, mp, &uiod, 0)) != 0) { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (error); } uioskip(uiop, *iosize); @@ -2693,6 +2728,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, putnext(wqp, mp); stream_runservice(stp); } + if (iovlen != 0) + 
kmem_free(uiod.d_iov, iovlen); return (0); } @@ -3178,6 +3215,7 @@ job_control_type(int cmd) case JAGENT: /* Obsolete */ case JTRUN: /* Obsolete */ case JXTPROTO: /* Obsolete */ + case TIOCSETLD: return (JCSETP); } diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c index 0d1bb6a8a1..aa44ccf788 100644 --- a/usr/src/uts/common/os/sysent.c +++ b/usr/src/uts/common/os/sysent.c @@ -1093,18 +1093,20 @@ char **syscallnames; systrace_sysent_t *systrace_sysent; void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); /*ARGSUSED*/ void systrace_stub(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, - uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, + uintptr_t arg6, uintptr_t arg7) {} /*ARGSUSED*/ int64_t dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, + uintptr_t arg7) { systrace_sysent_t *sy = &systrace_sysent[curthread->t_sysnum]; dtrace_id_t id; @@ -1112,7 +1114,8 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, proc_t *p; if ((id = sy->stsy_entry) != DTRACE_IDNONE) - (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5); + (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, + arg6, arg7); /* * We want to explicitly allow DTrace consumers to stop a process @@ -1126,14 +1129,15 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, } mutex_exit(&p->p_lock); - rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); + rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, + arg6, arg7); if (ttolwp(curthread)->lwp_errno != 0) rval = -1; if ((id = sy->stsy_return) != DTRACE_IDNONE) (*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, - 
(uintptr_t)((int64_t)rval >> 32), 0, 0, 0); + (uintptr_t)((int64_t)rval >> 32), 0, 0, 0, 0, 0); return (rval); } @@ -1145,7 +1149,8 @@ systrace_sysent_t *systrace_sysent32; /*ARGSUSED*/ int64_t dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, + uintptr_t arg7) { systrace_sysent_t *sy = &systrace_sysent32[curthread->t_sysnum]; dtrace_id_t id; @@ -1153,7 +1158,8 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, proc_t *p; if ((id = sy->stsy_entry) != DTRACE_IDNONE) - (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5); + (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, arg6, + arg7); /* * We want to explicitly allow DTrace consumers to stop a process @@ -1167,14 +1173,15 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, } mutex_exit(&p->p_lock); - rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); + rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, arg6, + arg7); if (ttolwp(curthread)->lwp_errno != 0) rval = -1; if ((id = sy->stsy_return) != DTRACE_IDNONE) (*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, - (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0); + (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0, 0, 0); return (rval); } @@ -1202,5 +1209,5 @@ dtrace_systrace_rtt(void) } if ((id = sy->stsy_return) != DTRACE_IDNONE) - (*systrace_probe)(id, 0, 0, 0, 0, 0, 0); + (*systrace_probe)(id, 0, 0, 0, 0, 0, 0, 0, 0); } diff --git a/usr/src/uts/common/os/vmem.c b/usr/src/uts/common/os/vmem.c index a554f8c3f3..0a6fe0ef96 100644 --- a/usr/src/uts/common/os/vmem.c +++ b/usr/src/uts/common/os/vmem.c @@ -1618,7 +1618,7 @@ vmem_destroy(vmem_t *vmp) leaked = vmem_size(vmp, VMEM_ALLOC); if (leaked != 0) - cmn_err(CE_WARN, "vmem_destroy('%s'): leaked %lu %s", + cmn_err(CE_WARN, "!vmem_destroy('%s'): leaked %lu %s", vmp->vm_name, leaked, 
(vmp->vm_cflags & VMC_IDENTIFIER) ? "identifiers" : "bytes"); diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index 706e5ed16f..072b0038f1 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent Inc. All rights reserved. + * Copyright 2015, Joyent Inc. All rights reserved. */ /* @@ -250,6 +250,8 @@ #include <sys/cpucaps.h> #include <vm/seg.h> #include <sys/mac.h> +#include <sys/rt.h> +#include <sys/fx.h> /* * This constant specifies the number of seconds that threads waiting for @@ -370,8 +372,12 @@ static char *zone_ref_subsys_names[] = { rctl_hndl_t rc_zone_cpu_shares; rctl_hndl_t rc_zone_locked_mem; rctl_hndl_t rc_zone_max_swap; +rctl_hndl_t rc_zone_phys_mem; rctl_hndl_t rc_zone_max_lofi; rctl_hndl_t rc_zone_cpu_cap; +rctl_hndl_t rc_zone_cpu_baseline; +rctl_hndl_t rc_zone_cpu_burst_time; +rctl_hndl_t rc_zone_zfs_io_pri; rctl_hndl_t rc_zone_nlwps; rctl_hndl_t rc_zone_nprocs; rctl_hndl_t rc_zone_shmmax; @@ -417,8 +423,9 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *, * Version 5 alters the zone_boot system call, and converts its old * bootargs parameter to be set by the zone_setattr API instead. * Version 6 adds the flag argument to zone_create. + * Version 7 adds the requested zoneid to zone_create. */ -static const int ZONE_SYSCALL_API_VERSION = 6; +static const int ZONE_SYSCALL_API_VERSION = 7; /* * Certain filesystems (such as NFS and autofs) need to know which zone @@ -1377,6 +1384,114 @@ static rctl_ops_t zone_cpu_cap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_cpu_base_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_base(p->p_zone)); +} + +/* + * The zone cpu base is used to set the baseline CPU for the zone + * so we can track when the zone is bursting. 
+ */ +/*ARGSUSED*/ +static int +zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_base(zone, nv)); +} + +static rctl_ops_t zone_cpu_base_ops = { + rcop_no_action, + zone_cpu_base_get, + zone_cpu_base_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t +zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_burst_time(p->p_zone)); +} + +/* + * The zone cpu burst time is used to set the amount of time CPU(s) can be + * bursting for the zone. + */ +/*ARGSUSED*/ +static int +zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_burst_time(zone, nv)); +} + +static rctl_ops_t zone_cpu_burst_time_ops = { + rcop_no_action, + zone_cpu_burst_time_get, + zone_cpu_burst_time_set, + rcop_no_test +}; + +/* + * zone.zfs-io-pri resource control support (IO priority). + */ +/*ARGSUSED*/ +static rctl_qty_t +zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (p->p_zone->zone_zfs_io_pri); +} + +/*ARGSUSED*/ +static int +zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + /* + * set priority to the new value. 
+ */ + zone->zone_zfs_io_pri = nv; + return (0); +} + +static rctl_ops_t zone_zfs_io_pri_ops = { + rcop_no_action, + zone_zfs_io_pri_get, + zone_zfs_io_pri_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_lwps_usage(rctl_t *r, proc_t *p) { rctl_qty_t nlwps; @@ -1671,6 +1786,39 @@ static rctl_ops_t zone_max_swap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_phys_mem_usage(rctl_t *rctl, struct proc *p) +{ + rctl_qty_t q; + zone_t *z = p->p_zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + /* No additional lock because not enforced in the kernel */ + q = z->zone_phys_mem; + return (q); +} + +/*ARGSUSED*/ +static int +zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + if (e->rcep_p.zone == NULL) + return (0); + e->rcep_p.zone->zone_phys_mem_ctl = nv; + return (0); +} + +static rctl_ops_t zone_phys_mem_ops = { + rcop_no_action, + zone_phys_mem_usage, + zone_phys_mem_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_max_lofi_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; @@ -1764,6 +1912,20 @@ zone_lockedmem_kstat_update(kstat_t *ksp, int rw) } static int +zone_physmem_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_kstat_t *zk = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zk->zk_usage.value.ui64 = zone->zone_phys_mem; + zk->zk_value.value.ui64 = zone->zone_phys_mem_ctl; + return (0); +} + +static int zone_nprocs_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; @@ -1792,7 +1954,7 @@ zone_swapresv_kstat_update(kstat_t *ksp, int rw) } static kstat_t * -zone_kstat_create_common(zone_t *zone, char *name, +zone_rctl_kstat_create_common(zone_t *zone, char *name, int (*updatefunc) (kstat_t *, int)) { kstat_t *ksp; @@ -1818,6 +1980,230 @@ zone_kstat_create_common(zone_t *zone, char *name, } static int +zone_vfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = 
ksp->ks_private; + zone_vfs_kstat_t *zvp = ksp->ks_data; + kstat_io_t *kiop = &zone->zone_vfs_rwstats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Extract the VFS statistics from the kstat_io_t structure used by + * kstat_runq_enter() and related functions. Since the slow ops + * counters are updated directly by the VFS layer, there's no need to + * copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. + */ + zvp->zv_nread.value.ui64 = kiop->nread; + zvp->zv_reads.value.ui64 = kiop->reads; + zvp->zv_rtime.value.ui64 = kiop->rtime; + zvp->zv_rlentime.value.ui64 = kiop->rlentime; + zvp->zv_nwritten.value.ui64 = kiop->nwritten; + zvp->zv_writes.value.ui64 = kiop->writes; + zvp->zv_wtime.value.ui64 = kiop->wtime; + zvp->zv_wlentime.value.ui64 = kiop->wlentime; + + scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_vfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_vfs_kstat_t *zvp; + + if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id, + zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED, + sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_vfs_lock; + zone->zone_vfs_stats = zvp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zvp->zv_zonename, zone->zone_name); + kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64); + 
kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_vfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int +zone_zfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_zfs_kstat_t *zzp = ksp->ks_data; + kstat_io_t *kiop = &zone->zone_zfs_rwstats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Extract the ZFS statistics from the kstat_io_t structure used by + * kstat_runq_enter() and related functions. Since the I/O throttle + * counters are updated directly by the ZFS layer, there's no need to + * copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. 
+ */ + zzp->zz_nread.value.ui64 = kiop->nread; + zzp->zz_reads.value.ui64 = kiop->reads; + zzp->zz_rtime.value.ui64 = kiop->rtime; + zzp->zz_rlentime.value.ui64 = kiop->rlentime; + zzp->zz_nwritten.value.ui64 = kiop->nwritten; + zzp->zz_writes.value.ui64 = kiop->writes; + + scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64); + scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_zfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_zfs_kstat_t *zzp; + + if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id, + zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED, + sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_zfs_lock; + zone->zone_zfs_stats = zzp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zzp->zz_zonename, zone->zone_name); + kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_zfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int +zone_mcap_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_mcap_kstat_t *zmp = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zmp->zm_rss.value.ui64 = 
zone->zone_phys_mem; + zmp->zm_phys_cap.value.ui64 = zone->zone_phys_mem_ctl; + zmp->zm_swap.value.ui64 = zone->zone_max_swap; + zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl; + zmp->zm_nover.value.ui64 = zone->zone_mcap_nover; + zmp->zm_pagedout.value.ui64 = zone->zone_mcap_pagedout; + zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin; + zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin; + zmp->zm_execpgin.value.ui64 = zone->zone_execpgin; + zmp->zm_fspgin.value.ui64 = zone->zone_fspgin; + zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail; + zmp->zm_pf_throttle.value.ui64 = zone->zone_pf_throttle; + zmp->zm_pf_throttle_usec.value.ui64 = zone->zone_pf_throttle_usec; + + return (0); +} + +static kstat_t * +zone_mcap_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_mcap_kstat_t *zmp; + + if ((ksp = kstat_create_zone("memory_cap", zone->zone_id, + zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED, + sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_mcap_lock; + zone->zone_mcap_stats = zmp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zmp->zm_zonename, zone->zone_name); + kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64); + 
kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail", + KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle", + KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec", + KSTAT_DATA_UINT64); + + ksp->ks_update = zone_mcap_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int zone_misc_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; @@ -1846,6 +2232,11 @@ zone_misc_kstat_update(kstat_t *ksp, int rw) zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem; zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc; + zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp; + + zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid; + zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time; + return (0); } @@ -1884,7 +2275,10 @@ zone_misc_kstat_create(zone_t *zone) KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32); - + kstat_named_init(&zmp->zm_nested_intp, "nested_interp", + KSTAT_DATA_UINT32); + kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32); + kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64); ksp->ks_update = zone_misc_kstat_update; ksp->ks_private = zone; @@ -1896,13 +2290,30 @@ zone_misc_kstat_create(zone_t *zone) static void zone_kstat_create(zone_t *zone) { - zone->zone_lockedmem_kstat = zone_kstat_create_common(zone, + zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone, "lockedmem", zone_lockedmem_kstat_update); - zone->zone_swapresv_kstat = zone_kstat_create_common(zone, + zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone, "swapresv", zone_swapresv_kstat_update); - 
zone->zone_nprocs_kstat = zone_kstat_create_common(zone, + zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone, + "physicalmem", zone_physmem_kstat_update); + zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone, "nprocs", zone_nprocs_kstat_update); + if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) { + zone->zone_vfs_stats = kmem_zalloc( + sizeof (zone_vfs_kstat_t), KM_SLEEP); + } + + if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) { + zone->zone_zfs_stats = kmem_zalloc( + sizeof (zone_zfs_kstat_t), KM_SLEEP); + } + + if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) { + zone->zone_mcap_stats = kmem_zalloc( + sizeof (zone_mcap_kstat_t), KM_SLEEP); + } + if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) { zone->zone_misc_stats = kmem_zalloc( sizeof (zone_misc_kstat_t), KM_SLEEP); @@ -1929,8 +2340,17 @@ zone_kstat_delete(zone_t *zone) sizeof (zone_kstat_t)); zone_kstat_delete_common(&zone->zone_swapresv_kstat, sizeof (zone_kstat_t)); + zone_kstat_delete_common(&zone->zone_physmem_kstat, + sizeof (zone_kstat_t)); zone_kstat_delete_common(&zone->zone_nprocs_kstat, sizeof (zone_kstat_t)); + + zone_kstat_delete_common(&zone->zone_vfs_ksp, + sizeof (zone_vfs_kstat_t)); + zone_kstat_delete_common(&zone->zone_zfs_ksp, + sizeof (zone_zfs_kstat_t)); + zone_kstat_delete_common(&zone->zone_mcap_ksp, + sizeof (zone_mcap_kstat_t)); zone_kstat_delete_common(&zone->zone_misc_ksp, sizeof (zone_misc_kstat_t)); } @@ -1966,6 +2386,8 @@ zone_zsd_init(void) zone0.zone_locked_mem_ctl = UINT64_MAX; ASSERT(zone0.zone_max_swap == 0); zone0.zone_max_swap_ctl = UINT64_MAX; + zone0.zone_phys_mem = 0; + zone0.zone_phys_mem_ctl = UINT64_MAX; zone0.zone_max_lofi = 0; zone0.zone_max_lofi_ctl = UINT64_MAX; zone0.zone_shmmax = 0; @@ -1989,8 +2411,9 @@ zone_zsd_init(void) zone0.zone_initname = initname; zone0.zone_lockedmem_kstat = NULL; zone0.zone_swapresv_kstat = NULL; + zone0.zone_physmem_kstat = NULL; 
zone0.zone_nprocs_kstat = NULL; - + zone0.zone_zfs_io_pri = 1; zone0.zone_stime = 0; zone0.zone_utime = 0; zone0.zone_wtime = 0; @@ -2101,6 +2524,21 @@ zone_init(void) RCTL_GLOBAL_INFINITE, MAXCAP, MAXCAP, &zone_cpu_cap_ops); + rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + MAXCAP, MAXCAP, &zone_cpu_base_ops); + + rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + INT_MAX, INT_MAX, &zone_cpu_burst_time_ops); + + rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + 16384, 16384, &zone_zfs_io_pri_ops); + rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE, RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT, INT_MAX, INT_MAX, &zone_lwps_ops); @@ -2142,6 +2580,20 @@ zone_init(void) rde = rctl_dict_lookup("zone.cpu-shares"); (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + /* + * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach + * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority''. 
+ */ + dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); + bzero(dval, sizeof (rctl_val_t)); + dval->rcv_value = 1; + dval->rcv_privilege = RCPRIV_PRIVILEGED; + dval->rcv_flagaction = RCTL_LOCAL_NOACTION; + dval->rcv_action_recip_pid = -1; + + rde = rctl_dict_lookup("zone.zfs-io-priority"); + (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + rc_zone_locked_mem = rctl_register("zone.max-locked-memory", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2152,6 +2604,11 @@ zone_init(void) RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, &zone_max_swap_ops); + rc_zone_phys_mem = rctl_register("zone.max-physical-memory", + RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | + RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, + &zone_phys_mem_ops); + rc_zone_max_lofi = rctl_register("zone.max-lofi", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2173,6 +2630,8 @@ zone_init(void) zone0.zone_ntasks = 1; mutex_exit(&p0.p_lock); zone0.zone_restart_init = B_TRUE; + zone0.zone_reboot_on_init_exit = B_FALSE; + zone0.zone_init_status = -1; zone0.zone_brand = &native_brand; rctl_prealloc_destroy(gp); /* @@ -2252,6 +2711,8 @@ zone_init(void) static void zone_free(zone_t *zone) { + zone_dl_t *zdl; + ASSERT(zone != global_zone); ASSERT(zone->zone_ntasks == 0); ASSERT(zone->zone_nlwps == 0); @@ -2280,6 +2741,19 @@ zone_free(zone_t *zone) list_destroy(&zone->zone_ref_list); zone_free_zsd(zone); zone_free_datasets(zone); + + /* + * While dlmgmtd should have removed all of these, it could have left + * something behind or crashed. In which case it's not safe for us to + * assume that the list is empty which list_destroy() will ASSERT. We + * clean up for our userland comrades which may have crashed, or worse, + * been disabled by SMF. 
+ */ + while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) { + if (zdl->zdl_net != NULL) + nvlist_free(zdl->zdl_net); + kmem_free(zdl, sizeof (zone_dl_t)); + } list_destroy(&zone->zone_dl_list); if (zone->zone_rootvp != NULL) @@ -2324,12 +2798,18 @@ zone_free(zone_t *zone) static void zone_status_set(zone_t *zone, zone_status_t status) { + timestruc_t now; + uint64_t t; nvlist_t *nvl = NULL; ASSERT(MUTEX_HELD(&zone_status_lock)); ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE && status >= zone_status_get(zone)); + /* Current time since Jan 1 1970 but consumers expect NS */ + gethrestime(&now); + t = (now.tv_sec * NANOSEC) + now.tv_nsec; + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) || nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) || nvlist_add_string(nvl, ZONE_CB_NEWSTATE, @@ -2337,7 +2817,7 @@ zone_status_set(zone_t *zone, zone_status_t status) nvlist_add_string(nvl, ZONE_CB_OLDSTATE, zone_status_table[zone->zone_status]) || nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) || - nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) || + nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) || sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS, ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) { #ifdef DEBUG @@ -2463,14 +2943,65 @@ zone_set_initname(zone_t *zone, const char *zone_initname) return (0); } +/* + * The zone_set_mcap_nover and zone_set_mcap_pageout functions are used + * to provide the physical memory capping kstats. Since physical memory + * capping is currently implemented in userland, that code uses the setattr + * entry point to increment the kstats. We always simply increment nover + * every time that setattr is called and we always add in the input value + * to zone_mcap_pagedout every time that is called. 
+ */ +/*ARGSUSED*/ +static int -zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap) +zone_set_mcap_nover(zone_t *zone, const uint64_t *zone_nover) { - uint64_t mcap; - int err = 0; + zone->zone_mcap_nover++; + + return (0); +} + +static int +zone_set_mcap_pageout(zone_t *zone, const uint64_t *zone_pageout) +{ + uint64_t pageout; + int err; + + if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0) + zone->zone_mcap_pagedout += pageout; + + return (err); +} + +/* + * The zone_set_page_fault_delay function is used to set the number of usecs + * to throttle page faults. This is normally 0 but can be set to a non-0 value + * by the user-land memory capping code when the zone is over its physical + * memory cap. + */ +static int +zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay) +{ + uint32_t dusec; + int err; - if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0) - zone->zone_phys_mcap = mcap; + if ((err = copyin(pfdelay, &dusec, sizeof (uint32_t))) == 0) + zone->zone_pg_flt_delay = dusec; + + return (err); +} + +/* + * The zone_set_rss function is used to set the zone's RSS when we do the + * fast, approximate calculation in user-land. + */ +static int +zone_set_rss(zone_t *zone, const uint64_t *prss) +{ + uint64_t rss; + int err; + + if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0) + zone->zone_phys_mem = rss; return (err); } @@ -2882,6 +3413,12 @@ getzoneid(void) return (curproc->p_zone->zone_id); } +zoneid_t +getzonedid(void) +{ + return (curproc->p_zone->zone_did); +} + /* * Internal versions of zone_find_by_*(). These don't zone_hold() or * check the validity of a zone's state. 
@@ -3625,6 +4162,17 @@ zone_start_init(void) */ z->zone_proc_initpid = p->p_pid; + if (z->zone_setup_app_contract == B_TRUE) { + /* + * Normally a process cannot modify its own contract, but we're + * just starting the zone's init process and its contract is + * always initialized from the sys_process_tmpl template, so + * this is the simplest way to setup init's contract to kill + * the process if any other process in the contract exits. + */ + p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT; + } + /* * We maintain zone_boot_err so that we can return the cause of the * failure back to the caller of the zone_boot syscall. @@ -3653,9 +4201,54 @@ zone_start_init(void) lwp_exit(); } } else { + id_t cid = curthread->t_cid; + if (zone_status_get(z) == ZONE_IS_BOOTING) zone_status_set(z, ZONE_IS_RUNNING); mutex_exit(&zone_status_lock); + + mutex_enter(&class_lock); + ASSERT(cid < loaded_classes); + if (strcmp(sclass[cid].cl_name, "FX") == 0 && + z->zone_fixed_hipri) { + /* + * If the zone is using FX then by default all + * processes start at the lowest priority and stay + * there. We provide a mechanism for the zone to + * indicate that it should run at "high priority". In + * this case we setup init to run at the highest FX + * priority (which is one level higher than the + * non-fixed scheduling classes can use). + */ + pcparms_t pcparms; + + pcparms.pc_cid = cid; + ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim = + FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags = + FX_DOUPRILIM | FX_DOUPRI; + + mutex_enter(&pidlock); + mutex_enter(&curproc->p_lock); + + (void) parmsset(&pcparms, curthread); + + mutex_exit(&curproc->p_lock); + mutex_exit(&pidlock); + } else if (strcmp(sclass[cid].cl_name, "RT") == 0) { + /* + * zsched always starts the init lwp at priority + * minclsyspri - 1. This priority gets set in t_pri and + * is invalid for RT, but RT never uses t_pri. 
However + * t_pri is used by procfs, so we always see processes + * within an RT zone with an invalid priority value. + * We fix that up now. + */ + curthread->t_pri = RTGPPRIO0; + } + mutex_exit(&class_lock); + /* cause the process to return to userland. */ lwp_rtt(); } @@ -4139,8 +4732,9 @@ parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp) error = EINVAL; name = nvpair_name(nvp); - if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1) - != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { + if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 && + strncmp(name, "project.", sizeof ("project.") - 1) != 0) || + nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { goto out; } if ((hndl = rctl_hndl_lookup(name)) == -1) { @@ -4258,7 +4852,7 @@ zone_create(const char *zone_name, const char *zone_root, caddr_t rctlbuf, size_t rctlbufsz, caddr_t zfsbuf, size_t zfsbufsz, int *extended_error, int match, uint32_t doi, const bslabel_t *label, - int flags) + int flags, zoneid_t zone_did) { struct zsched_arg zarg; nvlist_t *rctls = NULL; @@ -4281,6 +4875,7 @@ zone_create(const char *zone_name, const char *zone_root, zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP); zoneid = zone->zone_id = id_alloc(zoneid_space); + zone->zone_did = zone_did; zone->zone_status = ZONE_IS_UNINITIALIZED; zone->zone_pool = pool_default; zone->zone_pool_mod = gethrtime(); @@ -4288,6 +4883,8 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_ncpus = 0; zone->zone_ncpus_online = 0; zone->zone_restart_init = B_TRUE; + zone->zone_reboot_on_init_exit = B_FALSE; + zone->zone_init_status = -1; zone->zone_brand = &native_brand; zone->zone_initname = NULL; mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); @@ -4349,10 +4946,14 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_locked_mem_ctl = UINT64_MAX; zone->zone_max_swap = 0; zone->zone_max_swap_ctl = UINT64_MAX; + zone->zone_phys_mem = 0; + zone->zone_phys_mem_ctl = UINT64_MAX; zone->zone_max_lofi = 
0; zone->zone_max_lofi_ctl = UINT64_MAX; - zone0.zone_lockedmem_kstat = NULL; - zone0.zone_swapresv_kstat = NULL; + zone->zone_lockedmem_kstat = NULL; + zone->zone_swapresv_kstat = NULL; + zone->zone_physmem_kstat = NULL; + zone->zone_zfs_io_pri = 1; /* * Zsched initializes the rctls. @@ -4509,8 +5110,8 @@ zone_create(const char *zone_name, const char *zone_root, /* * The process, task, and project rctls are probably wrong; * we need an interface to get the default values of all rctls, - * and initialize zsched appropriately. I'm not sure that that - * makes much of a difference, though. + * and initialize zsched appropriately. However, we allow zoneadmd + * to pass down both zone and project rctls for the zone's init. */ error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0); if (error != 0) { @@ -4651,6 +5252,7 @@ zone_boot(zoneid_t zoneid) static int zone_empty(zone_t *zone) { + int cnt = 0; int waitstatus; /* @@ -4661,7 +5263,16 @@ zone_empty(zone_t *zone) ASSERT(MUTEX_NOT_HELD(&zonehash_lock)); while ((waitstatus = zone_status_timedwait_sig(zone, ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) { - killall(zone->zone_id); + boolean_t force = B_FALSE; + + /* Every 30 seconds, try harder */ + if (cnt++ >= 30) { + cmn_err(CE_WARN, "attempt to force kill zone %d\n", + zone->zone_id); + force = B_TRUE; + cnt = 0; + } + killall(zone->zone_id, force); } /* * return EINTR if we were signaled @@ -5412,14 +6023,6 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) error = EFAULT; } break; - case ZONE_ATTR_PHYS_MCAP: - size = sizeof (zone->zone_phys_mcap); - if (bufsize > size) - bufsize = size; - if (buf != NULL && - copyout(&zone->zone_phys_mcap, buf, bufsize) != 0) - error = EFAULT; - break; case ZONE_ATTR_SCHED_CLASS: mutex_enter(&class_lock); @@ -5474,6 +6077,23 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) } kmem_free(zbuf, bufsize); break; + case ZONE_ATTR_DID: + size = sizeof (zoneid_t); + if (bufsize > size) + 
bufsize = size; + + if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0) + error = EFAULT; + break; + case ZONE_ATTR_SCHED_FIXEDHI: + size = sizeof (boolean_t); + if (bufsize > size) + bufsize = size; + + if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf, + bufsize) != 0) + error = EFAULT; + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { size = bufsize; @@ -5505,10 +6125,11 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) return (set_errno(EPERM)); /* - * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the - * global zone. + * Only the ZONE_ATTR_PMCAP_NOVER and ZONE_ATTR_PMCAP_PAGEOUT + * attributes can be set on the global zone. */ - if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) { + if (zoneid == GLOBAL_ZONEID && + attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT) { return (set_errno(EINVAL)); } @@ -5525,7 +6146,9 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) * non-global zones. 
*/ zone_status = zone_status_get(zone); - if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) { + if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT && + attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS && + zone_status > ZONE_IS_READY) { err = EINVAL; goto done; } @@ -5547,8 +6170,17 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) case ZONE_ATTR_FS_ALLOWED: err = zone_set_fs_allowed(zone, (const char *)buf); break; - case ZONE_ATTR_PHYS_MCAP: - err = zone_set_phys_mcap(zone, (const uint64_t *)buf); + case ZONE_ATTR_PMCAP_NOVER: + err = zone_set_mcap_nover(zone, (const uint64_t *)buf); + break; + case ZONE_ATTR_PMCAP_PAGEOUT: + err = zone_set_mcap_pageout(zone, (const uint64_t *)buf); + break; + case ZONE_ATTR_PG_FLT_DELAY: + err = zone_set_page_fault_delay(zone, (const uint32_t *)buf); + break; + case ZONE_ATTR_RSS: + err = zone_set_rss(zone, (const uint64_t *)buf); break; case ZONE_ATTR_SCHED_CLASS: err = zone_set_sched_class(zone, (const char *)buf); @@ -5577,6 +6209,22 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) err = zone_set_network(zoneid, zbuf); kmem_free(zbuf, bufsize); break; + case ZONE_ATTR_APP_SVC_CT: + if (bufsize != sizeof (boolean_t)) { + err = EINVAL; + } else { + zone->zone_setup_app_contract = (boolean_t)buf; + err = 0; + } + break; + case ZONE_ATTR_SCHED_FIXEDHI: + if (bufsize != sizeof (boolean_t)) { + err = EINVAL; + } else { + zone->zone_fixed_hipri = (boolean_t)buf; + err = 0; + } + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize); @@ -6269,6 +6917,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) zs.doi = zs32.doi; zs.label = (const bslabel_t *)(uintptr_t)zs32.label; zs.flags = zs32.flags; + zs.zoneid = zs32.zoneid; #else panic("get_udatamodel() returned bogus result\n"); #endif @@ -6279,7 +6928,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void 
*arg4) (caddr_t)zs.rctlbuf, zs.rctlbufsz, (caddr_t)zs.zfsbuf, zs.zfsbufsz, zs.extended_error, zs.match, zs.doi, - zs.label, zs.flags)); + zs.label, zs.flags, zs.zoneid)); case ZONE_BOOT: return (zone_boot((zoneid_t)(uintptr_t)arg1)); case ZONE_DESTROY: @@ -6380,6 +7029,7 @@ zone_ki_call_zoneadmd(struct zarg *zargp) bcopy(zone->zone_name, zone_name, zone_namelen); zoneid = zone->zone_id; uniqid = zone->zone_uniqid; + arg.status = zone->zone_init_status; /* * zoneadmd may be down, but at least we can empty out the zone. * We can ignore the return value of zone_empty() since we're called @@ -6557,7 +7207,7 @@ zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp) * zone_ki_call_zoneadmd() will do a more thorough job of this * later. */ - killall(zone->zone_id); + killall(zone->zone_id, B_FALSE); /* * Now, create the thread to contact zoneadmd and do the rest of the * work. This thread can't be created in our zone otherwise |
