10 files changed, 3667 insertions, 558 deletions
diff --git a/usr/src/cmd/zoneadmd/Makefile b/usr/src/cmd/zoneadmd/Makefile
index 8324f7fefa..e81e4631aa 100644
--- a/usr/src/cmd/zoneadmd/Makefile
+++ b/usr/src/cmd/zoneadmd/Makefile
@@ -18,57 +18,54 @@
 #
 # CDDL HEADER END
 
-
-#
-
 #
 # Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 # Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+# Copyright (c) 2011, Joyent, Inc. All rights reserved.
 #
 
 PROG= zoneadmd
 
 include ../Makefile.cmd
+include ../Makefile.ctf
 
-ROOTCMDDIR=	$(ROOTLIB)/zones
-
-OBJS= zoneadmd.o zcons.o vplat.o
-SRCS = $(OBJS:.o=.c)
-POFILE=zoneadmd_all.po
-POFILES= $(OBJS:%.o=%.po)
+$(64ONLY)SUBDIRS= $(MACH)
+$(BUILD64)SUBDIRS += $(MACH64)
 
-CFLAGS += $(CCVERBOSE)
-CERRWARN += -_gcc=-Wno-switch
-CERRWARN += -_gcc=-Wno-parentheses
-CERRWARN += -_gcc=-Wno-uninitialized
+all	:=	TARGET = all
+install	:=	TARGET = install
+clean	:=	TARGET = clean
+clobber	:=	TARGET = clobber
+lint	:=	TARGET = lint
 
-LDLIBS += -lsocket -lzonecfg -lnsl -ldevinfo -ldevice -lnvpair \
-	-lgen -lbsm -lcontract -lzfs -luuid -lbrand -ldladm -ltsnet -ltsol \
-	-linetutil -lscf
 XGETFLAGS += -a -x zoneadmd.xcl
 
+ROOTUSRLIBZONES			= $(ROOT)/usr/lib/zones
+
 .KEEP_STATE:
 
 .PARALLEL:
 
-all: $(PROG)
+all: $(SUBDIRS)
 
 $(PROG): $(OBJS)
 	$(LINK.c) -o $@ $(OBJS) $(LDLIBS)
 	$(POST_PROCESS)
 
-install: all $(ROOTCMD)
-
-$(POFILE): $(POFILES)
-	$(RM) $@
-	$(CAT) $(POFILES) > $@
+install: $(SUBDIRS)
+	-$(RM) $(ROOTUSRLIBZONES)/$(PROG)
+	-$(LN) $(ISAEXEC) $(ROOTUSRLIBZONES)/$(PROG)
 
-clean:
-	$(RM) $(OBJS)
+$(POFILE):
 
-lint:	lint_SRCS
+clean clobber lint:	$(SUBDIRS)
 
 check:
-	$(CSTYLE) -p -P $(SRCS:%=%)
+	$(CSTYLE) -p -P *.c
+
+$(SUBDIRS):	FRC
+	@cd $@; pwd; $(MAKE) $(TARGET)
+
+FRC:
 
 include ../Makefile.targ
diff --git a/usr/src/cmd/zoneadmd/Makefile.com b/usr/src/cmd/zoneadmd/Makefile.com
new file mode 100644
index 0000000000..c8becc3e8c
--- /dev/null
+++ b/usr/src/cmd/zoneadmd/Makefile.com
@@ -0,0 +1,70 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+
+#
+# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright 2014, Joyent, Inc. All rights reserved.
+#
+
+PROG= zoneadmd
+
+include ../../Makefile.cmd
+include ../../Makefile.ctf
+
+ROOTCMDDIR=	$(ROOTLIB)/zones
+
+OBJS= zoneadmd.o zcons.o zfd.o vplat.o mcap.o
+
+CFLAGS += $(CCVERBOSE)
+LDLIBS += -lsocket -lzonecfg -lnsl -ldevinfo -ldevice -lnvpair \
+	-lgen -lbsm -lcontract -lzfs -luuid -lbrand -ldladm -ltsnet -ltsol \
+	-linetutil -lproc -lscf
+
+.KEEP_STATE:
+
+%.o:    ../%.c
+	$(COMPILE.c) $<
+	$(POST_PROCESS_O)
+
+ROOTUSRLIBZONES			= $(ROOT)/usr/lib/zones
+ROOTUSRLIBZONES32		= $(ROOTUSRLIBZONES)/$(MACH32)
+ROOTUSRLIBZONES64		= $(ROOTUSRLIBZONES)/$(MACH64)
+ROOTUSRLIBZONESPROG32		= $(ROOTUSRLIBZONES32)/$(PROG)
+ROOTUSRLIBZONESPROG64		= $(ROOTUSRLIBZONES64)/$(PROG)
+$(ROOTUSRLIBZONES32)/%: $(ROOTUSRLIBZONES32) %
+	$(INS.file)
+$(ROOTUSRLIBZONES64)/%: $(ROOTUSRLIBZONES64) %
+	$(INS.file)
+$(ROOTUSRLIBZONES32):
+	$(INS.dir)
+
+all: $(PROG)
+
+$(PROG): $(OBJS)
+	$(LINK.c) -o $@ $(OBJS) $(LDLIBS)
+	$(POST_PROCESS)
+
+clean:
+	$(RM) $(OBJS)
+
+lint:
+	$(LINT.c) ../*.c $(LDLIBS)
+
+include ../../Makefile.targ
diff --git a/usr/src/cmd/zoneadmd/amd64/Makefile b/usr/src/cmd/zoneadmd/amd64/Makefile
new file mode 100644
index 0000000000..75ac51db32
--- /dev/null
+++ b/usr/src/cmd/zoneadmd/amd64/Makefile
@@ -0,0 +1,31 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright (c) 2011, Joyent, Inc. All rights reserved.
+#
+
+.KEEP_STATE:
+
+include ../Makefile.com
+include ../../Makefile.cmd.64
+
+install: all $(ROOTUSRLIBZONES64) $(ROOTUSRLIBZONESPROG64)
diff --git a/usr/src/cmd/zoneadmd/i386/Makefile b/usr/src/cmd/zoneadmd/i386/Makefile
new file mode 100644
index 0000000000..a8764e0638
--- /dev/null
+++ b/usr/src/cmd/zoneadmd/i386/Makefile
@@ -0,0 +1,30 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright (c) 2011, Joyent, Inc. All rights reserved.
+#
+
+.KEEP_STATE:
+
+include ../Makefile.com
+
+install: all $(ROOTUSRLIBZONES32) $(ROOTUSRLIBZONESPROG32)
diff --git a/usr/src/cmd/zoneadmd/mcap.c b/usr/src/cmd/zoneadmd/mcap.c
new file mode 100644
index 0000000000..16cd2dd07a
--- /dev/null
+++ b/usr/src/cmd/zoneadmd/mcap.c
@@ -0,0 +1,1182 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2014, Joyent, Inc.  All rights reserved.
+ */
+
+/*
+ * This file implements the code which runs a thread inside zoneadmd to cap
+ * the associated zone's physical memory.  A thread to do this is started
+ * when the zone boots and is halted when the zone shuts down.
+ *
+ * Because of the way that the VM system is currently implemented, there is no
+ * way to go from the bottom up (page to process to zone).  Thus, there is no
+ * obvious way to hook an rctl into the kernel's paging code to enforce a hard
+ * memory cap.  Instead, we implement a soft physical memory cap which looks
+ * at the zone's overall rss and once it is over the cap, works from the top
+ * down (zone to process to page), looking at zone processes, to determine
+ * what to try to pageout to get the zone under its memory cap.
+ *
+ * The code uses the fast, cheap, but potentially very inaccurate sum of the
+ * rss values from psinfo_t to first approximate the zone's rss and will
+ * fall back to the vm_getusage syscall to determine the zone's rss if needed.
+ * It then checks the rss against the zone's zone.max-physical-memory rctl.
+ * Once the zone goes over its cap, then this thread will work through the
+ * zone's /proc process list, stepping through each process's address space
+ * mappings and using the rusagesys(_RUSAGESYS_INVALMAP) syscall to try to
+ * pageout pages, until the zone is again under its cap.
+ *
+ * Although zone memory capping is implemented as a soft cap by this user-level
+ * thread, the interfaces around memory caps that are exposed to the user are
+ * the standard ones; an rctl and kstats.  This thread uses the rctl value
+ * to obtain the cap and works with the zone kernel code to update the kstats.
+ * If the implementation ever moves into the kernel, these exposed interfaces
+ * do not need to change.
+ *
+ * The thread adaptively sleeps, periodically checking the state of the
+ * zone.  As the zone's rss gets closer to the cap, the thread will wake up
+ * more often to check the zone's status.  Once the zone is over the cap,
+ * the thread will work to pageout until the zone is under the cap, as shown
+ * by updated vm_usage data.
+ *
+ * NOTE: The pagedata page maps (at least on x86) are not useful.  Those flags
+ * are set by hrm_setbits() and on x86 that code path is only executed by
+ *     segvn_pagelock -> hat_setstat -> hrm_setbits
+ *     segvn_softunlock -^
+ * On SPARC there is an additional code path which may make this data
+ * useful (sfmmu_ttesync), but since it is not generic, we ignore the page
+ * maps.  If we ever fix this issue, then we could generalize this mcap code to
+ * do more with the data on active pages.
+ *
+ * For debugging, touch the file {zonepath}/mcap_debug.log.  This will
+ * cause the thread to start logging its actions into that file (it may take
+ * a minute or two if the thread is currently sleeping).  Removing that
+ * file will cause logging to stop.
+ */
+
+#include <sys/mman.h>
+#include <sys/param.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libproc.h>
+#include <limits.h>
+#include <procfs.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <time.h>
+#include <unistd.h>
+#include <sys/priocntl.h>
+#include <dirent.h>
+#include <zone.h>
+#include <libzonecfg.h>
+#include <thread.h>
+#include <values.h>
+#include <sys/vm_usage.h>
+#include <sys/resource.h>
+#include <sys/debug.h>
+#include <synch.h>
+#include <wait.h>
+#include <libcontract.h>
+#include <libcontract_priv.h>
+#include <sys/contract/process.h>
+#include "zoneadmd.h"
+
+					/* round up to next y = 2^n */
+#define	ROUNDUP(x, y)	(((x) + ((y) - 1)) & ~((y) - 1))
+
+#define	CAP_REFRESH	((uint64_t)300 * NANOSEC) /* every 5 minutes */
+
+/*
+ * zonecfg attribute tunables for memory capping.
+ *    phys-mcap-cmd
+ *	type: string
+ *	specifies a command that can be run when over the cap
+ *    phys-mcap-no-vmusage
+ *	type: boolean
+ *	true disables vm_getusage and just uses zone's proc. rss sum
+ *    phys-mcap-no-pageout
+ *	type: boolean
+ *	true disables pageout when over
+ *    phys-mcap-no-pf-throttle
+ *	type: boolean
+ *	true disables page fault throttling when over
+ */
+#define	TUNE_CMD	"phys-mcap-cmd"
+#define	TUNE_NVMU	"phys-mcap-no-vmusage"
+#define	TUNE_NPAGE	"phys-mcap-no-pageout"
+#define	TUNE_NPFTHROT	"phys-mcap-no-pf-throttle"
+
+/*
+ * These are only used in get_mem_info but global. We always need scale_rss and
+ * prev_fast_rss to be persistent but we also have the other two global so we
+ * can easily see these with mdb.
+ */
+uint64_t	scale_rss = 0;
+uint64_t	prev_fast_rss = 0;
+uint64_t	fast_rss = 0;
+uint64_t	accurate_rss = 0;
+
+static char	zoneproc[MAXPATHLEN];
+static char	debug_log[MAXPATHLEN];
+static zoneid_t	zid;
+static mutex_t	shutdown_mx;
+static cond_t	shutdown_cv;
+static int	shutting_down = 0;
+static thread_t mcap_tid;
+static FILE	*debug_log_fp = NULL;
+static uint64_t zone_rss_cap;		/* RSS cap(KB) */
+static char	over_cmd[2 * BUFSIZ];	/* same size as zone_attr_value */
+static boolean_t skip_vmusage = B_FALSE;
+static boolean_t skip_pageout = B_FALSE;
+static boolean_t skip_pf_throttle = B_FALSE;
+
+static zlog_t	*logp;
+
+static int64_t check_suspend();
+static void get_mcap_tunables();
+
+/*
+ * Structure to hold current state about a process address space that we're
+ * working on.
+ */
+typedef struct {
+	int pr_curr;		/* the # of the mapping we're working on */
+	int pr_nmap;		/* number of mappings in address space */
+	prmap_t *pr_mapp;	/* process's map array */
+} proc_map_t;
+
+typedef struct zsd_vmusage64 {
+	id_t vmu_zoneid;
+	uint_t vmu_type;
+	id_t vmu_id;
+	/*
+	 * An amd64 kernel will align the following uint64_t members, but a
+	 * 32bit i386 process will not without help.
+	 */
+	int vmu_align_next_members_on_8_bytes;
+	uint64_t vmu_rss_all;
+	uint64_t vmu_rss_private;
+	uint64_t vmu_rss_shared;
+	uint64_t vmu_swap_all;
+	uint64_t vmu_swap_private;
+	uint64_t vmu_swap_shared;
+} zsd_vmusage64_t;
+
+/*
+ * Write a printf-style message to the debug log; no-op when no log is open.
+ */
+/*PRINTFLIKE1*/
+static void
+debug(char *fmt, ...)
+{
+	va_list args;
+
+	/* Only emit (and flush) output while a debug log stream is open. */
+	if (debug_log_fp != NULL) {
+		va_start(args, fmt);
+		(void) vfprintf(debug_log_fp, fmt, args);
+		va_end(args);
+		(void) fflush(debug_log_fp);
+	}
+}
+
+/*
+ * Sleep for up to 'secs' seconds, like sleep(3C), except that the wait is on
+ * shutdown_cv and is skipped entirely if shutting_down is already set.
+ */
+static void
+sleep_shutdown(int secs)
+{
+	timestruc_t delay;
+
+	delay.tv_nsec = 0;
+	delay.tv_sec = secs;
+
+	(void) mutex_lock(&shutdown_mx);
+	if (shutting_down == 0)
+		(void) cond_reltimedwait(&shutdown_cv, &shutdown_mx, &delay);
+	(void) mutex_unlock(&shutdown_mx);
+}
+
+/* B_TRUE if pid is in the "SYS" scheduling class or cannot be queried. */
+static boolean_t
+proc_issystem(pid_t pid)
+{
+	char cls[PC_CLNMSZ];
+
+	if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, cls,
+	    PC_KY_NULL) == -1)
+		return (B_TRUE);
+	return (strcmp(cls, "SYS") == 0);
+}
+
+/*
+ * Fork a child that enters the zone and system()s over_cmd, then reap it.
+ */
+static void
+run_over_cmd()
+{
+	int		ctfd;
+	int		err;
+	pid_t		childpid;
+	siginfo_t	info;
+	ctid_t		ct;
+
+	/*
+	 * Before we enter the zone, we need to create a new process contract
+	 * for the child, as required by zone_enter().
+	 */
+	if ((ctfd = open64("/system/contract/process/template", O_RDWR)) == -1)
+		return;
+	if (ct_tmpl_set_critical(ctfd, 0) != 0 ||
+	    ct_tmpl_set_informative(ctfd, 0) != 0 ||
+	    ct_pr_tmpl_set_fatal(ctfd, CT_PR_EV_HWERR) != 0 ||
+	    ct_pr_tmpl_set_param(ctfd, CT_PR_PGRPONLY) != 0 ||
+	    ct_tmpl_activate(ctfd) != 0) {
+		(void) close(ctfd);
+		return;
+	}
+
+	childpid = fork();
+	switch (childpid) {
+	case -1:
+		(void) ct_tmpl_clear(ctfd);
+		(void) close(ctfd);
+		break;
+	case 0:	/* Child */
+		(void) ct_tmpl_clear(ctfd);
+		(void) close(ctfd);
+		if (zone_enter(zid) == -1)
+			_exit(errno);
+		err = system(over_cmd);
+		_exit(err);
+		break;
+	default:	/* Parent */
+		if (contract_latest(&ct) == -1)
+			ct = -1;
+		(void) ct_tmpl_clear(ctfd);
+		(void) close(ctfd);
+		err = waitid(P_PID, childpid, &info, WEXITED);
+		(void) contract_abandon_id(ct);
+		if (err == -1 || info.si_status != 0)
+			debug("over_cmd failed\n");
+		break;
+	}
+}
+
+/*
+ * Hand back the mapping at the cursor and advance it; NULL if none remain.
+ */
+static prmap_t *
+nextmapping(proc_map_t *pmp)
+{
+	if (pmp->pr_mapp != NULL && pmp->pr_curr < pmp->pr_nmap)
+		return (pmp->pr_mapp + pmp->pr_curr++);
+
+	return (NULL);
+}
+
+/*
+ * Load pid's /proc map file into pmp; return its first mapping or NULL.
+ */
+static prmap_t *
+init_map(proc_map_t *pmp, pid_t pid)
+{
+	int fd;
+	int res;
+	struct stat st;
+	char pathbuf[MAXPATHLEN];
+
+	bzero(pmp, sizeof (proc_map_t));
+	pmp->pr_nmap = -1;
+
+	(void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/map", zoneproc, pid);
+	if ((fd = open(pathbuf, O_RDONLY, 0)) < 0)
+		return (NULL);
+
+redo:
+	errno = 0;
+	if (fstat(fd, &st) != 0)
+		goto done;
+
+	if ((pmp->pr_mapp = malloc(st.st_size)) == NULL) {
+		debug("cannot malloc() %ld bytes for map\n", st.st_size);
+		goto done;
+	}
+	(void) bzero(pmp->pr_mapp, st.st_size);
+
+	errno = 0;
+	if ((res = pread(fd, pmp->pr_mapp, st.st_size, 0)) != st.st_size) {
+		free(pmp->pr_mapp);
+		pmp->pr_mapp = NULL;
+		if (res > 0 || errno == E2BIG) {
+			goto redo;
+		} else {
+			debug("pid %ld cannot read map\n", pid);
+			goto done;
+		}
+	}
+
+	pmp->pr_nmap = st.st_size / sizeof (prmap_t);
+
+done:
+	(void) close(fd);
+	return (nextmapping(pmp));
+}
+
+/*
+ * Ask the kernel to invalidate one whole mapping of process 'pid'.  ISM and
+ * SHM mappings are left alone (0 is returned).  May return nonzero with errno:
+ *    ESRCH  - process not found
+ *    ENOMEM - segment not found
+ *    EINVAL - mapping exceeds a single segment
+ */
+static int
+pageout_mapping(pid_t pid, prmap_t *pmp)
+{
+	int rv;
+
+	/* Shared memory (ISM/SHM) segments are never invalidated here. */
+	if ((pmp->pr_mflags & (MA_ISM | MA_SHM)) != 0)
+		return (0);
+
+	errno = 0;
+	rv = syscall(SYS_rusagesys, _RUSAGESYS_INVALMAP, pid,
+	    pmp->pr_vaddr, pmp->pr_size);
+	return (rv);
+}
+
+/*
+ * Page out a process's mappings one at a time, calling check_suspend() each
+ * time excess drops to <= 0.  Returns the excess estimate (0 on shutdown).
+ */
+static int64_t
+pageout_process(pid_t pid, int64_t excess)
+{
+	int			psfd;
+	prmap_t			*pmap;
+	proc_map_t		cur;
+	int			res;
+	int64_t			sum_d_rss, d_rss;
+	int64_t			old_rss;
+	int			map_cnt;
+	psinfo_t		psinfo;
+	char			pathbuf[MAXPATHLEN];
+
+	(void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc,
+	    pid);
+	if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0)
+		return (excess);
+
+	cur.pr_mapp = NULL;
+
+	if (pread(psfd, &psinfo, sizeof (psinfo), 0) != sizeof (psinfo))
+		goto done;
+
+	old_rss = (int64_t)psinfo.pr_rssize;
+	map_cnt = 0;
+
+	/* If unscannable (no LWPs or a system process), skip it. */
+	if (psinfo.pr_nlwp == 0 || proc_issystem(pid)) {
+		debug("pid %ld: system process, skipping %s\n",
+		    pid, psinfo.pr_psargs);
+		goto done;
+	}
+
+	/* If tiny RSS (16KB or less), skip it. */
+	if (old_rss <= 16) {
+		debug("pid %ld: skipping, RSS %lldKB %s\n",
+		    pid, old_rss, psinfo.pr_psargs);
+		goto done;
+	}
+
+	/* Read the process's address space map. */
+	pmap = init_map(&cur, pid);
+
+	/* Skip process if its map is unreadable or has no mappings. */
+	if (pmap == NULL) {
+		debug("pid %ld: map unreadable; ignoring\n", pid);
+		goto done;
+	}
+
+	debug("pid %ld: nmap %d sz %dKB rss %lldKB %s\n",
+	    pid, cur.pr_nmap, psinfo.pr_size, old_rss, psinfo.pr_psargs);
+
+	/*
+	 * Within the process's address space, attempt to page out mappings.
+	 */
+	sum_d_rss = 0;
+	while (excess > 0 && pmap != NULL && !shutting_down) {
+		/* invalidate the entire mapping */
+		if ((res = pageout_mapping(pid, pmap)) < 0)
+			debug("pid %ld: mapping 0x%p %ldkb unpageable (%d)\n",
+			    pid, pmap->pr_vaddr, pmap->pr_size / 1024, errno);
+
+		map_cnt++;
+
+		/*
+		 * Re-check the process rss and get the delta.
+		 */
+		if (pread(psfd, &psinfo, sizeof (psinfo), 0)
+		    != sizeof (psinfo)) {
+			excess -= old_rss;
+			goto done;
+		}
+
+		d_rss = (int64_t)psinfo.pr_rssize - old_rss;
+		old_rss = (int64_t)psinfo.pr_rssize;
+		sum_d_rss += d_rss;
+
+		/*
+		 * d_rss hopefully should be negative (or 0 if nothing
+		 * invalidated) but can be positive if more got paged in.
+		 */
+		excess += d_rss;
+
+		if (excess <= 0) {
+			debug("pid %ld: (part.) nmap %d delta_rss %lldKB "
+			    "excess %lldKB\n", pid, map_cnt,
+			    (unsigned long long)sum_d_rss, (long long)excess);
+			map_cnt = 0;
+
+			/*
+			 * If we're actually under, this will suspend checking
+			 * in the middle of this process's address space.
+			 */
+			excess = check_suspend();
+			if (shutting_down)
+				goto done;
+
+			/*
+			 * since we might have suspended, re-read process's rss
+			 */
+			if (pread(psfd, &psinfo, sizeof (psinfo), 0)
+			    != sizeof (psinfo)) {
+				excess -= old_rss;
+				goto done;
+			}
+
+			old_rss = (int64_t)psinfo.pr_rssize;
+
+			debug("pid %ld: resume pageout; excess %lld\n", pid,
+			    (long long)excess);
+			sum_d_rss = 0;
+		}
+
+		pmap = nextmapping(&cur);
+	}
+
+	debug("pid %ld: nmap %d delta_rss %lldKB excess %lldKB\n",
+	    pid, map_cnt, (unsigned long long)sum_d_rss, (long long)excess);
+
+done:
+	if (cur.pr_mapp != NULL)
+		free(cur.pr_mapp);
+
+	(void) close(psfd);
+
+	if (shutting_down)
+		return (0);
+
+	return (excess);
+}
+
+/*
+ * Return the zone's RSS in KB, or 0 on failure or when shutting down.
+ */
+static uint64_t
+get_mem_info()
+{
+	uint64_t		n = 1;
+	zsd_vmusage64_t		buf;
+	uint64_t		tmp_rss;
+	DIR			*pdir = NULL;
+	struct dirent		*dent;
+
+	/*
+	 * Start by doing the fast, cheap RSS calculation using the rss value
+	 * in psinfo_t.  Because that's per-process, it can lead to double
+	 * counting some memory and overestimating how much is being used, but
+	 * as long as that's not over the cap, then we don't need do the
+	 * expensive calculation.
+	 *
+	 * If we have to do the expensive calculation, we remember the scaling
+	 * factor so that we can try to use that on subsequent iterations for
+	 * the fast rss.
+	 */
+	if (shutting_down)
+		return (0);
+
+	if ((pdir = opendir(zoneproc)) == NULL)
+		return (0);
+
+	accurate_rss = 0;
+	fast_rss = 0;
+	while (!shutting_down && (dent = readdir(pdir)) != NULL) {
+		pid_t		pid;
+		int		psfd;
+		int64_t		rss;
+		char		pathbuf[MAXPATHLEN];
+		psinfo_t	psinfo;
+
+		if (strcmp(".", dent->d_name) == 0 ||
+		    strcmp("..", dent->d_name) == 0)
+			continue;
+
+		pid = atoi(dent->d_name);
+		if (pid == 0 || pid == 1)
+			continue;
+
+		(void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo",
+		    zoneproc, pid);
+
+		rss = 0;
+		if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
+			if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
+			    sizeof (psinfo))
+				rss = (int64_t)psinfo.pr_rssize;
+
+			(void) close(psfd);
+		}
+
+		fast_rss += rss;
+	}
+
+	(void) closedir(pdir);
+
+	if (shutting_down)
+		return (0);
+
+	debug("fast rss: %lluKB, scale: %llu, prev: %lluKB\n", fast_rss,
+	    scale_rss, prev_fast_rss);
+
+	/* keep the raw sum in tmp_rss; try to get by with a scaled fast rss */
+	tmp_rss = fast_rss;
+	if (scale_rss > 1 && prev_fast_rss > 0) {
+		/*
+		 * Only scale the fast value if it hasn't ballooned too much
+		 * to trust.
+		 */
+		if (fast_rss / prev_fast_rss < 2) {
+			fast_rss /= scale_rss;
+			debug("scaled fast rss: %lluKB\n", fast_rss);
+		}
+	}
+
+	if (fast_rss <= zone_rss_cap || skip_vmusage) {
+		uint64_t zone_rss_bytes;
+
+		zone_rss_bytes = fast_rss * 1024;
+		/* Use the zone's approx. RSS in the kernel */
+		(void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);
+		return (fast_rss);
+	}
+
+	buf.vmu_id = zid;
+
+	/* get accurate usage (cached data may be up to 5 seconds old) */
+	if (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, VMUSAGE_A_ZONE, 5,
+	    (uintptr_t)&buf, (uintptr_t)&n) != 0) {
+		debug("vmusage failed\n");
+		(void) sleep_shutdown(1);
+		return (0);
+	}
+
+	if (n > 1) {
+		/* This should never happen */
+		debug("vmusage returned more than one result\n");
+		(void) sleep_shutdown(1);
+		return (0);
+	}
+
+	if (buf.vmu_id != zid) {
+		/* This should never happen */
+		debug("vmusage returned the incorrect zone\n");
+		(void) sleep_shutdown(1);
+		return (0);
+	}
+
+	accurate_rss = buf.vmu_rss_all / 1024;
+
+	/* scaling factor is raw (unscaled) fast rss over accurate rss */
+	if (accurate_rss > 0) {
+		scale_rss = tmp_rss / accurate_rss;
+		debug("new scaling factor: %llu\n", scale_rss);
+		/* remember the fast rss when we had to get the accurate rss */
+		prev_fast_rss = tmp_rss;
+	}
+
+	debug("accurate rss: %lluKB, scale: %llu, prev: %lluKB\n", accurate_rss,
+	    scale_rss, prev_fast_rss);
+	return (accurate_rss);
+}
+
+/*
+ * Grab some process from the zone's /proc, with run-on-last-close set and an
+ * agent LWP created, for reading the zone's physical memory cap rctl.
+ */
+static struct ps_prochandle *
+grab_zone_proc()
+{
+	DIR *procdir;
+	struct dirent *ent;
+	struct ps_prochandle *found = NULL;
+	int gerr;
+
+	if ((procdir = opendir(zoneproc)) == NULL)
+		return (NULL);
+
+	while (found == NULL && !shutting_down &&
+	    (ent = readdir(procdir)) != NULL) {
+		struct ps_prochandle *ph;
+
+		if (strcmp(".", ent->d_name) == 0 ||
+		    strcmp("..", ent->d_name) == 0)
+			continue;
+
+		/* attempt to grab process */
+		ph = Pgrab(atoi(ent->d_name), 0, &gerr);
+		if (ph == NULL)
+			continue;
+
+		if (Psetflags(ph, PR_RLC) == 0 && Pcreate_agent(ph) == 0)
+			found = ph;
+		else
+			Prelease(ph, 0);
+	}
+
+	(void) closedir(procdir);
+	return (found);
+}
+
+/*
+ * Read zone.max-physical-memory through a grabbed zone process.  Returns the
+ * first rctl value, or UINT64_MAX if it cannot be obtained.
+ */
+static uint64_t
+get_zone_cap()
+{
+	struct ps_prochandle *ph;
+	rctlblk_t *rblk;
+	uint64_t mcap = UINT64_MAX;
+
+	rblk = (rctlblk_t *)malloc(rctlblk_size());
+	if (rblk == NULL)
+		return (mcap);
+
+	ph = grab_zone_proc();
+	if (ph != NULL) {
+		/* A zero return from pr_getrctl() means rblk was filled in. */
+		if (pr_getrctl(ph, "zone.max-physical-memory", NULL, rblk,
+		    RCTL_FIRST) == 0)
+			mcap = rctlblk_get_value(rblk);
+
+		/* Always tear down the agent and let go of the process. */
+		Pdestroy_agent(ph);
+		Prelease(ph, 0);
+	}
+
+	free(rblk);
+	return (mcap);
+}
+
+/*
+ * check_suspend is invoked at the beginning of every pass through the process
+ * list or after we've paged out enough so that we think the excess is under
+ * the cap.  The purpose is to periodically check the zone's rss and return
+ * the excess when the zone is over the cap.  The rest of the time this
+ * function will sleep, periodically waking up to check the current rss.
+ *
+ * Depending on the percentage of penetration of the zone's rss into the
+ * cap we sleep for longer or shorter amounts. This reduces the impact of this
+ * work on the system, which is important considering that each zone will be
+ * monitoring its rss.
+ */
+static int64_t
+check_suspend()
+{
+	static hrtime_t last_cap_read = 0;
+	static uint64_t addon;
+	static uint64_t lo_thresh;	/* Thresholds for how long to  sleep */
+	static uint64_t hi_thresh;	/* when under the cap (80% & 90%). */
+	static uint64_t prev_zone_rss = 0;
+	static uint32_t pfdelay = 0;	/* usec page fault delay when over */
+
+	/* Wait a second to give the async pageout a chance to catch up. */
+	(void) sleep_shutdown(1);
+
+	while (!shutting_down) {
+		int64_t new_excess;
+		int sleep_time;
+		hrtime_t now;
+		struct stat st;
+		uint64_t zone_rss;		/* total RSS(KB) */
+
+		/*
+		 * Check if the debug log files exists and enable or disable
+		 * debug.
+		 */
+		if (debug_log_fp == NULL) {
+			if (stat(debug_log, &st) == 0)
+				debug_log_fp = fopen(debug_log, "w");
+		} else {
+			if (stat(debug_log, &st) == -1) {
+				(void) fclose(debug_log_fp);
+				debug_log_fp = NULL;
+			}
+		}
+
+		/*
+		 * If the CAP_REFRESH interval has passed, re-get the current
+		 * cap in case it has been dynamically updated.
+		 */
+		now = gethrtime();
+		if (now - last_cap_read > CAP_REFRESH) {
+			uint64_t mcap;
+
+			last_cap_read = now;
+
+			mcap = get_zone_cap();
+			if (mcap != 0 && mcap != UINT64_MAX)
+				zone_rss_cap = ROUNDUP(mcap, 1024) / 1024;
+			else
+				zone_rss_cap = UINT64_MAX;
+
+			lo_thresh = (uint64_t)(zone_rss_cap * .8);
+			hi_thresh = (uint64_t)(zone_rss_cap * .9);
+			addon = (uint64_t)(zone_rss_cap * 0.05);
+
+			/*
+			 * We allow the memory cap tunables to be changed on
+			 * the fly.
+			 */
+			get_mcap_tunables();
+
+			debug("%s: %s\n", TUNE_CMD, over_cmd);
+			debug("%s: %d\n", TUNE_NVMU, skip_vmusage);
+			debug("%s: %d\n", TUNE_NPAGE, skip_pageout);
+			debug("%s: %d\n", TUNE_NPFTHROT, skip_pf_throttle);
+			debug("current cap %lluKB lo %lluKB hi %lluKB\n",
+			    zone_rss_cap, lo_thresh, hi_thresh);
+		}
+
+		/* No cap, nothing to do. */
+		if (zone_rss_cap == 0 || zone_rss_cap == UINT64_MAX) {
+			debug("no cap, sleep 120 seconds\n");
+			(void) sleep_shutdown(120);
+			continue;
+		}
+
+		zone_rss = get_mem_info();
+
+		/* calculate excess */
+		new_excess = zone_rss - zone_rss_cap;
+
+		debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
+		    zone_rss, zone_rss_cap, new_excess);
+
+		/*
+		 * If necessary, updates stats.
+		 */
+
+		/*
+		 * If it looks like we did some paging out since last over the
+		 * cap then update the kstat so we can approximate how much was
+		 * paged out.
+		 */
+		if (prev_zone_rss > zone_rss_cap && zone_rss < prev_zone_rss) {
+			uint64_t diff;
+
+			/* assume diff is num bytes we paged out */
+			diff = (prev_zone_rss - zone_rss) * 1024;
+
+			(void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT,
+			    &diff, 0);
+		}
+		prev_zone_rss = zone_rss;
+
+		if (new_excess > 0) {
+			uint64_t n = 1;
+
+			/* Increment "nover" kstat. */
+			(void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, &n, 0);
+
+			if (!skip_pf_throttle) {
+				/*
+				 * Tell the kernel to start throttling page
+				 * faults by some number of usecs to help us
+				 * catch up. If we are persistently over the
+				 * cap the delay ramps up to a max of 2000usecs.
+				 * Note that for delays less than 1 tick
+				 * (i.e. all of these) we busy-wait in as_fault.
+				 *	delay	faults/sec
+				 *	 125	8000
+				 *	 250	4000
+				 *	 500	2000
+				 *	1000	1000
+				 *	2000	 500
+				 */
+				if (pfdelay == 0)
+					pfdelay = 125;
+				else if (pfdelay < 2000)
+					pfdelay *= 2;
+
+				(void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
+				    &pfdelay, 0);
+			}
+
+			/*
+			 * Once we go over the cap, then we want to
+			 * page out a little extra instead of stopping
+			 * right at the cap. To do this we add 5% to
+			 * the excess so that pageout_process will work
+			 * a little longer before stopping.
+			 */
+			return ((int64_t)(new_excess + addon));
+		}
+
+		/*
+		 * At this point we are under the cap.
+		 *
+		 * Tell the kernel to stop throttling page faults.
+		 *
+		 * Scale the amount of time we sleep before rechecking the
+		 * zone's memory usage.  Also, scale the acceptable age of
+		 * cached results from vm_getusage.  We do this based on the
+		 * penetration into the capped limit.
+		 */
+		if (pfdelay > 0) {
+			pfdelay = 0;
+			(void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
+			    &pfdelay, 0);
+		}
+
+		if (zone_rss <= lo_thresh) {
+			sleep_time = 120;
+		} else if (zone_rss <= hi_thresh) {
+			sleep_time = 60;
+		} else {
+			sleep_time = 30;
+		}
+
+		debug("sleep %d seconds\n", sleep_time);
+		(void) sleep_shutdown(sleep_time);
+	}
+
+	/* Shutting down, tell the kernel so it doesn't throttle */
+	if (pfdelay > 0) {
+		pfdelay = 0;
+		(void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY, &pfdelay, 0);
+	}
+
+	return (0);
+}
+
+/* Reload over_cmd and the skip_* flags from the zone's zonecfg attributes. */
+static void
+get_mcap_tunables()
+{
+	zone_dochandle_t hdl;
+	struct zone_attrtab ent;
+
+	over_cmd[0] = '\0';
+	if ((hdl = zonecfg_init_handle()) == NULL)
+		return;
+
+	if (zonecfg_get_handle(zone_name, hdl) != Z_OK)
+		goto out;
+
+	/* Defaults first; over_cmd was already emptied on entry above. */
+	skip_pf_throttle = B_FALSE;
+	skip_pageout = B_FALSE;
+	skip_vmusage = B_FALSE;
+
+	if (zonecfg_setattrent(hdl) != Z_OK)
+		goto out;
+	while (zonecfg_getattrent(hdl, &ent) == Z_OK) {
+		const char *name = ent.zone_attr_name;
+		int istrue = (strcmp("true", ent.zone_attr_value) == 0);
+
+		if (strcmp(TUNE_CMD, name) == 0)
+			(void) strlcpy(over_cmd, ent.zone_attr_value,
+			    sizeof (over_cmd));
+		else if (!istrue)
+			continue;
+		else if (strcmp(TUNE_NVMU, name) == 0)
+			skip_vmusage = B_TRUE;
+		else if (strcmp(TUNE_NPAGE, name) == 0)
+			skip_pageout = B_TRUE;
+		else if (strcmp(TUNE_NPFTHROT, name) == 0)
+			skip_pf_throttle = B_TRUE;
+	}
+	(void) zonecfg_endattrent(hdl);
+out:
+	zonecfg_fini_handle(hdl);
+}
+
+/* ARGSUSED */
+static int
+chk_proc_fs(void *data, const char *spec, const char *dir,
+    const char *fstype, const char *opt)
+{
+	/* data is the caller's boolean_t; set it when the fstype is "proc" */
+	if (fstype != NULL && strcmp("proc", fstype) == 0)
+		*(boolean_t *)data = B_TRUE;
+	return (0);
+}
+
+/* B_TRUE if any mount reported for the brand has fstype "proc". */
+static boolean_t
+has_proc()
+{
+	brand_handle_t bh;
+	boolean_t fnd = B_FALSE;
+
+	if ((bh = brand_open(brand_name)) != NULL) {
+		(void) brand_platform_iter_mounts(bh, chk_proc_fs, &fnd);
+		brand_close(bh);	/* only close a handle we really got */
+	}
+	return (fnd);
+}
+
+/*
+ * Fallback loop for zones whose brand provides no /proc: every 5 minutes we
+ * total up the zone's RSS from the cheap GZ /proc data and pass it to the
+ * kernel.
+ */
+static void
+no_procfs()
+{
+	struct dirent		*ent;
+	DIR			*gzproc = NULL;
+	uint64_t		rss_bytes;
+
+	/* Wait 30 seconds before taking the first sample. */
+	(void) sleep_shutdown(30);
+	while (!shutting_down) {
+		/*
+		 * The psinfo_t rss value is per-process, so adding it up is
+		 * fast and cheap but can double count some memory and
+		 * overestimate how much is in use.  The zone has no /proc of
+		 * its own, so we walk the GZ /proc and keep only the
+		 * processes whose zone ID is ours.
+		 */
+		if ((gzproc = opendir("/proc")) == NULL)
+			return;
+
+		fast_rss = 0;
+		while (!shutting_down && (ent = readdir(gzproc)) != NULL) {
+			psinfo_t	info;
+			char		path[MAXPATHLEN];
+			pid_t		pid;
+			int		fd;
+
+			if (strcmp(".", ent->d_name) == 0 ||
+			    strcmp("..", ent->d_name) == 0)
+				continue;
+
+			/* Entries that parse as pid 0 or 1 are not counted. */
+			pid = atoi(ent->d_name);
+			if (pid == 0 || pid == 1)
+				continue;
+
+			(void) snprintf(path, sizeof (path),
+			    "/proc/%d/psinfo", pid);
+			/* A process we cannot open contributes nothing. */
+			if ((fd = open(path, O_RDONLY, 0000)) == -1)
+				continue;
+
+			/* Only a complete psinfo_t from our zone is counted. */
+			if (pread(fd, &info, sizeof (info), 0) ==
+			    sizeof (info) && info.pr_zoneid == zid)
+				fast_rss += (int64_t)info.pr_rssize;
+
+			(void) close(fd);
+		}
+
+		(void) closedir(gzproc);
+
+		if (shutting_down)
+			return;
+
+		/* Use the zone's approx. RSS in the kernel */
+		rss_bytes = fast_rss * 1024;
+		(void) zone_setattr(zid, ZONE_ATTR_RSS, &rss_bytes, 0);
+
+		/* Sleep 300 seconds (5 minutes) until the next sample. */
+		(void) sleep_shutdown(300);
+	}
+}
+
+/*
+ * Thread that checks the zone's memory usage and, when it is over the cap,
+ * walks the zone's process list paging out processes to get back under it.
+ */
+static void
+mcap_zone()
+{
+	DIR *pdir = NULL;	/* zone's /proc, kept open across passes */
+	int64_t excess;		/* how far over the cap we are (KB) */
+
+	debug("thread startup\n");
+
+	get_mcap_tunables();
+
+	/*
+	 * If the zone has no /proc filesystem, we can't walk the zone's
+	 * processes to check RSS or pageout any of them. All we can do is
+	 * periodically update its RSS in the kernel from the GZ /proc data.
+	 */
+	if (!has_proc()) {
+		no_procfs();
+		debug("thread shutdown\n");
+		return;
+	}
+
+	/*
+	 * When first starting it is likely lots of other zones are starting
+	 * too because the system is booting.  Since we just started the zone
+	 * we're not worried about being over the cap right away, so we let
+	 * things settle a bit and tolerate some older data here to minimize
+	 * the load on the system.
+	 */
+	(void) sleep_shutdown(15); /* wait 15 secs. so the zone can get going */
+
+	/* Wait until zone's /proc is mounted */
+	while (!shutting_down) {
+		struct stat st;
+
+		if (stat(zoneproc, &st) == 0 &&
+		    strcmp(st.st_fstype, "proc") == 0)
+			break;
+		(void) sleep_shutdown(5);
+	}
+
+	/* Open zone's /proc, retrying every 5 seconds until it succeeds. */
+	while (!shutting_down) {
+		if ((pdir = opendir(zoneproc)) != NULL)
+			break;
+		(void) sleep_shutdown(5);
+	}
+
+	while (!shutting_down) {
+		struct dirent *dirent;
+
+		/* Wait until we've gone over the cap. */
+		excess = check_suspend();
+
+		debug("starting to scan, excess %lldk\n", (long long)excess);
+
+		if (over_cmd[0] != '\0') {
+			uint64_t zone_rss;	/* total RSS(KB) */
+
+			debug("run phys_mcap_cmd: %s\n", over_cmd);
+			run_over_cmd();
+
+			zone_rss = get_mem_info();
+			excess = zone_rss - zone_rss_cap;
+			debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
+			    zone_rss, zone_rss_cap, excess);
+			if (excess <= 0)
+				continue;
+		}
+
+		while (!shutting_down && (dirent = readdir(pdir)) != NULL) {
+			pid_t pid;
+
+			if (strcmp(".", dirent->d_name) == 0 ||
+			    strcmp("..", dirent->d_name) == 0)
+				continue;
+
+			pid = atoi(dirent->d_name);
+			if (pid == 0 || pid == 1)
+				continue;
+
+			if (skip_pageout)
+				(void) sleep_shutdown(2);
+			else
+				excess = pageout_process(pid, excess);
+
+			if (excess <= 0) {
+				debug("apparently under; excess %lld\n",
+				    (long long)excess);
+				/* Double check the current excess */
+				excess = check_suspend();
+			}
+		}
+
+		debug("process pass done; excess %lld\n", (long long)excess);
+		rewinddir(pdir);
+
+		if (skip_pageout)
+			(void) sleep_shutdown(120);
+	}
+
+	if (pdir != NULL)
+		(void) closedir(pdir);
+	debug("thread shutdown\n");
+}
+
+/* Record the zone's ID and log handle, then start its memory cap thread. */
+void
+create_mcap_thread(zlog_t *zlogp, zoneid_t id)
+{
+	int		res;
+
+	shutting_down = 0;
+	zid = id;
+	logp = zlogp;
+
+	/* lx zones have the native /proc under /native; all others use /proc */
+	if (strcmp(brand_name, "lx") == 0) {
+		(void) snprintf(zoneproc, sizeof (zoneproc),
+		    "%s/root/native/proc", zonepath);
+	} else {
+		(void) snprintf(zoneproc, sizeof (zoneproc), "%s/root/proc",
+		    zonepath);
+	}
+
+	(void) snprintf(debug_log, sizeof (debug_log), "%s/mcap_debug.log",
+	    zonepath);
+
+	if ((res = thr_create(NULL, 0, (void *(*)(void *))mcap_zone, NULL, 0,
+	    &mcap_tid)) != 0) {
+		zerror(zlogp, B_FALSE, "error %d creating memory cap thread",
+		    res);
+		mcap_tid = 0;	/* nothing for destroy_mcap_thread() to join */
+	}
+}
+
+void
+destroy_mcap_thread()
+{
+	if (mcap_tid == 0)	/* no thread was started, or already joined */
+		return;
+	shutting_down = 1;	/* ask mcap_zone()'s loops to wind down */
+	(void) cond_signal(&shutdown_cv);
+	(void) thr_join(mcap_tid, NULL, NULL);
+	mcap_tid = 0;
+}
diff --git a/usr/src/cmd/zoneadmd/vplat.c b/usr/src/cmd/zoneadmd/vplat.c
index 9d32485fad..cdfa7a8785 100644
--- a/usr/src/cmd/zoneadmd/vplat.c
+++ b/usr/src/cmd/zoneadmd/vplat.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, Joyent Inc. All rights reserved.
+ * Copyright 2016, Joyent Inc.
  * Copyright (c) 2015 by Delphix. All rights reserved.
  */
 
@@ -138,6 +138,9 @@
 
 #define	ALT_MOUNT(mount_cmd) 	((mount_cmd) != Z_MNT_BOOT)
 
+/* Number of times to retry unmounting if it fails */
+#define	UMOUNT_RETRIES	30
+
 /* a reasonable estimate for the number of lwps per process */
 #define	LWPS_PER_PROCESS	10
 
@@ -161,11 +164,25 @@ static priv_set_t *zprivs = NULL;
 
 static const char *DFLT_FS_ALLOWED = "hsfs,smbfs,nfs,nfs3,nfs4,nfsdyn";
 
+typedef struct zone_proj_rctl_map {
+	char *zpr_zone_rctl;		/* zone rctl name */
+	char *zpr_project_rctl;		/* project rctl paired with it */
+} zone_proj_rctl_map_t;
+
+static zone_proj_rctl_map_t zone_proj_rctl_map[] = {
+	{"zone.max-msg-ids",	"project.max-msg-ids"},
+	{"zone.max-sem-ids",	"project.max-sem-ids"},
+	{"zone.max-shm-ids",	"project.max-shm-ids"},
+	{"zone.max-shm-memory",	"project.max-shm-memory"},
+	{NULL,			NULL}	/* end-of-table marker */
+};
+
 /* from libsocket, not in any header file */
 extern int getnetmaskbyaddr(struct in_addr, struct in_addr *);
 
 /* from zoneadmd */
 extern char query_hook[];
+extern char post_statechg_hook[];
 
 /*
  * For each "net" resource configured in zonecfg, we track a zone_addr_list_t
@@ -202,7 +219,7 @@ autofs_cleanup(zoneid_t zoneid)
 	/*
 	 * Ask autofs to unmount all trigger nodes in the given zone.
 	 */
-	return (_autofssys(AUTOFS_UNMOUNTALL, (void *)zoneid));
+	return (_autofssys(AUTOFS_UNMOUNTALL, (void *)((uintptr_t)zoneid)));
 }
 
 static void
@@ -593,6 +610,24 @@ root_to_lu(zlog_t *zlogp, char *zroot, size_t zrootlen, boolean_t isresolved)
 }
 
 /*
+ * Perform brand-specific cleanup if we are unable to unmount a FS.
+ */
+static void
+brand_umount_cleanup(zlog_t *zlogp, char *path)
+{
+	char cmdbuf[2 * MAXPATHLEN];
+
+	if (post_statechg_hook[0] == '\0')
+		return;
+	/* snprintf returns the untruncated length; >= size means it was cut */
+	if (snprintf(cmdbuf, sizeof (cmdbuf), "%s %d %d %s", post_statechg_hook,
+	    ZONE_STATE_DOWN, Z_UNMOUNT, path) >= sizeof (cmdbuf))
+		return;
+
+	(void) do_subproc(zlogp, cmdbuf, NULL, B_FALSE);
+}
+
+/*
  * The general strategy for unmounting filesystems is as follows:
  *
  * - Remote filesystems may be dead, and attempting to contact them as
@@ -625,6 +660,7 @@ static int
 unmount_filesystems(zlog_t *zlogp, zoneid_t zoneid, boolean_t unmount_cmd)
 {
 	int error = 0;
+	int fail = 0;
 	FILE *mnttab;
 	struct mnttab *mnts;
 	uint_t nmnt;
@@ -712,18 +748,39 @@ unmount_filesystems(zlog_t *zlogp, zoneid_t zoneid, boolean_t unmount_cmd)
 				if (umount2(path, MS_FORCE) == 0) {
 					unmounted = B_TRUE;
 					stuck = B_FALSE;
+					fail = 0;
 				} else {
 					/*
-					 * The first failure indicates a
-					 * mount we won't be able to get
-					 * rid of automatically, so we
-					 * bail.
+					 * We may hit a failure here if there
+					 * is an app in the GZ with an open
+					 * pipe into the zone (commonly into
+					 * the zone's /var/run).  This type
+					 * of app will notice the closed
+					 * connection and cleanup, but it may
+					 * take a while and we have no easy
+					 * way to notice that.  To deal with
+					 * this case, we will wait and retry
+					 * a few times before we give up.
 					 */
-					error++;
-					zerror(zlogp, B_FALSE,
-					    "unable to unmount '%s'", path);
-					free_mnttable(mnts, nmnt);
-					goto out;
+					fail++;
+					if (fail < (UMOUNT_RETRIES - 1)) {
+						zerror(zlogp, B_FALSE,
+						    "unable to unmount '%s', "
+						    "retrying in 2 seconds",
+						    path);
+						(void) sleep(2);
+					} else if (fail > UMOUNT_RETRIES) {
+						error++;
+						zerror(zlogp, B_FALSE,
+						    "unmount of '%s' failed",
+						    path);
+						free_mnttable(mnts, nmnt);
+						goto out;
+					} else {
+						/* Try the hook 2 times */
+						brand_umount_cleanup(zlogp,
+						    path);
+					}
 				}
 			}
 			/*
@@ -1061,23 +1118,10 @@ mount_one_dev_symlink_cb(void *arg, const char *source, const char *target)
 int
 vplat_get_iptype(zlog_t *zlogp, zone_iptype_t *iptypep)
 {
-	zone_dochandle_t handle;
-
-	if ((handle = zonecfg_init_handle()) == NULL) {
-		zerror(zlogp, B_TRUE, "getting zone configuration handle");
-		return (-1);
-	}
-	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
-		zerror(zlogp, B_FALSE, "invalid configuration");
-		zonecfg_fini_handle(handle);
-		return (-1);
-	}
-	if (zonecfg_get_iptype(handle, iptypep) != Z_OK) {
+	if (zonecfg_get_iptype(snap_hndl, iptypep) != Z_OK) {
 		zerror(zlogp, B_FALSE, "invalid ip-type configuration");
-		zonecfg_fini_handle(handle);
 		return (-1);
 	}
-	zonecfg_fini_handle(handle);
 	return (0);
 }
 
@@ -1090,14 +1134,13 @@ static int
 mount_one_dev(zlog_t *zlogp, char *devpath, zone_mnt_t mount_cmd)
 {
 	char			brand[MAXNAMELEN];
-	zone_dochandle_t	handle = NULL;
 	brand_handle_t		bh = NULL;
 	struct zone_devtab	ztab;
 	di_prof_t		prof = NULL;
 	int			err;
 	int			retval = -1;
 	zone_iptype_t		iptype;
-	const char 		*curr_iptype;
+	const char 		*curr_iptype = NULL;
 
 	if (di_prof_init(devpath, &prof)) {
 		zerror(zlogp, B_TRUE, "failed to initialize profile");
@@ -1132,6 +1175,8 @@ mount_one_dev(zlog_t *zlogp, char *devpath, zone_mnt_t mount_cmd)
 		curr_iptype = "exclusive";
 		break;
 	}
+	if (curr_iptype == NULL)
+		abort();
 
 	if (brand_platform_iter_devices(bh, zone_name,
 	    mount_one_dev_device_cb, prof, curr_iptype) != 0) {
@@ -1146,28 +1191,19 @@ mount_one_dev(zlog_t *zlogp, char *devpath, zone_mnt_t mount_cmd)
 	}
 
 	/* Add user-specified devices and directories */
-	if ((handle = zonecfg_init_handle()) == NULL) {
-		zerror(zlogp, B_FALSE, "can't initialize zone handle");
-		goto cleanup;
-	}
-	if (err = zonecfg_get_handle(zone_name, handle)) {
-		zerror(zlogp, B_FALSE, "can't get handle for zone "
-		    "%s: %s", zone_name, zonecfg_strerror(err));
-		goto cleanup;
-	}
-	if (err = zonecfg_setdevent(handle)) {
+	if ((err = zonecfg_setdevent(snap_hndl)) != 0) {
 		zerror(zlogp, B_FALSE, "%s: %s", zone_name,
 		    zonecfg_strerror(err));
 		goto cleanup;
 	}
-	while (zonecfg_getdevent(handle, &ztab) == Z_OK) {
+	while (zonecfg_getdevent(snap_hndl, &ztab) == Z_OK) {
 		if (di_prof_add_dev(prof, ztab.zone_dev_match)) {
 			zerror(zlogp, B_TRUE, "failed to add "
 			    "user-specified device");
 			goto cleanup;
 		}
 	}
-	(void) zonecfg_enddevent(handle);
+	(void) zonecfg_enddevent(snap_hndl);
 
 	/* Send profile to kernel */
 	if (di_prof_commit(prof)) {
@@ -1180,8 +1216,6 @@ mount_one_dev(zlog_t *zlogp, char *devpath, zone_mnt_t mount_cmd)
 cleanup:
 	if (bh != NULL)
 		brand_close(bh);
-	if (handle != NULL)
-		zonecfg_fini_handle(handle);
 	if (prof)
 		di_prof_fini(prof);
 	return (retval);
@@ -1671,12 +1705,10 @@ static int
 mount_filesystems(zlog_t *zlogp, zone_mnt_t mount_cmd)
 {
 	char rootpath[MAXPATHLEN];
-	char zonepath[MAXPATHLEN];
 	char brand[MAXNAMELEN];
 	char luroot[MAXPATHLEN];
 	int i, num_fs = 0;
 	struct zone_fstab *fs_ptr = NULL;
-	zone_dochandle_t handle = NULL;
 	zone_state_t zstate;
 	brand_handle_t bh;
 	plat_gmount_cb_data_t cb;
@@ -1690,22 +1722,12 @@ mount_filesystems(zlog_t *zlogp, zone_mnt_t mount_cmd)
 		goto bad;
 	}
 
-	if (zone_get_zonepath(zone_name, zonepath, sizeof (zonepath)) != Z_OK) {
-		zerror(zlogp, B_TRUE, "unable to determine zone path");
-		goto bad;
-	}
-
 	if (zone_get_rootpath(zone_name, rootpath, sizeof (rootpath)) != Z_OK) {
 		zerror(zlogp, B_TRUE, "unable to determine zone root");
 		goto bad;
 	}
 
-	if ((handle = zonecfg_init_handle()) == NULL) {
-		zerror(zlogp, B_TRUE, "getting zone configuration handle");
-		goto bad;
-	}
-	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK ||
-	    zonecfg_setfsent(handle) != Z_OK) {
+	if (zonecfg_setfsent(snap_hndl) != Z_OK) {
 		zerror(zlogp, B_FALSE, "invalid configuration");
 		goto bad;
 	}
@@ -1723,7 +1745,6 @@ mount_filesystems(zlog_t *zlogp, zone_mnt_t mount_cmd)
 	/* Get a handle to the brand info for this zone */
 	if ((bh = brand_open(brand)) == NULL) {
 		zerror(zlogp, B_FALSE, "unable to determine zone brand");
-		zonecfg_fini_handle(handle);
 		return (-1);
 	}
 
@@ -1734,11 +1755,10 @@ mount_filesystems(zlog_t *zlogp, zone_mnt_t mount_cmd)
 	cb.pgcd_zlogp = zlogp;
 	cb.pgcd_fs_tab = &fs_ptr;
 	cb.pgcd_num_fs = &num_fs;
-	if (brand_platform_iter_gmounts(bh, zonepath,
+	if (brand_platform_iter_gmounts(bh, zone_name, zonepath,
 	    plat_gmount_cb, &cb) != 0) {
 		zerror(zlogp, B_FALSE, "unable to mount filesystems");
 		brand_close(bh);
-		zonecfg_fini_handle(handle);
 		return (-1);
 	}
 	brand_close(bh);
@@ -1749,13 +1769,10 @@ mount_filesystems(zlog_t *zlogp, zone_mnt_t mount_cmd)
 	 * higher level directories (e.g., /usr) get mounted before
 	 * any beneath them (e.g., /usr/local).
 	 */
-	if (mount_filesystems_fsent(handle, zlogp, &fs_ptr, &num_fs,
+	if (mount_filesystems_fsent(snap_hndl, zlogp, &fs_ptr, &num_fs,
 	    mount_cmd) != 0)
 		goto bad;
 
-	zonecfg_fini_handle(handle);
-	handle = NULL;
-
 	/*
 	 * Normally when we mount a zone all the zone filesystems
 	 * get mounted relative to rootpath, which is usually
@@ -1834,8 +1851,6 @@ mount_filesystems(zlog_t *zlogp, zone_mnt_t mount_cmd)
 	return (0);
 
 bad:
-	if (handle != NULL)
-		zonecfg_fini_handle(handle);
 	free_fs_data(fs_ptr, num_fs);
 	return (-1);
 }
@@ -2191,13 +2206,7 @@ configure_one_interface(zlog_t *zlogp, zoneid_t zone_id,
 	if (ioctl(s, SIOCLIFADDIF, (caddr_t)&lifr) < 0) {
 		/*
 		 * Here, we know that the interface can't be brought up.
-		 * A similar warning message was already printed out to
-		 * the console by zoneadm(1M) so instead we log the
-		 * message to syslog and continue.
 		 */
-		zerror(&logsys, B_TRUE, "WARNING: skipping network interface "
-		    "'%s' which may not be present/plumbed in the "
-		    "global zone.", lifr.lifr_name);
 		(void) close(s);
 		return (Z_OK);
 	}
@@ -2410,7 +2419,6 @@ bad:
 static int
 configure_shared_network_interfaces(zlog_t *zlogp)
 {
-	zone_dochandle_t handle;
 	struct zone_nwiftab nwiftab, loopback_iftab;
 	zoneid_t zoneid;
 
@@ -2419,29 +2427,19 @@ configure_shared_network_interfaces(zlog_t *zlogp)
 		return (-1);
 	}
 
-	if ((handle = zonecfg_init_handle()) == NULL) {
-		zerror(zlogp, B_TRUE, "getting zone configuration handle");
-		return (-1);
-	}
-	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
-		zerror(zlogp, B_FALSE, "invalid configuration");
-		zonecfg_fini_handle(handle);
-		return (-1);
-	}
-	if (zonecfg_setnwifent(handle) == Z_OK) {
+	if (zonecfg_setnwifent(snap_hndl) == Z_OK) {
 		for (;;) {
-			if (zonecfg_getnwifent(handle, &nwiftab) != Z_OK)
+			if (zonecfg_getnwifent(snap_hndl, &nwiftab) != Z_OK)
 				break;
+			nwifent_free_attrs(&nwiftab);
 			if (configure_one_interface(zlogp, zoneid, &nwiftab) !=
 			    Z_OK) {
-				(void) zonecfg_endnwifent(handle);
-				zonecfg_fini_handle(handle);
+				(void) zonecfg_endnwifent(snap_hndl);
 				return (-1);
 			}
 		}
-		(void) zonecfg_endnwifent(handle);
+		(void) zonecfg_endnwifent(snap_hndl);
 	}
-	zonecfg_fini_handle(handle);
 	if (is_system_labeled()) {
 		/*
 		 * Labeled zones share the loopback interface
@@ -2895,7 +2893,6 @@ free_ip_interface(zone_addr_list_t *zalist)
 static int
 configure_exclusive_network_interfaces(zlog_t *zlogp, zoneid_t zoneid)
 {
-	zone_dochandle_t handle;
 	struct zone_nwiftab nwiftab;
 	char rootpath[MAXPATHLEN];
 	char path[MAXPATHLEN];
@@ -2904,30 +2901,18 @@ configure_exclusive_network_interfaces(zlog_t *zlogp, zoneid_t zoneid)
 	boolean_t added = B_FALSE;
 	zone_addr_list_t *zalist = NULL, *new;
 
-	if ((handle = zonecfg_init_handle()) == NULL) {
-		zerror(zlogp, B_TRUE, "getting zone configuration handle");
-		return (-1);
-	}
-	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
-		zerror(zlogp, B_FALSE, "invalid configuration");
-		zonecfg_fini_handle(handle);
-		return (-1);
-	}
-
-	if (zonecfg_setnwifent(handle) != Z_OK) {
-		zonecfg_fini_handle(handle);
+	if (zonecfg_setnwifent(snap_hndl) != Z_OK)
 		return (0);
-	}
 
 	for (;;) {
-		if (zonecfg_getnwifent(handle, &nwiftab) != Z_OK)
+		if (zonecfg_getnwifent(snap_hndl, &nwiftab) != Z_OK)
 			break;
 
+		nwifent_free_attrs(&nwiftab);
 		if (prof == NULL) {
 			if (zone_get_devroot(zone_name, rootpath,
 			    sizeof (rootpath)) != Z_OK) {
-				(void) zonecfg_endnwifent(handle);
-				zonecfg_fini_handle(handle);
+				(void) zonecfg_endnwifent(snap_hndl);
 				zerror(zlogp, B_TRUE,
 				    "unable to determine dev root");
 				return (-1);
@@ -2935,8 +2920,7 @@ configure_exclusive_network_interfaces(zlog_t *zlogp, zoneid_t zoneid)
 			(void) snprintf(path, sizeof (path), "%s%s", rootpath,
 			    "/dev");
 			if (di_prof_init(path, &prof) != 0) {
-				(void) zonecfg_endnwifent(handle);
-				zonecfg_fini_handle(handle);
+				(void) zonecfg_endnwifent(snap_hndl);
 				zerror(zlogp, B_TRUE,
 				    "failed to initialize profile");
 				return (-1);
@@ -2960,17 +2944,17 @@ configure_exclusive_network_interfaces(zlog_t *zlogp, zoneid_t zoneid)
 		    nwiftab.zone_nwif_physical) == 0) {
 			added = B_TRUE;
 		} else {
-			(void) zonecfg_endnwifent(handle);
-			zonecfg_fini_handle(handle);
-			zerror(zlogp, B_TRUE, "failed to add network device");
-			return (-1);
+			/*
+			 * Failed to add network device, but the brand hook
+			 * might be doing this for us, so keep silent.
+			 */
+			continue;
 		}
 		/* set up the new IP interface, and add them all later */
 		new = malloc(sizeof (*new));
 		if (new == NULL) {
 			zerror(zlogp, B_TRUE, "no memory for %s",
 			    nwiftab.zone_nwif_physical);
-			zonecfg_fini_handle(handle);
 			free_ip_interface(zalist);
 		}
 		bzero(new, sizeof (*new));
@@ -2980,16 +2964,14 @@ configure_exclusive_network_interfaces(zlog_t *zlogp, zoneid_t zoneid)
 	}
 	if (zalist != NULL) {
 		if ((errno = add_net(zlogp, zoneid, zalist)) != 0) {
-			(void) zonecfg_endnwifent(handle);
-			zonecfg_fini_handle(handle);
+			(void) zonecfg_endnwifent(snap_hndl);
 			zerror(zlogp, B_TRUE, "failed to add address");
 			free_ip_interface(zalist);
 			return (-1);
 		}
 		free_ip_interface(zalist);
 	}
-	(void) zonecfg_endnwifent(handle);
-	zonecfg_fini_handle(handle);
+	(void) zonecfg_endnwifent(snap_hndl);
 
 	if (prof != NULL && added) {
 		if (di_prof_commit(prof) != 0) {
@@ -3125,48 +3107,23 @@ remove_datalink_protect(zlog_t *zlogp, zoneid_t zoneid)
 			/* datalink does not belong to the GZ */
 			continue;
 		}
-		if (dlstatus != DLADM_STATUS_OK) {
+		if (dlstatus != DLADM_STATUS_OK)
 			zerror(zlogp, B_FALSE,
+			    "clear 'protection' link property: %s",
 			    dladm_status2str(dlstatus, dlerr));
-			free(dllinks);
-			return (-1);
-		}
+
 		dlstatus = dladm_set_linkprop(dld_handle, *dllink,
 		    "allowed-ips", NULL, 0, DLADM_OPT_ACTIVE);
-		if (dlstatus != DLADM_STATUS_OK) {
+		if (dlstatus != DLADM_STATUS_OK)
 			zerror(zlogp, B_FALSE,
+			    "clear 'allowed-ips' link property: %s",
 			    dladm_status2str(dlstatus, dlerr));
-			free(dllinks);
-			return (-1);
-		}
 	}
 	free(dllinks);
 	return (0);
 }
 
 static int
-unconfigure_exclusive_network_interfaces(zlog_t *zlogp, zoneid_t zoneid)
-{
-	int dlnum = 0;
-
-	/*
-	 * The kernel shutdown callback for the dls module should have removed
-	 * all datalinks from this zone.  If any remain, then there's a
-	 * problem.
-	 */
-	if (zone_list_datalink(zoneid, &dlnum, NULL) != 0) {
-		zerror(zlogp, B_TRUE, "unable to list network interfaces");
-		return (-1);
-	}
-	if (dlnum != 0) {
-		zerror(zlogp, B_FALSE,
-		    "datalinks remain in zone after shutdown");
-		return (-1);
-	}
-	return (0);
-}
-
-static int
 tcp_abort_conn(zlog_t *zlogp, zoneid_t zoneid,
     const struct sockaddr_storage *local, const struct sockaddr_storage *remote)
 {
@@ -3248,26 +3205,14 @@ static int
 get_privset(zlog_t *zlogp, priv_set_t *privs, zone_mnt_t mount_cmd)
 {
 	int error = -1;
-	zone_dochandle_t handle;
 	char *privname = NULL;
 
-	if ((handle = zonecfg_init_handle()) == NULL) {
-		zerror(zlogp, B_TRUE, "getting zone configuration handle");
-		return (-1);
-	}
-	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
-		zerror(zlogp, B_FALSE, "invalid configuration");
-		zonecfg_fini_handle(handle);
-		return (-1);
-	}
-
 	if (ALT_MOUNT(mount_cmd)) {
 		zone_iptype_t	iptype;
-		const char	*curr_iptype;
+		const char	*curr_iptype = NULL;
 
-		if (zonecfg_get_iptype(handle, &iptype) != Z_OK) {
+		if (zonecfg_get_iptype(snap_hndl, &iptype) != Z_OK) {
 			zerror(zlogp, B_TRUE, "unable to determine ip-type");
-			zonecfg_fini_handle(handle);
 			return (-1);
 		}
 
@@ -3280,17 +3225,15 @@ get_privset(zlog_t *zlogp, priv_set_t *privs, zone_mnt_t mount_cmd)
 			break;
 		}
 
-		if (zonecfg_default_privset(privs, curr_iptype) == Z_OK) {
-			zonecfg_fini_handle(handle);
+		if (zonecfg_default_privset(privs, curr_iptype) == Z_OK)
 			return (0);
-		}
+
 		zerror(zlogp, B_FALSE,
 		    "failed to determine the zone's default privilege set");
-		zonecfg_fini_handle(handle);
 		return (-1);
 	}
 
-	switch (zonecfg_get_privset(handle, privs, &privname)) {
+	switch (zonecfg_get_privset(snap_hndl, privs, &privname)) {
 	case Z_OK:
 		error = 0;
 		break;
@@ -3313,10 +3256,22 @@ get_privset(zlog_t *zlogp, priv_set_t *privs, zone_mnt_t mount_cmd)
 	}
 
 	free(privname);
-	zonecfg_fini_handle(handle);
 	return (error);
 }
 
+static char *
+zone_proj_rctl(const char *name)
+{
+	zone_proj_rctl_map_t *ent;
+
+	/* Walk the NULL-terminated map; NULL means no project counterpart. */
+	for (ent = zone_proj_rctl_map; ent->zpr_zone_rctl != NULL; ent++) {
+		if (strcmp(name, ent->zpr_zone_rctl) == 0)
+			return (ent->zpr_project_rctl);
+	}
+	return (NULL);
+}
+
 static int
 get_rctls(zlog_t *zlogp, char **bufp, size_t *bufsizep)
 {
@@ -3326,25 +3281,15 @@ get_rctls(zlog_t *zlogp, char **bufp, size_t *bufsizep)
 	nvlist_t **nvlv = NULL;
 	int rctlcount = 0;
 	int error = -1;
-	zone_dochandle_t handle;
 	struct zone_rctltab rctltab;
 	rctlblk_t *rctlblk = NULL;
 	uint64_t maxlwps;
 	uint64_t maxprocs;
+	int rproc, rlwp;
 
 	*bufp = NULL;
 	*bufsizep = 0;
 
-	if ((handle = zonecfg_init_handle()) == NULL) {
-		zerror(zlogp, B_TRUE, "getting zone configuration handle");
-		return (-1);
-	}
-	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
-		zerror(zlogp, B_FALSE, "invalid configuration");
-		zonecfg_fini_handle(handle);
-		return (-1);
-	}
-
 	rctltab.zone_rctl_valptr = NULL;
 	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) {
 		zerror(zlogp, B_TRUE, "%s failed", "nvlist_alloc");
@@ -3353,22 +3298,31 @@ get_rctls(zlog_t *zlogp, char **bufp, size_t *bufsizep)
 
 	/*
 	 * Allow the administrator to control both the maximum number of
-	 * process table slots and the maximum number of lwps with just the
-	 * max-processes property.  If only the max-processes property is set,
-	 * we add a max-lwps property with a limit derived from max-processes.
+	 * process table slots, and the maximum number of lwps, with a single
+	 * max-processes or max-lwps property. If only the max-processes
+	 * property is set, we add a max-lwps property with a limit derived
+	 * from max-processes. If only the max-lwps property is set, we add a
+	 * max-processes property with the same limit as max-lwps.
 	 */
-	if (zonecfg_get_aliased_rctl(handle, ALIAS_MAXPROCS, &maxprocs)
-	    == Z_OK &&
-	    zonecfg_get_aliased_rctl(handle, ALIAS_MAXLWPS, &maxlwps)
-	    == Z_NO_ENTRY) {
-		if (zonecfg_set_aliased_rctl(handle, ALIAS_MAXLWPS,
+	rproc = zonecfg_get_aliased_rctl(snap_hndl, ALIAS_MAXPROCS, &maxprocs);
+	rlwp = zonecfg_get_aliased_rctl(snap_hndl, ALIAS_MAXLWPS, &maxlwps);
+	if (rproc == Z_OK && rlwp == Z_NO_ENTRY) {
+		if (zonecfg_set_aliased_rctl(snap_hndl, ALIAS_MAXLWPS,
 		    maxprocs * LWPS_PER_PROCESS) != Z_OK) {
 			zerror(zlogp, B_FALSE, "unable to set max-lwps alias");
 			goto out;
 		}
+	} else if (rlwp == Z_OK && rproc == Z_NO_ENTRY) {
+		/* no scaling for max-proc value */
+		if (zonecfg_set_aliased_rctl(snap_hndl, ALIAS_MAXPROCS,
+		    maxlwps) != Z_OK) {
+			zerror(zlogp, B_FALSE,
+			    "unable to set max-processes alias");
+			goto out;
+		}
 	}
 
-	if (zonecfg_setrctlent(handle) != Z_OK) {
+	if (zonecfg_setrctlent(snap_hndl) != Z_OK) {
 		zerror(zlogp, B_FALSE, "%s failed", "zonecfg_setrctlent");
 		goto out;
 	}
@@ -3377,10 +3331,11 @@ get_rctls(zlog_t *zlogp, char **bufp, size_t *bufsizep)
 		zerror(zlogp, B_TRUE, "memory allocation failed");
 		goto out;
 	}
-	while (zonecfg_getrctlent(handle, &rctltab) == Z_OK) {
+	while (zonecfg_getrctlent(snap_hndl, &rctltab) == Z_OK) {
 		struct zone_rctlvaltab *rctlval;
 		uint_t i, count;
 		const char *name = rctltab.zone_rctl_name;
+		char *proj_nm;
 
 		/* zoneadm should have already warned about unknown rctls. */
 		if (!zonecfg_is_rctl(name)) {
@@ -3447,6 +3402,26 @@ get_rctls(zlog_t *zlogp, char **bufp, size_t *bufsizep)
 		}
 		zonecfg_free_rctl_value_list(rctltab.zone_rctl_valptr);
 		rctltab.zone_rctl_valptr = NULL;
+
+		/*
+		 * With no action on our part we will start zsched with the
+		 * project rctl values for our (zoneadmd) current project. For
+		 * brands running a variant of illumos, that's not a problem
+		 * since they will set up their own projects, but for a
+		 * non-native brand like lx, where there are no projects, we
+		 * want to start things up with the same project rctls as the
+		 * corresponding zone rctls, since nothing within the zone will
+		 * ever change the project rctls.
+		 */
+		if ((proj_nm = zone_proj_rctl(name)) != NULL) {
+			if (nvlist_add_nvlist_array(nvl, proj_nm, nvlv, count)
+			    != 0) {
+				zerror(zlogp, B_FALSE,
+				    "nvlist_add_nvlist_arrays failed");
+				goto out;
+			}
+		}
+
 		if (nvlist_add_nvlist_array(nvl, (char *)name, nvlv, count)
 		    != 0) {
 			zerror(zlogp, B_FALSE, "%s failed",
@@ -3459,7 +3434,7 @@ get_rctls(zlog_t *zlogp, char **bufp, size_t *bufsizep)
 		nvlv = NULL;
 		rctlcount++;
 	}
-	(void) zonecfg_endrctlent(handle);
+	(void) zonecfg_endrctlent(snap_hndl);
 
 	if (rctlcount == 0) {
 		error = 0;
@@ -3483,8 +3458,6 @@ out:
 	nvlist_free(nvl);
 	if (nvlv != NULL)
 		free(nvlv);
-	if (handle != NULL)
-		zonecfg_fini_handle(handle);
 	return (error);
 }
 
@@ -3500,7 +3473,7 @@ get_implicit_datasets(zlog_t *zlogp, char **retstr)
 	    > sizeof (cmdbuf))
 		return (-1);
 
-	if (do_subproc(zlogp, cmdbuf, retstr) != 0)
+	if (do_subproc(zlogp, cmdbuf, retstr, B_FALSE) != 0)
 		return (-1);
 
 	return (0);
@@ -3509,7 +3482,6 @@ get_implicit_datasets(zlog_t *zlogp, char **retstr)
 static int
 get_datasets(zlog_t *zlogp, char **bufp, size_t *bufsizep)
 {
-	zone_dochandle_t handle;
 	struct zone_dstab dstab;
 	size_t total, offset, len;
 	int error = -1;
@@ -3520,30 +3492,20 @@ get_datasets(zlog_t *zlogp, char **bufp, size_t *bufsizep)
 	*bufp = NULL;
 	*bufsizep = 0;
 
-	if ((handle = zonecfg_init_handle()) == NULL) {
-		zerror(zlogp, B_TRUE, "getting zone configuration handle");
-		return (-1);
-	}
-	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
-		zerror(zlogp, B_FALSE, "invalid configuration");
-		zonecfg_fini_handle(handle);
-		return (-1);
-	}
-
 	if (get_implicit_datasets(zlogp, &implicit_datasets) != 0) {
 		zerror(zlogp, B_FALSE, "getting implicit datasets failed");
 		goto out;
 	}
 
-	if (zonecfg_setdsent(handle) != Z_OK) {
+	if (zonecfg_setdsent(snap_hndl) != Z_OK) {
 		zerror(zlogp, B_FALSE, "%s failed", "zonecfg_setdsent");
 		goto out;
 	}
 
 	total = 0;
-	while (zonecfg_getdsent(handle, &dstab) == Z_OK)
+	while (zonecfg_getdsent(snap_hndl, &dstab) == Z_OK)
 		total += strlen(dstab.zone_dataset_name) + 1;
-	(void) zonecfg_enddsent(handle);
+	(void) zonecfg_enddsent(snap_hndl);
 
 	if (implicit_datasets != NULL)
 		implicit_len = strlen(implicit_datasets);
@@ -3560,12 +3522,12 @@ get_datasets(zlog_t *zlogp, char **bufp, size_t *bufsizep)
 		goto out;
 	}
 
-	if (zonecfg_setdsent(handle) != Z_OK) {
+	if (zonecfg_setdsent(snap_hndl) != Z_OK) {
 		zerror(zlogp, B_FALSE, "%s failed", "zonecfg_setdsent");
 		goto out;
 	}
 	offset = 0;
-	while (zonecfg_getdsent(handle, &dstab) == Z_OK) {
+	while (zonecfg_getdsent(snap_hndl, &dstab) == Z_OK) {
 		len = strlen(dstab.zone_dataset_name);
 		(void) strlcpy(str + offset, dstab.zone_dataset_name,
 		    total - offset);
@@ -3573,7 +3535,7 @@ get_datasets(zlog_t *zlogp, char **bufp, size_t *bufsizep)
 		if (offset < total - 1)
 			str[offset++] = ',';
 	}
-	(void) zonecfg_enddsent(handle);
+	(void) zonecfg_enddsent(snap_hndl);
 
 	if (implicit_len > 0)
 		(void) strlcpy(str + offset, implicit_datasets, total - offset);
@@ -3585,8 +3547,6 @@ get_datasets(zlog_t *zlogp, char **bufp, size_t *bufsizep)
 out:
 	if (error != 0 && str != NULL)
 		free(str);
-	if (handle != NULL)
-		zonecfg_fini_handle(handle);
 	if (implicit_datasets != NULL)
 		free(implicit_datasets);
 
@@ -3596,40 +3556,26 @@ out:
 static int
 validate_datasets(zlog_t *zlogp)
 {
-	zone_dochandle_t handle;
 	struct zone_dstab dstab;
 	zfs_handle_t *zhp;
 	libzfs_handle_t *hdl;
 
-	if ((handle = zonecfg_init_handle()) == NULL) {
-		zerror(zlogp, B_TRUE, "getting zone configuration handle");
-		return (-1);
-	}
-	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
-		zerror(zlogp, B_FALSE, "invalid configuration");
-		zonecfg_fini_handle(handle);
-		return (-1);
-	}
-
-	if (zonecfg_setdsent(handle) != Z_OK) {
+	if (zonecfg_setdsent(snap_hndl) != Z_OK) {
 		zerror(zlogp, B_FALSE, "invalid configuration");
-		zonecfg_fini_handle(handle);
 		return (-1);
 	}
 
 	if ((hdl = libzfs_init()) == NULL) {
 		zerror(zlogp, B_FALSE, "opening ZFS library");
-		zonecfg_fini_handle(handle);
 		return (-1);
 	}
 
-	while (zonecfg_getdsent(handle, &dstab) == Z_OK) {
+	while (zonecfg_getdsent(snap_hndl, &dstab) == Z_OK) {
 
 		if ((zhp = zfs_open(hdl, dstab.zone_dataset_name,
 		    ZFS_TYPE_FILESYSTEM)) == NULL) {
 			zerror(zlogp, B_FALSE, "cannot open ZFS dataset '%s'",
 			    dstab.zone_dataset_name);
-			zonecfg_fini_handle(handle);
 			libzfs_fini(hdl);
 			return (-1);
 		}
@@ -3644,7 +3590,6 @@ validate_datasets(zlog_t *zlogp)
 			zerror(zlogp, B_FALSE, "cannot set 'zoned' "
 			    "property for ZFS dataset '%s'\n",
 			    dstab.zone_dataset_name);
-			zonecfg_fini_handle(handle);
 			zfs_close(zhp);
 			libzfs_fini(hdl);
 			return (-1);
@@ -3652,9 +3597,8 @@ validate_datasets(zlog_t *zlogp)
 
 		zfs_close(zhp);
 	}
-	(void) zonecfg_enddsent(handle);
+	(void) zonecfg_enddsent(snap_hndl);
 
-	zonecfg_fini_handle(handle);
 	libzfs_fini(hdl);
 
 	return (0);
@@ -3708,17 +3652,11 @@ validate_rootds_label(zlog_t *zlogp, char *rootpath, m_label_t *zone_sl)
 	zfs_handle_t	*zhp;
 	libzfs_handle_t	*hdl;
 	m_label_t	ds_sl;
-	char		zonepath[MAXPATHLEN];
 	char		ds_hexsl[MAXNAMELEN];
 
 	if (!is_system_labeled())
 		return (0);
 
-	if (zone_get_zonepath(zone_name, zonepath, sizeof (zonepath)) != Z_OK) {
-		zerror(zlogp, B_TRUE, "unable to determine zone path");
-		return (-1);
-	}
-
 	if (!is_zonepath_zfs(zonepath))
 		return (0);
 
@@ -4389,62 +4327,52 @@ duplicate_reachable_path(zlog_t *zlogp, const char *rootpath)
 }
 
 /*
- * Set memory cap and pool info for the zone's resource management
- * configuration.
+ * Set pool info for the zone's resource management configuration.
  */
 static int
 setup_zone_rm(zlog_t *zlogp, char *zone_name, zoneid_t zoneid)
 {
 	int res;
 	uint64_t tmp;
-	struct zone_mcaptab mcap;
 	char sched[MAXNAMELEN];
-	zone_dochandle_t handle = NULL;
 	char pool_err[128];
 
-	if ((handle = zonecfg_init_handle()) == NULL) {
-		zerror(zlogp, B_TRUE, "getting zone configuration handle");
-		return (Z_BAD_HANDLE);
-	}
-
-	if ((res = zonecfg_get_snapshot_handle(zone_name, handle)) != Z_OK) {
-		zerror(zlogp, B_FALSE, "invalid configuration");
-		zonecfg_fini_handle(handle);
-		return (res);
-	}
-
-	/*
-	 * If a memory cap is configured, set the cap in the kernel using
-	 * zone_setattr() and make sure the rcapd SMF service is enabled.
-	 */
-	if (zonecfg_getmcapent(handle, &mcap) == Z_OK) {
-		uint64_t num;
-		char smf_err[128];
-
-		num = (uint64_t)strtoull(mcap.zone_physmem_cap, NULL, 10);
-		if (zone_setattr(zoneid, ZONE_ATTR_PHYS_MCAP, &num, 0) == -1) {
-			zerror(zlogp, B_TRUE, "could not set zone memory cap");
-			zonecfg_fini_handle(handle);
-			return (Z_INVAL);
-		}
-
-		if (zonecfg_enable_rcapd(smf_err, sizeof (smf_err)) != Z_OK) {
-			zerror(zlogp, B_FALSE, "enabling system/rcap service "
-			    "failed: %s", smf_err);
-			zonecfg_fini_handle(handle);
-			return (Z_INVAL);
-		}
-	}
-
 	/* Get the scheduling class set in the zone configuration. */
-	if (zonecfg_get_sched_class(handle, sched, sizeof (sched)) == Z_OK &&
+	if (zonecfg_get_sched_class(snap_hndl, sched, sizeof (sched)) == Z_OK &&
 	    strlen(sched) > 0) {
 		if (zone_setattr(zoneid, ZONE_ATTR_SCHED_CLASS, sched,
 		    strlen(sched)) == -1)
 			zerror(zlogp, B_TRUE, "WARNING: unable to set the "
 			    "default scheduling class");
 
-	} else if (zonecfg_get_aliased_rctl(handle, ALIAS_SHARES, &tmp)
+		if (strcmp(sched, "FX") == 0) {
+			/*
+			 * When FX is specified then by default all processes
+			 * will start at the lowest priority level (0) and
+			 * stay there. We support an optional attr which
+			 * indicates that all the processes should be "high
+			 * priority". We set this on the zone so that starting
+			 * init will set the priority high.
+			 */
+			struct zone_attrtab a;
+
+			bzero(&a, sizeof (a));
+			(void) strlcpy(a.zone_attr_name, "fixed-hi-prio",
+			    sizeof (a.zone_attr_name));
+
+			if (zonecfg_lookup_attr(snap_hndl, &a) == Z_OK &&
+			    strcmp(a.zone_attr_value, "true") == 0) {
+				boolean_t hi = B_TRUE;
+
+				if (zone_setattr(zoneid,
+				    ZONE_ATTR_SCHED_FIXEDHI, (void *)hi,
+				    sizeof (hi)) == -1)
+					zerror(zlogp, B_TRUE, "WARNING: unable "
+					    "to set high priority");
+			}
+		}
+
+	} else if (zonecfg_get_aliased_rctl(snap_hndl, ALIAS_SHARES, &tmp)
 	    == Z_OK) {
 		/*
 		 * If the zone has the zone.cpu-shares rctl set then we want to
@@ -4455,7 +4383,7 @@ setup_zone_rm(zlog_t *zlogp, char *zone_name, zoneid_t zoneid)
 		 */
 		char class_name[PC_CLNMSZ];
 
-		if (zonecfg_get_dflt_sched_class(handle, class_name,
+		if (zonecfg_get_dflt_sched_class(snap_hndl, class_name,
 		    sizeof (class_name)) != Z_OK) {
 			zerror(zlogp, B_FALSE, "WARNING: unable to determine "
 			    "the zone's scheduling class");
@@ -4488,7 +4416,7 @@ setup_zone_rm(zlog_t *zlogp, char *zone_name, zoneid_t zoneid)
 	 * right thing in all cases (reuse or create) based on the current
 	 * zonecfg.
 	 */
-	if ((res = zonecfg_bind_tmp_pool(handle, zoneid, pool_err,
+	if ((res = zonecfg_bind_tmp_pool(snap_hndl, zoneid, pool_err,
 	    sizeof (pool_err))) != Z_OK) {
 		if (res == Z_POOL || res == Z_POOL_CREATE || res == Z_POOL_BIND)
 			zerror(zlogp, B_FALSE, "%s: %s\ndedicated-cpu setting "
@@ -4497,14 +4425,13 @@ setup_zone_rm(zlog_t *zlogp, char *zone_name, zoneid_t zoneid)
 		else
 			zerror(zlogp, B_FALSE, "could not bind zone to "
 			    "temporary pool: %s", zonecfg_strerror(res));
-		zonecfg_fini_handle(handle);
 		return (Z_POOL_BIND);
 	}
 
 	/*
 	 * Check if we need to warn about poold not being enabled.
 	 */
-	if (zonecfg_warn_poold(handle)) {
+	if (zonecfg_warn_poold(snap_hndl)) {
 		zerror(zlogp, B_FALSE, "WARNING: A range of dedicated-cpus has "
 		    "been specified\nbut the dynamic pool service is not "
 		    "enabled.\nThe system will not dynamically adjust the\n"
@@ -4514,7 +4441,7 @@ setup_zone_rm(zlog_t *zlogp, char *zone_name, zoneid_t zoneid)
 	}
 
 	/* The following is a warning, not an error. */
-	if ((res = zonecfg_bind_pool(handle, zoneid, pool_err,
+	if ((res = zonecfg_bind_pool(snap_hndl, zoneid, pool_err,
 	    sizeof (pool_err))) != Z_OK) {
 		if (res == Z_POOL_BIND)
 			zerror(zlogp, B_FALSE, "WARNING: unable to bind to "
@@ -4528,10 +4455,9 @@ setup_zone_rm(zlog_t *zlogp, char *zone_name, zoneid_t zoneid)
 	}
 
 	/* Update saved pool name in case it has changed */
-	(void) zonecfg_get_poolname(handle, zone_name, pool_name,
+	(void) zonecfg_get_poolname(snap_hndl, zone_name, pool_name,
 	    sizeof (pool_name));
 
-	zonecfg_fini_handle(handle);
 	return (Z_OK);
 }
 
@@ -4632,33 +4558,28 @@ setup_zone_fs_allowed(zone_dochandle_t handle, zlog_t *zlogp, zoneid_t zoneid)
 }
 
 static int
-setup_zone_attrs(zlog_t *zlogp, char *zone_namep, zoneid_t zoneid)
+setup_zone_attrs(zlog_t *zlogp, zoneid_t zoneid)
 {
-	zone_dochandle_t handle;
 	int res = Z_OK;
 
-	if ((handle = zonecfg_init_handle()) == NULL) {
-		zerror(zlogp, B_TRUE, "getting zone configuration handle");
-		return (Z_BAD_HANDLE);
-	}
-	if ((res = zonecfg_get_snapshot_handle(zone_namep, handle)) != Z_OK) {
-		zerror(zlogp, B_FALSE, "invalid configuration");
-		goto out;
-	}
-
-	if ((res = setup_zone_hostid(handle, zlogp, zoneid)) != Z_OK)
+	if ((res = setup_zone_hostid(snap_hndl, zlogp, zoneid)) != Z_OK)
 		goto out;
 
-	if ((res = setup_zone_fs_allowed(handle, zlogp, zoneid)) != Z_OK)
+	if ((res = setup_zone_fs_allowed(snap_hndl, zlogp, zoneid)) != Z_OK)
 		goto out;
 
 out:
-	zonecfg_fini_handle(handle);
 	return (res);
 }
 
+/*
+ * The zone_did is a persistent debug ID.  Each zone should have a unique ID
+ * in the kernel.  This is used for things like DTrace which want to monitor
+ * zones across reboots.  They can't use the zoneid since that changes on
+ * each boot.
+ */
 zoneid_t
-vplat_create(zlog_t *zlogp, zone_mnt_t mount_cmd)
+vplat_create(zlog_t *zlogp, zone_mnt_t mount_cmd, zoneid_t zone_did)
 {
 	zoneid_t rval = -1;
 	priv_set_t *privs;
@@ -4674,7 +4595,7 @@ vplat_create(zlog_t *zlogp, zone_mnt_t mount_cmd)
 	tsol_zcent_t *zcent = NULL;
 	int match = 0;
 	int doi = 0;
-	int flags;
+	int flags = -1;
 	zone_iptype_t iptype;
 
 	if (zone_get_rootpath(zone_name, rootpath, sizeof (rootpath)) != Z_OK) {
@@ -4696,6 +4617,8 @@ vplat_create(zlog_t *zlogp, zone_mnt_t mount_cmd)
 		flags = ZCF_NET_EXCL;
 		break;
 	}
+	if (flags == -1)
+		abort();
 
 	if ((privs = priv_allocset()) == NULL) {
 		zerror(zlogp, B_TRUE, "%s failed", "priv_allocset");
@@ -4799,7 +4722,7 @@ vplat_create(zlog_t *zlogp, zone_mnt_t mount_cmd)
 	xerr = 0;
 	if ((zoneid = zone_create(kzone, rootpath, privs, rctlbuf,
 	    rctlbufsz, zfsbuf, zfsbufsz, &xerr, match, doi, zlabel,
-	    flags)) == -1) {
+	    flags, zone_did)) == -1) {
 		if (xerr == ZE_AREMOUNTS) {
 			if (zonecfg_find_mounts(rootpath, NULL, NULL) < 1) {
 				zerror(zlogp, B_FALSE,
@@ -4845,7 +4768,7 @@ vplat_create(zlog_t *zlogp, zone_mnt_t mount_cmd)
 		struct brand_attr attr;
 		char modname[MAXPATHLEN];
 
-		if (setup_zone_attrs(zlogp, zone_name, zoneid) != Z_OK)
+		if (setup_zone_attrs(zlogp, zoneid) != Z_OK)
 			goto error;
 
 		if ((bh = brand_open(brand_name)) == NULL) {
@@ -4903,6 +4826,8 @@ error:
 	}
 	if (rctlbuf != NULL)
 		free(rctlbuf);
+	if (zfsbuf != NULL)
+		free(zfsbuf);
 	priv_freeset(privs);
 	if (fp != NULL)
 		zonecfg_close_scratch(fp);
@@ -4991,7 +4916,7 @@ write_index_file(zoneid_t zoneid)
 int
 vplat_bringup(zlog_t *zlogp, zone_mnt_t mount_cmd, zoneid_t zoneid)
 {
-	char zonepath[MAXPATHLEN];
+	char zpath[MAXPATHLEN];
 
 	if (mount_cmd == Z_MNT_BOOT && validate_datasets(zlogp) != 0) {
 		lofs_discard_mnttab();
@@ -5002,15 +4927,11 @@ vplat_bringup(zlog_t *zlogp, zone_mnt_t mount_cmd, zoneid_t zoneid)
 	 * Before we try to mount filesystems we need to create the
 	 * attribute backing store for /dev
 	 */
-	if (zone_get_zonepath(zone_name, zonepath, sizeof (zonepath)) != Z_OK) {
-		lofs_discard_mnttab();
-		return (-1);
-	}
-	resolve_lofs(zlogp, zonepath, sizeof (zonepath));
+	(void) strlcpy(zpath, zonepath, sizeof (zpath));
+	resolve_lofs(zlogp, zpath, sizeof (zpath));
 
 	/* Make /dev directory owned by root, grouped sys */
-	if (make_one_dir(zlogp, zonepath, "/dev", DEFAULT_DIR_MODE,
-	    0, 3) != 0) {
+	if (make_one_dir(zlogp, zpath, "/dev", DEFAULT_DIR_MODE, 0, 3) != 0) {
 		lofs_discard_mnttab();
 		return (-1);
 	}
@@ -5045,6 +4966,8 @@ vplat_bringup(zlog_t *zlogp, zone_mnt_t mount_cmd, zoneid_t zoneid)
 				return (-1);
 			}
 			break;
+		default:
+			abort();
 		}
 	}
 
@@ -5120,13 +5043,13 @@ unmounted:
 }
 
 int
-vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting)
+vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting,
+    boolean_t debug)
 {
 	char *kzone;
 	zoneid_t zoneid;
 	int res;
 	char pool_err[128];
-	char zpath[MAXPATHLEN];
 	char cmdbuf[MAXPATHLEN];
 	brand_handle_t bh = NULL;
 	dladm_status_t status;
@@ -5159,16 +5082,12 @@ vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting)
 		goto error;
 	}
 
-	if (remove_datalink_pool(zlogp, zoneid) != 0) {
+	if (remove_datalink_pool(zlogp, zoneid) != 0)
 		zerror(zlogp, B_FALSE, "unable clear datalink pool property");
-		goto error;
-	}
 
-	if (remove_datalink_protect(zlogp, zoneid) != 0) {
+	if (remove_datalink_protect(zlogp, zoneid) != 0)
 		zerror(zlogp, B_FALSE,
 		    "unable clear datalink protect property");
-		goto error;
-	}
 
 	/*
 	 * The datalinks assigned to the zone will be removed from the NGZ as
@@ -5182,12 +5101,6 @@ vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting)
 		goto error;
 	}
 
-	/* Get the zonepath of this zone */
-	if (zone_get_zonepath(zone_name, zpath, sizeof (zpath)) != Z_OK) {
-		zerror(zlogp, B_FALSE, "unable to determine zone path");
-		goto error;
-	}
-
 	/* Get a handle to the brand info for this zone */
 	if ((bh = brand_open(brand_name)) == NULL) {
 		zerror(zlogp, B_FALSE, "unable to determine zone brand");
@@ -5198,7 +5111,7 @@ vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting)
 	 * brand a chance to cleanup any custom configuration.
 	 */
 	(void) strcpy(cmdbuf, EXEC_PREFIX);
-	if (brand_get_halt(bh, zone_name, zpath, cmdbuf + EXEC_LEN,
+	if (brand_get_halt(bh, zone_name, zonepath, cmdbuf + EXEC_LEN,
 	    sizeof (cmdbuf) - EXEC_LEN) < 0) {
 		brand_close(bh);
 		zerror(zlogp, B_FALSE, "unable to determine branded zone's "
@@ -5208,7 +5121,7 @@ vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting)
 	brand_close(bh);
 
 	if ((strlen(cmdbuf) > EXEC_LEN) &&
-	    (do_subproc(zlogp, cmdbuf, NULL) != Z_OK)) {
+	    (do_subproc(zlogp, cmdbuf, NULL, debug) != Z_OK)) {
 		zerror(zlogp, B_FALSE, "%s failed", cmdbuf);
 		goto error;
 	}
@@ -5240,12 +5153,6 @@ vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting)
 			}
 			break;
 		case ZS_EXCLUSIVE:
-			if (unconfigure_exclusive_network_interfaces(zlogp,
-			    zoneid) != 0) {
-				zerror(zlogp, B_FALSE, "unable to unconfigure "
-				    "network interfaces in zone");
-				goto error;
-			}
 			status = dladm_zone_halt(dld_handle, zoneid);
 			if (status != DLADM_STATUS_OK) {
 				zerror(zlogp, B_FALSE, "unable to notify "
@@ -5282,14 +5189,9 @@ vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting)
 
 		if (rebooting) {
 			struct zone_psettab pset_tab;
-			zone_dochandle_t handle;
 
-			if ((handle = zonecfg_init_handle()) != NULL &&
-			    zonecfg_get_handle(zone_name, handle) == Z_OK &&
-			    zonecfg_lookup_pset(handle, &pset_tab) == Z_OK)
+			if (zonecfg_lookup_pset(snap_hndl, &pset_tab) == Z_OK)
 				destroy_tmp_pool = B_FALSE;
-
-			zonecfg_fini_handle(handle);
 		}
 
 		if (destroy_tmp_pool) {
diff --git a/usr/src/cmd/zoneadmd/zcons.c b/usr/src/cmd/zoneadmd/zcons.c
index 5f6fc4973c..af4fafe46a 100644
--- a/usr/src/cmd/zoneadmd/zcons.c
+++ b/usr/src/cmd/zoneadmd/zcons.c
@@ -22,7 +22,7 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
- * Copyright 2012 Joyent, Inc.  All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
  */
 
@@ -118,9 +118,10 @@
 
 #define	CONSOLE_SOCKPATH	ZONES_TMPDIR "/%s.console_sock"
 
+#define	ZCONS_RETRY		10
+
 static int	serverfd = -1;	/* console server unix domain socket fd */
 char boot_args[BOOTARGS_MAX];
-char bad_boot_arg[BOOTARGS_MAX];
 
 /*
  * The eventstream is a simple one-directional flow of messages from the
@@ -130,7 +131,10 @@ char bad_boot_arg[BOOTARGS_MAX];
  */
 static int eventstream[2];
 
-
+/* flag used to cope with race creating master zcons devlink */
+static boolean_t master_zcons_failed = B_FALSE;
+/* flag to track if we've seen a state change when there is no master zcons */
+static boolean_t state_changed = B_FALSE;
 
 int
 eventstream_init()
@@ -322,7 +326,7 @@ destroy_console_devs(zlog_t *zlogp)
  * interfaces to instantiate a new zone console node.  We do a lot of
  * sanity checking, and are careful to reuse a console if one exists.
  *
- * Once the device is in the device tree, we kick devfsadm via di_init_devs()
+ * Once the device is in the device tree, we kick devfsadm via di_devlink_init()
  * to ensure that the appropriate symlinks (to the master and slave console
  * devices) are placed in /dev in the global zone.
  */
@@ -408,43 +412,63 @@ devlinks:
 	 * Open the master side of the console and issue the ZC_HOLDSLAVE ioctl,
 	 * which will cause the master to retain a reference to the slave.
 	 * This prevents ttymon from blowing through the slave's STREAMS anchor.
+	 *
+	 * In very rare cases the open returns ENOENT if devfs doesn't have
+	 * everything setup yet due to heavy zone startup load. Wait for
+	 * 1 sec. and retry a few times. Even if we can't setup the zone's
+	 * console, we still go ahead and boot the zone.
 	 */
 	(void) snprintf(conspath, sizeof (conspath), "/dev/zcons/%s/%s",
 	    zone_name, ZCONS_MASTER_NAME);
-	if ((masterfd = open(conspath, O_RDWR | O_NOCTTY)) == -1) {
+	for (i = 0; i < ZCONS_RETRY; i++) {
+		masterfd = open(conspath, O_RDWR | O_NOCTTY);
+		if (masterfd >= 0 || errno != ENOENT)
+			break;
+		(void) sleep(1);
+	}
+	if (masterfd == -1) {
 		zerror(zlogp, B_TRUE, "ERROR: could not open master side of "
 		    "zone console for %s to acquire slave handle", zone_name);
-		goto error;
+		master_zcons_failed = B_TRUE;
 	}
+
 	(void) snprintf(conspath, sizeof (conspath), "/dev/zcons/%s/%s",
 	    zone_name, ZCONS_SLAVE_NAME);
-	if ((slavefd = open(conspath, O_RDWR | O_NOCTTY)) == -1) {
+	for (i = 0; i < ZCONS_RETRY; i++) {
+		slavefd = open(conspath, O_RDWR | O_NOCTTY);
+		if (slavefd >= 0 || errno != ENOENT)
+			break;
+		(void) sleep(1);
+	}
+	if (slavefd == -1)
 		zerror(zlogp, B_TRUE, "ERROR: could not open slave side of zone"
 		    " console for %s to acquire slave handle", zone_name);
-		(void) close(masterfd);
-		goto error;
-	}
+
 	/*
 	 * This ioctl can occasionally return ENXIO if devfs doesn't have
 	 * everything plumbed up yet due to heavy zone startup load. Wait for
 	 * 1 sec. and retry a few times before we fail to boot the zone.
 	 */
-	for (i = 0; i < 5; i++) {
-		if (ioctl(masterfd, ZC_HOLDSLAVE, (caddr_t)(intptr_t)slavefd)
-		    == 0) {
-			rv = 0;
-			break;
-		} else if (errno != ENXIO) {
-			break;
+	if (masterfd != -1 && slavefd != -1) {
+		for (i = 0; i < ZCONS_RETRY; i++) {
+			if (ioctl(masterfd, ZC_HOLDSLAVE,
+			    (caddr_t)(intptr_t)slavefd) == 0) {
+				rv = 0;
+				break;
+			} else if (errno != ENXIO) {
+				break;
+			}
+			(void) sleep(1);
 		}
-		(void) sleep(1);
+		if (rv != 0)
+			zerror(zlogp, B_TRUE, "ERROR: error while acquiring "
+			    "slave handle of zone console for %s", zone_name);
 	}
-	if (rv != 0)
-		zerror(zlogp, B_TRUE, "ERROR: error while acquiring slave "
-		    "handle of zone console for %s", zone_name);
 
-	(void) close(slavefd);
-	(void) close(masterfd);
+	if (slavefd != -1)
+		(void) close(slavefd);
+	if (masterfd != -1)
+		(void) close(masterfd);
 
 error:
 	if (ddef_hdl)
@@ -517,6 +541,7 @@ get_client_ident(int clifd, pid_t *pid, char *locale, size_t locale_len,
 	size_t buflen = sizeof (buf);
 	char c = '\0';
 	int i = 0, r;
+	ucred_t *cred = NULL;
 
 	/* "eat up the ident string" case, for simplicity */
 	if (pid == NULL) {
@@ -550,18 +575,22 @@ get_client_ident(int clifd, pid_t *pid, char *locale, size_t locale_len,
 				break;
 	}
 
+	if (getpeerucred(clifd, &cred) == 0) {
+		*pid = ucred_getpid((const ucred_t *)cred);
+		ucred_free(cred);
+	} else {
+		return (-1);
+	}
+
 	/*
 	 * Parse buffer for message of the form:
-	 * IDENT <pid> <locale> <disconnect flag>
+	 * IDENT <locale> <disconnect flag>
 	 */
 	bufp = buf;
 	if (strncmp(bufp, "IDENT ", 6) != 0)
 		return (-1);
 	bufp += 6;
 	errno = 0;
-	*pid = strtoll(bufp, &bufp, 10);
-	if (errno != 0)
-		return (-1);
 
 	while (*bufp != '\0' && isspace(*bufp))
 		bufp++;
@@ -667,14 +696,6 @@ event_message(int clifd, char *clilocale, zone_evt_t evt, int dflag)
 		else
 			str = "NOTICE: Zone boot failed";
 		break;
-	case Z_EVT_ZONE_BADARGS:
-		/*LINTED*/
-		(void) snprintf(lmsg, sizeof (lmsg),
-		    localize_msg(clilocale,
-		    "WARNING: Ignoring invalid boot arguments: %s"),
-		    bad_boot_arg);
-		lstr = lmsg;
-		break;
 	default:
 		return;
 	}
@@ -878,7 +899,6 @@ init_console(zlog_t *zlogp)
 	if (init_console_dev(zlogp) == -1) {
 		zerror(zlogp, B_FALSE,
 		    "console setup: device initialization failed");
-		return (-1);
 	}
 
 	if ((serverfd = init_console_sock(zlogp)) == -1) {
@@ -890,6 +910,17 @@ init_console(zlog_t *zlogp)
 }
 
 /*
+ * Maintain a simple flag that tracks if we have seen at least one state
+ * change. This is currently only used to handle the special case where we are
+ * running without a console device, which is what normally drives shutdown.
+ */
+void
+zcons_statechanged()
+{
+	state_changed = B_TRUE;
+}
+
+/*
  * serve_console() is the master loop for driving console I/O.  It is also the
  * routine which is ultimately responsible for "pulling the plug" on zoneadmd
  * when it realizes that the daemon should shut down.
@@ -907,6 +938,7 @@ serve_console(zlog_t *zlogp)
 	int masterfd;
 	zone_state_t zstate;
 	char conspath[MAXPATHLEN];
+	static boolean_t cons_warned = B_FALSE;
 
 	(void) snprintf(conspath, sizeof (conspath),
 	    "/dev/zcons/%s/%s", zone_name, ZCONS_MASTER_NAME);
@@ -914,6 +946,46 @@ serve_console(zlog_t *zlogp)
 	for (;;) {
 		masterfd = open(conspath, O_RDWR|O_NONBLOCK|O_NOCTTY);
 		if (masterfd == -1) {
+			if (master_zcons_failed) {
+				/*
+				 * If we don't have a console and the zone is
+				 * not shutting down, there may have been a
+				 * race/failure with devfs while creating the
+				 * console. In this case we want to leave the
+				 * zone up, even without a console, so
+				 * periodically recheck.
+				 */
+				int i;
+
+				/*
+				 * In the normal flow of this loop, we use
+				 * do_console_io to give things a chance to get
+				 * going first. However, in this case we can't
+				 * use that, so we have to wait for at least
+				 * one state change before checking the state.
+				 */
+				for (i = 0; i < 60; i++) {
+					if (state_changed)
+						break;
+					(void) sleep(1);
+				}
+
+				if (i < 60 && zone_get_state(zone_name,
+				    &zstate) == Z_OK &&
+				    (zstate == ZONE_STATE_READY ||
+				    zstate == ZONE_STATE_RUNNING)) {
+					if (!cons_warned) {
+						zerror(zlogp, B_FALSE,
+						    "WARNING: missing zone "
+						    "console for %s",
+						    zone_name);
+						cons_warned = B_TRUE;
+					}
+					(void) sleep(ZCONS_RETRY);
+					continue;
+				}
+			}
+
 			zerror(zlogp, B_TRUE, "failed to open console master");
 			(void) mutex_lock(&lock);
 			goto death;
diff --git a/usr/src/cmd/zoneadmd/zfd.c b/usr/src/cmd/zoneadmd/zfd.c
new file mode 100644
index 0000000000..00278cd583
--- /dev/null
+++ b/usr/src/cmd/zoneadmd/zfd.c
@@ -0,0 +1,1428 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.  All rights reserved.
+ */
+
+/*
+ * Zone file descriptor support is used as a mechanism for a process inside the
+ * zone to log messages to the GZ zoneadmd and also as a way to interact
+ * directly with the process (via zlogin -I). The zfd thread is modeled on
+ * the zcons thread; see the comment header in zcons.c for a general overview.
+ * Unlike with zcons, which has a single endpoint within the zone and a single
+ * endpoint used by zoneadmd, we set up multiple endpoints within the zone.
+ *
+ * The mode, which is controlled by the zone attribute "zlog-mode" is somewhat
+ * of a misnomer since its purpose has evolved. The attribute currently
+ * can have six values which are used to control:
+ *    - how the zfd devices are used inside the zone
+ *    - if the output on the device(s) is also teed into another stream within
+ *      the zone
+ *    - if we do logging in the GZ
+ * See the comment on get_mode_logmax() in this file, and the comment in
+ * uts/common/io/zfd.c for more details.
+ *
+ * Internally the zfd_mode_t struct holds the number of stdio devs (1 or 3),
+ * the number of additional devs corresponding to the zone attr value and the
+ * GZ logging flag.
+ *
+ * Note that although the mode indicates the number of devices needed, we always
+ * create all possible zfd devices for simplicity.
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/termios.h>
+#include <sys/zfd.h>
+#include <sys/mkdev.h>
+
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <stropts.h>
+#include <thread.h>
+#include <ucred.h>
+#include <unistd.h>
+#include <zone.h>
+#include <signal.h>
+#include <wchar.h>
+
+#include <libdevinfo.h>
+#include <libdevice.h>
+#include <libzonecfg.h>
+
+#include <syslog.h>
+#include <sys/modctl.h>
+
+#include "zoneadmd.h"
+
+static zlog_t	*zlogp;	/* NOTE(review): shadowed by the zlogp params below */
+static int	shutting_down = 0;	/* nonzero stops make_tty() retries */
+static thread_t logger_tid;
+static int	logfd = -1;
+static size_t	log_sz = 0;
+static size_t	log_rot_sz = 0;
+
+static void rotate_log();
+
+/*
+ * The eventstream is a simple one-directional flow of messages implemented
+ * with a pipe. It is used to wake up the poller when it needs to shut down.
+ */
+static int eventstream[2] = {-1, -1};
+
+#define	LOGNAME			"stdio.log"
+#define	ZLOG_MODE		"zlog-mode"	/* zone attr: zfd mode */
+#define	LOG_MAXSZ		"zlog-max-size"
+#define	ZFDNEX_DEVTREEPATH	"/pseudo/zfdnex@2"
+#define	ZFDNEX_FILEPATH		"/devices/pseudo/zfdnex@2"
+#define	SERVER_SOCKPATH		ZONES_TMPDIR "/%s.server_%s"
+#define	ZTTY_RETRY		5	/* max tries per make_tty() loop */
+
+#define	NUM_ZFD_DEVS		5	/* zfd nodes created per zone */
+
+typedef struct zfd_mode {
+	uint_t		zmode_n_stddevs;	/* # of stdio devs (1 or 3) */
+	uint_t		zmode_n_addl_devs;	/* # of additional devs */
+	boolean_t	zmode_gzlogging;	/* GZ logging flag */
+} zfd_mode_t;
+static zfd_mode_t mode;	/* NOTE(review): init_zfd_devs() param shadows it */
+
+/*
+ * cb_data is the walk argument destroy_zfd_devs() hands to destroy_cb().
+ */
+struct cb_data {
+	zlog_t *zlogp;	/* log handle for destroy_cb() warnings */
+	int killed;	/* count of zfd nodes successfully removed */
+};
+
+/*
+ * destroy_zfd_devs() and its di_walk_node() callback destroy_cb() tear down
+ * every zfd instance whose "zfd_zname" property names this zone; if things
+ * went very wrong there may be an unexpected number of them.  An instance
+ * whose fd is still open will not detach, which is a potential issue.
+ * destroy_cb() counts removals in the cb_data passed as `arg', logs each
+ * failure (including a NULL di_devfs_path()) and always continues the walk.
+ */
+static int
+destroy_cb(di_node_t node, void *arg)
+{
+	struct cb_data *cb = (struct cb_data *)arg;
+	char *prop_data, *tmp;
+	char devpath[MAXPATHLEN];
+	devctl_hdl_t hdl;
+
+	if (di_prop_lookup_strings(DDI_DEV_T_ANY, node, "zfd_zname",
+	    &prop_data) == -1)
+		return (DI_WALK_CONTINUE);
+
+	assert(prop_data != NULL);
+	if (strcmp(prop_data, zone_name) != 0)	/* another zone's zfd */
+		return (DI_WALK_CONTINUE);
+
+	if ((tmp = di_devfs_path(node)) == NULL) {
+		zerror(cb->zlogp, B_TRUE, "WARNING: zfd path lookup failed");
+		return (DI_WALK_CONTINUE);
+	}
+	(void) snprintf(devpath, sizeof (devpath), "/devices/%s", tmp);
+	di_devfs_path_free(tmp);
+
+	if ((hdl = devctl_device_acquire(devpath, 0)) == NULL) {
+		zerror(cb->zlogp, B_TRUE, "WARNING: zfd %s found, "
+		    "but it could not be controlled.", devpath);
+		return (DI_WALK_CONTINUE);
+	}
+	if (devctl_device_remove(hdl) == 0)
+		cb->killed++;
+	else
+		zerror(cb->zlogp, B_TRUE, "WARNING: zfd %s found, "
+		    "but it could not be removed.", devpath);
+	devctl_release(hdl);
+	return (DI_WALK_CONTINUE);
+}
+
+/* Remove this zone's zfd nodes; returns -1 only if di_init() fails, else 0. */
+static int
+destroy_zfd_devs(zlog_t *zlogp)
+{
+	struct cb_data walk_arg;
+	di_node_t tree;
+
+	bzero(&walk_arg, sizeof (walk_arg));
+	walk_arg.zlogp = zlogp;
+	tree = di_init(ZFDNEX_DEVTREEPATH, DINFOCPYALL);
+	if (tree == DI_NODE_NIL) {
+		zerror(zlogp, B_TRUE, "di_init failed");
+		return (-1);
+	}
+	/* Per-node failures are logged by destroy_cb(), not returned. */
+	(void) di_walk_node(tree, DI_WALK_CLDFIRST, &walk_arg, destroy_cb);
+	di_fini(tree);
+	return (0);
+}
+
+/*
+ * Best-effort switch of this zone's zfd `id' into tty mode: the master side
+ * is opened and the ZFD_MAKETTY ioctl issued, which will cause the various
+ * tty-related streams modules to be pushed when the slave opens the device.
+ * Failures are logged but never returned; even if we can't setup tty mode we
+ * still move on.
+ */
+static void
+make_tty(zlog_t *zlogp, int id)
+{
+	int i, fd = -1, rv = -1;
+	char stdpath[MAXPATHLEN];
+
+	(void) snprintf(stdpath, sizeof (stdpath), "/dev/zfd/%s/master/%d",
+	    zone_name, id);
+
+	/*
+	 * In very rare cases the open returns ENOENT if devfs doesn't have
+	 * everything setup yet due to heavy zone startup load. Wait for
+	 * 1 sec. and retry a few times.
+	 */
+	for (i = 0; !shutting_down && i < ZTTY_RETRY; i++) {
+		fd = open(stdpath, O_RDWR | O_NOCTTY);
+		if (fd >= 0 || errno != ENOENT)
+			break;
+		(void) sleep(1);
+	}
+	if (fd == -1) {
+		zerror(zlogp, B_TRUE, "ERROR: could not open zfd %d for "
+		    "zone %s to set tty mode", id, zone_name);
+		return;
+	}
+
+	/*
+	 * This ioctl can occasionally return ENXIO if devfs doesn't have
+	 * everything plumbed up yet due to heavy zone startup load. Wait for
+	 * 1 sec. and retry a few times before we give up.
+	 */
+	for (i = 0; !shutting_down && i < ZTTY_RETRY; i++) {
+		if ((rv = ioctl(fd, ZFD_MAKETTY)) == 0 || errno != ENXIO)
+			break;
+		(void) sleep(1);
+	}
+	/* Report a failed ioctl, except when we only gave up to shut down. */
+	if (rv != 0 && !shutting_down)
+		zerror(zlogp, B_TRUE, "ERROR: could not set tty mode on zfd "
+		    "%d for zone %s", id, zone_name);
+	(void) close(fd);
+}
+
+/*
+ * init_zfd_dev() creates the single zfd node `id' for this zone on the bus
+ * `bus_hdl', using the libdevice (devctl) interfaces.  It is the per-device
+ * helper of init_zfd_devs(), which drives the device-tree configuration of
+ * all of the zone fd devices and then kicks devfsadm via di_devlink_init()
+ * to ensure that the appropriate symlinks (to the master and slave fd
+ * devices) are placed in /dev in the global zone.
+ *
+ *	zlogp	log handle handed to zerror()
+ *	bus_hdl	devctl handle of the bus the node is created on
+ *	id	value stored in the node's "zfd_id" property
+ *
+ * Returns 0 on success, or -1 after logging the step that failed.
+ *
+ * NOTE(review): the comment this replaces said we do a lot of sanity checking
+ * and are careful to reuse a dev if one exists; neither is visible in this
+ * function - confirm against the caller.
+ */
+static int
+init_zfd_dev(zlog_t *zlogp, devctl_hdl_t bus_hdl, int id)
+{
+	devctl_ddef_t ddef;
+	devctl_hdl_t node = NULL;
+	int rv = -1;
+
+	if ((ddef = devctl_ddef_alloc("zfd", 0)) == NULL) {
+		zerror(zlogp, B_TRUE, "failed to allocate ddef handle");
+		return (-1);
+	}
+
+	/*
+	 * Set four properties on this node; the name of the zone, the id of
+	 * the dev (zfd_id), a flag which lets pseudo know that it is OK to
+	 * automatically allocate an instance # for this device, and the last
+	 * one tells the device framework not to auto-detach this node - we
+	 * need the node to still be there when we ask devfsadmd to make links,
+	 * and when we need to open it.  The first failure ends the chain.
+	 */
+	if (devctl_ddef_string(ddef, "zfd_zname", zone_name) == -1) {
+		zerror(zlogp, B_TRUE, "failed to create zfd_zname property");
+	} else if (devctl_ddef_int(ddef, "zfd_id", id) == -1) {
+		zerror(zlogp, B_TRUE, "failed to create zfd_id property");
+	} else if (devctl_ddef_int(ddef, "auto-assign-instance", 1) == -1) {
+		zerror(zlogp, B_TRUE, "failed to create auto-assign-instance "
+		    "property");
+	} else if (devctl_ddef_int(ddef, "ddi-no-autodetach", 1) == -1) {
+		/* NOTE(review): message and property spellings differ. */
+		zerror(zlogp, B_TRUE, "failed to create ddi-no-auto-detach "
+		    "property");
+	} else if (devctl_bus_dev_create(bus_hdl, ddef, 0, &node) == -1) {
+		zerror(zlogp, B_TRUE, "failed to create zfd node");
+	} else {
+		rv = 0;
+	}
+
+	/* Common exit: free the ddef, and the node handle if we got one. */
+	devctl_ddef_free(ddef);
+	if (node != NULL)
+		devctl_release(node);
+	return (rv);
+}
+
+/*
+ * Create all NUM_ZFD_DEVS zfd nodes for this zone, have devfsadm make their
+ * /dev links and, if `mode' asks for a single stdio dev, put zfd 0 into tty
+ * mode.  Returns 0 on success and -1 as soon as any creation step fails.
+ */
+static int
+init_zfd_devs(zlog_t *zlogp, zfd_mode_t *mode)
+{
+	devctl_hdl_t nexus;
+	di_devlink_handle_t links;
+	int id = 0;
+
+	if ((nexus = devctl_bus_acquire(ZFDNEX_FILEPATH, 0)) == NULL) {
+		zerror(zlogp, B_TRUE, "devctl_bus_acquire failed");
+		return (-1);
+	}
+
+	/* Time to make the devices; stop at the first one that fails. */
+	while (id < NUM_ZFD_DEVS && init_zfd_dev(zlogp, nexus, id) == 0)
+		id++;
+	if (id < NUM_ZFD_DEVS) {
+		devctl_release(nexus);
+		return (-1);
+	}
+
+	if ((links = di_devlink_init("zfd", DI_MAKE_LINK)) == NULL) {
+		zerror(zlogp, B_TRUE, "failed to create devlinks");
+		devctl_release(nexus);
+		return (-1);
+	}
+	(void) di_devlink_fini(&links);
+
+	/* We want the primary stream to look like a tty. */
+	if (mode->zmode_n_stddevs == 1)
+		make_tty(zlogp, 0);
+
+	devctl_release(nexus);
+	return (0);
+}
+
+/*
+ * Create the listening AF_UNIX stream socket named by SERVER_SOCKPATH for
+ * this zone and `nm', replacing any stale socket file.  On success the new
+ * descriptor is stored in *servfd and 0 is returned; on failure the socket
+ * file is unlinked, *servfd is left untouched and -1 is returned.
+ */
+static int
+init_server_sock(zlog_t *zlogp, int *servfd, char *nm)
+{
+	struct sockaddr_un sa;
+	int s;
+
+	bzero(&sa, sizeof (sa));
+	sa.sun_family = AF_UNIX;
+	(void) snprintf(sa.sun_path, sizeof (sa.sun_path), SERVER_SOCKPATH,
+	    zone_name, nm);
+
+	if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) {
+		zerror(zlogp, B_TRUE, "server setup: could not create socket");
+		(void) unlink(sa.sun_path);
+		return (-1);
+	}
+	(void) unlink(sa.sun_path);
+	if (bind(s, (struct sockaddr *)&sa, sizeof (sa)) == -1) {
+		zerror(zlogp, B_TRUE,
+		    "server setup: could not bind to socket");
+	} else if (listen(s, 4) == -1) {
+		zerror(zlogp, B_TRUE,
+		    "server setup: could not listen on socket");
+	} else {
+		*servfd = s;
+		return (0);
+	}
+
+	/* bind() or listen() failed: undo the partial setup. */
+	(void) unlink(sa.sun_path);
+	(void) close(s);
+	return (-1);
+}
+
+/*
+ * Tear down a server socket: remove its filesystem name (SERVER_SOCKPATH
+ * formatted with the zone name and 'nm'), then shut down and close servfd.
+ */
+static void
+destroy_server_sock(int servfd, char *nm)
+{
+	char sockpath[MAXPATHLEN];
+
+	(void) snprintf(sockpath, sizeof (sockpath), SERVER_SOCKPATH,
+	    zone_name, nm);
+	(void) unlink(sockpath);
+
+	(void) shutdown(servfd, SHUT_RDWR);
+	(void) close(servfd);
+}
+
+/*
+ * Read the "ident" line from the client's descriptor.  The line has the form:
+ *
+ *	IDENT <locale> <flags>\n
+ *
+ * On success the peer's pid (taken from its credentials, not from the
+ * message) is stored in *pid, the locale is copied into 'locale' and the
+ * numeric flags into *flagsp, and 0 is returned.  If the trailing flags field
+ * is missing, *flagsp is set to 0.  Returns -1 on a read error, a malformed
+ * line, or if the peer credentials cannot be obtained.
+ *
+ * This routine also tolerates being called with pid=NULL, for times when you
+ * want to "eat" the ident string from a client without saving it.  In that
+ * case locale must be NULL, locale_len must be 0, and flagsp is not used; 0 is
+ * returned once a newline has been consumed and -1 if the stream ends first.
+ */
+static int
+get_client_ident(int clifd, pid_t *pid, char *locale, size_t locale_len,
+    uint_t *flagsp)
+{
+	char buf[BUFSIZ], *bufp, *sep;
+	size_t buflen = sizeof (buf);
+	char c = '\0';
+	int i = 0, r = 0;
+	ucred_t *cred = NULL;
+
+	/* "eat up the ident string" case, for simplicity */
+	if (pid == NULL) {
+		assert(locale == NULL && locale_len == 0);
+		while (read(clifd, &c, 1) == 1) {
+			if (c == '\n')
+				return (0);
+		}
+		/*
+		 * No complete line was available.  We must not fall through:
+		 * the code below stores through pid and locale, which are
+		 * NULL in this mode.
+		 */
+		return (-1);
+	}
+
+	bzero(buf, sizeof (buf));
+	while ((buflen > 1) && (r = read(clifd, &c, 1)) == 1) {
+		buflen--;
+		if (c == '\n')
+			break;
+
+		buf[i] = c;
+		i++;
+	}
+	if (r == -1)
+		return (-1);
+
+	/*
+	 * We've filled the buffer, but still haven't seen \n.  Keep eating
+	 * until we find it; we don't expect this to happen, but this is
+	 * defensive.
+	 */
+	if (c != '\n') {
+		while ((r = read(clifd, &c, sizeof (c))) > 0)
+			if (c == '\n')
+				break;
+	}
+
+	/*
+	 * Parse buffer for message of the form:
+	 * IDENT <locale> <flags>
+	 */
+	if (strncmp(buf, "IDENT ", 6) != 0)
+		return (-1);
+	bufp = buf + 6;
+
+	if (getpeerucred(clifd, &cred) != 0)
+		return (-1);
+	*pid = ucred_getpid((const ucred_t *)cred);
+	ucred_free(cred);
+
+	/* isspace() requires a value representable as unsigned char */
+	while (*bufp != '\0' && isspace((unsigned char)*bufp))
+		bufp++;
+
+	/*
+	 * Split at the last space rather than assuming the flags are exactly
+	 * one character wide.  This also copes with an empty or very short
+	 * payload, for which the fixed-offset arithmetic would index outside
+	 * of the buffer.
+	 */
+	sep = strrchr(bufp, ' ');
+	if (sep != NULL) {
+		*sep = '\0';
+		*flagsp = atoi(sep + 1);
+	} else {
+		*flagsp = 0;
+	}
+	(void) strlcpy(locale, bufp, locale_len);
+
+	return (0);
+}
+
+/*
+ * Accept a pending connection on servfd.
+ *
+ * When pid is non-NULL the client's ident line is read (see
+ * get_client_ident()) and acknowledged with "OK\n"; if the ident cannot be
+ * obtained the connection is closed and -1 is returned.  The accepted socket
+ * is made non-blocking and close-on-exec.
+ *
+ * Returns the connected descriptor, or -1 on failure.
+ */
+static int
+accept_client(int servfd, pid_t *pid, char *locale, size_t locale_len,
+    uint_t *flagsp)
+{
+	int connfd;
+	struct sockaddr_un cliaddr;
+	socklen_t clilen;
+	int flags;
+
+	clilen = sizeof (cliaddr);
+	connfd = accept(servfd, (struct sockaddr *)&cliaddr, &clilen);
+	if (connfd == -1)
+		return (-1);
+	if (pid != NULL) {
+		if (get_client_ident(connfd, pid, locale, locale_len, flagsp)
+		    == -1) {
+			(void) shutdown(connfd, SHUT_RDWR);
+			(void) close(connfd);
+			return (-1);
+		}
+		(void) write(connfd, "OK\n", 3);
+	}
+
+	/*
+	 * O_NONBLOCK is a file status flag (F_SETFL) whereas FD_CLOEXEC is a
+	 * descriptor flag (F_SETFD); passing FD_CLOEXEC to F_SETFL does not
+	 * set close-on-exec, so the two must be applied separately.
+	 */
+	flags = fcntl(connfd, F_GETFL, 0);
+	if (flags != -1)
+		(void) fcntl(connfd, F_SETFL, flags | O_NONBLOCK);
+	flags = fcntl(connfd, F_GETFD, 0);
+	if (flags != -1)
+		(void) fcntl(connfd, F_SETFD, flags | FD_CLOEXEC);
+
+	return (connfd);
+}
+
+/*
+ * Accept a pending connection on servfd only to turn it away: consume the
+ * client's ident line, reply with the pid of the client that currently holds
+ * the connection (clientpid) followed by a newline, and close the socket.
+ */
+static void
+reject_client(int servfd, pid_t clientpid)
+{
+	int connfd;
+	struct sockaddr_un cliaddr;
+	socklen_t clilen;
+	char nak[MAXPATHLEN];
+
+	clilen = sizeof (cliaddr);
+	connfd = accept(servfd, (struct sockaddr *)&cliaddr, &clilen);
+	if (connfd == -1)
+		return;		/* nothing to reject, nothing to close */
+
+	/*
+	 * After getting its ident string, tell client to get lost.
+	 */
+	if (get_client_ident(connfd, NULL, NULL, 0, NULL) == 0) {
+		/* pid_t need not be a long; cast to match the %lu format */
+		(void) snprintf(nak, sizeof (nak), "%lu\n",
+		    (unsigned long)clientpid);
+		(void) write(connfd, nak, strlen(nak));
+	}
+	(void) shutdown(connfd, SHUT_RDWR);
+	(void) close(connfd);
+}
+
+/*
+ * Accept a pending connection on servfd, but only if the connecting process
+ * (as reported by its peer credentials) is verpid.  The accepted socket is
+ * made non-blocking and close-on-exec.
+ *
+ * Returns the connected descriptor, or -1 if accept fails, the credentials
+ * cannot be read, or the peer is not verpid (the connection is closed in the
+ * latter two cases).
+ */
+static int
+accept_socket(int servfd, pid_t verpid)
+{
+	int connfd;
+	struct sockaddr_un cliaddr;
+	socklen_t clilen = sizeof (cliaddr);
+	ucred_t *cred = NULL;
+	pid_t rpid = -1;
+	int flags;
+
+	connfd = accept(servfd, (struct sockaddr *)&cliaddr, &clilen);
+	if (connfd == -1)
+		return (-1);
+
+	/* Confirm connecting process is who we expect */
+	if (getpeerucred(connfd, &cred) == 0) {
+		rpid = ucred_getpid((const ucred_t *)cred);
+		ucred_free(cred);
+	}
+	if (rpid == -1 || rpid != verpid) {
+		(void) shutdown(connfd, SHUT_RDWR);
+		(void) close(connfd);
+		return (-1);
+	}
+
+	/*
+	 * O_NONBLOCK is a file status flag (F_SETFL) whereas FD_CLOEXEC is a
+	 * descriptor flag (F_SETFD); passing FD_CLOEXEC to F_SETFL does not
+	 * set close-on-exec, so the two must be applied separately.
+	 */
+	flags = fcntl(connfd, F_GETFL, 0);
+	if (flags != -1)
+		(void) fcntl(connfd, F_SETFL, flags | O_NONBLOCK);
+	flags = fcntl(connfd, F_GETFD, 0);
+	if (flags != -1)
+		(void) fcntl(connfd, F_SETFD, flags | FD_CLOEXEC);
+
+	return (connfd);
+}
+
+/*
+ * Read one control command (terminated by a newline or NUL) from sockfd and
+ * act on it.  Two commands are understood:
+ *
+ *	TIOCSWINSZ <rows> <cols>	set the window size on stdoutfd
+ *	SETFLAGS <value>		store <value> in *flags
+ *
+ * "OK\n" is written back to sockfd on success; "FAIL\n" is written for an
+ * empty, unrecognized or malformed command, or if the ioctl fails.
+ */
+static void
+ctlcmd_process(int sockfd, int stdoutfd, unsigned int *flags)
+{
+	char buf[BUFSIZ];
+	int i;
+
+	for (i = 0; i < BUFSIZ - 1; i++) {
+		char c;
+
+		if (read(sockfd, &c, 1) != 1 ||
+		    c == '\n' || c == '\0') {
+			break;
+		}
+		buf[i] = c;
+	}
+	if (i == 0) {
+		goto fail;
+	}
+	/*
+	 * Terminate directly after the last byte stored.  i is at most
+	 * BUFSIZ - 1 here, so this stays inside buf.
+	 */
+	buf[i] = '\0';
+
+	if (strncmp(buf, "TIOCSWINSZ ", 11) == 0) {
+		char *next = buf + 11;
+		char *end;
+		long rows, cols;
+		struct winsize ws;
+
+		/*
+		 * strtol() reports "no digits" by leaving the end pointer at
+		 * the start of the input, so check that rather than relying
+		 * on errno being set to EINVAL.
+		 */
+		errno = 0;
+		rows = strtol(next, &end, 10);
+		if (errno != 0 || end == next || *end == '\0' ||
+		    rows < 0 || rows > 0xffff) {
+			goto fail;
+		}
+
+		/* step over the single separator between the two numbers */
+		next = end + 1;
+		errno = 0;
+		cols = strtol(next, &end, 10);
+		if (errno != 0 || end == next ||
+		    cols < 0 || cols > 0xffff) {
+			goto fail;
+		}
+
+		/* don't hand uninitialized pixel sizes to the driver */
+		bzero(&ws, sizeof (ws));
+		ws.ws_row = rows;
+		ws.ws_col = cols;
+		if (ioctl(stdoutfd, TIOCSWINSZ, &ws) == 0) {
+			(void) write(sockfd, "OK\n", 3);
+			return;
+		}
+		goto fail;
+	}
+	if (strncmp(buf, "SETFLAGS ", 9) == 0) {
+		char *next = buf + 9;
+		char *end;
+		unsigned int result;
+
+		errno = 0;
+		result = strtoul(next, &end, 10);
+		if (errno != 0 || end == next) {
+			goto fail;
+		}
+		*flags = result;
+		(void) write(sockfd, "OK\n", 3);
+		return;
+	}
+fail:
+	(void) write(sockfd, "FAIL\n", 5);
+}
+
+/*
+ * Probe whether the peer on clifd is still there by attempting a harmless
+ * zero-length write.  A failure with EPIPE means the client is gone and -1 is
+ * returned; any other outcome is reported as 0 (still alive).
+ */
+static int
+test_client(int clifd)
+{
+	ssize_t rc = write(clifd, "", 0);
+
+	return ((rc == -1 && errno == EPIPE) ? -1 : 0);
+}
+
+/*
+ * Copy sbuf into dbuf, applying json string escapes.
+ *
+ * sbuf is NUL-terminated in place at sbuf[slen], so the caller's buffer must
+ * have room for slen + 1 bytes.  The input is decoded as multibyte characters
+ * in the current locale and conversion stops at the first NUL.
+ *
+ *  - \ " \b \f \n \r \t become their two-character json escapes
+ *  - other characters below 0x20, and those in 0x80-0xffff, become \uXXXX
+ *  - 0x20-0x7f are copied through unchanged
+ *  - characters above 0xffff are dropped
+ *  - a byte that does not decode is emitted as \ufffd and skipped
+ *
+ * dbuf is always NUL-terminated when dlen > 0.  Since the escaped form can be
+ * up to six times larger than the source, output that does not fit in dlen
+ * bytes is truncated at a whole-escape boundary; nothing is ever written
+ * beyond dbuf[dlen - 1].
+ */
+static void
+escape_json(char *sbuf, int slen, char *dbuf, int dlen)
+{
+	int i = 0;
+	mbstate_t mbr;
+	wchar_t c;
+	size_t sz;
+
+	if (dlen <= 0)
+		return;
+
+	bzero(&mbr, sizeof (mbr));
+	sbuf[slen] = '\0';
+
+	while ((sz = mbrtowc(&c, sbuf, MB_CUR_MAX, &mbr)) != 0) {
+		const char *pair = NULL;
+		char esc[8];
+		int elen = 0;
+
+		if (sz == (size_t)-1 || sz == (size_t)-2) {
+			/*
+			 * Invalid or incomplete sequence.  The conversion
+			 * state is undefined after an error, so reset it,
+			 * substitute the replacement character and step over
+			 * a single byte.
+			 */
+			bzero(&mbr, sizeof (mbr));
+			c = 0xfffd;
+			sz = 1;
+		}
+
+		switch (c) {
+		case '\\':
+			pair = "\\\\";
+			break;
+		case '"':
+			pair = "\\\"";
+			break;
+		case '\b':
+			pair = "\\b";
+			break;
+		case '\f':
+			pair = "\\f";
+			break;
+		case '\n':
+			pair = "\\n";
+			break;
+		case '\r':
+			pair = "\\r";
+			break;
+		case '\t':
+			pair = "\\t";
+			break;
+		default:
+			break;
+		}
+
+		if (pair != NULL) {
+			esc[0] = pair[0];
+			esc[1] = pair[1];
+			elen = 2;
+		} else if ((c >= 0x00 && c <= 0x1f) ||
+		    (c > 0x7f && c <= 0xffff)) {
+			elen = snprintf(esc, sizeof (esc), "\\u%04x",
+			    (int)(0xffff & c));
+		} else if (c >= 0x20 && c <= 0x7f) {
+			esc[0] = 0xff & c;
+			elen = 1;
+		}
+
+		if (elen > 0) {
+			/* need room for the escape plus the terminator */
+			if (i + elen >= dlen)
+				break;
+			bcopy(esc, &dbuf[i], elen);
+			i += elen;
+		}
+		sbuf += sz;
+	}
+
+	dbuf[i] = '\0';
+}
+
+/*
+ * We output to the log file as json.
+ * ex. for string 'msg\n' on the zone's stdout:
+ *    {"log":"msg\n","stream":"stdout","time":"2014-10-24T20:12:11.101973117Z"}
+ *
+ * We use ns in the last field of the timestamp for compatibility.  The
+ * fraction is always printed as nine zero-padded digits so that it reads
+ * correctly as a decimal fraction of a second.
+ *
+ * buf/len is the raw data (buf must have room for len + 1 bytes, since it is
+ * NUL-terminated in place while being escaped); 'from' is 1 for stdout and
+ * anything else for stderr.  Nothing is written if no log file is open.
+ *
+ * We keep track of the size of the log file and rotate it when we exceed
+ * the log size limit (if one is set).
+ */
+static void
+wr_log_msg(char *buf, int len, int from)
+{
+	struct timeval tv;
+	struct tm gtm;
+	int olen;
+	char ts[64];
+	/* every input byte can expand to at most a six-byte \uXXXX escape */
+	char nbuf[BUFSIZ * 6 + 1];
+	/* escaped text plus the fixed json framing and timestamp */
+	char obuf[sizeof (nbuf) + 256];
+	static boolean_t log_wr_err = B_FALSE;
+
+	if (logfd == -1)
+		return;
+
+	escape_json(buf, len, nbuf, sizeof (nbuf));
+
+	if (gettimeofday(&tv, NULL) != 0)
+		return;
+	/* this runs in a worker thread, so use the reentrant conversion */
+	if (gmtime_r(&tv.tv_sec, &gtm) == NULL)
+		return;
+	(void) strftime(ts, sizeof (ts), "%FT%T", &gtm);
+
+	olen = snprintf(obuf, sizeof (obuf),
+	    "{\"log\":\"%s\",\"stream\":\"%s\",\"time\":\"%s.%09ldZ\"}\n",
+	    nbuf, (from == 1) ? "stdout" : "stderr", ts,
+	    (long)tv.tv_usec * 1000);
+	if (olen < 0)
+		return;
+	/*
+	 * snprintf() returns the length it wanted to write; never hand
+	 * write() more than the buffer actually holds.
+	 */
+	if ((size_t)olen >= sizeof (obuf))
+		olen = sizeof (obuf) - 1;
+
+	if (write(logfd, obuf, olen) != olen) {
+		/* complain only once about a failing log file */
+		if (!log_wr_err) {
+			zerror(zlogp, B_TRUE, "log file write error");
+			log_wr_err = B_TRUE;
+		}
+		return;
+	}
+
+	log_sz += olen;
+	if (log_rot_sz > 0 && log_sz >= log_rot_sz)
+		rotate_log();
+}
+
+/*
+ * Pause for up to slptime milliseconds while staying responsive to a zone
+ * halt.  The wait is implemented as a poll on the event stream, so any
+ * activity there ends the pause early.
+ *
+ * Returns B_TRUE if the event stream became readable (the zone is halting),
+ * B_FALSE if the timeout expired or the poll failed.
+ */
+static boolean_t
+halt_sleep(int slptime)
+{
+	struct pollfd evwatch;
+
+	evwatch.fd = eventstream[1];
+	evwatch.events = POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI;
+
+	/* a positive return means the event stream has input: zone halting */
+	return ((poll(&evwatch, 1, slptime) > 0) ? B_TRUE : B_FALSE);
+}
+
+/*
+ * This routine drives the logging and interactive I/O loop. It polls for
+ * input from the zone side of the fd (output to stdout/stderr), and from the
+ * client (input to the zone's stdin).  Additionally, it polls on the server
+ * fd, and disconnects any clients that might try to hook up with the zone
+ * while the fd's are in use.
+ *
+ * Data from the zone's stdout and stderr is formatted in json and written to
+ * the log file whether an interactive client is connected or not.
+ *
+ * When the client first calls us up, it is expected to send a line giving its
+ * "identity"; this consists of the string 'IDENT <locale> <flags>'. The
+ * client's pid is taken from its socket credentials so that we can report
+ * that the fd's are busy, along with some diagnostics about who has them
+ * busy; the locale is ignored here but kept for compatibility with the zlogin
+ * code when running on the zone's console.
+ *
+ * We need to handle the case where there is no server within the zone (or
+ * the server gets stuck) and data that we're writing to the zone server's
+ * stdin fills the pipe. Because of the way the zfd device works writes can
+ * flow into the stream and simply be dropped, if there is no server, or writes
+ * could return -1 with EAGAIN if the server is stuck. Since we ignore errors
+ * on the write to stdin, we won't get blocked in that case but we'd like to
+ * avoid dropping initial input if the server within the zone hasn't started
+ * yet. To handle this we wait to read initial input until we detect that there
+ * is a server inside the zone. We have to poll for this so that we can
+ * re-run the ioctl to notice when a server shows up. This poll/wait is handled
+ * by halt_sleep() so that we can be responsive if the zone wants to halt.
+ * We only do this check to avoid dropping initial input so it is possible for
+ * the server within the zone to go away later. At that point zfd will just
+ * drop any new input flowing into the stream.
+ *
+ * The gz* descriptors are the listening server sockets and the std* ones are
+ * the zfd masters; all of those are owned by the caller.  Every connection
+ * accepted here (control, stdin/stdout and stderr) is closed before
+ * returning.
+ */
+static void
+do_zfd_io(int gzctlfd, int gzservfd, int gzerrfd, int stdinfd, int stdoutfd,
+    int stderrfd)
+{
+	struct pollfd pollfds[8];
+	char ibuf[BUFSIZ + 1];	/* +1: wr_log_msg NUL-terminates in place */
+	int cc, ret;
+	int ctlfd = -1;
+	int clifd = -1;
+	int clierrfd = -1;
+	int pollerr = 0;
+	char clilocale[MAXPATHLEN];
+	pid_t clipid = 0;
+	uint_t flags = 0;
+	boolean_t stdin_ready = B_FALSE;
+	int slptime = 250;	/* initial poll sleep time in ms */
+
+	/* client control socket, watch for read events */
+	pollfds[0].fd = ctlfd;
+	pollfds[0].events = POLLIN | POLLRDNORM | POLLRDBAND |
+	    POLLPRI | POLLERR | POLLHUP | POLLNVAL;
+
+	/* client socket, watch for read events */
+	pollfds[1].fd = clifd;
+	pollfds[1].events = pollfds[0].events;
+
+	/* stdout, watch for read events */
+	pollfds[2].fd = stdoutfd;
+	pollfds[2].events = pollfds[0].events;
+
+	/* stderr, watch for read events */
+	pollfds[3].fd = stderrfd;
+	pollfds[3].events = pollfds[0].events;
+
+	/* the server control socket; watch for new connections */
+	pollfds[4].fd = gzctlfd;
+	pollfds[4].events = POLLIN | POLLRDNORM;
+
+	/* the server stdin/out socket; watch for new connections */
+	pollfds[5].fd = gzservfd;
+	pollfds[5].events = POLLIN | POLLRDNORM;
+
+	/* the server stderr socket; watch for new connections */
+	pollfds[6].fd = gzerrfd;
+	pollfds[6].events = POLLIN | POLLRDNORM;
+
+	/* the eventstream; any input means the zone is halting */
+	pollfds[7].fd = eventstream[1];
+	pollfds[7].events = pollfds[0].events;
+
+	while (!shutting_down) {
+		pollfds[0].revents = pollfds[1].revents = 0;
+		pollfds[2].revents = pollfds[3].revents = 0;
+		pollfds[4].revents = pollfds[5].revents = 0;
+		pollfds[6].revents = pollfds[7].revents = 0;
+
+		ret = poll(pollfds, 8, -1);
+		if (ret == -1 && errno != EINTR) {
+			zerror(zlogp, B_TRUE, "poll failed");
+			/* we are hosed, close connection */
+			break;
+		}
+
+		/* control events from client */
+		if (pollfds[0].revents &
+		    (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) {
+			/* process control message */
+			ctlcmd_process(ctlfd, stdoutfd, &flags);
+		} else if (pollfds[0].revents) {
+			/* bail if any error occurs */
+			pollerr = pollfds[0].revents;
+			zerror(zlogp, B_FALSE, "closing connection "
+			    "with control channel, pollerr %d\n", pollerr);
+			break;
+		}
+
+		/* event from client side */
+		if (pollfds[1].revents) {
+			if (stdin_ready) {
+				if (pollfds[1].revents & (POLLIN |
+				    POLLRDNORM | POLLRDBAND | POLLPRI)) {
+					errno = 0;
+					cc = read(clifd, ibuf, BUFSIZ);
+					if (cc > 0) {
+						/*
+						 * See comment for this
+						 * function on what happens if
+						 * there is no reader in the
+						 * zone. EOF is handled below.
+						 */
+						(void) write(stdinfd, ibuf, cc);
+					}
+				} else if (pollfds[1].revents & (POLLERR |
+				    POLLNVAL))  {
+					pollerr = pollfds[1].revents;
+					zerror(zlogp, B_FALSE,
+					    "closing connection "
+					    "with client, pollerr %d\n",
+					    pollerr);
+					break;
+				}
+
+				if (pollfds[1].revents & POLLHUP) {
+					if (flags & ZLOGIN_ZFD_EOF) {
+						/*
+						 * Let the client know. We've
+						 * already serviced any pending
+						 * regular input. Let the
+						 * stream clear since the EOF
+						 * ioctl jumps to the head.
+						 */
+						(void) ioctl(stdinfd, I_FLUSH);
+						if (halt_sleep(250))
+							break;
+						(void) ioctl(stdinfd, ZFD_EOF);
+					}
+					break;
+				}
+			} else {
+				if (ioctl(stdinfd, ZFD_HAS_SLAVE) == 0) {
+					stdin_ready = B_TRUE;
+				} else {
+					/*
+					 * There is nothing in the zone to read
+					 * our input. Presumably the user
+					 * providing input expects something to
+					 * show up, but that is no guarantee.
+					 * Since we haven't serviced the pending
+					 * input poll yet, we don't want to
+					 * immediately loop around but we also
+					 * need to be responsive if the zone is
+					 * halting.
+					 */
+					if (halt_sleep(slptime))
+						break;
+
+					/* back off, up to 5 seconds */
+					if (slptime < 5000)
+						slptime += 250;
+				}
+			}
+		}
+
+		/* event from the zone's stdout */
+		if (pollfds[2].revents) {
+			if (pollfds[2].revents &
+			    (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) {
+				errno = 0;
+				cc = read(stdoutfd, ibuf, BUFSIZ);
+				if (cc <= 0 && (errno != EINTR) &&
+				    (errno != EAGAIN))
+					break;
+				if (cc > 0) {
+					wr_log_msg(ibuf, cc, 1);
+
+					/*
+					 * Lose output if no one is listening,
+					 * otherwise pass it on.
+					 */
+					if (clifd != -1)
+						(void) write(clifd, ibuf, cc);
+				}
+			} else {
+				pollerr = pollfds[2].revents;
+				zerror(zlogp, B_FALSE,
+				    "closing connection with stdout zfd, "
+				    "pollerr %d\n", pollerr);
+				break;
+			}
+		}
+
+		/* event from the zone's stderr */
+		if (pollfds[3].revents) {
+			if (pollfds[3].revents &
+			    (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) {
+				errno = 0;
+				cc = read(stderrfd, ibuf, BUFSIZ);
+				if (cc <= 0 && (errno != EINTR) &&
+				    (errno != EAGAIN))
+					break;
+				if (cc > 0) {
+					wr_log_msg(ibuf, cc, 2);
+
+					/*
+					 * Lose output if no one is listening,
+					 * otherwise pass it on.
+					 */
+					if (clierrfd != -1)
+						(void) write(clierrfd, ibuf,
+						    cc);
+				}
+			} else {
+				pollerr = pollfds[3].revents;
+				zerror(zlogp, B_FALSE,
+				    "closing connection with stderr zfd, "
+				    "pollerr %d\n", pollerr);
+				break;
+			}
+		}
+
+		/* connect event from server control socket */
+		if (pollfds[4].revents) {
+			if (ctlfd != -1) {
+				/*
+				 * Test the client to see if it is really
+				 * still alive.  If it has died but we
+				 * haven't yet detected that, we might
+				 * deny a legitimate connect attempt.  If it
+				 * is dead, we break out; once we tear down
+				 * the old connection, the new connection
+				 * will happen.
+				 */
+				if (test_client(ctlfd) == -1) {
+					break;
+				}
+				/* we're already handling a client */
+				reject_client(gzctlfd, clipid);
+			} else {
+				ctlfd = accept_client(gzctlfd, &clipid,
+				    clilocale, sizeof (clilocale), &flags);
+				if (ctlfd != -1) {
+					pollfds[0].fd = ctlfd;
+				} else {
+					break;
+				}
+			}
+		}
+
+		/* connect event from server stdin/out socket */
+		if (pollfds[5].revents) {
+			if (ctlfd == -1) {
+				/*
+				 * This shouldn't happen since the client is
+				 * expected to connect on the control socket
+				 * first. If we see this, tear everything down
+				 * and start over.
+				 */
+				zerror(zlogp, B_FALSE, "GZ zfd stdin/stdout "
+				    "connection attempt with no GZ control\n");
+				break;
+			}
+			assert(clifd == -1);
+			if ((clifd = accept_socket(gzservfd, clipid)) != -1) {
+				/* No need to watch for other new connections */
+				pollfds[5].fd = -1;
+				/* Client input is of interest, though */
+				pollfds[1].fd = clifd;
+			} else {
+				break;
+			}
+		}
+
+		/* connection event from server stderr socket */
+		if (pollfds[6].revents) {
+			if (ctlfd == -1) {
+				/*
+				 * Same conditions apply to stderr as stdin/out.
+				 */
+				zerror(zlogp, B_FALSE, "GZ zfd stderr "
+				    "connection attempt with no GZ control\n");
+				break;
+			}
+			assert(clierrfd == -1);
+			if ((clierrfd = accept_socket(gzerrfd, clipid)) != -1) {
+				/* No need to watch for other new connections */
+				pollfds[6].fd = -1;
+			} else {
+				break;
+			}
+		}
+
+		/*
+		 * Watch for events on the eventstream.  This is how we get
+		 * notified of the zone halting, etc.  It provides us a
+		 * "wakeup" from poll when important things happen, which
+		 * is good.
+		 */
+		if (pollfds[7].revents) {
+			break;
+		}
+	}
+
+	/*
+	 * Tear down every connection accepted above.  The control connection
+	 * must be closed too; otherwise each client session would leave a
+	 * descriptor behind.
+	 */
+	if (ctlfd != -1) {
+		(void) shutdown(ctlfd, SHUT_RDWR);
+		(void) close(ctlfd);
+	}
+
+	if (clifd != -1) {
+		(void) shutdown(clifd, SHUT_RDWR);
+		(void) close(clifd);
+	}
+
+	if (clierrfd != -1) {
+		(void) shutdown(clierrfd, SHUT_RDWR);
+		(void) close(clierrfd);
+	}
+}
+
+/*
+ * Open the zfd master device /dev/zfd/<zone>/master/<id> with the access
+ * mode given in 'rw' (plus O_NONBLOCK | O_NOCTTY | O_CLOEXEC) and put the
+ * stream into RNORM|RPROTDIS read mode.
+ *
+ * A failed open is retried once a second, for a little over a minute or
+ * until the zone starts shutting down.  Returns the open descriptor, or -1
+ * on failure.
+ */
+static int
+open_fd(zlog_t *zlogp, int id, int rw)
+{
+	int fd;
+	int flag = O_NONBLOCK | O_NOCTTY | O_CLOEXEC;
+	int retried = 0;
+	char stdpath[MAXPATHLEN];
+
+	(void) snprintf(stdpath, sizeof (stdpath), "/dev/zfd/%s/master/%d",
+	    zone_name, id);
+	flag |= rw;
+
+	while (!shutting_down) {
+		if ((fd = open(stdpath, flag)) != -1) {
+			/*
+			 * Setting RPROTDIS on the stream means that the
+			 * control portion of messages received (which we don't
+			 * care about) will be discarded by the stream head. If
+			 * we allowed such messages, we wouldn't be able to use
+			 * read(2), as it fails (EBADMSG) when a message with a
+			 * control element is received.
+			 */
+			if (ioctl(fd, I_SRDOPT, RNORM|RPROTDIS) == -1) {
+				/* report first; close() may change errno */
+				zerror(zlogp, B_TRUE,
+				    "failed to set options on zfd");
+				/* don't leak the descriptor on failure */
+				(void) close(fd);
+				return (-1);
+			}
+			return (fd);
+		}
+
+		if (retried++ > 60)
+			break;
+
+		(void) sleep(1);
+	}
+
+	zerror(zlogp, B_TRUE, "failed to open zfd");
+	return (-1);
+}
+
+/*
+ * Open (creating if necessary) the log file <zonepath>/logs/LOGNAME for
+ * appending, and record its descriptor in logfd and its current size in
+ * log_sz.  The logs directory is created first if it does not exist.  On
+ * failure logfd is left at -1 and log_sz at 0.
+ */
+static void
+open_logfile()
+{
+	char path[MAXPATHLEN];
+	struct stat64 st;
+
+	logfd = -1;
+	log_sz = 0;
+
+	/* best effort; the directory usually exists already */
+	(void) snprintf(path, sizeof (path), "%s/logs", zonepath);
+	(void) mkdir(path, 0700);
+
+	(void) snprintf(path, sizeof (path), "%s/logs/%s", zonepath,
+	    LOGNAME);
+
+	logfd = open(path, O_WRONLY | O_APPEND | O_CREAT, 0600);
+	if (logfd == -1) {
+		zerror(zlogp, B_TRUE, "failed to open log file");
+		return;
+	}
+
+	/* pick up the existing size so that rotation accounting is right */
+	if (fstat64(logfd, &st) == 0)
+		log_sz = st.st_size;
+}
+
+/*
+ * Rotate the log: close the current log file, rename it to
+ * <zonepath>/logs/LOGNAME.<YYYYMMDD>T<HHMMSS>Z (UTC) and open a fresh one.
+ * If the current time cannot be obtained nothing is changed.  A failed
+ * rename is reported, and the log file is reopened regardless.
+ */
+static void
+rotate_log()
+{
+	time_t now;
+	struct tm utc;
+	char curpath[MAXPATHLEN], rotpath[MAXPATHLEN];
+
+	now = time(NULL);
+	if (now == (time_t)-1 || gmtime_r(&now, &utc) == NULL) {
+		zerror(zlogp, B_TRUE, "failed to format time");
+		return;
+	}
+
+	(void) snprintf(curpath, sizeof (curpath), "%s/logs/%s", zonepath,
+	    LOGNAME);
+	(void) snprintf(rotpath, sizeof (rotpath),
+	    "%s/logs/%s.%d%02d%02dT%02d%02d%02dZ",
+	    zonepath, LOGNAME, utc.tm_year + 1900, utc.tm_mon + 1,
+	    utc.tm_mday, utc.tm_hour, utc.tm_min, utc.tm_sec);
+
+	(void) close(logfd);
+	if (rename(curpath, rotpath) != 0)
+		zerror(zlogp, B_TRUE, "failed to rotate log file");
+	open_logfile();
+}
+
+
+/*
+ * SIGHUP handler: if a log file is currently open, close it and open it
+ * again.  Does nothing when logging is not active.
+ */
+/* ARGSUSED */
+void
+hup_handler(int i)
+{
+	if (logfd == -1)
+		return;
+
+	(void) close(logfd);
+	open_logfile();
+}
+
+/*
+ * Body of the worker thread to log the zfd's stdout and stderr to a log file
+ * and to perform interactive IO to the stdin, stdout and stderr zfd's.
+ *
+ * The stdin, stdout and stderr are from the perspective of the process inside
+ * the zone, so the zoneadmd view is opposite (i.e. we write to the stdin fd
+ * and read from the stdout/stderr fds).
+ *
+ * Each pass of the main loop sets up the three server sockets, opens the zfd
+ * master device(s) and runs do_zfd_io() until it returns; everything is then
+ * torn down and, unless we are shutting down, set up again.  Descriptor
+ * variables are reset to -1 whenever they are closed so that a later pass (or
+ * the final drain) can never close or read a stale descriptor number.
+ */
+static void
+srvr(void *modearg)
+{
+	zfd_mode_t *mode = (zfd_mode_t *)modearg;
+	int gzctlfd = -1;
+	int gzoutfd = -1;
+	int stdinfd = -1;
+	int stdoutfd = -1;
+	sigset_t blockset;
+	int gzerrfd = -1;
+	int stderrfd = -1;
+	int flags;
+	int len;
+	char ibuf[BUFSIZ + 1];	/* +1: wr_log_msg NUL-terminates in place */
+
+	if (!shutting_down && mode->zmode_gzlogging)
+		open_logfile();
+
+	/*
+	 * This thread should receive SIGHUP so that it can close the log
+	 * file, and reopen it, during log rotation.
+	 */
+	sigset(SIGHUP, hup_handler);
+	(void) sigfillset(&blockset);
+	(void) sigdelset(&blockset, SIGHUP);
+	(void) thr_sigsetmask(SIG_BLOCK, &blockset, NULL);
+
+	if (!shutting_down) {
+		if (pipe(eventstream) != 0) {
+			zerror(zlogp, B_TRUE, "failed to open logger control "
+			    "pipe");
+			return;
+		}
+	}
+
+	while (!shutting_down) {
+		if (init_server_sock(zlogp, &gzctlfd, "ctl") == -1) {
+			zerror(zlogp, B_FALSE,
+			    "server setup: control socket init failed");
+			goto death;
+		}
+		if (init_server_sock(zlogp, &gzoutfd, "out") == -1) {
+			zerror(zlogp, B_FALSE,
+			    "server setup: stdout socket init failed");
+			goto death;
+		}
+		if (init_server_sock(zlogp, &gzerrfd, "err") == -1) {
+			zerror(zlogp, B_FALSE,
+			    "server setup: stderr socket init failed");
+			goto death;
+		}
+
+		if (mode->zmode_n_stddevs == 1) {
+			/* a single device carries both directions */
+			if ((stdinfd = open_fd(zlogp, 0, O_RDWR)) == -1) {
+				goto death;
+			}
+			stdoutfd = stdinfd;
+		} else {
+			if ((stdinfd = open_fd(zlogp, 0, O_WRONLY)) == -1 ||
+			    (stdoutfd = open_fd(zlogp, 1, O_RDONLY)) == -1 ||
+			    (stderrfd = open_fd(zlogp, 2, O_RDONLY)) == -1) {
+				goto death;
+			}
+		}
+
+		do_zfd_io(gzctlfd, gzoutfd, gzerrfd, stdinfd, stdoutfd,
+		    stderrfd);
+death:
+		destroy_server_sock(gzctlfd, "ctl");
+		destroy_server_sock(gzoutfd, "out");
+		destroy_server_sock(gzerrfd, "err");
+		/*
+		 * Forget the closed descriptors.  init_server_sock() leaves
+		 * its output untouched on failure, so without this a failed
+		 * pass would close last pass's descriptor number again.
+		 */
+		gzctlfd = gzoutfd = gzerrfd = -1;
+
+		/* when shutting down, leave open until drained */
+		if (!shutting_down) {
+			(void) close(stdinfd);
+			if (mode->zmode_n_stddevs == 3) {
+				(void) close(stdoutfd);
+				(void) close(stderrfd);
+			}
+			stdinfd = stdoutfd = stderrfd = -1;
+		}
+	}
+
+	/*
+	 * Attempt to drain remaining log output from the zone prior to closing
+	 * the file descriptors. This helps ensure that complete logs are
+	 * captured during shutdown.
+	 */
+	if (stdoutfd != -1) {
+		flags = fcntl(stdoutfd, F_GETFL, 0);
+		if (flags != -1 &&
+		    fcntl(stdoutfd, F_SETFL, flags | O_NONBLOCK) != -1) {
+			while ((len = read(stdoutfd, ibuf, BUFSIZ)) > 0)
+				wr_log_msg(ibuf, len, 1);
+		}
+		/* with one device this also closes stdin (same fd) */
+		(void) close(stdoutfd);
+	}
+
+	if (mode->zmode_n_stddevs > 1) {
+		if (stdinfd != -1)
+			(void) close(stdinfd);
+		if (stderrfd != -1) {
+			flags = fcntl(stderrfd, F_GETFL, 0);
+			if (flags != -1 &&
+			    fcntl(stderrfd, F_SETFL, flags | O_NONBLOCK)
+			    != -1) {
+				while ((len = read(stderrfd, ibuf, BUFSIZ))
+				    > 0)
+					wr_log_msg(ibuf, len, 2);
+			}
+			(void) close(stderrfd);
+		}
+	}
+
+
+	(void) close(eventstream[0]);
+	eventstream[0] = -1;
+	(void) close(eventstream[1]);
+	eventstream[1] = -1;
+	if (logfd != -1)
+		(void) close(logfd);
+}
+
+/*
+ * The meaning of the original legacy values for the zlog-mode evolved over
+ * time, to the point where the old names no longer made sense. The current
+ * values are simply positional letters used to indicate various capabilities.
+ * The following table shows the meaning of the mode values, along with the
+ * legacy name which we continue to support for compatibility. Any future
+ * capability can add a letter to the left and '-' is implied for existing
+ * strings.
+ *
+ * zlog-mode    gz log - tty - ngz log
+ * ---------    ------   ---   -------
+ * gt- (int)       y      y       n
+ * g-- (log)       y      n       n
+ * gtn (nlint)     y      y       y
+ * g-n (nolog)     y      n       y
+ * -t-             n      y       n
+ * ---             n      n       n
+ *
+ * Legacy names are matched as prefixes of the attribute value; the positional
+ * strings must match exactly.  An unrecognized value leaves the mode as it
+ * was (all zeroes unless an earlier attribute set it).
+ *
+ * This function also obtains a maximum log size while it is reading the
+ * zone configuration.
+ */
+static void
+get_mode_logmax(zfd_mode_t *mode)
+{
+	/* Lookup table, searched in order; the first match wins. */
+	static const struct {
+		const char	*zm_name;	/* positional mode string */
+		const char	*zm_legacy;	/* legacy prefix, or NULL */
+		boolean_t	zm_gzlog;
+		int		zm_nstd;
+		int		zm_naddl;
+	} modes[] = {
+		{ "g--", "log",   B_TRUE,  3, 0 },
+		{ "g-n", "nolog", B_TRUE,  3, 2 },
+		{ "gt-", "int",   B_TRUE,  1, 0 },
+		{ "gtn", "nlint", B_TRUE,  1, 1 },
+		{ "-t-", NULL,    B_FALSE, 1, 0 },
+		{ "---", NULL,    B_FALSE, 3, 0 }
+	};
+	zone_dochandle_t handle;
+	struct zone_attrtab attr;
+	size_t m;
+
+	bzero(mode, sizeof (zfd_mode_t));
+
+	handle = zonecfg_init_handle();
+	if (handle == NULL)
+		return;
+
+	if (zonecfg_get_handle(zone_name, handle) != Z_OK ||
+	    zonecfg_setattrent(handle) != Z_OK) {
+		zonecfg_fini_handle(handle);
+		return;
+	}
+
+	while (zonecfg_getattrent(handle, &attr) == Z_OK) {
+		char *val = attr.zone_attr_value;
+
+		if (strcmp(ZLOG_MODE, attr.zone_attr_name) == 0) {
+			for (m = 0; m < sizeof (modes) / sizeof (modes[0]);
+			    m++) {
+				const char *old = modes[m].zm_legacy;
+
+				if (strcmp(modes[m].zm_name, val) != 0 &&
+				    (old == NULL ||
+				    strncmp(old, val, strlen(old)) != 0))
+					continue;
+
+				mode->zmode_gzlogging = modes[m].zm_gzlog;
+				mode->zmode_n_stddevs = modes[m].zm_nstd;
+				mode->zmode_n_addl_devs = modes[m].zm_naddl;
+				break;
+			}
+		} else if (strcmp(LOG_MAXSZ, attr.zone_attr_name) == 0) {
+			char *endp;
+			long lval;
+
+			/* only accept a value that is entirely numeric */
+			lval = strtol(val, &endp, 10);
+			if (*endp == '\0')
+				log_rot_sz = (size_t)lval;
+		}
+	}
+	(void) zonecfg_endattrent(handle);
+
+	zonecfg_fini_handle(handle);
+}
+
+/*
+ * Read the zone's zfd mode, create the zfd devices and start the logger
+ * thread (srvr).  Nothing is done if the configuration yields no standard
+ * devices.  On thread creation failure logger_tid is reset to 0.
+ */
+void
+create_log_thread(zlog_t *logp, zoneid_t id)
+{
+	int err;
+
+	shutting_down = 0;
+	zlogp = logp;
+
+	get_mode_logmax(&mode);
+	if (mode.zmode_n_stddevs == 0)
+		return;
+
+	if (init_zfd_devs(zlogp, &mode) == -1) {
+		zerror(zlogp, B_FALSE,
+		    "zfd setup: device initialization failed");
+		return;
+	}
+
+	err = thr_create(NULL, 0, (void * (*)(void *))srvr, (void *)&mode, 0,
+	    &logger_tid);
+	if (err == 0)
+		return;
+
+	zerror(zlogp, B_FALSE, "error %d creating logger thread", err);
+	logger_tid = 0;
+}
+
+/*
+ * Stop the logger thread, if one is running, and remove the zfd devices.
+ * The thread is told to stop by setting shutting_down and writing to the
+ * event stream to wake it from poll; we then wait for it to exit.
+ */
+void
+destroy_log_thread()
+{
+	boolean_t running = (logger_tid != 0) ? B_TRUE : B_FALSE;
+
+	if (running) {
+		int stop = 1;
+
+		shutting_down = 1;
+
+		/* break out of poll to shutdown */
+		if (eventstream[0] != -1)
+			(void) write(eventstream[0], &stop, sizeof (stop));
+
+		(void) thr_join(logger_tid, NULL, NULL);
+		logger_tid = 0;
+	}
+
+	(void) destroy_zfd_devs(zlogp);
+}
diff --git a/usr/src/cmd/zoneadmd/zoneadmd.c b/usr/src/cmd/zoneadmd/zoneadmd.c
index e2bbd20640..0a714fda38 100644
--- a/usr/src/cmd/zoneadmd/zoneadmd.c
+++ b/usr/src/cmd/zoneadmd/zoneadmd.c
@@ -22,6 +22,7 @@
 /*
  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc. All rights reserved.
  */
 
 /*
@@ -68,6 +69,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/sysmacros.h>
+#include <sys/time.h>
 
 #include <bsm/adt.h>
 #include <bsm/adt_event.h>
@@ -108,6 +110,8 @@
 
 static char *progname;
 char *zone_name;	/* zone which we are managing */
+zone_dochandle_t snap_hndl;	/* handle for snapshot created when ready */
+char zonepath[MAXNAMELEN];
 char pool_name[MAXNAMELEN];
 char default_brand[MAXNAMELEN];
 char brand_name[MAXNAMELEN];
@@ -116,10 +120,11 @@ boolean_t zone_iscluster;
 boolean_t zone_islabeled;
 boolean_t shutdown_in_progress;
 static zoneid_t zone_id;
+static zoneid_t zone_did = 0;
 dladm_handle_t dld_handle = NULL;
 
-static char pre_statechg_hook[2 * MAXPATHLEN];
-static char post_statechg_hook[2 * MAXPATHLEN];
+char pre_statechg_hook[2 * MAXPATHLEN];
+char post_statechg_hook[2 * MAXPATHLEN];
 char query_hook[2 * MAXPATHLEN];
 
 zlog_t logsys;
@@ -141,6 +146,9 @@ boolean_t bringup_failure_recovery = B_FALSE; /* ignore certain failures */
 
 #define	DEFAULT_LOCALE	"C"
 
+#define	RSRC_NET	"net"
+#define	RSRC_DEV	"device"
+
 static const char *
 z_cmd_name(zone_cmd_t zcmd)
 {
@@ -257,34 +265,31 @@ zerror(zlog_t *zlogp, boolean_t use_strerror, const char *fmt, ...)
 }
 
 /*
- * Emit a warning for any boot arguments which are unrecognized.  Since
- * Solaris boot arguments are getopt(3c) compatible (see kernel(1m)), we
+ * Since Solaris boot arguments are getopt(3c) compatible (see kernel(1m)), we
  * put the arguments into an argv style array, use getopt to process them,
- * and put the resultant argument string back into outargs.
+ * and put the resultant argument string back into outargs. Non-Solaris brands
+ * may support alternate forms of boot arguments so we must handle that as well.
  *
  * During the filtering, we pull out any arguments which are truly "boot"
  * arguments, leaving only those which are to be passed intact to the
  * progenitor process.  The one we support at the moment is -i, which
  * indicates to the kernel which program should be launched as 'init'.
  *
- * A return of Z_INVAL indicates specifically that the arguments are
- * not valid; this is a non-fatal error.  Except for Z_OK, all other return
- * values are treated as fatal.
+ * Except for Z_OK, all other return values are treated as fatal.
  */
 static int
 filter_bootargs(zlog_t *zlogp, const char *inargs, char *outargs,
-    char *init_file, char *badarg)
+    char *init_file)
 {
 	int argc = 0, argc_save;
 	int i;
-	int err;
+	int err = Z_OK;
 	char *arg, *lasts, **argv = NULL, **argv_save;
 	char zonecfg_args[BOOTARGS_MAX];
 	char scratchargs[BOOTARGS_MAX], *sargs;
 	char c;
 
 	bzero(outargs, BOOTARGS_MAX);
-	bzero(badarg, BOOTARGS_MAX);
 
 	/*
 	 * If the user didn't specify transient boot arguments, check
@@ -292,25 +297,10 @@ filter_bootargs(zlog_t *zlogp, const char *inargs, char *outargs,
 	 * and use them if applicable.
 	 */
 	if (inargs == NULL || inargs[0] == '\0')  {
-		zone_dochandle_t handle;
-		if ((handle = zonecfg_init_handle()) == NULL) {
-			zerror(zlogp, B_TRUE,
-			    "getting zone configuration handle");
-			return (Z_BAD_HANDLE);
-		}
-		err = zonecfg_get_snapshot_handle(zone_name, handle);
-		if (err != Z_OK) {
-			zerror(zlogp, B_FALSE,
-			    "invalid configuration snapshot");
-			zonecfg_fini_handle(handle);
-			return (Z_BAD_HANDLE);
-		}
-
 		bzero(zonecfg_args, sizeof (zonecfg_args));
-		(void) zonecfg_get_bootargs(handle, zonecfg_args,
+		(void) zonecfg_get_bootargs(snap_hndl, zonecfg_args,
 		    sizeof (zonecfg_args));
 		inargs = zonecfg_args;
-		zonecfg_fini_handle(handle);
 	}
 
 	if (strlen(inargs) >= BOOTARGS_MAX) {
@@ -347,14 +337,22 @@ filter_bootargs(zlog_t *zlogp, const char *inargs, char *outargs,
 	}
 
 	/*
-	 * We preserve compatibility with the Solaris system boot behavior,
+	 * We preserve compatibility with the illumos system boot behavior,
 	 * which allows:
 	 *
 	 * 	# reboot kernel/unix -s -m verbose
 	 *
-	 * In this example, kernel/unix tells the booter what file to
-	 * boot.  We don't want reboot in a zone to be gratuitously different,
-	 * so we silently ignore the boot file, if necessary.
+	 * In this example, kernel/unix tells the booter what file to boot. The
+	 * original intent of this was that we didn't want reboot in a zone to
+	 * be gratuitously different, so we would silently ignore the boot
+	 * file, if necessary. However, this usage is archaic and has never
+	 * been common, since it is impossible to boot a zone onto a different
+	 * kernel. Ignoring the first argument breaks for non-native brands
+	 * which pass boot arguments in a different style. e.g.
+	 *	systemd.log_level=debug
+	 * Thus, for backward compatibility we only ignore the first argument
+	 * if it appears to be in the illumos form and attempting to specify a
+	 * kernel.
 	 */
 	if (argv[0] == NULL)
 		goto done;
@@ -362,7 +360,7 @@ filter_bootargs(zlog_t *zlogp, const char *inargs, char *outargs,
 	assert(argv[0][0] != ' ');
 	assert(argv[0][0] != '\t');
 
-	if (argv[0][0] != '-' && argv[0][0] != '\0') {
+	if (strncmp(argv[0], "kernel/", 7) == 0) {
 		argv = &argv[1];
 		argc--;
 	}
@@ -390,36 +388,29 @@ filter_bootargs(zlog_t *zlogp, const char *inargs, char *outargs,
 			break;
 		case '?':
 			/*
-			 * We warn about unknown arguments but pass them
-			 * along anyway-- if someone wants to develop their
-			 * own init replacement, they can pass it whatever
-			 * args they want.
+			 * If a brand has its own init, we need to pass along
+			 * whatever the user provides. We use the entire
+			 * unknown string here so that we correctly handle
+			 * unknown long options (e.g. --debug).
 			 */
-			err = Z_INVAL;
 			(void) snprintf(outargs, BOOTARGS_MAX,
-			    "%s -%c", outargs, optopt);
-			(void) snprintf(badarg, BOOTARGS_MAX,
-			    "%s -%c", badarg, optopt);
+			    "%s %s", outargs, argv[optind - 1]);
 			break;
 		}
 	}
 
 	/*
-	 * For Solaris Zones we warn about and discard non-option arguments.
-	 * Hence 'boot foo bar baz gub' --> 'boot'.  However, to be similar
-	 * to the kernel, we concat up all the other remaining boot args.
-	 * and warn on them as a group.
+	 * We need to pass along everything else since we don't know what
+	 * the brand's init is expecting. For example, an argument list like:
+	 *   --confdir /foo --debug
+	 * will cause the getopt parsing to stop at '/foo' but we need to pass
+	 * that on, along with the '--debug'. This does mean that we require
+	 * any of our known options (-ifms) to precede the brand-specific ones.
 	 */
-	if (optind < argc) {
-		err = Z_INVAL;
-		while (optind < argc) {
-			(void) snprintf(badarg, BOOTARGS_MAX, "%s%s%s",
-			    badarg, strlen(badarg) > 0 ? " " : "",
-			    argv[optind]);
-			optind++;
-		}
-		zerror(zlogp, B_FALSE, "WARNING: Unused or invalid boot "
-		    "arguments `%s'.", badarg);
+	while (optind < argc) {
+		(void) snprintf(outargs, BOOTARGS_MAX, "%s %s", outargs,
+		    argv[optind]);
+		optind++;
 	}
 
 done:
@@ -458,7 +449,7 @@ mkzonedir(zlog_t *zlogp)
  * Run the brand's pre-state change callback, if it exists.
  */
 static int
-brand_prestatechg(zlog_t *zlogp, int state, int cmd)
+brand_prestatechg(zlog_t *zlogp, int state, int cmd, boolean_t debug)
 {
 	char cmdbuf[2 * MAXPATHLEN];
 	const char *altroot;
@@ -471,7 +462,7 @@ brand_prestatechg(zlog_t *zlogp, int state, int cmd)
 	    state, cmd, altroot) > sizeof (cmdbuf))
 		return (-1);
 
-	if (do_subproc(zlogp, cmdbuf, NULL) != 0)
+	if (do_subproc(zlogp, cmdbuf, NULL, debug) != 0)
 		return (-1);
 
 	return (0);
@@ -481,7 +472,7 @@ brand_prestatechg(zlog_t *zlogp, int state, int cmd)
  * Run the brand's post-state change callback, if it exists.
  */
 static int
-brand_poststatechg(zlog_t *zlogp, int state, int cmd)
+brand_poststatechg(zlog_t *zlogp, int state, int cmd, boolean_t debug)
 {
 	char cmdbuf[2 * MAXPATHLEN];
 	const char *altroot;
@@ -494,7 +485,7 @@ brand_poststatechg(zlog_t *zlogp, int state, int cmd)
 	    state, cmd, altroot) > sizeof (cmdbuf))
 		return (-1);
 
-	if (do_subproc(zlogp, cmdbuf, NULL) != 0)
+	if (do_subproc(zlogp, cmdbuf, NULL, debug) != 0)
 		return (-1);
 
 	return (0);
@@ -533,35 +524,44 @@ notify_zonestatd(zoneid_t zoneid)
  * subcommand.
  */
 static int
-zone_ready(zlog_t *zlogp, zone_mnt_t mount_cmd, int zstate)
+zone_ready(zlog_t *zlogp, zone_mnt_t mount_cmd, int zstate, boolean_t debug)
 {
 	int err;
+	boolean_t snapped = B_FALSE;
 
-	if (brand_prestatechg(zlogp, zstate, Z_READY) != 0)
-		return (-1);
-
+	if ((snap_hndl = zonecfg_init_handle()) == NULL) {
+		zerror(zlogp, B_TRUE, "getting zone configuration handle");
+		goto bad;
+	}
 	if ((err = zonecfg_create_snapshot(zone_name)) != Z_OK) {
 		zerror(zlogp, B_FALSE, "unable to create snapshot: %s",
 		    zonecfg_strerror(err));
 		goto bad;
 	}
+	snapped = B_TRUE;
 
-	if ((zone_id = vplat_create(zlogp, mount_cmd)) == -1) {
-		if ((err = zonecfg_destroy_snapshot(zone_name)) != Z_OK)
-			zerror(zlogp, B_FALSE, "destroying snapshot: %s",
-			    zonecfg_strerror(err));
+	if (zonecfg_get_snapshot_handle(zone_name, snap_hndl) != Z_OK) {
+		zerror(zlogp, B_FALSE, "invalid configuration snapshot");
 		goto bad;
 	}
+
+	if (zone_did == 0)
+		zone_did = zone_get_did(zone_name);
+
+	if (brand_prestatechg(zlogp, zstate, Z_READY, debug) != 0)
+		goto bad;
+
+	if ((zone_id = vplat_create(zlogp, mount_cmd, zone_did)) == -1)
+		goto bad;
+
 	if (vplat_bringup(zlogp, mount_cmd, zone_id) != 0) {
 		bringup_failure_recovery = B_TRUE;
-		(void) vplat_teardown(NULL, (mount_cmd != Z_MNT_BOOT), B_FALSE);
-		if ((err = zonecfg_destroy_snapshot(zone_name)) != Z_OK)
-			zerror(zlogp, B_FALSE, "destroying snapshot: %s",
-			    zonecfg_strerror(err));
+		(void) vplat_teardown(NULL, (mount_cmd != Z_MNT_BOOT), B_FALSE,
+		    debug);
 		goto bad;
 	}
 
-	if (brand_poststatechg(zlogp, zstate, Z_READY) != 0)
+	if (brand_poststatechg(zlogp, zstate, Z_READY, debug) != 0)
 		goto bad;
 
 	return (0);
@@ -571,7 +571,13 @@ bad:
 	 * If something goes wrong, we up the zones's state to the target
 	 * state, READY, and then invoke the hook as if we're halting.
 	 */
-	(void) brand_poststatechg(zlogp, ZONE_STATE_READY, Z_HALT);
+	(void) brand_poststatechg(zlogp, ZONE_STATE_READY, Z_HALT, debug);
+	if (snapped)
+		if ((err = zonecfg_destroy_snapshot(zone_name)) != Z_OK)
+			zerror(zlogp, B_FALSE, "destroying snapshot: %s",
+			    zonecfg_strerror(err));
+	zonecfg_fini_handle(snap_hndl);
+	snap_hndl = NULL;
 	return (-1);
 }
 
@@ -623,15 +629,8 @@ mount_early_fs(void *data, const char *spec, const char *dir,
 
 	/* determine the zone rootpath */
 	if (mount_cmd) {
-		char zonepath[MAXPATHLEN];
 		char luroot[MAXPATHLEN];
 
-		if (zone_get_zonepath(zone_name,
-		    zonepath, sizeof (zonepath)) != Z_OK) {
-			zerror(zlogp, B_FALSE, "unable to determine zone path");
-			return (-1);
-		}
-
 		(void) snprintf(luroot, sizeof (luroot), "%s/lu", zonepath);
 		resolve_lofs(zlogp, luroot, sizeof (luroot));
 		(void) strlcpy(rootpath, luroot, sizeof (rootpath));
@@ -686,6 +685,8 @@ mount_early_fs(void *data, const char *spec, const char *dir,
 		char opt_buf[MAX_MNTOPT_STR];
 		int optlen = 0;
 		int mflag = MS_DATA;
+		int i;
+		int ret;
 
 		(void) ct_tmpl_clear(tmpl_fd);
 		/*
@@ -713,9 +714,26 @@ mount_early_fs(void *data, const char *spec, const char *dir,
 			optlen = MAX_MNTOPT_STR;
 			mflag = MS_OPTIONSTR;
 		}
-		if (mount(spec, dir, mflag, fstype, NULL, 0, opt, optlen) != 0)
-			_exit(errno);
-		_exit(0);
+
+		/*
+		 * There is an obscure race condition which can cause mount
+		 * to return EBUSY. This happens for example on the mount
+		 * of the zone's /etc/svc/volatile file system if there is
+		 * a GZ process running svcs -Z, which will touch the
+		 * mountpoint, just as we're trying to do the mount. To cope
+		 * with this, we retry up to 3 times to let this transient
+		 * process get out of the way.
+		 */
+		for (i = 0; i < 3; i++) {
+			ret = 0;
+			if (mount(spec, dir, mflag, fstype, NULL, 0, opt,
+			    optlen) != 0)
+				ret = errno;
+			if (ret != EBUSY)
+				break;
+			(void) sleep(1);
+		}
+		_exit(ret);
 	}
 
 	/* parent */
@@ -739,12 +757,151 @@ mount_early_fs(void *data, const char *spec, const char *dir,
 }
 
 /*
+ * env variable name format
+ *	_ZONECFG_{resource name}_{identifying attr. name}_{property name}
+ * Any dashes (-) in the property names are replaced with underscore (_).
+ */
+static void
+set_zonecfg_env(char *rsrc, char *attr, char *name, char *val)
+{
+	/* Holds a maximal rsrc + attr name plus the prefix and separators. */
+	char envname[2 * MAXNAMELEN + 32];
+	char *cp;
+
+	if (attr != NULL)
+		(void) snprintf(envname, sizeof (envname),
+		    "_ZONECFG_%s_%s_%s", rsrc, attr, name);
+	else
+		(void) snprintf(envname, sizeof (envname), "_ZONECFG_%s_%s",
+		    rsrc, name);
+
+	/* Replace every dash in the assembled name with an underscore. */
+	for (cp = envname; *cp != '\0'; cp++)
+		if (*cp == '-')
+			*cp = '_';
+	(void) setenv(envname, val, 1);
+}
+
+/*
+ * Export the snapshot's net, device and attr settings into the environment
+ * for the boot and state change hooks, then set _ZONEADMD_brand_debug to "1"
+ * if debug is true or to "" otherwise.
+ *
+ * Returns Z_OK on success or when snap_hndl is NULL (nothing is exported
+ * then); otherwise the error from the failing zonecfg_set*ent() call.
+ */
+static int
+setup_subproc_env(boolean_t debug)
+{
+	int res;
+	struct zone_nwiftab ntab;
+	struct zone_devtab dtab;
+	struct zone_attrtab atab;
+	char net_resources[MAXNAMELEN * 2];
+	char dev_resources[MAXNAMELEN * 2];
+
+	/* snap_hndl is null when called through the set_brand_env code path */
+	if (snap_hndl == NULL)
+		return (Z_OK);
+
+	/* strlcat() appends, so both lists must start out empty. */
+	net_resources[0] = '\0';
+	dev_resources[0] = '\0';
+	if ((res = zonecfg_setnwifent(snap_hndl)) != Z_OK)
+		goto done;
+
+	while (zonecfg_getnwifent(snap_hndl, &ntab) == Z_OK) {
+		struct zone_res_attrtab *rap;
+		char *phys = ntab.zone_nwif_physical;
+
+		(void) strlcat(net_resources, phys, sizeof (net_resources));
+		(void) strlcat(net_resources, " ", sizeof (net_resources));
+
+		set_zonecfg_env(RSRC_NET, phys, "physical", phys);
+
+		set_zonecfg_env(RSRC_NET, phys, "address",
+		    ntab.zone_nwif_address);
+		set_zonecfg_env(RSRC_NET, phys, "allowed-address",
+		    ntab.zone_nwif_allowed_address);
+		set_zonecfg_env(RSRC_NET, phys, "defrouter",
+		    ntab.zone_nwif_defrouter);
+		set_zonecfg_env(RSRC_NET, phys, "global-nic",
+		    ntab.zone_nwif_gnic);
+		set_zonecfg_env(RSRC_NET, phys, "mac-addr", ntab.zone_nwif_mac);
+		set_zonecfg_env(RSRC_NET, phys, "vlan-id",
+		    ntab.zone_nwif_vlan_id);
+
+		for (rap = ntab.zone_nwif_attrp; rap != NULL;
+		    rap = rap->zone_res_attr_next)
+			set_zonecfg_env(RSRC_NET, phys, rap->zone_res_attr_name,
+			    rap->zone_res_attr_value);
+		nwifent_free_attrs(&ntab);
+	}
+
+	(void) setenv("_ZONECFG_net_resources", net_resources, 1);
+
+	(void) zonecfg_endnwifent(snap_hndl);
+
+	if ((res = zonecfg_setdevent(snap_hndl)) != Z_OK)
+		goto done;
+
+	/* NOTE(review): dev_resources is built but never exported; confirm. */
+	while (zonecfg_getdevent(snap_hndl, &dtab) == Z_OK) {
+		struct zone_res_attrtab *rap;
+		char *match = dtab.zone_dev_match;
+
+		(void) strlcat(dev_resources, match, sizeof (dev_resources));
+		(void) strlcat(dev_resources, " ", sizeof (dev_resources));
+
+		for (rap = dtab.zone_dev_attrp; rap != NULL;
+		    rap = rap->zone_res_attr_next)
+			set_zonecfg_env(RSRC_DEV, match,
+			    rap->zone_res_attr_name, rap->zone_res_attr_value);
+	}
+
+	(void) zonecfg_enddevent(snap_hndl);
+
+	if ((res = zonecfg_setattrent(snap_hndl)) != Z_OK)
+		goto done;
+
+	while (zonecfg_getattrent(snap_hndl, &atab) == Z_OK) {
+		set_zonecfg_env("attr", NULL, atab.zone_attr_name,
+		    atab.zone_attr_value);
+	}
+
+	(void) zonecfg_endattrent(snap_hndl);
+
+	if (debug)
+		(void) setenv("_ZONEADMD_brand_debug", "1", 1);
+	else
+		(void) setenv("_ZONEADMD_brand_debug", "", 1);
+
+	res = Z_OK;
+
+done:
+	return (res);
+}
+
+/* Free a nwif entry's attribute list and clear its head pointer. */
+void
+nwifent_free_attrs(struct zone_nwiftab *np)
+{
+	struct zone_res_attrtab *rap, *next;
+
+	for (rap = np->zone_nwif_attrp; rap != NULL; rap = next) {
+		next = rap->zone_res_attr_next;	/* read before free */
+		free(rap);
+	}
+	np->zone_nwif_attrp = NULL;	/* repeat calls become no-ops */
+}
+
+/*
  * If retstr is not NULL, the output of the subproc is returned in the str,
  * otherwise it is output using zerror().  Any memory allocated for retstr
  * should be freed by the caller.
  */
 int
-do_subproc(zlog_t *zlogp, char *cmdbuf, char **retstr)
+do_subproc(zlog_t *zlogp, char *cmdbuf, char **retstr, boolean_t debug)
 {
 	char buf[1024];		/* arbitrary large amount */
 	char *inbuf;
@@ -763,6 +920,11 @@ do_subproc(zlog_t *zlogp, char *cmdbuf, char **retstr)
 		inbuf = buf;
 	}
 
+	if (setup_subproc_env(debug) != Z_OK) {
+		zerror(zlogp, B_FALSE, "failed to setup environment");
+		return (-1);
+	}
+
 	file = popen(cmdbuf, "r");
 	if (file == NULL) {
 		zerror(zlogp, B_TRUE, "could not launch: %s", cmdbuf);
@@ -771,8 +933,13 @@ do_subproc(zlog_t *zlogp, char *cmdbuf, char **retstr)
 
 	while (fgets(inbuf, 1024, file) != NULL) {
 		if (retstr == NULL) {
-			if (zlogp != &logsys)
+			if (zlogp != &logsys) {
+				int last = strlen(inbuf) - 1;
+
+				if (inbuf[last] == '\n')
+					inbuf[last] = '\0';
 				zerror(zlogp, B_FALSE, "%s", inbuf);
+			}
 		} else {
 			char *p;
 
@@ -802,24 +969,91 @@ do_subproc(zlog_t *zlogp, char *cmdbuf, char **retstr)
 	return (WEXITSTATUS(status));
 }
 
+/*
+ * Determine the path of this zone's init(1M) (or equivalent) process.  A
+ * zone-specific "init-name" attr takes precedence over the brand's setting.
+ */
 static int
-zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate)
+get_initname(brand_handle_t bh, char *initname, int len)
+{
+	struct zone_attrtab attr;
+
+	bzero(&attr, sizeof (attr));
+	(void) strlcpy(attr.zone_attr_name, "init-name",
+	    sizeof (attr.zone_attr_name));
+
+	/* No per-zone override was found, so defer to the brand. */
+	if (zonecfg_lookup_attr(snap_hndl, &attr) != Z_OK)
+		return (brand_get_initname(bh, initname, len));
+
+	(void) strlcpy(initname, attr.zone_attr_value, len);
+	return (0);
+}
+
+/*
+ * Report whether this zone's init(1M) (or equivalent) is to be restarted.  A
+ * zone-specific "restart-init" attr wins: the value "false" yields B_FALSE
+ * and any other value yields B_TRUE.  Without that attr, ask the brand.
+ */
+static boolean_t
+restartinit(brand_handle_t bh)
+{
+	struct zone_attrtab attr;
+
+	bzero(&attr, sizeof (attr));
+	(void) strlcpy(attr.zone_attr_name, "restart-init",
+	    sizeof (attr.zone_attr_name));
+
+	if (zonecfg_lookup_attr(snap_hndl, &attr) != Z_OK)
+		return (brand_restartinit(bh));
+
+	if (strcmp(attr.zone_attr_value, "false") == 0)
+		return (B_FALSE);
+	return (B_TRUE);
+}
+
+/*
+ * Report whether the zone's "app-svc-dependent" attr is set to "true"; a
+ * missing attr or any other value yields B_FALSE.  The flag selects the type
+ * of contract created for the zone's init: when set, CT_PR_EV_EXIT is part of
+ * the contract's fatal set, so the exit of any service in that contract also
+ * terminates the init application.  The brand handle is not consulted.
+ */
+static boolean_t
+is_app_svc_dep(brand_handle_t bh)
+{
+	struct zone_attrtab attr;
+
+	bzero(&attr, sizeof (attr));
+	(void) strlcpy(attr.zone_attr_name, "app-svc-dependent",
+	    sizeof (attr.zone_attr_name));
+
+	if (zonecfg_lookup_attr(snap_hndl, &attr) != Z_OK)
+		return (B_FALSE);
+	if (strcmp(attr.zone_attr_value, "true") != 0)
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+static int
+zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate, boolean_t debug)
 {
 	zoneid_t zoneid;
 	struct stat st;
-	char zpath[MAXPATHLEN], initpath[MAXPATHLEN], init_file[MAXPATHLEN];
+	char rpath[MAXPATHLEN], initpath[MAXPATHLEN], init_file[MAXPATHLEN];
 	char nbootargs[BOOTARGS_MAX];
 	char cmdbuf[MAXPATHLEN];
 	fs_callback_t cb;
 	brand_handle_t bh;
 	zone_iptype_t iptype;
-	boolean_t links_loaded = B_FALSE;
 	dladm_status_t status;
 	char errmsg[DLADM_STRSIZE];
 	int err;
 	boolean_t restart_init;
+	boolean_t app_svc_dep;
 
-	if (brand_prestatechg(zlogp, zstate, Z_BOOT) != 0)
+	if (brand_prestatechg(zlogp, zstate, Z_BOOT, debug) != 0)
 		return (-1);
 
 	if ((zoneid = getzoneidbyname(zone_name)) == -1) {
@@ -852,13 +1086,8 @@ zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate)
 	/*
 	 * Get the brand's boot callback if it exists.
 	 */
-	if (zone_get_zonepath(zone_name, zpath, sizeof (zpath)) != Z_OK) {
-		zerror(zlogp, B_FALSE, "unable to determine zone path");
-		brand_close(bh);
-		goto bad;
-	}
 	(void) strcpy(cmdbuf, EXEC_PREFIX);
-	if (brand_get_boot(bh, zone_name, zpath, cmdbuf + EXEC_LEN,
+	if (brand_get_boot(bh, zone_name, zonepath, cmdbuf + EXEC_LEN,
 	    sizeof (cmdbuf) - EXEC_LEN) != 0) {
 		zerror(zlogp, B_FALSE,
 		    "unable to determine branded zone's boot callback");
@@ -867,41 +1096,49 @@ zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate)
 	}
 
 	/* Get the path for this zone's init(1M) (or equivalent) process.  */
-	if (brand_get_initname(bh, init_file, MAXPATHLEN) != 0) {
+	if (get_initname(bh, init_file, MAXPATHLEN) != 0) {
 		zerror(zlogp, B_FALSE,
 		    "unable to determine zone's init(1M) location");
 		brand_close(bh);
 		goto bad;
 	}
 
-	/* See if this zone's brand should restart init if it dies. */
-	restart_init = brand_restartinit(bh);
+	/* See if we should restart init if it dies. */
+	restart_init = restartinit(bh);
+
+	/*
+	 * See if we need to setup contract dependencies between the zone's
+	 * primary application and any of its services.
+	 */
+	app_svc_dep = is_app_svc_dep(bh);
 
 	brand_close(bh);
 
-	err = filter_bootargs(zlogp, bootargs, nbootargs, init_file,
-	    bad_boot_arg);
-	if (err == Z_INVAL)
-		eventstream_write(Z_EVT_ZONE_BADARGS);
-	else if (err != Z_OK)
+	err = filter_bootargs(zlogp, bootargs, nbootargs, init_file);
+	if (err != Z_OK)
 		goto bad;
 
 	assert(init_file[0] != '\0');
 
-	/* Try to anticipate possible problems: Make sure init is executable. */
-	if (zone_get_rootpath(zone_name, zpath, sizeof (zpath)) != Z_OK) {
+	/*
+	 * Try to anticipate possible problems: If possible, make sure init is
+	 * executable.
+	 */
+	if (zone_get_rootpath(zone_name, rpath, sizeof (rpath)) != Z_OK) {
 		zerror(zlogp, B_FALSE, "unable to determine zone root");
 		goto bad;
 	}
 
-	(void) snprintf(initpath, sizeof (initpath), "%s%s", zpath, init_file);
+	(void) snprintf(initpath, sizeof (initpath), "%s%s", rpath, init_file);
 
-	if (stat(initpath, &st) == -1) {
+	if (lstat(initpath, &st) == -1) {
 		zerror(zlogp, B_TRUE, "could not stat %s", initpath);
 		goto bad;
 	}
 
-	if ((st.st_mode & S_IXUSR) == 0) {
+	if ((st.st_mode & S_IFMT) == S_IFLNK) {
+		/* symlink, we'll have to wait and resolve when we boot */
+	} else if ((st.st_mode & S_IXUSR) == 0) {
 		zerror(zlogp, B_FALSE, "%s is not executable", initpath);
 		goto bad;
 	}
@@ -919,7 +1156,6 @@ zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate)
 			    " %s", dladm_status2str(status, errmsg));
 			goto bad;
 		}
-		links_loaded = B_TRUE;
 	}
 
 	/*
@@ -928,7 +1164,7 @@ zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate)
 	 * is booted.
 	 */
 	if ((strlen(cmdbuf) > EXEC_LEN) &&
-	    (do_subproc(zlogp, cmdbuf, NULL) != Z_OK)) {
+	    (do_subproc(zlogp, cmdbuf, NULL, debug) != Z_OK)) {
 		zerror(zlogp, B_FALSE, "%s failed", cmdbuf);
 		goto bad;
 	}
@@ -949,6 +1185,12 @@ zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate)
 		goto bad;
 	}
 
+	if (app_svc_dep && zone_setattr(zoneid, ZONE_ATTR_APP_SVC_CT,
+	    (void *)B_TRUE, sizeof (boolean_t)) == -1) {
+		zerror(zlogp, B_TRUE, "could not set zone app-die");
+		goto bad;
+	}
+
 	/*
 	 * Inform zonestatd of a new zone so that it can install a door for
 	 * the zone to contact it.
@@ -960,9 +1202,15 @@ zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate)
 		goto bad;
 	}
 
-	if (brand_poststatechg(zlogp, zstate, Z_BOOT) != 0)
+	if (brand_poststatechg(zlogp, zstate, Z_BOOT, debug) != 0)
 		goto bad;
 
+	/* Startup a thread to perform zfd logging/tty svc for the zone. */
+	create_log_thread(zlogp, zone_id);
+
+	/* Startup a thread to perform memory capping for the zone. */
+	create_mcap_thread(zlogp, zone_id);
+
 	return (0);
 
 bad:
@@ -970,32 +1218,42 @@ bad:
 	 * If something goes wrong, we up the zones's state to the target
 	 * state, RUNNING, and then invoke the hook as if we're halting.
 	 */
-	(void) brand_poststatechg(zlogp, ZONE_STATE_RUNNING, Z_HALT);
-	if (links_loaded)
-		(void) dladm_zone_halt(dld_handle, zoneid);
+	(void) brand_poststatechg(zlogp, ZONE_STATE_RUNNING, Z_HALT, debug);
+
 	return (-1);
 }
 
 static int
-zone_halt(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting, int zstate)
+zone_halt(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting, int zstate,
+    boolean_t debug)
 {
 	int err;
 
-	if (brand_prestatechg(zlogp, zstate, Z_HALT) != 0)
+	if (brand_prestatechg(zlogp, zstate, Z_HALT, debug) != 0)
 		return (-1);
 
-	if (vplat_teardown(zlogp, unmount_cmd, rebooting) != 0) {
+	/* Shutting down, stop the memcap thread */
+	destroy_mcap_thread();
+
+	if (vplat_teardown(zlogp, unmount_cmd, rebooting, debug) != 0) {
 		if (!bringup_failure_recovery)
 			zerror(zlogp, B_FALSE, "unable to destroy zone");
+		destroy_log_thread();
 		return (-1);
 	}
 
+	/* Shut down is done, stop the log thread */
+	destroy_log_thread();
+
+	if (brand_poststatechg(zlogp, zstate, Z_HALT, debug) != 0)
+		return (-1);
+
 	if ((err = zonecfg_destroy_snapshot(zone_name)) != Z_OK)
 		zerror(zlogp, B_FALSE, "destroying snapshot: %s",
 		    zonecfg_strerror(err));
 
-	if (brand_poststatechg(zlogp, zstate, Z_HALT) != 0)
-		return (-1);
+	zonecfg_fini_handle(snap_hndl);
+	snap_hndl = NULL;
 
 	return (0);
 }
@@ -1007,7 +1265,6 @@ zone_graceful_shutdown(zlog_t *zlogp)
 	pid_t child;
 	char cmdbuf[MAXPATHLEN];
 	brand_handle_t bh = NULL;
-	char zpath[MAXPATHLEN];
 	ctid_t ct;
 	int tmpl_fd;
 	int child_status;
@@ -1028,18 +1285,12 @@ zone_graceful_shutdown(zlog_t *zlogp)
 		return (-1);
 	}
 
-	if (zone_get_zonepath(zone_name, zpath, sizeof (zpath)) != Z_OK) {
-		zerror(zlogp, B_FALSE, "unable to determine zone path");
-		brand_close(bh);
-		return (-1);
-	}
-
 	/*
 	 * If there is a brand 'shutdown' callback, execute it now to give the
 	 * brand a chance to cleanup any custom configuration.
 	 */
 	(void) strcpy(cmdbuf, EXEC_PREFIX);
-	if (brand_get_shutdown(bh, zone_name, zpath, cmdbuf + EXEC_LEN,
+	if (brand_get_shutdown(bh, zone_name, zonepath, cmdbuf + EXEC_LEN,
 	    sizeof (cmdbuf) - EXEC_LEN) != 0 || strlen(cmdbuf) <= EXEC_LEN) {
 		(void) strcat(cmdbuf, SHUTDOWN_DEFAULT);
 	}
@@ -1177,6 +1428,36 @@ audit_put_record(zlog_t *zlogp, ucred_t *uc, int return_val,
 }
 
 /*
+ * Log the exit time and status of the zone's init process into
+ * {zonepath}/lastexited. If the zone shut down normally, the exit status will
+ * be -1, otherwise it will be the exit status as described in wait.3c.
+ * If the zone is configured to restart init, then nothing will be logged if
+ * init exits unexpectedly (the kernel will never upcall in this case).
+ */
+static void
+log_init_exit(int status)
+{
+	char p[MAXPATHLEN];
+	char buf[128];
+	struct timeval t;
+	int fd;
+
+	/* snprintf() returning >= the buffer size means the text was cut. */
+	if (snprintf(p, sizeof (p), "%s/lastexited", zonepath) >= sizeof (p))
+		return;
+	if (gettimeofday(&t, NULL) != 0)
+		return;
+	/* Zero-pad microseconds so sec.usec reads as a decimal timestamp. */
+	if (snprintf(buf, sizeof (buf), "%ld.%06ld %d\n", t.tv_sec, t.tv_usec,
+	    status) >= sizeof (buf))
+		return;
+	if ((fd = open(p, O_WRONLY | O_CREAT | O_TRUNC, 0644)) < 0)
+		return;
+	(void) write(fd, buf, strlen(buf));	/* best effort; errors ignored */
+	(void) close(fd);
+}
+
+/*
  * The main routine for the door server that deals with zone state transitions.
  */
 /* ARGSUSED */
@@ -1189,9 +1470,11 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
 
 	zone_state_t zstate;
 	zone_cmd_t cmd;
+	boolean_t debug;
+	int init_status;
 	zone_cmd_arg_t *zargp;
 
-	boolean_t kernelcall;
+	boolean_t kernelcall = B_TRUE;
 
 	int rval = -1;
 	uint64_t uniqid;
@@ -1241,6 +1524,8 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
 		goto out;
 	}
 	cmd = zargp->cmd;
+	debug = zargp->debug;
+	init_status = zargp->status;
 
 	if (door_ucred(&uc) != 0) {
 		zerror(&logsys, B_TRUE, "door_ucred");
@@ -1347,23 +1632,25 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
 	case ZONE_STATE_INSTALLED:
 		switch (cmd) {
 		case Z_READY:
-			rval = zone_ready(zlogp, Z_MNT_BOOT, zstate);
+			rval = zone_ready(zlogp, Z_MNT_BOOT, zstate, debug);
 			if (rval == 0)
 				eventstream_write(Z_EVT_ZONE_READIED);
+			zcons_statechanged();
 			break;
 		case Z_BOOT:
 		case Z_FORCEBOOT:
 			eventstream_write(Z_EVT_ZONE_BOOTING);
-			if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate))
-			    == 0) {
+			if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate,
+			    debug)) == 0) {
 				rval = zone_bootup(zlogp, zargp->bootbuf,
-				    zstate);
+				    zstate, debug);
 			}
 			audit_put_record(zlogp, uc, rval, "boot");
+			zcons_statechanged();
 			if (rval != 0) {
 				bringup_failure_recovery = B_TRUE;
 				(void) zone_halt(zlogp, B_FALSE, B_FALSE,
-				    zstate);
+				    zstate, debug);
 				eventstream_write(Z_EVT_ZONE_BOOTFAILED);
 			}
 			break;
@@ -1415,7 +1702,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
 
 			rval = zone_ready(zlogp,
 			    strcmp(zargp->bootbuf, "-U") == 0 ?
-			    Z_MNT_UPDATE : Z_MNT_SCRATCH, zstate);
+			    Z_MNT_UPDATE : Z_MNT_SCRATCH, zstate, debug);
 			if (rval != 0)
 				break;
 
@@ -1477,15 +1764,18 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
 			rval = 0;
 			break;
 		case Z_BOOT:
+		case Z_FORCEBOOT:
 			(void) strlcpy(boot_args, zargp->bootbuf,
 			    sizeof (boot_args));
 			eventstream_write(Z_EVT_ZONE_BOOTING);
-			rval = zone_bootup(zlogp, zargp->bootbuf, zstate);
+			rval = zone_bootup(zlogp, zargp->bootbuf, zstate,
+			    debug);
 			audit_put_record(zlogp, uc, rval, "boot");
+			zcons_statechanged();
 			if (rval != 0) {
 				bringup_failure_recovery = B_TRUE;
 				(void) zone_halt(zlogp, B_FALSE, B_TRUE,
-				    zstate);
+				    zstate, debug);
 				eventstream_write(Z_EVT_ZONE_BOOTFAILED);
 			}
 			boot_args[0] = '\0';
@@ -1493,15 +1783,17 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
 		case Z_HALT:
 			if (kernelcall)	/* Invalid; can't happen */
 				abort();
-			if ((rval = zone_halt(zlogp, B_FALSE, B_FALSE, zstate))
-			    != 0)
+			if ((rval = zone_halt(zlogp, B_FALSE, B_FALSE, zstate,
+			    debug)) != 0)
 				break;
+			zcons_statechanged();
 			eventstream_write(Z_EVT_ZONE_HALTED);
 			break;
 		case Z_SHUTDOWN:
 		case Z_REBOOT:
 		case Z_NOTE_UNINSTALLING:
 		case Z_MOUNT:
+		case Z_FORCEMOUNT:
 		case Z_UNMOUNT:
 			if (kernelcall)	/* Invalid; can't happen */
 				abort();
@@ -1518,7 +1810,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
 		case Z_UNMOUNT:
 			if (kernelcall)	/* Invalid; can't happen */
 				abort();
-			rval = zone_halt(zlogp, B_TRUE, B_FALSE, zstate);
+			rval = zone_halt(zlogp, B_TRUE, B_FALSE, zstate, debug);
 			if (rval == 0) {
 				eventstream_write(Z_EVT_ZONE_HALTED);
 				(void) sema_post(&scratch_sem);
@@ -1540,15 +1832,18 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
 	case ZONE_STATE_DOWN:
 		switch (cmd) {
 		case Z_READY:
-			if ((rval = zone_halt(zlogp, B_FALSE, B_TRUE, zstate))
-			    != 0)
+			if ((rval = zone_halt(zlogp, B_FALSE, B_TRUE, zstate,
+			    debug)) != 0)
 				break;
-			if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate)) == 0)
+			zcons_statechanged();
+			if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate,
+			    debug)) == 0)
 				eventstream_write(Z_EVT_ZONE_READIED);
 			else
 				eventstream_write(Z_EVT_ZONE_HALTED);
 			break;
 		case Z_BOOT:
+		case Z_FORCEBOOT:
 			/*
 			 * We could have two clients racing to boot this
 			 * zone; the second client loses, but his request
@@ -1559,32 +1854,40 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
 			rval = 0;
 			break;
 		case Z_HALT:
-			if ((rval = zone_halt(zlogp, B_FALSE, B_FALSE, zstate))
-			    != 0)
+			if (kernelcall) {
+				log_init_exit(init_status);
+			} else {
+				log_init_exit(-1);
+			}
+			if ((rval = zone_halt(zlogp, B_FALSE, B_FALSE, zstate,
+			    debug)) != 0)
 				break;
 			eventstream_write(Z_EVT_ZONE_HALTED);
+			zcons_statechanged();
 			break;
 		case Z_REBOOT:
 			(void) strlcpy(boot_args, zargp->bootbuf,
 			    sizeof (boot_args));
 			eventstream_write(Z_EVT_ZONE_REBOOTING);
-			if ((rval = zone_halt(zlogp, B_FALSE, B_TRUE, zstate))
-			    != 0) {
+			if ((rval = zone_halt(zlogp, B_FALSE, B_TRUE, zstate,
+			    debug)) != 0) {
 				eventstream_write(Z_EVT_ZONE_BOOTFAILED);
 				boot_args[0] = '\0';
 				break;
 			}
-			if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate))
-			    != 0) {
+			zcons_statechanged();
+			if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate,
+			    debug)) != 0) {
 				eventstream_write(Z_EVT_ZONE_BOOTFAILED);
 				boot_args[0] = '\0';
 				break;
 			}
-			rval = zone_bootup(zlogp, zargp->bootbuf, zstate);
+			rval = zone_bootup(zlogp, zargp->bootbuf, zstate,
+			    debug);
 			audit_put_record(zlogp, uc, rval, "reboot");
 			if (rval != 0) {
 				(void) zone_halt(zlogp, B_FALSE, B_TRUE,
-				    zstate);
+				    zstate, debug);
 				eventstream_write(Z_EVT_ZONE_BOOTFAILED);
 			}
 			boot_args[0] = '\0';
@@ -1596,6 +1899,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
 			break;
 		case Z_NOTE_UNINSTALLING:
 		case Z_MOUNT:
+		case Z_FORCEMOUNT:
 		case Z_UNMOUNT:
 			zerror(zlogp, B_FALSE, "%s operation is invalid "
 			    "for zones in state '%s'", z_cmd_name(cmd),
@@ -1759,11 +2063,38 @@ top:
 		 * state.
 		 */
 		if (zstate > ZONE_STATE_INSTALLED) {
+			static zoneid_t zid;
+
 			zerror(zlogp, B_FALSE,
 			    "zone '%s': WARNING: zone is in state '%s', but "
 			    "zoneadmd does not appear to be available; "
 			    "restarted zoneadmd to recover.",
 			    zone_name, zone_state_str(zstate));
+
+			/*
+			 * Startup a thread to perform the zfd logging/tty svc
+			 * and a thread to perform memory capping for the
+			 * zone. zlogp won't be valid for much longer so use
+			 * logsys.
+			 */
+			if ((zid = getzoneidbyname(zone_name)) != -1) {
+				create_log_thread(&logsys, zid);
+				create_mcap_thread(&logsys, zid);
+			}
+
+			/* recover the global configuration snapshot */
+			if (snap_hndl == NULL) {
+				if ((snap_hndl = zonecfg_init_handle())
+				    == NULL ||
+				    zonecfg_create_snapshot(zone_name)
+				    != Z_OK ||
+				    zonecfg_get_snapshot_handle(zone_name,
+				    snap_hndl) != Z_OK) {
+					zerror(zlogp, B_FALSE, "recovering "
+					    "zone configuration handle");
+					goto out;
+				}
+			}
 		}
 
 		(void) fdetach(zone_door_path);
@@ -1777,21 +2108,62 @@ out:
 }
 
 /*
- * Setup the brand's pre and post state change callbacks, as well as the
- * query callback, if any of these exist.
+ * Run the query hook with the 'env' parameter.  It should return a
+ * string of tab-delimited key-value pairs, each of which should be set
+ * in the environment.
+ *
+ * Because the env_vars string values become part of the environment, the
+ * string is static and we don't free it.
+ *
+ * This function is always called before zoneadmd forks and makes itself
+ * exclusive, so it is possible there could be more than one instance of
+ * zoneadmd running in parallel at this point. Thus, we have no zonecfg
+ * snapshot and shouldn't take one yet (i.e. snap_hndl is NULL). That's ok,
+ * since we don't need any zonecfg info to query for a brand-specific env value.
  */
 static int
-brand_callback_init(brand_handle_t bh, char *zone_name)
+set_brand_env(zlog_t *zlogp)
 {
-	char zpath[MAXPATHLEN];
+	static char *env_vars = NULL;
+	char buf[2 * MAXPATHLEN];
+
+	/* Nothing to do without a query hook, or once the env is loaded. */
+	if (query_hook[0] == '\0' || env_vars != NULL)
+		return (0);
 
-	if (zone_get_zonepath(zone_name, zpath, sizeof (zpath)) != Z_OK)
+	/* snprintf() returning >= the buffer size means truncation. */
+	if (snprintf(buf, sizeof (buf), "%s env", query_hook) >= sizeof (buf))
 		return (-1);
 
+	/* The hook's output is returned to us through env_vars. */
+	if (do_subproc(zlogp, buf, &env_vars, B_FALSE) != 0)
+		return (-1);
+
+	if (env_vars != NULL) {
+		char *sp, *last;
+
+		/* putenv() stores sp itself, so env_vars is never freed. */
+		for (sp = strtok_r(env_vars, "\t", &last); sp != NULL;
+		    sp = strtok_r(NULL, "\t", &last)) {
+			if (putenv(sp) != 0)
+				return (-1);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Setup the brand's pre and post state change callbacks, as well as the
+ * query callback, if any of these exist.
+ */
+static int
+brand_callback_init(brand_handle_t bh, char *zone_name)
+{
 	(void) strlcpy(pre_statechg_hook, EXEC_PREFIX,
 	    sizeof (pre_statechg_hook));
 
-	if (brand_get_prestatechange(bh, zone_name, zpath,
+	if (brand_get_prestatechange(bh, zone_name, zonepath,
 	    pre_statechg_hook + EXEC_LEN,
 	    sizeof (pre_statechg_hook) - EXEC_LEN) != 0)
 		return (-1);
@@ -1802,7 +2174,7 @@ brand_callback_init(brand_handle_t bh, char *zone_name)
 	(void) strlcpy(post_statechg_hook, EXEC_PREFIX,
 	    sizeof (post_statechg_hook));
 
-	if (brand_get_poststatechange(bh, zone_name, zpath,
+	if (brand_get_poststatechange(bh, zone_name, zonepath,
 	    post_statechg_hook + EXEC_LEN,
 	    sizeof (post_statechg_hook) - EXEC_LEN) != 0)
 		return (-1);
@@ -1813,7 +2185,7 @@ brand_callback_init(brand_handle_t bh, char *zone_name)
 	(void) strlcpy(query_hook, EXEC_PREFIX,
 	    sizeof (query_hook));
 
-	if (brand_get_query(bh, zone_name, zpath, query_hook + EXEC_LEN,
+	if (brand_get_query(bh, zone_name, zonepath, query_hook + EXEC_LEN,
 	    sizeof (query_hook) - EXEC_LEN) != 0)
 		return (-1);
 
@@ -1941,6 +2313,11 @@ main(int argc, char *argv[])
 		return (1);
 	}
 
+	if (zone_get_zonepath(zone_name, zonepath, sizeof (zonepath)) != Z_OK) {
+		zerror(zlogp, B_FALSE, "unable to determine zone path");
+		return (-1);
+	}
+
 	if (zonecfg_default_brand(default_brand,
 	    sizeof (default_brand)) != Z_OK) {
 		zerror(zlogp, B_FALSE, "unable to determine default brand");
@@ -2012,6 +2389,11 @@ main(int argc, char *argv[])
 	}
 	priv_freeset(privset);
 
+	if (set_brand_env(zlogp) != 0) {
+		zerror(zlogp, B_FALSE, "Unable to setup brand's environment");
+		return (1);
+	}
+
 	if (mkzonedir(zlogp) != 0)
 		return (1);
 
diff --git a/usr/src/cmd/zoneadmd/zoneadmd.h b/usr/src/cmd/zoneadmd/zoneadmd.h
index d784a303b3..7e5dcea432 100644
--- a/usr/src/cmd/zoneadmd/zoneadmd.h
+++ b/usr/src/cmd/zoneadmd/zoneadmd.h
@@ -22,6 +22,7 @@
 /*
  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2014, Joyent, Inc. All rights reserved.
  */
 
 #ifndef	_ZONEADMD_H
@@ -90,17 +91,19 @@ extern mutex_t msglock;
 extern boolean_t in_death_throes;
 extern boolean_t bringup_failure_recovery;
 extern char *zone_name;
+extern char zonepath[MAXNAMELEN];
+extern zone_dochandle_t snap_hndl;
 extern char pool_name[MAXNAMELEN];
 extern char brand_name[MAXNAMELEN];
 extern char default_brand[MAXNAMELEN];
 extern char boot_args[BOOTARGS_MAX];
-extern char bad_boot_arg[BOOTARGS_MAX];
 extern boolean_t zone_isnative;
 extern boolean_t zone_iscluster;
 extern dladm_handle_t dld_handle;
 
 extern void zerror(zlog_t *, boolean_t, const char *, ...);
 extern char *localize_msg(char *locale, const char *msg);
+extern void nwifent_free_attrs(struct zone_nwiftab *);
 
 /*
  * Eventstream interfaces.
@@ -112,8 +115,7 @@ typedef enum {
 	Z_EVT_ZONE_HALTED,
 	Z_EVT_ZONE_READIED,
 	Z_EVT_ZONE_UNINSTALLING,
-	Z_EVT_ZONE_BOOTFAILED,
-	Z_EVT_ZONE_BADARGS
+	Z_EVT_ZONE_BOOTFAILED
 } zone_evt_t;
 
 extern int eventstream_init();
@@ -135,9 +137,9 @@ typedef enum {
 /*
  * Virtual platform interfaces.
  */
-extern zoneid_t vplat_create(zlog_t *, zone_mnt_t);
+extern zoneid_t vplat_create(zlog_t *, zone_mnt_t, zoneid_t);
 extern int vplat_bringup(zlog_t *, zone_mnt_t, zoneid_t);
-extern int vplat_teardown(zlog_t *, boolean_t, boolean_t);
+extern int vplat_teardown(zlog_t *, boolean_t, boolean_t, boolean_t);
 extern int vplat_get_iptype(zlog_t *, zone_iptype_t *);
 
 /*
@@ -154,6 +156,19 @@ extern void resolve_lofs(zlog_t *zlogp, char *path, size_t pathlen);
  */
 extern int init_console(zlog_t *);
 extern void serve_console(zlog_t *);
+extern void zcons_statechanged();
+
+/*
+ * Memory capping thread creation.
+ */
+extern void create_mcap_thread(zlog_t *, zoneid_t);
+extern void destroy_mcap_thread();
+
+/*
+ * Zone FD log thread creation.
+ */
+extern void create_log_thread(zlog_t *, zoneid_t);
+extern void destroy_log_thread();
 
 /*
  * Contract handling.
@@ -163,7 +178,7 @@ extern int init_template(void);
 /*
  * Routine to manage child processes.
  */
-extern int do_subproc(zlog_t *, char *, char **);
+extern int do_subproc(zlog_t *, char *, char **, boolean_t);
 
 #ifdef __cplusplus
 }