diff options
Diffstat (limited to 'usr/src/uts/common/fs')
72 files changed, 11599 insertions, 467 deletions
diff --git a/usr/src/uts/common/fs/dev/sdev_netops.c b/usr/src/uts/common/fs/dev/sdev_netops.c index a426eeaf10..ce08e3697b 100644 --- a/usr/src/uts/common/fs/dev/sdev_netops.c +++ b/usr/src/uts/common/fs/dev/sdev_netops.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright (c) 2018, Joyent, Inc. All rights reserved. */ /* @@ -41,8 +42,102 @@ #include <sys/zone.h> #include <sys/dls.h> +static const char *devnet_zpath = "/dev/net/zone/"; struct vnodeops *devnet_vnodeops; +static zoneid_t +devnet_nodetozone(sdev_node_t *dv) +{ + char *zname = NULL, *dup; + zone_t *zone; + int duplen; + zoneid_t zid; + + /* + * If in a non-global zone, always return it's zid no matter what the + * node is. + */ + zid = getzoneid(); + if (zid != GLOBAL_ZONEID) + return (zid); + + /* + * If it doesn't have /dev/net/zone/ then it can't be a specific zone + * we're targetting. + */ + if (strncmp(devnet_zpath, dv->sdev_path, strlen(devnet_zpath)) != 0) + return (GLOBAL_ZONEID); + + if (dv->sdev_vnode->v_type == VDIR) { + zone = zone_find_by_name(dv->sdev_name); + } else { + /* Non directories have the form /dev/net/zone/%z/%s */ + dup = strdup(dv->sdev_path); + duplen = strlen(dup); + zname = strrchr(dup, '/'); + *zname = '\0'; + zname--; + zname = strrchr(dup, '/'); + zname++; + zone = zone_find_by_name(zname); + kmem_free(dup, duplen + 1); + } + if (zone == NULL) + return (GLOBAL_ZONEID); + zid = zone->zone_id; + zone_rele(zone); + return (zid); +} + +static int +devnet_mkdir(struct sdev_node *ddv, char *name) +{ + sdev_node_t *dv; + struct vattr va; + int ret; + + ASSERT(RW_WRITE_HELD(&ddv->sdev_contents)); + dv = sdev_cache_lookup(ddv, name); + if (dv != NULL) { + SDEV_SIMPLE_RELE(dv); + return (EEXIST); + } + + va = *sdev_getdefault_attr(VDIR); + gethrestime(&va.va_atime); + va.va_mtime = va.va_atime; + va.va_ctime = va.va_atime; + + ret = sdev_mknode(ddv, name, &dv, &va, NULL, NULL, kcred, SDEV_READY); + 
if (ret != 0) + return (ret); + SDEV_SIMPLE_RELE(dv); + return (0); +} + +/* + * We basically need to walk down the directory path to determine what we should + * do. At the top level of /dev/net, only the directory /dev/net/zone is valid, + * and it is always valid. Following on that, /dev/net/zone/%zonename is valid + * if and only if we can look up that zone name. If it's not, or it's some other + * name, then it's SDEV_VTOR_INVALID. + */ +static int +devnet_dirvalidate(struct sdev_node *dv) +{ + zone_t *zonep; + char *path = "/dev/net/zone"; + + if (strcmp(path, dv->sdev_path) == 0) + return (SDEV_VTOR_VALID); + + zonep = zone_find_by_name(dv->sdev_name); + if (zonep == NULL) + return (SDEV_VTOR_INVALID); + zone_rele(zonep); + return (SDEV_VTOR_VALID); +} + /* * Check if a net sdev_node is still valid - i.e. it represents a current * network link. @@ -60,11 +155,20 @@ devnet_validate(struct sdev_node *dv) ASSERT(dv->sdev_state == SDEV_READY); - if (dls_mgmt_get_linkid(dv->sdev_name, &linkid) != 0) + if (dv->sdev_vnode->v_type == VDIR) + return (devnet_dirvalidate(dv)); + + if (strncmp(devnet_zpath, dv->sdev_path, strlen(devnet_zpath)) == 0) { + ASSERT(SDEV_IS_GLOBAL(dv)); + zoneid = devnet_nodetozone(dv); + } else { + zoneid = getzoneid(); + } + + if (dls_mgmt_get_linkid_in_zone(dv->sdev_name, &linkid, zoneid) != 0) return (SDEV_VTOR_INVALID); - if (SDEV_IS_GLOBAL(dv)) + if (zoneid == GLOBAL_ZONEID) return (SDEV_VTOR_VALID); - zoneid = getzoneid(); return (zone_check_datalink(&zoneid, linkid) == 0 ? SDEV_VTOR_VALID : SDEV_VTOR_INVALID); } @@ -74,13 +178,14 @@ devnet_validate(struct sdev_node *dv) * a net entry when the node is not found in the cache. 
*/ static int -devnet_create_rvp(const char *nm, struct vattr *vap, dls_dl_handle_t *ddhp) +devnet_create_rvp(const char *nm, struct vattr *vap, dls_dl_handle_t *ddhp, + zoneid_t zid) { timestruc_t now; dev_t dev; int error; - if ((error = dls_devnet_open(nm, ddhp, &dev)) != 0) { + if ((error = dls_devnet_open_in_zone(nm, ddhp, &dev, zid)) != 0) { sdcmn_err12(("devnet_create_rvp: not a valid vanity name " "network node: %s\n", nm)); return (error); @@ -116,6 +221,7 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, struct sdev_node *ddv = VTOSDEV(dvp); struct sdev_node *dv = NULL; dls_dl_handle_t ddh = NULL; + zone_t *zone; struct vattr vattr; int nmlen; int error = ENOENT; @@ -123,6 +229,9 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, if (SDEVTOV(ddv)->v_type != VDIR) return (ENOTDIR); + if (!SDEV_IS_GLOBAL(ddv) && crgetzoneid(cred) == GLOBAL_ZONEID) + return (EPERM); + /* * Empty name or ., return node itself. */ @@ -145,6 +254,12 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, rw_enter(&ddv->sdev_contents, RW_WRITER); /* + * ZOMBIED parent does not allow new node creation, bail out early. + */ + if (ddv->sdev_state == SDEV_ZOMBIE) + goto failed; + + /* * directory cache lookup: */ if ((dv = sdev_cache_lookup(ddv, nm)) != NULL) { @@ -153,13 +268,42 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, goto found; } + if (SDEV_IS_GLOBAL(ddv)) { + /* + * Check for /dev/net/zone + */ + if (strcmp("zone", nm) == 0 && strcmp("/dev/net", + ddv->sdev_path) == 0) { + (void) devnet_mkdir(ddv, nm); + dv = sdev_cache_lookup(ddv, nm); + ASSERT(dv != NULL); + goto found; + } + + /* + * Check for /dev/net/zone/%z. We can't use devnet_zpath due to + * its trailing slash. 
+ */ + if (strcmp("/dev/net/zone", ddv->sdev_path) == 0) { + zone = zone_find_by_name(nm); + if (zone == NULL) + goto failed; + (void) devnet_mkdir(ddv, nm); + zone_rele(zone); + dv = sdev_cache_lookup(ddv, nm); + ASSERT(dv != NULL); + goto found; + } + } else if (strcmp("/dev/net", ddv->sdev_path) != 0) { + goto failed; + } + /* - * ZOMBIED parent does not allow new node creation, bail out early. + * We didn't find what we were looking for. What that is depends a lot + * on what directory we're in. */ - if (ddv->sdev_state == SDEV_ZOMBIE) - goto failed; - error = devnet_create_rvp(nm, &vattr, &ddh); + error = devnet_create_rvp(nm, &vattr, &ddh, devnet_nodetozone(ddv)); if (error != 0) goto failed; @@ -219,7 +363,7 @@ devnet_filldir_datalink(datalink_id_t linkid, void *arg) if ((dv = sdev_cache_lookup(ddv, (char *)link)) != NULL) goto found; - if (devnet_create_rvp(link, &vattr, &ddh) != 0) + if (devnet_create_rvp(link, &vattr, &ddh, devnet_nodetozone(arg)) != 0) return (0); ASSERT(ddh != NULL); @@ -244,16 +388,77 @@ found: return (0); } +/* + * Fill in all the entries for the current zone. + */ static void -devnet_filldir(struct sdev_node *ddv) +devnet_fillzone(struct sdev_node *ddv, zoneid_t zid) { - sdev_node_t *dv, *next; datalink_id_t linkid; + ASSERT(RW_WRITE_HELD(&ddv->sdev_contents)); + if (zid == GLOBAL_ZONEID) { + ASSERT(SDEV_IS_GLOBAL(ddv)); + linkid = DATALINK_INVALID_LINKID; + do { + linkid = dls_mgmt_get_next(linkid, DATALINK_CLASS_ALL, + DATALINK_ANY_MEDIATYPE, DLMGMT_ACTIVE); + if (linkid != DATALINK_INVALID_LINKID) + (void) devnet_filldir_datalink(linkid, ddv); + } while (linkid != DATALINK_INVALID_LINKID); + } else { + (void) zone_datalink_walk(zid, devnet_filldir_datalink, ddv); + } +} + +/* + * Callback for zone_walk when filling up /dev/net/zone/... 
+ */ +static int +devnet_fillzdir_cb(zone_t *zonep, void *arg) +{ + sdev_node_t *ddv = arg; + + ASSERT(RW_WRITE_HELD(&ddv->sdev_contents)); + (void) devnet_mkdir(ddv, zonep->zone_name); + return (0); +} + +/* + * Fill in a directory that isn't the top level /dev/net. + */ +static void +devnet_fillzdir(struct sdev_node *ddv) +{ + zone_t *zonep; + char *path = "/dev/net/zone"; + + if (strcmp(path, ddv->sdev_path) == 0) { + (void) zone_walk(devnet_fillzdir_cb, ddv); + return; + } + + zonep = zone_find_by_name(ddv->sdev_name); + if (zonep == NULL) + return; + devnet_fillzone(ddv, zonep->zone_id); + zone_rele(zonep); +} + +static void +devnet_filldir(struct sdev_node *ddv) +{ + int ret; + sdev_node_t *dv, *next; + ASSERT(RW_READ_HELD(&ddv->sdev_contents)); if (rw_tryupgrade(&ddv->sdev_contents) == 0) { rw_exit(&ddv->sdev_contents); rw_enter(&ddv->sdev_contents, RW_WRITER); + if (ddv->sdev_state == SDEV_ZOMBIE) { + rw_exit(&ddv->sdev_contents); + return; + } } for (dv = SDEV_FIRST_ENTRY(ddv); dv; dv = next) { @@ -276,31 +481,38 @@ devnet_filldir(struct sdev_node *ddv) if (SDEVTOV(dv)->v_count > 0) continue; + SDEV_HOLD(dv); + + /* + * Clean out everything underneath before we remove ourselves. 
+ */ + if (SDEVTOV(dv)->v_type == VDIR) { + ret = sdev_cleandir(dv, NULL, 0); + ASSERT(ret == 0); + } /* remove the cache node */ (void) sdev_cache_update(ddv, &dv, dv->sdev_name, SDEV_CACHE_DELETE); SDEV_RELE(dv); } + if (strcmp(ddv->sdev_path, "/dev/net") != 0) { + devnet_fillzdir(ddv); + goto done; + } + if (((ddv->sdev_flags & SDEV_BUILD) == 0) && !dls_devnet_rebuild()) goto done; if (SDEV_IS_GLOBAL(ddv)) { - linkid = DATALINK_INVALID_LINKID; - do { - linkid = dls_mgmt_get_next(linkid, DATALINK_CLASS_ALL, - DATALINK_ANY_MEDIATYPE, DLMGMT_ACTIVE); - if (linkid != DATALINK_INVALID_LINKID) - (void) devnet_filldir_datalink(linkid, ddv); - } while (linkid != DATALINK_INVALID_LINKID); + devnet_fillzone(ddv, GLOBAL_ZONEID); + (void) devnet_mkdir(ddv, "zone"); } else { - (void) zone_datalink_walk(getzoneid(), - devnet_filldir_datalink, ddv); + devnet_fillzone(ddv, getzoneid()); } ddv->sdev_flags &= ~SDEV_BUILD; - done: rw_downgrade(&ddv->sdev_contents); } @@ -319,6 +531,9 @@ devnet_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, ASSERT(sdvp); + if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp)) + return (EPERM); + if (uiop->uio_offset == 0) devnet_filldir(sdvp); diff --git a/usr/src/uts/common/fs/dev/sdev_vnops.c b/usr/src/uts/common/fs/dev/sdev_vnops.c index 8fe926f6fb..5a00242482 100644 --- a/usr/src/uts/common/fs/dev/sdev_vnops.c +++ b/usr/src/uts/common/fs/dev/sdev_vnops.c @@ -894,6 +894,9 @@ sdev_remove(struct vnode *dvp, char *nm, struct cred *cred, } } + if (error == 0) + i_ddi_di_cache_invalidate(); + return (error); } @@ -1218,6 +1221,7 @@ sdev_symlink(struct vnode *dvp, char *lnm, struct vattr *tva, sdev_update_timestamps(dvp, kcred, AT_MTIME|AT_ATIME); if (SDEV_IS_GLOBAL(parent)) atomic_inc_ulong(&parent->sdev_gdir_gen); + i_ddi_di_cache_invalidate(); /* wake up other threads blocked on looking up this node */ mutex_enter(&self->sdev_lookup_lock); @@ -1290,6 +1294,7 @@ sdev_mkdir(struct vnode *dvp, char *nm, struct vattr *va, 
struct vnode **vpp, sdev_update_timestamps(dvp, kcred, AT_MTIME|AT_ATIME); if (SDEV_IS_GLOBAL(parent)) atomic_inc_ulong(&parent->sdev_gdir_gen); + i_ddi_di_cache_invalidate(); /* wake up other threads blocked on looking up this node */ mutex_enter(&self->sdev_lookup_lock); @@ -1405,6 +1410,9 @@ sdev_rmdir(struct vnode *dvp, char *nm, struct vnode *cdir, struct cred *cred, } + if (error == 0) + i_ddi_di_cache_invalidate(); + return (error); } diff --git a/usr/src/uts/common/fs/dev/sdev_zvolops.c b/usr/src/uts/common/fs/dev/sdev_zvolops.c index 8f22ef32f0..e236eb3f72 100644 --- a/usr/src/uts/common/fs/dev/sdev_zvolops.c +++ b/usr/src/uts/common/fs/dev/sdev_zvolops.c @@ -472,8 +472,10 @@ devzvol_create_pool_dirs(struct vnode *dvp) ASSERT(dvp->v_count > 0); rc = VOP_LOOKUP(dvp, nvpair_name(elem), &vp, NULL, 0, NULL, kcred, NULL, 0, NULL); - /* should either work, or not be visible from a zone */ - ASSERT(rc == 0 || rc == ENOENT); + /* + * should either work or we should get an error if this should + * not be visible from the zone, or disallowed in the zone + */ if (rc == 0) VN_RELE(vp); pools++; diff --git a/usr/src/uts/common/fs/fem.c b/usr/src/uts/common/fs/fem.c index 9f7f284842..769316bb4c 100644 --- a/usr/src/uts/common/fs/fem.c +++ b/usr/src/uts/common/fs/fem.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2015, Joyent, Inc. All rights reserved. 
+ */ + #include <sys/types.h> #include <sys/atomic.h> #include <sys/kmem.h> @@ -33,11 +37,12 @@ #include <sys/systm.h> #include <sys/cmn_err.h> #include <sys/debug.h> - #include <sys/fem.h> #include <sys/vfs.h> #include <sys/vnode.h> #include <sys/vfs_opreg.h> +#include <sys/stack.h> +#include <sys/archsystm.h> #define NNODES_DEFAULT 8 /* Default number of nodes in a fem_list */ /* @@ -291,6 +296,536 @@ _op_find(femarg_t *ap, void **fp, int offs0, int offs1) } #endif +/* + * File event monitoring handoffs + * + * File event monitoring relies on being able to inject stack frames between + * vnode consumers and the underlying file systems. This becomes problematic + * when there exist many monitors, as kernel stack depth is finite. The model + * very much encodes this injected frame: the flow of control deliberately + * lies with the monitor, not with the monitoring system. While we could + * conceivably address this by allowing each subsystem to install at most + * one monitor per vnode (and impose on subsystems that they handle any + * of their own consumer multiplexing internally), this in fact exports a + * substantial amount of run-time complexity to deal with an uncommon case + * (and, it must be said, assumes a small number of consuming subsystems). + * To allow our abstraction to remain clean, we instead check our remaining + * stack in every vnext_*() call; if the amount of stack remaining is lower + * than a threshold (fem_stack_needed), we call thread_splitstack() to carry + * on the execution of the monitors and the underlying vnode operation on a + * split stack. 
Because we can only pass a single argument to our split stack + * function, we must marshal our arguments, the mechanics of which are somewhat + * ornate in terms of the code: to marshal in a type-safe manner, we define a + * baton that is a union of payload structures for each kind of operation, + * loading the per-operation payload explicitly and calling into common handoff + * code that itself calls thread_splitstack(). The function passed to + * thread_splitstack() is a per-entry point function that continues monitor + * processing given the specified (marshalled) arguments. While this method + * is a little verbose to implement, it has the advantage of being relatively + * robust (that is, broadly type-safe) while imposing minimal burden on each + * vnext_*() entry point. + * + * In terms of the implementation: + * + * - The FEM_BATON_n macros define the per-entry point baton structures + * - The fem_baton_payload_t contains the union of these structures + * - The FEM_VNEXTn_DECL macros declare the post-handoff entry point + * - The FEM_VNEXTn macros constitute the per-handoff entry point + * + * Note that we don't use variadic macros -- we define a variant of these + * macros for each of our relevant argument counts. This may seem overly + * explicit, but it is deliberate: the object here is to minimize the + * future maintenance burden by minimizing the likelihood of introduced + * error -- not to minimize the number of characters in this source file. + */ + +#ifndef STACK_GROWTH_DOWN +#error Downward stack growth assumed. 
+#endif + +int fem_stack_toodeep; +uintptr_t fem_stack_needed = 8 * 1024; +size_t fem_handoff_stacksize = 128 * 1024; + +#define FEM_TOODEEP() (STACK_BIAS + (uintptr_t)getfp() - \ + (uintptr_t)curthread->t_stkbase < fem_stack_needed) + +#define FEM_BATON_1(what, t0, l0) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + } fb_##what + +#define FEM_BATON_2(what, t0, l0, t1, l1) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + } fb_##what + +#define FEM_BATON_3(what, t0, l0, t1, l1, t2, l2) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + } fb_##what + +#define FEM_BATON_4(what, t0, l0, t1, l1, t2, l2, t3, l3) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + } fb_##what + +#define FEM_BATON_5(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + } fb_##what + +#define FEM_BATON_6(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + t5 fb_##what##_##l5; \ + } fb_##what + +#define FEM_BATON_8(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5, \ + t6, l6, t7, l7) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + t5 fb_##what##_##l5; \ 
+ t6 fb_##what##_##l6; \ + t7 fb_##what##_##l7; \ + } fb_##what + +#define FEM_BATON_9(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5, \ + t6, l6, t7, l7, t8, l8) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + t5 fb_##what##_##l5; \ + t6 fb_##what##_##l6; \ + t7 fb_##what##_##l7; \ + t8 fb_##what##_##l8; \ + } fb_##what + +typedef union { + FEM_BATON_2(open, int, mode, cred_t *, cr); + FEM_BATON_4(close, int, flag, int, count, + offset_t, offset, cred_t *, cr); + FEM_BATON_3(read, uio_t *, uiop, int, ioflag, cred_t *, cr); + FEM_BATON_3(write, uio_t *, uiop, int, ioflag, cred_t *, cr); + FEM_BATON_5(ioctl, int, cmd, intptr_t, arg, + int, flag, cred_t *, cr, int *, rvalp); + FEM_BATON_3(setfl, int, oflags, int, nflags, cred_t *, cr); + FEM_BATON_3(getattr, vattr_t *, vap, int, flags, cred_t *, cr); + FEM_BATON_3(setattr, vattr_t *, vap, int, flags, cred_t *, cr); + FEM_BATON_3(access, int, mode, int, flags, cred_t *, cr); + FEM_BATON_8(lookup, char *, nm, vnode_t **, vpp, + pathname_t *, pnp, int, flags, vnode_t *, rdir, + cred_t *, cr, int *, direntflags, pathname_t *, realpnp); + FEM_BATON_8(create, char *, name, vattr_t *, vap, + vcexcl_t, excl, int, mode, vnode_t **, vpp, + cred_t *, cr, int, flag, vsecattr_t *, vsecp); + FEM_BATON_3(remove, char *, nm, cred_t *, cr, int, flags); + FEM_BATON_4(link, vnode_t *, svp, char *, tnm, + cred_t *, cr, int, flags); + FEM_BATON_5(rename, char *, snm, vnode_t *, tdvp, + char *, tnm, cred_t *, cr, int, flags); + FEM_BATON_6(mkdir, char *, dirname, vattr_t *, vap, + vnode_t **, vpp, cred_t *, cr, int, flags, + vsecattr_t *, vsecp); + FEM_BATON_4(rmdir, char *, nm, vnode_t *, cdir, + cred_t *, cr, int, flags); + FEM_BATON_4(readdir, uio_t *, uiop, cred_t *, cr, + int *, eofp, int, flags); + FEM_BATON_5(symlink, char *, linkname, vattr_t *, vap, + char *, target, 
cred_t *, cr, int, flags); + FEM_BATON_2(readlink, uio_t *, uiop, cred_t *, cr); + FEM_BATON_2(fsync, int, syncflag, cred_t *, cr); + FEM_BATON_1(inactive, cred_t *, cr); + FEM_BATON_1(fid, fid_t *, fidp); + FEM_BATON_1(rwlock, int, write_lock); + FEM_BATON_1(rwunlock, int, write_lock); + FEM_BATON_2(seek, offset_t, ooff, offset_t *, noffp); + FEM_BATON_1(cmp, vnode_t *, vp2); + FEM_BATON_6(frlock, int, cmd, struct flock64 *, bfp, + int, flag, offset_t, offset, struct flk_callback *, flk_cbp, + cred_t *, cr); + FEM_BATON_5(space, int, cmd, struct flock64 *, bfp, + int, flag, offset_t, offset, cred_t *, cr); + FEM_BATON_1(realvp, vnode_t **, vpp); + FEM_BATON_9(getpage, offset_t, off, size_t, len, + uint_t *, protp, struct page **, plarr, size_t, plsz, + struct seg *, seg, caddr_t, addr, enum seg_rw, rw, + cred_t *, cr); + FEM_BATON_4(putpage, offset_t, off, size_t, len, + int, flags, cred_t *, cr); + FEM_BATON_8(map, offset_t, off, struct as *, as, + caddr_t *, addrp, size_t, len, uchar_t, prot, + uchar_t, maxprot, uint_t, flags, cred_t *, cr); + FEM_BATON_8(addmap, offset_t, off, struct as *, as, + caddr_t, addr, size_t, len, uchar_t, prot, + uchar_t, maxprot, uint_t, flags, cred_t *, cr); + FEM_BATON_8(delmap, offset_t, off, struct as *, as, + caddr_t, addr, size_t, len, uint_t, prot, + uint_t, maxprot, uint_t, flags, cred_t *, cr); + FEM_BATON_4(poll, short, events, int, anyyet, + short *, reventsp, struct pollhead **, phpp); + FEM_BATON_3(dump, caddr_t, addr, offset_t, lbdn, offset_t, dblks); + FEM_BATON_3(pathconf, int, cmd, ulong_t *, valp, cred_t *, cr); + FEM_BATON_5(pageio, struct page *, pp, u_offset_t, io_off, + size_t, io_len, int, flags, cred_t *, cr); + FEM_BATON_2(dumpctl, int, action, offset_t *, blkp); + FEM_BATON_4(dispose, struct page *, pp, int, flag, + int, dn, cred_t *, cr); + FEM_BATON_3(setsecattr, vsecattr_t *, vsap, int, flag, cred_t *, cr); + FEM_BATON_3(getsecattr, vsecattr_t *, vsap, int, flag, cred_t *, cr); + FEM_BATON_4(shrlock, int, 
cmd, struct shrlock *, shr, + int, flag, cred_t *, cr); + FEM_BATON_3(vnevent, vnevent_t, vnevent, vnode_t *, dvp, char *, cname); + FEM_BATON_3(reqzcbuf, enum uio_rw, ioflag, + xuio_t *, xuiop, cred_t *, cr); + FEM_BATON_2(retzcbuf, xuio_t *, xuiop, cred_t *, cr); +} fem_baton_payload_t; + +typedef struct { + fem_baton_payload_t fb_payload; + int (*fb_func)(); + void (*fb_handoff)(); + int fb_rval; +} fem_baton_t; + +static int +fem_handoff(fem_baton_t *bp) +{ + fem_stack_toodeep++; + thread_splitstack(bp->fb_handoff, bp, fem_handoff_stacksize); + + return (bp->fb_rval); +} + +#define FEM_VNEXT3_DECL(what, a0, a1, a2) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2); \ +} + +#define FEM_VNEXT4_DECL(what, a0, a1, a2, a3) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3); \ +} + +#define FEM_VNEXT5_DECL(what, a0, a1, a2, a3, a4) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4); \ +} + +#define FEM_VNEXT6_DECL(what, a0, a1, a2, a3, a4, a5) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5); \ +} + +#define 
FEM_VNEXT7_DECL(what, a0, a1, a2, a3, a4, a5, a6) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6); \ +} + +#define FEM_VNEXT8_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6, \ + bp->fb_payload.fb_##what.fb_##what##_##a7); \ +} + +#define FEM_VNEXT10_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6, \ + bp->fb_payload.fb_##what.fb_##what##_##a7, \ + bp->fb_payload.fb_##what.fb_##what##_##a8, \ + bp->fb_payload.fb_##what.fb_##what##_##a9); \ +} + +#define FEM_VNEXT11_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + 
bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6, \ + bp->fb_payload.fb_##what.fb_##what##_##a7, \ + bp->fb_payload.fb_##what.fb_##what##_##a8, \ + bp->fb_payload.fb_##what.fb_##what##_##a9, \ + bp->fb_payload.fb_##what.fb_##what##_##a10); \ +} + +#define FEM_VNEXT3(what, func, a0, a1, a2) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2)) + +#define FEM_VNEXT4(what, func, a0, a1, a2, a3) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3)) + +#define FEM_VNEXT5(what, func, a0, a1, a2, a3, a4) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = 
func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4)) + +#define FEM_VNEXT6(what, func, a0, a1, a2, a3, a4, a5) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5)) + +#define FEM_VNEXT7(what, func, a0, a1, a2, a3, a4, a5, a6) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6)) + +#define FEM_VNEXT8(what, func, a0, a1, a2, a3, a4, a5, a6, a7) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + 
baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6, a7)) + +#define FEM_VNEXT10(what, func, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \ + baton->fb_payload.fb_##what.fb_##what##_##a8 = a8; \ + baton->fb_payload.fb_##what.fb_##what##_##a9 = a9; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9)) + +#define FEM_VNEXT11(what, func, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + 
baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \ + baton->fb_payload.fb_##what.fb_##what##_##a8 = a8; \ + baton->fb_payload.fb_##what.fb_##what##_##a9 = a9; \ + baton->fb_payload.fb_##what.fb_##what##_##a10 = a10; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10)) + static fem_t * fem_alloc() { @@ -2040,10 +2575,60 @@ static struct fs_operation_def fshead_vfs_spec[] = { * 5. Return by invoking the base operation with the base object. * * for each classification, there needs to be at least one "next" operation - * for each "head"operation. - * + * for each "head" operation. Note that we also use the FEM_VNEXTn_DECL macros + * to define the function to run when the stack is split; see the discussion + * on "File event monitoring handoffs", above. 
*/ +FEM_VNEXT4_DECL(open, arg0, mode, cr, ct) +FEM_VNEXT6_DECL(close, arg0, flag, count, offset, cr, ct) +FEM_VNEXT5_DECL(read, arg0, uiop, ioflag, cr, ct) +FEM_VNEXT5_DECL(write, arg0, uiop, ioflag, cr, ct) +FEM_VNEXT7_DECL(ioctl, arg0, cmd, arg, flag, cr, rvalp, ct) +FEM_VNEXT5_DECL(setfl, arg0, oflags, nflags, cr, ct) +FEM_VNEXT5_DECL(getattr, arg0, vap, flags, cr, ct) +FEM_VNEXT5_DECL(setattr, arg0, vap, flags, cr, ct) +FEM_VNEXT5_DECL(access, arg0, mode, flags, cr, ct) +FEM_VNEXT10_DECL(lookup, arg0, nm, vpp, pnp, flags, rdir, + cr, ct, direntflags, realpnp) +FEM_VNEXT10_DECL(create, arg0, name, vap, excl, mode, vpp, cr, flag, ct, vsecp) +FEM_VNEXT5_DECL(remove, arg0, nm, cr, ct, flags) +FEM_VNEXT6_DECL(link, arg0, svp, tnm, cr, ct, flags) +FEM_VNEXT7_DECL(rename, arg0, snm, tdvp, tnm, cr, ct, flags) +FEM_VNEXT8_DECL(mkdir, arg0, dirname, vap, vpp, cr, ct, flags, vsecp) +FEM_VNEXT6_DECL(rmdir, arg0, nm, cdir, cr, ct, flags) +FEM_VNEXT6_DECL(readdir, arg0, uiop, cr, eofp, ct, flags) +FEM_VNEXT7_DECL(symlink, arg0, linkname, vap, target, cr, ct, flags) +FEM_VNEXT4_DECL(readlink, arg0, uiop, cr, ct) +FEM_VNEXT4_DECL(fsync, arg0, syncflag, cr, ct) +FEM_VNEXT3_DECL(fid, arg0, fidp, ct) +FEM_VNEXT3_DECL(rwlock, arg0, write_lock, ct) +FEM_VNEXT4_DECL(seek, arg0, ooff, noffp, ct) +FEM_VNEXT3_DECL(cmp, arg0, vp2, ct) +FEM_VNEXT8_DECL(frlock, arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct) +FEM_VNEXT7_DECL(space, arg0, cmd, bfp, flag, offset, cr, ct) +FEM_VNEXT3_DECL(realvp, arg0, vpp, ct) +FEM_VNEXT11_DECL(getpage, arg0, off, len, protp, plarr, plsz, + seg, addr, rw, cr, ct) +FEM_VNEXT6_DECL(putpage, arg0, off, len, flags, cr, ct) +FEM_VNEXT10_DECL(map, arg0, off, as, addrp, len, prot, maxprot, + flags, cr, ct) +FEM_VNEXT10_DECL(addmap, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct) +FEM_VNEXT10_DECL(delmap, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct) +FEM_VNEXT6_DECL(poll, arg0, events, anyyet, reventsp, phpp, ct) +FEM_VNEXT5_DECL(dump, arg0, 
addr, lbdn, dblks, ct) +FEM_VNEXT5_DECL(pathconf, arg0, cmd, valp, cr, ct) +FEM_VNEXT7_DECL(pageio, arg0, pp, io_off, io_len, flags, cr, ct) +FEM_VNEXT4_DECL(dumpctl, arg0, action, blkp, ct) +FEM_VNEXT5_DECL(setsecattr, arg0, vsap, flag, cr, ct) +FEM_VNEXT5_DECL(getsecattr, arg0, vsap, flag, cr, ct) +FEM_VNEXT6_DECL(shrlock, arg0, cmd, shr, flag, cr, ct) +FEM_VNEXT5_DECL(vnevent, arg0, vnevent, dvp, cname, ct) +FEM_VNEXT5_DECL(reqzcbuf, arg0, ioflag, xuiop, cr, ct) +FEM_VNEXT4_DECL(retzcbuf, arg0, xuiop, cr, ct) + int vnext_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct) { @@ -2055,7 +2640,7 @@ vnext_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_open, femop_open); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, mode, cr, ct)); + FEM_VNEXT4(open, func, arg0, mode, cr, ct); } int @@ -2070,7 +2655,7 @@ vnext_close(femarg_t *vf, int flag, int count, offset_t offset, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_close, femop_close); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, flag, count, offset, cr, ct)); + FEM_VNEXT6(close, func, arg0, flag, count, offset, cr, ct); } int @@ -2085,7 +2670,7 @@ vnext_read(femarg_t *vf, uio_t *uiop, int ioflag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_read, femop_read); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, ioflag, cr, ct)); + FEM_VNEXT5(read, func, arg0, uiop, ioflag, cr, ct); } int @@ -2100,7 +2685,7 @@ vnext_write(femarg_t *vf, uio_t *uiop, int ioflag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_write, femop_write); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, ioflag, cr, ct)); + FEM_VNEXT5(write, func, arg0, uiop, ioflag, cr, ct); } int @@ -2115,7 +2700,7 @@ vnext_ioctl(femarg_t *vf, int cmd, intptr_t arg, int flag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_ioctl, femop_ioctl); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return 
((*func)(arg0, cmd, arg, flag, cr, rvalp, ct)); + FEM_VNEXT7(ioctl, func, arg0, cmd, arg, flag, cr, rvalp, ct); } int @@ -2130,7 +2715,7 @@ vnext_setfl(femarg_t *vf, int oflags, int nflags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_setfl, femop_setfl); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, oflags, nflags, cr, ct)); + FEM_VNEXT5(setfl, func, arg0, oflags, nflags, cr, ct); } int @@ -2145,7 +2730,7 @@ vnext_getattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_getattr, femop_getattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vap, flags, cr, ct)); + FEM_VNEXT5(getattr, func, arg0, vap, flags, cr, ct); } int @@ -2160,7 +2745,7 @@ vnext_setattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_setattr, femop_setattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vap, flags, cr, ct)); + FEM_VNEXT5(setattr, func, arg0, vap, flags, cr, ct); } int @@ -2175,7 +2760,7 @@ vnext_access(femarg_t *vf, int mode, int flags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_access, femop_access); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, mode, flags, cr, ct)); + FEM_VNEXT5(access, func, arg0, mode, flags, cr, ct); } int @@ -2191,8 +2776,8 @@ vnext_lookup(femarg_t *vf, char *nm, vnode_t **vpp, pathname_t *pnp, vsop_find(vf, &func, int, &arg0, vop_lookup, femop_lookup); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, nm, vpp, pnp, flags, rdir, cr, ct, - direntflags, realpnp)); + FEM_VNEXT10(lookup, func, arg0, nm, vpp, pnp, flags, rdir, cr, ct, + direntflags, realpnp); } int @@ -2208,7 +2793,8 @@ vnext_create(femarg_t *vf, char *name, vattr_t *vap, vcexcl_t excl, vsop_find(vf, &func, int, &arg0, vop_create, femop_create); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, name, vap, excl, mode, vpp, cr, flag, ct, vsecp)); + FEM_VNEXT10(create, func, arg0, name, 
vap, excl, + mode, vpp, cr, flag, ct, vsecp); } int @@ -2223,7 +2809,7 @@ vnext_remove(femarg_t *vf, char *nm, cred_t *cr, caller_context_t *ct, vsop_find(vf, &func, int, &arg0, vop_remove, femop_remove); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, nm, cr, ct, flags)); + FEM_VNEXT5(remove, func, arg0, nm, cr, ct, flags); } int @@ -2238,7 +2824,7 @@ vnext_link(femarg_t *vf, vnode_t *svp, char *tnm, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_link, femop_link); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, svp, tnm, cr, ct, flags)); + FEM_VNEXT6(link, func, arg0, svp, tnm, cr, ct, flags); } int @@ -2253,7 +2839,7 @@ vnext_rename(femarg_t *vf, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_rename, femop_rename); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, snm, tdvp, tnm, cr, ct, flags)); + FEM_VNEXT7(rename, func, arg0, snm, tdvp, tnm, cr, ct, flags); } int @@ -2268,7 +2854,7 @@ vnext_mkdir(femarg_t *vf, char *dirname, vattr_t *vap, vnode_t **vpp, vsop_find(vf, &func, int, &arg0, vop_mkdir, femop_mkdir); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, dirname, vap, vpp, cr, ct, flags, vsecp)); + FEM_VNEXT8(mkdir, func, arg0, dirname, vap, vpp, cr, ct, flags, vsecp); } int @@ -2283,7 +2869,7 @@ vnext_rmdir(femarg_t *vf, char *nm, vnode_t *cdir, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_rmdir, femop_rmdir); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, nm, cdir, cr, ct, flags)); + FEM_VNEXT6(rmdir, func, arg0, nm, cdir, cr, ct, flags); } int @@ -2298,7 +2884,7 @@ vnext_readdir(femarg_t *vf, uio_t *uiop, cred_t *cr, int *eofp, vsop_find(vf, &func, int, &arg0, vop_readdir, femop_readdir); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, cr, eofp, ct, flags)); + FEM_VNEXT6(readdir, func, arg0, uiop, cr, eofp, ct, flags); } int @@ -2313,7 +2899,7 @@ vnext_symlink(femarg_t *vf, char 
*linkname, vattr_t *vap, char *target, vsop_find(vf, &func, int, &arg0, vop_symlink, femop_symlink); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, linkname, vap, target, cr, ct, flags)); + FEM_VNEXT7(symlink, func, arg0, linkname, vap, target, cr, ct, flags); } int @@ -2327,7 +2913,7 @@ vnext_readlink(femarg_t *vf, uio_t *uiop, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_readlink, femop_readlink); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, cr, ct)); + FEM_VNEXT4(readlink, func, arg0, uiop, cr, ct); } int @@ -2341,7 +2927,7 @@ vnext_fsync(femarg_t *vf, int syncflag, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_fsync, femop_fsync); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, syncflag, cr, ct)); + FEM_VNEXT4(fsync, func, arg0, syncflag, cr, ct); } void @@ -2369,7 +2955,7 @@ vnext_fid(femarg_t *vf, fid_t *fidp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_fid, femop_fid); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, fidp, ct)); + FEM_VNEXT3(fid, func, arg0, fidp, ct); } int @@ -2383,7 +2969,7 @@ vnext_rwlock(femarg_t *vf, int write_lock, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_rwlock, femop_rwlock); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, write_lock, ct)); + FEM_VNEXT3(rwlock, func, arg0, write_lock, ct); } void @@ -2411,7 +2997,7 @@ vnext_seek(femarg_t *vf, offset_t ooff, offset_t *noffp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_seek, femop_seek); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, ooff, noffp, ct)); + FEM_VNEXT4(seek, func, arg0, ooff, noffp, ct); } int @@ -2425,7 +3011,7 @@ vnext_cmp(femarg_t *vf, vnode_t *vp2, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_cmp, femop_cmp); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vp2, ct)); + FEM_VNEXT3(cmp, func, arg0, vp2, ct); } 
int @@ -2441,7 +3027,7 @@ vnext_frlock(femarg_t *vf, int cmd, struct flock64 *bfp, int flag, vsop_find(vf, &func, int, &arg0, vop_frlock, femop_frlock); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct)); + FEM_VNEXT8(frlock, func, arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct); } int @@ -2456,7 +3042,7 @@ vnext_space(femarg_t *vf, int cmd, struct flock64 *bfp, int flag, vsop_find(vf, &func, int, &arg0, vop_space, femop_space); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, bfp, flag, offset, cr, ct)); + FEM_VNEXT7(space, func, arg0, cmd, bfp, flag, offset, cr, ct); } int @@ -2470,7 +3056,7 @@ vnext_realvp(femarg_t *vf, vnode_t **vpp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_realvp, femop_realvp); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vpp, ct)); + FEM_VNEXT3(realvp, func, arg0, vpp, ct); } int @@ -2486,8 +3072,8 @@ vnext_getpage(femarg_t *vf, offset_t off, size_t len, uint_t *protp, vsop_find(vf, &func, int, &arg0, vop_getpage, femop_getpage); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, len, protp, plarr, plsz, seg, addr, rw, - cr, ct)); + FEM_VNEXT11(getpage, func, arg0, off, len, protp, + plarr, plsz, seg, addr, rw, cr, ct); } int @@ -2502,7 +3088,7 @@ vnext_putpage(femarg_t *vf, offset_t off, size_t len, int flags, vsop_find(vf, &func, int, &arg0, vop_putpage, femop_putpage); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, len, flags, cr, ct)); + FEM_VNEXT6(putpage, func, arg0, off, len, flags, cr, ct); } int @@ -2518,8 +3104,8 @@ vnext_map(femarg_t *vf, offset_t off, struct as *as, caddr_t *addrp, vsop_find(vf, &func, int, &arg0, vop_map, femop_map); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, as, addrp, len, prot, maxprot, flags, - cr, ct)); + FEM_VNEXT10(map, func, arg0, off, as, addrp, len, prot, maxprot, flags, + cr, ct); } int @@ -2535,8 
+3121,8 @@ vnext_addmap(femarg_t *vf, offset_t off, struct as *as, caddr_t addr, vsop_find(vf, &func, int, &arg0, vop_addmap, femop_addmap); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, as, addr, len, prot, maxprot, flags, - cr, ct)); + FEM_VNEXT10(addmap, func, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct); } int @@ -2552,8 +3138,8 @@ vnext_delmap(femarg_t *vf, offset_t off, struct as *as, caddr_t addr, vsop_find(vf, &func, int, &arg0, vop_delmap, femop_delmap); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, as, addr, len, prot, maxprot, flags, - cr, ct)); + FEM_VNEXT10(delmap, func, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct); } int @@ -2568,7 +3154,7 @@ vnext_poll(femarg_t *vf, short events, int anyyet, short *reventsp, vsop_find(vf, &func, int, &arg0, vop_poll, femop_poll); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, events, anyyet, reventsp, phpp, ct)); + FEM_VNEXT6(poll, func, arg0, events, anyyet, reventsp, phpp, ct); } int @@ -2583,7 +3169,7 @@ vnext_dump(femarg_t *vf, caddr_t addr, offset_t lbdn, offset_t dblks, vsop_find(vf, &func, int, &arg0, vop_dump, femop_dump); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, addr, lbdn, dblks, ct)); + FEM_VNEXT5(dump, func, arg0, addr, lbdn, dblks, ct); } int @@ -2598,7 +3184,7 @@ vnext_pathconf(femarg_t *vf, int cmd, ulong_t *valp, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_pathconf, femop_pathconf); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, valp, cr, ct)); + FEM_VNEXT5(pathconf, func, arg0, cmd, valp, cr, ct); } int @@ -2613,7 +3199,7 @@ vnext_pageio(femarg_t *vf, struct page *pp, u_offset_t io_off, vsop_find(vf, &func, int, &arg0, vop_pageio, femop_pageio); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, pp, io_off, io_len, flags, cr, ct)); + FEM_VNEXT7(pageio, func, arg0, pp, io_off, io_len, flags, cr, ct); } int @@ -2627,7 +3213,7 @@ 
vnext_dumpctl(femarg_t *vf, int action, offset_t *blkp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_dumpctl, femop_dumpctl); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, action, blkp, ct)); + FEM_VNEXT4(dumpctl, func, arg0, action, blkp, ct); } void @@ -2657,7 +3243,7 @@ vnext_setsecattr(femarg_t *vf, vsecattr_t *vsap, int flag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_setsecattr, femop_setsecattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vsap, flag, cr, ct)); + FEM_VNEXT5(setsecattr, func, arg0, vsap, flag, cr, ct); } int @@ -2672,7 +3258,7 @@ vnext_getsecattr(femarg_t *vf, vsecattr_t *vsap, int flag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_getsecattr, femop_getsecattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vsap, flag, cr, ct)); + FEM_VNEXT5(getsecattr, func, arg0, vsap, flag, cr, ct); } int @@ -2687,7 +3273,7 @@ vnext_shrlock(femarg_t *vf, int cmd, struct shrlock *shr, int flag, vsop_find(vf, &func, int, &arg0, vop_shrlock, femop_shrlock); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, shr, flag, cr, ct)); + FEM_VNEXT6(shrlock, func, arg0, cmd, shr, flag, cr, ct); } int @@ -2702,7 +3288,7 @@ vnext_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *cname, vsop_find(vf, &func, int, &arg0, vop_vnevent, femop_vnevent); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vnevent, dvp, cname, ct)); + FEM_VNEXT5(vnevent, func, arg0, vnevent, dvp, cname, ct); } int @@ -2717,7 +3303,7 @@ vnext_reqzcbuf(femarg_t *vf, enum uio_rw ioflag, xuio_t *xuiop, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_reqzcbuf, femop_reqzcbuf); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, ioflag, xuiop, cr, ct)); + FEM_VNEXT5(reqzcbuf, func, arg0, ioflag, xuiop, cr, ct); } int @@ -2731,7 +3317,7 @@ vnext_retzcbuf(femarg_t *vf, xuio_t *xuiop, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, 
&arg0, vop_retzcbuf, femop_retzcbuf); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, xuiop, cr, ct)); + FEM_VNEXT4(retzcbuf, func, arg0, xuiop, cr, ct); } int diff --git a/usr/src/uts/common/fs/fifofs/fifosubr.c b/usr/src/uts/common/fs/fifofs/fifosubr.c index 6e56000ffe..a908f91267 100644 --- a/usr/src/uts/common/fs/fifofs/fifosubr.c +++ b/usr/src/uts/common/fs/fifofs/fifosubr.c @@ -22,6 +22,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017 Joyent, Inc. */ /* @@ -61,7 +62,6 @@ #if FIFODEBUG int Fifo_fastmode = 1; /* pipes/fifos will be opened in fast mode */ int Fifo_verbose = 0; /* msg when switching out of fast mode */ -int Fifohiwat = FIFOHIWAT; /* Modifiable FIFO high water mark */ #endif /* @@ -196,6 +196,7 @@ fnode_constructor(void *buf, void *cdrarg, int kmflags) fnp->fn_dest = fnp; fnp->fn_mp = NULL; fnp->fn_count = 0; + fnp->fn_hiwat = FIFOHIWAT; fnp->fn_rsynccnt = 0; fnp->fn_wsynccnt = 0; fnp->fn_wwaitcnt = 0; @@ -388,11 +389,7 @@ fifoinit(int fstype, char *name) pipe_constructor, pipe_destructor, NULL, (void *)(sizeof (fifodata_t)), NULL, 0); -#if FIFODEBUG - if (Fifohiwat < FIFOHIWAT) - Fifohiwat = FIFOHIWAT; -#endif /* FIFODEBUG */ - fifo_strdata.qi_minfo->mi_hiwat = Fifohiwat; + fifo_strdata.qi_minfo->mi_hiwat = FIFOHIWAT; return (0); } @@ -614,9 +611,12 @@ fifo_stropen(vnode_t **vpp, int flag, cred_t *crp, int dotwist, int lockheld) /* * The other end of the pipe is almost closed so * reject any other open on this end of the pipe - * This only happens with a pipe mounted under namefs + * This normally only happens with a pipe mounted under namefs, but + * we can also see an open via proc/fd, which should still succeed. + * To indicate the proc/fd case the FKLYR flag is passed. 
*/ - if ((fnp->fn_flag & (FIFOCLOSE|ISPIPE)) == (FIFOCLOSE|ISPIPE)) { + if ((fnp->fn_flag & (FIFOCLOSE|ISPIPE)) == (FIFOCLOSE|ISPIPE) && + (flag & FKLYR) == 0) { fifo_cleanup(oldvp, flag); cv_broadcast(&fnp->fn_wait_cv); if (!lockheld) @@ -1161,7 +1161,8 @@ fifo_wakewriter(fifonode_t *fn_dest, fifolock_t *fn_lock) int fn_dflag = fn_dest->fn_flag; ASSERT(MUTEX_HELD(&fn_lock->flk_lock)); - ASSERT(fn_dest->fn_dest->fn_count < Fifohiwat); + ASSERT(fn_dest->fn_dest->fn_count < fn_dest->fn_dest->fn_hiwat); + if ((fn_dflag & FIFOWANTW)) { cv_broadcast(&fn_dest->fn_wait_cv); } diff --git a/usr/src/uts/common/fs/fifofs/fifovnops.c b/usr/src/uts/common/fs/fifofs/fifovnops.c index c1b4652633..ceec9bd012 100644 --- a/usr/src/uts/common/fs/fifofs/fifovnops.c +++ b/usr/src/uts/common/fs/fifofs/fifovnops.c @@ -28,7 +28,7 @@ */ /* - * Copyright 2015, Joyent, Inc. + * Copyright 2017, Joyent, Inc. * Copyright (c) 2017 by Delphix. All rights reserved. */ @@ -104,10 +104,6 @@ static int fifo_setsecattr(struct vnode *, vsecattr_t *, int, struct cred *, static int fifo_getsecattr(struct vnode *, vsecattr_t *, int, struct cred *, caller_context_t *); -/* functions local to this file */ -static boolean_t fifo_stayfast_enter(fifonode_t *); -static void fifo_stayfast_exit(fifonode_t *); - /* * Define the data structures external to this file. */ @@ -645,7 +641,7 @@ fifo_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *crp, * (3) write-only FIFO with no data * (4) no data and FNDELAY flag is set. * Otherwise return - * EAGAIN if FNONBLOCK is set and no data to read + * EAGAIN if FNONBLOCK is set and no data to read or FIFORDBLOCK is set * EINTR if signal received while waiting for data * * While there is no data to read.... 
@@ -681,7 +677,7 @@ fifo_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *crp, * Check for data on our input queue */ - while (fnp->fn_count == 0) { + while (fnp->fn_count == 0 || (fnp->fn_flag & FIFORDBLOCK) != 0) { /* * No data on first attempt and no writer, then EOF */ @@ -731,6 +727,7 @@ fifo_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *crp, } ASSERT(fnp->fn_mp != NULL); + VERIFY((fnp->fn_flag & FIFORDBLOCK) == 0); /* For pipes copy should not bypass cache */ uiop->uio_extflg |= UIO_COPY_CACHED; @@ -772,6 +769,18 @@ fifo_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *crp, &fn_lock->flk_lock)) goto trywake; + /* + * If another thread snuck in and started to + * consume data using read-blocking out of + * the pipe while we were blocked in the + * cv_wait, then since we have already consumed + * some of the data out of the pipe we need + * to return with a short read. + */ + if ((fnp->fn_flag & FIFORDBLOCK) != 0) { + goto trywake; + } + if (!(fnp->fn_flag & FIFOFAST)) goto stream_mode; } @@ -787,11 +796,11 @@ trywake: /* * wake up any blocked writers, processes * sleeping on POLLWRNORM, or processes waiting for SIGPOLL - * Note: checking for fn_count < Fifohiwat emulates + * Note: checking for fn_count < fn_hiwat emulates * STREAMS functionality when low water mark is 0 */ if (fn_dest->fn_flag & (FIFOWANTW | FIFOHIWATW) && - fnp->fn_count < Fifohiwat) { + fnp->fn_count < fn_dest->fn_hiwat) { fifo_wakewriter(fn_dest, fn_lock); } goto done; @@ -904,7 +913,7 @@ fifo_write(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *crp, /* * check to make sure we are not over high water mark */ - while (fn_dest->fn_count >= Fifohiwat) { + while (fn_dest->fn_count >= fn_dest->fn_hiwat) { /* * Indicate that we have gone over high * water mark @@ -962,7 +971,7 @@ fifo_write(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *crp, * then we must break the message up into PIPE_BUF * chunks to stay compliant with STREAMS */ - if 
(uiop->uio_resid + fn_dest->fn_count > Fifohiwat) + if (uiop->uio_resid + fn_dest->fn_count > fn_dest->fn_hiwat) size = MIN(uiop->uio_resid, PIPE_BUF); else size = uiop->uio_resid; @@ -1213,7 +1222,8 @@ fifo_fastioctl(vnode_t *vp, int cmd, intptr_t arg, int mode, cred_t *cr, if (arg != 0) { goto turn_fastoff; } - *rvalp = (fnp->fn_dest->fn_count < Fifohiwat) ? 1 : 0; + *rvalp = (fnp->fn_dest->fn_count < fnp->fn_dest->fn_hiwat) ? + 1 : 0; mutex_exit(&fn_lock->flk_lock); return (0); @@ -1827,7 +1837,7 @@ fifo_poll(vnode_t *vp, short events, int anyyet, short *reventsp, retevents = POLLHUP; } else if (events & (POLLWRNORM | POLLWRBAND)) { if (events & POLLWRNORM) { - if (fn_dest->fn_count < Fifohiwat) + if (fn_dest->fn_count < fn_dest->fn_hiwat) retevents = POLLWRNORM; else fnp->fn_flag |= FIFOHIWATW; @@ -1996,7 +2006,7 @@ fifo_getsecattr(struct vnode *vp, vsecattr_t *vsap, int flag, struct cred *crp, * the lock. * If the fifo switches into stream mode while we are waiting, return failure. */ -static boolean_t +boolean_t fifo_stayfast_enter(fifonode_t *fnp) { ASSERT(MUTEX_HELD(&fnp->fn_lock->flk_lock)); @@ -2018,7 +2028,7 @@ fifo_stayfast_enter(fifonode_t *fnp) * - threads wanting to turn into stream mode waiting in fifo_fastoff(), * - other writers threads waiting in fifo_stayfast_enter(). */ -static void +void fifo_stayfast_exit(fifonode_t *fnp) { fifonode_t *fn_dest = fnp->fn_dest; diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c new file mode 100644 index 0000000000..cc03f41c8d --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c @@ -0,0 +1,640 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2012, Joyent, Inc. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/stat.h> +#include <sys/policy.h> +#include <sys/fs/hyprlofs_info.h> + +static int hldir_make_hlnode(hlnode_t *, hlfsmount_t *, vattr_t *, enum de_op, + vnode_t *, hlnode_t **, cred_t *); +static int hldiraddentry(hlnode_t *, hlnode_t *, char *); + + +#define HL_HASH_SIZE 8192 /* must be power of 2 */ +#define HL_MUTEX_SIZE 64 + +static hldirent_t *hl_hashtable[HL_HASH_SIZE]; +static kmutex_t hl_hashmutex[HL_MUTEX_SIZE]; + +#define HL_HASH_INDEX(a) ((a) & (HL_HASH_SIZE-1)) +#define HL_MUTEX_INDEX(a) ((a) & (HL_MUTEX_SIZE-1)) + +#define HYPRLOFS_HASH(tp, name, hash) \ + { \ + char Xc, *Xcp; \ + hash = (uint_t)(uintptr_t)(tp) >> 8; \ + for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++) \ + hash = (hash << 4) + hash + (uint_t)Xc; \ + } + +void +hyprlofs_hash_init(void) +{ + int ix; + + for (ix = 0; ix < HL_MUTEX_SIZE; ix++) + mutex_init(&hl_hashmutex[ix], NULL, MUTEX_DEFAULT, NULL); +} + +static void +hyprlofs_hash_in(hldirent_t *h) +{ + uint_t hash; + hldirent_t **prevpp; + kmutex_t *hmtx; + + HYPRLOFS_HASH(h->hld_parent, h->hld_name, hash); + h->hld_hash = hash; + prevpp = &hl_hashtable[HL_HASH_INDEX(hash)]; + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + 
h->hld_link = *prevpp; + *prevpp = h; + mutex_exit(hmtx); +} + +/* Remove hldirent *h from the hash list. */ +static void +hyprlofs_hash_out(hldirent_t *h) +{ + uint_t hash; + hldirent_t **prevpp; + kmutex_t *hmtx; + + hash = h->hld_hash; + prevpp = &hl_hashtable[HL_HASH_INDEX(hash)]; + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + while (*prevpp != h) + prevpp = &(*prevpp)->hld_link; + *prevpp = h->hld_link; + mutex_exit(hmtx); +} + +static hldirent_t * +hyprlofs_hash_lookup(char *name, hlnode_t *parent, uint_t hold, + hlnode_t **found) +{ + hldirent_t *l; + uint_t hash; + kmutex_t *hmtx; + hlnode_t *hnp; + + HYPRLOFS_HASH(parent, name, hash); + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + l = hl_hashtable[HL_HASH_INDEX(hash)]; + while (l) { + if (l->hld_hash == hash && l->hld_parent == parent && + strcmp(l->hld_name, name) == 0) { + /* + * Ensure that the hlnode that we put a hold on is the + * same one that we pass back. Thus the temp. var + * hnp is necessary. + */ + hnp = l->hld_hlnode; + if (hold) { + ASSERT(hnp); + hlnode_hold(hnp); + } + if (found) + *found = hnp; + mutex_exit(hmtx); + return (l); + } else { + l = l->hld_link; + } + } + mutex_exit(hmtx); + return (NULL); +} + +/* + * Search directory 'parent' for entry 'name'. + * + * The calling thread can't hold the write version of the rwlock for the + * directory being searched + * + * On success *foundtp points to the found hlnode with its vnode held. + */ +int +hyprlofs_dirlookup(hlnode_t *parent, char *name, hlnode_t **foundtp, cred_t *cr) +{ + int error; + + *foundtp = NULL; + if (parent->hln_type != VDIR) + return (ENOTDIR); + + if ((error = hyprlofs_taccess(parent, VEXEC, cr))) + return (error); + + if (*name == '\0') { + hlnode_hold(parent); + *foundtp = parent; + return (0); + } + + /* + * Search the directory for the matching name. We need the lock + * protecting the hln_dir list so that it doesn't change out from + * underneath us. 
hyprlofs_hash_lookup() will pass back the hlnode + * with a hold on it. + */ + if (hyprlofs_hash_lookup(name, parent, 1, foundtp) != NULL) { + ASSERT(*foundtp); + return (0); + } + + return (ENOENT); +} + +/* + * Enter a directory entry (either a file or subdir, depending on op) for + * 'name' and 'hp' into directory 'dir' + */ +int +hyprlofs_direnter( + hlfsmount_t *hm, + hlnode_t *dir, /* target directory to make entry in */ + char *name, /* name of entry */ + enum de_op op, /* entry operation */ + vnode_t *realvp, /* real vnode */ + vattr_t *va, + hlnode_t **hpp, /* return hlnode */ + cred_t *cr) +{ + hldirent_t *hdp; + hlnode_t *found = NULL; + hlnode_t *hp; + int error = 0; + char *s; + + /* hln_rwlock is held to serialize direnter and dirdeletes */ + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + /* Don't allow '/' characters in pathname component */ + for (s = name; *s; s++) + if (*s == '/') + return (EACCES); + + if (name[0] == '\0') + panic("hyprlofs_direnter: NULL name"); + + /* + * This might be a "dangling detached directory". It could have been + * removed, but a reference to it kept in u_cwd. Don't bother searching + * it, and with any luck the user will get tired of dealing with us and + * cd to some absolute pathway. This is in ufs, too. + */ + if (dir->hln_nlink == 0) { + return (ENOENT); + } + + /* Search for the entry. Return "found" if it exists. */ + hdp = hyprlofs_hash_lookup(name, dir, 1, &found); + + if (hdp) { + ASSERT(found); + switch (op) { + case DE_CREATE: + case DE_MKDIR: + if (hpp) { + *hpp = found; + error = EEXIST; + } else { + hlnode_rele(found); + } + break; + } + } else { + + /* + * The entry does not exist. Check write perms in dir to see if + * entry can be created. + */ + if ((error = hyprlofs_taccess(dir, VWRITE, cr))) + return (error); + + /* Make new hlnode and directory entry as required. 
*/ + if ((error = hldir_make_hlnode(dir, hm, va, op, realvp, &hp, + cr))) + return (error); + + if ((error = hldiraddentry(dir, hp, name))) { + /* Unmake the inode we just made. */ + rw_enter(&hp->hln_rwlock, RW_WRITER); + if ((hp->hln_type) == VDIR) { + ASSERT(hdp == NULL); + /* cleanup allocs made by hyprlofs_dirinit() */ + hyprlofs_dirtrunc(hp); + } + mutex_enter(&hp->hln_tlock); + hp->hln_nlink = 0; + mutex_exit(&hp->hln_tlock); + gethrestime(&hp->hln_ctime); + rw_exit(&hp->hln_rwlock); + hlnode_rele(hp); + hp = NULL; + } else if (hpp) { + *hpp = hp; + } else { + hlnode_rele(hp); + } + } + + return (error); +} + +/* + * Delete entry hp of name "nm" from dir. Free dir entry space and decrement + * link count on hlnode(s). + */ +int +hyprlofs_dirdelete(hlnode_t *dir, hlnode_t *hp, char *nm, enum dr_op op, + cred_t *cr) +{ + hldirent_t *hpdp; + int error; + size_t namelen; + hlnode_t *hnp; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(RW_WRITE_HELD(&hp->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + if (nm[0] == '\0') + panic("hyprlofs_dirdelete: NULL name for %p", (void *)hp); + + /* return error if removing . or .. */ + if (nm[0] == '.') { + if (nm[1] == '\0') + return (EINVAL); + if (nm[1] == '.' && nm[2] == '\0') + return (EEXIST); /* thus in ufs */ + } + + if ((error = hyprlofs_taccess(dir, VEXEC|VWRITE, cr)) != 0) + return (error); + + if (dir->hln_dir == NULL) + return (ENOENT); + + hpdp = hyprlofs_hash_lookup(nm, dir, 0, &hnp); + if (hpdp == NULL) { + /* + * If it is gone, some other thread got here first! + * Return error ENOENT. + */ + return (ENOENT); + } + + /* + * If the hlnode in the hldirent changed (shouldn't happen since we + * don't support rename) then original is gone, so return that status + * (same as UFS). + */ + if (hp != hnp) + return (ENOENT); + + hyprlofs_hash_out(hpdp); + + /* Take hpdp out of the directory list. 
*/ + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + if (hpdp->hld_prev) { + hpdp->hld_prev->hld_next = hpdp->hld_next; + } + if (hpdp->hld_next) { + hpdp->hld_next->hld_prev = hpdp->hld_prev; + } + + /* + * If the roving slot pointer happens to match hpdp, point it at the + * previous dirent. + */ + if (dir->hln_dir->hld_prev == hpdp) { + dir->hln_dir->hld_prev = hpdp->hld_prev; + } + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + /* hpdp points to the correct directory entry */ + namelen = strlen(hpdp->hld_name) + 1; + + kmem_free(hpdp, sizeof (hldirent_t) + namelen); + dir->hln_size -= (sizeof (hldirent_t) + namelen); + dir->hln_dirents--; + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + hp->hln_ctime = now; + + ASSERT(hp->hln_nlink > 0); + DECR_COUNT(&hp->hln_nlink, &hp->hln_tlock); + if (op == DR_RMDIR && hp->hln_type == VDIR) { + hyprlofs_dirtrunc(hp); + ASSERT(hp->hln_nlink == 0); + } + return (0); +} + +/* + * hyprlofs_dirinit initializes a dir with '.' and '..' entries without + * checking perms and locking + */ +void +hyprlofs_dirinit( + hlnode_t *parent, /* parent of directory to initialize */ + hlnode_t *dir) /* the new directory */ +{ + hldirent_t *dot, *dotdot; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&parent->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + dot = kmem_zalloc(sizeof (hldirent_t) + 2, KM_SLEEP); + dotdot = kmem_zalloc(sizeof (hldirent_t) + 3, KM_SLEEP); + + /* Initialize the entries */ + dot->hld_hlnode = dir; + dot->hld_offset = 0; + dot->hld_name = (char *)dot + sizeof (hldirent_t); + dot->hld_name[0] = '.'; + dot->hld_parent = dir; + hyprlofs_hash_in(dot); + + dotdot->hld_hlnode = parent; + dotdot->hld_offset = 1; + dotdot->hld_name = (char *)dotdot + sizeof (hldirent_t); + dotdot->hld_name[0] = '.'; + dotdot->hld_name[1] = '.'; + dotdot->hld_parent = dir; + hyprlofs_hash_in(dotdot); + + /* Initialize directory entry list. 
*/ + dot->hld_next = dotdot; + dot->hld_prev = dotdot; + dotdot->hld_next = NULL; + dotdot->hld_prev = dot; + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + /* + * Since hyprlofs_dirinit is called with both dir and parent being the + * same for the root vnode, we need to increment this before we set + * hln_nlink = 2 below. + */ + INCR_COUNT(&parent->hln_nlink, &parent->hln_tlock); + parent->hln_ctime = now; + + dir->hln_dir = dot; + dir->hln_size = 2 * sizeof (hldirent_t) + 5; /* dot and dotdot */ + dir->hln_dirents = 2; + dir->hln_nlink = 2; +} + + +/* + * hyprlofs_dirtrunc removes all dir entries under this dir. + */ +void +hyprlofs_dirtrunc(hlnode_t *dir) +{ + hldirent_t *hdp; + hlnode_t *tp; + size_t namelen; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + if (dir->hln_looped) + return; + + for (hdp = dir->hln_dir; hdp; hdp = dir->hln_dir) { + ASSERT(hdp->hld_next != hdp); + ASSERT(hdp->hld_prev != hdp); + ASSERT(hdp->hld_hlnode); + + dir->hln_dir = hdp->hld_next; + namelen = strlen(hdp->hld_name) + 1; + + /* + * Adjust the link counts to account for this dir entry removal. + */ + tp = hdp->hld_hlnode; + + ASSERT(tp->hln_nlink > 0); + DECR_COUNT(&tp->hln_nlink, &tp->hln_tlock); + + hyprlofs_hash_out(hdp); + + kmem_free(hdp, sizeof (hldirent_t) + namelen); + dir->hln_size -= (sizeof (hldirent_t) + namelen); + dir->hln_dirents--; + } + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + ASSERT(dir->hln_dir == NULL); + ASSERT(dir->hln_size == 0); + ASSERT(dir->hln_dirents == 0); +} + +static int +hldiraddentry( + hlnode_t *dir, /* target directory to make entry in */ + hlnode_t *hp, /* new hlnode */ + char *name) +{ + hldirent_t *hdp, *hpdp; + size_t namelen, alloc_size; + timestruc_t now; + + /* + * Make sure the parent dir wasn't removed from underneath the caller. + */ + if (dir->hln_dir == NULL) + return (ENOENT); + + /* Check that everything is on the same FS. 
*/ + if (hp->hln_vnode->v_vfsp != dir->hln_vnode->v_vfsp) + return (EXDEV); + + /* Alloc and init dir entry */ + namelen = strlen(name) + 1; + alloc_size = namelen + sizeof (hldirent_t); + hdp = kmem_zalloc(alloc_size, KM_NOSLEEP_LAZY); + if (hdp == NULL) + return (ENOSPC); + + dir->hln_size += alloc_size; + dir->hln_dirents++; + hdp->hld_hlnode = hp; + hdp->hld_parent = dir; + + /* The dir entry and its name were allocated sequentially. */ + hdp->hld_name = (char *)hdp + sizeof (hldirent_t); + (void) strcpy(hdp->hld_name, name); + + hyprlofs_hash_in(hdp); + + /* + * Some utilities expect the size of a directory to remain fairly + * static. For example, a routine which unlinks files between calls to + * readdir(); the size of the dir changes from underneath it and so the + * real dir offset in bytes is invalid. To circumvent this problem, we + * initialize a dir entry with a phony offset, and use this offset to + * determine end of file in hyprlofs_readdir. + */ + hpdp = dir->hln_dir->hld_prev; + /* + * Install at first empty "slot" in directory list. + */ + while (hpdp->hld_next != NULL && (hpdp->hld_next->hld_offset - + hpdp->hld_offset) <= 1) { + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + ASSERT(hpdp->hld_next->hld_offset > hpdp->hld_offset); + hpdp = hpdp->hld_next; + } + hdp->hld_offset = hpdp->hld_offset + 1; + + /* + * If we're at the end of the dirent list and the offset (which is + * necessarily the largest offset in this dir) is more than twice the + * number of dirents, that means the dir is 50% holes. At this point + * we reset the slot pointer back to the beginning of the dir so we + * start using the holes. The idea is that if there are N dirents, + * there must also be N holes, so we can satisfy the next N creates by + * walking at most 2N entries; thus the average cost of a create is + * constant. Note that we use the first dirent's hld_prev as the roving + * slot pointer. This saves a word in every dirent. 
+ */ + if (hpdp->hld_next == NULL && hpdp->hld_offset > 2 * dir->hln_dirents) + dir->hln_dir->hld_prev = dir->hln_dir->hld_next; + else + dir->hln_dir->hld_prev = hdp; + + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + hdp->hld_next = hpdp->hld_next; + if (hdp->hld_next) { + hdp->hld_next->hld_prev = hdp; + } + hdp->hld_prev = hpdp; + hpdp->hld_next = hdp; + + ASSERT(hdp->hld_next != hdp); + ASSERT(hdp->hld_prev != hdp); + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + return (0); +} + +static int +hldir_make_hlnode(hlnode_t *dir, hlfsmount_t *hm, vattr_t *va, enum de_op op, + vnode_t *realvp, hlnode_t **newnode, cred_t *cr) +{ + hlnode_t *hp; + enum vtype type; + + ASSERT(va != NULL); + ASSERT(op == DE_CREATE || op == DE_MKDIR); + if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) || + ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime))) + return (EOVERFLOW); + type = va->va_type; + hp = kmem_zalloc(sizeof (hlnode_t), KM_SLEEP); + hyprlofs_node_init(hm, hp, va, cr); + + hp->hln_vnode->v_rdev = hp->hln_rdev = NODEV; + hp->hln_vnode->v_type = type; + hp->hln_uid = crgetuid(cr); + + /* + * To determine the gid of the created file: + * If the directory's set-gid bit is set, set the gid to the gid + * of the parent dir, otherwise, use the process's gid. + */ + if (dir->hln_mode & VSGID) + hp->hln_gid = dir->hln_gid; + else + hp->hln_gid = crgetgid(cr); + + /* + * If we're creating a dir and the parent dir has the set-GID bit set, + * set it on the new dir. Otherwise, if the user is neither privileged + * nor a member of the file's new group, clear the file's set-GID bit. 
+ */ + if (dir->hln_mode & VSGID && type == VDIR) + hp->hln_mode |= VSGID; + else { + if ((hp->hln_mode & VSGID) && + secpolicy_vnode_setids_setgids(cr, hp->hln_gid) != 0) + hp->hln_mode &= ~VSGID; + } + + if (va->va_mask & AT_ATIME) + hp->hln_atime = va->va_atime; + if (va->va_mask & AT_MTIME) + hp->hln_mtime = va->va_mtime; + + if (op == DE_MKDIR) { + hyprlofs_dirinit(dir, hp); + hp->hln_looped = 0; + } else { + hp->hln_realvp = realvp; + hp->hln_size = va->va_size; + hp->hln_looped = 1; + } + + *newnode = hp; + return (0); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c new file mode 100644 index 0000000000..1d857309f3 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c @@ -0,0 +1,127 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/debug.h> +#include <sys/time.h> +#include <sys/cmn_err.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/vfs.h> +#include <sys/cred.h> +#include <sys/kmem.h> +#include <sys/atomic.h> +#include <sys/policy.h> +#include <sys/fs/hyprlofs_info.h> + +#define MODESHIFT 3 + +/* Initialize a hlnode and add it to file list under mount point. */ +void +hyprlofs_node_init(hlfsmount_t *hm, hlnode_t *h, vattr_t *vap, cred_t *cr) +{ + vnode_t *vp; + timestruc_t now; + + ASSERT(vap != NULL); + + rw_init(&h->hln_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&h->hln_tlock, NULL, MUTEX_DEFAULT, NULL); + h->hln_mode = MAKEIMODE(vap->va_type, vap->va_mode); + h->hln_mask = 0; + h->hln_type = vap->va_type; + h->hln_nodeid = (ino64_t)(uint32_t)((uintptr_t)h >> 3); + h->hln_nlink = 1; + h->hln_size = 0; + + if (cr == NULL) { + h->hln_uid = vap->va_uid; + h->hln_gid = vap->va_gid; + } else { + h->hln_uid = crgetuid(cr); + h->hln_gid = crgetgid(cr); + } + + h->hln_fsid = hm->hlm_dev; + h->hln_rdev = vap->va_rdev; + h->hln_blksize = PAGESIZE; + h->hln_nblocks = 0; + gethrestime(&now); + h->hln_atime = now; + h->hln_mtime = now; + h->hln_ctime = now; + h->hln_seq = 0; + h->hln_dir = NULL; + + h->hln_vnode = vn_alloc(KM_SLEEP); + vp = HLNTOV(h); + vn_setops(vp, hyprlofs_vnodeops); + vp->v_vfsp = hm->hlm_vfsp; + vp->v_type = vap->va_type; + vp->v_rdev = vap->va_rdev; + vp->v_data = (caddr_t)h; + mutex_enter(&hm->hlm_contents); + /* + * Increment the pseudo generation number for this hlnode. Since + * hlnodes are allocated and freed, there really is no particular + * generation number for a new hlnode. Just fake it by using a + * counter in each file system. 
+ */ + h->hln_gen = hm->hlm_gen++; + + /* + * Add new hlnode to end of linked list of hlnodes for this hyprlofs + * Root dir is handled specially in hyprlofs_mount. + */ + if (hm->hlm_rootnode != (hlnode_t *)NULL) { + h->hln_forw = NULL; + h->hln_back = hm->hlm_rootnode->hln_back; + h->hln_back->hln_forw = hm->hlm_rootnode->hln_back = h; + } + mutex_exit(&hm->hlm_contents); + vn_exists(vp); +} + +int +hyprlofs_taccess(void *vtp, int mode, cred_t *cr) +{ + hlnode_t *hp = vtp; + int shift = 0; + + /* Check access based on owner, group and public perms in hlnode. */ + if (crgetuid(cr) != hp->hln_uid) { + shift += MODESHIFT; + if (groupmember(hp->hln_gid, cr) == 0) + shift += MODESHIFT; + } + + return (secpolicy_vnode_access2(cr, HLNTOV(hp), hp->hln_uid, + hp->hln_mode << shift, mode)); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c new file mode 100644 index 0000000000..bf80da6dbe --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c @@ -0,0 +1,613 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + +/* + * Hyperlofs is a hybrid file system combining features of the tmpfs(7FS) and + * lofs(7FS) file systems. It is modeled on code from both of these file + * systems. + * + * The purpose is to create a high performance name space for files on which + * applications will compute. Given a large number of data files with various + * owners, we want to construct a view onto those files such that only a subset + * is visible to the applications and such that the view can be changed very + * quickly as compute progresses. Entries in the name space are not mounts and + * thus do not appear in the mnttab. Entries in the name space are allowed to + * refer to files on different backing file systems. Intermediate directories + * in the name space exist only in-memory, ala tmpfs. There are no leaf nodes + * in the name space except for entries that refer to backing files ala lofs. + * + * The name space is managed via ioctls issued on the mounted file system and + * is mostly read-only for the compute applications. That is, applications + * cannot create new files in the name space. If a file is unlinked by an + * application, that only removes the file from the name space, the backing + * file remains in place. It is possible for applications to write-through to + * the backing files if the file system is mounted read-write. + * + * The name space is managed via the HYPRLOFS_ADD_ENTRIES, HYPRLOFS_RM_ENTRIES, + * and HYPRLOFS_RM_ALL ioctls on the top-level mount. + * + * The HYPRLOFS_ADD_ENTRIES ioctl specifies path(s) to the backing file(s) and + * the name(s) for the file(s) in the name space. The name(s) may be path(s) + * which will be relative to the root of the mount and thus cannot begin with + * a /. If the name is a path, it does not have to correspond to any backing + * path. The intermediate directories will only exist in the name space. The + * entry(ies) will be added to the name space. 
+ * + * The HYPRLOFS_RM_ENTRIES ioctl specifies the name(s) of the file(s) in the + * name space which should be removed. The name(s) may be path(s) which will + * be relative to the root of the mount and thus cannot begin with a /. The + * named entry(ies) will be removed. + * + * The HYPRLOFS_RM_ALL ioctl will remove all mappings from the name space. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/time.h> +#include <sys/pathname.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/stat.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/statvfs.h> +#include <sys/mount.h> +#include <sys/debug.h> +#include <sys/systm.h> +#include <sys/mntent.h> +#include <fs/fs_subr.h> +#include <vm/page.h> +#include <vm/anon.h> +#include <sys/model.h> +#include <sys/policy.h> + +#include <sys/fs/swapnode.h> +#include <sys/fs/hyprlofs_info.h> + +static int hyprlofsfstype; + +/* + * hyprlofs vfs operations. 
+ */ +static int hyprlofsinit(int, char *); +static int hyprlofs_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *); +static int hyprlofs_unmount(vfs_t *, int, cred_t *); +static int hyprlofs_root(vfs_t *, vnode_t **); +static int hyprlofs_statvfs(vfs_t *, struct statvfs64 *); +static int hyprlofs_vget(vfs_t *, vnode_t **, struct fid *); + +/* + * Loadable module wrapper + */ +#include <sys/modctl.h> + +static mntopts_t hyprlofs_mntopts; + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "hyprlofs", + hyprlofsinit, + VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT, + &hyprlofs_mntopts +}; + +static mntopts_t hyprlofs_mntopts = { + 0, NULL +}; + +/* + * Module linkage information + */ +static struct modlfs modlfs = { + &mod_fsops, "filesystem for hyprlofs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modlfs, NULL +}; + +int +_init() +{ + return (mod_install(&modlinkage)); +} + +int +_fini() +{ + int error; + + error = mod_remove(&modlinkage); + if (error) + return (error); + /* + * Tear down the operations vectors + */ + (void) vfs_freevfsops_by_type(hyprlofsfstype); + vn_freevnodeops(hyprlofs_vnodeops); + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* + * The following are patchable variables limiting the amount of system + * resources hyprlofs can use. + * + * hyprlofs_maxkmem limits the amount of kernel kmem_alloc memory hyprlofs can + * use for it's data structures (e.g. hlnodes, directory entries). It is set + * as a percentage of physical memory which is determined when hyprlofs is + * first used in the system. + * + * hyprlofs_minfree is the minimum amount of swap space that hyprlofs leaves for + * the rest of the system. If the amount of free swap space in the system + * (i.e. anoninfo.ani_free) drops below hyprlofs_minfree, hyprlofs anon + * allocations will fail. 
+ */ +size_t hyprlofs_maxkmem = 0; +size_t hyprlofs_minfree = 0; +size_t hyprlofs_kmemspace; /* bytes of kernel heap used by all hyprlofs */ + +static major_t hyprlofs_major; +static minor_t hyprlofs_minor; +static kmutex_t hyprlofs_minor_lock; + +/* + * initialize global hyprlofs locks and hashes when loading hyprlofs module + */ +static int +hyprlofsinit(int fstype, char *name) +{ + static const fs_operation_def_t hl_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = hyprlofs_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = hyprlofs_unmount }, + VFSNAME_ROOT, { .vfs_root = hyprlofs_root }, + VFSNAME_STATVFS, { .vfs_statvfs = hyprlofs_statvfs }, + VFSNAME_VGET, { .vfs_vget = hyprlofs_vget }, + NULL, NULL + }; + int error; + extern void hyprlofs_hash_init(); + + hyprlofs_hash_init(); + hyprlofsfstype = fstype; + ASSERT(hyprlofsfstype != 0); + + error = vfs_setfsops(fstype, hl_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "hyprlofsinit: bad vfs ops template"); + return (error); + } + + error = vn_make_ops(name, hyprlofs_vnodeops_template, + &hyprlofs_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "hyprlofsinit: bad vnode ops template"); + return (error); + } + + /* + * hyprlofs_minfree is an absolute limit of swap space which still + * allows other processes to execute. Set it if its not patched. 
+ */ + if (hyprlofs_minfree == 0) + hyprlofs_minfree = btopr(HYPRLOFSMINFREE); + + if ((hyprlofs_major = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, + "hyprlofsinit: Can't get unique device number."); + hyprlofs_major = 0; + } + mutex_init(&hyprlofs_minor_lock, NULL, MUTEX_DEFAULT, NULL); + return (0); +} + +static int +hyprlofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + hlfsmount_t *hm = NULL; + hlnode_t *hp; + struct pathname dpn; + int error; + vattr_t rattr; + int got_attrs; + + if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) + return (error); + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (uap->flags & MS_REMOUNT) + return (EBUSY); + + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* Having the resource be anything but "swap" doesn't make sense. */ + vfs_setresource(vfsp, "swap", 0); + + if ((error = pn_get(uap->dir, + (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, + &dpn)) != 0) + goto out; + + if ((hm = kmem_zalloc(sizeof (hlfsmount_t), KM_NOSLEEP_LAZY)) == NULL) { + pn_free(&dpn); + error = ENOMEM; + goto out; + } + + /* Get an available minor device number for this mount */ + mutex_enter(&hyprlofs_minor_lock); + do { + hyprlofs_minor = (hyprlofs_minor + 1) & L_MAXMIN32; + hm->hlm_dev = makedevice(hyprlofs_major, hyprlofs_minor); + } while (vfs_devismounted(hm->hlm_dev)); + mutex_exit(&hyprlofs_minor_lock); + + /* + * Set but don't bother entering the mutex since hlfsmount is not on + * the mount list yet. 
+ */ + mutex_init(&hm->hlm_contents, NULL, MUTEX_DEFAULT, NULL); + + hm->hlm_vfsp = vfsp; + + vfsp->vfs_data = (caddr_t)hm; + vfsp->vfs_fstype = hyprlofsfstype; + vfsp->vfs_dev = hm->hlm_dev; + vfsp->vfs_bsize = PAGESIZE; + vfsp->vfs_flag |= VFS_NOTRUNC; + vfs_make_fsid(&vfsp->vfs_fsid, hm->hlm_dev, hyprlofsfstype); + hm->hlm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP); + (void) strcpy(hm->hlm_mntpath, dpn.pn_path); + + /* allocate and initialize root hlnode structure */ + bzero(&rattr, sizeof (vattr_t)); + rattr.va_mode = (mode_t)(S_IFDIR | 0777); + rattr.va_type = VDIR; + rattr.va_rdev = 0; + hp = kmem_zalloc(sizeof (hlnode_t), KM_SLEEP); + hyprlofs_node_init(hm, hp, &rattr, cr); + + /* Get the mode, uid, and gid from the underlying mount point. */ + rattr.va_mask = AT_MODE|AT_UID|AT_GID; + got_attrs = VOP_GETATTR(mvp, &rattr, 0, cr, NULL); + + rw_enter(&hp->hln_rwlock, RW_WRITER); + HLNTOV(hp)->v_flag |= VROOT; + + /* + * If the getattr succeeded, use its results, otherwise allow the + * previously set defaults to prevail. + */ + if (got_attrs == 0) { + hp->hln_mode = rattr.va_mode; + hp->hln_uid = rattr.va_uid; + hp->hln_gid = rattr.va_gid; + } + + /* + * Initialize linked list of hlnodes so that the back pointer of the + * root hlnode always points to the last one on the list and the + * forward pointer of the last node is null + */ + hp->hln_back = hp; + hp->hln_forw = NULL; + hp->hln_nlink = 0; + hm->hlm_rootnode = hp; + + hyprlofs_dirinit(hp, hp); + + rw_exit(&hp->hln_rwlock); + + pn_free(&dpn); + error = 0; + +out: + return (error); +} + +static int +hyprlofs_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hnp, *cancel; + vnode_t *vp; + int error; + + if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) + return (error); + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + /* + * forced unmount is not supported by this file system + * and thus, ENOTSUP, is being returned. 
+ */ + if (flag & MS_FORCE) + return (ENOTSUP); + + mutex_enter(&hm->hlm_contents); + + /* + * If there are no open files, only the root node should have a ref cnt. + * With hlm_contents held, nothing can be added or removed. There may + * be some dirty pages. To prevent fsflush from disrupting the unmount, + * put a hold on each node while scanning. If we find a previously + * referenced node, undo the holds we have placed and fail EBUSY. + */ + hnp = hm->hlm_rootnode; + if (HLNTOV(hnp)->v_count > 1) { + mutex_exit(&hm->hlm_contents); + return (EBUSY); + } + + for (hnp = hnp->hln_forw; hnp; hnp = hnp->hln_forw) { + if ((vp = HLNTOV(hnp))->v_count > 0) { + cancel = hm->hlm_rootnode->hln_forw; + while (cancel != hnp) { + vp = HLNTOV(cancel); + ASSERT(vp->v_count > 0); + VN_RELE(vp); + cancel = cancel->hln_forw; + } + mutex_exit(&hm->hlm_contents); + return (EBUSY); + } + VN_HOLD(vp); + } + + /* We can drop the mutex now because no one can find this mount */ + mutex_exit(&hm->hlm_contents); + + /* + * Free all alloc'd memory associated with this FS. To do this, we go + * through the file list twice, once to remove all the dir entries, and + * then to remove all the files. + */ + + /* Remove all directory entries */ + for (hnp = hm->hlm_rootnode; hnp; hnp = hnp->hln_forw) { + rw_enter(&hnp->hln_rwlock, RW_WRITER); + if (hnp->hln_type == VDIR) + hyprlofs_dirtrunc(hnp); + rw_exit(&hnp->hln_rwlock); + } + + ASSERT(hm->hlm_rootnode); + + /* + * All links are gone, v_count is keeping nodes in place. VN_RELE + * should make the node disappear, unless somebody is holding pages + * against it. Wait and retry until it disappears. + * + * We re-acquire the lock to prevent others who have a HOLD on a hlnode + * from blowing it away (in hyprlofs_inactive) while we're trying to + * get to it here. Once we have a HOLD on it we know it'll stick around. + */ + mutex_enter(&hm->hlm_contents); + + /* Remove all the files (except the rootnode) backwards. 
*/ + while ((hnp = hm->hlm_rootnode->hln_back) != hm->hlm_rootnode) { + mutex_exit(&hm->hlm_contents); + /* Note we handled the link count in pass 2 above. */ + vp = HLNTOV(hnp); + VN_RELE(vp); + mutex_enter(&hm->hlm_contents); + /* + * It's still there after the RELE. Someone else like pageout + * has a hold on it so wait a bit and then try again. + */ + if (hnp == hm->hlm_rootnode->hln_back) { + VN_HOLD(vp); + mutex_exit(&hm->hlm_contents); + delay(hz / 4); + mutex_enter(&hm->hlm_contents); + } + } + mutex_exit(&hm->hlm_contents); + + VN_RELE(HLNTOV(hm->hlm_rootnode)); + + ASSERT(hm->hlm_mntpath); + + kmem_free(hm->hlm_mntpath, strlen(hm->hlm_mntpath) + 1); + + mutex_destroy(&hm->hlm_contents); + kmem_free(hm, sizeof (hlfsmount_t)); + + return (0); +} + +/* Return root hlnode for given vnode */ +static int +hyprlofs_root(vfs_t *vfsp, vnode_t **vpp) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hp = hm->hlm_rootnode; + vnode_t *vp; + + ASSERT(hp); + + vp = HLNTOV(hp); + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +hyprlofs_statvfs(vfs_t *vfsp, struct statvfs64 *sbp) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + ulong_t blocks; + dev32_t d32; + zoneid_t eff_zid; + struct zone *zp; + + /* + * The FS may have been mounted by the GZ on behalf of the NGZ. In + * that case, the hlfsmount zone_id will be the global zone. We want + * to show the swap cap inside the zone in this case, even though the + * FS was mounted by the GZ. 
+ */ + if (curproc->p_zone->zone_id != GLOBAL_ZONEUNIQID) + zp = curproc->p_zone; + else + zp = hm->hlm_vfsp->vfs_zone; + + if (zp == NULL) + eff_zid = GLOBAL_ZONEUNIQID; + else + eff_zid = zp->zone_id; + + sbp->f_bsize = PAGESIZE; + sbp->f_frsize = PAGESIZE; + + /* + * Find the amount of available physical and memory swap + */ + mutex_enter(&anoninfo_lock); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP; + mutex_exit(&anoninfo_lock); + + if (blocks > hyprlofs_minfree) + sbp->f_bfree = blocks - hyprlofs_minfree; + else + sbp->f_bfree = 0; + + sbp->f_bavail = sbp->f_bfree; + + /* + * Total number of blocks is what's available plus what's been used + */ + sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree); + + if (eff_zid != GLOBAL_ZONEUNIQID && + zp->zone_max_swap_ctl != UINT64_MAX) { + /* + * If the fs is used by a NGZ with a swap cap, then report the + * capped size. + */ + rctl_qty_t cap, used; + pgcnt_t pgcap, pgused; + + mutex_enter(&zp->zone_mem_lock); + cap = zp->zone_max_swap_ctl; + used = zp->zone_max_swap; + mutex_exit(&zp->zone_mem_lock); + + pgcap = btop(cap); + pgused = btop(used); + + sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree); + sbp->f_bavail = sbp->f_bfree; + sbp->f_blocks = MIN(pgcap, sbp->f_blocks); + } + + /* + * This is fairly inaccurate since it doesn't take into account the + * names stored in the directory entries. 
+ */ + sbp->f_ffree = sbp->f_files = ptob(availrmem) / + (sizeof (hlnode_t) + sizeof (hldirent_t)); + + sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); + (void) cmpldev(&d32, vfsp->vfs_dev); + sbp->f_fsid = d32; + (void) strcpy(sbp->f_basetype, vfssw[hyprlofsfstype].vsw_name); + (void) strncpy(sbp->f_fstr, hm->hlm_mntpath, sizeof (sbp->f_fstr)); + /* + * ensure null termination + */ + sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0'; + sbp->f_flag = vf_to_stf(vfsp->vfs_flag); + sbp->f_namemax = MAXNAMELEN - 1; + return (0); +} + +static int +hyprlofs_vget(vfs_t *vfsp, vnode_t **vpp, struct fid *fidp) +{ + hlfid_t *hfid; + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hp = NULL; + + hfid = (hlfid_t *)fidp; + *vpp = NULL; + + mutex_enter(&hm->hlm_contents); + for (hp = hm->hlm_rootnode; hp; hp = hp->hln_forw) { + mutex_enter(&hp->hln_tlock); + if (hp->hln_nodeid == hfid->hlfid_ino) { + /* + * If the gen numbers don't match we know the file + * won't be found since only one hlnode can have this + * number at a time. + */ + if (hp->hln_gen != hfid->hlfid_gen || + hp->hln_nlink == 0) { + mutex_exit(&hp->hln_tlock); + mutex_exit(&hm->hlm_contents); + return (0); + } + *vpp = (vnode_t *)HLNTOV(hp); + + VN_HOLD(*vpp); + + if ((hp->hln_mode & S_ISVTX) && + !(hp->hln_mode & (S_IXUSR | S_IFDIR))) { + mutex_enter(&(*vpp)->v_lock); + (*vpp)->v_flag |= VISSWAP; + mutex_exit(&(*vpp)->v_lock); + } + mutex_exit(&hp->hln_tlock); + mutex_exit(&hm->hlm_contents); + return (0); + } + mutex_exit(&hp->hln_tlock); + } + mutex_exit(&hm->hlm_contents); + return (0); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c new file mode 100644 index 0000000000..52dba31761 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c @@ -0,0 +1,1450 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/user.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/fcntl.h> +#include <sys/flock.h> +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/cred.h> +#include <sys/dirent.h> +#include <sys/pathname.h> +#include <sys/fs/hyprlofs.h> +#include <sys/fs/hyprlofs_info.h> +#include <sys/mman.h> +#include <vm/pvn.h> +#include <sys/cmn_err.h> +#include <sys/buf.h> +#include <sys/policy.h> +#include <fs/fs_subr.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> + +static int hyprlofs_add_entry(vnode_t *, char *, char *, cred_t *, + caller_context_t *); +static int hyprlofs_rm_entry(vnode_t *, char *, cred_t *, caller_context_t *, + int); +static int hyprlofs_rm_all(vnode_t *, cred_t *, caller_context_t *, int); +static int hyprlofs_remove(vnode_t *, char *, cred_t *, caller_context_t *, + int); +static int hyprlofs_get_all(vnode_t *, intptr_t, cred_t *, caller_context_t *, + int); + +/* + * This is a somewhat arbitrary upper limit on the number of entries we can + * pass in on a single add/rm ioctl call. 
This is only used to validate that + * the input list looks sane. + */ +#define MAX_IOCTL_PARAMS 100000 + +static int +hyprlofs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + vnode_t *rvp; + int error; + + rvp = REALVP(*vpp); + + if (VTOHLN(*vpp)->hln_looped == 0) + return (0); + + /* + * looped back, pass through to real vnode. Need to hold new reference + * to vp since VOP_OPEN() may decide to release it. + */ + VN_HOLD(rvp); + error = VOP_OPEN(&rvp, flag, cr, ct); + ASSERT(rvp->v_count > 1); + VN_RELE(rvp); + + return (error); +} + +static int +hyprlofs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 0) { + cleanlocks(vp, ttoproc(curthread)->p_pid, 0); + cleanshares(vp, ttoproc(curthread)->p_pid); + return (0); + } + + return (VOP_CLOSE(REALVP(vp), flag, count, offset, cr, ct)); +} + +static int +hyprlofs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + if (vp->v_type == VDIR) + return (EISDIR); + return (VOP_READ(REALVP(vp), uiop, ioflag, cr, ct)); +} + +static int +hyprlofs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + /* We don't support writing to non-regular files */ + if (vp->v_type != VREG) + return (EINVAL); + + if (vn_is_readonly(vp)) + return (EROFS); + + return (VOP_WRITE(REALVP(vp), uiop, ioflag, cr, ct)); +} + +/* ARGSUSED */ +static int +hyprlofs_ioctl(vnode_t *vp, int cmd, intptr_t data, int flag, + cred_t *cr, int *rvalp, caller_context_t *ct) +{ + uint_t len, cnt; + int i, error; + model_t model; + char path[MAXPATHLEN]; + char nm[MAXPATHLEN]; + + /* We only support the hyprlofs ioctls on the root vnode */ + if (!(vp->v_flag & VROOT)) + return (ENOTTY); + + /* + * Check if managing hyprlofs is allowed. 
+ */ + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + if (cmd == HYPRLOFS_ADD_ENTRIES || cmd == HYPRLOFS_RM_ENTRIES) { + model = get_udatamodel(); + + if (model == DATAMODEL_NATIVE) { + hyprlofs_entries_t ebuf; + hyprlofs_entry_t *e; + + if (copyin((void *)data, &ebuf, sizeof (ebuf))) + return (EFAULT); + cnt = ebuf.hle_len; + if (cnt > MAX_IOCTL_PARAMS) + return (EINVAL); + len = sizeof (hyprlofs_entry_t) * cnt; + + e = kmem_alloc(len, KM_SLEEP); + if (copyin((void *)(ebuf.hle_entries), e, len)) { + kmem_free(e, len); + return (EFAULT); + } + + for (i = 0; i < cnt; i++) { + if (e[i].hle_nlen == 0 || + e[i].hle_nlen >= sizeof (nm)) { + kmem_free(e, len); + return (EINVAL); + } + + if (copyin(e[i].hle_name, nm, e[i].hle_nlen) + != 0) { + kmem_free(e, len); + return (EFAULT); + } + nm[e[i].hle_nlen] = '\0'; + + if (cmd == HYPRLOFS_ADD_ENTRIES) { + if (e[i].hle_plen == 0 || + e[i].hle_plen >= sizeof (path)) { + kmem_free(e, len); + return (EINVAL); + } + + if (copyin(e[i].hle_path, path, + e[i].hle_plen) != 0) { + kmem_free(e, len); + return (EFAULT); + } + path[e[i].hle_plen] = '\0'; + + if ((error = hyprlofs_add_entry(vp, + path, nm, cr, ct)) != 0) { + kmem_free(e, len); + return (error); + } + } else { + if ((error = hyprlofs_rm_entry(vp, nm, + cr, ct, flag)) != 0) { + kmem_free(e, len); + return (error); + } + } + } + + kmem_free(e, len); + return (0); + + } else { + hyprlofs_entries32_t ebuf32; + hyprlofs_entry32_t *e32; + + if (copyin((void *)data, &ebuf32, sizeof (ebuf32))) + return (EFAULT); + + cnt = ebuf32.hle_len; + if (cnt > MAX_IOCTL_PARAMS) + return (EINVAL); + len = sizeof (hyprlofs_entry32_t) * cnt; + + e32 = kmem_alloc(len, KM_SLEEP); + if (copyin((void *)(unsigned long)(ebuf32.hle_entries), + e32, len)) { + kmem_free(e32, len); + return (EFAULT); + } + + for (i = 0; i < cnt; i++) { + if (e32[i].hle_nlen == 0 || + e32[i].hle_nlen >= sizeof (nm)) { + kmem_free(e32, len); + return (EINVAL); + } + + if (copyin((void *)(unsigned long) + 
e32[i].hle_name, nm, + e32[i].hle_nlen) != 0) { + kmem_free(e32, len); + return (EFAULT); + } + nm[e32[i].hle_nlen] = '\0'; + + if (cmd == HYPRLOFS_ADD_ENTRIES) { + if (e32[i].hle_plen == 0 || + e32[i].hle_plen >= sizeof (path)) { + kmem_free(e32, len); + return (EINVAL); + } + + if (copyin((void *)(unsigned long) + e32[i].hle_path, path, + e32[i].hle_plen) != 0) { + kmem_free(e32, len); + return (EFAULT); + } + path[e32[i].hle_plen] = '\0'; + + if ((error = hyprlofs_add_entry(vp, + path, nm, cr, ct)) != 0) { + kmem_free(e32, len); + return (error); + } + } else { + if ((error = hyprlofs_rm_entry(vp, nm, + cr, ct, flag)) != 0) { + kmem_free(e32, len); + return (error); + } + } + } + + kmem_free(e32, len); + return (0); + } + } + + if (cmd == HYPRLOFS_RM_ALL) { + return (hyprlofs_rm_all(vp, cr, ct, flag)); + } + + if (cmd == HYPRLOFS_GET_ENTRIES) { + return (hyprlofs_get_all(vp, data, cr, ct, flag)); + } + + return (ENOTTY); +} + +static int +hyprlofs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + vattr_t tmp_va; + + if (tp->hln_looped == 1) { + int error; + + if ((error = VOP_GETATTR(REALVP(vp), &tmp_va, flags, cr, + ct)) != 0) + return (error); + } + + mutex_enter(&tp->hln_tlock); + vap->va_type = vp->v_type; + vap->va_mode = tp->hln_mode & MODEMASK; + vap->va_uid = tp->hln_uid; + vap->va_gid = tp->hln_gid; + vap->va_fsid = tp->hln_fsid; + vap->va_nodeid = (ino64_t)tp->hln_nodeid; + vap->va_nlink = tp->hln_nlink; + vap->va_size = (u_offset_t)tp->hln_size; + vap->va_atime = tp->hln_atime; + vap->va_mtime = tp->hln_mtime; + vap->va_ctime = tp->hln_ctime; + vap->va_blksize = PAGESIZE; + vap->va_rdev = tp->hln_rdev; + vap->va_seq = tp->hln_seq; + + if (tp->hln_looped == 1) { + vap->va_nblocks = tmp_va.va_nblocks; + } else { + vap->va_nblocks = + (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size))); + } + mutex_exit(&tp->hln_tlock); + return (0); +} + +/*ARGSUSED4*/ +static int 
+hyprlofs_setattr(vnode_t *vp, vattr_t *vap, int flags, + cred_t *cr, caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + int error = 0; + vattr_t *get; + long mask; + + /* + * Cannot set these attributes + */ + if ((vap->va_mask & AT_NOSET) || (vap->va_mask & AT_XVATTR)) + return (EINVAL); + + mutex_enter(&tp->hln_tlock); + + get = &tp->hln_attr; + /* + * Change file access modes. Must be owner or have sufficient + * privileges. + */ + error = secpolicy_vnode_setattr(cr, vp, vap, get, flags, + hyprlofs_taccess, tp); + + if (error) + goto out; + + mask = vap->va_mask; + + if (mask & AT_MODE) { + get->va_mode &= S_IFMT; + get->va_mode |= vap->va_mode & ~S_IFMT; + } + + if (mask & AT_UID) + get->va_uid = vap->va_uid; + if (mask & AT_GID) + get->va_gid = vap->va_gid; + if (mask & AT_ATIME) + get->va_atime = vap->va_atime; + if (mask & AT_MTIME) + get->va_mtime = vap->va_mtime; + + if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME)) + gethrestime(&tp->hln_ctime); + +out: + mutex_exit(&tp->hln_tlock); + return (error); +} + +static int +hyprlofs_access(vnode_t *vp, int mode, int flags, cred_t *cr, + caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + int error; + + if (mode & VWRITE) { + if (vp->v_type == VREG && vn_is_readonly(vp)) + return (EROFS); + } + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_ACCESS(REALVP(vp), mode, flags, cr, ct)); + + mutex_enter(&tp->hln_tlock); + error = hyprlofs_taccess(tp, mode, cr); + mutex_exit(&tp->hln_tlock); + return (error); +} + +/* ARGSUSED3 */ +static int +hyprlofs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(dvp); + hlnode_t *ntp = NULL; + int error; + + if (VTOHLN(dvp)->hln_looped == 1) + return (VOP_LOOKUP(REALVP(dvp), nm, vpp, pnp, flags, rdir, + cr, ct, direntflags, realpnp)); + + if (flags & LOOKUP_XATTR) + return (EINVAL); + + 
/* Null component name is a synonym for directory being searched. */ + if (*nm == '\0') { + VN_HOLD(dvp); + *vpp = dvp; + return (0); + } + ASSERT(tp); + + if ((error = hyprlofs_dirlookup(tp, nm, &ntp, cr)) == 0) { + ASSERT(ntp); + *vpp = HLNTOV(ntp); + } + return (error); +} + +/* + * Create the loopback from the hyprlofs vnode to the real vnode. + */ +static int +hyprlofs_loopback(vnode_t *dvp, vnode_t *rvp, char *nm, vattr_t *vap, + int mode, cred_t *cr, caller_context_t *ct) +{ + hlnode_t *parent; + hlfsmount_t *tm; + int error; + hlnode_t *oldtp; + vnode_t *vp; + + parent = (hlnode_t *)VTOHLN(dvp); + tm = (hlfsmount_t *)VTOHLM(dvp); + error = 0; + oldtp = NULL; + + if (vap->va_type == VREG && (vap->va_mode & VSVTX)) { + /* we don't support the sticky bit */ + vap->va_mode &= ~VSVTX; + } else if (vap->va_type == VNON) { + return (EINVAL); + } + + /* Null component name is a synonym for directory being searched. */ + if (*nm == '\0') { + VN_HOLD(dvp); + oldtp = parent; + } else { + error = hyprlofs_dirlookup(parent, nm, &oldtp, cr); + } + + if (error == 0) { /* name found */ + ASSERT(oldtp); + + rw_enter(&oldtp->hln_rwlock, RW_WRITER); + + /* + * if create/read-only an existing directory, allow it + */ + if ((oldtp->hln_type == VDIR) && (mode & VWRITE)) + error = EISDIR; + else { + error = hyprlofs_taccess(oldtp, mode, cr); + } + + if (error) { + rw_exit(&oldtp->hln_rwlock); + hlnode_rele(oldtp); + return (error); + } + + vp = HLNTOV(oldtp); + rw_exit(&oldtp->hln_rwlock); + + if (vp->v_type == VREG) { + hlnode_rele(oldtp); + return (EEXIST); + } + + vnevent_create(vp, ct); + return (0); + } + + if (error != ENOENT) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + error = hyprlofs_direnter(tm, parent, nm, DE_CREATE, rvp, vap, NULL, + cr); + rw_exit(&parent->hln_rwlock); + + return (error); +} + +/* + * Create an in-memory directory based on the add-entry ioctl name. + * If the dir exists, return EEXIST but still also return node in vpp. 
+ */ +static int +hyprlofs_mkdir(vnode_t *dvp, char *nm, vattr_t *va, vnode_t **vpp, cred_t *cr) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + hlnode_t *self = NULL; + hlfsmount_t *tm = (hlfsmount_t *)VTOHLM(dvp); + int error; + + /* + * Might be dangling directory. Catch it here, because a ENOENT return + * from hyprlofs_dirlookup() is a valid return. + */ + if (parent->hln_nlink == 0) + return (ENOENT); + + error = hyprlofs_dirlookup(parent, nm, &self, cr); + if (error == 0) { + ASSERT(self); + hlnode_rele(self); + /* We can't loop in under a looped in directory */ + if (self->hln_looped) + return (EACCES); + *vpp = HLNTOV(self); + return (EEXIST); + } + if (error != ENOENT) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + error = hyprlofs_direnter(tm, parent, nm, DE_MKDIR, (vnode_t *)NULL, + va, &self, cr); + rw_exit(&parent->hln_rwlock); + + if (error == 0 || error == EEXIST) { + hlnode_rele(self); + *vpp = HLNTOV(self); + } + + return (error); +} + +/* + * Loop in a file or directory into the namespace. + */ +static int +hyprlofs_add_entry(vnode_t *vp, char *fspath, char *fsname, + cred_t *cr, caller_context_t *ct) +{ + int error; + char *p, *pnm; + vnode_t *realvp, *dvp; + vattr_t va; + + /* + * Get vnode for the real file/dir. We'll have a hold on realvp which + * we won't vn_rele until hyprlofs_inactive. + */ + if ((error = lookupname(fspath, UIO_SYSSPACE, FOLLOW, NULLVPP, + &realvp)) != 0) + return (error); + + /* no devices allowed */ + if (IS_DEVVP(realvp)) { + VN_RELE(realvp); + return (ENODEV); + } + + /* + * realvp may be an AUTOFS node, in which case we perform a VOP_ACCESS + * to trigger the mount of the intended filesystem. This causes a + * loopback mount of the intended filesystem instead of the AUTOFS + * filesystem. + */ + if ((error = VOP_ACCESS(realvp, 0, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + /* + * We're interested in the top most filesystem. 
This is specially + * important when fspath is a trigger AUTOFS node, since we're really + * interested in mounting the filesystem AUTOFS mounted as result of + * the VOP_ACCESS() call not the AUTOFS node itself. + */ + if (vn_mountedvfs(realvp) != NULL) { + if ((error = traverse(&realvp)) != 0) { + VN_RELE(realvp); + return (error); + } + } + + va.va_type = VNON; + /* + * If the target name is a path, make sure we have all of the + * intermediate directories, creating them if necessary. + */ + dvp = vp; + pnm = p = fsname; + + /* path cannot be absolute */ + if (*p == '/') { + VN_RELE(realvp); + return (EINVAL); + } + + for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) { + if (va.va_type == VNON) + /* use the top-level dir as the template va for mkdir */ + if ((error = VOP_GETATTR(vp, &va, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + *p = '\0'; + + /* Path component cannot be empty or relative */ + if (pnm[0] == '\0' || + (pnm[0] == '.' && pnm[1] == '.' && pnm[2] == '\0')) { + VN_RELE(realvp); + return (EINVAL); + } + + if ((error = hyprlofs_mkdir(dvp, pnm, &va, &dvp, cr)) != 0 && + error != EEXIST) { + VN_RELE(realvp); + return (error); + } + + *p = '/'; + pnm = p + 1; + } + + /* The file name is required */ + if (pnm[0] == '\0') { + VN_RELE(realvp); + return (EINVAL); + } + + /* Now use the real file's va as the template va */ + if ((error = VOP_GETATTR(realvp, &va, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + /* Make the vnode */ + error = hyprlofs_loopback(dvp, realvp, pnm, &va, va.va_mode, cr, ct); + if (error != 0) + VN_RELE(realvp); + return (error); +} + +/* + * Remove a looped in file from the namespace. 
+ */ +static int +hyprlofs_rm_entry(vnode_t *dvp, char *fsname, cred_t *cr, caller_context_t *ct, + int flags) +{ + int error; + char *p, *pnm; + hlnode_t *parent; + hlnode_t *fndtp; + + pnm = p = fsname; + + /* path cannot be absolute */ + if (*p == '/') + return (EINVAL); + + /* + * If the target name is a path, get the containing dir and simple + * file name. + */ + parent = (hlnode_t *)VTOHLN(dvp); + for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) { + *p = '\0'; + + /* Path component cannot be empty or relative */ + if (pnm[0] == '\0' || + (pnm[0] == '.' && pnm[1] == '.' && pnm[2] == '\0')) + return (EINVAL); + + if ((error = hyprlofs_dirlookup(parent, pnm, &fndtp, cr)) != 0) + return (error); + + dvp = HLNTOV(fndtp); + parent = fndtp; + pnm = p + 1; + } + + /* The file name is required */ + if (pnm[0] == '\0') + return (EINVAL); + + /* Remove the entry from the parent dir */ + return (hyprlofs_remove(dvp, pnm, cr, ct, flags)); +} + +/* + * Remove all looped in files from the namespace. + */ +static int +hyprlofs_rm_all(vnode_t *dvp, cred_t *cr, caller_context_t *ct, + int flags) +{ + int error = 0; + hlnode_t *hp = (hlnode_t *)VTOHLN(dvp); + hldirent_t *hdp; + + hlnode_hold(hp); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. 
+ */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + goto done; + } + + hdp = hp->hln_dir; + while (hdp) { + hlnode_t *fndhp; + + if (strcmp(hdp->hld_name, ".") == 0 || + strcmp(hdp->hld_name, "..") == 0) { + hdp = hdp->hld_next; + continue; + } + + /* This holds the fndhp vnode */ + error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr); + if (error != 0) + goto done; + hlnode_rele(fndhp); + + if (fndhp->hln_looped == 0) { + /* recursively remove contents of this subdir */ + if (fndhp->hln_type == VDIR) { + vnode_t *tvp = HLNTOV(fndhp); + + error = hyprlofs_rm_all(tvp, cr, ct, flags); + if (error != 0) + goto done; + } + } + + /* remove the entry */ + error = hyprlofs_remove(dvp, hdp->hld_name, cr, ct, flags); + if (error != 0) + goto done; + + hdp = hp->hln_dir; + } + +done: + hlnode_rele(hp); + return (error); +} + +/* + * Get a list of all looped in files in the namespace. + */ +static int +hyprlofs_get_all_entries(vnode_t *dvp, hyprlofs_curr_entry_t *hcp, + char *prefix, uint_t *pcnt, uint_t n_max, + cred_t *cr, caller_context_t *ct, int flags) +{ + int error = 0; + int too_big = 0; + uint_t cnt; + uint_t len; + hlnode_t *hp = (hlnode_t *)VTOHLN(dvp); + hldirent_t *hdp; + char *path; + + cnt = *pcnt; + path = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + hlnode_hold(hp); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. 
+ */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + goto done; + } + + hdp = hp->hln_dir; + while (hdp) { + hlnode_t *fndhp; + vnode_t *tvp; + + if (strcmp(hdp->hld_name, ".") == 0 || + strcmp(hdp->hld_name, "..") == 0) { + hdp = hdp->hld_next; + continue; + } + + /* This holds the fndhp vnode */ + error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr); + if (error != 0) + goto done; + hlnode_rele(fndhp); + + if (fndhp->hln_looped == 0) { + /* recursively get contents of this subdir */ + VERIFY(fndhp->hln_type == VDIR); + tvp = HLNTOV(fndhp); + + if (*prefix == '\0') + (void) strlcpy(path, hdp->hld_name, MAXPATHLEN); + else + (void) snprintf(path, MAXPATHLEN, "%s/%s", + prefix, hdp->hld_name); + + error = hyprlofs_get_all_entries(tvp, hcp, path, + &cnt, n_max, cr, ct, flags); + + if (error == E2BIG) { + too_big = 1; + error = 0; + } + if (error != 0) + goto done; + } else { + if (cnt < n_max) { + char *p; + + if (*prefix == '\0') + (void) strlcpy(path, hdp->hld_name, + MAXPATHLEN); + else + (void) snprintf(path, MAXPATHLEN, + "%s/%s", prefix, hdp->hld_name); + + len = strlen(path); + ASSERT(len <= MAXPATHLEN); + if (copyout(path, (void *)(hcp[cnt].hce_name), + len)) { + error = EFAULT; + goto done; + } + + tvp = REALVP(HLNTOV(fndhp)); + if (tvp->v_path == vn_vpath_empty) { + p = "<unknown>"; + } else { + p = tvp->v_path; + } + len = strlen(p); + ASSERT(len <= MAXPATHLEN); + if (copyout(p, (void *)(hcp[cnt].hce_path), + len)) { + error = EFAULT; + goto done; + } + } + + cnt++; + if (cnt > n_max) + too_big = 1; + } + + hdp = hdp->hld_next; + } + +done: + hlnode_rele(hp); + kmem_free(path, MAXPATHLEN); + + *pcnt = cnt; + if (error == 0 && too_big == 1) + error = E2BIG; + + return (error); +} + +/* + * Return a list of all looped in files in the namespace. 
+ */ +static int +hyprlofs_get_all(vnode_t *dvp, intptr_t data, cred_t *cr, caller_context_t *ct, + int flags) +{ + uint_t limit, cnt; + int error; + model_t model; + hyprlofs_curr_entry_t *e; + + model = get_udatamodel(); + + if (model == DATAMODEL_NATIVE) { + hyprlofs_curr_entries_t ebuf; + + if (copyin((void *)data, &ebuf, sizeof (ebuf))) + return (EFAULT); + limit = ebuf.hce_cnt; + e = ebuf.hce_entries; + if (limit > MAX_IOCTL_PARAMS) + return (EINVAL); + + } else { + hyprlofs_curr_entries32_t ebuf32; + + if (copyin((void *)data, &ebuf32, sizeof (ebuf32))) + return (EFAULT); + + limit = ebuf32.hce_cnt; + e = (hyprlofs_curr_entry_t *)(unsigned long) + (ebuf32.hce_entries); + if (limit > MAX_IOCTL_PARAMS) + return (EINVAL); + } + + cnt = 0; + error = hyprlofs_get_all_entries(dvp, e, "", &cnt, limit, cr, ct, + flags); + + if (error == 0 || error == E2BIG) { + if (model == DATAMODEL_NATIVE) { + hyprlofs_curr_entries_t ebuf; + + ebuf.hce_cnt = cnt; + if (copyout(&ebuf, (void *)data, sizeof (ebuf))) + return (EFAULT); + + } else { + hyprlofs_curr_entries32_t ebuf32; + + ebuf32.hce_cnt = cnt; + if (copyout(&ebuf32, (void *)data, sizeof (ebuf32))) + return (EFAULT); + } + } + + return (error); +} + +/* ARGSUSED3 */ +static int +hyprlofs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, + int flags) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + int error; + hlnode_t *hp = NULL; + + /* This holds the hp vnode */ + error = hyprlofs_dirlookup(parent, nm, &hp, cr); + if (error) + return (error); + + ASSERT(hp); + rw_enter(&parent->hln_rwlock, RW_WRITER); + rw_enter(&hp->hln_rwlock, RW_WRITER); + + error = hyprlofs_dirdelete(parent, hp, nm, DR_REMOVE, cr); + + rw_exit(&hp->hln_rwlock); + rw_exit(&parent->hln_rwlock); + vnevent_remove(HLNTOV(hp), dvp, nm, ct); + + /* + * We've now dropped the dir link so by rele-ing our vnode we should + * clean up in hyprlofs_inactive. 
+ */ + hlnode_rele(hp); + + return (error); +} + +/* ARGSUSED4 */ +static int +hyprlofs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr, + caller_context_t *ct, int flags) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + hlnode_t *self = NULL; + vnode_t *vp; + int error = 0; + + /* Return error if removing . or .. */ + if (strcmp(nm, ".") == 0) + return (EINVAL); + if (strcmp(nm, "..") == 0) + return (EEXIST); /* Should be ENOTEMPTY */ + error = hyprlofs_dirlookup(parent, nm, &self, cr); + if (error) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + rw_enter(&self->hln_rwlock, RW_WRITER); + + vp = HLNTOV(self); + if (vp == dvp || vp == cdir) { + error = EINVAL; + goto done1; + } + if (self->hln_type != VDIR) { + error = ENOTDIR; + goto done1; + } + + /* + * When a dir is looped in, we only remove the in-memory dir, not the + * backing dir. + */ + if (self->hln_looped == 0) { + mutex_enter(&self->hln_tlock); + if (self->hln_nlink > 2) { + mutex_exit(&self->hln_tlock); + error = EEXIST; + goto done1; + } + mutex_exit(&self->hln_tlock); + + if (vn_vfswlock(vp)) { + error = EBUSY; + goto done1; + } + if (vn_mountedvfs(vp) != NULL) { + error = EBUSY; + goto done; + } + + /* + * Check for an empty directory, i.e. only includes entries for + * "." and ".." + */ + if (self->hln_dirents > 2) { + error = EEXIST; /* SIGH should be ENOTEMPTY */ + /* + * Update atime because checking hln_dirents is + * equivalent to reading the directory + */ + gethrestime(&self->hln_atime); + goto done; + } + + error = hyprlofs_dirdelete(parent, self, nm, DR_RMDIR, cr); + } else { + error = hyprlofs_dirdelete(parent, self, nm, DR_REMOVE, cr); + } + +done: + if (self->hln_looped == 0) + vn_vfsunlock(vp); +done1: + rw_exit(&self->hln_rwlock); + rw_exit(&parent->hln_rwlock); + vnevent_rmdir(HLNTOV(self), dvp, nm, ct); + + /* + * We've now dropped the dir link so by rele-ing our vnode we should + * clean up in hyprlofs_inactive. 
+ */ + hlnode_rele(self); + + return (error); +} + +static int +hyprlofs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hldirent_t *hdp; + int error = 0; + size_t namelen; + struct dirent64 *dp; + ulong_t offset; + ulong_t total_bytes_wanted; + ulong_t outcount = 0; + ulong_t bufsize; + size_t reclen; + caddr_t outbuf; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_READDIR(REALVP(vp), uiop, cr, eofp, ct, flags)); + + if (uiop->uio_loffset >= MAXOFF_T) { + if (eofp) + *eofp = 1; + return (0); + } + /* assuming syscall has already called hln_rwlock */ + ASSERT(RW_READ_HELD(&hp->hln_rwlock)); + + if (uiop->uio_iovcnt != 1) + return (EINVAL); + + if (vp->v_type != VDIR) + return (ENOTDIR); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. + */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + return (0); + } + + /* Get space for multiple dir entries */ + total_bytes_wanted = uiop->uio_iov->iov_len; + bufsize = total_bytes_wanted + sizeof (struct dirent64); + outbuf = kmem_alloc(bufsize, KM_SLEEP); + + dp = (struct dirent64 *)((uintptr_t)outbuf); + + offset = 0; + hdp = hp->hln_dir; + while (hdp) { + namelen = strlen(hdp->hld_name); /* no +1 needed */ + offset = hdp->hld_offset; + if (offset >= uiop->uio_offset) { + reclen = DIRENT64_RECLEN(namelen); + if (outcount + reclen > total_bytes_wanted) { + if (!outcount) + /* Buffer too small for any entries. 
*/ + error = EINVAL; + break; + } + ASSERT(hdp->hld_hlnode != NULL); + + /* zero out uninitialized bytes */ + (void) strncpy(dp->d_name, hdp->hld_name, + DIRENT64_NAMELEN(reclen)); + dp->d_reclen = (ushort_t)reclen; + dp->d_ino = (ino64_t)hdp->hld_hlnode->hln_nodeid; + dp->d_off = (offset_t)hdp->hld_offset + 1; + dp = (struct dirent64 *) + ((uintptr_t)dp + dp->d_reclen); + outcount += reclen; + ASSERT(outcount <= bufsize); + } + hdp = hdp->hld_next; + } + + if (!error) + error = uiomove(outbuf, outcount, UIO_READ, uiop); + + if (!error) { + /* + * If we reached the end of the list our offset should now be + * just past the end. + */ + if (!hdp) { + offset += 1; + if (eofp) + *eofp = 1; + } else if (eofp) + *eofp = 0; + uiop->uio_offset = offset; + } + gethrestime(&hp->hln_atime); + kmem_free(outbuf, bufsize); + return (error); +} + +static int +hyprlofs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_FSYNC(REALVP(vp), syncflag, cr, ct)); + return (0); +} + +/* ARGSUSED */ +static void +hyprlofs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vp->v_vfsp); + + rw_enter(&hp->hln_rwlock, RW_WRITER); + + mutex_enter(&hp->hln_tlock); + mutex_enter(&vp->v_lock); + ASSERT(vp->v_count >= 1); + + /* + * If we don't have the last hold or the link count is non-zero, + * there's nothing to do except drop our hold. 
+ */ + if (vp->v_count > 1 || hp->hln_nlink != 0) { + vp->v_count--; + mutex_exit(&vp->v_lock); + mutex_exit(&hp->hln_tlock); + rw_exit(&hp->hln_rwlock); + return; + } + + mutex_exit(&vp->v_lock); + mutex_exit(&hp->hln_tlock); + + /* release hold on the real vnode now */ + if (hp->hln_looped == 1 && hp->hln_realvp != NULL) + VN_RELE(hp->hln_realvp); + + /* Here's our chance to send invalid event while we're between locks */ + vn_invalid(HLNTOV(hp)); + + mutex_enter(&hm->hlm_contents); + if (hp->hln_forw == NULL) + hm->hlm_rootnode->hln_back = hp->hln_back; + else + hp->hln_forw->hln_back = hp->hln_back; + hp->hln_back->hln_forw = hp->hln_forw; + mutex_exit(&hm->hlm_contents); + rw_exit(&hp->hln_rwlock); + rw_destroy(&hp->hln_rwlock); + mutex_destroy(&hp->hln_tlock); + vn_free(HLNTOV(hp)); + kmem_free(hp, sizeof (hlnode_t)); +} + +static int +hyprlofs_fid(vnode_t *vp, struct fid *fidp, caller_context_t *ct) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hlfid_t *hfid; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_FID(REALVP(vp), fidp, ct)); + + if (fidp->fid_len < (sizeof (hlfid_t) - sizeof (ushort_t))) { + fidp->fid_len = sizeof (hlfid_t) - sizeof (ushort_t); + return (ENOSPC); + } + + hfid = (hlfid_t *)fidp; + bzero(hfid, sizeof (hlfid_t)); + hfid->hlfid_len = (int)sizeof (hlfid_t) - sizeof (ushort_t); + + hfid->hlfid_ino = hp->hln_nodeid; + hfid->hlfid_gen = hp->hln_gen; + + return (0); +} + +static int +hyprlofs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, + page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw, + cred_t *cr, caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_GETPAGE(REALVP(vp), off, len, protp, pl, plsz, seg, addr, + rw, cr, ct)); +} + +int +hyprlofs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, + cred_t *cr, caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if 
(VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_PUTPAGE(REALVP(vp), off, len, flags, cr, ct)); +} + +static int +hyprlofs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_MAP(REALVP(vp), off, as, addrp, len, prot, maxprot, flags, + cr, ct)); +} + +static int +hyprlofs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_ADDMAP(REALVP(vp), off, as, addr, len, prot, maxprot, + flags, cr, ct)); +} + +static int +hyprlofs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_DELMAP(REALVP(vp), off, as, addr, len, prot, maxprot, + flags, cr, ct)); +} + +static int +hyprlofs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, + offset_t offset, cred_t *cr, caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_SPACE(REALVP(vp), cmd, bfp, flag, offset, cr, ct)); +} + +static int +hyprlofs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, + caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 0) + return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? 
EINVAL : 0); + + return (VOP_SEEK(REALVP(vp), ooff, noffp, ct)); +} + +static int +hyprlofs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + hlnode_t *hp = VTOHLN(vp); + + if (hp->hln_looped == 1) + return (VOP_RWLOCK(REALVP(vp), write_lock, ct)); + + if (write_lock) { + rw_enter(&hp->hln_rwlock, RW_WRITER); + } else { + rw_enter(&hp->hln_rwlock, RW_READER); + } + return (write_lock); +} + +static void +hyprlofs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + hlnode_t *hp = VTOHLN(vp); + + if (hp->hln_looped == 1) { + VOP_RWUNLOCK(REALVP(vp), write_lock, ct); + return; + } + + rw_exit(&hp->hln_rwlock); +} + +static int +hyprlofs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) +{ + int error; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_PATHCONF(REALVP(vp), cmd, valp, cr, ct)); + + switch (cmd) { + case _PC_XATTR_ENABLED: + case _PC_XATTR_EXISTS: + case _PC_SATTR_ENABLED: + case _PC_SATTR_EXISTS: + error = EINVAL; + break; + case _PC_TIMESTAMP_RESOLUTION: + /* nanosecond timestamp resolution */ + *valp = 1L; + error = 0; + break; + default: + error = fs_pathconf(vp, cmd, valp, cr, ct); + } + return (error); +} + + +struct vnodeops *hyprlofs_vnodeops; + +const fs_operation_def_t hyprlofs_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = hyprlofs_open }, + VOPNAME_CLOSE, { .vop_close = hyprlofs_close }, + VOPNAME_READ, { .vop_read = hyprlofs_read }, + VOPNAME_WRITE, { .vop_write = hyprlofs_write }, + VOPNAME_IOCTL, { .vop_ioctl = hyprlofs_ioctl }, + VOPNAME_GETATTR, { .vop_getattr = hyprlofs_getattr }, + VOPNAME_SETATTR, { .vop_setattr = hyprlofs_setattr }, + VOPNAME_ACCESS, { .vop_access = hyprlofs_access }, + VOPNAME_LOOKUP, { .vop_lookup = hyprlofs_lookup }, + VOPNAME_CREATE, { .error = fs_error }, + VOPNAME_REMOVE, { .vop_remove = hyprlofs_remove }, + VOPNAME_LINK, { .error = fs_error }, + VOPNAME_RENAME, { .error = fs_error }, + VOPNAME_MKDIR, { .error = fs_error }, + VOPNAME_RMDIR, { 
.vop_rmdir = hyprlofs_rmdir }, + VOPNAME_READDIR, { .vop_readdir = hyprlofs_readdir }, + VOPNAME_SYMLINK, { .error = fs_error }, + VOPNAME_READLINK, { .error = fs_error }, + VOPNAME_FSYNC, { .vop_fsync = hyprlofs_fsync }, + VOPNAME_INACTIVE, { .vop_inactive = hyprlofs_inactive }, + VOPNAME_FID, { .vop_fid = hyprlofs_fid }, + VOPNAME_RWLOCK, { .vop_rwlock = hyprlofs_rwlock }, + VOPNAME_RWUNLOCK, { .vop_rwunlock = hyprlofs_rwunlock }, + VOPNAME_SEEK, { .vop_seek = hyprlofs_seek }, + VOPNAME_SPACE, { .vop_space = hyprlofs_space }, + VOPNAME_GETPAGE, { .vop_getpage = hyprlofs_getpage }, + VOPNAME_PUTPAGE, { .vop_putpage = hyprlofs_putpage }, + VOPNAME_MAP, { .vop_map = hyprlofs_map }, + VOPNAME_ADDMAP, { .vop_addmap = hyprlofs_addmap }, + VOPNAME_DELMAP, { .vop_delmap = hyprlofs_delmap }, + VOPNAME_PATHCONF, { .vop_pathconf = hyprlofs_pathconf }, + VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, + NULL, NULL +}; diff --git a/usr/src/uts/common/fs/lookup.c b/usr/src/uts/common/fs/lookup.c index 2e48a21150..71e2aeb48b 100644 --- a/usr/src/uts/common/fs/lookup.c +++ b/usr/src/uts/common/fs/lookup.c @@ -58,6 +58,7 @@ #include <sys/zone.h> #include <sys/dnlc.h> #include <sys/fs/snode.h> +#include <sys/brand.h> /* Controls whether paths are stored with vnodes. */ int vfs_vnode_path = 1; @@ -244,6 +245,9 @@ lookuppnvp( pp = &presrvd; } + if (flags & __FLXNOAUTO) + lookup_flags |= __FLXNOAUTO; + if (auditing) audit_anchorpath(pnp, vp == rootvp); diff --git a/usr/src/uts/common/fs/lxproc/lxpr_subr.c b/usr/src/uts/common/fs/lxproc/lxpr_subr.c new file mode 100644 index 0000000000..e19281fc15 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_subr.c @@ -0,0 +1,524 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/varargs.h> +#include <sys/cpuvar.h> +#include <sys/mman.h> +#include <sys/vmsystm.h> +#include <sys/prsystm.h> + +#include "lxproc.h" + +#define LXPRCACHE_NAME "lxpr_cache" + +static int lxpr_node_constructor(void *, void *, int); +static void lxpr_node_destructor(void *, void *); + +static kmem_cache_t *lxpr_node_cache; + +struct lxpr_uiobuf { + uio_t *uiop; + char *buffer; + uint32_t buffsize; + char *pos; + size_t beg; + int error; +}; + +int lxpr_bufsize = 4000; + +struct lxpr_uiobuf * +lxpr_uiobuf_new(uio_t *uiop) +{ + /* Allocate memory for both lxpr_uiobuf and output buffer */ + int bufsize = lxpr_bufsize; + struct lxpr_uiobuf *uiobuf = + kmem_alloc(sizeof (struct lxpr_uiobuf) + bufsize, KM_SLEEP); + + uiobuf->uiop = uiop; + uiobuf->buffer = (char *)&uiobuf[1]; + uiobuf->buffsize = bufsize; + uiobuf->pos = uiobuf->buffer; + uiobuf->beg = 0; + uiobuf->error = 0; + + return (uiobuf); +} + +void +lxpr_uiobuf_free(struct lxpr_uiobuf *uiobuf) +{ + ASSERT(uiobuf != NULL); + ASSERT(uiobuf->pos == uiobuf->buffer); + + kmem_free(uiobuf, sizeof (struct lxpr_uiobuf) + uiobuf->buffsize); +} + +void +lxpr_uiobuf_seek(struct lxpr_uiobuf *uiobuf, offset_t offset) +{ + uiobuf->uiop->uio_offset = 
(off_t)offset; +} + +void +lxpr_uiobuf_seterr(struct lxpr_uiobuf *uiobuf, int err) +{ + ASSERT(uiobuf->error == 0); + + uiobuf->error = err; +} + +int +lxpr_uiobuf_flush(struct lxpr_uiobuf *uiobuf) +{ + off_t off = uiobuf->uiop->uio_offset; + caddr_t uaddr = uiobuf->buffer; + size_t beg = uiobuf->beg; + size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr; + + if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + ASSERT(off >= beg); + + if (beg + size > off && off >= 0) + uiobuf->error = + uiomove(uaddr + (off - beg), size - (off - beg), + UIO_READ, uiobuf->uiop); + + uiobuf->beg += size; + } + + uiobuf->pos = uaddr; + + return (uiobuf->error); +} + +void +lxpr_uiobuf_write(struct lxpr_uiobuf *uiobuf, const char *buf, size_t size) +{ + /* While we can still carry on */ + while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + uintptr_t remain = (uintptr_t)uiobuf->buffsize - + ((uintptr_t)uiobuf->pos - (uintptr_t)uiobuf->buffer); + + /* Enough space in buffer? */ + if (remain >= size) { + bcopy(buf, uiobuf->pos, size); + uiobuf->pos += size; + return; + } + + /* Not enough space, so copy all we can and try again */ + bcopy(buf, uiobuf->pos, remain); + uiobuf->pos += remain; + (void) lxpr_uiobuf_flush(uiobuf); + buf += remain; + size -= remain; + } +} + +#define TYPBUFFSIZE 256 + +void +lxpr_uiobuf_printf(struct lxpr_uiobuf *uiobuf, const char *fmt, ...) 
+{ + va_list args; + char buff[TYPBUFFSIZE]; + int len; + char *buffer; + + /* Can we still do any output */ + if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0) + return; + + va_start(args, fmt); + + /* Try using stack allocated buffer */ + len = vsnprintf(buff, TYPBUFFSIZE, fmt, args); + if (len < TYPBUFFSIZE) { + va_end(args); + lxpr_uiobuf_write(uiobuf, buff, len); + return; + } + + /* Not enough space in pre-allocated buffer */ + buffer = kmem_alloc(len + 1, KM_SLEEP); + + /* + * We know we allocated the correct amount of space + * so no check on the return value + */ + (void) vsnprintf(buffer, len+1, fmt, args); + lxpr_uiobuf_write(uiobuf, buffer, len); + va_end(args); + kmem_free(buffer, len+1); +} + +/* + * lxpr_lock(): + * + * Lookup process from pid and return with p_plock and P_PR_LOCK held. + */ +proc_t * +lxpr_lock(pid_t pid) +{ + proc_t *p; + kmutex_t *mp; + + ASSERT(!MUTEX_HELD(&pidlock)); + + for (;;) { + mutex_enter(&pidlock); + + /* + * If the pid is 1, we really want the zone's init process + */ + p = prfind((pid == 1) ? + curproc->p_zone->zone_proc_initpid : pid); + + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (NULL); + } + + /* + * p_lock is persistent, but p itself is not -- it could + * vanish during cv_wait(). Load p->p_lock now so we can + * drop it after cv_wait() without referencing p. + */ + mp = &p->p_lock; + mutex_enter(mp); + + mutex_exit(&pidlock); + + if (p->p_flag & SEXITING) { + /* + * This process is exiting -- let it go. 
+ */ + mutex_exit(mp); + return (NULL); + } + + if (!(p->p_proc_flag & P_PR_LOCK)) + break; + + cv_wait(&pr_pid_cv[p->p_slot], mp); + mutex_exit(mp); + } + + p->p_proc_flag |= P_PR_LOCK; + return (p); +} + +/* + * lxpr_unlock() + * + * Unlock locked process + */ +void +lxpr_unlock(proc_t *p) +{ + ASSERT(p->p_proc_flag & P_PR_LOCK); + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(!MUTEX_HELD(&pidlock)); + + cv_signal(&pr_pid_cv[p->p_slot]); + p->p_proc_flag &= ~P_PR_LOCK; + mutex_exit(&p->p_lock); +} + +void +lxpr_initnodecache() +{ + lxpr_node_cache = kmem_cache_create(LXPRCACHE_NAME, + sizeof (lxpr_node_t), 0, + lxpr_node_constructor, lxpr_node_destructor, NULL, NULL, NULL, 0); +} + +void +lxpr_fininodecache() +{ + kmem_cache_destroy(lxpr_node_cache); +} + +/* ARGSUSED */ +static int +lxpr_node_constructor(void *buf, void *un, int kmflags) +{ + lxpr_node_t *lxpnp = buf; + vnode_t *vp; + + vp = lxpnp->lxpr_vnode = vn_alloc(kmflags); + if (vp == NULL) + return (-1); + + (void) vn_setops(vp, lxpr_vnodeops); + vp->v_data = lxpnp; + + return (0); +} + +/* ARGSUSED */ +static void +lxpr_node_destructor(void *buf, void *un) +{ + lxpr_node_t *lxpnp = buf; + + vn_free(LXPTOV(lxpnp)); +} + +/* + * Calculate an inode number + * + * This takes various bits of info and munges them + * to give the inode number for an lxproc node + */ +ino_t +lxpr_inode(lxpr_nodetype_t type, pid_t pid, int fd) +{ + if (pid == 1) + pid = curproc->p_zone->zone_proc_initpid; + + switch (type) { + case LXPR_PIDDIR: + return (pid + 1); + case LXPR_PROCDIR: + return (maxpid + 2); + case LXPR_PID_FD_FD: + return (maxpid + 2 + + (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + LXPR_NFILES + fd); + default: + return (maxpid + 2 + + (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + type); + } +} + +/* + * Return inode number of parent (directory) + */ +ino_t +lxpr_parentinode(lxpr_node_t *lxpnp) +{ + /* + * If the input node is the root then the parent inode + * is the mounted on inode so just return our inode number + 
*/ + if (lxpnp->lxpr_type != LXPR_PROCDIR) + return (VTOLXP(lxpnp->lxpr_parent)->lxpr_ino); + else + return (lxpnp->lxpr_ino); +} + +/* + * Allocate a new lxproc node + * + * This also allocates the vnode associated with it + */ +lxpr_node_t * +lxpr_getnode(vnode_t *dp, lxpr_nodetype_t type, proc_t *p, int fd) +{ + lxpr_node_t *lxpnp; + vnode_t *vp; + user_t *up; + timestruc_t now; + + /* + * Allocate a new node. It is deallocated in vop_innactive + */ + lxpnp = kmem_cache_alloc(lxpr_node_cache, KM_SLEEP); + + /* + * Set defaults (may be overridden below) + */ + gethrestime(&now); + lxpnp->lxpr_type = type; + lxpnp->lxpr_realvp = NULL; + lxpnp->lxpr_parent = dp; + VN_HOLD(dp); + if (p != NULL) { + lxpnp->lxpr_pid = ((p->p_pid == + curproc->p_zone->zone_proc_initpid) ? 1 : p->p_pid); + + lxpnp->lxpr_time = PTOU(p)->u_start; + lxpnp->lxpr_uid = crgetruid(p->p_cred); + lxpnp->lxpr_gid = crgetrgid(p->p_cred); + lxpnp->lxpr_ino = lxpr_inode(type, p->p_pid, fd); + } else { + /* Pretend files without a proc belong to sched */ + lxpnp->lxpr_pid = 0; + lxpnp->lxpr_time = now; + lxpnp->lxpr_uid = lxpnp->lxpr_gid = 0; + lxpnp->lxpr_ino = lxpr_inode(type, 0, 0); + } + + /* initialize the vnode data */ + vp = lxpnp->lxpr_vnode; + vn_reinit(vp); + vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT; + vp->v_vfsp = dp->v_vfsp; + + /* + * Do node specific stuff + */ + switch (type) { + case LXPR_PROCDIR: + vp->v_flag |= VROOT; + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by everyone */ + break; + + case LXPR_PID_CURDIR: + ASSERT(p != NULL); + + /* + * Zombie check. p_stat is officially protected by pidlock, + * but we can't grab pidlock here because we already hold + * p_lock. Luckily if we look at the process exit code + * we see that p_stat only transisions from SRUN to SZOMB + * while p_lock is held. 
Aside from this, the only other + * p_stat transition that we need to be aware about is + * SIDL to SRUN, but that's not a problem since lxpr_lock() + * ignores nodes in the SIDL state so we'll never get a node + * that isn't already in the SRUN state. + */ + if (p->p_stat == SZOMB || (p->p_flag & SEXITING) != 0) { + lxpnp->lxpr_realvp = NULL; + } else { + ASSERT(MUTEX_HELD(&p->p_lock)); + up = PTOU(p); + lxpnp->lxpr_realvp = up->u_cdir; + ASSERT(lxpnp->lxpr_realvp != NULL); + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_ROOTDIR: + ASSERT(p != NULL); + /* Zombie check. see locking comment above */ + if (p->p_stat == SZOMB || (p->p_flag & SEXITING) != 0) { + lxpnp->lxpr_realvp = NULL; + } else { + ASSERT(MUTEX_HELD(&p->p_lock)); + up = PTOU(p); + lxpnp->lxpr_realvp = + up->u_rdir != NULL ? up->u_rdir : rootdir; + ASSERT(lxpnp->lxpr_realvp != NULL); + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_EXE: + ASSERT(p != NULL); + lxpnp->lxpr_realvp = p->p_exec; + if (lxpnp->lxpr_realvp != NULL) { + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; + break; + + case LXPR_SELF: + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! 
*/ + break; + + case LXPR_PID_FD_FD: + ASSERT(p != NULL); + /* lxpr_realvp is set after we return */ + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0700; /* read-write-exe owner only */ + break; + + case LXPR_PID_FDDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0500; /* read-search by owner only */ + break; + + case LXPR_PIDDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0511; + break; + + case LXPR_NETDIR: + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by all */ + break; + + case LXPR_PID_ENV: + case LXPR_PID_MEM: + ASSERT(p != NULL); + /*FALLTHRU*/ + case LXPR_KCORE: + vp->v_type = VREG; + lxpnp->lxpr_mode = 0400; /* read-only by owner only */ + break; + + default: + vp->v_type = VREG; + lxpnp->lxpr_mode = 0444; /* read-only by all */ + break; + } + + return (lxpnp); +} + + +/* + * Free the storage obtained from lxpr_getnode(). + */ +void +lxpr_freenode(lxpr_node_t *lxpnp) +{ + ASSERT(lxpnp != NULL); + ASSERT(LXPTOV(lxpnp) != NULL); + + /* + * delete any association with realvp + */ + if (lxpnp->lxpr_realvp != NULL) + VN_RELE(lxpnp->lxpr_realvp); + + /* + * delete any association with parent vp + */ + if (lxpnp->lxpr_parent != NULL) + VN_RELE(lxpnp->lxpr_parent); + + /* + * Release the lxprnode. + */ + kmem_cache_free(lxpr_node_cache, lxpnp); +} diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c new file mode 100644 index 0000000000..1bb7bd3823 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c @@ -0,0 +1,367 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/stat.h> +#include <sys/statvfs.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/mode.h> +#include <sys/signal.h> +#include <sys/user.h> +#include <sys/mount.h> +#include <sys/bitmap.h> +#include <sys/kmem.h> +#include <sys/policy.h> +#include <sys/modctl.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> + +#include "lxproc.h" + +/* Module level parameters */ +static int lxprocfstype; +static dev_t lxprocdev; +static kmutex_t lxpr_mount_lock; + +int nproc_highbit; /* highbit(v.v_nproc) */ + +static int lxpr_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *); +static int lxpr_unmount(vfs_t *, int, cred_t *); +static int lxpr_root(vfs_t *, vnode_t **); +static int lxpr_statvfs(vfs_t *, statvfs64_t *); +static int lxpr_init(int, char *); + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "lxproc", + lxpr_init, + VSW_ZMOUNT, + NULL +}; + +/* + * Module linkage information for the kernel. 
+ */ +extern struct mod_ops mod_fsops; + +static struct modlfs modlfs = { + &mod_fsops, "generic linux procfs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlfs, NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int retval; + + /* + * attempt to unload the module + */ + if ((retval = mod_remove(&modlinkage)) != 0) + goto done; + + /* + * destroy lxpr_node cache + */ + lxpr_fininodecache(); + + /* + * clean out the vfsops and vnodeops + */ + (void) vfs_freevfsops_by_type(lxprocfstype); + vn_freevnodeops(lxpr_vnodeops); + + mutex_destroy(&lxpr_mount_lock); +done: + return (retval); +} + +static int +lxpr_init(int fstype, char *name) +{ + static const fs_operation_def_t lxpr_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = lxpr_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = lxpr_unmount }, + VFSNAME_ROOT, { .vfs_root = lxpr_root }, + VFSNAME_STATVFS, { .vfs_statvfs = lxpr_statvfs }, + NULL, NULL + }; + extern const fs_operation_def_t lxpr_vnodeops_template[]; + int error; + major_t dev; + + nproc_highbit = highbit(v.v_proc); + lxprocfstype = fstype; + ASSERT(lxprocfstype != 0); + + mutex_init(&lxpr_mount_lock, NULL, MUTEX_DEFAULT, NULL); + + /* + * Associate VFS ops vector with this fstype. + */ + error = vfs_setfsops(fstype, lxpr_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "lxpr_init: bad vfs ops template"); + return (error); + } + + /* + * Set up vnode ops vector too. + */ + error = vn_make_ops(name, lxpr_vnodeops_template, &lxpr_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "lxpr_init: bad vnode ops template"); + return (error); + } + + /* + * Assign a unique "device" number (used by stat(2)). 
+ */ + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "lxpr_init: can't get unique device number"); + dev = 0; + } + + /* + * Make the pseudo device + */ + lxprocdev = makedevice(dev, 0); + + /* + * Initialize cache for lxpr_nodes + */ + lxpr_initnodecache(); + + return (0); +} + +static int +lxpr_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt; + zone_t *zone = curproc->p_zone; + ldi_ident_t li; + int err; + + /* + * must be root to mount + */ + if (secpolicy_fs_mount(cr, mvp, vfsp) != 0) + return (EPERM); + + /* + * mount point must be a directory + */ + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (zone == global_zone) { + zone_t *mntzone; + + mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); + zone_rele(mntzone); + if (zone != mntzone) + return (EBUSY); + } + + /* + * Having the resource be anything but "lxproc" doesn't make sense + */ + vfs_setresource(vfsp, "lxproc", 0); + + lxpr_mnt = kmem_alloc(sizeof (*lxpr_mnt), KM_SLEEP); + + if ((err = ldi_ident_from_mod(&modlinkage, &li)) != 0) { + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + return (err); + } + + lxpr_mnt->lxprm_li = li; + + mutex_enter(&lxpr_mount_lock); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + mutex_exit(&lxpr_mount_lock); + kmem_free(lxpr_mnt, sizeof ((*lxpr_mnt))); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* + * allocate the first vnode + */ + zone_hold(lxpr_mnt->lxprm_zone = zone); + + /* Arbitrarily set the parent vnode to the mounted over directory */ + lxpr_mnt->lxprm_node = lxpr_getnode(mvp, LXPR_PROCDIR, NULL, 0); + + /* Correctly set the fs for the root node */ + lxpr_mnt->lxprm_node->lxpr_vnode->v_vfsp = vfsp; + + vfs_make_fsid(&vfsp->vfs_fsid, lxprocdev, lxprocfstype); + vfsp->vfs_bsize = DEV_BSIZE; + vfsp->vfs_fstype = lxprocfstype; + vfsp->vfs_data = 
(caddr_t)lxpr_mnt; + vfsp->vfs_dev = lxprocdev; + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt = (lxpr_mnt_t *)vfsp->vfs_data; + vnode_t *vp; + int count; + + ASSERT(lxpr_mnt != NULL); + vp = LXPTOV(lxpr_mnt->lxprm_node); + + mutex_enter(&lxpr_mount_lock); + + /* + * must be root to unmount + */ + if (secpolicy_fs_unmount(cr, vfsp) != 0) { + mutex_exit(&lxpr_mount_lock); + return (EPERM); + } + + /* + * forced unmount is not supported by this file system + */ + if (flag & MS_FORCE) { + mutex_exit(&lxpr_mount_lock); + return (ENOTSUP); + } + + /* + * Ensure that no vnodes are in use on this mount point. + */ + mutex_enter(&vp->v_lock); + count = vp->v_count; + mutex_exit(&vp->v_lock); + if (count > 1) { + mutex_exit(&lxpr_mount_lock); + return (EBUSY); + } + + /* + * purge the dnlc cache for vnode entries + * associated with this file system + */ + count = dnlc_purge_vfsp(vfsp, 0); + + /* + * free up the lxprnode + */ + lxpr_freenode(lxpr_mnt->lxprm_node); + zone_rele(lxpr_mnt->lxprm_zone); + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_root(vfs_t *vfsp, vnode_t **vpp) +{ + lxpr_node_t *lxpnp = ((lxpr_mnt_t *)vfsp->vfs_data)->lxprm_node; + vnode_t *vp = LXPTOV(lxpnp); + + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +lxpr_statvfs(vfs_t *vfsp, statvfs64_t *sp) +{ + int n; + dev32_t d32; + extern uint_t nproc; + + n = v.v_proc - nproc; + + bzero((caddr_t)sp, sizeof (*sp)); + sp->f_bsize = DEV_BSIZE; + sp->f_frsize = DEV_BSIZE; + sp->f_blocks = (fsblkcnt64_t)0; + sp->f_bfree = (fsblkcnt64_t)0; + sp->f_bavail = (fsblkcnt64_t)0; + sp->f_files = (fsfilcnt64_t)v.v_proc + 2; + sp->f_ffree = (fsfilcnt64_t)n; + sp->f_favail = (fsfilcnt64_t)n; + (void) cmpldev(&d32, vfsp->vfs_dev); + sp->f_fsid = d32; + /* It is guaranteed that vsw_name will fit in f_basetype */ + (void) strcpy(sp->f_basetype, 
vfssw[lxprocfstype].vsw_name); + sp->f_flag = vf_to_stf(vfsp->vfs_flag); + sp->f_namemax = 64; /* quite arbitrary */ + + (void) strcpy(sp->f_fstr, "lxproc"); + + return (0); +} diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vnops.c b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c new file mode 100644 index 0000000000..60b3d52f09 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c @@ -0,0 +1,3105 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +/* + * lxproc -- a loosely Linux-compatible /proc + * + * We have -- confusingly -- two implementations of Linux /proc. One is to + * support the LX brand with a Linux /proc entirely compatible with the Linux + * world view; the other -- this one -- is to support native (but Linux-borne) + * programs that wish to view the native system via the Linux /proc model. So + * the aspiration here is to provide something that sufficiently approximates + * the Linux /proc implementation for purposes of offering some compatibility + * for simple Linux /proc readers (e.g., ps/top/htop). 
 * However, it is not
 * intended to exactly mimic Linux semantics; when choosing between offering
 * compatibility and telling the truth, we emphatically pick the truth. A
 * particular glaring example of this is the Linux notion of "tasks" (that is,
 * threads), which -- due to historical misadventures on Linux -- allocate their
 * identifiers from the process identifier space. (That is, each thread has in
 * effect a pid.) Some Linux /proc readers have come to depend on this
 * attribute, and become confused when threads appear with proper identifiers,
 * so we simply opt for the pre-2.6 behavior, and do not present the tasks
 * directory at all. Similarly, when choosing between offering compatibility
 * and remaining consistent with our broader security model, we (obviously)
 * choose security over compatibility. In short, this is meant to be a best
 * effort -- no more -- and as such, it should not be unified with the much
 * more complete Linux /proc implementation found in the LX brand.
 */

#include <sys/cpupart.h>
#include <sys/cpuvar.h>
#include <sys/session.h>
#include <sys/vmparam.h>
#include <sys/mman.h>
#include <vm/rm.h>
#include <vm/seg_vn.h>
#include <sys/sdt.h>
#include <sys/strlog.h>
#include <sys/stropts.h>
#include <sys/cmn_err.h>
#include <sys/x86_archext.h>
#include <sys/archsystm.h>
#include <sys/fp.h>
#include <sys/pool_pset.h>
#include <sys/pset.h>
#include <sys/zone.h>
#include <sys/pghw.h>
#include <sys/vfs_opreg.h>

/* Dependent on procfs */
extern kthread_t *prchoose(proc_t *);

#include "lxproc.h"

extern pgcnt_t swapfs_minfree;
extern time_t boot_time;

/*
 * Pointer to the vnode ops vector for this fs.
 * This is instantiated in lxprinit() in lxpr_vfsops.c
 */
vnodeops_t *lxpr_vnodeops;

static int lxpr_open(vnode_t **, int, cred_t *, caller_context_t *);
static int lxpr_close(vnode_t *, int, int, offset_t, cred_t *,
    caller_context_t *);
static int lxpr_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *);
static int lxpr_getattr(vnode_t *, vattr_t *, int, cred_t *,
    caller_context_t *);
static int lxpr_access(vnode_t *, int, int, cred_t *, caller_context_t *);
static int lxpr_lookup(vnode_t *, char *, vnode_t **,
    pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *,
    pathname_t *);
static int lxpr_readdir(vnode_t *, uio_t *, cred_t *, int *,
    caller_context_t *, int);
static int lxpr_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *);
static int lxpr_cmp(vnode_t *, vnode_t *, caller_context_t *);
static int lxpr_realvp(vnode_t *, vnode_t **, caller_context_t *);
static int lxpr_sync(void);
static void lxpr_inactive(vnode_t *, cred_t *, caller_context_t *);

static vnode_t *lxpr_lookup_procdir(vnode_t *, char *);
static vnode_t *lxpr_lookup_piddir(vnode_t *, char *);
static vnode_t *lxpr_lookup_not_a_dir(vnode_t *, char *);
static vnode_t *lxpr_lookup_fddir(vnode_t *, char *);
static vnode_t *lxpr_lookup_netdir(vnode_t *, char *);

static int lxpr_readdir_procdir(lxpr_node_t *, uio_t *, int *);
static int lxpr_readdir_piddir(lxpr_node_t *, uio_t *, int *);
static int lxpr_readdir_not_a_dir(lxpr_node_t *, uio_t *, int *);
static int lxpr_readdir_fddir(lxpr_node_t *, uio_t *, int *);
static int lxpr_readdir_netdir(lxpr_node_t *, uio_t *, int *);

static void lxpr_read_invalid(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_empty(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_cpuinfo(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_isdir(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_fd(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_kmsg(lxpr_node_t *, lxpr_uiobuf_t *, ldi_handle_t);
static void lxpr_read_loadavg(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_meminfo(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_mounts(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_partitions(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_stat(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_uptime(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_version(lxpr_node_t *, lxpr_uiobuf_t *);

static void lxpr_read_pid_cmdline(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_pid_maps(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_pid_stat(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_pid_statm(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_pid_status(lxpr_node_t *, lxpr_uiobuf_t *);

static void lxpr_read_net_arp(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_net_dev(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_net_dev_mcast(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_net_igmp(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_net_ip_mr_cache(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_net_ip_mr_vif(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_net_mcfilter(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_net_netstat(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_net_raw(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_net_route(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_net_rpc(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_net_rt_cache(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_net_sockstat(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_net_snmp(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_net_stat(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_net_tcp(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_net_udp(lxpr_node_t *, lxpr_uiobuf_t *);
static void lxpr_read_net_unix(lxpr_node_t *, lxpr_uiobuf_t *);

/*
 * Simple conversion
 */
#define	btok(x)	((x) >> 10)			/* bytes to kbytes */
#define	ptok(x)	((x) << (PAGESHIFT - 10))	/* pages to kbytes */

/*
 * The lxproc vnode operations vector
 */
const fs_operation_def_t lxpr_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = lxpr_open },
	VOPNAME_CLOSE,		{ .vop_close = lxpr_close },
	VOPNAME_READ,		{ .vop_read = lxpr_read },
	VOPNAME_GETATTR,	{ .vop_getattr = lxpr_getattr },
	VOPNAME_ACCESS,		{ .vop_access = lxpr_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = lxpr_lookup },
	VOPNAME_READDIR,	{ .vop_readdir = lxpr_readdir },
	VOPNAME_READLINK,	{ .vop_readlink = lxpr_readlink },
	VOPNAME_FSYNC,		{ .error = lxpr_sync },
	VOPNAME_SEEK,		{ .error = lxpr_sync },
	VOPNAME_INACTIVE,	{ .vop_inactive = lxpr_inactive },
	VOPNAME_CMP,		{ .vop_cmp = lxpr_cmp },
	VOPNAME_REALVP,		{ .vop_realvp = lxpr_realvp },
	NULL,			NULL
};

/*
 * file contents of an lxproc directory.
 */
static lxpr_dirent_t lxpr_dir[] = {
	{ LXPR_CMDLINE,		"cmdline" },
	{ LXPR_CPUINFO,		"cpuinfo" },
	{ LXPR_DEVICES,		"devices" },
	{ LXPR_DMA,		"dma" },
	{ LXPR_FILESYSTEMS,	"filesystems" },
	{ LXPR_INTERRUPTS,	"interrupts" },
	{ LXPR_IOPORTS,		"ioports" },
	{ LXPR_KCORE,		"kcore" },
	{ LXPR_KMSG,		"kmsg" },
	{ LXPR_LOADAVG,		"loadavg" },
	{ LXPR_MEMINFO,		"meminfo" },
	{ LXPR_MOUNTS,		"mounts" },
	{ LXPR_NETDIR,		"net" },
	{ LXPR_PARTITIONS,	"partitions" },
	{ LXPR_SELF,		"self" },
	{ LXPR_STAT,		"stat" },
	{ LXPR_UPTIME,		"uptime" },
	{ LXPR_VERSION,		"version" }
};

#define	PROCDIRFILES	(sizeof (lxpr_dir) / sizeof (lxpr_dir[0]))

/*
 * Contents of an /lxproc/<pid> directory.
 */
static lxpr_dirent_t piddir[] = {
	{ LXPR_PID_CMDLINE,	"cmdline" },
	{ LXPR_PID_CPU,		"cpu" },
	{ LXPR_PID_CURDIR,	"cwd" },
	{ LXPR_PID_ENV,		"environ" },
	{ LXPR_PID_EXE,		"exe" },
	{ LXPR_PID_MAPS,	"maps" },
	{ LXPR_PID_MEM,		"mem" },
	{ LXPR_PID_ROOTDIR,	"root" },
	{ LXPR_PID_STAT,	"stat" },
	{ LXPR_PID_STATM,	"statm" },
	{ LXPR_PID_STATUS,	"status" },
	{ LXPR_PID_FDDIR,	"fd" }
};

#define	PIDDIRFILES	(sizeof (piddir) / sizeof (piddir[0]))

/*
 * contents of /lxproc/net directory
 */
static lxpr_dirent_t netdir[] = {
	{ LXPR_NET_ARP,		"arp" },
	{ LXPR_NET_DEV,		"dev" },
	{ LXPR_NET_DEV_MCAST,	"dev_mcast" },
	{ LXPR_NET_IGMP,	"igmp" },
	{ LXPR_NET_IP_MR_CACHE,	"ip_mr_cache" },
	{ LXPR_NET_IP_MR_VIF,	"ip_mr_vif" },
	{ LXPR_NET_MCFILTER,	"mcfilter" },
	{ LXPR_NET_NETSTAT,	"netstat" },
	{ LXPR_NET_RAW,		"raw" },
	{ LXPR_NET_ROUTE,	"route" },
	{ LXPR_NET_RPC,		"rpc" },
	{ LXPR_NET_RT_CACHE,	"rt_cache" },
	{ LXPR_NET_SOCKSTAT,	"sockstat" },
	{ LXPR_NET_SNMP,	"snmp" },
	{ LXPR_NET_STAT,	"stat" },
	{ LXPR_NET_TCP,		"tcp" },
	{ LXPR_NET_UDP,		"udp" },
	{ LXPR_NET_UNIX,	"unix" }
};

#define	NETDIRFILES	(sizeof (netdir) / sizeof (netdir[0]))

/*
 * These are the major signal number differences between Linux and native:
 *
 *	====================================
 *	| Number | Linux     | Native     |
 *	| ====== | ========= | ========== |
 *	|    7   | SIGBUS    | SIGEMT     |
 *	|   10   | SIGUSR1   | SIGBUS     |
 *	|   12   | SIGUSR2   | SIGSYS     |
 *	|   16   | SIGSTKFLT | SIGUSR1    |
 *	|   17   | SIGCHLD   | SIGUSR2    |
 *	|   18   | SIGCONT   | SIGCHLD    |
 *	|   19   | SIGSTOP   | SIGPWR     |
 *	|   20   | SIGTSTP   | SIGWINCH   |
 *	|   21   | SIGTTIN   | SIGURG     |
 *	|   22   | SIGTTOU   | SIGPOLL    |
 *	|   23   | SIGURG    | SIGSTOP    |
 *	|   24   | SIGXCPU   | SIGTSTP    |
 *	|   25   | SIGXFSZ   | SIGCONT    |
 *	|   26   | SIGVTALRM | SIGTTIN    |
 *	|   27   | SIGPROF   | SIGTTOU    |
 *	|   28   | SIGWINCH  | SIGVTALRM  |
 *	|   29   | SIGPOLL   | SIGPROF    |
 *	|   30   | SIGPWR    | SIGXCPU    |
 *	|   31   | SIGSYS    | SIGXFSZ    |
 *	====================================
 *
 * Not every Linux signal maps to a native signal, nor does every native
 * signal map to a Linux counterpart. However, when signals do map, the
 * mapping is unique.
 */
static int
lxpr_sigmap[NSIG] = {
	0,
	LX_SIGHUP,
	LX_SIGINT,
	LX_SIGQUIT,
	LX_SIGILL,
	LX_SIGTRAP,
	LX_SIGABRT,
	LX_SIGSTKFLT,
	LX_SIGFPE,
	LX_SIGKILL,
	LX_SIGBUS,
	LX_SIGSEGV,
	LX_SIGSYS,
	LX_SIGPIPE,
	LX_SIGALRM,
	LX_SIGTERM,
	LX_SIGUSR1,
	LX_SIGUSR2,
	LX_SIGCHLD,
	LX_SIGPWR,
	LX_SIGWINCH,
	LX_SIGURG,
	LX_SIGPOLL,
	LX_SIGSTOP,
	LX_SIGTSTP,
	LX_SIGCONT,
	LX_SIGTTIN,
	LX_SIGTTOU,
	LX_SIGVTALRM,
	LX_SIGPROF,
	LX_SIGXCPU,
	LX_SIGXFSZ,
	-1,			/* 32: illumos SIGWAITING */
	-1,			/* 33: illumos SIGLWP */
	-1,			/* 34: illumos SIGFREEZE */
	-1,			/* 35: illumos SIGTHAW */
	-1,			/* 36: illumos SIGCANCEL */
	-1,			/* 37: illumos SIGLOST */
	-1,			/* 38: illumos SIGXRES */
	-1,			/* 39: illumos SIGJVM1 */
	-1,			/* 40: illumos SIGJVM2 */
	-1,			/* 41: illumos SIGINFO */
	LX_SIGRTMIN,		/* 42: illumos _SIGRTMIN */
	LX_SIGRTMIN + 1,
	LX_SIGRTMIN + 2,
	LX_SIGRTMIN + 3,
	LX_SIGRTMIN + 4,
	LX_SIGRTMIN + 5,
	LX_SIGRTMIN + 6,
	LX_SIGRTMIN + 7,
	LX_SIGRTMIN + 8,
	LX_SIGRTMIN + 9,
	LX_SIGRTMIN + 10,
	LX_SIGRTMIN + 11,
	LX_SIGRTMIN + 12,
	LX_SIGRTMIN + 13,
	LX_SIGRTMIN + 14,
	LX_SIGRTMIN + 15,
	LX_SIGRTMIN + 16,
	LX_SIGRTMIN + 17,
	LX_SIGRTMIN + 18,
	LX_SIGRTMIN + 19,
	LX_SIGRTMIN + 20,
	LX_SIGRTMIN + 21,
	LX_SIGRTMIN + 22,
	LX_SIGRTMIN + 23,
	LX_SIGRTMIN + 24,
	LX_SIGRTMIN + 25,
	LX_SIGRTMIN + 26,
	LX_SIGRTMIN + 27,
	LX_SIGRTMIN + 28,
	LX_SIGRTMIN + 29,
	LX_SIGRTMIN + 30,
	LX_SIGRTMAX
};

/*
 * lxpr_open(): Vnode operation for VOP_OPEN()
 */
static int
lxpr_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	vnode_t *vp = *vpp;
	lxpr_node_t *lxpnp = VTOLXP(vp);
	lxpr_nodetype_t type = lxpnp->lxpr_type;
	vnode_t *rvp;
	int error = 0;

	/*
	 * We only allow reading in this file system
	 */
	if (flag & FWRITE)
		return (EROFS);

	/*
	 * If we are opening an underlying file only allow regular files
	 * reject the open for anything but a regular file.
	 * Just do it if we are opening the current or root directory.
	 */
	if (lxpnp->lxpr_realvp != NULL) {
		rvp = lxpnp->lxpr_realvp;

		if (type == LXPR_PID_FD_FD && rvp->v_type != VREG)
			error = EACCES;
		else {
			/*
			 * Need to hold rvp since VOP_OPEN() may release it.
			 */
			VN_HOLD(rvp);
			error = VOP_OPEN(&rvp, flag, cr, ct);
			if (error) {
				VN_RELE(rvp);
			} else {
				*vpp = rvp;
				VN_RELE(vp);
			}
		}
	}

	return (error);
}


/*
 * lxpr_close(): Vnode operation for VOP_CLOSE()
 */
/* ARGSUSED */
static int
lxpr_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	lxpr_node_t *lxpr = VTOLXP(vp);
	lxpr_nodetype_t type = lxpr->lxpr_type;

	/*
	 * we should never get here because the close is done on the realvp
	 * for these nodes
	 */
	ASSERT(type != LXPR_PID_FD_FD &&
	    type != LXPR_PID_CURDIR &&
	    type != LXPR_PID_ROOTDIR &&
	    type != LXPR_PID_EXE);

	return (0);
}

/*
 * Array of read functions, indexed by lxproc node type; order must match
 * the lxpr_nodetype_t enumeration in lxproc.h.
 */
static void (*lxpr_read_function[LXPR_NFILES])() = {
	lxpr_read_isdir,		/* /proc		*/
	lxpr_read_isdir,		/* /proc/<pid>		*/
	lxpr_read_pid_cmdline,		/* /proc/<pid>/cmdline	*/
	lxpr_read_empty,		/* /proc/<pid>/cpu	*/
	lxpr_read_invalid,		/* /proc/<pid>/cwd	*/
	lxpr_read_empty,		/* /proc/<pid>/environ	*/
	lxpr_read_invalid,		/* /proc/<pid>/exe	*/
	lxpr_read_pid_maps,		/* /proc/<pid>/maps	*/
	lxpr_read_empty,		/* /proc/<pid>/mem	*/
	lxpr_read_invalid,		/* /proc/<pid>/root	*/
	lxpr_read_pid_stat,		/* /proc/<pid>/stat	*/
	lxpr_read_pid_statm,		/* /proc/<pid>/statm	*/
	lxpr_read_pid_status,		/* /proc/<pid>/status	*/
	lxpr_read_isdir,		/* /proc/<pid>/fd	*/
	lxpr_read_fd,			/* /proc/<pid>/fd/nn	*/
	lxpr_read_empty,		/* /proc/cmdline	*/
	lxpr_read_cpuinfo,		/* /proc/cpuinfo	*/
	lxpr_read_empty,		/* /proc/devices	*/
	lxpr_read_empty,		/* /proc/dma		*/
	lxpr_read_empty,		/* /proc/filesystems	*/
	lxpr_read_empty,		/* /proc/interrupts	*/
	lxpr_read_empty,		/* /proc/ioports	*/
	lxpr_read_empty,		/* /proc/kcore		*/
	lxpr_read_invalid,		/* /proc/kmsg -- see lxpr_read() */
	lxpr_read_loadavg,		/* /proc/loadavg	*/
	lxpr_read_meminfo,		/* /proc/meminfo	*/
	lxpr_read_mounts,		/* /proc/mounts		*/
	lxpr_read_isdir,		/* /proc/net		*/
	lxpr_read_net_arp,		/* /proc/net/arp	*/
	lxpr_read_net_dev,		/* /proc/net/dev	*/
	lxpr_read_net_dev_mcast,	/* /proc/net/dev_mcast	*/
	lxpr_read_net_igmp,		/* /proc/net/igmp	*/
	lxpr_read_net_ip_mr_cache,	/* /proc/net/ip_mr_cache */
	lxpr_read_net_ip_mr_vif,	/* /proc/net/ip_mr_vif	*/
	lxpr_read_net_mcfilter,		/* /proc/net/mcfilter	*/
	lxpr_read_net_netstat,		/* /proc/net/netstat	*/
	lxpr_read_net_raw,		/* /proc/net/raw	*/
	lxpr_read_net_route,		/* /proc/net/route	*/
	lxpr_read_net_rpc,		/* /proc/net/rpc	*/
	lxpr_read_net_rt_cache,		/* /proc/net/rt_cache	*/
	lxpr_read_net_sockstat,		/* /proc/net/sockstat	*/
	lxpr_read_net_snmp,		/* /proc/net/snmp	*/
	lxpr_read_net_stat,		/* /proc/net/stat	*/
	lxpr_read_net_tcp,		/* /proc/net/tcp	*/
	lxpr_read_net_udp,		/* /proc/net/udp	*/
	lxpr_read_net_unix,		/* /proc/net/unix	*/
	lxpr_read_partitions,		/* /proc/partitions	*/
	lxpr_read_invalid,		/* /proc/self		*/
	lxpr_read_stat,			/* /proc/stat		*/
	lxpr_read_uptime,		/* /proc/uptime		*/
	lxpr_read_version,		/* /proc/version	*/
};

/*
 * Array of lookup functions, indexed by /lxproc file type.
 */
static vnode_t *(*lxpr_lookup_function[LXPR_NFILES])() = {
	lxpr_lookup_procdir,		/* /proc		*/
	lxpr_lookup_piddir,		/* /proc/<pid>		*/
	lxpr_lookup_not_a_dir,		/* /proc/<pid>/cmdline	*/
	lxpr_lookup_not_a_dir,		/* /proc/<pid>/cpu	*/
	lxpr_lookup_not_a_dir,		/* /proc/<pid>/cwd	*/
	lxpr_lookup_not_a_dir,		/* /proc/<pid>/environ	*/
	lxpr_lookup_not_a_dir,		/* /proc/<pid>/exe	*/
	lxpr_lookup_not_a_dir,		/* /proc/<pid>/maps	*/
	lxpr_lookup_not_a_dir,		/* /proc/<pid>/mem	*/
	lxpr_lookup_not_a_dir,		/* /proc/<pid>/root	*/
	lxpr_lookup_not_a_dir,		/* /proc/<pid>/stat	*/
	lxpr_lookup_not_a_dir,		/* /proc/<pid>/statm	*/
	lxpr_lookup_not_a_dir,		/* /proc/<pid>/status	*/
	lxpr_lookup_fddir,		/* /proc/<pid>/fd	*/
	lxpr_lookup_not_a_dir,		/* /proc/<pid>/fd/nn	*/
	lxpr_lookup_not_a_dir,		/* /proc/cmdline	*/
	lxpr_lookup_not_a_dir,		/* /proc/cpuinfo	*/
	lxpr_lookup_not_a_dir,		/* /proc/devices	*/
	lxpr_lookup_not_a_dir,		/* /proc/dma		*/
	lxpr_lookup_not_a_dir,		/* /proc/filesystems	*/
	lxpr_lookup_not_a_dir,		/* /proc/interrupts	*/
	lxpr_lookup_not_a_dir,		/* /proc/ioports	*/
	lxpr_lookup_not_a_dir,		/* /proc/kcore		*/
	lxpr_lookup_not_a_dir,		/* /proc/kmsg		*/
	lxpr_lookup_not_a_dir,		/* /proc/loadavg	*/
	lxpr_lookup_not_a_dir,		/* /proc/meminfo	*/
	lxpr_lookup_not_a_dir,		/* /proc/mounts		*/
	lxpr_lookup_netdir,		/* /proc/net		*/
	lxpr_lookup_not_a_dir,		/* /proc/net/arp	*/
	lxpr_lookup_not_a_dir,		/* /proc/net/dev	*/
	lxpr_lookup_not_a_dir,		/* /proc/net/dev_mcast	*/
	lxpr_lookup_not_a_dir,		/* /proc/net/igmp	*/
	lxpr_lookup_not_a_dir,		/* /proc/net/ip_mr_cache */
	lxpr_lookup_not_a_dir,		/* /proc/net/ip_mr_vif	*/
	lxpr_lookup_not_a_dir,		/* /proc/net/mcfilter	*/
	lxpr_lookup_not_a_dir,		/* /proc/net/netstat	*/
	lxpr_lookup_not_a_dir,		/* /proc/net/raw	*/
	lxpr_lookup_not_a_dir,		/* /proc/net/route	*/
	lxpr_lookup_not_a_dir,		/* /proc/net/rpc	*/
	lxpr_lookup_not_a_dir,		/* /proc/net/rt_cache	*/
	lxpr_lookup_not_a_dir,		/* /proc/net/sockstat	*/
	lxpr_lookup_not_a_dir,		/* /proc/net/snmp	*/
	lxpr_lookup_not_a_dir,		/* /proc/net/stat	*/
	lxpr_lookup_not_a_dir,		/* /proc/net/tcp	*/
	lxpr_lookup_not_a_dir,		/* /proc/net/udp	*/
	lxpr_lookup_not_a_dir,		/* /proc/net/unix	*/
	lxpr_lookup_not_a_dir,		/* /proc/partitions	*/
	lxpr_lookup_not_a_dir,		/* /proc/self		*/
	lxpr_lookup_not_a_dir,		/* /proc/stat		*/
	lxpr_lookup_not_a_dir,		/* /proc/uptime		*/
	lxpr_lookup_not_a_dir,		/* /proc/version	*/
};

/*
 * Array of readdir functions, indexed by /proc file type.
 */
static int (*lxpr_readdir_function[LXPR_NFILES])() = {
	lxpr_readdir_procdir,		/* /proc		*/
	lxpr_readdir_piddir,		/* /proc/<pid>		*/
	lxpr_readdir_not_a_dir,		/* /proc/<pid>/cmdline	*/
	lxpr_readdir_not_a_dir,		/* /proc/<pid>/cpu	*/
	lxpr_readdir_not_a_dir,		/* /proc/<pid>/cwd	*/
	lxpr_readdir_not_a_dir,		/* /proc/<pid>/environ	*/
	lxpr_readdir_not_a_dir,		/* /proc/<pid>/exe	*/
	lxpr_readdir_not_a_dir,		/* /proc/<pid>/maps	*/
	lxpr_readdir_not_a_dir,		/* /proc/<pid>/mem	*/
	lxpr_readdir_not_a_dir,		/* /proc/<pid>/root	*/
	lxpr_readdir_not_a_dir,		/* /proc/<pid>/stat	*/
	lxpr_readdir_not_a_dir,		/* /proc/<pid>/statm	*/
	lxpr_readdir_not_a_dir,		/* /proc/<pid>/status	*/
	lxpr_readdir_fddir,		/* /proc/<pid>/fd	*/
	lxpr_readdir_not_a_dir,		/* /proc/<pid>/fd/nn	*/
	lxpr_readdir_not_a_dir,		/* /proc/cmdline	*/
	lxpr_readdir_not_a_dir,		/* /proc/cpuinfo	*/
	lxpr_readdir_not_a_dir,		/* /proc/devices	*/
	lxpr_readdir_not_a_dir,		/* /proc/dma		*/
	lxpr_readdir_not_a_dir,		/* /proc/filesystems	*/
	lxpr_readdir_not_a_dir,		/* /proc/interrupts	*/
	lxpr_readdir_not_a_dir,		/* /proc/ioports	*/
	lxpr_readdir_not_a_dir,		/* /proc/kcore		*/
	lxpr_readdir_not_a_dir,		/* /proc/kmsg		*/
	lxpr_readdir_not_a_dir,		/* /proc/loadavg	*/
	lxpr_readdir_not_a_dir,		/* /proc/meminfo	*/
	lxpr_readdir_not_a_dir,		/* /proc/mounts		*/
	lxpr_readdir_netdir,		/* /proc/net		*/
	lxpr_readdir_not_a_dir,		/* /proc/net/arp	*/
	lxpr_readdir_not_a_dir,		/* /proc/net/dev	*/
	lxpr_readdir_not_a_dir,		/* /proc/net/dev_mcast	*/
lxpr_readdir_not_a_dir, /* /proc/net/igmp */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_readdir_not_a_dir, /* /proc/net/mcfilter */ + lxpr_readdir_not_a_dir, /* /proc/net/netstat */ + lxpr_readdir_not_a_dir, /* /proc/net/raw */ + lxpr_readdir_not_a_dir, /* /proc/net/route */ + lxpr_readdir_not_a_dir, /* /proc/net/rpc */ + lxpr_readdir_not_a_dir, /* /proc/net/rt_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/sockstat */ + lxpr_readdir_not_a_dir, /* /proc/net/snmp */ + lxpr_readdir_not_a_dir, /* /proc/net/stat */ + lxpr_readdir_not_a_dir, /* /proc/net/tcp */ + lxpr_readdir_not_a_dir, /* /proc/net/udp */ + lxpr_readdir_not_a_dir, /* /proc/net/unix */ + lxpr_readdir_not_a_dir, /* /proc/partitions */ + lxpr_readdir_not_a_dir, /* /proc/self */ + lxpr_readdir_not_a_dir, /* /proc/stat */ + lxpr_readdir_not_a_dir, /* /proc/uptime */ + lxpr_readdir_not_a_dir, /* /proc/version */ +}; + + +/* + * lxpr_read(): Vnode operation for VOP_READ() + * + * As the format of all the files that can be read in lxproc is human readable + * and not binary structures there do not have to be different read variants + * depending on whether the reading process model is 32- or 64-bit. + */ +/* ARGSUSED */ +static int +lxpr_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + lxpr_uiobuf_t *uiobuf = lxpr_uiobuf_new(uiop); + int error; + + ASSERT(type < LXPR_NFILES); + + if (type == LXPR_KMSG) { + ldi_ident_t li = VTOLXPM(vp)->lxprm_li; + ldi_handle_t ldih; + struct strioctl str; + int rv; + + /* + * Open the zone's console device using the layered driver + * interface. + */ + if ((error = + ldi_open_by_name("/dev/log", FREAD, cr, &ldih, li)) != 0) + return (error); + + /* + * Send an ioctl to the underlying console device, letting it + * know we're interested in getting console messages. 
+ */ + str.ic_cmd = I_CONSLOG; + str.ic_timout = 0; + str.ic_len = 0; + str.ic_dp = NULL; + if ((error = ldi_ioctl(ldih, I_STR, + (intptr_t)&str, FKIOCTL, cr, &rv)) != 0) + return (error); + + lxpr_read_kmsg(lxpnp, uiobuf, ldih); + + if ((error = ldi_close(ldih, FREAD, cr)) != 0) + return (error); + } else { + lxpr_read_function[type](lxpnp, uiobuf); + } + + error = lxpr_uiobuf_flush(uiobuf); + lxpr_uiobuf_free(uiobuf); + + return (error); +} + +/* + * lxpr_read_invalid(), lxpr_read_isdir(), lxpr_read_empty() + * + * Various special case reads: + * - trying to read a directory + * - invalid file (used to mean a file that should be implemented, + * but isn't yet) + * - empty file + * - wait to be able to read a file that will never have anything to read + */ +/* ARGSUSED */ +static void +lxpr_read_isdir(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EISDIR); +} + +/* ARGSUSED */ +static void +lxpr_read_invalid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EINVAL); +} + +/* ARGSUSED */ +static void +lxpr_read_empty(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* + * lxpr_read_pid_cmdline(): + * + * This is not precisely compatible with Linux: the Linux cmdline returns argv + * with the correct separation using \0 between the arguments, but we cannot do + * that without copying the real argv from the correct process context. This + * is too difficult to attempt so we pretend that the entire cmdline is just + * argv[0]. This is good enough for ps and htop to display correctly, but might + * cause some other things not to work correctly. + */ +static void +lxpr_read_pid_cmdline(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + char *buf; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_CMDLINE); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + buf = PTOU(p)->u_argv != 0 ? 
PTOU(p)->u_psargs : PTOU(p)->u_comm; + + lxpr_uiobuf_write(uiobuf, buf, strlen(buf) + 1); + lxpr_unlock(p); +} + +/* + * lxpr_read_pid_maps(): memory map file + */ +static void +lxpr_read_pid_maps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + struct seg *seg; + char *buf; + int buflen = MAXPATHLEN; + struct print_data { + caddr_t saddr; + caddr_t eaddr; + int type; + char prot[5]; + uint32_t offset; + vnode_t *vp; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + struct print_data *pbuf; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_MAPS); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + as = p->p_as; + + if (as == &kas) { + lxpr_unlock(p); + return; + } + + mutex_exit(&p->p_lock); + + /* Iterate over all segments in the address space */ + AS_LOCK_ENTER(as, RW_READER); + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + vnode_t *vp; + uint_t protbits; + + if ((seg->s_flags & S_HOLE) != 0) { + continue; + } + + pbuf = kmem_alloc(sizeof (*pbuf), KM_SLEEP); + + pbuf->saddr = seg->s_base; + pbuf->eaddr = seg->s_base+seg->s_size; + pbuf->type = SEGOP_GETTYPE(seg, seg->s_base); + + /* + * Cheat and only use the protection bits of the first page + * in the segment + */ + (void) strncpy(pbuf->prot, "----", sizeof (pbuf->prot)); + (void) SEGOP_GETPROT(seg, seg->s_base, 0, &protbits); + + if (protbits & PROT_READ) pbuf->prot[0] = 'r'; + if (protbits & PROT_WRITE) pbuf->prot[1] = 'w'; + if (protbits & PROT_EXEC) pbuf->prot[2] = 'x'; + if (pbuf->type & MAP_SHARED) pbuf->prot[3] = 's'; + else if (pbuf->type & MAP_PRIVATE) pbuf->prot[3] = 'p'; + + if (seg->s_ops == &segvn_ops && + SEGOP_GETVP(seg, seg->s_base, &vp) == 0 && + vp != NULL && vp->v_type == VREG) { + VN_HOLD(vp); + pbuf->vp = vp; + } else { + pbuf->vp = NULL; + } + + pbuf->offset = (uint32_t)SEGOP_GETOFFSET(seg, pbuf->saddr); + + pbuf->next = NULL; + 
*print_tail = pbuf; + print_tail = &pbuf->next; + } + AS_LOCK_EXIT(as); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + buf = kmem_alloc(buflen, KM_SLEEP); + + /* print the data we've extracted */ + pbuf = print_head; + while (pbuf != NULL) { + struct print_data *pbuf_next; + vattr_t vattr; + + int maj = 0; + int min = 0; + u_longlong_t inode = 0; + + *buf = '\0'; + if (pbuf->vp != NULL) { + vattr.va_mask = AT_FSID | AT_NODEID; + if (VOP_GETATTR(pbuf->vp, &vattr, 0, CRED(), + NULL) == 0) { + maj = getmajor(vattr.va_fsid); + min = getminor(vattr.va_fsid); + inode = vattr.va_nodeid; + } + (void) vnodetopath(NULL, pbuf->vp, buf, buflen, CRED()); + VN_RELE(pbuf->vp); + } + + if (*buf != '\0') { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02d:%03d %lld %s\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode, buf); + } else { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02d:%03d %lld\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode); + } + + pbuf_next = pbuf->next; + kmem_free(pbuf, sizeof (*pbuf)); + pbuf = pbuf_next; + } + + kmem_free(buf, buflen); +} + +/* + * lxpr_read_pid_statm(): memory status file + */ +static void +lxpr_read_pid_statm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + size_t vsize; + size_t rss; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATM); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + as = p->p_as; + + mutex_exit(&p->p_lock); + + AS_LOCK_ENTER(as, RW_READER); + vsize = btopr(as->a_resvsize); + rss = rm_asrss(as); + AS_LOCK_EXIT(as); + + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, + "%lu %lu %lu %lu %lu %lu %lu\n", + vsize, rss, 0l, rss, 0l, 0l, 0l); +} + +/* + * lxpr_read_pid_status(): status file + */ +static void +lxpr_read_pid_status(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + user_t *up; + cred_t *cr; + const gid_t *groups; 
+ int ngroups; + struct as *as; + char *status; + pid_t pid, ppid; + size_t vsize; + size_t rss; + k_sigset_t current, ignore, handle; + int i, lx_sig; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATUS); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + pid = p->p_pid; + + /* + * Convert pid to the Linux default of 1 if we're the zone's init + * process + */ + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; + ppid = 0; /* parent pid for init is 0 */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) + ? curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + } + + t = prchoose(p); + if (t != NULL) { + switch (t->t_state) { + case TS_SLEEP: + status = "S (sleeping)"; + break; + case TS_RUN: + case TS_ONPROC: + status = "R (running)"; + break; + case TS_ZOMB: + status = "Z (zombie)"; + break; + case TS_STOPPED: + status = "T (stopped)"; + break; + default: + status = "! (unknown)"; + break; + } + thread_unlock(t); + } else { + /* + * there is a hole in the exit code, where a proc can have + * no threads but it is yet to be flagged SZOMB. 
We will + * assume we are about to become a zombie + */ + status = "Z (zombie)"; + } + + up = PTOU(p); + mutex_enter(&p->p_crlock); + crhold(cr = p->p_cred); + mutex_exit(&p->p_crlock); + + lxpr_uiobuf_printf(uiobuf, + "Name:\t%s\n" + "State:\t%s\n" + "Tgid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" + "TracerPid:\t%d\n" + "Uid:\t%u\t%u\t%u\t%u\n" + "Gid:\t%u\t%u\t%u\t%u\n" + "FDSize:\t%d\n" + "Groups:\t", + up->u_comm, + status, + pid, /* thread group id - same as pid */ + pid, + ppid, + 0, + crgetruid(cr), crgetuid(cr), crgetsuid(cr), crgetuid(cr), + crgetrgid(cr), crgetgid(cr), crgetsgid(cr), crgetgid(cr), + p->p_fno_ctl); + + ngroups = crgetngroups(cr); + groups = crgetgroups(cr); + for (i = 0; i < ngroups; i++) { + lxpr_uiobuf_printf(uiobuf, + "%u ", + groups[i]); + } + crfree(cr); + + as = p->p_as; + if ((p->p_stat != SZOMB) && !(p->p_flag & (SSYS | SEXITING)) && + (as != &kas)) { + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as); + mutex_enter(&p->p_lock); + + lxpr_uiobuf_printf(uiobuf, + "\n" + "VmSize:\t%8lu kB\n" + "VmLck:\t%8lu kB\n" + "VmRSS:\t%8lu kB\n" + "VmData:\t%8lu kB\n" + "VmStk:\t%8lu kB\n" + "VmExe:\t%8lu kB\n" + "VmLib:\t%8lu kB", + btok(vsize), + 0l, + ptok(rss), + 0l, + btok(p->p_stksize), + ptok(rss), + 0l); + } + + sigemptyset(¤t); + sigemptyset(&ignore); + sigemptyset(&handle); + + for (i = 1; i < NSIG; i++) { + lx_sig = lxpr_sigmap[i]; + + if ((lx_sig > 0) && (lx_sig <= LX_NSIG)) { + if (sigismember(&p->p_sig, i)) + sigaddset(¤t, lx_sig); + + if (up->u_signal[i - 1] == SIG_IGN) + sigaddset(&ignore, lx_sig); + else if (up->u_signal[i - 1] != SIG_DFL) + sigaddset(&handle, lx_sig); + } + } + + lxpr_uiobuf_printf(uiobuf, + "\n" + "SigPnd:\t%08x%08x\n" + "SigBlk:\t%08x%08x\n" + "SigIgn:\t%08x%08x\n" + "SigCgt:\t%08x%08x\n" + "CapInh:\t%016x\n" + "CapPrm:\t%016x\n" + "CapEff:\t%016x\n", + current.__sigbits[1], current.__sigbits[0], + 0, 0, /* signals blocked on per thread 
basis */ + ignore.__sigbits[1], ignore.__sigbits[0], + handle.__sigbits[1], handle.__sigbits[0], + /* Can't do anything with linux capabilities */ + 0, + 0, + 0); + + lxpr_unlock(p); +} + + +/* + * lxpr_read_pid_stat(): pid stat file + */ +static void +lxpr_read_pid_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + struct as *as; + char stat; + pid_t pid, ppid, pgpid, spid; + gid_t psgid; + dev_t psdev; + size_t rss, vsize; + int nice, pri; + caddr_t wchan; + processorid_t cpu; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STAT); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + pid = p->p_pid; + + /* + * Set Linux defaults if we're the zone's init process + */ + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; /* PID for init */ + ppid = 0; /* parent PID for init is 0 */ + pgpid = 0; /* process group for init is 0 */ + psgid = (gid_t)-1; /* credential GID for init is -1 */ + spid = 0; /* session id for init is 0 */ + psdev = 0; /* session device for init is 0 */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) ? 
+ curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + + pgpid = p->p_pgrp; + + mutex_enter(&p->p_splock); + mutex_enter(&p->p_sessp->s_lock); + spid = p->p_sessp->s_sid; + psdev = p->p_sessp->s_dev; + if (p->p_sessp->s_cred) + psgid = crgetgid(p->p_sessp->s_cred); + else + psgid = crgetgid(p->p_cred); + + mutex_exit(&p->p_sessp->s_lock); + mutex_exit(&p->p_splock); + } + + t = prchoose(p); + if (t != NULL) { + switch (t->t_state) { + case TS_SLEEP: + stat = 'S'; break; + case TS_RUN: + case TS_ONPROC: + stat = 'R'; break; + case TS_ZOMB: + stat = 'Z'; break; + case TS_STOPPED: + stat = 'T'; break; + default: + stat = '!'; break; + } + + if (CL_DONICE(t, NULL, 0, &nice) != 0) + nice = 0; + + pri = t->t_pri; + wchan = t->t_wchan; + cpu = t->t_cpu->cpu_id; + thread_unlock(t); + } else { + /* Only zombies have no threads */ + stat = 'Z'; + nice = 0; + pri = 0; + wchan = 0; + cpu = 0; + } + as = p->p_as; + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as); + mutex_enter(&p->p_lock); + + lxpr_uiobuf_printf(uiobuf, + "%d (%s) %c %d %d %d %d %d " + "%lu %lu %lu %lu %lu " + "%lu %lu %ld %ld " + "%d %d %d " + "%lu " + "%lu " + "%lu %ld %llu " + "%lu %lu %u " + "%lu %lu " + "%lu %lu %lu %lu " + "%lu " + "%lu %lu " + "%d " + "%d" + "\n", + pid, PTOU(p)->u_comm, stat, ppid, pgpid, spid, psdev, psgid, + 0l, 0l, 0l, 0l, 0l, /* flags, minflt, cminflt, majflt, cmajflt */ + p->p_utime, p->p_stime, p->p_cutime, p->p_cstime, + pri, nice, p->p_lwpcnt, + 0l, /* itrealvalue (time before next SIGALRM) */ + PTOU(p)->u_ticks, + vsize, rss, p->p_vmem_ctl, + 0l, 0l, USRSTACK, /* startcode, endcode, startstack */ + 0l, 0l, /* kstkesp, kstkeip */ + 0l, 0l, 0l, 0l, /* signal, blocked, sigignore, sigcatch */ + wchan, + 0l, 0l, /* nswap, cnswap */ + 0, /* exit_signal 
*/ + cpu); + + lxpr_unlock(p); +} + +/* ARGSUSED */ +static void +lxpr_read_net_arp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, "Inter-| Receive " + " | Transmit\n"); + lxpr_uiobuf_printf(uiobuf, " face |bytes packets errs drop fifo" + " frame compressed multicast|bytes packets errs drop fifo" + " colls carrier compressed\n"); + + /* + * Data about each interface should go here, but that shouldn't be added + * unless there is an lxproc reader that actually makes use of it (and + * doesn't need anything else that we refuse to provide)... + */ +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev_mcast(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_igmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_vif(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_mcfilter(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_netstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_raw(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_rpc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_rt_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_sockstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_snmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t 
*uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_tcp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_udp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_unix(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* + * lxpr_read_kmsg(): read the contents of the kernel message queue. We + * translate this into the reception of console messages for this zone; each + * read copies out a single zone console message, or blocks until the next one + * is produced. + */ + +#define LX_KMSG_PRI "<0>" + +static void +lxpr_read_kmsg(lxpr_node_t *lxpnp, struct lxpr_uiobuf *uiobuf, ldi_handle_t lh) +{ + mblk_t *mp; + + ASSERT(lxpnp->lxpr_type == LXPR_KMSG); + + if (ldi_getmsg(lh, &mp, NULL) == 0) { + /* + * lxproc doesn't like successive reads to the same file + * descriptor unless we do an explicit rewind each time. + */ + lxpr_uiobuf_seek(uiobuf, 0); + + lxpr_uiobuf_printf(uiobuf, "%s%s", LX_KMSG_PRI, + mp->b_cont->b_rptr); + + freemsg(mp); + } +} + +/* + * lxpr_read_loadavg(): read the contents of the "loadavg" file. We do just + * enough for uptime and other simple lxproc readers to work + */ +extern int nthread; + +static void +lxpr_read_loadavg(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ulong_t avenrun1; + ulong_t avenrun5; + ulong_t avenrun15; + ulong_t avenrun1_cs; + ulong_t avenrun5_cs; + ulong_t avenrun15_cs; + int loadavg[3]; + int *loadbuf; + cpupart_t *cp; + zone_t *zone = LXPTOZ(lxpnp); + + uint_t nrunnable = 0; + rctl_qty_t nlwps; + + ASSERT(lxpnp->lxpr_type == LXPR_LOADAVG); + + mutex_enter(&cpu_lock); + + /* + * Need to add up values over all CPU partitions. If pools are active, + * only report the values of the zone's partition, which by definition + * includes the current CPU. 
+ */ + if (pool_pset_enabled()) { + psetid_t psetid = zone_pset_get(curproc->p_zone); + + ASSERT(curproc->p_zone != &zone0); + cp = CPU->cpu_part; + + nrunnable = cp->cp_nrunning + cp->cp_nrunnable; + (void) cpupart_get_loadavg(psetid, &loadavg[0], 3); + loadbuf = &loadavg[0]; + } else { + cp = cp_list_head; + do { + nrunnable += cp->cp_nrunning + cp->cp_nrunnable; + } while ((cp = cp->cp_next) != cp_list_head); + + loadbuf = zone == global_zone ? + &avenrun[0] : zone->zone_avenrun; + } + + /* + * If we're in the non-global zone, we'll report the total number of + * LWPs in the zone for the "nproc" parameter of /proc/loadavg, + * otherwise will just use nthread (which will include kernel threads, + * but should be good enough for lxproc). + */ + nlwps = zone == global_zone ? nthread : zone->zone_nlwps; + + mutex_exit(&cpu_lock); + + avenrun1 = loadbuf[0] >> FSHIFT; + avenrun1_cs = ((loadbuf[0] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun5 = loadbuf[1] >> FSHIFT; + avenrun5_cs = ((loadbuf[1] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun15 = loadbuf[2] >> FSHIFT; + avenrun15_cs = ((loadbuf[2] & (FSCALE-1)) * 100) >> FSHIFT; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d %ld.%02d %d/%d %d\n", + avenrun1, avenrun1_cs, + avenrun5, avenrun5_cs, + avenrun15, avenrun15_cs, + nrunnable, nlwps, 0); +} + +/* + * lxpr_read_meminfo(): read the contents of the "meminfo" file. 
+ */ +static void +lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + int global = zone == global_zone; + ulong_t total_mem, free_mem, total_swap, used_swap; + + ASSERT(lxpnp->lxpr_type == LXPR_MEMINFO); + + zone_get_physmem_data(zone->zone_id, (pgcnt_t *)&total_mem, + (pgcnt_t *)&free_mem); + total_mem = ptob(total_mem); + free_mem = ptob(free_mem); + + if (global || zone->zone_max_swap_ctl == UINT64_MAX) { + total_swap = ptob(k_anoninfo.ani_max); + used_swap = ptob(k_anoninfo.ani_phys_resv); + } else { + mutex_enter(&zone->zone_mem_lock); + total_swap = zone->zone_max_swap_ctl; + used_swap = zone->zone_max_swap; + mutex_exit(&zone->zone_mem_lock); + } + + lxpr_uiobuf_printf(uiobuf, + " total: used: free: shared: buffers: cached:\n" + "Mem: %8lu %8lu %8lu %8u %8u %8u\n" + "Swap: %8lu %8lu %8lu\n" + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "MemAvailable: %8lu kB\n" + "MemShared: %8u kB\n" + "Buffers: %8u kB\n" + "Cached: %8u kB\n" + "SwapCached: %8u kB\n" + "Active: %8u kB\n" + "Inactive: %8u kB\n" + "HighTotal: %8u kB\n" + "HighFree: %8u kB\n" + "LowTotal: %8u kB\n" + "LowFree: %8u kB\n" + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n", + total_mem, total_mem - free_mem, free_mem, 0, 0, 0, + total_swap, used_swap, total_swap - used_swap, + btok(total_mem), /* MemTotal */ + btok(free_mem), /* MemFree */ + btok(free_mem), /* MemAvailable */ + 0, /* MemShared */ + 0, /* Buffers */ + 0, /* Cached */ + 0, /* SwapCached */ + 0, /* Active */ + 0, /* Inactive */ + 0, /* HighTotal */ + 0, /* HighFree */ + btok(total_mem), /* LowTotal */ + btok(free_mem), /* LowFree */ + btok(total_swap), /* SwapTotal */ + btok(total_swap - used_swap)); /* SwapFree */ +} + +/* + * lxpr_read_mounts(): + */ +/* ARGSUSED */ +static void +lxpr_read_mounts(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + struct vfs *vfsp; + struct vfs *vfslist; + zone_t *zone = LXPTOZ(lxpnp); + struct print_data { + refstr_t *vfs_mntpt; + refstr_t 
*vfs_resource; + uint_t vfs_flag; + int vfs_fstype; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + struct print_data *printp; + + vfs_list_read_lock(); + + if (zone == global_zone) { + vfsp = vfslist = rootvfs; + } else { + vfsp = vfslist = zone->zone_vfslist; + /* + * If the zone has a root entry, it will be the first in + * the list. If it doesn't, we conjure one up. + */ + if (vfslist == NULL || strcmp(refstr_value(vfsp->vfs_mntpt), + zone->zone_rootpath) != 0) { + struct vfs *tvfsp; + /* + * The root of the zone is not a mount point. The vfs + * we want to report is that of the zone's root vnode. + */ + tvfsp = zone->zone_rootvp->v_vfsp; + + lxpr_uiobuf_printf(uiobuf, + "/ / %s %s 0 0\n", + vfssw[tvfsp->vfs_fstype].vsw_name, + tvfsp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + + } + if (vfslist == NULL) { + vfs_list_unlock(); + return; + } + } + + /* + * Later on we have to do a lookupname, which can end up causing + * another vfs_list_read_lock() to be called. Which can lead to a + * deadlock. To avoid this, we extract the data we need into a local + * list, then we can run this list without holding vfs_list_read_lock() + * We keep the list in the same order as the vfs_list + */ + do { + /* Skip mounts we shouldn't show */ + if (vfsp->vfs_flag & VFS_NOMNTTAB) { + goto nextfs; + } + + printp = kmem_alloc(sizeof (*printp), KM_SLEEP); + refstr_hold(vfsp->vfs_mntpt); + printp->vfs_mntpt = vfsp->vfs_mntpt; + refstr_hold(vfsp->vfs_resource); + printp->vfs_resource = vfsp->vfs_resource; + printp->vfs_flag = vfsp->vfs_flag; + printp->vfs_fstype = vfsp->vfs_fstype; + printp->next = NULL; + + *print_tail = printp; + print_tail = &printp->next; + +nextfs: + vfsp = (zone == global_zone) ? 
+ vfsp->vfs_next : vfsp->vfs_zone_next; + + } while (vfsp != vfslist); + + vfs_list_unlock(); + + /* + * now we can run through what we've extracted without holding + * vfs_list_read_lock() + */ + printp = print_head; + while (printp != NULL) { + struct print_data *printp_next; + const char *resource; + char *mntpt; + struct vnode *vp; + int error; + + mntpt = (char *)refstr_value(printp->vfs_mntpt); + resource = refstr_value(printp->vfs_resource); + + if (mntpt != NULL && mntpt[0] != '\0') + mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); + else + mntpt = "-"; + + error = lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + + if (error != 0) + goto nextp; + + if (!(vp->v_flag & VROOT)) { + VN_RELE(vp); + goto nextp; + } + VN_RELE(vp); + + if (resource != NULL && resource[0] != '\0') { + if (resource[0] == '/') { + resource = ZONE_PATH_VISIBLE(resource, zone) ? + ZONE_PATH_TRANSLATE(resource, zone) : + mntpt; + } + } else { + resource = "-"; + } + + lxpr_uiobuf_printf(uiobuf, + "%s %s %s %s 0 0\n", + resource, mntpt, vfssw[printp->vfs_fstype].vsw_name, + printp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + +nextp: + printp_next = printp->next; + refstr_rele(printp->vfs_mntpt); + refstr_rele(printp->vfs_resource); + kmem_free(printp, sizeof (*printp)); + printp = printp_next; + + } +} + +/* + * lxpr_read_partitions(): + * + * We don't support partitions in a local zone because it requires access to + * physical devices. But we need to fake up enough of the file to show that we + * have no partitions. + */ +/* ARGSUSED */ +static void +lxpr_read_partitions(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, + "major minor #blocks name rio rmerge rsect ruse " + "wio wmerge wsect wuse running use aveq\n\n"); +} + +/* + * lxpr_read_version(): read the contents of the "version" file. Note that + * we don't lie here -- we don't pretend that we're Linux. 
If lxproc is to + * be used in a Linux-branded zone, there will need to be a mount option to + * indicate that Linux should be more fully mimicked. + */ +/* ARGSUSED */ +static void +lxpr_read_version(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, + "%s version %s (%s version %d.%d.%d) " + "#%s SMP %s\n", + utsname.sysname, utsname.release, +#if defined(__GNUC__) + "gcc", + __GNUC__, + __GNUC_MINOR__, + __GNUC_PATCHLEVEL__, +#else + "Sun C", + __SUNPRO_C / 0x100, + (__SUNPRO_C & 0xff) / 0x10, + __SUNPRO_C & 0xf, +#endif + utsname.version, + "00:00:00 00/00/00"); +} + +/* + * lxpr_read_stat(): read the contents of the "stat" file. + * + */ +/* ARGSUSED */ +static void +lxpr_read_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t sys_cum = 0; + ulong_t user_cum = 0; + ulong_t irq_cum = 0; + ulong_t cpu_nrunnable_cum = 0; + ulong_t w_io_cum = 0; + + ulong_t pgpgin_cum = 0; + ulong_t pgpgout_cum = 0; + ulong_t pgswapout_cum = 0; + ulong_t pgswapin_cum = 0; + ulong_t intr_cum = 0; + ulong_t pswitch_cum = 0; + ulong_t forks_cum = 0; + hrtime_t msnsecs[NCMSTATES]; + + /* temporary variable since scalehrtime modifies data in place */ + hrtime_t tmptime; + + ASSERT(lxpnp->lxpr_type == LXPR_STAT); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + /* Calculate cumulative stats */ + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + int i; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. 
+ */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_cum += NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_cum += NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_cum += NSEC_TO_TICK(msnsecs[CMS_USER]); + + pgpgin_cum += CPU_STATS(cp, vm.pgpgin); + pgpgout_cum += CPU_STATS(cp, vm.pgpgout); + pgswapin_cum += CPU_STATS(cp, vm.pgswapin); + pgswapout_cum += CPU_STATS(cp, vm.pgswapout); + + cpu_nrunnable_cum += cp->cpu_disp->disp_nrunnable; + w_io_cum += CPU_STATS(cp, sys.iowait); + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_cum += NSEC_TO_TICK(tmptime); + } + + for (i = 0; i < PIL_MAX; i++) + intr_cum += CPU_STATS(cp, sys.intr[i]); + + pswitch_cum += CPU_STATS(cp, sys.pswitch); + forks_cum += CPU_STATS(cp, sys.sysfork); + forks_cum += CPU_STATS(cp, sys.sysvfork); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + lxpr_uiobuf_printf(uiobuf, "cpu %lu %lu %lu %lu %lu %lu %lu\n", + user_cum, 0L, sys_cum, idle_cum, 0L, irq_cum, 0L); + + /* Do per processor stats */ + do { + int i; + + ulong_t idle_ticks; + ulong_t sys_ticks; + ulong_t user_ticks; + ulong_t irq_ticks = 0; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. 
+ */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_ticks = NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_ticks = NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_ticks = NSEC_TO_TICK(msnsecs[CMS_USER]); + + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_ticks += NSEC_TO_TICK(tmptime); + } + + lxpr_uiobuf_printf(uiobuf, + "cpu%d %lu %lu %lu %lu %lu %lu %lu\n", + cp->cpu_id, user_ticks, 0L, sys_ticks, idle_ticks, + 0L, irq_ticks, 0L); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); + + lxpr_uiobuf_printf(uiobuf, + "page %lu %lu\n" + "swap %lu %lu\n" + "intr %lu\n" + "ctxt %lu\n" + "btime %lu\n" + "processes %lu\n" + "procs_running %lu\n" + "procs_blocked %lu\n", + pgpgin_cum, pgpgout_cum, + pgswapin_cum, pgswapout_cum, + intr_cum, + pswitch_cum, + boot_time, + forks_cum, + cpu_nrunnable_cum, + w_io_cum); +} + +/* + * lxpr_read_uptime(): read the contents of the "uptime" file. + * + * format is: "%.2lf, %.2lf",uptime_secs, idle_secs + * Use fixed point arithmetic to get 2 decimal places + */ +/* ARGSUSED */ +static void +lxpr_read_uptime(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t cpu_count = 0; + ulong_t idle_s; + ulong_t idle_cs; + ulong_t up_s; + ulong_t up_cs; + hrtime_t birthtime; + hrtime_t centi_sec = 10000000; /* 10^7 */ + + ASSERT(lxpnp->lxpr_type == LXPR_UPTIME); + + /* Calculate cumulative stats */ + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. 
+ */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + idle_cum += CPU_STATS(cp, sys.cpu_ticks_idle); + idle_cum += CPU_STATS(cp, sys.cpu_ticks_wait); + cpu_count += 1; + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + mutex_exit(&cpu_lock); + + /* Getting the Zone zsched process startup time */ + birthtime = LXPTOZ(lxpnp)->zone_zsched->p_mstart; + up_cs = (gethrtime() - birthtime) / centi_sec; + up_s = up_cs / 100; + up_cs %= 100; + + ASSERT(cpu_count > 0); + idle_cum /= cpu_count; + idle_s = idle_cum / hz; + idle_cs = idle_cum % hz; + idle_cs *= 100; + idle_cs /= hz; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d\n", up_s, up_cs, idle_s, idle_cs); +} + +static const char *amd_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "mp", + "nx", NULL, "mmxext", NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", "3dnowext", "3dnow" +}; + +static const char *amd_x_ecx[] = { + "lahf_lm", NULL, "svm", NULL, + "altmovcr8" +}; + +static const char *tm_x_edx[] = { + "recovery", "longrun", NULL, "lrti" +}; + +/* + * Intel calls no-execute "xd" in its docs, but Linux still reports it as "nx." + */ +static const char *intc_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + "nx", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", NULL, NULL +}; + +static const char *intc_edx[] = { + "fpu", "vme", "de", "pse", + "tsc", "msr", "pae", "mce", + "cx8", "apic", NULL, "sep", + "mtrr", "pge", "mca", "cmov", + "pat", "pse36", "pn", "clflush", + NULL, "dts", "acpi", "mmx", + "fxsr", "sse", "sse2", "ss", + "ht", "tm", "ia64", "pbe" +}; + +/* + * "sse3" on linux is called "pni" (Prescott New Instructions). 
+ */ +static const char *intc_ecx[] = { + "pni", NULL, NULL, "monitor", + "ds_cpl", NULL, NULL, "est", + "tm2", NULL, "cid", NULL, + NULL, "cx16", "xtpr" +}; + +static void +lxpr_read_cpuinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + int i; + uint32_t bits; + cpu_t *cp, *cpstart; + int pools_enabled; + const char **fp; + char brandstr[CPU_IDSTRLEN]; + struct cpuid_regs cpr; + int maxeax; + int std_ecx, std_edx, ext_ecx, ext_edx; + + ASSERT(lxpnp->lxpr_type == LXPR_CPUINFO); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + /* + * This returns the maximum eax value for standard cpuid + * functions in eax. + */ + cpr.cp_eax = 0; + (void) cpuid_insn(cp, &cpr); + maxeax = cpr.cp_eax; + + /* + * Get standard x86 feature flags. + */ + cpr.cp_eax = 1; + (void) cpuid_insn(cp, &cpr); + std_ecx = cpr.cp_ecx; + std_edx = cpr.cp_edx; + + /* + * Now get extended feature flags. + */ + cpr.cp_eax = 0x80000001; + (void) cpuid_insn(cp, &cpr); + ext_ecx = cpr.cp_ecx; + ext_edx = cpr.cp_edx; + + (void) cpuid_getbrandstr(cp, brandstr, CPU_IDSTRLEN); + + lxpr_uiobuf_printf(uiobuf, + "processor\t: %d\n" + "vendor_id\t: %s\n" + "cpu family\t: %d\n" + "model\t\t: %d\n" + "model name\t: %s\n" + "stepping\t: %d\n" + "cpu MHz\t\t: %u.%03u\n", + cp->cpu_id, cpuid_getvendorstr(cp), cpuid_getfamily(cp), + cpuid_getmodel(cp), brandstr, cpuid_getstep(cp), + (uint32_t)(cpu_freq_hz / 1000000), + ((uint32_t)(cpu_freq_hz / 1000)) % 1000); + + lxpr_uiobuf_printf(uiobuf, "cache size\t: %u KB\n", + getl2cacheinfo(cp, NULL, NULL, NULL) / 1024); + + if (is_x86_feature(x86_featureset, X86FSET_HTT)) { + /* + * 'siblings' is used for HT-style threads + */ + lxpr_uiobuf_printf(uiobuf, + "physical id\t: %lu\n" + "siblings\t: %u\n", + pg_plat_hw_instance_id(cp, PGHW_CHIP), + cpuid_get_ncpu_per_chip(cp)); + } + + /* + * Since we're relatively picky about running on older hardware, + * we can be somewhat cavalier about the answers to 
these ones.
+	 *
+	 * In fact, given the hardware we support, we just say:
+	 *
+	 *	fdiv_bug	: no	(if we're on a 64-bit kernel)
+	 *	hlt_bug		: no
+	 *	f00f_bug	: no
+	 *	coma_bug	: no
+	 *	wp		: yes	(write protect in supervisor mode)
+	 */
+		lxpr_uiobuf_printf(uiobuf,
+		    "fdiv_bug\t: %s\n"
+		    "hlt_bug \t: no\n"
+		    "f00f_bug\t: no\n"
+		    "coma_bug\t: no\n"
+		    "fpu\t\t: %s\n"
+		    "fpu_exception\t: %s\n"
+		    "cpuid level\t: %d\n"
+		    "flags\t\t:",
+#if defined(__i386)
+		    fpu_pentium_fdivbug ? "yes" : "no",
+#else
+		    "no",
+#endif /* __i386 */
+		    fpu_exists ? "yes" : "no", fpu_exists ? "yes" : "no",
+		    maxeax);
+
+		for (bits = std_edx, fp = intc_edx, i = 0;
+		    i < sizeof (intc_edx) / sizeof (intc_edx[0]); fp++, i++)
+			if ((bits & (1 << i)) != 0 && *fp)
+				lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+
+		/*
+		 * name additional features where appropriate
+		 */
+		switch (x86_vendor) {
+		case X86_VENDOR_Intel:
+			for (bits = ext_edx, fp = intc_x_edx, i = 0;
+			    i < sizeof (intc_x_edx) / sizeof (intc_x_edx[0]);
+			    fp++, i++)
+				if ((bits & (1 << i)) != 0 && *fp)
+					lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+			break;
+
+		case X86_VENDOR_AMD:
+			for (bits = ext_edx, fp = amd_x_edx, i = 0;
+			    i < sizeof (amd_x_edx) / sizeof (amd_x_edx[0]);
+			    fp++, i++)
+				if ((bits & (1 << i)) != 0 && *fp)
+					lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+
+			for (bits = ext_ecx, fp = amd_x_ecx, i = 0;
+			    i < sizeof (amd_x_ecx) / sizeof (amd_x_ecx[0]);
+			    fp++, i++)
+				if ((bits & (1 << i)) != 0 && *fp)
+					lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+			break;
+
+		case X86_VENDOR_TM:
+			for (bits = ext_edx, fp = tm_x_edx, i = 0;
+			    i < sizeof (tm_x_edx) / sizeof (tm_x_edx[0]);
+			    fp++, i++)
+				if ((bits & (1 << i)) != 0 && *fp)
+					lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+			break;
+		default:
+			break;
+		}
+
+		for (bits = std_ecx, fp = intc_ecx, i = 0;
+		    i < sizeof (intc_ecx) / sizeof (intc_ecx[0]); fp++, i++)
+			if ((bits & (1 << i)) != 0 && *fp)
+				lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+
+		lxpr_uiobuf_printf(uiobuf, "\n\n");
+
+		if (pools_enabled)
+			cp = 
cp->cpu_next_part;
+		else
+			cp = cp->cpu_next;
+	} while (cp != cpstart);
+
+	mutex_exit(&cpu_lock);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_fd(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_FD_FD);
+	lxpr_uiobuf_seterr(uiobuf, EFAULT);
+}
+
+/*
+ * lxpr_getattr(): Vnode operation for VOP_GETATTR()
+ */
+static int
+lxpr_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+    caller_context_t *ct)
+{
+	register lxpr_node_t *lxpnp = VTOLXP(vp);
+	lxpr_nodetype_t type = lxpnp->lxpr_type;
+	extern uint_t nproc;
+	int error;
+
+	/*
+	 * Return attributes of underlying vnode if ATTR_REAL
+	 *
+	 * but keep fd files with the symlink permissions
+	 */
+	if (lxpnp->lxpr_realvp != NULL && (flags & ATTR_REAL)) {
+		vnode_t *rvp = lxpnp->lxpr_realvp;
+
+		/*
+		 * withhold attribute information to owner or root
+		 */
+		if ((error = VOP_ACCESS(rvp, 0, 0, cr, ct)) != 0) {
+			return (error);
+		}
+
+		/*
+		 * now its attributes
+		 */
+		if ((error = VOP_GETATTR(rvp, vap, flags, cr, ct)) != 0) {
+			return (error);
+		}
+
+		/*
+		 * if it's a file in lx /proc/pid/fd/xx then set its
+		 * mode and keep it looking like a symlink
+		 */
+		if (type == LXPR_PID_FD_FD) {
+			vap->va_mode = lxpnp->lxpr_mode;
+			vap->va_type = vp->v_type;
+			vap->va_size = 0;
+			vap->va_nlink = 1;
+		}
+		return (0);
+	}
+
+	/* Default attributes, that may be overridden below */
+	bzero(vap, sizeof (*vap));
+	vap->va_atime = vap->va_mtime = vap->va_ctime = lxpnp->lxpr_time;
+	vap->va_nlink = 1;
+	vap->va_type = vp->v_type;
+	vap->va_mode = lxpnp->lxpr_mode;
+	vap->va_fsid = vp->v_vfsp->vfs_dev;
+	vap->va_blksize = DEV_BSIZE;
+	vap->va_uid = lxpnp->lxpr_uid;
+	vap->va_gid = lxpnp->lxpr_gid;
+	vap->va_nodeid = lxpnp->lxpr_ino;
+
+	switch (type) {
+	case LXPR_PROCDIR:
+		vap->va_nlink = nproc + 2 + PROCDIRFILES;
+		vap->va_size = (nproc + 2 + PROCDIRFILES) * LXPR_SDSIZE;
+		break;
+	case LXPR_PIDDIR:
+		vap->va_nlink = PIDDIRFILES;
+		vap->va_size = PIDDIRFILES * LXPR_SDSIZE;
+		break;
+	case 
LXPR_SELF: + vap->va_uid = crgetruid(curproc->p_cred); + vap->va_gid = crgetrgid(curproc->p_cred); + break; + default: + break; + } + + vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size); + return (0); +} + +/* + * lxpr_access(): Vnode operation for VOP_ACCESS() + */ +static int +lxpr_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + int shift = 0; + proc_t *tp; + + /* lx /proc is a read only file system */ + if (mode & VWRITE) + return (EROFS); + + /* + * If this is a restricted file, check access permissions. + */ + switch (lxpnp->lxpr_type) { + case LXPR_PIDDIR: + return (0); + case LXPR_PID_CURDIR: + case LXPR_PID_ENV: + case LXPR_PID_EXE: + case LXPR_PID_MAPS: + case LXPR_PID_MEM: + case LXPR_PID_ROOTDIR: + case LXPR_PID_FDDIR: + case LXPR_PID_FD_FD: + if ((tp = lxpr_lock(lxpnp->lxpr_pid)) == NULL) + return (ENOENT); + if (tp != curproc && secpolicy_proc_access(cr) != 0 && + priv_proc_cred_perm(cr, tp, NULL, mode) != 0) { + lxpr_unlock(tp); + return (EACCES); + } + lxpr_unlock(tp); + default: + break; + } + + if (lxpnp->lxpr_realvp != NULL) { + /* + * For these we use the underlying vnode's accessibility. + */ + return (VOP_ACCESS(lxpnp->lxpr_realvp, mode, flags, cr, ct)); + } + + /* If user is root allow access regardless of permission bits */ + if (secpolicy_proc_access(cr) == 0) + return (0); + + /* + * Access check is based on only one of owner, group, public. If not + * owner, then check group. If not a member of the group, then check + * public access. 
+ */ + if (crgetuid(cr) != lxpnp->lxpr_uid) { + shift += 3; + if (!groupmember((uid_t)lxpnp->lxpr_gid, cr)) + shift += 3; + } + + mode &= ~(lxpnp->lxpr_mode << shift); + + if (mode == 0) + return (0); + + return (EACCES); +} + +/* ARGSUSED */ +static vnode_t * +lxpr_lookup_not_a_dir(vnode_t *dp, char *comp) +{ + return (NULL); +} + +/* + * lxpr_lookup(): Vnode operation for VOP_LOOKUP() + */ +/* ARGSUSED */ +static int +lxpr_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type < LXPR_NFILES); + + /* + * we should never get here because the lookup + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict lookup permission to owner or root + */ + if ((error = lxpr_access(dp, VEXEC, 0, cr, ct)) != 0) { + return (error); + } + + /* + * Just return the parent vnode if that's where we are trying to go. + */ + if (strcmp(comp, "..") == 0) { + VN_HOLD(lxpnp->lxpr_parent); + *vpp = lxpnp->lxpr_parent; + return (0); + } + + /* + * Special handling for directory searches. Note: null component name + * denotes that the current directory is being searched. + */ + if ((dp->v_type == VDIR) && (*comp == '\0' || strcmp(comp, ".") == 0)) { + VN_HOLD(dp); + *vpp = dp; + return (0); + } + + *vpp = (lxpr_lookup_function[type](dp, comp)); + return ((*vpp == NULL) ? 
ENOENT : 0); +} + +/* + * Do a sequential search on the given directory table + */ +static vnode_t * +lxpr_lookup_common(vnode_t *dp, char *comp, proc_t *p, + lxpr_dirent_t *dirtab, int dirtablen) +{ + lxpr_node_t *lxpnp; + int count; + + for (count = 0; count < dirtablen; count++) { + if (strcmp(dirtab[count].d_name, comp) == 0) { + lxpnp = lxpr_getnode(dp, dirtab[count].d_type, p, 0); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + return (dp); + } + } + return (NULL); +} + +static vnode_t * +lxpr_lookup_piddir(vnode_t *dp, char *comp) +{ + proc_t *p; + + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PIDDIR); + + p = lxpr_lock(VTOLXP(dp)->lxpr_pid); + if (p == NULL) + return (NULL); + + dp = lxpr_lookup_common(dp, comp, p, piddir, PIDDIRFILES); + + lxpr_unlock(p); + + return (dp); +} + +/* + * Lookup one of the process's open files. + */ +static vnode_t * +lxpr_lookup_fddir(vnode_t *dp, char *comp) +{ + lxpr_node_t *dlxpnp = VTOLXP(dp); + lxpr_node_t *lxpnp; + vnode_t *vp = NULL; + proc_t *p; + file_t *fp; + uint_t fd; + int c; + uf_entry_t *ufp; + uf_info_t *fip; + + ASSERT(dlxpnp->lxpr_type == LXPR_PID_FDDIR); + + /* + * convert the string rendition of the filename + * to a file descriptor + */ + fd = 0; + while ((c = *comp++) != '\0') { + int ofd; + if (c < '0' || c > '9') + return (NULL); + + ofd = fd; + fd = 10*fd + c - '0'; + /* integer overflow */ + if (fd / 10 != ofd) + return (NULL); + } + + /* + * get the proc to work with and lock it + */ + p = lxpr_lock(dlxpnp->lxpr_pid); + if ((p == NULL)) + return (NULL); + + /* + * If the process is a zombie or system process + * it can't have any open files. + */ + if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) || + (p->p_as == &kas)) { + lxpr_unlock(p); + return (NULL); + } + + /* + * get us a fresh node/vnode + */ + lxpnp = lxpr_getnode(dp, LXPR_PID_FD_FD, p, fd); + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we dereference into fi_list. 
+	 */
+	mutex_exit(&p->p_lock);
+
+	/*
+	 * get open file info
+	 */
+	fip = (&(p)->p_user.u_finfo);
+	mutex_enter(&fip->fi_lock);
+
+	if (fd < fip->fi_nfiles) {
+		UF_ENTER(ufp, fip, fd);
+		/*
+		 * ensure the fd is still kosher.
+		 * it may have gone between the readdir and
+		 * the lookup
+		 */
+		if (fip->fi_list[fd].uf_file == NULL) {
+			mutex_exit(&fip->fi_lock);
+			UF_EXIT(ufp);
+			mutex_enter(&p->p_lock);
+			lxpr_unlock(p);
+			lxpr_freenode(lxpnp);
+			return (NULL);
+		}
+
+		if ((fp = ufp->uf_file) != NULL)
+			vp = fp->f_vnode;
+		UF_EXIT(ufp);
+	}
+	mutex_exit(&fip->fi_lock);
+
+	if (vp == NULL) {
+		mutex_enter(&p->p_lock);
+		lxpr_unlock(p);
+		lxpr_freenode(lxpnp);
+		return (NULL);
+	} else {
+		/*
+		 * Fill in the lxpr_node so future references will be able to
+		 * find the underlying vnode. The vnode is held on the realvp.
+		 */
+		lxpnp->lxpr_realvp = vp;
+		VN_HOLD(lxpnp->lxpr_realvp);
+	}
+
+	mutex_enter(&p->p_lock);
+	lxpr_unlock(p);
+	dp = LXPTOV(lxpnp);
+	ASSERT(dp != NULL);
+
+	return (dp);
+}
+
+static vnode_t *
+lxpr_lookup_netdir(vnode_t *dp, char *comp)
+{
+	ASSERT(VTOLXP(dp)->lxpr_type == LXPR_NETDIR);
+
+	dp = lxpr_lookup_common(dp, comp, NULL, netdir, NETDIRFILES);
+
+	return (dp);
+}
+
+static vnode_t *
+lxpr_lookup_procdir(vnode_t *dp, char *comp)
+{
+	ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PROCDIR);
+
+	/*
+	 * We know all the names of files & dirs in our file system structure
+	 * except those that are pid names. These change as pids are created/
+	 * deleted etc., so we just look for a number as the first char to see
+	 * if we are doing pid lookups.
+	 *
+	 * Don't need to check for "self" as it is implemented as a symlink
+	 */
+	if (*comp >= '0' && *comp <= '9') {
+		pid_t pid = 0;
+		lxpr_node_t *lxpnp = NULL;
+		proc_t *p;
+		int c;
+
+		while ((c = *comp++) != '\0')
+			pid = 10 * pid + c - '0';
+
+		/*
+		 * Can't continue if the process is still loading or it doesn't
+		 * really exist yet (or maybe it just died!)
+ */ + p = lxpr_lock(pid); + if (p == NULL) + return (NULL); + + if (secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + lxpr_unlock(p); + return (NULL); + } + + /* + * allocate and fill in a new lxpr node + */ + lxpnp = lxpr_getnode(dp, LXPR_PIDDIR, p, 0); + + lxpr_unlock(p); + + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + + return (dp); + } + + /* Lookup fixed names */ + return (lxpr_lookup_common(dp, comp, NULL, lxpr_dir, PROCDIRFILES)); +} + +/* + * lxpr_readdir(): Vnode operation for VOP_READDIR() + */ +/* ARGSUSED */ +static int +lxpr_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + ssize_t uresid; + off_t uoffset; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type < LXPR_NFILES); + + /* + * we should never get here because the readdir + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict readdir permission to owner or root + */ + if ((error = lxpr_access(dp, VREAD, 0, cr, ct)) != 0) + return (error); + + uoffset = uiop->uio_offset; + uresid = uiop->uio_resid; + + /* can't do negative reads */ + if (uoffset < 0 || uresid <= 0) + return (EINVAL); + + /* can't read directory entries that don't exist! 
*/
+	if (uoffset % LXPR_SDSIZE)
+		return (ENOENT);
+
+	return (lxpr_readdir_function[lxpnp->lxpr_type](lxpnp, uiop, eofp));
+}
+
+/* ARGSUSED */
+static int
+lxpr_readdir_not_a_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+	return (ENOTDIR);
+}
+
+/*
+ * This has the common logic for returning directory entries
+ */
+static int
+lxpr_readdir_common(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp,
+    lxpr_dirent_t *dirtab, int dirtablen)
+{
+	/* bp holds one dirent64 structure */
+	longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)];
+	dirent64_t *dirent = (dirent64_t *)bp;
+	ssize_t oresid;	/* save a copy for testing later */
+	ssize_t uresid;
+
+	oresid = uiop->uio_resid;
+
+	/* clear out the dirent buffer */
+	bzero(bp, sizeof (bp));
+
+	/*
+	 * Satisfy user request
+	 */
+	while ((uresid = uiop->uio_resid) > 0) {
+		int dirindex;
+		off_t uoffset;
+		int reclen;
+		int error;
+
+		uoffset = uiop->uio_offset;
+		dirindex = (uoffset / LXPR_SDSIZE) - 2;
+
+		if (uoffset == 0) {
+
+			dirent->d_ino = lxpnp->lxpr_ino;
+			dirent->d_name[0] = '.';
+			dirent->d_name[1] = '\0';
+			reclen = DIRENT64_RECLEN(1);
+
+		} else if (uoffset == LXPR_SDSIZE) {
+
+			dirent->d_ino = lxpr_parentinode(lxpnp);
+			dirent->d_name[0] = '.';
+			dirent->d_name[1] = '.';
+			dirent->d_name[2] = '\0';
+			reclen = DIRENT64_RECLEN(2);
+
+		} else if (dirindex >= 0 && dirindex < dirtablen) {
+			int slen = strlen(dirtab[dirindex].d_name);
+
+			dirent->d_ino = lxpr_inode(dirtab[dirindex].d_type,
+			    lxpnp->lxpr_pid, 0);
+
+			VERIFY(slen < LXPNSIZ);
+			(void) strcpy(dirent->d_name, dirtab[dirindex].d_name);
+			reclen = DIRENT64_RECLEN(slen);
+
+		} else {
+			/* Run out of table entries */
+			if (eofp) {
+				*eofp = 1;
+			}
+			return (0);
+		}
+
+		dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE);
+		dirent->d_reclen = (ushort_t)reclen;
+
+		/*
+		 * if the size of the data to transfer is greater
+		 * than that requested then we can't do it this transfer.
+	 */
+		if (reclen > uresid) {
+			/*
+			 * Error if no entries have been returned yet.
+			 */
+			if (uresid == oresid) {
+				return (EINVAL);
+			}
+			break;
+		}
+
+		/*
+		 * uiomove() updates both uiop->uio_resid and uiop->uio_offset
+		 * by the same amount. But we want uiop->uio_offset to change
+		 * in increments of LXPR_SDSIZE, which is different from the
+		 * number of bytes being returned to the user. So we set
+		 * uiop->uio_offset separately, ignoring what uiomove() does.
+		 */
+		if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ,
+		    uiop)) != 0)
+			return (error);
+
+		uiop->uio_offset = uoffset + LXPR_SDSIZE;
+	}
+
+	/* Have run out of space, but could have just done last table entry */
+	if (eofp) {
+		*eofp =
+		    (uiop->uio_offset >= ((dirtablen+2) * LXPR_SDSIZE)) ? 1 : 0;
+	}
+	return (0);
+}
+
+
+static int
+lxpr_readdir_procdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+	/* bp holds one dirent64 structure */
+	longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)];
+	dirent64_t *dirent = (dirent64_t *)bp;
+	ssize_t oresid;	/* save a copy for testing later */
+	ssize_t uresid;
+	off_t uoffset;
+	zoneid_t zoneid;
+	pid_t pid;
+	int error;
+	int ceof;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_PROCDIR);
+
+	oresid = uiop->uio_resid;
+	zoneid = LXPTOZ(lxpnp)->zone_id;
+
+	/*
+	 * We return directory entries in the order: "." and ".." then the
+	 * unique lxproc files, then the directories corresponding to the
+	 * running processes. We have defined this as the ordering because
+	 * it allows us to more easily keep track of where we are between calls
+	 * to getdents(). If the number of processes changes between calls
+	 * then we can't lose track of where we are in the lxproc files.
+	 */
+
+	/* Do the fixed entries */
+	error = lxpr_readdir_common(lxpnp, uiop, &ceof, lxpr_dir,
+	    PROCDIRFILES);
+
+	/* Finished if we got an error or if we couldn't do all the table */
+	if (error != 0 || ceof == 0)
+		return (error);
+
+	/* clear out the dirent buffer */
+	bzero(bp, sizeof (bp));
+
+	/* Do the process entries */
+	while ((uresid = uiop->uio_resid) > 0) {
+		proc_t *p;
+		int len;
+		int reclen;
+		int i;
+
+		uoffset = uiop->uio_offset;
+
+		/*
+		 * Stop when entire proc table has been examined.
+		 */
+		i = (uoffset / LXPR_SDSIZE) - 2 - PROCDIRFILES;
+		if (i < 0 || i >= v.v_proc) {
+			/* Run out of table entries */
+			if (eofp) {
+				*eofp = 1;
+			}
+			return (0);
+		}
+		mutex_enter(&pidlock);
+
+		/*
+		 * Skip indices for which there is no pid_entry, PIDs for
+		 * which there is no corresponding process, a PID of 0,
+		 * and anything the security policy doesn't allow
+		 * us to look at.
+		 */
+		if ((p = pid_entry(i)) == NULL || p->p_stat == SIDL ||
+		    p->p_pid == 0 ||
+		    secpolicy_basic_procinfo(CRED(), p, curproc) != 0) {
+			mutex_exit(&pidlock);
+			goto next;
+		}
+		mutex_exit(&pidlock);
+
+		/*
+		 * Convert pid to the Linux default of 1 if we're the zone's
+		 * init process, otherwise use the value from the proc
+		 * structure
+		 */
+		pid = ((p->p_pid != curproc->p_zone->zone_proc_initpid) ?
+		    p->p_pid : 1);
+
+		/*
+		 * If this /proc was mounted in the global zone, view
+		 * all procs; otherwise, only view zone member procs.
+		 */
+		if (zoneid != GLOBAL_ZONEID && p->p_zone->zone_id != zoneid) {
+			goto next;
+		}
+
+		ASSERT(p->p_stat != 0);
+
+		dirent->d_ino = lxpr_inode(LXPR_PIDDIR, pid, 0);
+		len = snprintf(dirent->d_name, LXPNSIZ, "%d", pid);
+		ASSERT(len < LXPNSIZ);
+		reclen = DIRENT64_RECLEN(len);
+
+		dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE);
+		dirent->d_reclen = (ushort_t)reclen;
+
+		/*
+		 * if the size of the data to transfer is greater
+		 * than that requested then we can't do it this transfer.
+ */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + return (EINVAL); + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, in the increment of this for + * the loop, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + return (error); +next: + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + if (eofp != NULL) { + *eofp = (uiop->uio_offset >= + ((v.v_proc + PROCDIRFILES + 2) * LXPR_SDSIZE)) ? 1 : 0; + } + + return (0); +} + +static int +lxpr_readdir_piddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + proc_t *p; + + ASSERT(lxpnp->lxpr_type == LXPR_PIDDIR); + + /* can't read its contents if it died */ + mutex_enter(&pidlock); + + p = prfind((lxpnp->lxpr_pid == 1) ? 
+ curproc->p_zone->zone_proc_initpid : lxpnp->lxpr_pid); + + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (ENOENT); + } + mutex_exit(&pidlock); + + return (lxpr_readdir_common(lxpnp, uiop, eofp, piddir, PIDDIRFILES)); +} + +static int +lxpr_readdir_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_NETDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, netdir, NETDIRFILES)); +} + +static int +lxpr_readdir_fddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + int error; + int ceof; + proc_t *p; + int fddirsize = -1; + uf_info_t *fip; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_FDDIR); + + oresid = uiop->uio_resid; + + /* can't read its contents if it died */ + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) + return (ENOENT); + + if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) || + (p->p_as == &kas)) + fddirsize = 0; + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we iterate over its fi_list. + */ + mutex_exit(&p->p_lock); + + /* Get open file info */ + fip = (&(p)->p_user.u_finfo); + mutex_enter(&fip->fi_lock); + + if (fddirsize == -1) + fddirsize = fip->fi_nfiles; + + /* Do the fixed entries (in this case just "." & "..") */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, 0, 0); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + goto out; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Loop until user's request is satisfied or until + * all file descriptors have been examined. 
+ */ + for (; (uresid = uiop->uio_resid) > 0; + uiop->uio_offset = uoffset + LXPR_SDSIZE) { + int reclen; + int fd; + int len; + + uoffset = uiop->uio_offset; + + /* + * Stop at the end of the fd list + */ + fd = (uoffset / LXPR_SDSIZE) - 2; + if (fd < 0 || fd >= fddirsize) { + if (eofp) { + *eofp = 1; + } + goto out; + } + + if (fip->fi_list[fd].uf_file == NULL) + continue; + + dirent->d_ino = lxpr_inode(LXPR_PID_FD_FD, lxpnp->lxpr_pid, fd); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", fd); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + error = EINVAL; + goto out; + } + + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + goto out; + } + + if (eofp != NULL) { + *eofp = + (uiop->uio_offset >= ((fddirsize+2) * LXPR_SDSIZE)) ? 1 : 0; + } + +out: + mutex_exit(&fip->fi_lock); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + return (error); +} + + +/* + * lxpr_readlink(): Vnode operation for VOP_READLINK() + */ +/* ARGSUSED */ +static int +lxpr_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct) +{ + char bp[MAXPATHLEN + 1]; + size_t buflen = sizeof (bp); + lxpr_node_t *lxpnp = VTOLXP(vp); + vnode_t *rvp = lxpnp->lxpr_realvp; + pid_t pid; + int error = 0; + + /* must be a symbolic link file */ + if (vp->v_type != VLNK) + return (EINVAL); + + /* Try to produce a symlink name for anything that has a realvp */ + if (rvp != NULL) { + if ((error = lxpr_access(vp, VREAD, 0, CRED(), ct)) != 0) + return (error); + if ((error = vnodetopath(NULL, rvp, bp, buflen, CRED())) != 0) + return (error); + } else { + switch (lxpnp->lxpr_type) { + case LXPR_SELF: + /* + * Convert pid to the Linux default of 1 if we're the + * zone's init process + */ + pid = ((curproc->p_pid != + curproc->p_zone->zone_proc_initpid) + ? 
curproc->p_pid : 1); + + /* + * Don't need to check result as every possible int + * will fit within MAXPATHLEN bytes. + */ + (void) snprintf(bp, buflen, "%d", pid); + break; + case LXPR_PID_CURDIR: + case LXPR_PID_ROOTDIR: + case LXPR_PID_EXE: + return (EACCES); + default: + /* + * Need to return error so that nothing thinks + * that the symlink is empty and hence "." + */ + return (EINVAL); + } + } + + /* copy the link data to user space */ + return (uiomove(bp, strlen(bp), UIO_READ, uiop)); +} + +/* + * lxpr_inactive(): Vnode operation for VOP_INACTIVE() + * Vnode is no longer referenced, deallocate the file + * and all its resources. + */ +/* ARGSUSED */ +static void +lxpr_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + lxpr_freenode(VTOLXP(vp)); +} + +/* + * lxpr_sync(): Vnode operation for VOP_SYNC() + */ +static int +lxpr_sync() +{ + /* + * Nothing to sync but this function must never fail + */ + return (0); +} + +/* + * lxpr_cmp(): Vnode operation for VOP_CMP() + */ +static int +lxpr_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) +{ + vnode_t *rvp; + + while (vn_matchops(vp1, lxpr_vnodeops) && + (rvp = VTOLXP(vp1)->lxpr_realvp) != NULL) { + vp1 = rvp; + } + + while (vn_matchops(vp2, lxpr_vnodeops) && + (rvp = VTOLXP(vp2)->lxpr_realvp) != NULL) { + vp2 = rvp; + } + + if (vn_matchops(vp1, lxpr_vnodeops) || vn_matchops(vp2, lxpr_vnodeops)) + return (vp1 == vp2); + + return (VOP_CMP(vp1, vp2, ct)); +} + +/* + * lxpr_realvp(): Vnode operation for VOP_REALVP() + */ +static int +lxpr_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) +{ + vnode_t *rvp; + + if ((rvp = VTOLXP(vp)->lxpr_realvp) != NULL) { + vp = rvp; + if (VOP_REALVP(vp, &rvp, ct) == 0) + vp = rvp; + } + + *vpp = vp; + return (0); +} diff --git a/usr/src/uts/common/fs/lxproc/lxproc.h b/usr/src/uts/common/fs/lxproc/lxproc.h new file mode 100644 index 0000000000..eadb2ccd27 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxproc.h @@ -0,0 +1,278 @@ +/* + * CDDL HEADER START + 
* + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _LXPROC_H +#define _LXPROC_H + +#ifdef _LXPROC_BRANDED_H +#error Attempted to include native lxproc.h after branded lx_proc.h +#endif + +#define _LXPROC_NATIVE_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lxproc.h: declarations, data structures and macros for lxprocfs + */ +#include <sys/types.h> +#include <sys/param.h> +#include <sys/policy.h> +#include <sys/debug.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/user.h> +#include <sys/t_lock.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/priv.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/statvfs.h> +#include <sys/cmn_err.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/utsname.h> +#include <sys/dnlc.h> +#include <sys/atomic.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <vm/as.h> +#include <vm/anon.h> + +#define LX_SIGHUP 
1 +#define LX_SIGINT 2 +#define LX_SIGQUIT 3 +#define LX_SIGILL 4 +#define LX_SIGTRAP 5 +#define LX_SIGABRT 6 +#define LX_SIGIOT 6 +#define LX_SIGBUS 7 +#define LX_SIGFPE 8 +#define LX_SIGKILL 9 +#define LX_SIGUSR1 10 +#define LX_SIGSEGV 11 +#define LX_SIGUSR2 12 +#define LX_SIGPIPE 13 +#define LX_SIGALRM 14 +#define LX_SIGTERM 15 +#define LX_SIGSTKFLT 16 +#define LX_SIGCHLD 17 +#define LX_SIGCONT 18 +#define LX_SIGSTOP 19 +#define LX_SIGTSTP 20 +#define LX_SIGTTIN 21 +#define LX_SIGTTOU 22 +#define LX_SIGURG 23 +#define LX_SIGXCPU 24 +#define LX_SIGXFSZ 25 +#define LX_SIGVTALRM 26 +#define LX_SIGPROF 27 +#define LX_SIGWINCH 28 +#define LX_SIGIO 29 +#define LX_SIGPOLL LX_SIGIO +#define LX_SIGPWR 30 +#define LX_SIGSYS 31 +#define LX_SIGUNUSED 31 + +#define LX_NSIG 64 /* Linux _NSIG */ + +#define LX_SIGRTMIN 32 +#define LX_SIGRTMAX LX_NSIG + +/* + * Convert a vnode into an lxpr_mnt_t + */ +#define VTOLXPM(vp) ((lxpr_mnt_t *)(vp)->v_vfsp->vfs_data) + +/* + * convert a vnode into an lxpr_node + */ +#define VTOLXP(vp) ((lxpr_node_t *)(vp)->v_data) + +/* + * convert a lxprnode into a vnode + */ +#define LXPTOV(lxpnp) ((lxpnp)->lxpr_vnode) + +/* + * convert a lxpr_node into zone for fs + */ +#define LXPTOZ(lxpnp) \ + (((lxpr_mnt_t *)(lxpnp)->lxpr_vnode->v_vfsp->vfs_data)->lxprm_zone) + +#define LXPNSIZ 256 /* max size of lx /proc file name entries */ + +/* + * Pretend that a directory entry takes 16 bytes + */ +#define LXPR_SDSIZE 16 + +/* + * Node/file types for lx /proc files + * (directories and files contained therein). 
+ */ +typedef enum lxpr_nodetype { + LXPR_PROCDIR, /* /proc */ + LXPR_PIDDIR, /* /proc/<pid> */ + LXPR_PID_CMDLINE, /* /proc/<pid>/cmdline */ + LXPR_PID_CPU, /* /proc/<pid>/cpu */ + LXPR_PID_CURDIR, /* /proc/<pid>/cwd */ + LXPR_PID_ENV, /* /proc/<pid>/environ */ + LXPR_PID_EXE, /* /proc/<pid>/exe */ + LXPR_PID_MAPS, /* /proc/<pid>/maps */ + LXPR_PID_MEM, /* /proc/<pid>/mem */ + LXPR_PID_ROOTDIR, /* /proc/<pid>/root */ + LXPR_PID_STAT, /* /proc/<pid>/stat */ + LXPR_PID_STATM, /* /proc/<pid>/statm */ + LXPR_PID_STATUS, /* /proc/<pid>/status */ + LXPR_PID_FDDIR, /* /proc/<pid>/fd */ + LXPR_PID_FD_FD, /* /proc/<pid>/fd/nn */ + LXPR_CMDLINE, /* /proc/cmdline */ + LXPR_CPUINFO, /* /proc/cpuinfo */ + LXPR_DEVICES, /* /proc/devices */ + LXPR_DMA, /* /proc/dma */ + LXPR_FILESYSTEMS, /* /proc/filesystems */ + LXPR_INTERRUPTS, /* /proc/interrupts */ + LXPR_IOPORTS, /* /proc/ioports */ + LXPR_KCORE, /* /proc/kcore */ + LXPR_KMSG, /* /proc/kmsg */ + LXPR_LOADAVG, /* /proc/loadavg */ + LXPR_MEMINFO, /* /proc/meminfo */ + LXPR_MOUNTS, /* /proc/mounts */ + LXPR_NETDIR, /* /proc/net */ + LXPR_NET_ARP, /* /proc/net/arp */ + LXPR_NET_DEV, /* /proc/net/dev */ + LXPR_NET_DEV_MCAST, /* /proc/net/dev_mcast */ + LXPR_NET_IGMP, /* /proc/net/igmp */ + LXPR_NET_IP_MR_CACHE, /* /proc/net/ip_mr_cache */ + LXPR_NET_IP_MR_VIF, /* /proc/net/ip_mr_vif */ + LXPR_NET_MCFILTER, /* /proc/net/mcfilter */ + LXPR_NET_NETSTAT, /* /proc/net/netstat */ + LXPR_NET_RAW, /* /proc/net/raw */ + LXPR_NET_ROUTE, /* /proc/net/route */ + LXPR_NET_RPC, /* /proc/net/rpc */ + LXPR_NET_RT_CACHE, /* /proc/net/rt_cache */ + LXPR_NET_SOCKSTAT, /* /proc/net/sockstat */ + LXPR_NET_SNMP, /* /proc/net/snmp */ + LXPR_NET_STAT, /* /proc/net/stat */ + LXPR_NET_TCP, /* /proc/net/tcp */ + LXPR_NET_UDP, /* /proc/net/udp */ + LXPR_NET_UNIX, /* /proc/net/unix */ + LXPR_PARTITIONS, /* /proc/partitions */ + LXPR_SELF, /* /proc/self */ + LXPR_STAT, /* /proc/stat */ + LXPR_UPTIME, /* /proc/uptime */ + LXPR_VERSION, /* /proc/version */ + 
LXPR_NFILES /* number of lx /proc file types */ +} lxpr_nodetype_t; + +/* + * Number of fds allowed for in the inode number calculation + * per process (if a process has more fds then inode numbers + * may be duplicated) + */ +#define LXPR_FD_PERPROC 2000 + +/* + * external dirent characteristics + */ +#define LXPRMAXNAMELEN 14 +typedef struct { + lxpr_nodetype_t d_type; + char d_name[LXPRMAXNAMELEN]; +} lxpr_dirent_t; + +/* + * This is the lxprocfs private data object + * which is attached to v_data in the vnode structure + */ +typedef struct lxpr_node { + lxpr_nodetype_t lxpr_type; /* type of this node */ + vnode_t *lxpr_vnode; /* vnode for the node */ + vnode_t *lxpr_parent; /* parent directory */ + vnode_t *lxpr_realvp; /* real vnode, file in dirs */ + timestruc_t lxpr_time; /* creation etc time for file */ + mode_t lxpr_mode; /* file mode bits */ + uid_t lxpr_uid; /* file owner */ + gid_t lxpr_gid; /* file group owner */ + pid_t lxpr_pid; /* pid of proc referred to */ + ino_t lxpr_ino; /* node id */ +} lxpr_node_t; + +struct zone; /* forward declaration */ + +/* + * This is the lxprocfs private data object + * which is attached to vfs_data in the vfs structure + */ +typedef struct lxpr_mnt { + lxpr_node_t *lxprm_node; /* node at root of proc mount */ + struct zone *lxprm_zone; /* zone for this mount */ + ldi_ident_t lxprm_li; /* ident for ldi */ +} lxpr_mnt_t; + +extern vnodeops_t *lxpr_vnodeops; +extern int nproc_highbit; /* highbit(v.v_nproc) */ + +typedef struct mounta mounta_t; + +extern void lxpr_initnodecache(); +extern void lxpr_fininodecache(); +extern void lxpr_initrootnode(lxpr_node_t **, vfs_t *); +extern ino_t lxpr_inode(lxpr_nodetype_t, pid_t, int); +extern ino_t lxpr_parentinode(lxpr_node_t *); +extern lxpr_node_t *lxpr_getnode(vnode_t *, lxpr_nodetype_t, proc_t *, int); +extern void lxpr_freenode(lxpr_node_t *); + +typedef struct lxpr_uiobuf lxpr_uiobuf_t; +extern lxpr_uiobuf_t *lxpr_uiobuf_new(uio_t *); +extern void 
lxpr_uiobuf_free(lxpr_uiobuf_t *); +extern int lxpr_uiobuf_flush(lxpr_uiobuf_t *); +extern void lxpr_uiobuf_seek(lxpr_uiobuf_t *, offset_t); +extern void lxpr_uiobuf_write(lxpr_uiobuf_t *, const char *, size_t); +extern void lxpr_uiobuf_printf(lxpr_uiobuf_t *, const char *, ...); +extern void lxpr_uiobuf_seterr(lxpr_uiobuf_t *, int); + +proc_t *lxpr_lock(pid_t); +void lxpr_unlock(proc_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LXPROC_H */ diff --git a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c index d6a88a97c3..f6c6b62925 100644 --- a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c +++ b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, Joyent, Inc. All rights reserved. */ /* diff --git a/usr/src/uts/common/fs/nfs/nfs3_vnops.c b/usr/src/uts/common/fs/nfs/nfs3_vnops.c index 74d47dd93d..279cc60877 100644 --- a/usr/src/uts/common/fs/nfs/nfs3_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs3_vnops.c @@ -29,7 +29,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 
* Copyright 2022 Oxide Computer Company */ @@ -3354,10 +3354,9 @@ nfs3rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, if (nvp) vnevent_rename_dest(nvp, ndvp, nnm, ct); - if (odvp != ndvp) - vnevent_rename_dest_dir(ndvp, ct); ASSERT(ovp != NULL); vnevent_rename_src(ovp, odvp, onm, ct); + vnevent_rename_dest_dir(ndvp, ovp, nnm, ct); } if (nvp) { @@ -5537,8 +5536,13 @@ nfs3_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, va.va_size = bfp->l_start; error = nfs3setattr(vp, &va, 0, cr); - if (error == 0 && bfp->l_start == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (bfp->l_start == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } else error = EINVAL; } diff --git a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c index f0320aaee0..25088aafcb 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c +++ b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c @@ -22,6 +22,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, Joyent, Inc. All rights reserved. */ /* diff --git a/usr/src/uts/common/fs/nfs/nfs4_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_vnops.c index 2a501bc898..b57ad066e4 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c @@ -38,7 +38,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
* Copyright 2022 Oxide Computer Company */ @@ -3757,8 +3757,13 @@ nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, */ error = nfs4setattr(vp, vap, flags, cr, NULL); - if (error == 0 && (vap->va_mask & AT_SIZE) && vap->va_size == 0) - vnevent_truncate(vp, ct); + if (error == 0 && (vap->va_mask & AT_SIZE)) { + if (vap->va_size == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } return (error); } @@ -8074,8 +8079,9 @@ link_call: * vnode if it already existed. */ if (error == 0) { - vnode_t *tvp; + vnode_t *tvp, *tovp; rnode4_t *trp; + /* * Notify the vnode. Each links is represented by * a different vnode, in nfsv4. @@ -8088,23 +8094,20 @@ link_call: vnevent_rename_dest(tvp, ndvp, nnm, ct); } - /* - * if the source and destination directory are not the - * same notify the destination directory. - */ - if (VTOR4(odvp) != VTOR4(ndvp)) { - trp = VTOR4(ndvp); - tvp = ndvp; - if (IS_SHADOW(ndvp, trp)) - tvp = RTOV4(trp); - vnevent_rename_dest_dir(tvp, ct); - } - trp = VTOR4(ovp); - tvp = ovp; + tovp = ovp; if (IS_SHADOW(ovp, trp)) + tovp = RTOV4(trp); + + vnevent_rename_src(tovp, odvp, onm, ct); + + trp = VTOR4(ndvp); + tvp = ndvp; + + if (IS_SHADOW(ndvp, trp)) tvp = RTOV4(trp); - vnevent_rename_src(tvp, odvp, onm, ct); + + vnevent_rename_dest_dir(tvp, tovp, nnm, ct); } if (nvp) { @@ -11021,8 +11024,13 @@ nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, va.va_size = bfp->l_start; error = nfs4setattr(vp, &va, 0, cr, NULL); - if (error == 0 && bfp->l_start == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (bfp->l_start == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } else error = EINVAL; } diff --git a/usr/src/uts/common/fs/nfs/nfs_sys.c b/usr/src/uts/common/fs/nfs/nfs_sys.c index 434c9a2a3e..8048d13ca3 100644 --- a/usr/src/uts/common/fs/nfs/nfs_sys.c +++ b/usr/src/uts/common/fs/nfs/nfs_sys.c @@ -30,6 +30,7 @@ */ /* + * Copyright 2017 Joyent, Inc. 
* Copyright 2018 Nexenta Systems, Inc. */ @@ -242,7 +243,7 @@ nfssys(enum nfssys_op opcode, void *arg) lsa.n_fmly = STRUCT_FGET(ulsa, n_fmly); lsa.n_proto = STRUCT_FGET(ulsa, n_proto); lsa.n_rdev = expldev(STRUCT_FGET(ulsa, n_rdev)); - lsa.debug = STRUCT_FGET(ulsa, debug); + lsa.n_v4_only = STRUCT_FGET(ulsa, n_v4_only); lsa.timout = STRUCT_FGET(ulsa, timout); lsa.grace = STRUCT_FGET(ulsa, grace); lsa.retransmittimeout = STRUCT_FGET(ulsa, diff --git a/usr/src/uts/common/fs/nfs/nfs_vfsops.c b/usr/src/uts/common/fs/nfs/nfs_vfsops.c index c9cc306f95..5041ebb6fe 100644 --- a/usr/src/uts/common/fs/nfs/nfs_vfsops.c +++ b/usr/src/uts/common/fs/nfs/nfs_vfsops.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, Joyent, Inc. All rights reserved. * * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. * All rights reserved. diff --git a/usr/src/uts/common/fs/nfs/nfs_vnops.c b/usr/src/uts/common/fs/nfs/nfs_vnops.c index 1a1082bcb8..ee3bac484f 100644 --- a/usr/src/uts/common/fs/nfs/nfs_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs_vnops.c @@ -26,7 +26,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 
*/ @@ -1174,8 +1174,13 @@ nfs_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, error = nfssetattr(vp, vap, flags, cr); - if (error == 0 && (mask & AT_SIZE) && vap->va_size == 0) - vnevent_truncate(vp, ct); + if (error == 0 && (mask & AT_SIZE)) { + if (vap->va_size == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } return (error); } @@ -2688,11 +2693,9 @@ nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, if (nvp) vnevent_rename_dest(nvp, ndvp, nnm, ct); - if (odvp != ndvp) - vnevent_rename_dest_dir(ndvp, ct); - ASSERT(ovp != NULL); vnevent_rename_src(ovp, odvp, onm, ct); + vnevent_rename_dest_dir(ndvp, ovp, nnm, ct); } if (nvp) { @@ -4620,8 +4623,13 @@ nfs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, va.va_size = bfp->l_start; error = nfssetattr(vp, &va, 0, cr); - if (error == 0 && bfp->l_start == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (bfp->l_start == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } else error = EINVAL; } diff --git a/usr/src/uts/common/fs/pcfs/pc_dir.c b/usr/src/uts/common/fs/pcfs/pc_dir.c index 976715e346..275330a0ae 100644 --- a/usr/src/uts/common/fs/pcfs/pc_dir.c +++ b/usr/src/uts/common/fs/pcfs/pc_dir.c @@ -22,7 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2015 Joyent, Inc. + * Copyright 2016 Joyent, Inc. 
*/ #include <sys/param.h> @@ -826,8 +826,7 @@ top: if (error == 0) { vnevent_rename_src(PCTOV(pcp), PCTOV(dp), snm, ctp); - if (dp != tdp) - vnevent_rename_dest_dir(PCTOV(tdp), ctp); + vnevent_rename_dest_dir(PCTOV(tdp), PCTOV(pcp), tnm, ctp); } done: diff --git a/usr/src/uts/common/fs/pcfs/pc_vnops.c b/usr/src/uts/common/fs/pcfs/pc_vnops.c index 013a6d3352..1965444071 100644 --- a/usr/src/uts/common/fs/pcfs/pc_vnops.c +++ b/usr/src/uts/common/fs/pcfs/pc_vnops.c @@ -782,8 +782,11 @@ pcfs_setattr( if (error) goto out; - if (vap->va_size == 0) + if (vap->va_size == 0) { vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } } /* * Change file modified times. diff --git a/usr/src/uts/common/fs/portfs/port_fop.c b/usr/src/uts/common/fs/portfs/port_fop.c index c9c417fda8..e11d5c8be4 100644 --- a/usr/src/uts/common/fs/portfs/port_fop.c +++ b/usr/src/uts/common/fs/portfs/port_fop.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2018, Joyent, Inc. + * Copyright 2020 Joyent, Inc. * Copyright 2022 Oxide Computer Company */ @@ -540,14 +540,14 @@ port_fop_trimpfplist(vnode_t *vp) port_pcache_remove_fop(pfcp, pfp); mutex_exit(&pfcp->pfc_lock); if (tdvp != NULL) - VN_RELE(tdvp); + VN_PHANTOM_RELE(tdvp); } } } /* * This routine returns 1, if the vnode can be rele'ed by the caller. - * The caller has to VN_RELE the vnode with out holding any + * The caller has to VN_PHANTOM_RELE the vnode with out holding any * locks. */ int @@ -617,7 +617,7 @@ port_fop_femuninstall(vnode_t *vp) * able to remove it from the port's queue). * * vpp and dvpp will point to the vnode and directory vnode which the caller - * is required to VN_RELE without holding any locks. + * is required to VN_PHANTOM_RELE without holding any locks. 
*/ int port_remove_fop(portfop_t *pfp, portfop_cache_t *pfcp, int cleanup, @@ -727,8 +727,8 @@ port_cache_lookup_fop(portfop_cache_t *pfcp, pid_t pid, uintptr_t obj) /* * Given the file name, get the vnode and also the directory vnode - * On return, the vnodes are held (VN_HOLD). The caller has to VN_RELE - * the vnode(s). + * On return, the vnodes are held with phantom holds (VN_PHANTOM_HOLD). The + * caller has to VN_PHANTOM_RELE the vnode(s). */ int port_fop_getdvp(void *objptr, vnode_t **vp, vnode_t **dvp, char **cname, @@ -778,6 +778,17 @@ port_fop_getdvp(void *objptr, vnode_t **vp, vnode_t **dvp, char **cname, } } + /* Trade VN_HOLD()s from lookuppn with VN_PHANTOM_HOLD()s */ + if (dvp != NULL && *dvp != NULL) { + VN_PHANTOM_HOLD(*dvp); + VN_RELE(*dvp); + } + + if (vp != NULL && *vp != NULL) { + VN_PHANTOM_HOLD(*vp); + VN_RELE(*vp); + } + pn_free(&pn); return (error); } @@ -1177,7 +1188,7 @@ port_pfp_setup(portfop_t **pfpp, port_t *pp, vnode_t *vp, portfop_cache_t *pfcp, * Hold a reference to the vnode since * we successfully installed the hooks. */ - VN_HOLD(vp); + VN_PHANTOM_HOLD(vp); } else { (void) fem_uninstall(vp, femp, vp); pvp->pvp_femp = NULL; @@ -1210,7 +1221,7 @@ port_pfp_setup(portfop_t **pfpp, port_t *pp, vnode_t *vp, portfop_cache_t *pfcp, * Hold the directory vnode since we have a reference now. */ if (dvp != NULL) - VN_HOLD(dvp); + VN_PHANTOM_HOLD(dvp); *pfpp = pfp; return (0); } @@ -1225,9 +1236,9 @@ port_resolve_vp(vnode_t *vp) */ if (vfs_mntdummyvp && mntfstype != 0 && vp->v_vfsp->vfs_fstype == mntfstype) { - VN_RELE(vp); + VN_PHANTOM_RELE(vp); vp = vfs_mntdummyvp; - VN_HOLD(vfs_mntdummyvp); + VN_PHANTOM_HOLD(vfs_mntdummyvp); } /* @@ -1235,8 +1246,8 @@ port_resolve_vp(vnode_t *vp) * hardlinks. 
*/ if ((VOP_REALVP(vp, &rvp, NULL) == 0) && vp != rvp) { - VN_HOLD(rvp); - VN_RELE(vp); + VN_PHANTOM_HOLD(rvp); + VN_PHANTOM_RELE(vp); vp = rvp; } return (vp); @@ -1248,10 +1259,10 @@ port_resolve_vp(vnode_t *vp) * The association is identified by the object pointer and the pid. * The events argument contains the events to be monitored for. * - * The vnode will have a VN_HOLD once the fem hooks are installed. + * The vnode will have a VN_PHANTOM_HOLD once the fem hooks are installed. * - * Every reference(pfp) to the directory vnode will have a VN_HOLD to ensure - * that the directory vnode pointer does not change. + * Every reference(pfp) to the directory vnode will have a VN_PHANTOM_HOLD to + * ensure that the directory vnode pointer does not change. */ int port_associate_fop(port_t *pp, int source, uintptr_t object, int events, @@ -1331,7 +1342,7 @@ port_associate_fop(port_t *pp, int source, uintptr_t object, int events, */ if (dvp != NULL && dvp->v_vfsp != vp->v_vfsp && !(orig->v_type == VPROC && vp != NULL && vp->v_type != VPROC)) { - VN_RELE(dvp); + VN_PHANTOM_RELE(dvp); dvp = NULL; } @@ -1351,8 +1362,8 @@ port_associate_fop(port_t *pp, int source, uintptr_t object, int events, pfp = port_cache_lookup_fop(pfcp, curproc->p_pid, object); /* - * If it is not the same vnode, just discard it. VN_RELE needs to be - * called with no locks held, therefore save vnode pointers and + * If it is not the same vnode, just discard it. VN_PHANTOM_RELE needs + * to be called with no locks held, therefore save vnode pointers and * vn_rele them later. */ if (pfp != NULL && (pfp->pfop_vp != vp || pfp->pfop_dvp != dvp)) { @@ -1405,7 +1416,7 @@ port_associate_fop(port_t *pp, int source, uintptr_t object, int events, * This vnode pointer is just used * for comparison, so rele it */ - VN_RELE(tvp); + VN_PHANTOM_RELE(tvp); } } @@ -1438,8 +1449,8 @@ port_associate_fop(port_t *pp, int source, uintptr_t object, int events, * active and it is not being removed from * the vnode list. 
This is checked in * port_remove_fop with the vnode lock held. - * The vnode returned is VN_RELE'ed after dropping - * the locks. + * The vnode returned is VN_PHANTOM_RELE'ed after + * dropping the locks. */ tdvp = tvp = NULL; if (port_remove_fop(pfp, pfcp, 0, NULL, &tvp, &tdvp)) { @@ -1452,9 +1463,9 @@ port_associate_fop(port_t *pp, int source, uintptr_t object, int events, } mutex_exit(&pfcp->pfc_lock); if (tvp != NULL) - VN_RELE(tvp); + VN_PHANTOM_RELE(tvp); if (tdvp != NULL) - VN_RELE(tdvp); + VN_PHANTOM_RELE(tdvp); goto errout; } } else { @@ -1521,14 +1532,14 @@ errout: * Release the hold acquired due to the lookup operation. */ if (vp != NULL) - VN_RELE(vp); + VN_PHANTOM_RELE(vp); if (dvp != NULL) - VN_RELE(dvp); + VN_PHANTOM_RELE(dvp); if (oldvp != NULL) - VN_RELE(oldvp); + VN_PHANTOM_RELE(oldvp); if (olddvp != NULL) - VN_RELE(olddvp); + VN_PHANTOM_RELE(olddvp); /* * copied file name not used, free it. @@ -1589,9 +1600,9 @@ port_dissociate_fop(port_t *pp, uintptr_t object) (void) port_remove_fop(pfp, pfcp, 1, &active, &tvp, &tdvp); mutex_exit(&pfcp->pfc_lock); if (tvp != NULL) - VN_RELE(tvp); + VN_PHANTOM_RELE(tvp); if (tdvp != NULL) - VN_RELE(tdvp); + VN_PHANTOM_RELE(tdvp); return (active ? 0 : ENOENT); } @@ -1629,7 +1640,7 @@ port_close_fop(void *arg, int port, pid_t pid, int lastclose) * be possible as the port is being closed. * * The common case is that the port is not shared and all the entries - * are of this pid and have to be freed. Since VN_RELE has to be + * are of this pid and have to be freed. Since VN_PHANTOM_RELE has to be * called outside the lock, we do it in batches. */ hashtbl = (portfop_t **)pfcp->pfc_hash; @@ -1656,14 +1667,14 @@ port_close_fop(void *arg, int port, pid_t pid, int lastclose) if (pfp == NULL) index++; /* - * Now call VN_RELE if we have collected enough vnodes or - * we have reached the end of the hash table. + * Now call VN_PHANTOM_RELE if we have collected enough vnodes + * or we have reached the end of the hash table. 
*/ if (i >= (PORTFOP_NVP - 1) || (i > 0 && index == PORTFOP_HASHSIZE)) { mutex_exit(&pfcp->pfc_lock); while (i > 0) { - VN_RELE(vpl[--i]); + VN_PHANTOM_RELE(vpl[--i]); vpl[i] = NULL; } mutex_enter(&pfcp->pfc_lock); @@ -1771,7 +1782,7 @@ port_fop_excep(list_t *tlist, int op) port_pcache_remove_fop(pfcp, pfp); mutex_exit(&pfcp->pfc_lock); if (tdvp != NULL) - VN_RELE(tdvp); + VN_PHANTOM_RELE(tdvp); } } @@ -1935,7 +1946,7 @@ port_fop_sendevent(vnode_t *vp, int events, vnode_t *dvp, char *cname) * that may be attempting to remove an object from the vnode's. */ if (port_fop_femuninstall(vp)) - VN_RELE(vp); + VN_PHANTOM_RELE(vp); /* * Send exception events and discard the watch entries. @@ -2070,7 +2081,7 @@ port_fop_unmount(fsemarg_t *vf, int flag, cred_t *cr) * unmount is in process. */ port_fop_sendevent(pvp->pvp_vp, UNMOUNTED, NULL, NULL); - VN_RELE(pvp->pvp_vp); + VN_PHANTOM_RELE(pvp->pvp_vp); } error = vfsnext_unmount(vf, flag, cr); diff --git a/usr/src/uts/common/fs/proc/prargv.c b/usr/src/uts/common/fs/proc/prargv.c new file mode 100644 index 0000000000..60d098d125 --- /dev/null +++ b/usr/src/uts/common/fs/proc/prargv.c @@ -0,0 +1,530 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/sunddi.h> +#include <sys/proc.h> +#include <sys/procfs.h> +#include <sys/sysmacros.h> +#include <vm/as.h> + +/* + * Safely read a contiguous region of memory from 'addr' in the address space + * of a particular process into the supplied kernel buffer (*buf, sz). 
+ * Partially mapped regions will result in a partial read terminating at the + * first hole in the address space. The number of bytes actually read is + * returned to the caller via 'rdsz'. + */ +int +prreadbuf(proc_t *p, uintptr_t ustart, char *buf, size_t sz, size_t *rdsz) +{ + int error = 0; + size_t rem = sz; + off_t pos = 0; + + if (rdsz != NULL) + *rdsz = 0; + + while (rem != 0) { + uintptr_t addr = ustart + pos; + size_t len = MIN(rem, PAGESIZE - (addr & PAGEOFFSET)); + + if ((error = uread(p, buf + pos, len, addr)) != 0) { + if (error == ENXIO) { + /* + * ENXIO from uread() indicates that the page + * does not exist. This will simply be a + * partial read. + */ + error = 0; + } + break; + } + + rem -= len; + pos += len; + } + + if (rdsz != NULL) + *rdsz = pos; + + return (error); +} + + +/* + * Effectively a truncating version of copyinstr(). + * + * The resulting string is guaranteed to be truncated to fit within the buffer + * (hence sz == 0 is not supported). The returned size includes the truncating + * NUL. + */ +int +prreadstr(proc_t *p, uintptr_t ustart, char *buf, size_t bufsz, size_t *rdsz) +{ + size_t slen; + int err; + + VERIFY(bufsz != 0); + + if ((err = prreadbuf(p, ustart, buf, bufsz, &slen)) != 0) + return (err); + + slen = strnlen(buf, slen); + + if (slen == bufsz) + slen--; + + buf[slen++] = '\0'; + + if (rdsz != NULL) + *rdsz = slen; + return (0); +} + +/* + * /proc/pid/cmdline: Linux-compatible '\0'-separated process argv. + * + * Unlike /proc/pid/argv, this looks at the exec()-time argv string area, rather + * than starting from the argv[] array. Thus changes to the array are not + * noticed, but direct modifications of the string are visible here. Since it's + * common for applications to expect it, we implement the Linux semantics here. 
+ * + * There is special handling if the process has modified its argv: if the last + * byte of the argv string area is no longer NUL, then we presume that it has + * done setproctitle() or similar, and we should copy it as a single string from + * the start, even though it overflows into the env string area. Note that we + * can't use copyinstr() as that returns ENAMETOOLONG rather than truncating as + * we need. + * + * Otherwise, we provide the argv string area in toto. + */ +int +prreadcmdline(proc_t *p, char *buf, size_t bufsz, size_t *slen) +{ + user_t *up = &p->p_user; + uint8_t term; + int err = 0; + + VERIFY(bufsz == PRMAXARGVLEN); + VERIFY(MUTEX_HELD(&p->p_lock)); + + if ((p->p_flag & SSYS) || p->p_as == &kas || up->u_argvstrsize == 0) { + bcopy(up->u_psargs, buf, MIN(bufsz, sizeof (up->u_psargs))); + buf[bufsz - 1] = '\0'; + *slen = strlen(buf) + 1; + return (0); + } + + VERIFY(up->u_argvstrs != (uintptr_t)NULL); + + mutex_exit(&p->p_lock); + + if (uread(p, &term, sizeof (term), + up->u_argvstrs + up->u_argvstrsize - 1) != 0) { + err = EFAULT; + goto out; + } + + if (term != '\0') { + err = prreadstr(p, up->u_argvstrs, buf, bufsz, slen); + } else { + size_t size = MIN(bufsz, up->u_argvstrsize); + err = prreadbuf(p, up->u_argvstrs, buf, size, slen); + } + +out: + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (err); +} + + +/* + * Attempt to read the argument vector (argv) from this process. The caller + * must hold the p_lock mutex, and have marked the process P_PR_LOCK (e.g. via + * prlock or lx_prlock). + * + * The caller must provide a buffer (buf, buflen). We will concatenate each + * argument string (including the NUL terminator) into this buffer. The number + * of characters written to this buffer (including the final NUL terminator) + * will be stored in 'slen'. 
+ */ +int +prreadargv(proc_t *p, char *buf, size_t bufsz, size_t *slen) +{ + int error; + user_t *up; + struct as *as; + size_t pos = 0; + caddr_t *argv = NULL; + size_t argvsz = 0; + int i; + + VERIFY(MUTEX_HELD(&p->p_lock)); + VERIFY(p->p_proc_flag & P_PR_LOCK); + + up = PTOU(p); + as = p->p_as; + + if ((p->p_flag & SSYS) || as == &kas || up->u_argv == (uintptr_t)NULL) { + /* + * Return the regular psargs string to the caller. + */ + bcopy(up->u_psargs, buf, MIN(bufsz, sizeof (up->u_psargs))); + buf[bufsz - 1] = '\0'; + *slen = strlen(buf) + 1; + + return (0); + } + + /* + * Allocate space to store argv array. + */ + argvsz = up->u_argc * (p->p_model == DATAMODEL_ILP32 ? + sizeof (caddr32_t) : sizeof (caddr_t)); + argv = kmem_alloc(argvsz, KM_SLEEP); + + /* + * Extract the argv array from the target process. Drop p_lock + * while we do I/O to avoid deadlock with the clock thread. + */ + mutex_exit(&p->p_lock); + if ((error = prreadbuf(p, up->u_argv, (char *)argv, + argvsz, NULL)) != 0) { + kmem_free(argv, argvsz); + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (-1); + } + + /* + * Read each argument string from the pointers in the argv array. + */ + pos = 0; + for (i = 0; i < up->u_argc; i++) { + size_t rdsz, trysz; + uintptr_t arg; + off_t j; + boolean_t found_nul; + boolean_t do_retry = B_TRUE; + +#ifdef _SYSCALL32_IMPL + if (p->p_model == DATAMODEL_ILP32) { + arg = (uintptr_t)((caddr32_t *)argv)[i]; + } else { + arg = (uintptr_t)argv[i]; + } +#else + arg = (uintptr_t)argv[i]; +#endif + + /* + * Stop trying to read arguments if we reach a NULL + * pointer in the vector. + */ + if (arg == (uintptr_t)NULL) + break; + + /* + * Stop reading if we have read the maximum length + * we can return to the user. + */ + if (pos >= bufsz) + break; + + /* + * Initially we try a short read, on the assumption that + * most individual argument strings are less than 80 + * characters long. 
+ */ + if ((trysz = MIN(80, bufsz - pos - 1)) < 80) { + /* + * We don't have room in the target buffer for even + * an entire short read, so there is no need to retry + * with a longer read. + */ + do_retry = B_FALSE; + } + +retry: + /* + * Read string data for this argument. Leave room + * in the buffer for a final NUL terminator. + */ + if ((error = prreadbuf(p, arg, (char *)&buf[pos], trysz, + &rdsz)) != 0) { + /* + * There was a problem reading this string + * from the process. Give up. + */ + break; + } + + /* + * Find the NUL terminator. + */ + found_nul = B_FALSE; + for (j = 0; j < rdsz; j++) { + if (buf[pos + j] == '\0') { + found_nul = B_TRUE; + break; + } + } + + if (!found_nul && do_retry) { + /* + * We did not find a NUL terminator, but this + * was a first pass short read. Try once more + * with feeling. + */ + trysz = bufsz - pos - 1; + do_retry = B_FALSE; + goto retry; + } + + /* + * Commit the string we read to the buffer. + */ + pos += j + 1; + if (!found_nul && pos < bufsz) { + /* + * A NUL terminator was not found; add one. + */ + buf[pos++] = '\0'; + } + } + + /* + * Ensure the entire string is NUL-terminated. + */ + buf[bufsz - 1] = '\0'; + + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + kmem_free(argv, argvsz); + + /* + * If the operation was a success, return the copied string length + * to the caller. + */ + *slen = (error == 0) ? pos : 0; + + return (error); +} + +/* + * Similar to prreadargv except reads the env vector. This is slightly more + * complex because there is no count for the env vector that corresponds to + * u_argc. 
+ */ +int +prreadenvv(proc_t *p, char *buf, size_t bufsz, size_t *slen) +{ + int error; + user_t *up; + struct as *as; + size_t pos = 0; + caddr_t *envp = NULL; + uintptr_t tmpp = (uintptr_t)NULL; + size_t envpsz = 0, rdsz = 0; + int i; + int cnt, bound; + + VERIFY(MUTEX_HELD(&p->p_lock)); + VERIFY(p->p_proc_flag & P_PR_LOCK); + + up = PTOU(p); + as = p->p_as; + + if ((p->p_flag & SSYS) || as == &kas || up->u_envp == (uintptr_t)NULL) { + /* + * Return empty string. + */ + buf[0] = '\0'; + *slen = 1; + + return (0); + } + + /* + * Drop p_lock while we do I/O to avoid deadlock with the clock thread. + */ + mutex_exit(&p->p_lock); + + /* + * We first have to count how many env entries we have. This is + * somewhat painful. We extract the env entries from the target process + * one entry at a time. Stop trying to read env entries if we reach a + * NULL pointer in the vector or hit our upper bound (which we take + * as the bufsz/4) to ensure we don't run off. + */ + rdsz = (p->p_model == DATAMODEL_ILP32 ? + sizeof (caddr32_t) : sizeof (caddr_t)); + bound = (int)(bufsz / 4); + for (cnt = 0, tmpp = up->u_envp; cnt < bound; cnt++, tmpp += rdsz) { + caddr_t tmp = NULL; + + if ((error = prreadbuf(p, tmpp, (char *)&tmp, rdsz, + NULL)) != 0) { + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (-1); + } + + if (tmp == NULL) + break; + } + if (cnt == 0) { + /* Return empty string. */ + buf[0] = '\0'; + *slen = 1; + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (0); + } + + /* + * Allocate space to store env array. + */ + envpsz = cnt * (p->p_model == DATAMODEL_ILP32 ? + sizeof (caddr32_t) : sizeof (caddr_t)); + envp = kmem_alloc(envpsz, KM_SLEEP); + + /* + * Extract the env array from the target process. 
+ */ + if ((error = prreadbuf(p, up->u_envp, (char *)envp, envpsz, + NULL)) != 0) { + kmem_free(envp, envpsz); + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (-1); + } + + /* + * Read each env string from the pointers in the env array. + */ + pos = 0; + for (i = 0; i < cnt; i++) { + size_t rdsz, trysz; + uintptr_t ev; + off_t j; + boolean_t found_nul; + boolean_t do_retry = B_TRUE; + +#ifdef _SYSCALL32_IMPL + if (p->p_model == DATAMODEL_ILP32) { + ev = (uintptr_t)((caddr32_t *)envp)[i]; + } else { + ev = (uintptr_t)envp[i]; + } +#else + ev = (uintptr_t)envp[i]; +#endif + + /* + * Stop trying to read env entries if we reach a NULL + * pointer in the vector. + */ + if (ev == (uintptr_t)NULL) + break; + + /* + * Stop reading if we have read the maximum length + * we can return to the user. + */ + if (pos >= bufsz) + break; + + /* + * Initially we try a short read, on the assumption that + * most individual env strings are less than 80 + * characters long. + */ + if ((trysz = MIN(80, bufsz - pos - 1)) < 80) { + /* + * We don't have room in the target buffer for even + * an entire short read, so there is no need to retry + * with a longer read. + */ + do_retry = B_FALSE; + } + +retry: + /* + * Read string data for this env var. Leave room + * in the buffer for a final NUL terminator. + */ + if ((error = prreadbuf(p, ev, (char *)&buf[pos], trysz, + &rdsz)) != 0) { + /* + * There was a problem reading this string + * from the process. Give up. + */ + break; + } + + /* + * Find the NUL terminator. + */ + found_nul = B_FALSE; + for (j = 0; j < rdsz; j++) { + if (buf[pos + j] == '\0') { + found_nul = B_TRUE; + break; + } + } + + if (!found_nul && do_retry) { + /* + * We did not find a NUL terminator, but this + * was a first pass short read. Try once more + * with feeling. + */ + trysz = bufsz - pos - 1; + do_retry = B_FALSE; + goto retry; + } + + /* + * Commit the string we read to the buffer. 
+ */ + pos += j + 1; + if (!found_nul && pos < bufsz) { + /* + * A NUL terminator was not found; add one. + */ + buf[pos++] = '\0'; + } + } + + /* + * Ensure the entire string is NUL-terminated. + */ + buf[bufsz - 1] = '\0'; + + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + kmem_free(envp, envpsz); + + /* + * If the operation was a success, return the copied string length + * to the caller. + */ + *slen = (error == 0) ? pos : 0; + + return (error); +} diff --git a/usr/src/uts/common/fs/proc/prcontrol.c b/usr/src/uts/common/fs/proc/prcontrol.c index 6b151a6369..07dcb1e7db 100644 --- a/usr/src/uts/common/fs/proc/prcontrol.c +++ b/usr/src/uts/common/fs/proc/prcontrol.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #include <sys/types.h> @@ -1481,7 +1481,7 @@ pr_setsig(prnode_t *pnp, siginfo_t *sip) } else if (t->t_state == TS_STOPPED && sig == SIGKILL) { /* If SIGKILL, set stopped lwp running */ p->p_stopsig = 0; - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; t->t_dtrace_stop = 0; setrun_locked(t); } @@ -2276,9 +2276,17 @@ pr_szoneid(proc_t *p, zoneid_t zoneid, cred_t *cr) return (EPERM); if (zoneid != GLOBAL_ZONEID && zoneid != p->p_zone->zone_id) return (EINVAL); - if ((zptr = zone_find_by_id(zoneid)) == NULL) - return (EINVAL); + /* + * We cannot hold p_lock when we call zone_find_by_id since that can + * lead to a deadlock. zone_find_by_id() takes zonehash_lock. + * zone_enter() can hold the zonehash_lock and needs p_lock when it + * calls task_join. 
+ */ mutex_exit(&p->p_lock); + if ((zptr = zone_find_by_id(zoneid)) == NULL) { + mutex_enter(&p->p_lock); + return (EINVAL); + } mutex_enter(&p->p_crlock); oldcred = p->p_cred; crhold(oldcred); diff --git a/usr/src/uts/common/fs/proc/prdata.h b/usr/src/uts/common/fs/proc/prdata.h index a661478c50..6d8ac2e6ed 100644 --- a/usr/src/uts/common/fs/proc/prdata.h +++ b/usr/src/uts/common/fs/proc/prdata.h @@ -27,7 +27,7 @@ /* All Rights Reserved */ /* - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. */ @@ -124,6 +124,8 @@ typedef enum prnodetype { #if defined(__i386) || defined(__amd64) PR_LDT, /* /proc/<pid>/ldt */ #endif + PR_ARGV, /* /proc/<pid>/argv */ + PR_CMDLINE, /* /proc/<pid>/cmdline */ PR_USAGE, /* /proc/<pid>/usage */ PR_LUSAGE, /* /proc/<pid>/lusage */ PR_PAGEDATA, /* /proc/<pid>/pagedata */ @@ -355,6 +357,9 @@ extern file_t *pr_getf(proc_t *, uint_t, short *); extern void pr_releasef(file_t *); extern void pr_setfault(proc_t *, fltset_t *); extern int prusrio(proc_t *, enum uio_rw, struct uio *, int); +extern int prreadargv(proc_t *, char *, size_t, size_t *); +extern int prreadcmdline(proc_t *, char *, size_t, size_t *); +extern int prreadenvv(proc_t *, char *, size_t, size_t *); extern int prwritectl(vnode_t *, struct uio *, cred_t *); extern int prlock(prnode_t *, int); extern void prunmark(proc_t *); diff --git a/usr/src/uts/common/fs/proc/prsubr.c b/usr/src/uts/common/fs/proc/prsubr.c index 5591ffd89b..be41826b54 100644 --- a/usr/src/uts/common/fs/proc/prsubr.c +++ b/usr/src/uts/common/fs/proc/prsubr.c @@ -222,6 +222,7 @@ prchoose(proc_t *p) case PR_SYSEXIT: case PR_SIGNALLED: case PR_FAULTED: + case PR_BRAND: /* * Make an lwp calling exit() be the * last lwp seen in the process. @@ -555,6 +556,12 @@ prexecend(void) pcp->prc_tslot = tslot; } } + + /* + * There may be threads waiting for the flag change blocked behind the + * pr_pid_cv as well. 
+ */ + cv_signal(&pr_pid_cv[p->p_slot]); } /* diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c index e535b1f647..39f8e6f01e 100644 --- a/usr/src/uts/common/fs/proc/prvnops.c +++ b/usr/src/uts/common/fs/proc/prvnops.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2018, Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright (c) 2017 by Delphix. All rights reserved. * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. * Copyright 2022 MNX Cloud, Inc. @@ -171,8 +171,12 @@ static prdirent_t piddir[] = { "contracts" }, { PR_SECFLAGS, 28 * sizeof (prdirent_t), sizeof (prdirent_t), "secflags" }, + { PR_ARGV, 29 * sizeof (prdirent_t), sizeof (prdirent_t), + "argv" }, + { PR_CMDLINE, 30 * sizeof (prdirent_t), sizeof (prdirent_t), + "cmdline" }, #if defined(__x86) - { PR_LDT, 29 * sizeof (prdirent_t), sizeof (prdirent_t), + { PR_LDT, 31 * sizeof (prdirent_t), sizeof (prdirent_t), "ldt" }, #endif }; @@ -593,6 +597,7 @@ static int pr_read_inval(), pr_read_as(), pr_read_status(), #if defined(__x86) pr_read_ldt(), #endif + pr_read_argv(), pr_read_cmdline(), pr_read_usage(), pr_read_lusage(), pr_read_pagedata(), pr_read_watch(), pr_read_lwpstatus(), pr_read_lwpsinfo(), pr_read_lwpusage(), pr_read_lwpname(), @@ -623,6 +628,8 @@ static int (*pr_read_function[PR_NFILES])() = { #if defined(__x86) pr_read_ldt, /* /proc/<pid>/ldt */ #endif + pr_read_argv, /* /proc/<pid>/argv */ + pr_read_cmdline, /* /proc/<pid>/cmdline */ pr_read_usage, /* /proc/<pid>/usage */ pr_read_lusage, /* /proc/<pid>/lusage */ pr_read_pagedata, /* /proc/<pid>/pagedata */ @@ -689,6 +696,76 @@ pr_uioread(void *base, long count, uio_t *uiop) } static int +pr_read_cmdline(prnode_t *pnp, uio_t *uiop) +{ + char *args; + int error; + size_t asz = PRMAXARGVLEN, sz; + + /* + * Allocate a scratch buffer for collection of the process arguments. 
+ */ + args = kmem_alloc(asz, KM_SLEEP); + + ASSERT(pnp->pr_type == PR_CMDLINE); + + if ((error = prlock(pnp, ZNO)) != 0) { + kmem_free(args, asz); + return (error); + } + + if ((error = prreadcmdline(pnp->pr_common->prc_proc, args, asz, + &sz)) != 0) { + prunlock(pnp); + kmem_free(args, asz); + return (error); + } + + prunlock(pnp); + + error = pr_uioread(args, sz, uiop); + + kmem_free(args, asz); + + return (error); +} + +static int +pr_read_argv(prnode_t *pnp, uio_t *uiop) +{ + char *args; + int error; + size_t asz = PRMAXARGVLEN, sz; + + /* + * Allocate a scratch buffer for collection of the process arguments. + */ + args = kmem_alloc(asz, KM_SLEEP); + + ASSERT(pnp->pr_type == PR_ARGV); + + if ((error = prlock(pnp, ZNO)) != 0) { + kmem_free(args, asz); + return (error); + } + + if ((error = prreadargv(pnp->pr_common->prc_proc, args, asz, + &sz)) != 0) { + prunlock(pnp); + kmem_free(args, asz); + return (error); + } + + prunlock(pnp); + + error = pr_uioread(args, sz, uiop); + + kmem_free(args, asz); + + return (error); +} + +static int pr_read_as(prnode_t *pnp, uio_t *uiop) { int error; @@ -1913,6 +1990,8 @@ static int (*pr_read_function_32[PR_NFILES])() = { #if defined(__x86) pr_read_ldt, /* /proc/<pid>/ldt */ #endif + pr_read_argv, /* /proc/<pid>/argv */ + pr_read_cmdline, /* /proc/<pid>/cmdline */ pr_read_usage_32, /* /proc/<pid>/usage */ pr_read_lusage_32, /* /proc/<pid>/lusage */ pr_read_pagedata_32, /* /proc/<pid>/pagedata */ @@ -2841,6 +2920,103 @@ prread(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct) #endif } +/* + * We make pr_write_psinfo_fname() somewhat simpler by asserting at compile + * time that PRFNSZ has the same definition as MAXCOMLEN. 
+ */ +#if PRFNSZ != MAXCOMLEN +#error PRFNSZ/MAXCOMLEN mismatch +#endif + +static int +pr_write_psinfo_fname(prnode_t *pnp, uio_t *uiop) +{ + char fname[PRFNSZ]; + int offset = offsetof(psinfo_t, pr_fname), error; + +#ifdef _SYSCALL32_IMPL + if (curproc->p_model != DATAMODEL_LP64) + offset = offsetof(psinfo32_t, pr_fname); +#endif + + /* + * If this isn't a write to pr_fname (or if the size doesn't match + * PRFNSZ) return. + */ + if (uiop->uio_offset != offset || uiop->uio_resid != PRFNSZ) + return (0); + + if ((error = uiomove(fname, PRFNSZ, UIO_WRITE, uiop)) != 0) + return (error); + + fname[PRFNSZ - 1] = '\0'; + + if ((error = prlock(pnp, ZNO)) != 0) + return (error); + + bcopy(fname, pnp->pr_common->prc_proc->p_user.u_comm, PRFNSZ); + + prunlock(pnp); + + return (0); +} + +/* + * We make pr_write_psinfo_psargs() somewhat simpler by asserting at compile + * time that PRARGSZ has the same definition as PSARGSZ. + */ +#if PRARGSZ != PSARGSZ +#error PRARGSZ/PSARGSZ mismatch +#endif + +static int +pr_write_psinfo_psargs(prnode_t *pnp, uio_t *uiop) +{ + char psargs[PRARGSZ]; + int offset = offsetof(psinfo_t, pr_psargs), error; + +#ifdef _SYSCALL32_IMPL + if (curproc->p_model != DATAMODEL_LP64) + offset = offsetof(psinfo32_t, pr_psargs); +#endif + + /* + * If this isn't a write to pr_psargs (or if the size doesn't match + * PRARGSZ) return. 
+ */ + if (uiop->uio_offset != offset || uiop->uio_resid != PRARGSZ) + return (0); + + if ((error = uiomove(psargs, PRARGSZ, UIO_WRITE, uiop)) != 0) + return (error); + + psargs[PRARGSZ - 1] = '\0'; + + if ((error = prlock(pnp, ZNO)) != 0) + return (error); + + bcopy(psargs, pnp->pr_common->prc_proc->p_user.u_psargs, PRARGSZ); + + prunlock(pnp); + + return (0); +} + +int +pr_write_psinfo(prnode_t *pnp, uio_t *uiop) +{ + int error; + + if ((error = pr_write_psinfo_fname(pnp, uiop)) != 0) + return (error); + + if ((error = pr_write_psinfo_psargs(pnp, uiop)) != 0) + return (error); + + return (0); +} + + /* Note we intentionally don't handle partial writes/updates. */ static int pr_write_lwpname(prnode_t *pnp, uio_t *uiop) @@ -2967,6 +3143,9 @@ prwrite(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct) uiop->uio_resid = resid; return (error); + case PR_PSINFO: + return (pr_write_psinfo(pnp, uiop)); + case PR_LWPNAME: return (pr_write_lwpname(pnp, uiop)); @@ -3296,6 +3475,13 @@ prgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, case PR_AUXV: vap->va_size = __KERN_NAUXV_IMPL * PR_OBJSIZE(auxv32_t, auxv_t); break; + case PR_ARGV: + if ((p->p_flag & SSYS) || p->p_as == &kas) { + vap->va_size = PSARGSZ; + } else { + vap->va_size = PRMAXARGVLEN; + } + break; #if defined(__x86) case PR_LDT: mutex_exit(&p->p_lock); @@ -3418,6 +3604,7 @@ prgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, #endif case PR_CTL: case PR_LWPCTL: + case PR_CMDLINE: default: vap->va_size = 0; break; @@ -3472,6 +3659,8 @@ praccess(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) case PR_USAGE: case PR_LUSAGE: case PR_LWPUSAGE: + case PR_ARGV: + case PR_CMDLINE: p = pr_p_lock(pnp); mutex_exit(&pr_pidlock); if (p == NULL) @@ -3557,6 +3746,8 @@ static vnode_t *(*pr_lookup_function[PR_NFILES])() = { #if defined(__x86) pr_lookup_notdir, /* /proc/<pid>/ldt */ #endif + pr_lookup_notdir, /* /proc/<pid>/argv */ + pr_lookup_notdir, /* /proc/<pid>/cmdline 
*/ pr_lookup_notdir, /* /proc/<pid>/usage */ pr_lookup_notdir, /* /proc/<pid>/lusage */ pr_lookup_notdir, /* /proc/<pid>/pagedata */ @@ -4887,16 +5078,18 @@ prgetnode(vnode_t *dp, prnodetype_t type) pnp->pr_mode = 0600; /* read-write by owner only */ break; + case PR_PSINFO: case PR_LWPNAME: pnp->pr_mode = 0644; /* readable by all + owner can write */ break; - case PR_PSINFO: case PR_LPSINFO: case PR_LWPSINFO: case PR_USAGE: case PR_LUSAGE: case PR_LWPUSAGE: + case PR_ARGV: + case PR_CMDLINE: pnp->pr_mode = 0444; /* read-only by all */ break; @@ -5004,6 +5197,8 @@ static int (*pr_readdir_function[PR_NFILES])() = { #if defined(__x86) pr_readdir_notdir, /* /proc/<pid>/ldt */ #endif + pr_readdir_notdir, /* /proc/<pid>/argv */ + pr_readdir_notdir, /* /proc/<pid>/cmdline */ pr_readdir_notdir, /* /proc/<pid>/usage */ pr_readdir_notdir, /* /proc/<pid>/lusage */ pr_readdir_notdir, /* /proc/<pid>/pagedata */ @@ -5157,6 +5352,8 @@ pr_readdir_piddir(prnode_t *pnp, uio_t *uiop, int *eofp) case PR_PROCDIR: case PR_PSINFO: case PR_USAGE: + case PR_ARGV: + case PR_CMDLINE: break; default: continue; diff --git a/usr/src/uts/common/fs/smbsrv/smb_kshare.c b/usr/src/uts/common/fs/smbsrv/smb_kshare.c index 01d382fed7..056619d90b 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_kshare.c +++ b/usr/src/uts/common/fs/smbsrv/smb_kshare.c @@ -351,6 +351,7 @@ smb_kshare_g_fini(void) kmem_cache_destroy(smb_kshare_cache_share); } + /* * A list of shares in nvlist format can be sent down * from userspace thourgh the IOCTL interface. 
The nvlist diff --git a/usr/src/uts/common/fs/smbsrv/smb_server.c b/usr/src/uts/common/fs/smbsrv/smb_server.c index 7f56792f7d..af12a0c30b 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_server.c +++ b/usr/src/uts/common/fs/smbsrv/smb_server.c @@ -897,6 +897,22 @@ smb_server_enum(smb_ioc_svcenum_t *ioc) smb_svcenum_t *svcenum = &ioc->svcenum; smb_server_t *sv; int rc; + uint32_t buflen_adjusted; + + /* + * Reality check that the buffer-length insize the enum doesn't + * overrun the ioctl's total length. + * + * NOTE: Assume se_buf is at the end of smb_svcenum_t. + */ + buflen_adjusted = svcenum->se_buflen + + offsetof(smb_svcenum_t, se_buf) + sizeof (ioc->hdr); + if (buflen_adjusted < svcenum->se_buflen || /* Overflow check 1, */ + buflen_adjusted < offsetof(smb_svcenum_t, se_buf) || /* check 2, */ + buflen_adjusted < sizeof (ioc->hdr) || /* check 3. */ + buflen_adjusted > ioc->hdr.len) { + return (EINVAL); + } /* * Reality check that the buffer-length insize the enum doesn't diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c index b1f74b993b..768a001d72 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c @@ -129,7 +129,7 @@ so_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen, { int error; - SO_BLOCK_FALLBACK(so, SOP_BIND(so, name, namelen, flags, cr)); + SO_BLOCK_FALLBACK_SAFE(so, SOP_BIND(so, name, namelen, flags, cr)); ASSERT(flags == _SOBIND_XPG4_2 || flags == _SOBIND_SOCKBSD); @@ -279,7 +279,7 @@ so_connect(struct sonode *so, struct sockaddr *name, * This can happen if a non blocking operation caused an error. 
*/ - if (so->so_error != 0) { + if (so->so_error != 0 && (so->so_mode & SM_DEFERERR) == 0) { mutex_enter(&so->so_lock); error = sogeterr(so, B_TRUE); mutex_exit(&so->so_lock); @@ -378,7 +378,7 @@ so_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, break; } - if (so->so_error != 0) { + if (so->so_error != 0 && (so->so_mode & SM_DEFERERR) == 0) { mutex_enter(&so->so_lock); error = sogeterr(so, B_TRUE); mutex_exit(&so->so_lock); @@ -487,7 +487,7 @@ so_sendmblk_impl(struct sonode *so, struct nmsghdr *msg, int fflag, error = EPIPE; break; } - if (so->so_error != 0) { + if (so->so_error != 0 && (so->so_mode & SM_DEFERERR) == 0) { mutex_enter(&so->so_lock); error = sogeterr(so, B_TRUE); mutex_exit(&so->so_lock); @@ -622,7 +622,7 @@ so_getsockname(struct sonode *so, struct sockaddr *addr, { int error; - SO_BLOCK_FALLBACK(so, SOP_GETSOCKNAME(so, addr, addrlen, cr)); + SO_BLOCK_FALLBACK_SAFE(so, SOP_GETSOCKNAME(so, addr, addrlen, cr)); if (so->so_filter_active == 0 || (error = sof_filter_getsockname(so, addr, addrlen, cr)) < 0) @@ -671,7 +671,7 @@ so_getsockopt(struct sonode *so, int level, int option_name, if (level == SOL_FILTER) return (sof_getsockopt(so, option_name, optval, optlenp, cr)); - SO_BLOCK_FALLBACK(so, + SO_BLOCK_FALLBACK_SAFE(so, SOP_GETSOCKOPT(so, level, option_name, optval, optlenp, flags, cr)); if ((so->so_filter_active == 0 || @@ -760,7 +760,7 @@ so_setsockopt(struct sonode *so, int level, int option_name, if (level == SOL_FILTER) return (sof_setsockopt(so, option_name, optval, optlen, cr)); - SO_BLOCK_FALLBACK(so, + SO_BLOCK_FALLBACK_SAFE(so, SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr)); /* X/Open requires this check */ @@ -845,7 +845,7 @@ so_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode, * If there is a pending error, return error * This can happen if a non blocking operation caused an error. 
*/ - if (so->so_error != 0) { + if (so->so_error != 0 && (so->so_mode & SM_DEFERERR) == 0) { mutex_enter(&so->so_lock); error = sogeterr(so, B_TRUE); mutex_exit(&so->so_lock); diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c index ab9c479af3..df159a122c 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c @@ -671,10 +671,15 @@ so_dequeue_msg(struct sonode *so, mblk_t **mctlp, struct uio *uiop, int more = 0; int error; ssize_t oobmark; + ssize_t copied = 0; sodirect_t *sodp = so->so_direct; + xuio_t *xuio = NULL; partial_read = B_FALSE; *mctlp = NULL; + if ((uiop->uio_extflg & UIO_XUIO) != 0) { + xuio = (xuio_t *)uiop; + } again: mutex_enter(&so->so_lock); again1: @@ -785,8 +790,6 @@ again1: * enabled socket, uio_resid can be 0. */ if (uiop->uio_resid >= 0) { - ssize_t copied = 0; - if (sodp != NULL && (DB_FLAGS(mp) & DBLK_UIOA)) { mutex_enter(&so->so_lock); ASSERT(uiop == (uio_t *)&sodp->sod_uioa); @@ -844,6 +847,18 @@ again1: } if (mp != NULL) { /* more data blocks in msg */ more |= MOREDATA; + + /* + * If requested, tally up remaining data along with the + * amount already copied. + */ + if (xuio != NULL && + xuio->xu_type == UIOTYPE_PEEKSIZE) { + xuio->xu_ext.xu_ps.xu_ps_set = B_TRUE; + xuio->xu_ext.xu_ps.xu_ps_size = + copied + msgdsize(mp); + } + if ((flags & (MSG_PEEK|MSG_TRUNC))) { if (flags & MSG_PEEK) { freemsg(mp); diff --git a/usr/src/uts/common/fs/sockfs/sockfilter.c b/usr/src/uts/common/fs/sockfs/sockfilter.c index 1fa4efe59f..62a079f419 100644 --- a/usr/src/uts/common/fs/sockfs/sockfilter.c +++ b/usr/src/uts/common/fs/sockfs/sockfilter.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. 
*/ #include <sys/systm.h> @@ -246,6 +247,18 @@ sof_setsockopt_impl(struct sonode *so, int option_name, /* Module loaded OK, so there must be an ops vector */ ASSERT(ent->sofe_mod != NULL); + + /* + * Check again to confirm ATTACH is ok. See if the the module + * is not SOF_ATT_SAFE after an unsafe operation has taken + * place. + */ + if ((ent->sofe_mod->sofm_flags & SOF_ATT_SAFE) == 0 && + so->so_state & SS_FILOP_UNSF) { + sof_instance_destroy(inst); + return (EINVAL); + } + inst->sofi_ops = &ent->sofe_mod->sofm_ops; SOF_STAT_ADD(inst, tot_active_attach, 1); @@ -1445,7 +1458,13 @@ sof_filter_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode, * sof_register(version, name, ops, flags) * * Register a socket filter identified by name `name' and which should use - * the ops vector `ops' for event notification. `flags' should be set to 0. + * the ops vector `ops' for event notification. `flags' should be set to 0 + * by default for "unsafe" modules or SOF_ATT_SAFE for "safe" modules. An + * unsafe filter is one that cannot be attached after any socket operation has + * occured. This is the legacy default. A "safe" filter can be attached even + * after some basic initial socket operations have taken place. This set is + * currently bind, getsockname, getsockopt and setsockopt. The order in which + * a "safe" filter can be attached is more relaxed, and thus more flexible. * On success 0 is returned, otherwise an errno is returned. 
*/ int @@ -1453,14 +1472,13 @@ sof_register(int version, const char *name, const sof_ops_t *ops, int flags) { sof_module_t *mod; - _NOTE(ARGUNUSED(flags)); - if (version != SOF_VERSION) return (EINVAL); mod = kmem_zalloc(sizeof (sof_module_t), KM_SLEEP); mod->sofm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP); (void) strcpy(mod->sofm_name, name); + mod->sofm_flags = flags; mod->sofm_ops = *ops; mutex_enter(&sof_module_lock); diff --git a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h index e9a09bad88..e63831e172 100644 --- a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h +++ b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #ifndef _SOCKFS_SOCKFILTER_H @@ -51,6 +52,7 @@ typedef struct sof_kstat sof_kstat_t; struct sof_module { char *sofm_name; + int sofm_flags; sof_ops_t sofm_ops; uint_t sofm_refcnt; list_node_t sofm_node; diff --git a/usr/src/uts/common/fs/sockfs/socksubr.c b/usr/src/uts/common/fs/sockfs/socksubr.c index 3262150f79..739d439851 100644 --- a/usr/src/uts/common/fs/sockfs/socksubr.c +++ b/usr/src/uts/common/fs/sockfs/socksubr.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright 2015, Joyent, Inc. All rights reserved. * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. @@ -420,8 +421,10 @@ sogetoff(mblk_t *mp, t_uscalar_t offset, * * The underlying filesystem VSOCK vnode has a v_stream pointer that * references the actual stream head (hence indirectly the actual sonode). + * + * This function is non-static so it can be used by brand emulation. 
*/ -static int +int so_ux_lookup(struct sonode *so, struct sockaddr_un *soun, int checkaccess, vnode_t **vpp) { diff --git a/usr/src/uts/common/fs/sockfs/socktpi_impl.h b/usr/src/uts/common/fs/sockfs/socktpi_impl.h index 6a515be122..24acb81a0a 100644 --- a/usr/src/uts/common/fs/sockfs/socktpi_impl.h +++ b/usr/src/uts/common/fs/sockfs/socktpi_impl.h @@ -22,6 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ #ifndef _SOCKFS_SOCKTPI_IMPL_H @@ -56,6 +57,8 @@ extern int sogetrderr(vnode_t *, int, int *); extern int sogetwrerr(vnode_t *, int, int *); extern int so_addr_verify(struct sonode *, const struct sockaddr *, socklen_t); +extern int so_ux_lookup(struct sonode *, struct sockaddr_un *, int, + vnode_t **); extern int so_ux_addr_xlate(struct sonode *, struct sockaddr *, socklen_t, int, void **, socklen_t *); extern void so_unix_close(struct sonode *); diff --git a/usr/src/uts/common/fs/swapfs/swap_subr.c b/usr/src/uts/common/fs/swapfs/swap_subr.c index 74c4302da9..a4d983665b 100644 --- a/usr/src/uts/common/fs/swapfs/swap_subr.c +++ b/usr/src/uts/common/fs/swapfs/swap_subr.c @@ -110,9 +110,11 @@ swapfs_recalc(pgcnt_t pgs) * memory that can be used as swap space should do so by * setting swapfs_desfree at boot time, not swapfs_minfree. * However, swapfs_minfree is tunable by install as a - * workaround for bugid 1147463. + * workaround for bugid 1147463. Note swapfs_minfree is set + * to 1/8th of memory, but clamped at the limit of 256 MB. */ - new_swapfs_minfree = MAX(btopr(2 * 1024 * 1024), pgs >> 3); + new_swapfs_minfree = MIN(MAX(btopr(2 * 1024 * 1024), pgs >> 3), + btopr(256 * 1024 * 1024)); } /* diff --git a/usr/src/uts/common/fs/tmpfs/tmp_dir.c b/usr/src/uts/common/fs/tmpfs/tmp_dir.c index 06ef8dd7fd..b28ced7111 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_dir.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_dir.c @@ -21,10 +21,9 @@ /* * Copyright 2007 Sun Microsystems, Inc. 
All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/param.h> #include <sys/sysmacros.h> @@ -445,20 +444,7 @@ tdirenter( /* * Unmake the inode we just made. */ - rw_enter(&tp->tn_rwlock, RW_WRITER); - if ((tp->tn_type) == VDIR) { - ASSERT(tdp == NULL); - /* - * cleanup allocs made by tdirinit() - */ - tdirtrunc(tp); - } - mutex_enter(&tp->tn_tlock); - tp->tn_nlink = 0; - mutex_exit(&tp->tn_tlock); - gethrestime(&tp->tn_ctime); - rw_exit(&tp->tn_rwlock); - tmpnode_rele(tp); + tmpnode_cleanup(tp); tp = NULL; } } else if (tpp) { @@ -493,6 +479,7 @@ tdirdelete( enum dr_op op, struct cred *cred) { + struct tmount *tm; struct tdirent *tpdp; int error; size_t namelen; @@ -578,7 +565,8 @@ tdirdelete( */ namelen = strlen(tpdp->td_name) + 1; - tmp_memfree(tpdp, sizeof (struct tdirent) + namelen); + tm = TNTOTM(dir); + tmp_kmem_free(tm, tpdp, sizeof (struct tdirent) + namelen); dir->tn_size -= (sizeof (struct tdirent) + namelen); dir->tn_dirents--; @@ -600,19 +588,27 @@ tdirdelete( * tdirinit is used internally to initialize a directory (dir) * with '.' and '..' 
entries without checking permissions and locking */ -void +int tdirinit( struct tmpnode *parent, /* parent of directory to initialize */ struct tmpnode *dir) /* the new directory */ { + struct tmount *tm; struct tdirent *dot, *dotdot; timestruc_t now; ASSERT(RW_WRITE_HELD(&parent->tn_rwlock)); ASSERT(dir->tn_type == VDIR); - dot = tmp_memalloc(sizeof (struct tdirent) + 2, TMP_MUSTHAVE); - dotdot = tmp_memalloc(sizeof (struct tdirent) + 3, TMP_MUSTHAVE); + tm = TNTOTM(parent); + dot = tmp_kmem_zalloc(tm, sizeof (struct tdirent) + 2, KM_SLEEP); + if (dot == NULL) + return (ENOSPC); + dotdot = tmp_kmem_zalloc(tm, sizeof (struct tdirent) + 3, KM_SLEEP); + if (dotdot == NULL) { + tmp_kmem_free(tm, dot, sizeof (struct tdirent) + 2); + return (ENOSPC); + } /* * Initialize the entries @@ -663,6 +659,8 @@ tdirinit( dir->tn_size = 2 * sizeof (struct tdirent) + 5; /* dot and dotdot */ dir->tn_dirents = 2; dir->tn_nlink = 2; + + return (0); } @@ -674,6 +672,7 @@ tdirtrunc(struct tmpnode *dir) { struct tdirent *tdp; struct tmpnode *tp; + struct tmount *tm; size_t namelen; timestruc_t now; int isvattrdir, isdotdot, skip_decr; @@ -681,6 +680,8 @@ tdirtrunc(struct tmpnode *dir) ASSERT(RW_WRITE_HELD(&dir->tn_rwlock)); ASSERT(dir->tn_type == VDIR); + tm = TNTOTM(dir); + isvattrdir = (dir->tn_vnode->v_flag & V_XATTRDIR) ? 
1 : 0; for (tdp = dir->tn_dir; tdp; tdp = dir->tn_dir) { ASSERT(tdp->td_next != tdp); @@ -712,7 +713,7 @@ tdirtrunc(struct tmpnode *dir) tmpfs_hash_out(tdp); - tmp_memfree(tdp, sizeof (struct tdirent) + namelen); + tmp_kmem_free(tm, tdp, sizeof (struct tdirent) + namelen); dir->tn_size -= (sizeof (struct tdirent) + namelen); dir->tn_dirents--; } @@ -965,6 +966,7 @@ tdiraddentry( enum de_op op, struct tmpnode *fromtp) { + struct tmount *tm; struct tdirent *tdp, *tpdp; size_t namelen, alloc_size; timestruc_t now; @@ -985,9 +987,10 @@ tdiraddentry( /* * Allocate and initialize directory entry */ + tm = TNTOTM(dir); namelen = strlen(name) + 1; alloc_size = namelen + sizeof (struct tdirent); - tdp = tmp_memalloc(alloc_size, 0); + tdp = tmp_kmem_zalloc(tm, alloc_size, KM_NOSLEEP_LAZY); if (tdp == NULL) return (ENOSPC); @@ -1087,7 +1090,10 @@ tdirmaketnode( ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime))) return (EOVERFLOW); type = va->va_type; - tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE); + tp = tmp_kmem_zalloc(tm, sizeof (struct tmpnode), KM_SLEEP); + if (tp == NULL) { + return (ENOSPC); + } tmpnode_init(tm, tp, va, cred); /* setup normal file/dir's extended attribute directory */ @@ -1149,8 +1155,13 @@ tdirmaketnode( if (va->va_mask & AT_MTIME) tp->tn_mtime = va->va_mtime; - if (op == DE_MKDIR) - tdirinit(dir, tp); + if (op == DE_MKDIR) { + int ret; + if ((ret = tdirinit(dir, tp)) != 0) { + tmpnode_cleanup(tp); + return (ret); + } + } *newnode = tp; return (0); diff --git a/usr/src/uts/common/fs/tmpfs/tmp_subr.c b/usr/src/uts/common/fs/tmpfs/tmp_subr.c index 8723631555..0c48c03a75 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_subr.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_subr.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015 Joyent, Inc. + * Copyright 2016 Joyent, Inc. 
*/ #include <sys/types.h> @@ -43,6 +43,7 @@ #include <sys/fs/tmpnode.h> #include <sys/ddi.h> #include <sys/sunddi.h> +#include <vm/anon.h> #define KILOBYTE 1024 #define MEGABYTE (1024 * KILOBYTE) @@ -54,6 +55,80 @@ extern pgcnt_t swapfs_minfree; +void * +tmp_kmem_zalloc(struct tmount *tm, size_t size, int flag) +{ + void *buf; + zone_t *zone; + size_t pages; + + mutex_enter(&tm->tm_contents); + zone = tm->tm_vfsp->vfs_zone; + if (tm->tm_anonmem + size > tm->tm_anonmax || + tm->tm_anonmem + size < tm->tm_anonmem || + size + ptob(tmpfs_minfree) <= size || + !anon_checkspace(size + ptob(tmpfs_minfree), zone)) { + mutex_exit(&tm->tm_contents); + return (NULL); + } + + /* + * Only make anonymous memory reservations when a page boundary is + * crossed. This is necessary since the anon_resv functions rounds up + * to PAGESIZE internally. + */ + pages = btopr(tm->tm_allocmem + size); + pages -= btopr(tm->tm_allocmem); + if (pages > 0 && anon_try_resv_zone(ptob(pages), zone) == 0) { + mutex_exit(&tm->tm_contents); + return (NULL); + } + + tm->tm_allocmem += size; + tm->tm_anonmem += size; + mutex_exit(&tm->tm_contents); + + buf = kmem_zalloc(size, flag); + if (buf == NULL) { + mutex_enter(&tm->tm_contents); + ASSERT(tm->tm_anonmem > tm->tm_anonmem - size); + tm->tm_anonmem -= size; + if (pages > 0) { + /* + * Re-chasing the zone pointer is necessary since a + * forced umount could have been performed while the + * tm_contents lock was dropped during allocation. 
+ */ + anon_unresv_zone(ptob(pages), tm->tm_vfsp->vfs_zone); + } + mutex_exit(&tm->tm_contents); + } + + return (buf); +} + +void +tmp_kmem_free(struct tmount *tm, void *buf, size_t size) +{ + size_t pages; + + kmem_free(buf, size); + mutex_enter(&tm->tm_contents); + ASSERT(tm->tm_anonmem > tm->tm_anonmem - size); + tm->tm_anonmem -= size; + pages = btopr(tm->tm_allocmem); + tm->tm_allocmem -= size; + pages -= btopr(tm->tm_allocmem); + /* + * Like the tmp_kmem_zalloc case, only unreserve anonymous memory when + * a page boundary has been crossed. + */ + if (pages > 0) { + anon_unresv_zone(size, tm->tm_vfsp->vfs_zone); + } + mutex_exit(&tm->tm_contents); +} + int tmp_taccess(void *vtp, int mode, struct cred *cred) { @@ -99,42 +174,8 @@ tmp_sticky_remove_access(struct tmpnode *dir, struct tmpnode *entry, } /* - * Allocate zeroed memory if tmpfs_maxkmem has not been exceeded - * or the 'musthave' flag is set. 'musthave' allocations should - * always be subordinate to normal allocations so that tmpfs_maxkmem - * can't be exceeded by more than a few KB. Example: when creating - * a new directory, the tmpnode is a normal allocation; if that - * succeeds, the dirents for "." and ".." are 'musthave' allocations. - */ -void * -tmp_memalloc(size_t size, int musthave) -{ - static time_t last_warning; - time_t now; - - if (atomic_add_long_nv(&tmp_kmemspace, size) < tmpfs_maxkmem || - musthave) - return (kmem_zalloc(size, KM_SLEEP)); - - atomic_add_long(&tmp_kmemspace, -size); - now = gethrestime_sec(); - if (last_warning != now) { - last_warning = now; - cmn_err(CE_WARN, "tmp_memalloc: tmpfs over memory limit"); - } - return (NULL); -} - -void -tmp_memfree(void *cp, size_t size) -{ - kmem_free(cp, size); - atomic_add_long(&tmp_kmemspace, -size); -} - -/* - * Convert a string containing a number (number of bytes) to a pgcnt_t, - * containing the corresponding number of pages. 
On 32-bit kernels, the + * Convert a string containing a number (number of bytes) to a size_t, + * containing the corresponding number of bytes. On 32-bit kernels, the * maximum value encoded in 'str' is PAGESIZE * ULONG_MAX, while the value * returned in 'maxpg' is at most ULONG_MAX. * @@ -152,7 +193,7 @@ tmp_memfree(void *cp, size_t size) * error. */ int -tmp_convnum(char *str, pgcnt_t *maxpg) +tmp_convnum(char *str, size_t *maxbytes) { u_longlong_t num = 0; #ifdef _LP64 @@ -160,6 +201,7 @@ tmp_convnum(char *str, pgcnt_t *maxpg) #else u_longlong_t max_bytes = PAGESIZE * (uint64_t)ULONG_MAX; #endif + size_t pages; char *c; const struct convchar { char *cc_char; @@ -250,13 +292,21 @@ valid_char: done: /* - * Since btopr() rounds up to page granularity, this round-up can - * cause an overflow only if 'num' is between (max_bytes - PAGESIZE) - * and (max_bytes). In this case the resulting number is zero, which - * is what we check for below. + * We've been given a size in bytes; however, we want to make sure that + * we have at least one page worth no matter what. Therefore we use + * btopr to round up. However, this may cause an overflow only if 'num' + * is between (max_bytes - PAGESIZE) and (max_bytes). In this case the + * resulting number is zero, which is what we check for below. Note, we + * require at least one page, so if pages is zero, well, it wasn't going + * to work anyways. */ - if ((*maxpg = (pgcnt_t)btopr(num)) == 0 && num != 0) + pages = btopr(num); + if (pages == 0) { return (EINVAL); + } + + *maxbytes = ptob(pages); + return (0); } diff --git a/usr/src/uts/common/fs/tmpfs/tmp_tnode.c b/usr/src/uts/common/fs/tmpfs/tmp_tnode.c index 51e57b2611..13ea356924 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_tnode.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_tnode.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. 
*/ #include <sys/types.h> @@ -64,21 +65,35 @@ tmp_resv( int pagecreate) /* call anon_resv if set */ { pgcnt_t pages = btopr(delta); + size_t pbytes = ptob(pages); zone_t *zone; ASSERT(RW_WRITE_HELD(&tp->tn_rwlock)); ASSERT(tp->tn_type == VREG); + /* - * pagecreate is set only if we actually need to call anon_resv - * to reserve an additional page of anonymous memory. - * Since anon_resv always reserves a page at a time, - * it should only get called when we know we're growing the - * file into a new page or filling a hole. + * pagecreate is set only if we actually need to call anon_resv to + * reserve an additional page of anonymous memory. Since anon_resv + * always reserves a page at a time, it should only get called when we + * know we're growing the file into a new page or filling a hole. This + * is why we transform delta into a number of pages. However, because we + * track bytes and not pages, we convert that back to a number of bytes + * that we allocate against. * - * Deny if trying to reserve more than tmpfs can allocate + * Deny if trying to reserve more than tmpfs can allocate, the + * allocation causes an overflow, or the delta round up overflowed. + * Note, that btopr rounds up, so we need to catch the unsigned + * overflow. Note, rounding up when we are within a page of SIZE_MAX is + * done by adding a page, overflowing, which will then be rounded back + * to zero. Hence the following check. 
*/ + if (pages == 0 && delta != 0) + return (1); + zone = tm->tm_vfsp->vfs_zone; - if (pagecreate && ((tm->tm_anonmem + pages > tm->tm_anonmax) || + if (pagecreate && ((tm->tm_anonmem + pbytes > tm->tm_anonmax) || + (tm->tm_anonmem + pbytes < tm->tm_anonmem) || + (ptob(pages + tmpfs_minfree) <= pbytes) || (!anon_checkspace(ptob(pages + tmpfs_minfree), zone)) || (anon_try_resv_zone(delta, zone) == 0))) { return (1); @@ -89,7 +104,7 @@ tmp_resv( */ if (pagecreate) { mutex_enter(&tm->tm_contents); - tm->tm_anonmem += pages; + tm->tm_anonmem += pbytes; mutex_exit(&tm->tm_contents); TRACE_2(TR_FAC_VM, TR_ANON_TMPFS, "anon tmpfs:%p %lu", @@ -110,13 +125,27 @@ tmp_unresv( struct tmpnode *tp, size_t delta) { + size_t pages, pbytes; + ASSERT(RW_WRITE_HELD(&tp->tn_rwlock)); ASSERT(tp->tn_type == VREG); + /* + * If this is true, we have a grevious overflow bug and some size + * accounting has been messed with as having an amount to truncate at + * this size would imply that all of memory was used for this file. No + * matter how small the kernel, it will always need at least one page. + */ + pages = btopr(delta); + if (pages == 0 && delta != 0) + panic("tmpfs unsigned overflow detected"); + pbytes = ptob(pages); + anon_unresv_zone(delta, tm->tm_vfsp->vfs_zone); mutex_enter(&tm->tm_contents); - tm->tm_anonmem -= btopr(delta); + ASSERT(tm->tm_anonmem > tm->tm_anonmem - pbytes); + tm->tm_anonmem -= pbytes; mutex_exit(&tm->tm_contents); TRACE_2(TR_FAC_VM, TR_ANON_TMPFS, "anon tmpfs:%p %lu", tp, delta); @@ -154,6 +183,26 @@ tmpnode_growmap(struct tmpnode *tp, ulong_t newsize) } /* + * This is used to clean up a tmpnode that hasn't made it out the door. In other + * words, we allocated it and did a tmpnode_init; however, before it could get + * fully inserted into a directory, bad things happened and it failed. 
+ */ +void +tmpnode_cleanup(struct tmpnode *tp) +{ + rw_enter(&tp->tn_rwlock, RW_WRITER); + if ((tp->tn_type) == VDIR) { + tdirtrunc(tp); + } + mutex_enter(&tp->tn_tlock); + tp->tn_nlink = 0; + mutex_exit(&tp->tn_tlock); + gethrestime(&tp->tn_ctime); + rw_exit(&tp->tn_rwlock); + tmpnode_rele(tp); +} + +/* * Initialize a tmpnode and add it to file list under mount point. */ void @@ -232,7 +281,6 @@ tmpnode_trunc( { size_t oldsize = tp->tn_size; size_t delta; - struct vnode *vp = TNTOV(tp); timestruc_t now; int error = 0; @@ -316,7 +364,7 @@ tmpnode_trunc( /* Delete anon array for tmpnode */ ASSERT(tp->tn_nblocks == 0); ASSERT(anon_get_ptr(tp->tn_anon, 0) == NULL); - ASSERT(!vn_has_cached_data(vp)); + ASSERT(!vn_has_cached_data(TNTOV(tp))); anon_release(tp->tn_anon, tp->tn_asize); tp->tn_anon = NULL; diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c index a7cf62cb99..24310fefe5 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015 Joyent, Inc. + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -56,6 +56,15 @@ static int tmpfsfstype; /* + * tmpfs_mountcount is used to prevent module unloads while there is still + * state from a former mount hanging around. With forced umount support, the + * filesystem module must not be allowed to go away before the last + * VFS_FREEVFS() call has been made. Since this is just an atomic counter, + * there's no need for locking. + */ +static uint32_t tmpfs_mountcount; + +/* * tmpfs vfs operations. 
*/ static int tmpfsinit(int, char *); @@ -65,6 +74,7 @@ static int tmp_unmount(struct vfs *, int, struct cred *); static int tmp_root(struct vfs *, struct vnode **); static int tmp_statvfs(struct vfs *, struct statvfs64 *); static int tmp_vget(struct vfs *, struct vnode **, struct fid *); +static void tmp_freevfs(vfs_t *vfsp); /* * Loadable module wrapper @@ -123,6 +133,14 @@ _fini() { int error; + /* + * If a forceably unmounted instance is still hanging around, we cannot + * allow the module to be unloaded because that would cause panics once + * the VFS framework decides it's time to call into VFS_FREEVFS(). + */ + if (tmpfs_mountcount) + return (EBUSY); + error = mod_remove(&modlinkage); if (error) return (error); @@ -141,14 +159,6 @@ _info(struct modinfo *modinfop) } /* - * The following are patchable variables limiting the amount of system - * resources tmpfs can use. - * - * tmpfs_maxkmem limits the amount of kernel kmem_alloc memory - * tmpfs can use for it's data structures (e.g. tmpnodes, directory entries) - * It is not determined by setting a hard limit but rather as a percentage of - * physical memory which is determined when tmpfs is first used in the system. - * * tmpfs_minfree is the minimum amount of swap space that tmpfs leaves for * the rest of the system. In other words, if the amount of free swap space * in the system (i.e. anoninfo.ani_free) drops below tmpfs_minfree, tmpfs @@ -157,9 +167,7 @@ _info(struct modinfo *modinfop) * There is also a per mount limit on the amount of swap space * (tmount.tm_anonmax) settable via a mount option. 
*/ -size_t tmpfs_maxkmem = 0; size_t tmpfs_minfree = 0; -size_t tmp_kmemspace; /* bytes of kernel heap used by all tmpfs */ static major_t tmpfs_major; static minor_t tmpfs_minor; @@ -178,6 +186,7 @@ tmpfsinit(int fstype, char *name) VFSNAME_ROOT, { .vfs_root = tmp_root }, VFSNAME_STATVFS, { .vfs_statvfs = tmp_statvfs }, VFSNAME_VGET, { .vfs_vget = tmp_vget }, + VFSNAME_FREEVFS, { .vfs_freevfs = tmp_freevfs }, NULL, NULL }; int error; @@ -212,18 +221,12 @@ tmpfsinit(int fstype, char *name) tmpfs_minfree = btopr(TMPMINFREE); } - /* - * The maximum amount of space tmpfs can allocate is - * TMPMAXPROCKMEM percent of kernel memory - */ - if (tmpfs_maxkmem == 0) - tmpfs_maxkmem = MAX(PAGESIZE, kmem_maxavail() / TMPMAXFRACKMEM); - if ((tmpfs_major = getudev()) == (major_t)-1) { cmn_err(CE_WARN, "tmpfsinit: Can't get unique device number."); tmpfs_major = 0; } mutex_init(&tmpfs_minor_lock, NULL, MUTEX_DEFAULT, NULL); + tmpfs_mountcount = 0; return (0); } @@ -234,7 +237,7 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) struct tmpnode *tp; struct pathname dpn; int error; - pgcnt_t anonmax; + size_t anonmax; struct vattr rattr; int got_attrs; boolean_t mode_arg = B_FALSE; @@ -278,7 +281,18 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) if ((error = tmp_convnum(argstr, &anonmax)) != 0) goto out; } else { - anonmax = ULONG_MAX; + anonmax = SIZE_MAX; + } + + /* + * The "mode" mount argument allows the operator to override the + * permissions of the root of the tmpfs mount. 
+ */ + if (vfs_optionisset(vfsp, "mode", &argstr)) { + if ((error = tmp_convmode(argstr, &root_mode)) != 0) { + goto out; + } + mode_arg = B_TRUE; } /* @@ -311,7 +325,8 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) goto out; } - if ((tm = tmp_memalloc(sizeof (struct tmount), 0)) == NULL) { + if ((tm = kmem_zalloc(sizeof (struct tmount), KM_NOSLEEP_LAZY)) == + NULL) { pn_free(&dpn); error = ENOMEM; goto out; @@ -343,17 +358,37 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) vfsp->vfs_bsize = PAGESIZE; vfsp->vfs_flag |= VFS_NOTRUNC; vfs_make_fsid(&vfsp->vfs_fsid, tm->tm_dev, tmpfsfstype); - tm->tm_mntpath = tmp_memalloc(dpn.pn_pathlen + 1, TMP_MUSTHAVE); + tm->tm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP); (void) strcpy(tm->tm_mntpath, dpn.pn_path); /* + * Preemptively set vfs_zone before any of the tmp_kmem_* functions are + * called. That field is not populated until after a successful + * VFS_MOUNT when domount() sets vfsp metadata via vfs_add(). An + * accurate value is required for proper swap usage accounting. 
+ */ + ASSERT0(uap->flags & MS_REMOUNT); + ASSERT(vfsp->vfs_zone == NULL); + vfsp->vfs_zone = curproc->p_zone; + + /* * allocate and initialize root tmpnode structure */ bzero(&rattr, sizeof (struct vattr)); rattr.va_mode = (mode_t)(S_IFDIR | root_mode); rattr.va_type = VDIR; rattr.va_rdev = 0; - tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE); + tp = tmp_kmem_zalloc(tm, sizeof (struct tmpnode), KM_SLEEP); + if (tp == NULL) { + kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1); + mutex_destroy(&tm->tm_contents); + mutex_destroy(&tm->tm_renamelck); + kmem_free(tm, sizeof (struct tmount)); + + pn_free(&dpn); + error = ENOMEM; + goto out; + } tmpnode_init(tm, tp, &rattr, cr); /* @@ -392,12 +427,34 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) tp->tn_nlink = 0; tm->tm_rootnode = tp; - tdirinit(tp, tp); + if (tdirinit(tp, tp) != 0) { + /* + * While we would normally let our VOP_INACTIVE function take + * care of cleaning up here, we're in a bit of a delicate + * situation, so we do so manually. While it's tempting to try + * and rely upon tmpfs_freevfs() and others, it's probably safer + * for the time to do this manually at the cost of duplication. 
+ */ + vn_invalid(TNTOV(tp)); + rw_destroy(&tp->tn_rwlock); + mutex_destroy(&tp->tn_tlock); + vn_free(TNTOV(tp)); + tmp_kmem_free(tm, tp, sizeof (struct tmpnode)); + + kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1); + mutex_destroy(&tm->tm_contents); + mutex_destroy(&tm->tm_renamelck); + kmem_free(tm, sizeof (struct tmount)); + pn_free(&dpn); + error = ENOMEM; + goto out; + } rw_exit(&tp->tn_rwlock); pn_free(&dpn); error = 0; + atomic_inc_32(&tmpfs_mountcount); out: if (error == 0) @@ -413,36 +470,107 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) struct tmpnode *tnp, *cancel; struct vnode *vp; int error; + uint_t cnt; + int i; if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) return (error); - /* - * forced unmount is not supported by this file system - * and thus, ENOTSUP, is being returned. - */ - if (flag & MS_FORCE) - return (ENOTSUP); - mutex_enter(&tm->tm_contents); /* - * If there are no open files, only the root node should have - * a reference count. + * In the normal unmount case (non-forced unmount), if there are no + * open files, only the root node should have a reference count. + * * With tm_contents held, nothing can be added or removed. * There may be some dirty pages. To prevent fsflush from * disrupting the unmount, put a hold on each node while scanning. * If we find a previously referenced node, undo the holds we have * placed and fail EBUSY. + * + * However, in the case of a forced umount, things are a bit different. + * An additional VFS_HOLD is added for each outstanding VN_HOLD to + * ensure that the file system is not cleaned up (tmp_freevfs) until + * the last vfs hold is dropped. This happens in tmp_inactive as the + * vnodes are released. Also, we can't add an additional VN_HOLD in + * this case since that would prevent tmp_inactive from ever being + * called. Finally, we do need to drop the zone ref now (zone_rele_ref) + * so that the zone is not blocked waiting for the final file system + * cleanup. 
*/ tnp = tm->tm_rootnode; - if (TNTOV(tnp)->v_count > 1) { + + vp = TNTOV(tnp); + mutex_enter(&vp->v_lock); + cnt = vp->v_count; + if (flag & MS_FORCE) { + vfsp->vfs_flag |= VFS_UNMOUNTED; + /* Extra hold which we rele below when we drop the zone ref */ + VFS_HOLD(vfsp); + + for (i = 1; i < cnt; i++) + VFS_HOLD(vfsp); + + /* drop the mutex now because no one can find this mount */ + mutex_exit(&tm->tm_contents); + } else if (cnt > 1) { + mutex_exit(&vp->v_lock); mutex_exit(&tm->tm_contents); return (EBUSY); } + mutex_exit(&vp->v_lock); + /* + * Check for open files. An open file causes everything to unwind + * unless this is a forced umount. + */ for (tnp = tnp->tn_forw; tnp; tnp = tnp->tn_forw) { - if ((vp = TNTOV(tnp))->v_count > 0) { + vp = TNTOV(tnp); + mutex_enter(&vp->v_lock); + cnt = vp->v_count; + if (flag & MS_FORCE) { + for (i = 0; i < cnt; i++) + VFS_HOLD(vfsp); + + /* + * In the case of a forced umount don't add an + * additional VN_HOLD on the already held vnodes, like + * we do in the non-forced unmount case. If the + * cnt > 0, then the vnode already has at least one + * hold and we need tmp_inactive to get called when the + * last pre-existing hold on the node is released so + * that we can VFS_RELE the VFS holds we just added. + */ + if (cnt == 0) { + /* directly add VN_HOLD since have the lock */ + vp->v_count++; + } + + mutex_exit(&vp->v_lock); + + /* + * If the tmpnode has any pages associated with it + * (i.e. if it's a normal file with non-zero size), the + * tmpnode could still be discovered by pageout or + * fsflush via the page vnode pointers. To prevent this + * from interfering with the tmp_freevfs, truncate the + * tmpnode now. 
+ */ + if (tnp->tn_size != 0 && tnp->tn_type == VREG) { + rw_enter(&tnp->tn_rwlock, RW_WRITER); + rw_enter(&tnp->tn_contents, RW_WRITER); + + (void) tmpnode_trunc(tm, tnp, 0); + + rw_exit(&tnp->tn_contents); + rw_exit(&tnp->tn_rwlock); + + ASSERT(tnp->tn_size == 0); + ASSERT(tnp->tn_nblocks == 0); + } + } else if (cnt > 0) { + /* An open file; unwind the holds we've been adding. */ + mutex_exit(&vp->v_lock); cancel = tm->tm_rootnode->tn_forw; while (cancel != tnp) { vp = TNTOV(cancel); @@ -452,14 +580,50 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) } mutex_exit(&tm->tm_contents); return (EBUSY); + } else { + /* directly add a VN_HOLD since we have the lock */ + vp->v_count++; + mutex_exit(&vp->v_lock); } - VN_HOLD(vp); } - /* - * We can drop the mutex now because no one can find this mount - */ - mutex_exit(&tm->tm_contents); + if (flag & MS_FORCE) { + /* + * Drop the zone ref now since we don't know how long it will + * be until the final vfs_rele is called by tmp_inactive. + */ + if (vfsp->vfs_zone) { + zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref, + ZONE_REF_VFS); + vfsp->vfs_zone = 0; + } + /* We can now drop the extra hold we added above. */ + VFS_RELE(vfsp); + } else { + /* + * For the non-forced case, we can drop the mutex now because + * no one can find this mount anymore + */ + vfsp->vfs_flag |= VFS_UNMOUNTED; + mutex_exit(&tm->tm_contents); + } + + return (0); +} + +/* + * Implementation of VFS_FREEVFS() to support forced umounts. This is called by + * the vfs framework after umount and the last VFS_RELE, to trigger the release + * of any resources still associated with the given vfs_t. We only add + * additional VFS_HOLDs during the forced umount case, so this is normally + * called immediately after tmp_umount. 
+ */ +void +tmp_freevfs(vfs_t *vfsp) +{ + struct tmount *tm = (struct tmount *)VFSTOTM(vfsp); + struct tmpnode *tnp; + struct vnode *vp; /* * Free all kmemalloc'd and anonalloc'd memory associated with @@ -469,6 +633,16 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) * tmpnode_free which assumes that the directory entry has been * removed before the file. */ + + /* + * Now that we are tearing ourselves down we need to remove the + * UNMOUNTED flag. If we don't, we'll later hit a VN_RELE when we remove + * files from the system causing us to have a negative value. Doing this + * seems a bit better than trying to set a flag on the tmount that says + * we're tearing down. + */ + vfsp->vfs_flag &= ~VFS_UNMOUNTED; + /* * Remove all directory entries */ @@ -535,15 +709,16 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) ASSERT(tm->tm_mntpath); - tmp_memfree(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1); + kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1); ASSERT(tm->tm_anonmem == 0); mutex_destroy(&tm->tm_contents); mutex_destroy(&tm->tm_renamelck); - tmp_memfree(tm, sizeof (struct tmount)); + kmem_free(tm, sizeof (struct tmount)); - return (0); + /* Allow _fini() to succeed now */ + atomic_dec_32(&tmpfs_mountcount); } /* @@ -605,18 +780,19 @@ tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp) * If tm_anonmax for this mount is less than the available swap space * (minus the amount tmpfs can't use), use that instead */ - if (blocks > tmpfs_minfree) + if (blocks > tmpfs_minfree && tm->tm_anonmax > tm->tm_anonmem) { sbp->f_bfree = MIN(blocks - tmpfs_minfree, - tm->tm_anonmax - tm->tm_anonmem); - else + btop(tm->tm_anonmax) - btopr(tm->tm_anonmem)); + } else { sbp->f_bfree = 0; + } sbp->f_bavail = sbp->f_bfree; /* * Total number of blocks is what's available plus what's been used */ - sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree + tm->tm_anonmem); + sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree + btopr(tm->tm_anonmem)); if (eff_zid != 
GLOBAL_ZONEUNIQID && zp->zone_max_swap_ctl != UINT64_MAX) { @@ -646,13 +822,7 @@ tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp) * available to tmpfs. This is fairly inaccurate since it doesn't * take into account the names stored in the directory entries. */ - if (tmpfs_maxkmem > tmp_kmemspace) - sbp->f_ffree = (tmpfs_maxkmem - tmp_kmemspace) / - (sizeof (struct tmpnode) + sizeof (struct tdirent)); - else - sbp->f_ffree = 0; - - sbp->f_files = tmpfs_maxkmem / + sbp->f_ffree = sbp->f_files = ptob(availrmem) / (sizeof (struct tmpnode) + sizeof (struct tdirent)); sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); (void) cmpldev(&d32, vfsp->vfs_dev); diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c index a09f206d88..cbe19aefea 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2015, Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright 2016 RackTop Systems. * Copyright (c) 2017 by Delphix. All rights reserved. @@ -586,6 +586,10 @@ tmp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred, struct tmount *tm = (struct tmount *)VTOTM(vp); int error; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + /* * We don't currently support reading non-regular files */ @@ -615,6 +619,10 @@ tmp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred, struct tmount *tm = (struct tmount *)VTOTM(vp); int error; + /* If the filesystem was umounted by force, return immediately. 
*/ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + /* * We don't currently support writing to non-regular files */ @@ -788,8 +796,13 @@ tmp_setattr( rw_exit(&tp->tn_contents); rw_exit(&tp->tn_rwlock); - if (error == 0 && vap->va_size == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (vap->va_size == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } goto out1; } @@ -835,6 +848,9 @@ tmp_lookup( struct tmpnode *ntp = NULL; int error; + /* If the filesystem was umounted by force, return immediately. */ + if (dvp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); /* allow cd into @ dir */ if (flags & LOOKUP_XATTR) { @@ -853,6 +869,8 @@ tmp_lookup( rw_enter(&tp->tn_rwlock, RW_WRITER); if (tp->tn_xattrdp == NULL) { + int err; + if (!(flags & CREATE_XATTR_DIR)) { rw_exit(&tp->tn_rwlock); return (ENOENT); @@ -873,9 +891,13 @@ tmp_lookup( return (error); } - xdp = tmp_memalloc(sizeof (struct tmpnode), - TMP_MUSTHAVE); tm = VTOTM(dvp); + xdp = tmp_kmem_zalloc(tm, sizeof (struct tmpnode), + KM_SLEEP); + if (xdp == NULL) { + rw_exit(&tp->tn_rwlock); + return (ENOSPC); + } tmpnode_init(tm, xdp, &tp->tn_attr, NULL); /* * Fix-up fields unique to attribute directories. @@ -893,7 +915,16 @@ tmp_lookup( } xdp->tn_vnode->v_type = VDIR; xdp->tn_vnode->v_flag |= V_XATTRDIR; - tdirinit(tp, xdp); + if ((err = tdirinit(tp, xdp)) != 0) { + rw_exit(&tp->tn_rwlock); + /* + * This never got properly initialized so we can + * just clean it up. + */ + xdp->tn_vnode->v_flag &= V_XATTRDIR; + tmpnode_cleanup(tp); + return (err); + } tp->tn_xattrdp = xdp; } else { VN_HOLD(tp->tn_xattrdp->tn_vnode); @@ -1302,10 +1333,8 @@ tmp_rename( vnevent_rename_src(TNTOV(fromtp), odvp, onm, ct); /* * vnevent_rename_dest is called in tdirenter(). - * Notify the target dir if not same as source dir. 
*/ - if (ndvp != odvp) - vnevent_rename_dest_dir(ndvp, ct); + vnevent_rename_dest_dir(ndvp, TNTOV(fromtp), nnm, ct); } done: @@ -1474,6 +1503,10 @@ tmp_readdir( int reclen; caddr_t outbuf; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + if (uiop->uio_loffset >= MAXOFF_T) { if (eofp) *eofp = 1; @@ -1607,12 +1640,12 @@ tmp_symlink( rw_exit(&parent->tn_rwlock); if (error) { - if (self) + if (self != NULL) tmpnode_rele(self); return (error); } len = strlen(tnm) + 1; - cp = tmp_memalloc(len, 0); + cp = tmp_kmem_zalloc(tm, len, KM_NOSLEEP_LAZY); if (cp == NULL) { tmpnode_rele(self); return (ENOSPC); @@ -1677,10 +1710,27 @@ top: * there's little to do -- just drop our hold. */ if (vp->v_count > 1 || tp->tn_nlink != 0) { - VN_RELE_LOCKED(vp); + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) { + /* + * Since the file system was forcibly unmounted, we can + * have a case (v_count == 1, tn_nlink != 0) where this + * file was open so we didn't add an extra hold on the + * file in tmp_unmount. We are counting on the + * interaction of the hold made in tmp_unmount and + * rele-ed in tmp_vfsfree so we need to be sure we + * don't decrement in this case. 
+ */ + if (vp->v_count > 1) + VN_RELE_LOCKED(vp); + } else { + VN_RELE_LOCKED(vp); + } mutex_exit(&vp->v_lock); mutex_exit(&tp->tn_tlock); rw_exit(&tp->tn_rwlock); + /* If the filesystem was umounted by force, rele the vfs ref */ + if (tm->tm_vfsp->vfs_flag & VFS_UNMOUNTED) + VFS_RELE(tm->tm_vfsp); return; } @@ -1705,7 +1755,7 @@ top: goto top; } if (tp->tn_type == VLNK) - tmp_memfree(tp->tn_symlink, tp->tn_size + 1); + tmp_kmem_free(tm, tp->tn_symlink, tp->tn_size + 1); } /* @@ -1739,7 +1789,11 @@ top: rw_destroy(&tp->tn_rwlock); mutex_destroy(&tp->tn_tlock); vn_free(TNTOV(tp)); - tmp_memfree(tp, sizeof (struct tmpnode)); + tmp_kmem_free(tm, tp, sizeof (struct tmpnode)); + + /* If the filesystem was umounted by force, rele the vfs ref */ + if (tm->tm_vfsp->vfs_flag & VFS_UNMOUNTED) + VFS_RELE(tm->tm_vfsp); } /* ARGSUSED2 */ @@ -1861,6 +1915,10 @@ tmp_getapage( struct vnode *pvp; u_offset_t poff; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + if (protp != NULL) *protp = PROT_ALL; again: @@ -2082,6 +2140,10 @@ tmp_putapage( u_offset_t offset; u_offset_t tmpoff; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + ASSERT(PAGE_LOCKED(pp)); /* Kluster in tmp_klustsize chunks */ @@ -2342,8 +2404,13 @@ tmp_space( return (EFBIG); error = tmp_freesp(vp, bfp, flag); - if (error == 0 && bfp->l_start == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (bfp->l_start == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } return (error); } diff --git a/usr/src/uts/common/fs/udfs/udf_dir.c b/usr/src/uts/common/fs/udfs/udf_dir.c index c1e2c74a87..def046a0bf 100644 --- a/usr/src/uts/common/fs/udfs/udf_dir.c +++ b/usr/src/uts/common/fs/udfs/udf_dir.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. 
All rights reserved. */ #include <sys/types.h> @@ -562,9 +563,8 @@ out: namep, ctp); } - if (sdp != tdp) { - vnevent_rename_dest_dir(ITOV(tdp), ctp); - } + vnevent_rename_dest_dir(ITOV(tdp), ITOV(tip), + namep, ctp); } /* diff --git a/usr/src/uts/common/fs/udfs/udf_vnops.c b/usr/src/uts/common/fs/udfs/udf_vnops.c index 054056c63a..51ce9b28af 100644 --- a/usr/src/uts/common/fs/udfs/udf_vnops.c +++ b/usr/src/uts/common/fs/udfs/udf_vnops.c @@ -569,8 +569,11 @@ udf_setattr( goto update_inode; } - if (vap->va_size == 0) + if (vap->va_size == 0) { vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } } /* * Change file access or modified times. @@ -1649,8 +1652,13 @@ udf_space( } else if ((error = convoff(vp, bfp, 0, offset)) == 0) { error = ud_freesp(vp, bfp, flag, cr); - if (error == 0 && bfp->l_start == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (bfp->l_start == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } return (error); diff --git a/usr/src/uts/common/fs/ufs/ufs_vnops.c b/usr/src/uts/common/fs/ufs/ufs_vnops.c index 2be623f755..8aa961e340 100644 --- a/usr/src/uts/common/fs/ufs/ufs_vnops.c +++ b/usr/src/uts/common/fs/ufs/ufs_vnops.c @@ -2084,8 +2084,13 @@ again: goto update_inode; } - if (error == 0 && vap->va_size) - vnevent_truncate(vp, ct); + if (error == 0) { + if (vap->va_size) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } if (ulp) { @@ -3610,12 +3615,7 @@ retry_firstlock: if (error == 0) { vnevent_rename_src(ITOV(sip), sdvp, snm, ct); - /* - * Notify the target directory of the rename event - * if source and target directories are not the same. 
- */ - if (sdvp != tdvp) - vnevent_rename_dest_dir(tdvp, ct); + vnevent_rename_dest_dir(tdvp, ITOV(sip), tnm, ct); } errout: @@ -4350,8 +4350,13 @@ ufs_space(struct vnode *vp, int cmd, struct flock64 *bfp, int flag, return (error); error = ufs_freesp(vp, bfp, flag, cr); - if (error == 0 && bfp->l_start == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (bfp->l_start == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } else if (cmd == F_ALLOCSP) { error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_FALLOCATE_MASK); diff --git a/usr/src/uts/common/fs/vfs.c b/usr/src/uts/common/fs/vfs.c index 3cd2feebef..460d15bcbd 100644 --- a/usr/src/uts/common/fs/vfs.c +++ b/usr/src/uts/common/fs/vfs.c @@ -857,9 +857,11 @@ vfs_mountroot(void) for (p = practive; p != NULL; p = p->p_next) { ASSERT(p == &p0 || p->p_parent == &p0); + mutex_enter(&p->p_lock); PTOU(p)->u_cdir = rootdir; VN_HOLD(PTOU(p)->u_cdir); PTOU(p)->u_rdir = NULL; + mutex_exit(&p->p_lock); } mutex_exit(&pidlock); @@ -3883,6 +3885,8 @@ vfs_to_modname(const char *vfstype) vfstype = "fdfs"; } else if (strncmp(vfstype, "nfs", 3) == 0) { vfstype = "nfs"; + } else if (strcmp(vfstype, "lxproc") == 0) { + vfstype = "lxprocfs"; } return (vfstype); diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c index 4e73f7f6e6..953ee80471 100644 --- a/usr/src/uts/common/fs/vnode.c +++ b/usr/src/uts/common/fs/vnode.c @@ -25,6 +25,7 @@ * Copyright 2022 Spencer Evans-Cole. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. 
*/ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -209,6 +210,11 @@ static void (**vsd_destructor)(void *); cr = crgetmapped(cr); \ } +#define VOP_LATENCY_10MS 10000000 +#define VOP_LATENCY_100MS 100000000 +#define VOP_LATENCY_1S 1000000000 +#define VOP_LATENCY_10S 10000000000 + /* * Convert stat(2) formats to vnode types and vice versa. (Knows about * numerical order of S_IFMT and vnode types.) @@ -849,6 +855,36 @@ vn_rele(vnode_t *vp) mutex_exit(&vp->v_lock); } +void +vn_phantom_rele(vnode_t *vp) +{ + mutex_enter(&vp->v_lock); + VERIFY3U(vp->v_count, >=, vp->v_phantom_count); + vp->v_phantom_count--; + DTRACE_PROBE1(vn__phantom_rele, vnode_t *, vp); + if (vp->v_count == 1) { + ASSERT0(vp->v_phantom_count); + mutex_exit(&vp->v_lock); + VOP_INACTIVE(vp, CRED(), NULL); + return; + } + VERIFY(vp->v_count > 0); + VN_RELE_LOCKED(vp); + mutex_exit(&vp->v_lock); +} + +/* + * Return the number of non-phantom holds. Things such as portfs will use + * phantom holds to prevent it from blocking filesystems from mounting over + * watched directories. + */ +uint_t +vn_count(vnode_t *vp) +{ + ASSERT(MUTEX_HELD(&vp->v_lock)); + return (vp->v_count - vp->v_phantom_count); +} + /* * Release a vnode referenced by the DNLC. Multiple DNLC references are treated * as a single reference, so v_count is not decremented until the last DNLC hold @@ -1130,7 +1166,20 @@ top: * Do remaining checks for FNOFOLLOW and FNOLINKS. */ if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) { - error = ELOOP; + /* + * The __FLXPATH flag is a private interface for use by the lx + * brand in order to emulate open(O_NOFOLLOW|O_PATH) which, + * when a symbolic link is encountered, returns a file + * descriptor which references it. + * See uts/common/brand/lx/syscall/lx_open.c + * + * When this flag is set, VOP_OPEN() is not called (for a + * symlink, most filesystems will return ENOSYS anyway) + * and the link's vnode is returned to be linked to the + * file descriptor. 
+ */ + if ((filemode & __FLXPATH) == 0) + error = ELOOP; goto out; } if (filemode & FNOLINKS) { @@ -2441,6 +2490,7 @@ vn_reinit(vnode_t *vp) { vp->v_count = 1; vp->v_count_dnlc = 0; + vp->v_phantom_count = 0; vp->v_vfsp = NULL; vp->v_stream = NULL; vp->v_vfsmountedhere = NULL; @@ -2497,6 +2547,7 @@ vn_free(vnode_t *vp) */ ASSERT((vp->v_count == 0) || (vp->v_count == 1)); ASSERT(vp->v_count_dnlc == 0); + ASSERT0(vp->v_phantom_count); VERIFY(vp->v_path != NULL); if (vp->v_path != vn_vpath_empty) { kmem_free(vp->v_path, strlen(vp->v_path) + 1); @@ -2587,6 +2638,7 @@ vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct) if (vp == NULL || vp->v_femhead == NULL) { return; } + (void) VOP_VNEVENT(dvp, VE_RENAME_SRC_DIR, vp, name, ct); (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct); } @@ -2601,12 +2653,13 @@ vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name, } void -vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct) +vnevent_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name, + caller_context_t *ct) { if (vp == NULL || vp->v_femhead == NULL) { return; } - (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct); + (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, nvp, name, ct); } void @@ -2693,6 +2746,15 @@ vnevent_truncate(vnode_t *vp, caller_context_t *ct) (void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct); } +void +vnevent_resize(vnode_t *vp, caller_context_t *ct) +{ + if (vp == NULL || vp->v_femhead == NULL) { + return; + } + (void) VOP_VNEVENT(vp, VE_RESIZE, NULL, NULL, ct); +} + /* * Vnode accessors. 
*/ @@ -3468,14 +3530,58 @@ fop_read( cred_t *cr, caller_context_t *ct) { - int err; ssize_t resid_start = uiop->uio_resid; + zone_t *zonep = curzone; + zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats; + + hrtime_t start = 0, lat; + ssize_t len; + int err; + + if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) && + vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) { + start = gethrtime(); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_runq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + } VOPXID_MAP_CR(vp, cr); err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct); - VOPSTATS_UPDATE_IO(vp, read, - read_bytes, (resid_start - uiop->uio_resid)); + len = resid_start - uiop->uio_resid; + + VOPSTATS_UPDATE_IO(vp, read, read_bytes, len); + + if (start != 0) { + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.reads++; + zonep->zone_vfs_rwstats.nread += len; + kstat_runq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + if (lat < VOP_LATENCY_100MS) + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + } + return (err); } @@ -3487,14 +3593,63 @@ fop_write( cred_t *cr, caller_context_t *ct) { - int err; ssize_t resid_start = uiop->uio_resid; + zone_t *zonep = curzone; + zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats; + + hrtime_t start = 0, lat; + ssize_t len; + int err; + + /* + * For the purposes of VFS kstat consumers, the 
"waitq" calculation is + * repurposed as the active queue for VFS write operations. There's no + * actual wait queue for VFS operations. + */ + if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) && + vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) { + start = gethrtime(); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_waitq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + } VOPXID_MAP_CR(vp, cr); err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct); - VOPSTATS_UPDATE_IO(vp, write, - write_bytes, (resid_start - uiop->uio_resid)); + len = resid_start - uiop->uio_resid; + + VOPSTATS_UPDATE_IO(vp, write, write_bytes, len); + + if (start != 0) { + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.writes++; + zonep->zone_vfs_rwstats.nwritten += len; + kstat_waitq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + if (lat < VOP_LATENCY_100MS) + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + } + return (err); } diff --git a/usr/src/uts/common/fs/zfs/abd.c b/usr/src/uts/common/fs/zfs/abd.c index b2004f3d42..b841a8f38e 100644 --- a/usr/src/uts/common/fs/zfs/abd.c +++ b/usr/src/uts/common/fs/zfs/abd.c @@ -171,7 +171,10 @@ int zfs_abd_scatter_min_size = 512 * 3; * it at runtime would cause ABD iteration to work incorrectly for ABDs which * were allocated with the old size, so a safeguard has been put in place which * will 
cause the machine to panic if you change it and try to access the data - * within a scattered ABD. + * within a scattered ABD. Note that tuning this value to be smaller than the + * page size can induce heavy fragmentation in the slab layer, which may itself + * result in more memory waste than is saved by the smaller chunk size -- and + * will induces more computational work in the slab layer. Tune with caution! */ size_t zfs_abd_chunk_size = 4096; diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index bf8b77f268..12b5872cdc 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -284,6 +284,7 @@ #include <sys/vdev.h> #include <sys/vdev_impl.h> #include <sys/dsl_pool.h> +#include <sys/zfs_zone.h> #include <sys/zio_checksum.h> #include <sys/multilist.h> #include <sys/abd.h> @@ -349,7 +350,7 @@ int arc_grow_retry = 60; int arc_kmem_cache_reap_retry_ms = 1000; /* shift of arc_c for calculating overflow limit in arc_get_data_impl */ -int zfs_arc_overflow_shift = 8; +int zfs_arc_overflow_shift = 3; /* shift of arc_c for calculating both min and max arc_p */ int arc_p_min_shift = 4; @@ -6112,6 +6113,14 @@ top: if (hash_lock != NULL) mutex_exit(hash_lock); + /* + * At this point, this read I/O has already missed in the ARC + * and will be going through to the disk. The I/O throttle + * should delay this I/O if this zone is using more than its I/O + * priority allows. 
+ */ + zfs_zone_io_throttle(ZFS_ZONE_IOP_READ); + if (*arc_flags & ARC_FLAG_WAIT) return (zio_wait(rzio)); @@ -7168,6 +7177,10 @@ arc_init(void) if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) arc_c_min = arc_meta_limit / 2; + /* On larger-memory machines, we clamp the minimum at 1GB */ + if (zfs_arc_min == 0) + arc_c_min = MIN(arc_c_min, (1 << 30)); + if (zfs_arc_meta_min > 0) { arc_meta_min = zfs_arc_meta_min; } else { diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index f610268bf4..38c4a83cb1 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -1125,8 +1125,17 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, arc_space_consume(max_bonuslen, ARC_SPACE_BONUS); if (bonuslen < max_bonuslen) bzero(db->db.db_data, max_bonuslen); - if (bonuslen) - bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); + if (bonuslen) { + /* + * Absent byzantine on-disk corruption, we fully expect + * our bonuslen to be no more than max_bonuslen -- + * but we nonetheless explicitly clamp it on the bcopy() + * to prevent any on-disk corruption from becoming + * rampant in-kernel corruption. + */ + bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, + MIN(bonuslen, max_bonuslen)); + } DB_DNODE_EXIT(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c index b7135df3fa..d91a48e2ca 100644 --- a/usr/src/uts/common/fs/zfs/dmu_send.c +++ b/usr/src/uts/common/fs/zfs/dmu_send.c @@ -22,7 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. * Copyright 2016 RackTop Systems. 
* Copyright (c) 2014 Integros [integros.com] diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c index 53d5765bcb..6cb39d61a5 100644 --- a/usr/src/uts/common/fs/zfs/dmu_tx.c +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c @@ -39,11 +39,11 @@ #include <sys/sa_impl.h> #include <sys/zfs_context.h> #include <sys/varargs.h> +#include <sys/zfs_zone.h> typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, uint64_t arg1, uint64_t arg2); - dmu_tx_t * dmu_tx_create_dd(dsl_dir_t *dd) { @@ -213,6 +213,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) if (len == 0) return; + zfs_zone_io_throttle(ZFS_ZONE_IOP_LOGICAL_WRITE); + (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG); if (zfs_refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS) diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c index 02cad5f98e..c3d24abb3d 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dir.c +++ b/usr/src/uts/common/fs/zfs/dsl_dir.c @@ -43,6 +43,7 @@ #include <sys/zio.h> #include <sys/arc.h> #include <sys/sunddi.h> +#include <sys/zfs_zone.h> #include <sys/zfeature.h> #include <sys/policy.h> #include <sys/zfs_znode.h> @@ -1413,7 +1414,7 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, * locks are held. 
*/ txg_delay(dd->dd_pool, tx->tx_txg, - MSEC2NSEC(10), MSEC2NSEC(10)); + zfs_zone_txg_delay(), MSEC2NSEC(10)); err = SET_ERROR(ERESTART); } } diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c index bc6f9aff77..d3901c6f79 100644 --- a/usr/src/uts/common/fs/zfs/dsl_pool.c +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -44,6 +44,7 @@ #include <sys/zfs_znode.h> #include <sys/spa_impl.h> #include <sys/dsl_deadlist.h> +#include <sys/zfs_zone.h> #include <sys/vdev_impl.h> #include <sys/metaslab_impl.h> #include <sys/bptree.h> @@ -905,7 +906,7 @@ dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg) } ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space); dp->dp_dirty_pertxg[txg & TXG_MASK] -= space; - ASSERT3U(dp->dp_dirty_total, >=, space); + VERIFY3U(dp->dp_dirty_total, >=, space); dsl_pool_dirty_delta(dp, -space); mutex_exit(&dp->dp_lock); } diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 68733f47c1..4828824b10 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -23,6 +23,7 @@ * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] + * Copyright 2019 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. */ @@ -71,6 +72,11 @@ int zfs_metaslab_sm_blksz_with_log = (1 << 17); int zfs_condense_pct = 200; /* + * Never condense any space map. This is for debugging/recovery only. + */ +int zfs_condense_never = 0; + +/* * Condensing a metaslab is not guaranteed to actually reduce the amount of * space used on disk. 
In particular, a space map uses data in increments of * MAX(1 << ashift, space_map_blksize), so a metaslab might use the @@ -863,6 +869,7 @@ metaslab_group_activate(metaslab_group_t *mg) { metaslab_class_t *mc = mg->mg_class; metaslab_group_t *mgprev, *mgnext; + char kstat_name[KSTAT_STRLEN]; ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0); @@ -887,6 +894,33 @@ metaslab_group_activate(metaslab_group_t *mg) mgprev->mg_next = mg; mgnext->mg_prev = mg; } + + /* Create a kstat to monitor the loading and unloading of metaslabs. */ + (void) snprintf(kstat_name, sizeof (kstat_name), "%llx", + (unsigned long long) mg->mg_vd->vdev_guid); + + mutex_init(&mg->mg_kstat_lock, NULL, MUTEX_DEFAULT, NULL); + if ((mg->mg_kstat = kstat_create("zfs_metaslab_group", 0, + kstat_name, "misc", KSTAT_TYPE_NAMED, + sizeof (metaslab_group_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL)) != NULL) { + + metaslab_group_kstat_t *mg_kstat = kmem_zalloc( + sizeof (metaslab_group_kstat_t), KM_SLEEP); + kstat_named_init(&mg_kstat->mg_loads, "loads", + KSTAT_DATA_UINT64); + kstat_named_init(&mg_kstat->mg_unloads, "unloads", + KSTAT_DATA_UINT64); + kstat_named_init(&mg_kstat->mg_spa_name, "spa_name", + KSTAT_DATA_STRING); + kstat_named_setstr(&mg_kstat->mg_spa_name, + mg->mg_vd->vdev_spa->spa_name); + + mg->mg_kstat->ks_data = mg_kstat; + mg->mg_kstat->ks_lock = &mg->mg_kstat_lock; + kstat_install(mg->mg_kstat); + } + mc->mc_rotor = mg; } @@ -963,6 +997,14 @@ metaslab_group_passivate(metaslab_group_t *mg) mg->mg_prev = NULL; mg->mg_next = NULL; + + if (mg->mg_kstat != NULL) { + metaslab_group_kstat_t *data = mg->mg_kstat->ks_data; + + kstat_delete(mg->mg_kstat); + kmem_free(data, sizeof (metaslab_group_kstat_t)); + } + mutex_destroy(&mg->mg_kstat_lock); } boolean_t @@ -2400,6 +2442,7 @@ metaslab_load_impl(metaslab_t *msp) int metaslab_load(metaslab_t *msp) { + kstat_t *ksp; ASSERT(MUTEX_HELD(&msp->ms_lock)); /* @@ -2412,6 +2455,12 @@ metaslab_load(metaslab_t *msp) 
VERIFY(!msp->ms_loading); ASSERT(!msp->ms_condensing); + ksp = msp->ms_group->mg_kstat; + if (ksp != NULL) { + metaslab_group_kstat_t *mg_ksp = ksp->ks_data; + atomic_inc_64(&mg_ksp->mg_loads.value.ui64); + } + /* * We set the loading flag BEFORE potentially dropping the lock to * wait for an ongoing flush (see ms_flushing below). This way other @@ -4290,12 +4339,11 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) /* * If the metaslab is loaded and we've not tried to load or allocate - * from it in 'metaslab_unload_delay' txgs, then unload it. + * from it in 'metaslab_unload_delay' txgs, then we normally unload it. */ if (msp->ms_loaded && msp->ms_disabled == 0 && msp->ms_selected_txg + metaslab_unload_delay < txg) { - for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { VERIFY0(range_tree_space( msp->ms_allocating[(txg + t) & TXG_MASK])); @@ -4539,8 +4587,6 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); msp->ms_allocating_total += size; - /* Track the last successful allocation */ - msp->ms_alloc_txg = txg; metaslab_verify_space(msp, txg); } diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index d6e230fbb4..db3317e4cd 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -1961,6 +1961,12 @@ spa_check_for_missing_logs(spa_t *spa) if (idx > 0) { spa_load_failed(spa, "some log devices are missing"); vdev_dbgmsg_print_tree(rvd, 2); + + /* Save the timestamp of the last completed txg. 
*/ + VERIFY(nvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_LOAD_TIME, + spa->spa_last_ubsync_txg_ts) == 0); + return (SET_ERROR(ENXIO)); } } else { @@ -1969,10 +1975,21 @@ spa_check_for_missing_logs(spa_t *spa) if (tvd->vdev_islog && tvd->vdev_state == VDEV_STATE_CANT_OPEN) { + nvlist_t *rewind_info = fnvlist_alloc(); + spa_set_log_state(spa, SPA_LOG_CLEAR); spa_load_note(spa, "some log devices are " "missing, ZIL is dropped."); vdev_dbgmsg_print_tree(rvd, 2); + + VERIFY(nvlist_add_uint64(rewind_info, + ZPOOL_CONFIG_LOAD_TIME, + spa->spa_uberblock.ub_timestamp) == 0); + + VERIFY(nvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_REWIND_INFO, + rewind_info) == 0); + break; } } diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h index bec7bdef2e..6adc8fa14e 100644 --- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h @@ -283,8 +283,17 @@ struct metaslab_group { boolean_t mg_disabled_updating; kmutex_t mg_ms_disabled_lock; kcondvar_t mg_ms_disabled_cv; + + kstat_t *mg_kstat; + kmutex_t mg_kstat_lock; }; +typedef struct metaslab_group_kstat { + kstat_named_t mg_loads; + kstat_named_t mg_unloads; + kstat_named_t mg_spa_name; +} metaslab_group_kstat_t; + /* * This value defines the number of elements in the ms_lbas array. The value * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX. 
@@ -491,7 +500,6 @@ struct metaslab { hrtime_t ms_unload_time; /* time last unloaded */ hrtime_t ms_selected_time; /* time last allocated from */ - uint64_t ms_alloc_txg; /* last successful alloc (debug only) */ uint64_t ms_max_size; /* maximum allocatable size */ /* diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index d542368e7c..d760127ed9 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -151,6 +151,7 @@ struct vdev_queue { avl_tree_t vq_write_offset_tree; avl_tree_t vq_trim_offset_tree; uint64_t vq_last_offset; + zoneid_t vq_last_zone_id; hrtime_t vq_io_complete_ts; /* time last i/o completed */ kmutex_t vq_lock; }; diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_zone.h b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h new file mode 100644 index 0000000000..f1431b3f55 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2015, Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _SYS_FS_ZFS_ZONE_H +#define _SYS_FS_ZFS_ZONE_H + +#ifdef _KERNEL +#include <sys/isa_defs.h> +#include <sys/types32.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + ZFS_ZONE_IOP_READ = 0, + ZFS_ZONE_IOP_WRITE, + ZFS_ZONE_IOP_LOGICAL_WRITE, +} zfs_zone_iop_type_t; + +extern void zfs_zone_io_throttle(zfs_zone_iop_type_t); + +extern void zfs_zone_zio_init(zio_t *); +extern void zfs_zone_zio_start(zio_t *); +extern void zfs_zone_zio_done(zio_t *); +extern void zfs_zone_zio_dequeue(zio_t *); +extern void zfs_zone_zio_enqueue(zio_t *); +extern void zfs_zone_report_txg_sync(void *); +extern hrtime_t zfs_zone_txg_delay(); +#ifdef _KERNEL +extern zio_t *zfs_zone_schedule(vdev_queue_t *, zio_priority_t, avl_index_t, + avl_tree_t *); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_ZONE_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index d03106b942..7592614d6d 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -394,8 +394,14 @@ typedef int zio_pipe_stage_t(zio_t *zio); * the reexecute flags are protected by io_lock, modifiable by children, * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set. */ -#define ZIO_REEXECUTE_NOW 0x01 -#define ZIO_REEXECUTE_SUSPEND 0x02 +#define ZIO_REEXECUTE_NOW 0x01 +#define ZIO_REEXECUTE_SUSPEND 0x02 +#define ZIO_REEXECUTE_NO_SUSPEND 0x04 + +#define ZIO_SHOULD_REEXECUTE(x) \ + ((x)->io_reexecute & ZIO_REEXECUTE_NOW || \ + ((x)->io_reexecute & ZIO_REEXECUTE_SUSPEND && \ + (((x)->io_reexecute & ZIO_REEXECUTE_NO_SUSPEND) == 0))) /* * The io_trim flags are used to specify the type of TRIM to perform. 
They @@ -465,6 +471,7 @@ struct zio { hrtime_t io_timestamp; hrtime_t io_queued_timestamp; hrtime_t io_target_timestamp; + hrtime_t io_dispatched; /* time I/O was dispatched to disk */ hrtime_t io_delta; /* vdev queue service delta */ hrtime_t io_delay; /* Device access time (disk or */ /* file). */ @@ -500,6 +507,7 @@ struct zio { zio_cksum_report_t *io_cksum_report; uint64_t io_ena; + zoneid_t io_zoneid; /* zone which originated this I/O */ /* Taskq dispatching state */ taskq_ent_t io_tqent; }; diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c index a8670dcaa8..a99e581737 100644 --- a/usr/src/uts/common/fs/zfs/txg.c +++ b/usr/src/uts/common/fs/zfs/txg.c @@ -32,6 +32,7 @@ #include <sys/dsl_scan.h> #include <sys/zil.h> #include <sys/callb.h> +#include <sys/zfs_zone.h> /* * ZFS Transaction Groups @@ -535,6 +536,8 @@ txg_sync_thread(void *arg) txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); mutex_exit(&tx->tx_sync_lock); + zfs_zone_report_txg_sync(dp); + start = ddi_get_lbolt(); spa_sync(spa, txg); delta = ddi_get_lbolt() - start; diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index cd5e80d769..228529d9fe 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -28,6 +28,7 @@ */ #include <sys/zfs_context.h> +#include <sys/zfs_zone.h> #include <sys/spa_impl.h> #include <sys/refcount.h> #include <sys/vdev_impl.h> @@ -165,6 +166,8 @@ vdev_disk_off_finalize(ldi_handle_t lh __unused, ldi_ev_cookie_t ecookie, int ldi_result, void *arg, void *ev_data __unused) { vdev_t *vd = (vdev_t *)arg; + vdev_disk_t *dvd = vd->vdev_tsd; + vdev_disk_ldi_cb_t *lcb; /* * Ignore events other than offline. 
@@ -764,6 +767,7 @@ static void vdev_disk_close(vdev_t *vd) { vdev_disk_t *dvd = vd->vdev_tsd; + vdev_disk_ldi_cb_t *lcb; if (vd->vdev_reopening || dvd == NULL) return; @@ -1028,6 +1032,8 @@ vdev_disk_io_start(zio_t *zio) bp->b_bufsize = zio->io_size; bp->b_iodone = vdev_disk_io_intr; + zfs_zone_zio_start(zio); + /* * In general we would expect ldi_strategy() to return non-zero only * because of programming errors, but we've also seen this fail shortly @@ -1044,6 +1050,8 @@ vdev_disk_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; + zfs_zone_zio_done(zio); + /* * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if * the device has been removed. If this is the case, then we trigger an diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index 4c6515c43d..b40126cac0 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. 
*/ /* @@ -35,6 +36,7 @@ #include <sys/zio.h> #include <sys/avl.h> #include <sys/dsl_pool.h> +#include <sys/zfs_zone.h> #include <sys/metaslab_impl.h> #include <sys/abd.h> @@ -145,7 +147,7 @@ uint32_t zfs_vdev_sync_write_min_active = 10; uint32_t zfs_vdev_sync_write_max_active = 10; uint32_t zfs_vdev_async_read_min_active = 1; uint32_t zfs_vdev_async_read_max_active = 3; -uint32_t zfs_vdev_async_write_min_active = 1; +uint32_t zfs_vdev_async_write_min_active = 3; uint32_t zfs_vdev_async_write_max_active = 10; uint32_t zfs_vdev_scrub_min_active = 1; uint32_t zfs_vdev_scrub_max_active = 2; @@ -274,6 +276,8 @@ vdev_queue_init(vdev_t *vd) vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); + vq->vq_last_zone_id = 0; + for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { int (*compfn) (const void *, const void *); @@ -318,6 +322,7 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) spa_t *spa = zio->io_spa; ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + zfs_zone_zio_enqueue(zio); avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); avl_add(vdev_queue_type_tree(vq, zio->io_type), zio); @@ -334,6 +339,7 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) spa_t *spa = zio->io_spa; ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + zfs_zone_zio_dequeue(zio); avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio); @@ -732,7 +738,11 @@ again: search.io_timestamp = 0; search.io_offset = vq->vq_last_offset - 1; VERIFY3P(avl_find(tree, &search, &idx), ==, NULL); +#ifdef _KERNEL + zio = zfs_zone_schedule(vq, p, idx, tree); +#else zio = avl_nearest(tree, idx, AVL_AFTER); +#endif if (zio == NULL) zio = avl_first(tree); ASSERT3U(zio->io_priority, ==, p); @@ -890,9 +900,11 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) spa_t *spa = zio->io_spa; zio_priority_t oldpri = zio->io_priority; + zfs_zone_zio_dequeue(zio); 
avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); zio->io_priority = priority; avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); + zfs_zone_zio_enqueue(zio); mutex_enter(&spa->spa_iokstat_lock); ASSERT3U(spa->spa_queue_stats[oldpri].spa_queued, >, 0); diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index f479ea9f30..b74baf46ea 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -696,9 +696,10 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, * Check permissions for special properties. */ switch (prop) { + case ZFS_PROP_DEDUP: case ZFS_PROP_ZONED: /* - * Disallow setting of 'zoned' from within a local zone. + * Disallow setting these properties from within a local zone. */ if (!INGLOBALZONE(curproc)) return (SET_ERROR(EPERM)); @@ -1022,6 +1023,9 @@ zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; + if (secpolicy_fs_import(cr) != 0) + return (set_errno(EPERM)); + if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_RECEIVE, cr)) != 0) return (error); @@ -2162,7 +2166,8 @@ zfs_ioc_vdev_setfru(zfs_cmd_t *zc) } static int -zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) +zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os, + boolean_t cachedpropsonly) { int error = 0; nvlist_t *nv; @@ -2180,7 +2185,8 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) * XXX reading with out owning */ if (!zc->zc_objset_stats.dds_inconsistent && - dmu_objset_type(os) == DMU_OST_ZVOL) { + dmu_objset_type(os) == DMU_OST_ZVOL && + !cachedpropsonly) { error = zvol_get_stats(os, nv); if (error == EIO) return (error); @@ -2207,11 +2213,24 @@ static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { objset_t *os; + nvlist_t *nvl = NULL; + boolean_t cachedpropsonly = B_FALSE; int error; + if (zc->zc_nvlist_src != (uintptr_t)NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + 
zc->zc_iflags, &nvl) != 0)) + return (error); + + if (nvl != NULL) { + (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly", + &cachedpropsonly); + nvlist_free(nvl); + } + error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error == 0) { - error = zfs_ioc_objset_stats_impl(zc, os); + error = zfs_ioc_objset_stats_impl(zc, os, cachedpropsonly); dmu_objset_rele(os, FTAG); } @@ -2406,8 +2425,21 @@ static int zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) { objset_t *os; + nvlist_t *nvl = NULL; + boolean_t cachedpropsonly = B_FALSE; int error; + if (zc->zc_nvlist_src != (uintptr_t)NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl) != 0)) + return (error); + + if (nvl != NULL) { + (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly", + &cachedpropsonly); + nvlist_free(nvl); + } + error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) { return (error == ENOENT ? ESRCH : error); @@ -2437,8 +2469,10 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) objset_t *ossnap; error = dmu_objset_from_ds(ds, &ossnap); - if (error == 0) - error = zfs_ioc_objset_stats_impl(zc, ossnap); + if (error == 0) { + error = zfs_ioc_objset_stats_impl(zc, + ossnap, cachedpropsonly); + } dsl_dataset_rele(ds, FTAG); } } else if (error == ENOENT) { @@ -3148,6 +3182,7 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; + int error; ASSERT(zplprops != NULL); @@ -3194,8 +3229,9 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); - if (norm == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0); + if (norm == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); @@ -3204,13 +3240,15 @@ 
zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, */ if (norm) u8 = 1; - if (u8 == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0); + if (u8 == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); - if (sense == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0); + if (sense == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); @@ -6591,7 +6629,8 @@ error: static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; static void -zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, +zfs_ioctl_register_legacy(const char *name, zfs_ioc_t ioc, + zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { @@ -6602,6 +6641,7 @@ zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); + vec->zvec_name = name; vec->zvec_legacy_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; @@ -6645,7 +6685,7 @@ zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { - zfs_ioctl_register_legacy(ioc, func, secpolicy, + zfs_ioctl_register_legacy(NULL, ioc, func, secpolicy, POOL_NAME, log_history, pool_check); } @@ -6653,14 +6693,15 @@ static void zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check) { - zfs_ioctl_register_legacy(ioc, func, secpolicy, + zfs_ioctl_register_legacy(NULL, ioc, func, secpolicy, DATASET_NAME, B_FALSE, pool_check); } static void 
-zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) +zfs_ioctl_register_pool_modify(const char *name, zfs_ioc_t ioc, + zfs_ioc_legacy_func_t *func) { - zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config, + zfs_ioctl_register_legacy(name, ioc, func, zfs_secpolicy_config, POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } @@ -6668,7 +6709,7 @@ static void zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { - zfs_ioctl_register_legacy(ioc, func, secpolicy, + zfs_ioctl_register_legacy(NULL, ioc, func, secpolicy, NO_NAME, B_FALSE, POOL_CHECK_NONE); } @@ -6676,7 +6717,7 @@ static void zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { - zfs_ioctl_register_legacy(ioc, func, secpolicy, + zfs_ioctl_register_legacy(NULL, ioc, func, secpolicy, DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED); } @@ -6688,10 +6729,10 @@ zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) } static void -zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, - zfs_secpolicy_func_t *secpolicy) +zfs_ioctl_register_dataset_modify(const char *name, zfs_ioc_t ioc, + zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { - zfs_ioctl_register_legacy(ioc, func, secpolicy, + zfs_ioctl_register_legacy(name, ioc, func, secpolicy, DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } @@ -6838,34 +6879,35 @@ zfs_ioctl_init(void) /* IOCTLS that use the legacy function signature */ - zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, - zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY); + zfs_ioctl_register_legacy("pool_freeze", ZFS_IOC_POOL_FREEZE, + zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, + POOL_CHECK_READONLY); zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); - 
zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN, + zfs_ioctl_register_pool_modify("pool_scan", ZFS_IOC_POOL_SCAN, zfs_ioc_pool_scan); - zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE, + zfs_ioctl_register_pool_modify("pool_upgrade", ZFS_IOC_POOL_UPGRADE, zfs_ioc_pool_upgrade); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD, + zfs_ioctl_register_pool_modify("vdev_add", ZFS_IOC_VDEV_ADD, zfs_ioc_vdev_add); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE, + zfs_ioctl_register_pool_modify("vdev_remove", ZFS_IOC_VDEV_REMOVE, zfs_ioc_vdev_remove); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE, + zfs_ioctl_register_pool_modify("vdev_set_state", ZFS_IOC_VDEV_SET_STATE, zfs_ioc_vdev_set_state); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH, + zfs_ioctl_register_pool_modify("vdev_attach", ZFS_IOC_VDEV_ATTACH, zfs_ioc_vdev_attach); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH, + zfs_ioctl_register_pool_modify("vdev_detach", ZFS_IOC_VDEV_DETACH, zfs_ioc_vdev_detach); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH, + zfs_ioctl_register_pool_modify("vdev_setpath", ZFS_IOC_VDEV_SETPATH, zfs_ioc_vdev_setpath); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU, + zfs_ioctl_register_pool_modify("vdev_setfru", ZFS_IOC_VDEV_SETFRU, zfs_ioc_vdev_setfru); - zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS, + zfs_ioctl_register_pool_modify("pool_set_props", ZFS_IOC_POOL_SET_PROPS, zfs_ioc_pool_set_props); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT, + zfs_ioctl_register_pool_modify("vdev_split", ZFS_IOC_VDEV_SPLIT, zfs_ioc_vdev_split); - zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID, + zfs_ioctl_register_pool_modify("pool_reguid", ZFS_IOC_POOL_REGUID, zfs_ioc_pool_reguid); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS, @@ -6943,20 +6985,20 @@ zfs_ioctl_init(void) zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND, zfs_ioc_send, zfs_secpolicy_send); - zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, 
zfs_ioc_set_prop, - zfs_secpolicy_none); - zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy, - zfs_secpolicy_destroy); - zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename, - zfs_secpolicy_rename); - zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv, + zfs_ioctl_register_dataset_modify("set_prop", ZFS_IOC_SET_PROP, + zfs_ioc_set_prop, zfs_secpolicy_none); + zfs_ioctl_register_dataset_modify("destroy", ZFS_IOC_DESTROY, + zfs_ioc_destroy, zfs_secpolicy_destroy); + zfs_ioctl_register_dataset_modify("rename", ZFS_IOC_RENAME, + zfs_ioc_rename, zfs_secpolicy_rename); + zfs_ioctl_register_dataset_modify("recv", ZFS_IOC_RECV, zfs_ioc_recv, zfs_secpolicy_recv); - zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote, - zfs_secpolicy_promote); - zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP, + zfs_ioctl_register_dataset_modify("promote", ZFS_IOC_PROMOTE, + zfs_ioc_promote, zfs_secpolicy_promote); + zfs_ioctl_register_dataset_modify("inherit_prop", ZFS_IOC_INHERIT_PROP, zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop); - zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl, - zfs_secpolicy_set_fsacl); + zfs_ioctl_register_dataset_modify("set_fsacl", ZFS_IOC_SET_FSACL, + zfs_ioc_set_fsacl, zfs_secpolicy_set_fsacl); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share, zfs_secpolicy_share, POOL_CHECK_NONE); @@ -7333,7 +7375,32 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) nvlist_free(outnvl); } else { + spa_t *spa; + uint64_t orig_cookie = zc->zc_cookie; + error = vec->zvec_legacy_func(zc); + + if (error == 0 && vec->zvec_allow_log && + vec->zvec_name != NULL && + spa_open(zc->zc_name, &spa, FTAG) == 0) { + nvlist_t *lognv = NULL; + char *msg; + uint_t len = strlen(vec->zvec_name) + + strlen(zc->zc_name) + 128; + + msg = kmem_alloc(len, KM_SLEEP); + + lognv = fnvlist_alloc(); + (void) snprintf(msg, len, + "%s pool: %s cookie: %lu guid: %lx", 
vec->zvec_name, + zc->zc_name, orig_cookie, zc->zc_guid); + fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL, msg); + + (void) spa_history_log_nvl(spa, lognv); + spa_close(spa, FTAG); + fnvlist_free(lognv); + kmem_free(msg, len); + } } out: diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c index 288dc93e3c..95a2be6239 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -23,7 +23,7 @@ * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. All rights reserved. - * Copyright 2019 Joyent, Inc. + * Copyright 2020 Joyent, Inc. * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org> * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. * Copyright 2022 Oxide Computer Company @@ -1917,7 +1917,7 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) mutex_enter(&mvp->v_lock); if ((uap->flags & MS_REMOUNT) == 0 && (uap->flags & MS_OVERLAY) == 0 && - (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { + (vn_count(mvp) != 1 || (mvp->v_flag & VROOT))) { mutex_exit(&mvp->v_lock); return (SET_ERROR(EBUSY)); } @@ -2342,6 +2342,17 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) if (zfsvfs->z_ctldir != NULL) zfsctl_destroy(zfsvfs); + /* + * If we're doing a forced unmount on a dataset which still has + * references and is in a zone, then we need to cleanup the zone + * reference at this point or else the zone will never be able to + * shutdown. 
+ */ + if ((fflag & MS_FORCE) && vfsp->vfs_count > 1 && vfsp->vfs_zone) { + zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref, ZONE_REF_VFS); + vfsp->vfs_zone = NULL; + } + return (0); } diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c index 1ee01c9146..dd58b4a549 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vnops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c @@ -847,6 +847,17 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) limit = MAXOFFSET_T; + /* + * Pre-fault the pages to ensure slow (eg NFS) pages + * don't hold up txg. + * Skip this if uio contains loaned arc_buf. + */ + if ((uio->uio_extflg == UIO_XUIO) && + (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) + xuio = (xuio_t *)uio; + else + uio_prefaultpages(n, uio); + ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); @@ -901,17 +912,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) } /* - * Pre-fault the pages to ensure slow (eg NFS) pages - * don't hold up txg. - * Skip this if uio contains loaned arc_buf. - */ - if ((uio->uio_extflg == UIO_XUIO) && - (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) - xuio = (xuio_t *)uio; - else - uio_prefaultpages(MIN(n, max_blksz), uio); - - /* * If in append mode, set the io offset pointer to eof. 
*/ locked_range_t *lr; @@ -1147,9 +1147,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) break; ASSERT(tx_bytes == nbytes); n -= nbytes; - - if (!xuio && n > 0) - uio_prefaultpages(MIN(n, max_blksz), uio); } rangelock_exit(lr); @@ -3164,8 +3161,11 @@ top: return (err); } - if (vap->va_size == 0) + if (vap->va_size == 0) { vnevent_truncate(ZTOV(zp), ct); + } else { + vnevent_resize(ZTOV(zp), ct); + } } if (mask & (AT_ATIME|AT_MTIME) || @@ -4173,9 +4173,7 @@ top: if (error == 0) { vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); - /* notify the target dir if it is not the same as source dir */ - if (tdvp != sdvp) - vnevent_rename_dest_dir(tdvp, ct); + vnevent_rename_dest_dir(tdvp, ZTOV(szp), tnm, ct); } out: if (zl != NULL) @@ -5265,8 +5263,13 @@ zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag, error = zfs_freesp(zp, off, len, flag, TRUE); - if (error == 0 && off == 0 && len == 0) - vnevent_truncate(ZTOV(zp), ct); + if (error == 0 && len == 0) { + if (off == 0) { + vnevent_truncate(ZTOV(zp), ct); + } else { + vnevent_resize(ZTOV(zp), ct); + } + } ZFS_EXIT(zfsvfs); return (error); diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c new file mode 100644 index 0000000000..f151595095 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs_zone.c @@ -0,0 +1,1419 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018, Joyent, Inc. All rights reserved. + */ + +/* + * The ZFS/Zone I/O throttle and scheduler attempts to ensure fair access to + * ZFS I/O resources for each zone. 
I/O contention can be a major pain point on a multi-tenant system.
+ * + * The throttle calculates a I/O utilization metric for each zone using the + * following formula: + * + * (# of read syscalls) x (Average read latency) + + * (# of write syscalls) x (Average write latency) + * + * Once each zone has its utilization metric, the I/O throttle will compare I/O + * utilization across all zones, and if a zone has a higher-than-average I/O + * utilization, system calls from that zone are throttled. That is, if one + * zone has a much higher utilization, that zone's delay is increased by 5 + * microseconds, up to a maximum of 100 microseconds. Conversely, if a zone is + * already throttled and has a lower utilization than average, its delay will + * be lowered by 5 microseconds. + * + * The throttle calculation is driven by IO activity, but since IO does not + * happen at fixed intervals, timestamps are used to track when the last update + * was made and to drive recalculation. + * + * The throttle recalculates each zone's I/O usage and throttle delay (if any) + * on the zfs_zone_adjust_time interval. Overall I/O latency is maintained as + * a decayed average which is updated on the zfs_zone_sys_avg_cycle interval. + * + * Scheduler + * + * The I/O scheduler manages the vdev queues – the queues of pending I/Os to + * issue to the disks. It only makes scheduling decisions for the two + * synchronous I/O queues (read & write). + * + * The scheduler maintains how many I/Os in the queue are from each zone, and + * if one zone has a disproportionately large number of I/Os in the queue, the + * scheduler will allow certain I/Os from the underutilized zones to be "bumped" + * and pulled from the middle of the queue. This bump allows zones with a small + * number of I/Os (so small they may not even be taken into account by the + * throttle) to complete quickly instead of waiting behind dozens of I/Os from + * other zones. 
+ */ + +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zfs_zone.h> + +#ifndef _KERNEL + +/* + * Stubs for when compiling for user-land. + */ + +void +zfs_zone_io_throttle(zfs_zone_iop_type_t type) +{ +} + +void +zfs_zone_zio_init(zio_t *zp) +{ +} + +void +zfs_zone_zio_start(zio_t *zp) +{ +} + +void +zfs_zone_zio_done(zio_t *zp) +{ +} + +void +zfs_zone_zio_dequeue(zio_t *zp) +{ +} + +void +zfs_zone_zio_enqueue(zio_t *zp) +{ +} + +/*ARGSUSED*/ +void +zfs_zone_report_txg_sync(void *dp) +{ +} + +hrtime_t +zfs_zone_txg_delay() +{ + return (MSEC2NSEC(10)); +} + +#else + +/* + * The real code. + */ + +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/atomic.h> +#include <sys/zio.h> +#include <sys/zone.h> +#include <sys/avl.h> +#include <sys/sdt.h> +#include <sys/ddi.h> + +/* + * The zone throttle delays read and write operations from certain zones based + * on each zone's IO utilitzation. Once a cycle (defined by zfs_zone_cycle_time + * below), the delays for each zone are recalculated based on the utilization + * over the previous window. + */ +boolean_t zfs_zone_delay_enable = B_TRUE; /* enable IO throttle */ +uint8_t zfs_zone_delay_step = 5; /* usec amnt to change delay */ +uint8_t zfs_zone_delay_ceiling = 100; /* usec delay max */ + +boolean_t zfs_zone_priority_enable = B_TRUE; /* enable IO priority */ + +/* + * For certain workloads, one zone may be issuing primarily sequential I/O and + * another primarily random I/O. The sequential I/O will complete much more + * quickly than the random I/O, driving the average system latency for those + * operations way down. As a result, the random I/O may be throttled back, even + * though the sequential I/O should be throttled to allow the random I/O more + * access to the disk. + * + * This tunable limits the discrepancy between the read and write system + * latency. 
If one becomes excessively high, this tunable prevents the I/O + * throttler from exacerbating the imbalance. + */ +uint_t zfs_zone_rw_lat_limit = 10; + +/* + * The I/O throttle will only start delaying zones when it detects disk + * utilization has reached a certain level. This tunable controls the + * threshold at which the throttle will start delaying zones. When the number + * of vdevs is small, the calculation should correspond closely with the %b + * column from iostat -- but as the number of vdevs becomes large, it will + * correlate less and less to any single device (therefore making it a poor + * approximation for the actual I/O utilization on such systems). We + * therefore use our derived utilization conservatively: we know that low + * derived utilization does indeed correlate to low I/O use -- but that a high + * rate of derived utilization does not necesarily alone denote saturation; + * where we see a high rate of utilization, we also look for laggard I/Os to + * attempt to detect saturation. + */ +uint_t zfs_zone_util_threshold = 80; +uint_t zfs_zone_underutil_threshold = 60; + +/* + * There are three important tunables here: zfs_zone_laggard_threshold denotes + * the threshold at which an I/O is considered to be of notably high latency; + * zfs_zone_laggard_recent denotes the number of microseconds before the + * current time after which the last laggard is considered to be sufficiently + * recent to merit increasing the throttle; zfs_zone_laggard_ancient denotes + * the microseconds before the current time before which the last laggard is + * considered to be sufficiently old to merit decreasing the throttle. The + * most important tunable of these three is the zfs_zone_laggard_threshold: in + * modeling data from a large public cloud, this tunable was found to have a + * much greater effect on the throttle than the two time-based thresholds. 
+ * This must be set high enough to not result in spurious throttling, but not + * so high as to allow pathological I/O to persist in the system. + */ +uint_t zfs_zone_laggard_threshold = 50000; /* 50 ms */ +uint_t zfs_zone_laggard_recent = 1000000; /* 1000 ms */ +uint_t zfs_zone_laggard_ancient = 5000000; /* 5000 ms */ + +/* + * Throughout this subsystem, our timestamps are in microseconds. Our system + * average cycle is one second or 1 million microseconds. Our zone counter + * update cycle is two seconds or 2 million microseconds. We use a longer + * duration for that cycle because some ops can see a little over two seconds of + * latency when they are being starved by another zone. + */ +uint_t zfs_zone_sys_avg_cycle = 1000000; /* 1 s */ +uint_t zfs_zone_cycle_time = 2000000; /* 2 s */ + +/* + * How often the I/O throttle will reevaluate each zone's utilization, in + * microseconds. Default is 1/4 sec. + */ +uint_t zfs_zone_adjust_time = 250000; /* 250 ms */ + +typedef struct { + hrtime_t cycle_start; + hrtime_t cycle_lat; + hrtime_t sys_avg_lat; + uint_t cycle_cnt; +} sys_lat_cycle_t; + +typedef struct { + hrtime_t zi_now; + uint_t zi_avgrlat; + uint_t zi_avgwlat; + uint64_t zi_totpri; + uint64_t zi_totutil; + int zi_active; + uint_t zi_diskutil; + boolean_t zi_underutil; + boolean_t zi_overutil; +} zoneio_stats_t; + +static sys_lat_cycle_t rd_lat; +static sys_lat_cycle_t wr_lat; + +/* + * Some basic disk stats to determine disk utilization. The utilization info + * for all disks on the system is aggregated into these values. 
+ * + * Overall disk utilization for the current cycle is calculated as: + * + * ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) + * ---------------------------------------------- + * ((now - zfs_zone_last_checked) * 1000); + */ +kmutex_t zfs_disk_lock; /* protects the following: */ +uint_t zfs_disk_rcnt; /* Number of outstanding IOs */ +hrtime_t zfs_disk_rtime = 0; /* cummulative sum of time performing IO */ +hrtime_t zfs_disk_rlastupdate = 0; /* time last IO dispatched */ + +hrtime_t zfs_disk_last_rtime = 0; /* prev. cycle's zfs_disk_rtime val */ +/* time that we last updated per-zone throttle info */ +kmutex_t zfs_last_check_lock; /* protects zfs_zone_last_checked */ +hrtime_t zfs_zone_last_checked = 0; +hrtime_t zfs_disk_last_laggard = 0; + +/* + * Data used to keep track of how often txg sync is running. + */ +extern int zfs_txg_timeout; +static uint_t txg_last_check; +static uint_t txg_cnt; +static uint_t txg_sync_rate; + +boolean_t zfs_zone_schedule_enable = B_TRUE; /* enable IO sched. */ +/* + * Threshold for when zio scheduling should kick in. + * + * This threshold is based on the zfs_vdev_sync_read_max_active value for the + * number of I/Os that can be pending on a device. If there are more than the + * max_active ops already queued up, beyond those already issued to the vdev, + * then use zone-based scheduling to get the next synchronous zio. + */ +uint32_t zfs_zone_schedule_thresh = 10; + +/* + * On each pass of the scheduler we increment the zone's weight (up to this + * maximum). The weight is used by the scheduler to prevent starvation so + * that zones which haven't been able to do any IO over many iterations + * will max out thier weight to this value. + */ +#define SCHED_WEIGHT_MAX 20 + +/* + * Tunables for delay throttling when TXG sync is occurring. + * + * If the zone is performing a write and we're doing above normal TXG syncing, + * then throttle for longer than normal. 
The zone's wait time is multiplied + * by the scale (zfs_zone_txg_throttle_scale). + */ +int zfs_zone_txg_throttle_scale = 2; +hrtime_t zfs_zone_txg_delay_nsec = MSEC2NSEC(20); + +typedef struct { + int zq_qdepth; + zio_priority_t zq_queue; + int zq_priority; + int zq_wt; + zoneid_t zq_zoneid; +} zone_q_bump_t; + +/* + * This uses gethrtime() but returns a value in usecs. + */ +#define GET_USEC_TIME (gethrtime() / 1000) +#define NANO_TO_MICRO(x) (x / (NANOSEC / MICROSEC)) + +/* + * Keep track of the zone's ZFS IOPs. + * + * See the comment on the zfs_zone_io_throttle function for which/how IOPs are + * accounted for. + * + * If the number of ops is >1 then we can just use that value. However, + * if the number of ops is <2 then we might have a zone which is trying to do + * IO but is not able to get any ops through the system. We don't want to lose + * track of this zone so we factor in its decayed count into the current count. + * + * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed count. + * However, since this calculation is driven by IO activity and since IO does + * not happen at fixed intervals, we use a timestamp to see when the last update + * was made. If it was more than one cycle ago, then we need to decay the + * historical count by the proper number of additional cycles in which no IO was + * performed. + * + * Return a time delta indicating how far into the current cycle we are or 0 + * if the last IO was more than a cycle ago. + */ +static hrtime_t +compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp) +{ + hrtime_t delta; + int gen_cnt; + + /* + * Check if its time to recompute a new zone count. + * If we're still collecting data for the current cycle, return false. + */ + delta = unow - cp->cycle_start; + if (delta < zfs_zone_cycle_time) + return (delta); + + /* A previous cycle is past, compute the new zone count. 
*/ + + /* + * Figure out how many generations we have to decay the historical + * count, since multiple cycles may have elapsed since our last IO. + * We depend on int rounding here. + */ + gen_cnt = (int)(delta / zfs_zone_cycle_time); + + /* If more than 5 cycles since last the IO, reset count. */ + if (gen_cnt > 5) { + cp->zone_avg_cnt = 0; + } else { + /* Update the count. */ + int i; + + /* + * If the zone did more than 1 IO, just use its current count + * as the historical value, otherwise decay the historical + * count and factor that into the new historical count. We + * pick a threshold > 1 so that we don't lose track of IO due + * to int rounding. + */ + if (cp->cycle_cnt > 1) + cp->zone_avg_cnt = cp->cycle_cnt; + else + cp->zone_avg_cnt = cp->cycle_cnt + + (cp->zone_avg_cnt / 2); + + /* + * If more than one generation has elapsed since the last + * update, decay the values further. + */ + for (i = 1; i < gen_cnt; i++) + cp->zone_avg_cnt = cp->zone_avg_cnt / 2; + } + + /* A new cycle begins. */ + cp->cycle_start = unow; + cp->cycle_cnt = 0; + + return (0); +} + +/* + * Add IO op data to the zone. + */ +static void +add_zone_iop(zone_persist_t *zpd, hrtime_t unow, zfs_zone_iop_type_t op) +{ + zone_zfs_io_t *iop; + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop == NULL) { + mutex_exit(&zpd->zpers_zfs_lock); + return; + } + + switch (op) { + case ZFS_ZONE_IOP_READ: + (void) compute_historical_zone_cnt(unow, &iop->zpers_rd_ops); + iop->zpers_rd_ops.cycle_cnt++; + break; + case ZFS_ZONE_IOP_WRITE: + (void) compute_historical_zone_cnt(unow, &iop->zpers_wr_ops); + iop->zpers_wr_ops.cycle_cnt++; + break; + case ZFS_ZONE_IOP_LOGICAL_WRITE: + (void) compute_historical_zone_cnt(unow, &iop->zpers_lwr_ops); + iop->zpers_lwr_ops.cycle_cnt++; + break; + } + mutex_exit(&zpd->zpers_zfs_lock); +} + +/* + * Use a decaying average to keep track of the overall system latency. 
+ * + * We want to have the recent activity heavily weighted, but if the + * activity decreases or stops, then the average should quickly decay + * down to the new value. + * + * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed average. + * However, since this calculation is driven by IO activity and since IO does + * not happen at fixed intervals, we use a timestamp to see when the last + * update was made. If it was more than one cycle ago, then we need to decay + * the average by the proper number of additional cycles in which no IO was + * performed. + * + * Return true if we actually computed a new system average. + * If we're still within an active cycle there is nothing to do, return false. + */ +static boolean_t +compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp) +{ + hrtime_t delta; + int gen_cnt; + + /* + * Check if its time to recompute a new average. + * If we're still collecting data for the current cycle, return false. + */ + delta = unow - cp->cycle_start; + if (delta < zfs_zone_sys_avg_cycle) + return (B_FALSE); + + /* A previous cycle is past, compute a new system average. */ + + /* + * Figure out how many generations we have to decay, since multiple + * cycles may have elapsed since our last IO. + * We count on int rounding here. + */ + gen_cnt = (int)(delta / zfs_zone_sys_avg_cycle); + + /* If more than 5 cycles since last the IO, reset average. */ + if (gen_cnt > 5) { + cp->sys_avg_lat = 0; + } else { + /* Update the average. */ + int i; + + cp->sys_avg_lat = + (cp->sys_avg_lat + cp->cycle_lat) / (1 + cp->cycle_cnt); + + /* + * If more than one generation has elapsed since the last + * update, decay the values further. + */ + for (i = 1; i < gen_cnt; i++) + cp->sys_avg_lat = cp->sys_avg_lat / 2; + } + + /* A new cycle begins. 
*/ + cp->cycle_start = unow; + cp->cycle_cnt = 0; + cp->cycle_lat = 0; + + return (B_TRUE); +} + +static void +add_sys_iop(hrtime_t unow, int op, int lat) +{ + switch (op) { + case ZFS_ZONE_IOP_READ: + (void) compute_new_sys_avg(unow, &rd_lat); + atomic_inc_uint(&rd_lat.cycle_cnt); + atomic_add_64((uint64_t *)&rd_lat.cycle_lat, (int64_t)lat); + break; + case ZFS_ZONE_IOP_WRITE: + (void) compute_new_sys_avg(unow, &wr_lat); + atomic_inc_uint(&wr_lat.cycle_cnt); + atomic_add_64((uint64_t *)&wr_lat.cycle_lat, (int64_t)lat); + break; + } +} + +/* + * Get the zone IO counts. + */ +static uint_t +calc_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp) +{ + hrtime_t delta; + uint_t cnt; + + if ((delta = compute_historical_zone_cnt(unow, cp)) == 0) { + /* + * No activity in the current cycle, we already have the + * historical data so we'll use that. + */ + cnt = cp->zone_avg_cnt; + } else { + /* + * If we're less than half way through the cycle then use + * the current count plus half the historical count, otherwise + * just use the current count. + */ + if (delta < (zfs_zone_cycle_time / 2)) + cnt = cp->cycle_cnt + (cp->zone_avg_cnt / 2); + else + cnt = cp->cycle_cnt; + } + + return (cnt); +} + +/* + * Get the average read/write latency in usecs for the system. + */ +static uint_t +calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp) +{ + if (compute_new_sys_avg(unow, cp)) { + /* + * No activity in the current cycle, we already have the + * historical data so we'll use that. + */ + return (cp->sys_avg_lat); + } else { + /* + * We're within a cycle; weight the current activity higher + * compared to the historical data and use that. + */ + DTRACE_PROBE3(zfs__zone__calc__wt__avg, + uintptr_t, cp->sys_avg_lat, + uintptr_t, cp->cycle_lat, + uintptr_t, cp->cycle_cnt); + + return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) / + (1 + (cp->cycle_cnt * 8))); + } +} + +/* + * Account for the current IOP on the zone and for the system as a whole. + * The latency parameter is in usecs. 
+ */ +static void +add_iop(zone_persist_t *zpd, hrtime_t unow, zfs_zone_iop_type_t op, + hrtime_t lat) +{ + /* Add op to zone */ + add_zone_iop(zpd, unow, op); + + /* Track system latency */ + if (op != ZFS_ZONE_IOP_LOGICAL_WRITE) + add_sys_iop(unow, op, lat); +} + +/* + * Calculate and return the total number of read ops, write ops and logical + * write ops for the given zone. If the zone has issued operations of any type + * return a non-zero value, otherwise return 0. + */ +static int +get_zone_io_cnt(hrtime_t unow, zone_zfs_io_t *zpd, uint_t *rops, uint_t *wops, + uint_t *lwops) +{ + ASSERT3P(zpd, !=, NULL); + + *rops = calc_zone_cnt(unow, &zpd->zpers_rd_ops); + *wops = calc_zone_cnt(unow, &zpd->zpers_wr_ops); + *lwops = calc_zone_cnt(unow, &zpd->zpers_lwr_ops); + + DTRACE_PROBE4(zfs__zone__io__cnt, uintptr_t, zpd, + uintptr_t, *rops, uintptr_t, *wops, uintptr_t, *lwops); + + return (*rops | *wops | *lwops); +} + +/* + * Get the average read/write latency in usecs for the system. + */ +static void +get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat) +{ + *rlat = calc_avg_lat(unow, &rd_lat); + *wlat = calc_avg_lat(unow, &wr_lat); + + /* + * In an attempt to improve the accuracy of the throttling algorithm, + * assume that IO operations can't have zero latency. Instead, assume + * a reasonable lower bound for each operation type. If the actual + * observed latencies are non-zero, use those latency values instead. + */ + if (*rlat == 0) + *rlat = 1000; + if (*wlat == 0) + *wlat = 1000; + + DTRACE_PROBE2(zfs__zone__sys__avg__lat, uintptr_t, *rlat, + uintptr_t, *wlat); +} + +/* + * Find disk utilization for each zone and average utilization for all active + * zones. 
+ */ +static int +zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg) +{ + zoneio_stats_t *sp = arg; + uint_t rops, wops, lwops; + zone_persist_t *zpd = &zone_pdata[zonep->zone_id]; + zone_zfs_io_t *iop = zpd->zpers_zfsp; + + ASSERT3P(iop, !=, NULL); + + mutex_enter(&zpd->zpers_zfs_lock); + if (zonep->zone_id == GLOBAL_ZONEID || + get_zone_io_cnt(sp->zi_now, iop, &rops, &wops, &lwops) == 0) { + mutex_exit(&zpd->zpers_zfs_lock); + return (0); + } + + iop->zpers_io_util = (rops * sp->zi_avgrlat) + (wops * sp->zi_avgwlat) + + (lwops * sp->zi_avgwlat); + sp->zi_totutil += iop->zpers_io_util; + + if (iop->zpers_io_util > 0) { + sp->zi_active++; + sp->zi_totpri += iop->zpers_zfs_io_pri; + } + + /* + * sdt:::zfs-zone-utilization + * + * arg0: zone ID + * arg1: read operations observed during time window + * arg2: physical write operations observed during time window + * arg3: logical write ops observed during time window + * arg4: calculated utilization given read and write ops + * arg5: I/O priority assigned to this zone + */ + DTRACE_PROBE6(zfs__zone__utilization, uint_t, zonep->zone_id, + uint_t, rops, uint_t, wops, uint_t, lwops, + uint64_t, iop->zpers_io_util, uint16_t, iop->zpers_zfs_io_pri); + + mutex_exit(&zpd->zpers_zfs_lock); + + return (0); +} + +static void +zfs_zone_delay_inc(zone_zfs_io_t *zpd) +{ + ASSERT3P(zpd, !=, NULL); + + if (zpd->zpers_io_delay < zfs_zone_delay_ceiling) + zpd->zpers_io_delay += zfs_zone_delay_step; +} + +static void +zfs_zone_delay_dec(zone_zfs_io_t *zpd) +{ + ASSERT3P(zpd, !=, NULL); + + if (zpd->zpers_io_delay > 0) + zpd->zpers_io_delay -= zfs_zone_delay_step; +} + +/* + * For all zones "far enough" away from the average utilization, increase that + * zones delay. Otherwise, reduce its delay. 
+ */ +static int +zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg) +{ + zone_persist_t *zpd = &zone_pdata[zonep->zone_id]; + zone_zfs_io_t *iop = zpd->zpers_zfsp; + zoneio_stats_t *sp = arg; + uint8_t delay; + uint_t fairutil = 0; + + ASSERT3P(iop, !=, NULL); + + mutex_enter(&zpd->zpers_zfs_lock); + delay = iop->zpers_io_delay; + iop->zpers_io_util_above_avg = 0; + + /* + * Given the calculated total utilitzation for all zones, calculate the + * fair share of I/O for this zone. + */ + if (zfs_zone_priority_enable && sp->zi_totpri > 0) { + fairutil = (sp->zi_totutil * iop->zpers_zfs_io_pri) / + sp->zi_totpri; + } else if (sp->zi_active > 0) { + fairutil = sp->zi_totutil / sp->zi_active; + } + + /* + * Adjust each IO's delay. If the overall delay becomes too high, avoid + * increasing beyond the ceiling value. + */ + if (iop->zpers_io_util > fairutil && sp->zi_overutil) { + iop->zpers_io_util_above_avg = 1; + + if (sp->zi_active > 1) + zfs_zone_delay_inc(iop); + } else if (iop->zpers_io_util < fairutil || sp->zi_underutil || + sp->zi_active <= 1) { + zfs_zone_delay_dec(iop); + } + + /* + * sdt:::zfs-zone-throttle + * + * arg0: zone ID + * arg1: old delay for this zone + * arg2: new delay for this zone + * arg3: calculated fair I/O utilization + * arg4: actual I/O utilization + */ + DTRACE_PROBE5(zfs__zone__throttle, uintptr_t, zonep->zone_id, + uintptr_t, delay, uintptr_t, iop->zpers_io_delay, + uintptr_t, fairutil, uintptr_t, iop->zpers_io_util); + + mutex_exit(&zpd->zpers_zfs_lock); + + return (0); +} + +/* + * Examine the utilization between different zones, and adjust the delay for + * each zone appropriately. 
+ */ +static void +zfs_zone_wait_adjust(hrtime_t unow, hrtime_t last_checked) +{ + zoneio_stats_t stats; + hrtime_t laggard_udelta = 0; + + (void) bzero(&stats, sizeof (stats)); + + stats.zi_now = unow; + get_sys_avg_lat(unow, &stats.zi_avgrlat, &stats.zi_avgwlat); + + if (stats.zi_avgrlat > stats.zi_avgwlat * zfs_zone_rw_lat_limit) + stats.zi_avgrlat = stats.zi_avgwlat * zfs_zone_rw_lat_limit; + else if (stats.zi_avgrlat * zfs_zone_rw_lat_limit < stats.zi_avgwlat) + stats.zi_avgwlat = stats.zi_avgrlat * zfs_zone_rw_lat_limit; + + if (zone_walk(zfs_zone_wait_adjust_calculate_cb, &stats) != 0) + return; + + /* + * Calculate disk utilization for the most recent period. + */ + if (zfs_disk_last_rtime == 0 || unow - last_checked <= 0) { + stats.zi_diskutil = 0; + } else { + stats.zi_diskutil = + ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) / + ((unow - last_checked) * 1000); + } + zfs_disk_last_rtime = zfs_disk_rtime; + + if (unow > zfs_disk_last_laggard) + laggard_udelta = unow - zfs_disk_last_laggard; + + /* + * To minimize porpoising, we have three separate states for our + * assessment of I/O performance: overutilized, underutilized, and + * neither overutilized nor underutilized. We will increment the + * throttle if a zone is using more than its fair share _and_ I/O + * is overutilized; we will decrement the throttle if a zone is using + * less than its fair share _or_ I/O is underutilized. 
+ */ + stats.zi_underutil = stats.zi_diskutil < zfs_zone_underutil_threshold || + laggard_udelta > zfs_zone_laggard_ancient; + + stats.zi_overutil = stats.zi_diskutil > zfs_zone_util_threshold && + laggard_udelta < zfs_zone_laggard_recent; + + /* + * sdt:::zfs-zone-stats + * + * Statistics observed over the last period: + * + * arg0: average system read latency + * arg1: average system write latency + * arg2: number of active zones + * arg3: total I/O 'utilization' for all zones + * arg4: total I/O priority of all active zones + * arg5: calculated disk utilization + */ + DTRACE_PROBE6(zfs__zone__stats, uintptr_t, stats.zi_avgrlat, + uintptr_t, stats.zi_avgwlat, uintptr_t, stats.zi_active, + uintptr_t, stats.zi_totutil, uintptr_t, stats.zi_totpri, + uintptr_t, stats.zi_diskutil); + + (void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats); +} + +/* + * Callback used to calculate a zone's IO schedule priority. + * + * We scan the zones looking for ones with ops in the queue. Out of those, + * we pick the one that calculates to the highest schedule priority. + */ +static int +get_sched_pri_cb(zone_t *zonep, void *arg) +{ + int pri; + uint_t cnt; + zone_q_bump_t *qbp = arg; + zio_priority_t p = qbp->zq_queue; + zone_persist_t *zpd = &zone_pdata[zonep->zone_id]; + zone_zfs_io_t *iop; + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop == NULL) { + mutex_exit(&zpd->zpers_zfs_lock); + return (0); + } + + cnt = iop->zpers_zfs_queued[p]; + if (cnt == 0) { + iop->zpers_zfs_weight = 0; + mutex_exit(&zpd->zpers_zfs_lock); + return (0); + } + + /* + * On each pass, increment the zone's weight. We use this as input + * to the calculation to prevent starvation. The value is reset + * each time we issue an IO for this zone so zones which haven't + * done any IO over several iterations will see their weight max + * out. 
+ */ + if (iop->zpers_zfs_weight < SCHED_WEIGHT_MAX) + iop->zpers_zfs_weight++; + + /* + * This zone's IO priority is the inverse of the number of IOs + * the zone has enqueued * zone's configured priority * weight. + * The queue depth has already been scaled by 10 to avoid problems + * with int rounding. + * + * This means that zones with fewer IOs in the queue will get + * preference unless other zone's assigned priority pulls them + * ahead. The weight is factored in to help ensure that zones + * which haven't done IO in a while aren't getting starved. + */ + pri = (qbp->zq_qdepth / cnt) * + iop->zpers_zfs_io_pri * iop->zpers_zfs_weight; + + /* + * If this zone has a higher priority than what we found so far, + * it becomes the new leading contender. + */ + if (pri > qbp->zq_priority) { + qbp->zq_zoneid = zonep->zone_id; + qbp->zq_priority = pri; + qbp->zq_wt = iop->zpers_zfs_weight; + } + mutex_exit(&zpd->zpers_zfs_lock); + return (0); +} + +/* + * See if we need to bump a zone's zio to the head of the queue. This is only + * done on the two synchronous I/O queues (see the block comment on the + * zfs_zone_schedule function). We get the correct vdev_queue_class_t and + * queue depth from our caller. + * + * For single-threaded synchronous processes a zone cannot get more than + * 1 op into the queue at a time unless the zone is running multiple processes + * in parallel. This can cause an imbalance in performance if there are zones + * with many parallel processes (and ops in the queue) vs. other zones which + * are doing simple single-threaded processes, such as interactive tasks in the + * shell. These zones can get backed up behind a deep queue and their IO + * performance will appear to be very poor as a result. This can make the + * zone work badly for interactive behavior. + * + * The scheduling algorithm kicks in once we start to get a deeper queue. + * Once that occurs, we look at all of the zones to see which one calculates + * to the highest priority. 
We bump that zone's first zio to the head of the + * queue. + * + * We use a counter on the zone so that we can quickly find how many ops each + * zone has in the queue without having to search the entire queue itself. + * This scales better since the number of zones is expected to be on the + * order of 10-100 whereas the queue depth can be in the range of 50-2000. + * In addition, since the zio's in the queue only have the zoneid, we would + * have to look up the zone for each zio enqueued and that means the overhead + * for scanning the queue each time would be much higher. + * + * In all cases, we fall back to simply pulling the next op off the queue + * if something should go wrong. + */ +static zio_t * +get_next_zio(vdev_queue_class_t *vqc, int qdepth, zio_priority_t p, + avl_tree_t *tree) +{ + zone_q_bump_t qbump; + zio_t *zp = NULL, *zphead; + int cnt = 0; + + /* To avoid problems with int rounding, scale the queue depth by 10 */ + qbump.zq_qdepth = qdepth * 10; + qbump.zq_priority = 0; + qbump.zq_zoneid = 0; + qbump.zq_queue = p; + (void) zone_walk(get_sched_pri_cb, &qbump); + + zphead = avl_first(tree); + + /* Check if the scheduler didn't pick a zone for some reason!? */ + if (qbump.zq_zoneid != 0) { + for (zp = avl_first(tree); zp != NULL; + zp = avl_walk(tree, zp, AVL_AFTER)) { + if (zp->io_zoneid == qbump.zq_zoneid) + break; + cnt++; + } + } + + if (zp == NULL) { + zp = zphead; + } else if (zp != zphead) { + /* + * Only fire the probe if we actually picked a different zio + * than the one already at the head of the queue. + */ + DTRACE_PROBE4(zfs__zone__sched__bump, uint_t, zp->io_zoneid, + uint_t, cnt, int, qbump.zq_priority, int, qbump.zq_wt); + } + + return (zp); +} + +/* + * Add our zone ID to the zio so we can keep track of which zones are doing + * what, even when the current thread processing the zio is not associated + * with the zone (e.g. the kernel taskq which pushes out TX groups). 
+ */ +void +zfs_zone_zio_init(zio_t *zp) +{ + zone_t *zonep = curzone; + + zp->io_zoneid = zonep->zone_id; +} + +/* + * Track and throttle IO operations per zone. Called from: + * - dmu_tx_count_write for (logical) write ops (both dataset and zvol writes + * go through this path) + * - arc_read for read ops that miss the ARC (both dataset and zvol) + * For each operation, increment that zone's counter based on the type of + * operation, then delay the operation, if necessary. + * + * There are three basic ways that we can see write ops: + * 1) An application does write syscalls. Those ops go into a TXG which + * we'll count here. Sometime later a kernel taskq thread (we'll see the + * vdev IO as zone 0) will perform some number of physical writes to commit + * the TXG to disk. Those writes are not associated with the zone which + * made the write syscalls and the number of operations is not correlated + * between the taskq and the zone. We only see logical writes in this + * function, we see the physcial writes in the zfs_zone_zio_start and + * zfs_zone_zio_done functions. + * 2) An application opens a file with O_SYNC. Each write will result in + * an operation which we'll see here plus a low-level vdev write from + * that zone. + * 3) An application does write syscalls followed by an fsync(). We'll + * count the writes going into a TXG here. We'll also see some number + * (usually much smaller, maybe only 1) of low-level vdev writes from this + * zone when the fsync is performed, plus some other low-level vdev writes + * from the taskq in zone 0 (are these metadata writes?). + * + * 4) In addition to the above, there are misc. system-level writes, such as + * writing out dirty pages to swap, or sync(2) calls, which will be handled + * by the global zone and which we count but don't generally worry about. 
+ * + * Because of the above, we can see writes twice; first because this function + * is always called by a zone thread for logical writes, but then we also will + * count the physical writes that are performed at a low level via + * zfs_zone_zio_start. Without this, it can look like a non-global zone never + * writes (case 1). Depending on when the TXG is synced, the counts may be in + * the same sample bucket or in a different one. + * + * Tracking read operations is simpler due to their synchronous semantics. The + * zfs_read function -- called as a result of a read(2) syscall -- will always + * retrieve the data to be read through arc_read and we only come into this + * function when we have an arc miss. + */ +void +zfs_zone_io_throttle(zfs_zone_iop_type_t type) +{ + zoneid_t zid = curzone->zone_id; + zone_persist_t *zpd = &zone_pdata[zid]; + zone_zfs_io_t *iop; + hrtime_t unow; + uint16_t wait; + + unow = GET_USEC_TIME; + + /* + * Only bump the counter for logical writes here. The counters for + * tracking physical IO operations are handled in zfs_zone_zio_done. + */ + if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) { + add_iop(zpd, unow, type, 0); + } + + if (!zfs_zone_delay_enable) + return; + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop == NULL) { + mutex_exit(&zpd->zpers_zfs_lock); + return; + } + + /* + * If the zone's I/O priority is set to zero, don't throttle that zone's + * operations at all. 
+ */ + if (iop->zpers_zfs_io_pri == 0) { + mutex_exit(&zpd->zpers_zfs_lock); + return; + } + + /* Handle periodically updating the per-zone I/O parameters */ + if ((unow - zfs_zone_last_checked) > zfs_zone_adjust_time) { + hrtime_t last_checked; + boolean_t do_update = B_FALSE; + + /* Recheck under mutex */ + mutex_enter(&zfs_last_check_lock); + last_checked = zfs_zone_last_checked; + if ((unow - last_checked) > zfs_zone_adjust_time) { + zfs_zone_last_checked = unow; + do_update = B_TRUE; + } + mutex_exit(&zfs_last_check_lock); + + if (do_update) { + mutex_exit(&zpd->zpers_zfs_lock); + + zfs_zone_wait_adjust(unow, last_checked); + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop == NULL) { + mutex_exit(&zpd->zpers_zfs_lock); + return; + } + } + } + + wait = iop->zpers_io_delay; + mutex_exit(&zpd->zpers_zfs_lock); + + if (wait > 0) { + /* + * If this is a write and we're doing above normal TXG + * syncing, then throttle for longer than normal. + */ + if (type == ZFS_ZONE_IOP_LOGICAL_WRITE && + (txg_cnt > 1 || txg_sync_rate > 1)) + wait *= zfs_zone_txg_throttle_scale; + + /* + * sdt:::zfs-zone-wait + * + * arg0: zone ID + * arg1: type of IO operation + * arg2: time to delay (in us) + */ + DTRACE_PROBE3(zfs__zone__wait, uintptr_t, zid, + uintptr_t, type, uintptr_t, wait); + + drv_usecwait(wait); + + if (curzone->zone_vfs_stats != NULL) { + atomic_inc_64(&curzone->zone_vfs_stats-> + zv_delay_cnt.value.ui64); + atomic_add_64(&curzone->zone_vfs_stats-> + zv_delay_time.value.ui64, wait); + } + } +} + +/* + * XXX Ignore the pool pointer parameter for now. + * + * Keep track to see if the TXG sync rate is running above the expected rate. + * If so, this implies that we are filling TXG's at a high rate due to a heavy + * write workload. We use this as input into the zone throttle. + * + * This function is called every 5 seconds (zfs_txg_timeout) under a normal + * write load. In this case, the sync rate is going to be 1. 
When there + * is a heavy write load, TXG's fill up fast and the sync thread will write + * the TXG more frequently (perhaps once a second). In this case the rate + * will be > 1. The sync rate is a lagging indicator since it can be up + * to 5 seconds old. We use the txg_cnt to keep track of the rate in the + * current 5 second interval and txg_sync_rate to keep track of the previous + * 5 second interval. In that way we don't have a period (1 or more seconds) + * where the txg_cnt == 0 and we cut back on throttling even though the rate + * is still high. + */ +/*ARGSUSED*/ +void +zfs_zone_report_txg_sync(void *dp) +{ + uint_t now; + + txg_cnt++; + now = (uint_t)(gethrtime() / NANOSEC); + if ((now - txg_last_check) >= zfs_txg_timeout) { + txg_sync_rate = txg_cnt / 2; + txg_cnt = 0; + txg_last_check = now; + } +} + +hrtime_t +zfs_zone_txg_delay() +{ + zone_persist_t *zpd = &zone_pdata[curzone->zone_id]; + zone_zfs_io_t *iop; + uint8_t above; + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop == NULL) { + mutex_exit(&zpd->zpers_zfs_lock); + return (0); + } + + above = iop->zpers_io_util_above_avg; + mutex_exit(&zpd->zpers_zfs_lock); + + if (above) { + return (zfs_zone_txg_delay_nsec); + } + + return (MSEC2NSEC(10)); +} + +/* + * Called from vdev_disk_io_start when an IO hits the end of the zio pipeline + * and is issued. + * Keep track of start time for latency calculation in zfs_zone_zio_done. + */ +void +zfs_zone_zio_start(zio_t *zp) +{ + zone_persist_t *zpd = &zone_pdata[zp->io_zoneid]; + zone_zfs_io_t *iop; + + /* + * I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not for + * an actual I/O operation. Ignore those operations as they relate to + * throttling and scheduling. 
+ */ + if (zp->io_type == ZIO_TYPE_IOCTL) + return; + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop != NULL) { + if (zp->io_type == ZIO_TYPE_READ) + kstat_runq_enter(&iop->zpers_zfs_rwstats); + iop->zpers_zfs_weight = 0; + } + mutex_exit(&zpd->zpers_zfs_lock); + + mutex_enter(&zfs_disk_lock); + zp->io_dispatched = gethrtime(); + + if (zfs_disk_rcnt++ != 0) + zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate); + zfs_disk_rlastupdate = zp->io_dispatched; + mutex_exit(&zfs_disk_lock); +} + +/* + * Called from vdev_disk_io_done when an IO completes. + * Increment our counter for zone ops. + * Calculate the IO latency avg. for this zone. + */ +void +zfs_zone_zio_done(zio_t *zp) +{ + zone_persist_t *zpd; + zone_zfs_io_t *iop; + hrtime_t now, unow, udelta; + + if (zp->io_type == ZIO_TYPE_IOCTL) + return; + + if (zp->io_dispatched == 0) + return; + + zpd = &zone_pdata[zp->io_zoneid]; + + now = gethrtime(); + unow = NANO_TO_MICRO(now); + udelta = unow - NANO_TO_MICRO(zp->io_dispatched); + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop != NULL) { + /* + * To calculate the wsvc_t average, keep a cumulative sum of + * all the wait time before each I/O was dispatched. Since most + * writes are asynchronous, only track the wait time for + * read I/Os. 
+ */ + if (zp->io_type == ZIO_TYPE_READ) { + iop->zpers_zfs_rwstats.reads++; + iop->zpers_zfs_rwstats.nread += zp->io_size; + iop->zpers_zfs_rd_waittime += + zp->io_dispatched - zp->io_timestamp; + kstat_runq_exit(&iop->zpers_zfs_rwstats); + } else { + iop->zpers_zfs_rwstats.writes++; + iop->zpers_zfs_rwstats.nwritten += zp->io_size; + } + } + mutex_exit(&zpd->zpers_zfs_lock); + + mutex_enter(&zfs_disk_lock); + zfs_disk_rcnt--; + zfs_disk_rtime += (now - zfs_disk_rlastupdate); + zfs_disk_rlastupdate = now; + + if (udelta > zfs_zone_laggard_threshold) + zfs_disk_last_laggard = unow; + + mutex_exit(&zfs_disk_lock); + + if (zfs_zone_delay_enable) { + add_iop(zpd, unow, zp->io_type == ZIO_TYPE_READ ? + ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta); + } + + /* + * sdt:::zfs-zone-latency + * + * arg0: zone ID + * arg1: type of I/O operation + * arg2: I/O latency (in us) + */ + DTRACE_PROBE3(zfs__zone__latency, uintptr_t, zp->io_zoneid, + uintptr_t, zp->io_type, uintptr_t, udelta); +} + +void +zfs_zone_zio_dequeue(zio_t *zp) +{ + zio_priority_t p; + zone_persist_t *zpd = &zone_pdata[zp->io_zoneid]; + zone_zfs_io_t *iop; + + p = zp->io_priority; + if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) + return; + + /* We depend on p being defined as either 0 or 1 */ + ASSERT(p < 2); + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop != NULL) { + ASSERT(iop->zpers_zfs_queued[p] > 0); + if (iop->zpers_zfs_queued[p] == 0) { + cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0"); + } else { + iop->zpers_zfs_queued[p]--; + } + } + mutex_exit(&zpd->zpers_zfs_lock); +} + +void +zfs_zone_zio_enqueue(zio_t *zp) +{ + zio_priority_t p; + zone_persist_t *zpd = &zone_pdata[zp->io_zoneid]; + zone_zfs_io_t *iop; + + p = zp->io_priority; + if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) + return; + + /* We depend on p being defined as either 0 or 1 */ + ASSERT(p < 2); + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if 
(iop != NULL) { + iop->zpers_zfs_queued[p]++; + } + mutex_exit(&zpd->zpers_zfs_lock); +} + +/* + * Called from vdev_queue_io_to_issue. That function is where zio's are listed + * in FIFO order on one of the sync queues, then pulled off (by + * vdev_queue_io_remove) and issued. We potentially do zone-based scheduling + * here to find a zone's zio deeper in the sync queue and issue that instead + * of simply doing FIFO. + * + * We only do zone-based zio scheduling for the two synchronous I/O queues + * (read & write). These queues are normally serviced in FIFO order but we + * may decide to move a zone's zio to the head of the line. A typical I/O + * load will be mostly synchronous reads and some asynchronous writes (which + * are scheduled differently due to transaction groups). There will also be + * some synchronous writes for those apps which want to ensure their data is on + * disk. We want to make sure that a zone with a single-threaded app (e.g. the + * shell) that is doing synchronous I/O (typically reads) isn't penalized by + * other zones which are doing lots of synchronous I/O because they have many + * running threads. + * + * The vq->vq_lock mutex is held when we're executing this function so we + * can safely access the "last zone" variable on the queue. + */ +zio_t * +zfs_zone_schedule(vdev_queue_t *vq, zio_priority_t p, avl_index_t idx, + avl_tree_t *tree) +{ + vdev_queue_class_t *vqc = &vq->vq_class[p]; + uint_t cnt; + zoneid_t last_zone; + zio_t *zio; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + + /* Don't change the order on the LBA ordered queues. */ + if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) + return (avl_nearest(tree, idx, AVL_AFTER)); + + /* We depend on p being defined as either 0 or 1 */ + ASSERT(p < 2); + + cnt = avl_numnodes(tree); + last_zone = vq->vq_last_zone_id; + + /* + * If there are only a few zios in the queue then just issue the head. 
+ * If there are more than a few zios already queued up, then use + * scheduling to get the next zio. + */ + if (!zfs_zone_schedule_enable || cnt < zfs_zone_schedule_thresh) + zio = avl_nearest(tree, idx, AVL_AFTER); + else + zio = get_next_zio(vqc, cnt, p, tree); + + vq->vq_last_zone_id = zio->io_zoneid; + + /* + * Probe with 4 args; the number of IOs in the queue, the zone that + * was last scheduled off this queue, the zone that was associated + * with the next IO that is scheduled, and which queue (priority). + */ + DTRACE_PROBE4(zfs__zone__sched, uint_t, cnt, uint_t, last_zone, + uint_t, zio->io_zoneid, uint_t, p); + + return (zio); +} + +#endif diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c index 450ccb94e5..7a15838338 100644 --- a/usr/src/uts/common/fs/zfs/zil.c +++ b/usr/src/uts/common/fs/zfs/zil.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2019, Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ @@ -106,8 +107,23 @@ boolean_t zil_nocacheflush = B_FALSE; * Limit SLOG write size per commit executed with synchronous priority. * Any writes above that will be executed with lower (asynchronous) priority * to limit potential SLOG device abuse by single active ZIL writer. + * + * The default upstream value for zil_slog_bulk is: + * uint64_t zil_slog_bulk = 768 * 1024; + * For SmartOS, we default to using a high value to essentially disable this + * behavior. + * + * Because the default value of this tunable forces some zil_commit writes down + * to io_priority ZIO_PRIORITY_ASYNC_WRITE, those zio's would be in the same + * zio pipeline queue as all of the async spa_sync zio's. This can lead to + * serious latency problems for the user-level application code because it is + * blocked on completion of the zil_commit. 
We see this when a spa_sync zio is + * running slow (e.g. when metaslab loading takes a long time in the + * zio_dva_allocate pipeline stage), thus delaying all zio's backed up in the + * ZIO_PRIORITY_ASYNC_WRITE queue. For SmartOS, we choose to keep all + * zil_commmit zio's at ZIO_PRIORITY_SYNC_WRITE. */ -uint64_t zil_slog_bulk = 768 * 1024; +uint64_t zil_slog_bulk = 0x100000000ULL; static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; @@ -3079,13 +3095,20 @@ zil_close(zilog_t *zilog) txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg); mutex_exit(&zilog->zl_lock); - /* - * We need to use txg_wait_synced() to wait long enough for the - * ZIL to be clean, and to wait for all pending lwbs to be - * written out. - */ - if (txg != 0) + if (zilog_is_dirty(zilog)) { + /* + * If we're dirty, always wait for the current transaction -- + * our lwb_max_txg may be in the past. + */ + txg_wait_synced(zilog->zl_dmu_pool, 0); + } else if (txg != 0) { + /* + * We need to use txg_wait_synced() to wait long enough for the + * ZIL to be clean, and to wait for all pending lwbs to be + * written out. + */ txg_wait_synced(zilog->zl_dmu_pool, txg); + } if (zilog_is_dirty(zilog)) zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg); diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index f8a98f73f3..b32dffd79c 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2017, Intel Corporation. * Copyright 2020 Joyent, Inc. 
@@ -43,6 +44,7 @@ #include <sys/ddt.h> #include <sys/blkptr.h> #include <sys/zfeature.h> +#include <sys/zfs_zone.h> #include <sys/time.h> #include <sys/dsl_scan.h> #include <sys/metaslab_impl.h> @@ -765,6 +767,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_bookmark = *zb; if (pio != NULL) { + zio->io_zoneid = pio->io_zoneid; if (zio->io_metaslab_class == NULL) zio->io_metaslab_class = pio->io_metaslab_class; if (zio->io_logical == NULL) @@ -772,6 +775,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child(pio, zio); + } else { + zfs_zone_zio_init(zio); } return (zio); @@ -4317,6 +4322,24 @@ zio_done(zio_t *zio) } } + /* + * When we have an error on a slog vdev, we must ensure that the + * zio is not suspended. Suspending the zio will cause dataset deletion + * or an attempt to remove the slog to hang. In both cases, the code + * might be trying to clean up the zil blocks on the slog, but because + * the slog is dead, the suspended zio causes this to hang indefinitely. + * The system properly switches over to using zils on regular storage + * when the slog dies. + * + * This is a reasonable point in the stack to detect that the vdev is + * a slog. The 'no_suspend' flag will propagate up to the logical zio + * via zio_notify_parent. + */ + if (zio->io_error && vd != NULL && vd->vdev_islog && + !vdev_accessible(vd, zio)) { + zio->io_reexecute |= ZIO_REEXECUTE_NO_SUSPEND; + } + if (zio->io_error && zio == lio) { /* * Determine whether zio should be reexecuted. 
This will @@ -4361,7 +4384,7 @@ zio_done(zio_t *zio) */ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); - if ((zio->io_error || zio->io_reexecute) && + if ((zio->io_error || ZIO_SHOULD_REEXECUTE(zio)) && IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) zio_dva_unallocate(zio, zio->io_gang_tree, bp); @@ -4375,7 +4398,7 @@ zio_done(zio_t *zio) (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) zio->io_reexecute = 0; - if (zio->io_reexecute) { + if (ZIO_SHOULD_REEXECUTE(zio)) { /* * This is a logical I/O that wants to reexecute. * @@ -4446,7 +4469,7 @@ zio_done(zio_t *zio) } ASSERT(zio->io_child_count == 0); - ASSERT(zio->io_reexecute == 0); + ASSERT(!ZIO_SHOULD_REEXECUTE(zio)); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); /* diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c index 2bb311d28d..3d2a42aa46 100644 --- a/usr/src/uts/common/fs/zfs/zvol.c +++ b/usr/src/uts/common/fs/zfs/zvol.c @@ -82,6 +82,7 @@ #include <sys/zvol.h> #include <sys/dumphdr.h> #include <sys/zil_impl.h> +#include <sys/sdt.h> #include <sys/dbuf.h> #include <sys/dmu_tx.h> #include <sys/zfeature.h> @@ -140,6 +141,11 @@ typedef struct zvol_state { #define ZVOL_EXCL 0x4 #define ZVOL_WCE 0x8 +#define VOP_LATENCY_10MS 10000000 +#define VOP_LATENCY_100MS 100000000 +#define VOP_LATENCY_1S 1000000000 +#define VOP_LATENCY_10S 10000000000 + /* * zvol maximum transfer in one DMU tx. 
*/ @@ -1342,6 +1348,9 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) zvol_state_t *zv; uint64_t volsize; int error = 0; + zone_t *zonep = curzone; + uint64_t tot_bytes; + hrtime_t start, lat; zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); if (zv == NULL) @@ -1360,6 +1369,14 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) smt_begin_unsafe(); + DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 0); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_runq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + start = gethrtime(); + tot_bytes = 0; + locked_range_t *lr = rangelock_enter(&zv->zv_rangelock, uio->uio_loffset, uio->uio_resid, RL_READER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { @@ -1369,6 +1386,7 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) if (bytes > volsize - uio->uio_loffset) bytes = volsize - uio->uio_loffset; + tot_bytes += bytes; error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes); if (error) { /* convert checksum errors into IO errors */ @@ -1379,6 +1397,38 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) } rangelock_exit(lr); + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.reads++; + zonep->zone_vfs_rwstats.nread += tot_bytes; + kstat_runq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + zone_vfs_kstat_t *zvp; + + zvp = zonep->zone_vfs_stats; + if (lat < VOP_LATENCY_100MS) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + 
atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + + DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 0, int, + error); + smt_end_unsafe(); return (error); @@ -1393,6 +1443,9 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) uint64_t volsize; int error = 0; boolean_t sync; + zone_t *zonep = curzone; + uint64_t tot_bytes; + hrtime_t start, lat; zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); if (zv == NULL) @@ -1411,6 +1464,19 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) smt_begin_unsafe(); + DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 1); + + /* + * For the purposes of VFS kstat consumers, the "waitq" calculation is + * repurposed as the active queue for zvol write operations. There's no + * actual wait queue for zvol operations. + */ + mutex_enter(&zonep->zone_vfs_lock); + kstat_waitq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + start = gethrtime(); + tot_bytes = 0; + sync = !(zv->zv_flags & ZVOL_WCE) || (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); @@ -1424,6 +1490,7 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) if (bytes > volsize - off) /* don't write past the end */ bytes = volsize - off; + tot_bytes += bytes; dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { @@ -1443,8 +1510,40 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); + DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 1, int, + error); + smt_end_unsafe(); + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.writes++; + zonep->zone_vfs_rwstats.nwritten += tot_bytes; + kstat_waitq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + zone_vfs_kstat_t *zvp; + + zvp = zonep->zone_vfs_stats; + if (lat < VOP_LATENCY_100MS) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_1S) { + 
atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + return (error); } |