Diffstat (limited to 'usr/src/uts/common/fs')
80 files changed, 13534 insertions, 896 deletions
diff --git a/usr/src/uts/common/fs/bootfs/bootfs_construct.c b/usr/src/uts/common/fs/bootfs/bootfs_construct.c new file mode 100644 index 0000000000..b909b5d121 --- /dev/null +++ b/usr/src/uts/common/fs/bootfs/bootfs_construct.c @@ -0,0 +1,368 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 Joyent, Inc. All rights reserved. + */ + +/* + * This file takes care of reading the boot time modules and constructing them + * into the appropriate series of vnodes. + */ + +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/vfs.h> +#include <sys/sysmacros.h> +#include <sys/stat.h> + +#include <sys/fs/bootfs_impl.h> + +kmem_cache_t *bootfs_node_cache; + +static const vattr_t bootfs_vattr_dir = { + AT_ALL, /* va_mask */ + VDIR, /* va_type */ + S_IFDIR | 0555, /* va_mode */ + 0, /* va_uid */ + 0, /* va_gid */ + 0, /* va_fsid */ + 0, /* va_nodeid */ + 1, /* va_nlink */ + 0, /* va_size */ + 0, /* va_atime */ + 0, /* va_mtime */ + 0, /* va_ctime */ + 0, /* va_rdev */ + 0, /* va_blksize */ + 0, /* va_nblocks */ + 0 /* va_seq */ +}; + +static const vattr_t bootfs_vattr_reg = { + AT_ALL, /* va_mask */ + VREG, /* va_type */ + S_IFREG | 0555, /* va_mode */ + 0, /* va_uid */ + 0, /* va_gid */ + 0, /* va_fsid */ + 0, /* va_nodeid */ + 1, /* va_nlink */ + 0, /* va_size */ + 0, /* va_atime */ + 0, /* va_mtime */ + 0, /* va_ctime */ + 0, /* va_rdev */ + 0, /* va_blksize */ + 0, /* va_nblocks */ + 0 /* va_seq */ +}; + +/*ARGSUSED*/ +int +bootfs_node_constructor(void *buf, void *arg, int kmflags) +{ + bootfs_node_t *bnp = buf; + + bnp->bvn_vnp = vn_alloc(kmflags); + if (bnp->bvn_vnp == NULL) + return (-1); + + return (0); +} + +/*ARGSUSED*/ +void +bootfs_node_destructor(void *buf, void *arg) +{ + bootfs_node_t *bnp = buf; + + vn_free(bnp->bvn_vnp); +} + +static int +bootfs_comparator(const void *a, const void *b) +{ + const bootfs_node_t *lfs, *rfs; + int ret; + + lfs = a; + rfs = b; + + ret = strcmp(lfs->bvn_name, rfs->bvn_name); + if (ret > 0) + ret = 1; + if (ret < 0) + ret = -1; + return (ret); +} + +static void +bootfs_node_init(bootfs_t *bfs, bootfs_node_t *bnp, const struct vattr *vap, + const char *name, size_t namelen) +{ + timestruc_t now; + + vn_reinit(bnp->bvn_vnp); + + bnp->bvn_vnp->v_flag |= VNOSWAP; + bnp->bvn_vnp->v_type = vap->va_type; + bnp->bvn_vnp->v_vfsp = bfs->bfs_vfsp; + bnp->bvn_vnp->v_rdev = 0; + bnp->bvn_vnp->v_data = (caddr_t)bnp; + vn_setops(bnp->bvn_vnp, bootfs_vnodeops); + + bnp->bvn_name = kmem_alloc(namelen + 1, KM_SLEEP); + bcopy(name, bnp->bvn_name, namelen); + bnp->bvn_name[namelen] = '\0'; + if (vap->va_type == VDIR) { + avl_create(&bnp->bvn_dir, bootfs_comparator, + sizeof (bootfs_node_t), + offsetof(bootfs_node_t, bvn_link)); + } + bzero(&bnp->bvn_link, sizeof (avl_node_t)); + bcopy(vap, &bnp->bvn_attr, sizeof (vattr_t)); + + gethrestime(&now); + bnp->bvn_attr.va_atime = now; + bnp->bvn_attr.va_ctime = now; + bnp->bvn_attr.va_mtime = now; + bnp->bvn_attr.va_fsid = makedevice(bootfs_major, bfs->bfs_minor); + bnp->bvn_attr.va_nodeid = bfs->bfs_ninode; + bnp->bvn_attr.va_blksize = PAGESIZE; + bfs->bfs_ninode++; + list_insert_tail(&bfs->bfs_nodes, bnp); 
+} + +static void +bootfs_mkroot(bootfs_t *bfs) +{ + bootfs_node_t *bnp; + + bnp = kmem_cache_alloc(bootfs_node_cache, KM_SLEEP); + bootfs_node_init(bfs, bnp, &bootfs_vattr_dir, "/", 1); + bnp->bvn_vnp->v_flag |= VROOT; + bnp->bvn_parent = bnp; + bfs->bfs_rootvn = bnp; + bfs->bfs_stat.bfss_ndirs.value.ui32++; + vn_exists(bnp->bvn_vnp); +} + +static int +bootfs_mknode(bootfs_t *bfs, bootfs_node_t *parent, bootfs_node_t **outp, + const char *name, size_t namelen, const vattr_t *vap, uintptr_t addr, + uint64_t size) +{ + bootfs_node_t *bnp; + bootfs_node_t sn; + avl_index_t where; + char *buf; + + ASSERT(parent->bvn_attr.va_type == VDIR); + buf = kmem_alloc(namelen + 1, KM_SLEEP); + bcopy(name, buf, namelen); + buf[namelen] = '\0'; + sn.bvn_name = buf; + if ((bnp = avl_find(&parent->bvn_dir, &sn, &where)) != NULL) { + kmem_free(buf, namelen + 1); + /* Directories can collide, files cannot */ + if (vap->va_type == VDIR) { + *outp = bnp; + return (0); + } + return (EEXIST); + } + kmem_free(buf, namelen + 1); + + bnp = kmem_cache_alloc(bootfs_node_cache, KM_SLEEP); + bootfs_node_init(bfs, bnp, vap, name, namelen); + bnp->bvn_parent = parent; + avl_add(&parent->bvn_dir, bnp); + *outp = bnp; + + if (vap->va_type == VDIR) { + parent->bvn_attr.va_size++; + parent->bvn_attr.va_nlink++; + bfs->bfs_stat.bfss_ndirs.value.ui32++; + } else { + bnp->bvn_addr = addr; + bnp->bvn_size = size; + bfs->bfs_stat.bfss_nfiles.value.ui32++; + bfs->bfs_stat.bfss_nbytes.value.ui64 += size; + bnp->bvn_attr.va_nblocks = P2ROUNDUP(size, 512) >> 9; + bnp->bvn_attr.va_size = size; + } + + vn_exists(bnp->bvn_vnp); + + return (0); +} + +/* + * Given the address, size, and path a boot-time module would like, go through + * and create all of the directory entries that are required and then the file + * itself. If someone has passed in a module that has the same name as another + * one, we honor the first one. + */ +static int +bootfs_construct_entry(bootfs_t *bfs, uintptr_t addr, uint64_t size, + const char *mname) +{ + char *sp; + size_t nlen; + int ret; + bootfs_node_t *nbnp; + + const char *p = mname; + bootfs_node_t *bnp = bfs->bfs_rootvn; + + if (*p == '\0') + return (EINVAL); + + for (;;) { + /* First eliminate all leading / characters. */ + while (*p == '/') + p++; + + /* A name with all slashes or ending in a / */ + if (*p == '\0') + return (EINVAL); + + sp = strchr(p, '/'); + if (sp == NULL) + break; + nlen = (ptrdiff_t)sp - (ptrdiff_t)p; + if (strncmp(p, ".", nlen) == 0) { + p = sp + 1; + continue; + } + + if (strncmp(p, "..", nlen) == 0) { + bnp = bnp->bvn_parent; + p = sp + 1; + continue; + } + + VERIFY(bootfs_mknode(bfs, bnp, &nbnp, p, nlen, + &bootfs_vattr_dir, addr, size) == 0); + p = sp + 1; + bnp = nbnp; + } + + nlen = strlen(p); + ret = bootfs_mknode(bfs, bnp, &nbnp, p, nlen, &bootfs_vattr_reg, + addr, size); + if (ret != 0) + return (ret); + + return (0); +} + +/* + * We're going to go through every boot time module and construct the + * appropriate vnodes for them now. Because there are very few of these that + * exist, generally on the order of a handful, we're going to create them all + * when the file system is initialized and then tear them all down when the + * module gets unloaded. + * + * The information about the modules is contained in properties on the root of + * the devinfo tree. Specifically there are three properties per module: + * + * - module-size-%d int64_t size, in bytes, of the boot time module. 
+ * - module-addr-%d The address of the boot time module + * - module-name-%d The string name of the boot time module + * + * Note that the module-size and module-addr fields are always 64-bit values + * regardless of being on a 32-bit or 64-bit kernel. module-name is a string + * property. + * + * There is no property that indicates the total number of such modules. Modules + * start at 0 and work their way up incrementally. The first time we can't find + * a module or a property, then we stop. + */ +void +bootfs_construct(bootfs_t *bfs) +{ + uint_t id = 0, ndata; + char paddr[64], psize[64], pname[64], *mname; + dev_info_t *root; + uchar_t *datap; + uint64_t size = 0, addr = 0; + int ret; + + bootfs_mkroot(bfs); + root = ddi_root_node(); + + for (;;) { + if (id == UINT32_MAX) + break; + + if (snprintf(paddr, sizeof (paddr), "module-addr-%d", id) > + sizeof (paddr)) + break; + + if (snprintf(psize, sizeof (paddr), "module-size-%d", id) > + sizeof (paddr)) + break; + + if (snprintf(pname, sizeof (paddr), "module-name-%d", id) > + sizeof (paddr)) + break; + + if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, root, + DDI_PROP_DONTPASS, paddr, &datap, &ndata) != + DDI_PROP_SUCCESS) + break; + + if (ndata == 8) + bcopy(datap, &addr, sizeof (uint64_t)); + ddi_prop_free(datap); + if (ndata != 8) + break; + + if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, root, + DDI_PROP_DONTPASS, psize, &datap, &ndata) != + DDI_PROP_SUCCESS) + break; + if (ndata == 8) + bcopy(datap, &size, sizeof (uint64_t)); + ddi_prop_free(datap); + if (ndata != 8) + break; + + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, root, + DDI_PROP_DONTPASS, pname, &mname) != DDI_PROP_SUCCESS) + break; + + ret = bootfs_construct_entry(bfs, addr, size, mname); + if (ret == EINVAL) + bfs->bfs_stat.bfss_ndiscards.value.ui32++; + if (ret == EEXIST) + bfs->bfs_stat.bfss_ndups.value.ui32++; + ddi_prop_free(mname); + + id++; + } +} + +void +bootfs_destruct(bootfs_t *bfs) +{ + bootfs_node_t *bnp; + + while ((bnp = list_remove_head(&bfs->bfs_nodes)) != NULL) { + ASSERT(bnp->bvn_vnp->v_count == 1); + VN_RELE(bnp->bvn_vnp); + kmem_free(bnp->bvn_name, strlen(bnp->bvn_name) + 1); + kmem_cache_free(bootfs_node_cache, bnp); + } +} diff --git a/usr/src/uts/common/fs/bootfs/bootfs_vfsops.c b/usr/src/uts/common/fs/bootfs/bootfs_vfsops.c new file mode 100644 index 0000000000..e642e86169 --- /dev/null +++ b/usr/src/uts/common/fs/bootfs/bootfs_vfsops.c @@ -0,0 +1,321 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 Joyent, Inc. + */ + +#include <sys/errno.h> +#include <sys/modctl.h> +#include <sys/types.h> +#include <sys/mkdev.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/systm.h> +#include <sys/id_space.h> +#include <sys/cmn_err.h> +#include <sys/ksynch.h> +#include <sys/policy.h> +#include <sys/mount.h> +#include <sys/sysmacros.h> + +#include <sys/fs/bootfs_impl.h> + +/* + * While booting, additional types of modules and files can be passed in to the + * loader. These include the familiar boot archive, as well as, a module hash + * and additional modules that are interpreted as files. 
As part of the handoff + * in early boot, information about these modules are saved as properties on the + * root of the devinfo tree, similar to other boot-time properties. + * + * This file system provides a read-only view of those additional files. Due to + * its limited scope, it has a slightly simpler construction than several other + * file systems. When mounted, it looks for the corresponding properties and + * creates bootfs_node_t's and vnodes for all of the corresponding files and + * directories that exist along the way. At this time, there are currently a + * rather small number of files passed in this way. + * + * This does lead to one behavior that folks used to other file systems might + * find peculiar. Because we are not always actively creating and destroying the + * required vnodes on demand, the count on the root vnode will not be going up + * accordingly with the existence of other vnodes. This means that a bootfs file + * system that is not in use will have all of its vnodes exist with a v_count of + * one. + */ + +major_t bootfs_major; +static int bootfs_fstype; +static id_space_t *bootfs_idspace; +static uint64_t bootfs_nactive; +static kmutex_t bootfs_lock; + +static const char *bootfs_name = "bootfs"; + +static int +bootfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + int ret; + bootfs_t *bfs; + struct pathname dpn; + dev_t fsdev; + + if ((ret = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) + return (ret); + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (uap->flags & MS_REMOUNT) + return (EBUSY); + + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* + * We indicate that the backing store is bootfs. We don't want to use + * swap, because folks might think that this is putting all the data + * into memory ala tmpfs. Rather these modules are always in memory and + * there's nothing to be done about that. + */ + vfs_setresource(vfsp, bootfs_name, 0); + bfs = kmem_zalloc(sizeof (bootfs_t), KM_NOSLEEP | KM_NORMALPRI); + if (bfs == NULL) + return (ENOMEM); + + ret = pn_get(uap->dir, + (uap->flags & MS_SYSSPACE) ? 
UIO_SYSSPACE : UIO_USERSPACE, &dpn); + if (ret != 0) { + kmem_free(bfs, sizeof (bfs)); + return (ret); + } + + bfs->bfs_minor = id_alloc(bootfs_idspace); + bfs->bfs_kstat = kstat_create_zone("bootfs", bfs->bfs_minor, "bootfs", + "fs", KSTAT_TYPE_NAMED, + sizeof (bootfs_stat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID); + if (bfs->bfs_kstat == NULL) { + id_free(bootfs_idspace, bfs->bfs_minor); + pn_free(&dpn); + kmem_free(bfs, sizeof (bfs)); + return (ENOMEM); + } + bfs->bfs_kstat->ks_data = &bfs->bfs_stat; + + fsdev = makedevice(bootfs_major, bfs->bfs_minor); + bfs->bfs_vfsp = vfsp; + + vfsp->vfs_data = (caddr_t)bfs; + vfsp->vfs_fstype = bootfs_fstype; + vfsp->vfs_dev = fsdev; + vfsp->vfs_bsize = PAGESIZE; + vfsp->vfs_flag |= VFS_RDONLY | VFS_NOSETUID | VFS_NOTRUNC | + VFS_UNLINKABLE; + vfs_make_fsid(&vfsp->vfs_fsid, fsdev, bootfs_fstype); + bfs->bfs_mntpath = kmem_alloc(dpn.pn_pathlen + 1, KM_SLEEP); + bcopy(dpn.pn_path, bfs->bfs_mntpath, dpn.pn_pathlen); + bfs->bfs_mntpath[dpn.pn_pathlen] = '\0'; + pn_free(&dpn); + list_create(&bfs->bfs_nodes, sizeof (bootfs_node_t), + offsetof(bootfs_node_t, bvn_alink)); + + kstat_named_init(&bfs->bfs_stat.bfss_nfiles, "nfiles", + KSTAT_DATA_UINT32); + kstat_named_init(&bfs->bfs_stat.bfss_ndirs, "ndirs", + KSTAT_DATA_UINT32); + kstat_named_init(&bfs->bfs_stat.bfss_nbytes, "nbytes", + KSTAT_DATA_UINT64); + kstat_named_init(&bfs->bfs_stat.bfss_ndups, "ndup", + KSTAT_DATA_UINT32); + kstat_named_init(&bfs->bfs_stat.bfss_ndiscards, "ndiscard", + KSTAT_DATA_UINT32); + + bootfs_construct(bfs); + + kstat_install(bfs->bfs_kstat); + + return (0); +} + +static int +bootfs_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + int ret; + bootfs_t *bfs = vfsp->vfs_data; + bootfs_node_t *bnp; + + if ((ret = secpolicy_fs_unmount(cr, vfsp)) != 0) + return (ret); + + if (flag & MS_FORCE) + return (ENOTSUP); + + for (bnp = list_head(&bfs->bfs_nodes); bnp != NULL; + bnp = list_next(&bfs->bfs_nodes, bnp)) { + mutex_enter(&bnp->bvn_vnp->v_lock); + if (bnp->bvn_vnp->v_count > 1) { + mutex_exit(&bnp->bvn_vnp->v_lock); + return (EBUSY); + } + mutex_exit(&bnp->bvn_vnp->v_lock); + } + + kstat_delete(bfs->bfs_kstat); + bootfs_destruct(bfs); + list_destroy(&bfs->bfs_nodes); + kmem_free(bfs->bfs_mntpath, strlen(bfs->bfs_mntpath) + 1); + id_free(bootfs_idspace, bfs->bfs_minor); + kmem_free(bfs, sizeof (bootfs_t)); + return (0); +} + +static int +bootfs_root(vfs_t *vfsp, vnode_t **vpp) +{ + bootfs_t *bfs; + + bfs = (bootfs_t *)vfsp->vfs_data; + *vpp = bfs->bfs_rootvn->bvn_vnp; + VN_HOLD(*vpp) + + return (0); +} + +static int +bootfs_statvfs(vfs_t *vfsp, struct statvfs64 *sbp) +{ + const bootfs_t *bfs = (bootfs_t *)vfsp; + dev32_t d32; + + sbp->f_bsize = PAGESIZE; + sbp->f_frsize = PAGESIZE; + + sbp->f_blocks = bfs->bfs_stat.bfss_nbytes.value.ui64 >> PAGESHIFT; + sbp->f_bfree = 0; + sbp->f_bavail = 0; + + sbp->f_files = bfs->bfs_stat.bfss_nfiles.value.ui32 + + bfs->bfs_stat.bfss_ndirs.value.ui32; + sbp->f_ffree = 0; + sbp->f_favail = 0; + + (void) cmpldev(&d32, vfsp->vfs_dev); + sbp->f_fsid = d32; + (void) strlcpy(sbp->f_basetype, bootfs_name, FSTYPSZ); + bzero(sbp->f_fstr, sizeof (sbp->f_fstr)); + + return (0); +} + +static const fs_operation_def_t bootfs_vfsops_tmpl[] = { + VFSNAME_MOUNT, { .vfs_mount = bootfs_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = bootfs_unmount }, + VFSNAME_ROOT, { .vfs_root = bootfs_root }, + VFSNAME_STATVFS, { .vfs_statvfs = bootfs_statvfs }, + NULL, NULL +}; + +static int +bootfs_init(int fstype, char *name) +{ + int ret; + + bootfs_fstype = 
fstype; + ASSERT(bootfs_fstype != 0); + + ret = vfs_setfsops(fstype, bootfs_vfsops_tmpl, NULL); + if (ret != 0) + return (ret); + + ret = vn_make_ops(name, bootfs_vnodeops_template, &bootfs_vnodeops); + if (ret != 0) { + (void) vfs_freevfsops_by_type(bootfs_fstype); + return (ret); + } + + bootfs_major = getudev(); + if (bootfs_major == (major_t)-1) { + cmn_err(CE_WARN, "bootfs_init: Can't get unique device number"); + bootfs_major = 0; + } + + bootfs_nactive = 0; + return (0); +} + +static mntopts_t bootfs_mntopts = { + 0, NULL +}; + +static vfsdef_t bootfs_vfsdef = { + VFSDEF_VERSION, + "bootfs", + bootfs_init, + VSW_HASPROTO|VSW_STATS, + &bootfs_mntopts +}; + +static struct modlfs bootfs_modlfs = { + &mod_fsops, "boot-time modules file system", &bootfs_vfsdef +}; + +static struct modlinkage bootfs_modlinkage = { + MODREV_1, &bootfs_modlfs, NULL +}; + +int +_init(void) +{ + bootfs_node_cache = kmem_cache_create("bootfs_node_cache", + sizeof (bootfs_node_t), 0, bootfs_node_constructor, + bootfs_node_destructor, NULL, NULL, NULL, 0); + bootfs_idspace = id_space_create("bootfs_minors", 1, INT32_MAX); + mutex_init(&bootfs_lock, NULL, MUTEX_DEFAULT, NULL); + + return (mod_install(&bootfs_modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&bootfs_modlinkage, modinfop)); +} + +int +_fini(void) +{ + int err; + + mutex_enter(&bootfs_lock); + if (bootfs_nactive > 0) { + mutex_exit(&bootfs_lock); + return (EBUSY); + } + mutex_exit(&bootfs_lock); + + err = mod_remove(&bootfs_modlinkage); + if (err != 0) + return (err); + + (void) vfs_freevfsops_by_type(bootfs_fstype); + vn_freevnodeops(bootfs_vnodeops); + id_space_destroy(bootfs_idspace); + mutex_destroy(&bootfs_lock); + kmem_cache_destroy(bootfs_node_cache); + return (err); +} diff --git a/usr/src/uts/common/fs/bootfs/bootfs_vnops.c b/usr/src/uts/common/fs/bootfs/bootfs_vnops.c new file mode 100644 index 0000000000..f63d0a4f24 --- /dev/null +++ b/usr/src/uts/common/fs/bootfs/bootfs_vnops.c @@ -0,0 +1,544 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc. All rights reserved. 
+ */ + +/* + * bootfs vnode operations + */ + +#include <sys/types.h> +#include <sys/uio.h> +#include <sys/sunddi.h> +#include <sys/errno.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/mman.h> +#include <fs/fs_subr.h> +#include <sys/policy.h> +#include <sys/sysmacros.h> +#include <sys/dirent.h> +#include <sys/uio.h> +#include <vm/pvn.h> +#include <vm/hat.h> +#include <vm/seg_map.h> +#include <vm/seg_vn.h> +#include <sys/vmsystm.h> + +#include <sys/fs/bootfs_impl.h> + +struct vnodeops *bootfs_vnodeops; + +/*ARGSUSED*/ +static int +bootfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + return (0); +} + +/*ARGSUSED*/ +static int +bootfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + return (0); +} + +/*ARGSUSED*/ +static int +bootfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + int err; + ssize_t sres = uiop->uio_resid; + bootfs_node_t *bnp = vp->v_data; + + if (vp->v_type == VDIR) + return (EISDIR); + + if (vp->v_type != VREG) + return (EINVAL); + + if (uiop->uio_loffset < 0) + return (EINVAL); + + if (uiop->uio_loffset >= bnp->bvn_size) + return (0); + + err = 0; + while (uiop->uio_resid != 0) { + caddr_t base; + long offset, frem; + ulong_t poff, segoff; + size_t bytes; + int relerr; + + offset = uiop->uio_loffset; + poff = offset & PAGEOFFSET; + bytes = MIN(PAGESIZE - poff, uiop->uio_resid); + + frem = bnp->bvn_size - offset; + if (frem <= 0) { + err = 0; + break; + } + + /* Don't read past EOF */ + bytes = MIN(bytes, frem); + + /* + * Segmaps are likely larger than our page size, so make sure we + * have the proper offfset into the resulting segmap data. + */ + segoff = (offset & PAGEMASK) & MAXBOFFSET; + + base = segmap_getmapflt(segkmap, vp, offset & MAXBMASK, bytes, + 1, S_READ); + + err = uiomove(base + segoff + poff, bytes, UIO_READ, uiop); + relerr = segmap_release(segkmap, base, 0); + + if (err == 0) + err = relerr; + + if (err != 0) + break; + } + + /* Even if we had an error in a partial read, return success */ + if (uiop->uio_resid > sres) + err = 0; + + gethrestime(&bnp->bvn_attr.va_atime); + + return (err); +} + +/*ARGSUSED*/ +static int +bootfs_ioctl(vnode_t *vp, int cmd, intptr_t data, int flag, + cred_t *cr, int *rvalp, caller_context_t *ct) +{ + return (ENOTTY); +} + +/*ARGSUSED*/ +static int +bootfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + uint32_t mask; + bootfs_node_t *bpn = (bootfs_node_t *)vp->v_data; + + mask = vap->va_mask; + bcopy(&bpn->bvn_attr, vap, sizeof (vattr_t)); + vap->va_mask = mask; + return (0); +} + +/*ARGSUSED*/ +static int +bootfs_access(vnode_t *vp, int mode, int flags, cred_t *cr, + caller_context_t *ct) +{ + int shift = 0; + bootfs_node_t *bpn = (bootfs_node_t *)vp->v_data; + + if (crgetuid(cr) != bpn->bvn_attr.va_uid) { + shift += 3; + if (groupmember(bpn->bvn_attr.va_gid, cr) == 0) + shift += 3; + } + + return (secpolicy_vnode_access2(cr, vp, bpn->bvn_attr.va_uid, + bpn->bvn_attr.va_mode << shift, mode)); +} + +/*ARGSUSED*/ +static int +bootfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + avl_index_t where; + bootfs_node_t sn, *bnp; + bootfs_node_t *bpp = (bootfs_node_t *)dvp->v_data; + + if (flags & LOOKUP_XATTR) + return (EINVAL); + + if (bpp->bvn_attr.va_type != VDIR) + return (ENOTDIR); + + if (*nm == '\0' || strcmp(nm, ".") == 0) { 
+ VN_HOLD(dvp); + *vpp = dvp; + return (0); + } + + if (strcmp(nm, "..") == 0) { + VN_HOLD(bpp->bvn_parent->bvn_vnp); + *vpp = bpp->bvn_parent->bvn_vnp; + return (0); + } + + sn.bvn_name = nm; + bnp = avl_find(&bpp->bvn_dir, &sn, &where); + if (bnp == NULL) + return (ENOENT); + + VN_HOLD(bnp->bvn_vnp); + *vpp = bnp->bvn_vnp; + return (0); +} + +/*ARGSUSED*/ +static int +bootfs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + bootfs_node_t *bnp = (bootfs_node_t *)vp->v_data; + dirent64_t *dp; + void *buf; + ulong_t bsize, brem; + offset_t coff, roff; + int dlen, ret; + bootfs_node_t *dnp; + boolean_t first = B_TRUE; + + if (uiop->uio_loffset >= MAXOFF_T) { + if (eofp != NULL) + *eofp = 1; + return (0); + } + + if (uiop->uio_iovcnt != 1) + return (EINVAL); + + if (!(uiop->uio_iov->iov_len > 0)) + return (EINVAL); + + if (vp->v_type != VDIR) + return (ENOTDIR); + + roff = uiop->uio_loffset; + coff = 0; + brem = bsize = uiop->uio_iov->iov_len; + buf = kmem_alloc(bsize, KM_SLEEP); + dp = buf; + + /* + * Recall that offsets here are done based on the name of the dirent + * excluding the null terminator. Therefore `.` is always at 0, `..` is + * always at 1, and then the first real dirent is at 3. This offset is + * what's actually stored when we update the offset in the structure. + */ + if (roff == 0) { + dlen = DIRENT64_RECLEN(1); + if (first == B_TRUE) { + if (dlen > brem) { + kmem_free(buf, bsize); + return (EINVAL); + } + first = B_FALSE; + } + dp->d_ino = (ino64_t)bnp->bvn_attr.va_nodeid; + dp->d_off = 0; + dp->d_reclen = (ushort_t)dlen; + (void) strncpy(dp->d_name, ".", DIRENT64_NAMELEN(dlen)); + dp = (struct dirent64 *)((uintptr_t)dp + dp->d_reclen); + brem -= dlen; + } + + if (roff <= 1) { + dlen = DIRENT64_RECLEN(2); + if (first == B_TRUE) { + if (dlen > brem) { + kmem_free(buf, bsize); + return (EINVAL); + } + first = B_FALSE; + } + dp->d_ino = (ino64_t)bnp->bvn_parent->bvn_attr.va_nodeid; + dp->d_off = 1; + dp->d_reclen = (ushort_t)dlen; + (void) strncpy(dp->d_name, "..", DIRENT64_NAMELEN(dlen)); + dp = (struct dirent64 *)((uintptr_t)dp + dp->d_reclen); + brem -= dlen; + } + + coff = 3; + for (dnp = avl_first(&bnp->bvn_dir); dnp != NULL; + dnp = AVL_NEXT(&bnp->bvn_dir, dnp)) { + size_t nlen = strlen(dnp->bvn_name); + + if (roff > coff) { + coff += nlen; + continue; + } + + dlen = DIRENT64_RECLEN(nlen); + if (dlen > brem) { + if (first == B_TRUE) { + kmem_free(buf, bsize); + return (EINVAL); + } + break; + } + first = B_FALSE; + + dp->d_ino = (ino64_t)dnp->bvn_attr.va_nodeid; + dp->d_off = coff; + dp->d_reclen = (ushort_t)dlen; + (void) strncpy(dp->d_name, dnp->bvn_name, + DIRENT64_NAMELEN(dlen)); + dp = (struct dirent64 *)((uintptr_t)dp + dp->d_reclen); + brem -= dlen; + coff += nlen; + } + + ret = uiomove(buf, (bsize - brem), UIO_READ, uiop); + + if (ret == 0) { + if (dnp == NULL) { + coff++; + if (eofp != NULL) + *eofp = 1; + } else if (eofp != NULL) { + *eofp = 0; + } + uiop->uio_loffset = coff; + } + gethrestime(&bnp->bvn_attr.va_atime); + kmem_free(buf, bsize); + return (ret); +} + +/*ARGSUSED*/ +static void +bootfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ +} + +/*ARGSUSED*/ +static int +bootfs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + if (write_lock != 0) + return (EINVAL); + return (0); +} + +/*ARGSUSED*/ +static void +bootfs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ +} + +/*ARGSUSED*/ +static int +bootfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, + 
caller_context_t *ct) +{ + bootfs_node_t *bnp = (bootfs_node_t *)vp->v_data; + if (vp->v_type == VDIR) + return (0); + return ((*noffp < 0 || *noffp > bnp->bvn_size ? EINVAL : 0)); +} + +/* + * We need to fill in a single page of a vnode's memory based on the actual data + * from the kernel. We'll use this node's sliding window into physical memory + * and update one page at a time. + */ +/*ARGSUSED*/ +static int +bootfs_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp, + page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw, + cred_t *cr) +{ + bootfs_node_t *bnp = vp->v_data; + page_t *pp, *fpp; + pfn_t pfn; + + for (;;) { + /* Easy case where the page exists */ + pp = page_lookup(vp, off, rw == S_CREATE ? SE_EXCL : SE_SHARED); + if (pp != NULL) { + if (pl != NULL) { + pl[0] = pp; + pl[1] = NULL; + } else { + page_unlock(pp); + } + return (0); + } + + pp = page_create_va(vp, off, PAGESIZE, PG_EXCL | PG_WAIT, seg, + addr); + + /* + * If we didn't get the page, that means someone else beat us to + * creating this so we need to try again. + */ + if (pp != NULL) + break; + } + + pfn = btop((bnp->bvn_addr + off) & PAGEMASK); + fpp = page_numtopp_nolock(pfn); + + if (ppcopy(fpp, pp) == 0) { + pvn_read_done(pp, B_ERROR); + return (EIO); + } + + if (pl != NULL) { + pvn_plist_init(pp, pl, plsz, off, PAGESIZE, rw); + } else { + pvn_io_done(pp); + } + + return (0); +} + +/*ARGSUSED*/ +static int +bootfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, + page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw, + cred_t *cr, caller_context_t *ct) +{ + int err; + bootfs_node_t *bnp = vp->v_data; + + if (off + len > bnp->bvn_size + PAGEOFFSET) + return (EFAULT); + + if (len <= PAGESIZE) + err = bootfs_getapage(vp, (u_offset_t)off, len, protp, pl, + plsz, seg, addr, rw, cr); + else + err = pvn_getpages(bootfs_getapage, vp, (u_offset_t)off, len, + protp, pl, plsz, seg, addr, rw, cr); + + return (err); +} + +/*ARGSUSED*/ +static int +bootfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + int ret; + segvn_crargs_t vn_a; + +#ifdef _ILP32 + if (len > MAXOFF_T) + return (ENOMEM); +#endif + + if (vp->v_flag & VNOMAP) + return (ENOSYS); + + if (off < 0 || off > MAXOFFSET_T - off) + return (ENXIO); + + if (vp->v_type != VREG) + return (ENODEV); + + if (prot & PROT_WRITE) + return (ENOTSUP); + + as_rangelock(as); + ret = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); + if (ret != 0) { + as_rangeunlock(as); + return (ret); + } + + vn_a.vp = vp; + vn_a.offset = (u_offset_t)off; + vn_a.type = flags & MAP_TYPE; + vn_a.prot = prot; + vn_a.maxprot = maxprot; + vn_a.cred = cr; + vn_a.amp = NULL; + vn_a.flags = flags & ~MAP_TYPE; + vn_a.szc = 0; + vn_a.lgrp_mem_policy_flags = 0; + + ret = as_map(as, *addrp, len, segvn_create, &vn_a); + + as_rangeunlock(as); + return (ret); + +} + +/*ARGSUSED*/ +static int +bootfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + return (0); +} + +/*ARGSUSED*/ +static int +bootfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + return (0); +} + +static int +bootfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) +{ + int ret; + + switch (cmd) { + case 
_PC_TIMESTAMP_RESOLUTION: + *valp = 1L; + ret = 0; + break; + default: + ret = fs_pathconf(vp, cmd, valp, cr, ct); + } + + return (ret); +} + +const fs_operation_def_t bootfs_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = bootfs_open }, + VOPNAME_CLOSE, { .vop_close = bootfs_close }, + VOPNAME_READ, { .vop_read = bootfs_read }, + VOPNAME_IOCTL, { .vop_ioctl = bootfs_ioctl }, + VOPNAME_GETATTR, { .vop_getattr = bootfs_getattr }, + VOPNAME_ACCESS, { .vop_access = bootfs_access }, + VOPNAME_LOOKUP, { .vop_lookup = bootfs_lookup }, + VOPNAME_READDIR, { .vop_readdir = bootfs_readdir }, + VOPNAME_INACTIVE, { .vop_inactive = bootfs_inactive }, + VOPNAME_RWLOCK, { .vop_rwlock = bootfs_rwlock }, + VOPNAME_RWUNLOCK, { .vop_rwunlock = bootfs_rwunlock }, + VOPNAME_SEEK, { .vop_seek = bootfs_seek }, + VOPNAME_GETPAGE, { .vop_getpage = bootfs_getpage }, + VOPNAME_MAP, { .vop_map = bootfs_map }, + VOPNAME_ADDMAP, { .vop_addmap = bootfs_addmap }, + VOPNAME_DELMAP, { .vop_delmap = bootfs_delmap }, + VOPNAME_PATHCONF, { .vop_pathconf = bootfs_pathconf }, + VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_nosupport }, + NULL, NULL +}; diff --git a/usr/src/uts/common/fs/dev/sdev_netops.c b/usr/src/uts/common/fs/dev/sdev_netops.c index 4eaf38f484..41441ec52d 100644 --- a/usr/src/uts/common/fs/dev/sdev_netops.c +++ b/usr/src/uts/common/fs/dev/sdev_netops.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ /* @@ -41,8 +42,102 @@ #include <sys/zone.h> #include <sys/dls.h> +static const char *devnet_zpath = "/dev/net/zone/"; struct vnodeops *devnet_vnodeops; +static zoneid_t +devnet_nodetozone(sdev_node_t *dv) +{ + char *zname = NULL, *dup; + zone_t *zone; + int duplen; + zoneid_t zid; + + /* + * If in a non-global zone, always return it's zid no matter what the + * node is. + */ + zid = getzoneid(); + if (zid != GLOBAL_ZONEID) + return (zid); + + /* + * If it doesn't have /dev/net/zone/ then it can't be a specific zone + * we're targetting. + */ + if (strncmp(devnet_zpath, dv->sdev_path, strlen(devnet_zpath)) != 0) + return (GLOBAL_ZONEID); + + if (dv->sdev_vnode->v_type == VDIR) { + zone = zone_find_by_name(dv->sdev_name); + } else { + /* Non directories have the form /dev/net/zone/%z/%s */ + dup = strdup(dv->sdev_path); + duplen = strlen(dup); + zname = strrchr(dup, '/'); + *zname = '\0'; + zname--; + zname = strrchr(dup, '/'); + zname++; + zone = zone_find_by_name(zname); + kmem_free(dup, duplen + 1); + } + if (zone == NULL) + return (GLOBAL_ZONEID); + zid = zone->zone_id; + zone_rele(zone); + return (zid); +} + +static int +devnet_mkdir(struct sdev_node *ddv, char *name) +{ + sdev_node_t *dv; + struct vattr va; + int ret; + + ASSERT(RW_WRITE_HELD(&ddv->sdev_contents)); + dv = sdev_cache_lookup(ddv, name); + if (dv != NULL) { + SDEV_SIMPLE_RELE(dv); + return (EEXIST); + } + + va = *sdev_getdefault_attr(VDIR); + gethrestime(&va.va_atime); + va.va_mtime = va.va_atime; + va.va_ctime = va.va_atime; + + ret = sdev_mknode(ddv, name, &dv, &va, NULL, NULL, kcred, SDEV_READY); + if (ret != 0) + return (ret); + SDEV_SIMPLE_RELE(dv); + return (0); +} + +/* + * We basically need to walk down the directory path to determine what we should + * do. At the top level of /dev/net, only the directory /dev/net/zone is valid, + * and it is always valid. Following on that, /dev/net/zone/%zonename is valid + * if and only if we can look up that zone name. 
If it's not, or it's some other + * name, then it's SDEV_VTOR_INVALID. + */ +static int +devnet_dirvalidate(struct sdev_node *dv) +{ + zone_t *zonep; + char *path = "/dev/net/zone"; + + if (strcmp(path, dv->sdev_path) == 0) + return (SDEV_VTOR_VALID); + + zonep = zone_find_by_name(dv->sdev_name); + if (zonep == NULL) + return (SDEV_VTOR_INVALID); + zone_rele(zonep); + return (SDEV_VTOR_VALID); +} + /* * Check if a net sdev_node is still valid - i.e. it represents a current * network link. @@ -60,11 +155,20 @@ devnet_validate(struct sdev_node *dv) ASSERT(dv->sdev_state == SDEV_READY); - if (dls_mgmt_get_linkid(dv->sdev_name, &linkid) != 0) + if (dv->sdev_vnode->v_type == VDIR) + return (devnet_dirvalidate(dv)); + + if (strncmp(devnet_zpath, dv->sdev_path, strlen(devnet_zpath)) == 0) { + ASSERT(SDEV_IS_GLOBAL(dv)); + zoneid = devnet_nodetozone(dv); + } else { + zoneid = getzoneid(); + } + + if (dls_mgmt_get_linkid_in_zone(dv->sdev_name, &linkid, zoneid) != 0) return (SDEV_VTOR_INVALID); - if (SDEV_IS_GLOBAL(dv)) + if (zoneid == GLOBAL_ZONEID) return (SDEV_VTOR_VALID); - zoneid = getzoneid(); return (zone_check_datalink(&zoneid, linkid) == 0 ? SDEV_VTOR_VALID : SDEV_VTOR_INVALID); } @@ -74,13 +178,14 @@ devnet_validate(struct sdev_node *dv) * a net entry when the node is not found in the cache. */ static int -devnet_create_rvp(const char *nm, struct vattr *vap, dls_dl_handle_t *ddhp) +devnet_create_rvp(const char *nm, struct vattr *vap, dls_dl_handle_t *ddhp, + zoneid_t zid) { timestruc_t now; dev_t dev; int error; - if ((error = dls_devnet_open(nm, ddhp, &dev)) != 0) { + if ((error = dls_devnet_open_in_zone(nm, ddhp, &dev, zid)) != 0) { sdcmn_err12(("devnet_create_rvp: not a valid vanity name " "network node: %s\n", nm)); return (error); @@ -116,6 +221,7 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, struct sdev_node *ddv = VTOSDEV(dvp); struct sdev_node *dv = NULL; dls_dl_handle_t ddh = NULL; + zone_t *zone; struct vattr vattr; int nmlen; int error = ENOENT; @@ -123,6 +229,9 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, if (SDEVTOV(ddv)->v_type != VDIR) return (ENOTDIR); + if (!SDEV_IS_GLOBAL(ddv) && crgetzoneid(cred) == GLOBAL_ZONEID) + return (EPERM); + /* * Empty name or ., return node itself. */ @@ -145,6 +254,12 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, rw_enter(&ddv->sdev_contents, RW_WRITER); /* + * ZOMBIED parent does not allow new node creation, bail out early. + */ + if (ddv->sdev_state == SDEV_ZOMBIE) + goto failed; + + /* * directory cache lookup: */ if ((dv = sdev_cache_lookup(ddv, nm)) != NULL) { @@ -153,13 +268,42 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, goto found; } + if (SDEV_IS_GLOBAL(ddv)) { + /* + * Check for /dev/net/zone + */ + if (strcmp("zone", nm) == 0 && strcmp("/dev/net", + ddv->sdev_path) == 0) { + (void) devnet_mkdir(ddv, nm); + dv = sdev_cache_lookup(ddv, nm); + ASSERT(dv != NULL); + goto found; + } + + /* + * Check for /dev/net/zone/%z. We can't use devnet_zpath due to + * its trailing slash. + */ + if (strcmp("/dev/net/zone", ddv->sdev_path) == 0) { + zone = zone_find_by_name(nm); + if (zone == NULL) + goto failed; + (void) devnet_mkdir(ddv, nm); + zone_rele(zone); + dv = sdev_cache_lookup(ddv, nm); + ASSERT(dv != NULL); + goto found; + } + } else if (strcmp("/dev/net", ddv->sdev_path) != 0) { + goto failed; + } + /* - * ZOMBIED parent does not allow new node creation, bail out early. + * We didn't find what we were looking for. 
What that is depends a lot + * on what directory we're in. */ - if (ddv->sdev_state == SDEV_ZOMBIE) - goto failed; - error = devnet_create_rvp(nm, &vattr, &ddh); + error = devnet_create_rvp(nm, &vattr, &ddh, devnet_nodetozone(ddv)); if (error != 0) goto failed; @@ -219,7 +363,7 @@ devnet_filldir_datalink(datalink_id_t linkid, void *arg) if ((dv = sdev_cache_lookup(ddv, (char *)link)) != NULL) goto found; - if (devnet_create_rvp(link, &vattr, &ddh) != 0) + if (devnet_create_rvp(link, &vattr, &ddh, devnet_nodetozone(arg)) != 0) return (0); ASSERT(ddh != NULL); @@ -244,16 +388,77 @@ found: return (0); } +/* + * Fill in all the entries for the current zone. + */ static void -devnet_filldir(struct sdev_node *ddv) +devnet_fillzone(struct sdev_node *ddv, zoneid_t zid) { - sdev_node_t *dv, *next; datalink_id_t linkid; + ASSERT(RW_WRITE_HELD(&ddv->sdev_contents)); + if (zid == GLOBAL_ZONEID) { + ASSERT(SDEV_IS_GLOBAL(ddv)); + linkid = DATALINK_INVALID_LINKID; + do { + linkid = dls_mgmt_get_next(linkid, DATALINK_CLASS_ALL, + DATALINK_ANY_MEDIATYPE, DLMGMT_ACTIVE); + if (linkid != DATALINK_INVALID_LINKID) + (void) devnet_filldir_datalink(linkid, ddv); + } while (linkid != DATALINK_INVALID_LINKID); + } else { + (void) zone_datalink_walk(zid, devnet_filldir_datalink, ddv); + } +} + +/* + * Callback for zone_walk when filling up /dev/net/zone/... + */ +static int +devnet_fillzdir_cb(zone_t *zonep, void *arg) +{ + sdev_node_t *ddv = arg; + + ASSERT(RW_WRITE_HELD(&ddv->sdev_contents)); + (void) devnet_mkdir(ddv, zonep->zone_name); + return (0); +} + +/* + * Fill in a directory that isn't the top level /dev/net. + */ +static void +devnet_fillzdir(struct sdev_node *ddv) +{ + zone_t *zonep; + char *path = "/dev/net/zone"; + + if (strcmp(path, ddv->sdev_path) == 0) { + (void) zone_walk(devnet_fillzdir_cb, ddv); + return; + } + + zonep = zone_find_by_name(ddv->sdev_name); + if (zonep == NULL) + return; + devnet_fillzone(ddv, zonep->zone_id); + zone_rele(zonep); +} + +static void +devnet_filldir(struct sdev_node *ddv) +{ + int ret; + sdev_node_t *dv, *next; + ASSERT(RW_READ_HELD(&ddv->sdev_contents)); if (rw_tryupgrade(&ddv->sdev_contents) == NULL) { rw_exit(&ddv->sdev_contents); rw_enter(&ddv->sdev_contents, RW_WRITER); + if (ddv->sdev_state == SDEV_ZOMBIE) { + rw_exit(&ddv->sdev_contents); + return; + } } for (dv = SDEV_FIRST_ENTRY(ddv); dv; dv = next) { @@ -276,31 +481,38 @@ devnet_filldir(struct sdev_node *ddv) if (SDEVTOV(dv)->v_count > 0) continue; + SDEV_HOLD(dv); + + /* + * Clean out everything underneath before we remove ourselves. 
+ */ + if (SDEVTOV(ddv)->v_type == VDIR) { + ret = sdev_cleandir(dv, NULL, 0); + ASSERT(ret == 0); + } /* remove the cache node */ (void) sdev_cache_update(ddv, &dv, dv->sdev_name, SDEV_CACHE_DELETE); SDEV_RELE(dv); } + if (strcmp(ddv->sdev_path, "/dev/net") != 0) { + devnet_fillzdir(ddv); + goto done; + } + if (((ddv->sdev_flags & SDEV_BUILD) == 0) && !dls_devnet_rebuild()) goto done; if (SDEV_IS_GLOBAL(ddv)) { - linkid = DATALINK_INVALID_LINKID; - do { - linkid = dls_mgmt_get_next(linkid, DATALINK_CLASS_ALL, - DATALINK_ANY_MEDIATYPE, DLMGMT_ACTIVE); - if (linkid != DATALINK_INVALID_LINKID) - (void) devnet_filldir_datalink(linkid, ddv); - } while (linkid != DATALINK_INVALID_LINKID); + devnet_fillzone(ddv, GLOBAL_ZONEID); + (void) devnet_mkdir(ddv, "zone"); } else { - (void) zone_datalink_walk(getzoneid(), - devnet_filldir_datalink, ddv); + devnet_fillzone(ddv, getzoneid()); } ddv->sdev_flags &= ~SDEV_BUILD; - done: rw_downgrade(&ddv->sdev_contents); } @@ -319,6 +531,9 @@ devnet_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, ASSERT(sdvp); + if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp)) + return (EPERM); + if (uiop->uio_offset == 0) devnet_filldir(sdvp); diff --git a/usr/src/uts/common/fs/dev/sdev_plugin.c b/usr/src/uts/common/fs/dev/sdev_plugin.c new file mode 100644 index 0000000000..885191175f --- /dev/null +++ b/usr/src/uts/common/fs/dev/sdev_plugin.c @@ -0,0 +1,913 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + +/* + * Dynamic directory plugin interface for sdev. + * + * The sdev plugin interfaces provides a means for a dynamic directory based on + * in-kernel state to be simply created. Traditionally, dynamic directories were + * built into sdev itself. While these legacy plugins are useful, it makes more + * sense for these pieces of functionality to live with the individual drivers. + * + * The plugin interface requires folks to implement three interfaces and + * provides a series of callbacks that can be made in the context of those + * interfaces to interrogate the sdev_node_t without having to leak + * implementation details of the sdev_node_t. These interfaces are: + * + * o spo_validate + * + * Given a particular node, answer the question as to whether or not this + * entry is still valid. Here, plugins should use the name and the dev_t + * associated with the node to verify that it matches something that still + * exists. + * + * o spo_filldir + * + * Fill all the entries inside of a directory. Note that some of these entries + * may already exist. + * + * o spo_inactive + * + * The given node is no longer being used. This allows the consumer to + * potentially tear down anything that was being held open related to this. + * Note that this only fires when the given sdev_node_t becomes a zombie. + * + * During these callbacks a consumer is not allowed to register or unregister a + * plugin, especially their own. They may call the sdev_ctx style functions. All + * callbacks fire in a context where blocking is allowed (eg. the spl is below + * LOCK_LEVEL). 
+ * + * When a plugin is added, we create its directory in the global zone. By doing + * that, we ensure that something isn't already there and that nothing else can + * come along and try and create something without our knowledge. We only have + * to create it in the GZ and not for all other instances of sdev because an + * instance of sdev that isn't at /dev does not have dynamic directories, and + * second, any instance of sdev present in a non-global zone cannot create + * anything, therefore we know that by it not being in the global zone's + * instance of sdev that we're good to go. + * + * Lock Ordering + * ------------- + * + * The global sdev_plugin_lock must be held before any of the individual + * sdev_plugin_t`sp_lock. Further, once any plugin related lock has been held, + * it is not legal to take any holds on any sdev_node_t or to grab the + * sdev_node_t`contents_lock in any way. + */ + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/fs/sdev_impl.h> +#include <sys/fs/sdev_plugin.h> +#include <fs/fs_subr.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> +#include <sys/sysmacros.h> +#include <sys/list.h> +#include <sys/ctype.h> + +kmutex_t sdev_plugin_lock; +list_t sdev_plugin_list; +kmem_cache_t *sdev_plugin_cache; +struct vnodeops *sdev_plugin_vnops; + +#define SDEV_PLUGIN_NAMELEN 64 + +typedef struct sdev_plugin { + list_node_t sp_link; + char sp_name[SDEV_PLUGIN_NAMELEN]; /* E */ + int sp_nflags; /* E */ + struct vnodeops *sp_vnops; /* E */ + sdev_plugin_ops_t *sp_pops; /* E */ + boolean_t sp_islegacy; /* E */ + int (*sp_lvtor)(sdev_node_t *); /* E */ + kmutex_t sp_lock; /* Protects everything below */ + kcondvar_t sp_nodecv; + size_t sp_nnodes; +} sdev_plugin_t; + +/* ARGSUSED */ +static int +sdev_plugin_cache_constructor(void *buf, void *arg, int tags) +{ + sdev_plugin_t *spp = buf; + mutex_init(&spp->sp_lock, NULL, MUTEX_DRIVER, 0); + cv_init(&spp->sp_nodecv, NULL, CV_DRIVER, NULL); + return (0); +} + +/* ARGSUSED */ +static void +sdev_plugin_cache_destructor(void *buf, void *arg) +{ + sdev_plugin_t *spp = buf; + cv_destroy(&spp->sp_nodecv); + mutex_destroy(&spp->sp_lock); +} + +enum vtype +sdev_ctx_vtype(sdev_ctx_t ctx) +{ + sdev_node_t *sdp = (sdev_node_t *)ctx; + + ASSERT(RW_LOCK_HELD(&sdp->sdev_contents)); + return (sdp->sdev_vnode->v_type); +} + +const char * +sdev_ctx_path(sdev_ctx_t ctx) +{ + sdev_node_t *sdp = (sdev_node_t *)ctx; + + ASSERT(RW_LOCK_HELD(&sdp->sdev_contents)); + return (sdp->sdev_path); +} + +const char * +sdev_ctx_name(sdev_ctx_t ctx) +{ + sdev_node_t *sdp = (sdev_node_t *)ctx; + + ASSERT(RW_LOCK_HELD(&sdp->sdev_contents)); + return (sdp->sdev_name); +} + +/* + * Currently we only support psasing through a single flag -- SDEV_IS_GLOBAL. + */ +sdev_ctx_flags_t +sdev_ctx_flags(sdev_ctx_t ctx) +{ + sdev_node_t *sdp = (sdev_node_t *)ctx; + + ASSERT(RW_LOCK_HELD(&sdp->sdev_contents)); + return (sdp->sdev_flags & SDEV_GLOBAL); +} + +/* + * Return some amount of private data specific to the vtype. In the case of a + * character or block device this is the device number. + */ +const void * +sdev_ctx_vtype_data(sdev_ctx_t ctx) +{ + sdev_node_t *sdp = (sdev_node_t *)ctx; + void *ret; + + ASSERT(RW_LOCK_HELD(&sdp->sdev_contents)); + switch (sdp->sdev_vnode->v_type) { + case VCHR: + case VBLK: + ret = (void *)(uintptr_t)(sdp->sdev_vnode->v_rdev); + break; + default: + ret = NULL; + break; + } + + return (ret); +} + +/* + * Use the same rules as zones for a name. isalphanum + '-', '_', and '.'. 
+ */ +static int +sdev_plugin_name_isvalid(const char *c, int buflen) +{ + int i; + + for (i = 0; i < buflen; i++, c++) { + if (*c == '\0') + return (1); + + if (!isalnum(*c) && *c != '-' && *c != '_' && *c != '.') + return (0); + } + /* Never found a null terminator */ + return (0); +} + +static int +sdev_plugin_mknode(sdev_plugin_t *spp, sdev_node_t *sdvp, char *name, + vattr_t *vap) +{ + int ret; + sdev_node_t *svp; + + ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents)); + ASSERT(spp != NULL); + svp = sdev_cache_lookup(sdvp, name); + if (svp != NULL) { + SDEV_SIMPLE_RELE(svp); + return (EEXIST); + } + + ret = sdev_mknode(sdvp, name, &svp, vap, NULL, NULL, kcred, + SDEV_READY); + if (ret != 0) + return (ret); + SDEV_SIMPLE_RELE(svp); + + return (0); +} + +/* + * Plugin node creation callbacks + */ +int +sdev_plugin_mkdir(sdev_ctx_t ctx, char *name) +{ + sdev_node_t *sdvp; + timestruc_t now; + struct vattr vap; + + if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0) + return (EINVAL); + + sdvp = (sdev_node_t *)ctx; + ASSERT(sdvp->sdev_private != NULL); + ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents)); + + vap = *sdev_getdefault_attr(VDIR); + gethrestime(&now); + vap.va_atime = now; + vap.va_mtime = now; + vap.va_ctime = now; + + return (sdev_plugin_mknode(sdvp->sdev_private, sdvp, name, &vap)); +} + +int +sdev_plugin_mknod(sdev_ctx_t ctx, char *name, mode_t mode, dev_t dev) +{ + sdev_node_t *sdvp; + timestruc_t now; + struct vattr vap; + + if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0) + return (EINVAL); + + sdvp = (sdev_node_t *)ctx; + ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents)); + if (mode != S_IFCHR && mode != S_IFBLK) + return (EINVAL); + + ASSERT(sdvp->sdev_private != NULL); + + vap = *sdev_getdefault_attr(mode == S_IFCHR ? VCHR : VBLK); + gethrestime(&now); + vap.va_atime = now; + vap.va_mtime = now; + vap.va_ctime = now; + vap.va_rdev = dev; + vap.va_mode = mode | 0666; + + /* Despite the similar name, this is in fact a different function */ + return (sdev_plugin_mknode(sdvp->sdev_private, sdvp, name, &vap)); + +} + +static int +sdev_plugin_validate(sdev_node_t *sdp) +{ + int ret; + sdev_plugin_t *spp; + + ASSERT(sdp->sdev_private != NULL); + spp = sdp->sdev_private; + ASSERT(spp->sp_islegacy == B_FALSE); + ASSERT(spp->sp_pops != NULL); + rw_enter(&sdp->sdev_contents, RW_READER); + ret = spp->sp_pops->spo_validate((uintptr_t)sdp); + rw_exit(&sdp->sdev_contents); + return (ret); +} + +static void +sdev_plugin_validate_dir(sdev_node_t *sdvp) +{ + int ret; + sdev_node_t *svp, *next; + + ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents)); + + for (svp = SDEV_FIRST_ENTRY(sdvp); svp != NULL; svp = next) { + + next = SDEV_NEXT_ENTRY(sdvp, svp); + ASSERT(svp->sdev_state != SDEV_ZOMBIE); + /* skip nodes that aren't ready */ + if (svp->sdev_state == SDEV_INIT) + continue; + + switch (sdev_plugin_validate(svp)) { + case SDEV_VTOR_VALID: + case SDEV_VTOR_SKIP: + continue; + case SDEV_VTOR_INVALID: + case SDEV_VTOR_STALE: + break; + } + + SDEV_HOLD(svp); + + /* + * Clean out everything underneath this node before we + * remove it. 
+ */ + if (svp->sdev_vnode->v_type == VDIR) { + ret = sdev_cleandir(svp, NULL, 0); + ASSERT(ret == 0); + } + /* remove the cache node */ + (void) sdev_cache_update(sdvp, &svp, svp->sdev_name, + SDEV_CACHE_DELETE); + SDEV_RELE(svp); + } +} + +/* ARGSUSED */ +static int +sdev_plugin_vop_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, + int *eofp, caller_context_t *ct_unused, int flags_unused) +{ + int ret; + sdev_node_t *sdvp = VTOSDEV(dvp); + sdev_plugin_t *spp; + + ASSERT(RW_READ_HELD(&sdvp->sdev_contents)); + + /* Sanity check we're not a zombie before we do anyting else */ + if (sdvp->sdev_state == SDEV_ZOMBIE) + return (ENOENT); + + spp = sdvp->sdev_private; + ASSERT(spp != NULL); + ASSERT(spp->sp_islegacy == B_FALSE); + ASSERT(spp->sp_pops != NULL); + + if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp)) + return (EPERM); + + if (uiop->uio_offset == 0) { + /* + * We upgrade to a write lock and grab the plugin's lock along + * the way. We're almost certainly going to get creation + * callbacks, so this is the only safe way to go. + */ + if (rw_tryupgrade(&sdvp->sdev_contents) == 0) { + rw_exit(&sdvp->sdev_contents); + rw_enter(&sdvp->sdev_contents, RW_WRITER); + if (sdvp->sdev_state == SDEV_ZOMBIE) { + rw_downgrade(&sdvp->sdev_contents); + return (ENOENT); + } + } + + sdev_plugin_validate_dir(sdvp); + ret = spp->sp_pops->spo_filldir((uintptr_t)sdvp); + rw_downgrade(&sdvp->sdev_contents); + if (ret != 0) + return (ret); + } + + return (devname_readdir_func(dvp, uiop, cred, eofp, 0)); +} + +/* + * If we don't have a callback function that returns a failure, then sdev will + * try to create a node for us which violates all of our basic assertions. To + * work around that we create our own callback for devname_lookup_func which + * always returns ENOENT as at this point either it was created with the filldir + * callback or it was not. + */ +/*ARGSUSED*/ +static int +sdev_plugin_vop_lookup_cb(sdev_node_t *ddv, char *nm, void **arg, cred_t *cred, + void *unused, char *unused2) +{ + return (ENOENT); +} + +/* ARGSUSED */ +static int +sdev_plugin_vop_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, + struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred, + caller_context_t *ct, int *direntflags, pathname_t *realpnp) +{ + int ret; + sdev_node_t *sdvp; + sdev_plugin_t *spp; + + /* execute access is required to search the directory */ + if ((ret = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0) + return (ret); + + sdvp = VTOSDEV(dvp); + spp = sdvp->sdev_private; + ASSERT(spp != NULL); + ASSERT(spp->sp_islegacy == B_FALSE); + ASSERT(spp->sp_pops != NULL); + + if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp)) + return (EPERM); + + /* + * Go straight for the write lock. + */ + rw_enter(&sdvp->sdev_contents, RW_WRITER); + if (sdvp->sdev_state == SDEV_ZOMBIE) { + rw_exit(&sdvp->sdev_contents); + return (ENOENT); + } + sdev_plugin_validate_dir(sdvp); + ret = spp->sp_pops->spo_filldir((uintptr_t)sdvp); + rw_exit(&sdvp->sdev_contents); + if (ret != 0) + return (ret); + + return (devname_lookup_func(sdvp, nm, vpp, cred, + sdev_plugin_vop_lookup_cb, SDEV_VATTR)); +} + +/* + * sdev is not a good citizen. We get inactive callbacks whenever a vnode goes + * to zero, but isn't necessairily a zombie yet. As such, to make things easier + * for users, we only fire the inactive callback when the node becomes a zombie + * and thus will be torn down here. 
+ */ +static void +sdev_plugin_vop_inactive_cb(struct vnode *dvp) +{ + sdev_node_t *sdp = VTOSDEV(dvp); + sdev_plugin_t *spp = sdp->sdev_private; + + rw_enter(&sdp->sdev_contents, RW_READER); + if (sdp->sdev_state != SDEV_ZOMBIE) { + rw_exit(&sdp->sdev_contents); + return; + } + spp->sp_pops->spo_inactive((uintptr_t)sdp); + mutex_enter(&spp->sp_lock); + VERIFY(spp->sp_nnodes > 0); + spp->sp_nnodes--; + cv_signal(&spp->sp_nodecv); + mutex_exit(&spp->sp_lock); + rw_exit(&sdp->sdev_contents); +} + +/*ARGSUSED*/ +static void +sdev_plugin_vop_inactive(struct vnode *dvp, struct cred *cred, + caller_context_t *ct) +{ + sdev_node_t *sdp = VTOSDEV(dvp); + sdev_plugin_t *spp = sdp->sdev_private; + ASSERT(sdp->sdev_private != NULL); + ASSERT(spp->sp_islegacy == B_FALSE); + devname_inactive_func(dvp, cred, sdev_plugin_vop_inactive_cb); +} + +const fs_operation_def_t sdev_plugin_vnodeops_tbl[] = { + VOPNAME_READDIR, { .vop_readdir = sdev_plugin_vop_readdir }, + VOPNAME_LOOKUP, { .vop_lookup = sdev_plugin_vop_lookup }, + VOPNAME_INACTIVE, { .vop_inactive = sdev_plugin_vop_inactive }, + VOPNAME_CREATE, { .error = fs_nosys }, + VOPNAME_REMOVE, { .error = fs_nosys }, + VOPNAME_MKDIR, { .error = fs_nosys }, + VOPNAME_RMDIR, { .error = fs_nosys }, + VOPNAME_SYMLINK, { .error = fs_nosys }, + VOPNAME_SETSECATTR, { .error = fs_nosys }, + NULL, NULL +}; + +/* + * construct a new template with overrides from vtab + */ +static fs_operation_def_t * +sdev_merge_vtab(const fs_operation_def_t tab[]) +{ + fs_operation_def_t *new; + const fs_operation_def_t *tab_entry; + + /* make a copy of standard vnode ops table */ + new = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP); + bcopy((void *)sdev_vnodeops_tbl, new, sdev_vnodeops_tbl_size); + + /* replace the overrides from tab */ + for (tab_entry = tab; tab_entry->name != NULL; tab_entry++) { + fs_operation_def_t *std_entry = new; + while (std_entry->name) { + if (strcmp(tab_entry->name, std_entry->name) == 0) { + std_entry->func = tab_entry->func; + break; + } + std_entry++; + } + } + + return (new); +} + +/* free memory allocated by sdev_merge_vtab */ +static void +sdev_free_vtab(fs_operation_def_t *new) +{ + kmem_free(new, sdev_vnodeops_tbl_size); +} + +/* + * Register a new plugin. + */ +sdev_plugin_hdl_t +sdev_plugin_register(const char *name, sdev_plugin_ops_t *ops, int *errp) +{ + int ret, err; + sdev_plugin_t *spp, *iter; + vnode_t *vp, *nvp; + sdev_node_t *sdp, *slp; + timestruc_t now; + struct vattr vap; + + /* + * Some consumers don't care about why they failed. To keep the code + * simple, we'll just pretend they gave us something. 
+ */ + if (errp == NULL) + errp = &err; + + if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0) { + *errp = EINVAL; + return (NULL); + } + + if (ops->spo_version != 1) { + *errp = EINVAL; + return (NULL); + } + + if (ops->spo_validate == NULL || ops->spo_filldir == NULL || + ops->spo_inactive == NULL) { + *errp = EINVAL; + return (NULL); + } + + if ((ops->spo_flags & ~SDEV_PLUGIN_FLAGS_MASK) != 0) { + *errp = EINVAL; + return (NULL); + } + + spp = kmem_cache_alloc(sdev_plugin_cache, KM_SLEEP); + (void) strlcpy(spp->sp_name, name, SDEV_PLUGIN_NAMELEN); + + spp->sp_pops = ops; + spp->sp_nflags = SDEV_DYNAMIC | SDEV_VTOR; + if (ops->spo_flags & SDEV_PLUGIN_NO_NCACHE) + spp->sp_nflags |= SDEV_NO_NCACHE; + if (ops->spo_flags & SDEV_PLUGIN_SUBDIR) + spp->sp_nflags |= SDEV_SUBDIR; + spp->sp_vnops = sdev_plugin_vnops; + spp->sp_islegacy = B_FALSE; + spp->sp_lvtor = NULL; + spp->sp_nnodes = 0; + + /* + * Make sure it's unique, nothing exists with this name already, and add + * it to the list. We also need to go through and grab the sdev + * root node as we cannot grab any sdev node locks once we've grabbed + * the sdev_plugin_lock. We effectively assert that if a directory is + * not present in the GZ's /dev, then it doesn't exist in any of the + * local zones. + */ + ret = vn_openat("/dev", UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir, -1); + if (ret != 0) { + *errp = ret; + kmem_cache_free(sdev_plugin_cache, spp); + return (NULL); + } + /* Make sure we have the real vnode */ + if (VOP_REALVP(vp, &nvp, NULL) == 0) { + VN_HOLD(nvp); + VN_RELE(vp); + vp = nvp; + nvp = NULL; + } + VERIFY(vp->v_op == sdev_vnodeops); + sdp = VTOSDEV(vp); + rw_enter(&sdp->sdev_contents, RW_WRITER); + slp = sdev_cache_lookup(sdp, spp->sp_name); + if (slp != NULL) { + SDEV_RELE(slp); + rw_exit(&sdp->sdev_contents); + VN_RELE(vp); + *errp = EEXIST; + kmem_cache_free(sdev_plugin_cache, spp); + return (NULL); + } + + mutex_enter(&sdev_plugin_lock); + for (iter = list_head(&sdev_plugin_list); iter != NULL; + iter = list_next(&sdev_plugin_list, iter)) { + if (strcmp(spp->sp_name, iter->sp_name) == 0) { + mutex_exit(&sdev_plugin_lock); + rw_exit(&sdp->sdev_contents); + VN_RELE(vp); + *errp = EEXIST; + kmem_cache_free(sdev_plugin_cache, spp); + return (NULL); + } + } + + list_insert_tail(&sdev_plugin_list, spp); + mutex_exit(&sdev_plugin_lock); + + /* + * Now go ahead and create the top level directory for the global zone. + */ + vap = *sdev_getdefault_attr(VDIR); + gethrestime(&now); + vap.va_atime = now; + vap.va_mtime = now; + vap.va_ctime = now; + + (void) sdev_plugin_mknode(spp, sdp, spp->sp_name, &vap); + + rw_exit(&sdp->sdev_contents); + VN_RELE(vp); + + return ((sdev_plugin_hdl_t)spp); +} + +static void +sdev_plugin_unregister_cb(sdev_node_t *rdp, void *arg) +{ + sdev_plugin_t *spp = arg; + sdev_node_t *sdp; + + rw_enter(&rdp->sdev_contents, RW_WRITER); + sdp = sdev_cache_lookup(rdp, spp->sp_name); + /* If it doesn't exist, we're done here */ + if (sdp == NULL) { + rw_exit(&rdp->sdev_contents); + return; + } + + /* + * We first delete the directory before recursively marking everything + * else stale. This ordering should ensure that we don't accidentally + * miss anything. + */ + sdev_cache_update(rdp, &sdp, spp->sp_name, SDEV_CACHE_DELETE); + sdev_stale(sdp); + SDEV_RELE(sdp); + rw_exit(&rdp->sdev_contents); +} + +/* + * Remove a plugin. This will block until everything has become a zombie, thus + * guaranteeing the caller that nothing will call into them again once this call + * returns. 
While the call is ongoing, it could be called into. Note that while + * this is ongoing, it will block other mounts. + */ +int +sdev_plugin_unregister(sdev_plugin_hdl_t hdl) +{ + sdev_plugin_t *spp = (sdev_plugin_t *)hdl; + if (spp->sp_islegacy) + return (EINVAL); + + mutex_enter(&sdev_plugin_lock); + list_remove(&sdev_plugin_list, spp); + mutex_exit(&sdev_plugin_lock); + + sdev_mnt_walk(sdev_plugin_unregister_cb, spp); + mutex_enter(&spp->sp_lock); + while (spp->sp_nnodes > 0) + cv_wait(&spp->sp_nodecv, &spp->sp_lock); + mutex_exit(&spp->sp_lock); + kmem_cache_free(sdev_plugin_cache, spp); + return (0); +} + +/* + * Register an old sdev style plugin to deal with what used to be in the vtab. + */ +static int +sdev_plugin_register_legacy(struct sdev_vop_table *vtp) +{ + sdev_plugin_t *spp; + + spp = kmem_cache_alloc(sdev_plugin_cache, KM_SLEEP); + (void) strlcpy(spp->sp_name, vtp->vt_name, SDEV_PLUGIN_NAMELEN); + spp->sp_islegacy = B_TRUE; + spp->sp_pops = NULL; + spp->sp_nflags = vtp->vt_flags; + spp->sp_lvtor = vtp->vt_vtor; + spp->sp_nnodes = 0; + + if (vtp->vt_service != NULL) { + fs_operation_def_t *templ; + templ = sdev_merge_vtab(vtp->vt_service); + if (vn_make_ops(vtp->vt_name, + (const fs_operation_def_t *)templ, + &spp->sp_vnops) != 0) { + cmn_err(CE_WARN, "%s: malformed vnode ops\n", + vtp->vt_name); + sdev_free_vtab(templ); + kmem_cache_free(sdev_plugin_cache, spp); + return (1); + } + + if (vtp->vt_global_vops) { + *(vtp->vt_global_vops) = spp->sp_vnops; + } + + sdev_free_vtab(templ); + } else { + spp->sp_vnops = sdev_vnodeops; + } + + /* + * No need to check for EEXIST here. These are loaded as a part of the + * sdev's initialization function. Further, we don't have to create them + * as that's taken care of in sdev's mount for the GZ. + */ + mutex_enter(&sdev_plugin_lock); + list_insert_tail(&sdev_plugin_list, spp); + mutex_exit(&sdev_plugin_lock); + + return (0); +} + +/* + * We need to match off of the sdev_path, not the sdev_name. We are only allowed + * to exist directly under /dev. 
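The rule stated here -- a plugin owns exactly its directory under /dev, and with SDEV_SUBDIR also everything beneath it -- is what sdev_match() below implements against sdev_path. A small userland approximation of that test (illustrative names; the kernel additionally takes sdev_plugin_lock and walks the registered plugin list, and its prefix compare uses vlen - 1 bytes where this sketch compares the whole name):

	#include <stdio.h>
	#include <string.h>

	/*
	 * Rough model of the sdev_match() test: given a node's full /dev
	 * path and one plugin's name, decide whether the plugin claims it.
	 * "subdir" models the SDEV_SUBDIR flag.
	 */
	static int
	plugin_claims(const char *devpath, const char *name, int subdir)
	{
		size_t vlen = strlen(name);
		const char *path;

		if (strlen(devpath) <= 5 || strncmp(devpath, "/dev/", 5) != 0)
			return (0);
		path = devpath + 5;

		if (strcmp(name, path) == 0)
			return (1);		/* the directory itself */

		if (subdir && strncmp(name, path, vlen) == 0 &&
		    path[vlen] == '/')
			return (1);		/* something beneath it */

		return (0);
	}

	int
	main(void)
	{
		printf("%d\n", plugin_claims("/dev/zvol", "zvol", 1));		/* 1 */
		printf("%d\n", plugin_claims("/dev/zvol/dsk/p", "zvol", 1));	/* 1 */
		printf("%d\n", plugin_claims("/dev/zvolx", "zvol", 1));		/* 0 */
		printf("%d\n", plugin_claims("/dev/ipnet/x0", "ipnet", 0));	/* 0 */
		return (0);
	}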
+ */ +static sdev_plugin_t * +sdev_match(sdev_node_t *dv) +{ + int vlen; + const char *path; + sdev_plugin_t *spp; + + if (strlen(dv->sdev_path) <= 5) + return (NULL); + + if (strncmp(dv->sdev_path, "/dev/", 5) != 0) + return (NULL); + path = dv->sdev_path + 5; + + mutex_enter(&sdev_plugin_lock); + + for (spp = list_head(&sdev_plugin_list); spp != NULL; + spp = list_next(&sdev_plugin_list, spp)) { + if (strcmp(spp->sp_name, path) == 0) { + mutex_exit(&sdev_plugin_lock); + return (spp); + } + + if (spp->sp_nflags & SDEV_SUBDIR) { + vlen = strlen(spp->sp_name); + if ((strncmp(spp->sp_name, path, + vlen - 1) == 0) && path[vlen] == '/') { + mutex_exit(&sdev_plugin_lock); + return (spp); + } + + } + } + + mutex_exit(&sdev_plugin_lock); + return (NULL); +} + +void +sdev_set_no_negcache(sdev_node_t *dv) +{ + char *path; + sdev_plugin_t *spp; + + ASSERT(dv->sdev_path); + path = dv->sdev_path + strlen("/dev/"); + + mutex_enter(&sdev_plugin_lock); + for (spp = list_head(&sdev_plugin_list); spp != NULL; + spp = list_next(&sdev_plugin_list, spp)) { + if (strcmp(spp->sp_name, path) == 0) { + if (spp->sp_nflags & SDEV_NO_NCACHE) + dv->sdev_flags |= SDEV_NO_NCACHE; + break; + } + } + mutex_exit(&sdev_plugin_lock); +} + +struct vnodeops * +sdev_get_vop(sdev_node_t *dv) +{ + char *path; + sdev_plugin_t *spp; + + path = dv->sdev_path; + ASSERT(path); + + /* gets the relative path to /dev/ */ + path += 5; + + if ((spp = sdev_match(dv)) != NULL) { + dv->sdev_flags |= spp->sp_nflags; + if (SDEV_IS_PERSIST(dv->sdev_dotdot) && + (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv))) + dv->sdev_flags |= SDEV_PERSIST; + return (spp->sp_vnops); + } + + /* child inherits the persistence of the parent */ + if (SDEV_IS_PERSIST(dv->sdev_dotdot)) + dv->sdev_flags |= SDEV_PERSIST; + return (sdev_vnodeops); +} + +void * +sdev_get_vtor(sdev_node_t *dv) +{ + sdev_plugin_t *spp; + + if (dv->sdev_private == NULL) { + spp = sdev_match(dv); + if (spp == NULL) + return (NULL); + } else { + spp = dv->sdev_private; + } + + if (spp->sp_islegacy) + return ((void *)spp->sp_lvtor); + else + return ((void *)sdev_plugin_validate); +} + +void +sdev_plugin_nodeready(sdev_node_t *sdp) +{ + sdev_plugin_t *spp; + + ASSERT(RW_WRITE_HELD(&sdp->sdev_contents)); + ASSERT(sdp->sdev_private == NULL); + + spp = sdev_match(sdp); + if (spp == NULL) + return; + if (spp->sp_islegacy) + return; + sdp->sdev_private = spp; + mutex_enter(&spp->sp_lock); + spp->sp_nnodes++; + mutex_exit(&spp->sp_lock); +} + +int +sdev_plugin_init(void) +{ + sdev_vop_table_t *vtp; + fs_operation_def_t *templ; + + sdev_plugin_cache = kmem_cache_create("sdev_plugin", + sizeof (sdev_plugin_t), 0, sdev_plugin_cache_constructor, + sdev_plugin_cache_destructor, NULL, NULL, NULL, 0); + if (sdev_plugin_cache == NULL) + return (1); + mutex_init(&sdev_plugin_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&sdev_plugin_list, sizeof (sdev_plugin_t), + offsetof(sdev_plugin_t, sp_link)); + + /* + * Register all of the legacy vnops + */ + for (vtp = &vtab[0]; vtp->vt_name != NULL; vtp++) + if (sdev_plugin_register_legacy(vtp) != 0) + return (1); + + templ = sdev_merge_vtab(sdev_plugin_vnodeops_tbl); + if (vn_make_ops("sdev_plugin", + (const fs_operation_def_t *)templ, + &sdev_plugin_vnops) != 0) { + sdev_free_vtab(templ); + return (1); + } + + sdev_free_vtab(templ); + return (0); +} diff --git a/usr/src/uts/common/fs/dev/sdev_subr.c b/usr/src/uts/common/fs/dev/sdev_subr.c index b4b27e6285..16439a66a6 100644 --- a/usr/src/uts/common/fs/dev/sdev_subr.c +++ b/usr/src/uts/common/fs/dev/sdev_subr.c @@ 
-20,7 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ /* @@ -150,12 +150,6 @@ vattr_t sdev_vattr_chr = { kmem_cache_t *sdev_node_cache; /* sdev_node cache */ int devtype; /* fstype */ -/* static */ -static struct vnodeops *sdev_get_vop(struct sdev_node *); -static void sdev_set_no_negcache(struct sdev_node *); -static fs_operation_def_t *sdev_merge_vtab(const fs_operation_def_t []); -static void sdev_free_vtab(fs_operation_def_t *); - static void sdev_prof_free(struct sdev_node *dv) { @@ -318,6 +312,7 @@ sdev_nodeinit(struct sdev_node *ddv, char *nm, struct sdev_node **newdv, (void) snprintf(dv->sdev_path, len, "%s/%s", ddv->sdev_path, nm); /* overwritten for VLNK nodes */ dv->sdev_symlink = NULL; + list_link_init(&dv->sdev_plist); vp = SDEVTOV(dv); vn_reinit(vp); @@ -406,6 +401,7 @@ sdev_nodeready(struct sdev_node *dv, struct vattr *vap, struct vnode *avp, } else { dv->sdev_nlink = 1; } + sdev_plugin_nodeready(dv); if (!(SDEV_IS_GLOBAL(dv))) { dv->sdev_origin = (struct sdev_node *)args; @@ -502,37 +498,22 @@ sdev_mkroot(struct vfs *vfsp, dev_t devdev, struct vnode *mvp, return (dv); } -/* directory dependent vop table */ -struct sdev_vop_table { - char *vt_name; /* subdirectory name */ - const fs_operation_def_t *vt_service; /* vnodeops table */ - struct vnodeops *vt_vops; /* constructed vop */ - struct vnodeops **vt_global_vops; /* global container for vop */ - int (*vt_vtor)(struct sdev_node *); /* validate sdev_node */ - int vt_flags; -}; - -/* - * A nice improvement would be to provide a plug-in mechanism - * for this table instead of a const table. - */ -static struct sdev_vop_table vtab[] = -{ - { "pts", devpts_vnodeops_tbl, NULL, &devpts_vnodeops, devpts_validate, +struct sdev_vop_table vtab[] = { + { "pts", devpts_vnodeops_tbl, &devpts_vnodeops, devpts_validate, SDEV_DYNAMIC | SDEV_VTOR }, - { "vt", devvt_vnodeops_tbl, NULL, &devvt_vnodeops, devvt_validate, + { "vt", devvt_vnodeops_tbl, &devvt_vnodeops, devvt_validate, SDEV_DYNAMIC | SDEV_VTOR }, - { "zvol", devzvol_vnodeops_tbl, NULL, &devzvol_vnodeops, + { "zvol", devzvol_vnodeops_tbl, &devzvol_vnodeops, devzvol_validate, SDEV_ZONED | SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR }, - { "zcons", NULL, NULL, NULL, NULL, SDEV_NO_NCACHE }, + { "zcons", NULL, NULL, NULL, SDEV_NO_NCACHE }, - { "net", devnet_vnodeops_tbl, NULL, &devnet_vnodeops, devnet_validate, - SDEV_DYNAMIC | SDEV_VTOR }, + { "net", devnet_vnodeops_tbl, &devnet_vnodeops, devnet_validate, + SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR }, - { "ipnet", devipnet_vnodeops_tbl, NULL, &devipnet_vnodeops, + { "ipnet", devipnet_vnodeops_tbl, &devipnet_vnodeops, devipnet_validate, SDEV_DYNAMIC | SDEV_VTOR | SDEV_NO_NCACHE }, /* @@ -547,132 +528,14 @@ static struct sdev_vop_table vtab[] = * preventing a mkdir. */ - { "lofi", NULL, NULL, NULL, NULL, + { "lofi", NULL, NULL, NULL, SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST }, - { "rlofi", NULL, NULL, NULL, NULL, + { "rlofi", NULL, NULL, NULL, SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST }, - { NULL, NULL, NULL, NULL, NULL, 0} + { NULL, NULL, NULL, NULL, 0} }; -/* - * We need to match off of the sdev_path, not the sdev_name. We are only allowed - * to exist directly under /dev. 
- */ -struct sdev_vop_table * -sdev_match(struct sdev_node *dv) -{ - int vlen; - int i; - const char *path; - - if (strlen(dv->sdev_path) <= 5) - return (NULL); - - if (strncmp(dv->sdev_path, "/dev/", 5) != 0) - return (NULL); - path = dv->sdev_path + 5; - - for (i = 0; vtab[i].vt_name; i++) { - if (strcmp(vtab[i].vt_name, path) == 0) - return (&vtab[i]); - if (vtab[i].vt_flags & SDEV_SUBDIR) { - vlen = strlen(vtab[i].vt_name); - if ((strncmp(vtab[i].vt_name, path, - vlen - 1) == 0) && path[vlen] == '/') - return (&vtab[i]); - } - - } - return (NULL); -} - -/* - * sets a directory's vnodeops if the directory is in the vtab; - */ -static struct vnodeops * -sdev_get_vop(struct sdev_node *dv) -{ - struct sdev_vop_table *vtp; - char *path; - - path = dv->sdev_path; - ASSERT(path); - - /* gets the relative path to /dev/ */ - path += 5; - - /* gets the vtab entry it matches */ - if ((vtp = sdev_match(dv)) != NULL) { - dv->sdev_flags |= vtp->vt_flags; - if (SDEV_IS_PERSIST(dv->sdev_dotdot) && - (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv))) - dv->sdev_flags |= SDEV_PERSIST; - - if (vtp->vt_vops) { - if (vtp->vt_global_vops) - *(vtp->vt_global_vops) = vtp->vt_vops; - - return (vtp->vt_vops); - } - - if (vtp->vt_service) { - fs_operation_def_t *templ; - templ = sdev_merge_vtab(vtp->vt_service); - if (vn_make_ops(vtp->vt_name, - (const fs_operation_def_t *)templ, - &vtp->vt_vops) != 0) { - cmn_err(CE_PANIC, "%s: malformed vnode ops\n", - vtp->vt_name); - /*NOTREACHED*/ - } - if (vtp->vt_global_vops) { - *(vtp->vt_global_vops) = vtp->vt_vops; - } - sdev_free_vtab(templ); - - return (vtp->vt_vops); - } - - return (sdev_vnodeops); - } - - /* child inherits the persistence of the parent */ - if (SDEV_IS_PERSIST(dv->sdev_dotdot)) - dv->sdev_flags |= SDEV_PERSIST; - - return (sdev_vnodeops); -} - -static void -sdev_set_no_negcache(struct sdev_node *dv) -{ - int i; - char *path; - - ASSERT(dv->sdev_path); - path = dv->sdev_path + strlen("/dev/"); - - for (i = 0; vtab[i].vt_name; i++) { - if (strcmp(vtab[i].vt_name, path) == 0) { - if (vtab[i].vt_flags & SDEV_NO_NCACHE) - dv->sdev_flags |= SDEV_NO_NCACHE; - break; - } - } -} - -void * -sdev_get_vtor(struct sdev_node *dv) -{ - struct sdev_vop_table *vtp; - - vtp = sdev_match(dv); - if (vtp) - return ((void *)vtp->vt_vtor); - else - return (NULL); -} /* * Build the base root inode @@ -952,8 +815,11 @@ sdev_nodedestroy(struct sdev_node *dv, uint_t flags) dv->sdev_path = NULL; } - if (!SDEV_IS_GLOBAL(dv)) + if (!SDEV_IS_GLOBAL(dv)) { sdev_prof_free(dv); + if (dv->sdev_vnode->v_type != VLNK && dv->sdev_origin != NULL) + SDEV_RELE(dv->sdev_origin); + } if (SDEVTOV(dv)->v_type == VDIR) { ASSERT(SDEV_FIRST_ENTRY(dv) == NULL); @@ -967,6 +833,7 @@ sdev_nodedestroy(struct sdev_node *dv, uint_t flags) (void) memset((void *)&dv->sdev_instance_data, 0, sizeof (dv->sdev_instance_data)); vn_invalid(SDEVTOV(dv)); + dv->sdev_private = NULL; kmem_cache_free(sdev_node_cache, dv); } @@ -2948,46 +2815,6 @@ sdev_modctl_devexists(const char *path) return (error); } -extern int sdev_vnodeops_tbl_size; - -/* - * construct a new template with overrides from vtab - */ -static fs_operation_def_t * -sdev_merge_vtab(const fs_operation_def_t tab[]) -{ - fs_operation_def_t *new; - const fs_operation_def_t *tab_entry; - - /* make a copy of standard vnode ops table */ - new = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP); - bcopy((void *)sdev_vnodeops_tbl, new, sdev_vnodeops_tbl_size); - - /* replace the overrides from tab */ - for (tab_entry = tab; tab_entry->name != NULL; tab_entry++) { - 
fs_operation_def_t *std_entry = new; - while (std_entry->name) { - if (strcmp(tab_entry->name, std_entry->name) == 0) { - std_entry->func = tab_entry->func; - break; - } - std_entry++; - } - if (std_entry->name == NULL) - cmn_err(CE_NOTE, "sdev_merge_vtab: entry %s unused.", - tab_entry->name); - } - - return (new); -} - -/* free memory allocated by sdev_merge_vtab */ -static void -sdev_free_vtab(fs_operation_def_t *new) -{ - kmem_free(new, sdev_vnodeops_tbl_size); -} - /* * a generic setattr() function * diff --git a/usr/src/uts/common/fs/dev/sdev_vfsops.c b/usr/src/uts/common/fs/dev/sdev_vfsops.c index ea9cb6374a..6f32f47635 100644 --- a/usr/src/uts/common/fs/dev/sdev_vfsops.c +++ b/usr/src/uts/common/fs/dev/sdev_vfsops.c @@ -169,7 +169,13 @@ devinit(int fstype, char *name) if ((devmajor = getudev()) == (major_t)-1) { cmn_err(CE_WARN, "%s: can't get unique dev", sdev_vfssw.name); - return (1); + return (ENXIO); + } + + if (sdev_plugin_init() != 0) { + cmn_err(CE_WARN, "%s: failed to set init plugin subsystem", + sdev_vfssw.name); + return (EIO); } /* initialize negative cache */ @@ -332,6 +338,7 @@ sdev_mount(struct vfs *vfsp, struct vnode *mvp, struct mounta *uap, ASSERT(sdev_origins); dv->sdev_flags &= ~SDEV_GLOBAL; dv->sdev_origin = sdev_origins->sdev_root; + SDEV_HOLD(dv->sdev_origin); } else { sdev_ncache_setup(); rw_enter(&dv->sdev_contents, RW_WRITER); @@ -504,3 +511,17 @@ sdev_mntinfo_rele(struct sdev_data *mntinfo) SDEVTOV(mntinfo->sdev_root)->v_count--; mutex_exit(&sdev_lock); } + +void +sdev_mnt_walk(void (*func)(struct sdev_node *, void *), void *arg) +{ + struct sdev_data *mntinfo; + + mutex_enter(&sdev_lock); + mntinfo = sdev_mntinfo; + while (mntinfo != NULL) { + func(mntinfo->sdev_root, arg); + mntinfo = mntinfo->sdev_next; + } + mutex_exit(&sdev_lock); +} diff --git a/usr/src/uts/common/fs/dev/sdev_zvolops.c b/usr/src/uts/common/fs/dev/sdev_zvolops.c index e4c3acf787..495cf450de 100644 --- a/usr/src/uts/common/fs/dev/sdev_zvolops.c +++ b/usr/src/uts/common/fs/dev/sdev_zvolops.c @@ -21,7 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2013 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. All rights reserved. */ /* vnode ops for the /dev/zvol directory */ @@ -370,8 +370,10 @@ devzvol_create_pool_dirs(struct vnode *dvp) ASSERT(dvp->v_count > 0); rc = VOP_LOOKUP(dvp, nvpair_name(elem), &vp, NULL, 0, NULL, kcred, NULL, 0, NULL); - /* should either work, or not be visible from a zone */ - ASSERT(rc == 0 || rc == ENOENT); + /* + * should either work or we should get an error if this should + * not be visible from the zone, or disallowed in the zone + */ if (rc == 0) VN_RELE(vp); pools++; diff --git a/usr/src/uts/common/fs/dnlc.c b/usr/src/uts/common/fs/dnlc.c index 25327d2852..c949117da6 100644 --- a/usr/src/uts/common/fs/dnlc.c +++ b/usr/src/uts/common/fs/dnlc.c @@ -921,50 +921,6 @@ dnlc_fs_purge1(vnodeops_t *vop) } /* - * Perform a reverse lookup in the DNLC. This will find the first occurrence of - * the vnode. If successful, it will return the vnode of the parent, and the - * name of the entry in the given buffer. If it cannot be found, or the buffer - * is too small, then it will return NULL. Note that this is a highly - * inefficient function, since the DNLC is constructed solely for forward - * lookups. 
- */ -vnode_t * -dnlc_reverse_lookup(vnode_t *vp, char *buf, size_t buflen) -{ - nc_hash_t *nch; - ncache_t *ncp; - vnode_t *pvp; - - if (!doingcache) - return (NULL); - - for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) { - mutex_enter(&nch->hash_lock); - ncp = nch->hash_next; - while (ncp != (ncache_t *)nch) { - /* - * We ignore '..' entries since it can create - * confusion and infinite loops. - */ - if (ncp->vp == vp && !(ncp->namlen == 2 && - 0 == bcmp(ncp->name, "..", 2)) && - ncp->namlen < buflen) { - bcopy(ncp->name, buf, ncp->namlen); - buf[ncp->namlen] = '\0'; - pvp = ncp->dp; - /* VN_HOLD 2 of 2 in this file */ - VN_HOLD_CALLER(pvp); - mutex_exit(&nch->hash_lock); - return (pvp); - } - ncp = ncp->hash_next; - } - mutex_exit(&nch->hash_lock); - } - - return (NULL); -} -/* * Utility routine to search for a cache entry. Return the * ncache entry if found, NULL otherwise. */ diff --git a/usr/src/uts/common/fs/fem.c b/usr/src/uts/common/fs/fem.c index b4e28cc860..5f524def30 100644 --- a/usr/src/uts/common/fs/fem.c +++ b/usr/src/uts/common/fs/fem.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2015, Joyent, Inc. All rights reserved. + */ + #include <sys/types.h> #include <sys/atomic.h> #include <sys/kmem.h> @@ -33,11 +37,12 @@ #include <sys/systm.h> #include <sys/cmn_err.h> #include <sys/debug.h> - #include <sys/fem.h> #include <sys/vfs.h> #include <sys/vnode.h> #include <sys/vfs_opreg.h> +#include <sys/stack.h> +#include <sys/archsystm.h> #define NNODES_DEFAULT 8 /* Default number of nodes in a fem_list */ /* @@ -291,6 +296,536 @@ _op_find(femarg_t *ap, void **fp, int offs0, int offs1) } #endif +/* + * File event monitoring handoffs + * + * File event monitoring relies on being able to inject stack frames between + * vnode consumers and the underlying file systems. This becomes problematic + * when there exist many monitors, as kernel stack depth is finite. The model + * very much encodes this injected frame: the flow of control deliberately + * lies with the monitor, not with the monitoring system. While we could + * conceivably address this by allowing each subsystem to install at most + * one monitor per vnode (and impose on subsystems that they handle any + * of their own consumer multiplexing internally), this in fact exports a + * substantial amount of run-time complexity to deal with an uncommon case + * (and, it must be said, assumes a small number of consuming subsystems). + * To allow our abstraction to remain clean, we instead check our remaining + * stack in every vnext_*() call; if the amount of stack remaining is lower + * than a threshold (fem_stack_needed), we call thread_splitstack() to carry + * on the execution of the monitors and the underlying vnode operation on a + * split stack. Because we can only pass a single argument to our split stack + * function, we must marshal our arguments, the mechanics of which are somewhat + * ornate in terms of the code: to marshal in a type-safe manner, we define a + * baton that is a union of payload structures for each kind of operation, + * loading the per-operation payload explicitly and calling into common handoff + * code that itself calls thread_splitstack(). The function passed to + * thread_splitstack() is a per-entry point function that continues monitor + * processing given the specified (marshalled) arguments. 
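Stripped of the macros, the baton idea is just: marshal the arguments into a per-operation struct, pass a single pointer to a continuation, and read the return value back out of the struct. A toy userland rendition of that shape (the kernel hands the baton to thread_splitstack() so the continuation runs on a fresh stack; here the continuation is simply invoked directly):

	#include <stdio.h>

	/* Toy model of the fem baton: one payload struct per operation. */
	typedef struct {
		struct {
			const char *path;
			int	mode;
		} fb_open;
		int	(*fb_func)(const char *, int);
		void	(*fb_handoff)(void *);
		int	fb_rval;
	} baton_t;

	/* The FEM_VNEXTn_DECL-style continuation: unpack the baton and call. */
	static void
	handoff_open(void *arg)
	{
		baton_t *bp = arg;

		bp->fb_rval = bp->fb_func(bp->fb_open.path, bp->fb_open.mode);
	}

	static int
	real_open(const char *path, int mode)
	{
		printf("open(%s, %d)\n", path, mode);
		return (0);
	}

	/*
	 * The FEM_VNEXTn-style call site marshals the arguments and runs the
	 * continuation through one indirect call; the kernel does this only
	 * when FEM_TOODEEP() fires, and via thread_splitstack().
	 */
	int
	main(void)
	{
		baton_t b = { .fb_open = { "/tmp/x", 2 }, .fb_func = real_open,
		    .fb_handoff = handoff_open };

		b.fb_handoff(&b);
		return (b.fb_rval);
	}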
While this method + * is a little verbose to implement, it has the advantage of being relatively + * robust (that is, broadly type-safe) while imposing minimal burden on each + * vnext_*() entry point. + * + * In terms of the implementation: + * + * - The FEM_BATON_n macros define the per-entry point baton structures + * - The fem_baton_payload_t contains the union of these structures + * - The FEM_VNEXTn_DECL macros declare the post-handoff entry point + * - The FEM_VNEXTn macros constitute the per-handoff entry point + * + * Note that we don't use variadic macros -- we define a variant of these + * macros for each of our relevant argument counts. This may seem overly + * explicit, but it is deliberate: the object here is to minimize the + * future maintenance burden by minimizing the likelihood of introduced + * error -- not to minimize the number of characters in this source file. + */ + +#ifndef STACK_GROWTH_DOWN +#error Downward stack growth assumed. +#endif + +int fem_stack_toodeep; +uintptr_t fem_stack_needed = 8 * 1024; +size_t fem_handoff_stacksize = 128 * 1024; + +#define FEM_TOODEEP() (STACK_BIAS + (uintptr_t)getfp() - \ + (uintptr_t)curthread->t_stkbase < fem_stack_needed) + +#define FEM_BATON_1(what, t0, l0) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + } fb_##what + +#define FEM_BATON_2(what, t0, l0, t1, l1) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + } fb_##what + +#define FEM_BATON_3(what, t0, l0, t1, l1, t2, l2) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + } fb_##what + +#define FEM_BATON_4(what, t0, l0, t1, l1, t2, l2, t3, l3) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + } fb_##what + +#define FEM_BATON_5(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + } fb_##what + +#define FEM_BATON_6(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + t5 fb_##what##_##l5; \ + } fb_##what + +#define FEM_BATON_8(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5, \ + t6, l6, t7, l7) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + t5 fb_##what##_##l5; \ + t6 fb_##what##_##l6; \ + t7 fb_##what##_##l7; \ + } fb_##what + +#define FEM_BATON_9(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5, \ + t6, l6, t7, l7, t8, l8) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + t5 fb_##what##_##l5; \ + t6 fb_##what##_##l6; \ + t7 fb_##what##_##l7; \ + t8 fb_##what##_##l8; \ + } fb_##what + +typedef union { + FEM_BATON_2(open, int, mode, cred_t *, cr); + FEM_BATON_4(close, int, flag, 
int, count, + offset_t, offset, cred_t *, cr); + FEM_BATON_3(read, uio_t *, uiop, int, ioflag, cred_t *, cr); + FEM_BATON_3(write, uio_t *, uiop, int, ioflag, cred_t *, cr); + FEM_BATON_5(ioctl, int, cmd, intptr_t, arg, + int, flag, cred_t *, cr, int *, rvalp); + FEM_BATON_3(setfl, int, oflags, int, nflags, cred_t *, cr); + FEM_BATON_3(getattr, vattr_t *, vap, int, flags, cred_t *, cr); + FEM_BATON_3(setattr, vattr_t *, vap, int, flags, cred_t *, cr); + FEM_BATON_3(access, int, mode, int, flags, cred_t *, cr); + FEM_BATON_8(lookup, char *, nm, vnode_t **, vpp, + pathname_t *, pnp, int, flags, vnode_t *, rdir, + cred_t *, cr, int *, direntflags, pathname_t *, realpnp); + FEM_BATON_8(create, char *, name, vattr_t *, vap, + vcexcl_t, excl, int, mode, vnode_t **, vpp, + cred_t *, cr, int, flag, vsecattr_t *, vsecp); + FEM_BATON_3(remove, char *, nm, cred_t *, cr, int, flags); + FEM_BATON_4(link, vnode_t *, svp, char *, tnm, + cred_t *, cr, int, flags); + FEM_BATON_5(rename, char *, snm, vnode_t *, tdvp, + char *, tnm, cred_t *, cr, int, flags); + FEM_BATON_6(mkdir, char *, dirname, vattr_t *, vap, + vnode_t **, vpp, cred_t *, cr, int, flags, + vsecattr_t *, vsecp); + FEM_BATON_4(rmdir, char *, nm, vnode_t *, cdir, + cred_t *, cr, int, flags); + FEM_BATON_4(readdir, uio_t *, uiop, cred_t *, cr, + int *, eofp, int, flags); + FEM_BATON_5(symlink, char *, linkname, vattr_t *, vap, + char *, target, cred_t *, cr, int, flags); + FEM_BATON_2(readlink, uio_t *, uiop, cred_t *, cr); + FEM_BATON_2(fsync, int, syncflag, cred_t *, cr); + FEM_BATON_1(inactive, cred_t *, cr); + FEM_BATON_1(fid, fid_t *, fidp); + FEM_BATON_1(rwlock, int, write_lock); + FEM_BATON_1(rwunlock, int, write_lock); + FEM_BATON_2(seek, offset_t, ooff, offset_t *, noffp); + FEM_BATON_1(cmp, vnode_t *, vp2); + FEM_BATON_6(frlock, int, cmd, struct flock64 *, bfp, + int, flag, offset_t, offset, struct flk_callback *, flk_cbp, + cred_t *, cr); + FEM_BATON_5(space, int, cmd, struct flock64 *, bfp, + int, flag, offset_t, offset, cred_t *, cr); + FEM_BATON_1(realvp, vnode_t **, vpp); + FEM_BATON_9(getpage, offset_t, off, size_t, len, + uint_t *, protp, struct page **, plarr, size_t, plsz, + struct seg *, seg, caddr_t, addr, enum seg_rw, rw, + cred_t *, cr); + FEM_BATON_4(putpage, offset_t, off, size_t, len, + int, flags, cred_t *, cr); + FEM_BATON_8(map, offset_t, off, struct as *, as, + caddr_t *, addrp, size_t, len, uchar_t, prot, + uchar_t, maxprot, uint_t, flags, cred_t *, cr); + FEM_BATON_8(addmap, offset_t, off, struct as *, as, + caddr_t, addr, size_t, len, uchar_t, prot, + uchar_t, maxprot, uint_t, flags, cred_t *, cr); + FEM_BATON_8(delmap, offset_t, off, struct as *, as, + caddr_t, addr, size_t, len, uint_t, prot, + uint_t, maxprot, uint_t, flags, cred_t *, cr); + FEM_BATON_4(poll, short, events, int, anyyet, + short *, reventsp, struct pollhead **, phpp); + FEM_BATON_3(dump, caddr_t, addr, offset_t, lbdn, offset_t, dblks); + FEM_BATON_3(pathconf, int, cmd, ulong_t *, valp, cred_t *, cr); + FEM_BATON_5(pageio, struct page *, pp, u_offset_t, io_off, + size_t, io_len, int, flags, cred_t *, cr); + FEM_BATON_2(dumpctl, int, action, offset_t *, blkp); + FEM_BATON_4(dispose, struct page *, pp, int, flag, + int, dn, cred_t *, cr); + FEM_BATON_3(setsecattr, vsecattr_t *, vsap, int, flag, cred_t *, cr); + FEM_BATON_3(getsecattr, vsecattr_t *, vsap, int, flag, cred_t *, cr); + FEM_BATON_4(shrlock, int, cmd, struct shrlock *, shr, + int, flag, cred_t *, cr); + FEM_BATON_3(vnevent, vnevent_t, vnevent, vnode_t *, dvp, char *, cname); + 
FEM_BATON_3(reqzcbuf, enum uio_rw, ioflag, + xuio_t *, xuiop, cred_t *, cr); + FEM_BATON_2(retzcbuf, xuio_t *, xuiop, cred_t *, cr); +} fem_baton_payload_t; + +typedef struct { + fem_baton_payload_t fb_payload; + int (*fb_func)(); + void (*fb_handoff)(); + int fb_rval; +} fem_baton_t; + +static int +fem_handoff(fem_baton_t *bp) +{ + fem_stack_toodeep++; + thread_splitstack(bp->fb_handoff, bp, fem_handoff_stacksize); + + return (bp->fb_rval); +} + +#define FEM_VNEXT3_DECL(what, a0, a1, a2) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2); \ +} + +#define FEM_VNEXT4_DECL(what, a0, a1, a2, a3) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3); \ +} + +#define FEM_VNEXT5_DECL(what, a0, a1, a2, a3, a4) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4); \ +} + +#define FEM_VNEXT6_DECL(what, a0, a1, a2, a3, a4, a5) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5); \ +} + +#define FEM_VNEXT7_DECL(what, a0, a1, a2, a3, a4, a5, a6) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6); \ +} + +#define FEM_VNEXT8_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6, \ + bp->fb_payload.fb_##what.fb_##what##_##a7); \ +} + +#define FEM_VNEXT10_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6, \ + bp->fb_payload.fb_##what.fb_##what##_##a7, \ + bp->fb_payload.fb_##what.fb_##what##_##a8, \ + bp->fb_payload.fb_##what.fb_##what##_##a9); \ +} + +#define FEM_VNEXT11_DECL(what, a0, a1, a2, a3, a4, a5, 
a6, a7, a8, a9, a10) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6, \ + bp->fb_payload.fb_##what.fb_##what##_##a7, \ + bp->fb_payload.fb_##what.fb_##what##_##a8, \ + bp->fb_payload.fb_##what.fb_##what##_##a9, \ + bp->fb_payload.fb_##what.fb_##what##_##a10); \ +} + +#define FEM_VNEXT3(what, func, a0, a1, a2) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2)) + +#define FEM_VNEXT4(what, func, a0, a1, a2, a3) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3)) + +#define FEM_VNEXT5(what, func, a0, a1, a2, a3, a4) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4)) + +#define FEM_VNEXT6(what, func, a0, a1, a2, a3, a4, a5) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5)) + +#define FEM_VNEXT7(what, func, a0, a1, a2, a3, a4, a5, a6) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + 
baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6)) + +#define FEM_VNEXT8(what, func, a0, a1, a2, a3, a4, a5, a6, a7) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6, a7)) + +#define FEM_VNEXT10(what, func, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \ + baton->fb_payload.fb_##what.fb_##what##_##a8 = a8; \ + baton->fb_payload.fb_##what.fb_##what##_##a9 = a9; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9)) + +#define FEM_VNEXT11(what, func, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \ + baton->fb_payload.fb_##what.fb_##what##_##a8 = a8; \ + baton->fb_payload.fb_##what.fb_##what##_##a9 = a9; \ + baton->fb_payload.fb_##what.fb_##what##_##a10 = a10; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10)) + static fem_t * fem_alloc() { @@ -2036,10 +2571,60 @@ static struct fs_operation_def fshead_vfs_spec[] = { * 5. Return by invoking the base operation with the base object. 
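To make the macro machinery concrete: the FEM_VNEXT4(open, func, arg0, mode, cr, ct) call that vnext_open() below ends with expands to approximately the following (a paraphrase, not literal preprocessor output):

	if (FEM_TOODEEP()) {
		fem_baton_t *baton;
		int rval;

		baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP);
		baton->fb_payload.fb_open.fb_open_arg0 = arg0;
		baton->fb_payload.fb_open.fb_open_mode = mode;
		baton->fb_payload.fb_open.fb_open_cr = cr;
		baton->fb_payload.fb_open.fb_open_ct = ct;
		baton->fb_handoff = fem_handoff_open;
		baton->fb_func = func;

		rval = fem_handoff(baton);	/* runs on a split stack */
		kmem_free(baton, sizeof (fem_baton_t));

		return (rval);
	}
	return (func(arg0, mode, cr, ct));

The common case pays only for the stack-depth test; the allocation and the extra indirection happen solely when the remaining stack is already below fem_stack_needed.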
* * for each classification, there needs to be at least one "next" operation - * for each "head"operation. - * + * for each "head" operation. Note that we also use the FEM_VNEXTn_DECL macros + * to define the function to run when the stack is split; see the discussion + * on "File event monitoring handoffs", above. */ +FEM_VNEXT4_DECL(open, arg0, mode, cr, ct) +FEM_VNEXT6_DECL(close, arg0, flag, count, offset, cr, ct) +FEM_VNEXT5_DECL(read, arg0, uiop, ioflag, cr, ct) +FEM_VNEXT5_DECL(write, arg0, uiop, ioflag, cr, ct) +FEM_VNEXT7_DECL(ioctl, arg0, cmd, arg, flag, cr, rvalp, ct) +FEM_VNEXT5_DECL(setfl, arg0, oflags, nflags, cr, ct) +FEM_VNEXT5_DECL(getattr, arg0, vap, flags, cr, ct) +FEM_VNEXT5_DECL(setattr, arg0, vap, flags, cr, ct) +FEM_VNEXT5_DECL(access, arg0, mode, flags, cr, ct) +FEM_VNEXT10_DECL(lookup, arg0, nm, vpp, pnp, flags, rdir, + cr, ct, direntflags, realpnp) +FEM_VNEXT10_DECL(create, arg0, name, vap, excl, mode, vpp, cr, flag, ct, vsecp) +FEM_VNEXT5_DECL(remove, arg0, nm, cr, ct, flags) +FEM_VNEXT6_DECL(link, arg0, svp, tnm, cr, ct, flags) +FEM_VNEXT7_DECL(rename, arg0, snm, tdvp, tnm, cr, ct, flags) +FEM_VNEXT8_DECL(mkdir, arg0, dirname, vap, vpp, cr, ct, flags, vsecp) +FEM_VNEXT6_DECL(rmdir, arg0, nm, cdir, cr, ct, flags) +FEM_VNEXT6_DECL(readdir, arg0, uiop, cr, eofp, ct, flags) +FEM_VNEXT7_DECL(symlink, arg0, linkname, vap, target, cr, ct, flags) +FEM_VNEXT4_DECL(readlink, arg0, uiop, cr, ct) +FEM_VNEXT4_DECL(fsync, arg0, syncflag, cr, ct) +FEM_VNEXT3_DECL(fid, arg0, fidp, ct) +FEM_VNEXT3_DECL(rwlock, arg0, write_lock, ct) +FEM_VNEXT4_DECL(seek, arg0, ooff, noffp, ct) +FEM_VNEXT3_DECL(cmp, arg0, vp2, ct) +FEM_VNEXT8_DECL(frlock, arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct) +FEM_VNEXT7_DECL(space, arg0, cmd, bfp, flag, offset, cr, ct) +FEM_VNEXT3_DECL(realvp, arg0, vpp, ct) +FEM_VNEXT11_DECL(getpage, arg0, off, len, protp, plarr, plsz, + seg, addr, rw, cr, ct) +FEM_VNEXT6_DECL(putpage, arg0, off, len, flags, cr, ct) +FEM_VNEXT10_DECL(map, arg0, off, as, addrp, len, prot, maxprot, + flags, cr, ct) +FEM_VNEXT10_DECL(addmap, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct) +FEM_VNEXT10_DECL(delmap, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct) +FEM_VNEXT6_DECL(poll, arg0, events, anyyet, reventsp, phpp, ct) +FEM_VNEXT5_DECL(dump, arg0, addr, lbdn, dblks, ct) +FEM_VNEXT5_DECL(pathconf, arg0, cmd, valp, cr, ct) +FEM_VNEXT7_DECL(pageio, arg0, pp, io_off, io_len, flags, cr, ct) +FEM_VNEXT4_DECL(dumpctl, arg0, action, blkp, ct) +FEM_VNEXT5_DECL(setsecattr, arg0, vsap, flag, cr, ct) +FEM_VNEXT5_DECL(getsecattr, arg0, vsap, flag, cr, ct) +FEM_VNEXT6_DECL(shrlock, arg0, cmd, shr, flag, cr, ct) +FEM_VNEXT5_DECL(vnevent, arg0, vnevent, dvp, cname, ct) +FEM_VNEXT5_DECL(reqzcbuf, arg0, ioflag, xuiop, cr, ct) +FEM_VNEXT4_DECL(retzcbuf, arg0, xuiop, cr, ct) + int vnext_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct) { @@ -2051,7 +2636,7 @@ vnext_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_open, femop_open); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, mode, cr, ct)); + FEM_VNEXT4(open, func, arg0, mode, cr, ct); } int @@ -2066,7 +2651,7 @@ vnext_close(femarg_t *vf, int flag, int count, offset_t offset, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_close, femop_close); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, flag, count, offset, cr, ct)); + FEM_VNEXT6(close, func, arg0, flag, count, offset, cr, ct); } int @@ -2081,7 +2666,7 @@ 
vnext_read(femarg_t *vf, uio_t *uiop, int ioflag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_read, femop_read); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, ioflag, cr, ct)); + FEM_VNEXT5(read, func, arg0, uiop, ioflag, cr, ct); } int @@ -2096,7 +2681,7 @@ vnext_write(femarg_t *vf, uio_t *uiop, int ioflag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_write, femop_write); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, ioflag, cr, ct)); + FEM_VNEXT5(write, func, arg0, uiop, ioflag, cr, ct); } int @@ -2111,7 +2696,7 @@ vnext_ioctl(femarg_t *vf, int cmd, intptr_t arg, int flag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_ioctl, femop_ioctl); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, arg, flag, cr, rvalp, ct)); + FEM_VNEXT7(ioctl, func, arg0, cmd, arg, flag, cr, rvalp, ct); } int @@ -2126,7 +2711,7 @@ vnext_setfl(femarg_t *vf, int oflags, int nflags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_setfl, femop_setfl); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, oflags, nflags, cr, ct)); + FEM_VNEXT5(setfl, func, arg0, oflags, nflags, cr, ct); } int @@ -2141,7 +2726,7 @@ vnext_getattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_getattr, femop_getattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vap, flags, cr, ct)); + FEM_VNEXT5(getattr, func, arg0, vap, flags, cr, ct); } int @@ -2156,7 +2741,7 @@ vnext_setattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_setattr, femop_setattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vap, flags, cr, ct)); + FEM_VNEXT5(setattr, func, arg0, vap, flags, cr, ct); } int @@ -2171,7 +2756,7 @@ vnext_access(femarg_t *vf, int mode, int flags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_access, femop_access); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, mode, flags, cr, ct)); + FEM_VNEXT5(access, func, arg0, mode, flags, cr, ct); } int @@ -2187,8 +2772,8 @@ vnext_lookup(femarg_t *vf, char *nm, vnode_t **vpp, pathname_t *pnp, vsop_find(vf, &func, int, &arg0, vop_lookup, femop_lookup); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, nm, vpp, pnp, flags, rdir, cr, ct, - direntflags, realpnp)); + FEM_VNEXT10(lookup, func, arg0, nm, vpp, pnp, flags, rdir, cr, ct, + direntflags, realpnp); } int @@ -2204,7 +2789,8 @@ vnext_create(femarg_t *vf, char *name, vattr_t *vap, vcexcl_t excl, vsop_find(vf, &func, int, &arg0, vop_create, femop_create); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, name, vap, excl, mode, vpp, cr, flag, ct, vsecp)); + FEM_VNEXT10(create, func, arg0, name, vap, excl, + mode, vpp, cr, flag, ct, vsecp); } int @@ -2219,7 +2805,7 @@ vnext_remove(femarg_t *vf, char *nm, cred_t *cr, caller_context_t *ct, vsop_find(vf, &func, int, &arg0, vop_remove, femop_remove); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, nm, cr, ct, flags)); + FEM_VNEXT5(remove, func, arg0, nm, cr, ct, flags); } int @@ -2234,7 +2820,7 @@ vnext_link(femarg_t *vf, vnode_t *svp, char *tnm, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_link, femop_link); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, svp, tnm, cr, ct, flags)); + FEM_VNEXT6(link, func, arg0, svp, tnm, cr, ct, flags); } int @@ -2249,7 +2835,7 @@ vnext_rename(femarg_t *vf, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, vsop_find(vf, &func, int, &arg0, 
vop_rename, femop_rename); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, snm, tdvp, tnm, cr, ct, flags)); + FEM_VNEXT7(rename, func, arg0, snm, tdvp, tnm, cr, ct, flags); } int @@ -2264,7 +2850,7 @@ vnext_mkdir(femarg_t *vf, char *dirname, vattr_t *vap, vnode_t **vpp, vsop_find(vf, &func, int, &arg0, vop_mkdir, femop_mkdir); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, dirname, vap, vpp, cr, ct, flags, vsecp)); + FEM_VNEXT8(mkdir, func, arg0, dirname, vap, vpp, cr, ct, flags, vsecp); } int @@ -2279,7 +2865,7 @@ vnext_rmdir(femarg_t *vf, char *nm, vnode_t *cdir, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_rmdir, femop_rmdir); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, nm, cdir, cr, ct, flags)); + FEM_VNEXT6(rmdir, func, arg0, nm, cdir, cr, ct, flags); } int @@ -2294,7 +2880,7 @@ vnext_readdir(femarg_t *vf, uio_t *uiop, cred_t *cr, int *eofp, vsop_find(vf, &func, int, &arg0, vop_readdir, femop_readdir); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, cr, eofp, ct, flags)); + FEM_VNEXT6(readdir, func, arg0, uiop, cr, eofp, ct, flags); } int @@ -2309,7 +2895,7 @@ vnext_symlink(femarg_t *vf, char *linkname, vattr_t *vap, char *target, vsop_find(vf, &func, int, &arg0, vop_symlink, femop_symlink); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, linkname, vap, target, cr, ct, flags)); + FEM_VNEXT7(symlink, func, arg0, linkname, vap, target, cr, ct, flags); } int @@ -2323,7 +2909,7 @@ vnext_readlink(femarg_t *vf, uio_t *uiop, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_readlink, femop_readlink); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, cr, ct)); + FEM_VNEXT4(readlink, func, arg0, uiop, cr, ct); } int @@ -2337,7 +2923,7 @@ vnext_fsync(femarg_t *vf, int syncflag, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_fsync, femop_fsync); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, syncflag, cr, ct)); + FEM_VNEXT4(fsync, func, arg0, syncflag, cr, ct); } void @@ -2365,7 +2951,7 @@ vnext_fid(femarg_t *vf, fid_t *fidp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_fid, femop_fid); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, fidp, ct)); + FEM_VNEXT3(fid, func, arg0, fidp, ct); } int @@ -2379,7 +2965,7 @@ vnext_rwlock(femarg_t *vf, int write_lock, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_rwlock, femop_rwlock); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, write_lock, ct)); + FEM_VNEXT3(rwlock, func, arg0, write_lock, ct); } void @@ -2407,7 +2993,7 @@ vnext_seek(femarg_t *vf, offset_t ooff, offset_t *noffp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_seek, femop_seek); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, ooff, noffp, ct)); + FEM_VNEXT4(seek, func, arg0, ooff, noffp, ct); } int @@ -2421,7 +3007,7 @@ vnext_cmp(femarg_t *vf, vnode_t *vp2, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_cmp, femop_cmp); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vp2, ct)); + FEM_VNEXT3(cmp, func, arg0, vp2, ct); } int @@ -2437,7 +3023,7 @@ vnext_frlock(femarg_t *vf, int cmd, struct flock64 *bfp, int flag, vsop_find(vf, &func, int, &arg0, vop_frlock, femop_frlock); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct)); + FEM_VNEXT8(frlock, func, arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct); } int 
@@ -2452,7 +3038,7 @@ vnext_space(femarg_t *vf, int cmd, struct flock64 *bfp, int flag, vsop_find(vf, &func, int, &arg0, vop_space, femop_space); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, bfp, flag, offset, cr, ct)); + FEM_VNEXT7(space, func, arg0, cmd, bfp, flag, offset, cr, ct); } int @@ -2466,7 +3052,7 @@ vnext_realvp(femarg_t *vf, vnode_t **vpp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_realvp, femop_realvp); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vpp, ct)); + FEM_VNEXT3(realvp, func, arg0, vpp, ct); } int @@ -2482,8 +3068,8 @@ vnext_getpage(femarg_t *vf, offset_t off, size_t len, uint_t *protp, vsop_find(vf, &func, int, &arg0, vop_getpage, femop_getpage); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, len, protp, plarr, plsz, seg, addr, rw, - cr, ct)); + FEM_VNEXT11(getpage, func, arg0, off, len, protp, + plarr, plsz, seg, addr, rw, cr, ct); } int @@ -2498,7 +3084,7 @@ vnext_putpage(femarg_t *vf, offset_t off, size_t len, int flags, vsop_find(vf, &func, int, &arg0, vop_putpage, femop_putpage); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, len, flags, cr, ct)); + FEM_VNEXT6(putpage, func, arg0, off, len, flags, cr, ct); } int @@ -2514,8 +3100,8 @@ vnext_map(femarg_t *vf, offset_t off, struct as *as, caddr_t *addrp, vsop_find(vf, &func, int, &arg0, vop_map, femop_map); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, as, addrp, len, prot, maxprot, flags, - cr, ct)); + FEM_VNEXT10(map, func, arg0, off, as, addrp, len, prot, maxprot, flags, + cr, ct); } int @@ -2531,8 +3117,8 @@ vnext_addmap(femarg_t *vf, offset_t off, struct as *as, caddr_t addr, vsop_find(vf, &func, int, &arg0, vop_addmap, femop_addmap); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, as, addr, len, prot, maxprot, flags, - cr, ct)); + FEM_VNEXT10(addmap, func, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct); } int @@ -2548,8 +3134,8 @@ vnext_delmap(femarg_t *vf, offset_t off, struct as *as, caddr_t addr, vsop_find(vf, &func, int, &arg0, vop_delmap, femop_delmap); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, as, addr, len, prot, maxprot, flags, - cr, ct)); + FEM_VNEXT10(delmap, func, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct); } int @@ -2564,7 +3150,7 @@ vnext_poll(femarg_t *vf, short events, int anyyet, short *reventsp, vsop_find(vf, &func, int, &arg0, vop_poll, femop_poll); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, events, anyyet, reventsp, phpp, ct)); + FEM_VNEXT6(poll, func, arg0, events, anyyet, reventsp, phpp, ct); } int @@ -2579,7 +3165,7 @@ vnext_dump(femarg_t *vf, caddr_t addr, offset_t lbdn, offset_t dblks, vsop_find(vf, &func, int, &arg0, vop_dump, femop_dump); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, addr, lbdn, dblks, ct)); + FEM_VNEXT5(dump, func, arg0, addr, lbdn, dblks, ct); } int @@ -2594,7 +3180,7 @@ vnext_pathconf(femarg_t *vf, int cmd, ulong_t *valp, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_pathconf, femop_pathconf); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, valp, cr, ct)); + FEM_VNEXT5(pathconf, func, arg0, cmd, valp, cr, ct); } int @@ -2609,7 +3195,7 @@ vnext_pageio(femarg_t *vf, struct page *pp, u_offset_t io_off, vsop_find(vf, &func, int, &arg0, vop_pageio, femop_pageio); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, pp, io_off, io_len, flags, cr, ct)); + 
FEM_VNEXT7(pageio, func, arg0, pp, io_off, io_len, flags, cr, ct); } int @@ -2623,7 +3209,7 @@ vnext_dumpctl(femarg_t *vf, int action, offset_t *blkp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_dumpctl, femop_dumpctl); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, action, blkp, ct)); + FEM_VNEXT4(dumpctl, func, arg0, action, blkp, ct); } void @@ -2653,7 +3239,7 @@ vnext_setsecattr(femarg_t *vf, vsecattr_t *vsap, int flag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_setsecattr, femop_setsecattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vsap, flag, cr, ct)); + FEM_VNEXT5(setsecattr, func, arg0, vsap, flag, cr, ct); } int @@ -2668,7 +3254,7 @@ vnext_getsecattr(femarg_t *vf, vsecattr_t *vsap, int flag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_getsecattr, femop_getsecattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vsap, flag, cr, ct)); + FEM_VNEXT5(getsecattr, func, arg0, vsap, flag, cr, ct); } int @@ -2683,7 +3269,7 @@ vnext_shrlock(femarg_t *vf, int cmd, struct shrlock *shr, int flag, vsop_find(vf, &func, int, &arg0, vop_shrlock, femop_shrlock); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, shr, flag, cr, ct)); + FEM_VNEXT6(shrlock, func, arg0, cmd, shr, flag, cr, ct); } int @@ -2698,7 +3284,7 @@ vnext_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *cname, vsop_find(vf, &func, int, &arg0, vop_vnevent, femop_vnevent); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vnevent, dvp, cname, ct)); + FEM_VNEXT5(vnevent, func, arg0, vnevent, dvp, cname, ct); } int @@ -2713,7 +3299,7 @@ vnext_reqzcbuf(femarg_t *vf, enum uio_rw ioflag, xuio_t *xuiop, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_reqzcbuf, femop_reqzcbuf); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, ioflag, xuiop, cr, ct)); + FEM_VNEXT5(reqzcbuf, func, arg0, ioflag, xuiop, cr, ct); } int @@ -2727,7 +3313,7 @@ vnext_retzcbuf(femarg_t *vf, xuio_t *xuiop, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_retzcbuf, femop_retzcbuf); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, xuiop, cr, ct)); + FEM_VNEXT4(retzcbuf, func, arg0, xuiop, cr, ct); } int diff --git a/usr/src/uts/common/fs/fifofs/fifosubr.c b/usr/src/uts/common/fs/fifofs/fifosubr.c index 6e56000ffe..56204c6741 100644 --- a/usr/src/uts/common/fs/fifofs/fifosubr.c +++ b/usr/src/uts/common/fs/fifofs/fifosubr.c @@ -614,9 +614,12 @@ fifo_stropen(vnode_t **vpp, int flag, cred_t *crp, int dotwist, int lockheld) /* * The other end of the pipe is almost closed so * reject any other open on this end of the pipe - * This only happens with a pipe mounted under namefs + * This normally only happens with a pipe mounted under namefs, but + * we can also see an open via proc/fd, which should still succeed. + * To indicate the proc/fd case the FKLYR flag is passed. */ - if ((fnp->fn_flag & (FIFOCLOSE|ISPIPE)) == (FIFOCLOSE|ISPIPE)) { + if ((fnp->fn_flag & (FIFOCLOSE|ISPIPE)) == (FIFOCLOSE|ISPIPE) && + (flag & FKLYR) == 0) { fifo_cleanup(oldvp, flag); cv_broadcast(&fnp->fn_wait_cv); if (!lockheld) diff --git a/usr/src/uts/common/fs/fifofs/fifovnops.c b/usr/src/uts/common/fs/fifofs/fifovnops.c index ac89e430c7..fee2924093 100644 --- a/usr/src/uts/common/fs/fifofs/fifovnops.c +++ b/usr/src/uts/common/fs/fifofs/fifovnops.c @@ -27,7 +27,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2014, Joyent, Inc. 
All rights reserved. + */ /* * FIFOFS file system vnode operations. This file system @@ -1832,17 +1834,16 @@ fifo_poll(vnode_t *vp, short events, int anyyet, short *reventsp, } /* - * if we happened to get something, return + * if we happened to get something and we're not edge-triggered, return */ - - if ((*reventsp = (short)retevents) != 0) { + if ((*reventsp = (short)retevents) != 0 && !(events & POLLET)) { mutex_exit(&fnp->fn_lock->flk_lock); return (0); } /* - * If poll() has not found any events yet, set up event cell - * to wake up the poll if a requested event occurs on this + * If poll() has not found any events yet or we're edge-triggered, set + * up event cell to wake up the poll if a requested event occurs on this * pipe/fifo. */ if (!anyyet) { diff --git a/usr/src/uts/common/fs/fs_subr.c b/usr/src/uts/common/fs/fs_subr.c index 744d269716..ec5b145a86 100644 --- a/usr/src/uts/common/fs/fs_subr.c +++ b/usr/src/uts/common/fs/fs_subr.c @@ -25,6 +25,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ /* @@ -246,6 +247,7 @@ fs_frlock(register vnode_t *vp, int cmd, struct flock64 *bfp, int flag, int frcmd; int nlmid; int error = 0; + boolean_t skip_lock = B_FALSE; flk_callback_t serialize_callback; int serialize = 0; v_mode_t mode; @@ -265,6 +267,17 @@ fs_frlock(register vnode_t *vp, int cmd, struct flock64 *bfp, int flag, } break; + case F_OFD_GETLK: + /* + * TBD we do not support remote OFD locks at this time. + */ + if (flag & (F_REMOTELOCK | F_PXFSLOCK)) { + error = EINVAL; + goto done; + } + skip_lock = B_TRUE; + break; + case F_SETLK_NBMAND: /* * Are NBMAND locks allowed on this file? @@ -326,6 +339,20 @@ fs_frlock(register vnode_t *vp, int cmd, struct flock64 *bfp, int flag, } break; + case F_OFD_SETLK: + case F_OFD_SETLKW: + case F_FLOCK: + case F_FLOCKW: + /* + * TBD we do not support remote OFD locks at this time. + */ + if (flag & (F_REMOTELOCK | F_PXFSLOCK)) { + error = EINVAL; + goto done; + } + skip_lock = B_TRUE; + break; + case F_HASREMOTELOCKS: nlmid = GETNLMID(bfp->l_sysid); if (nlmid != 0) { /* booted as a cluster */ @@ -354,7 +381,8 @@ fs_frlock(register vnode_t *vp, int cmd, struct flock64 *bfp, int flag, flk_cbp = &serialize_callback; } - error = reclock(vp, bfp, frcmd, flag, offset, flk_cbp); + if (!skip_lock) + error = reclock(vp, bfp, frcmd, flag, offset, flk_cbp); done: if (serialize) diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c new file mode 100644 index 0000000000..05ee2c6e09 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c @@ -0,0 +1,640 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
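Stepping back to the fs_frlock() hunk above: the new F_OFD_* and F_FLOCK* commands carry locks owned by the open file description rather than by the process, and the TBD note restricts them to local files for now. From a program's point of view this is the ordinary fcntl() interface; a minimal userland sketch, on a system whose <fcntl.h> exposes F_OFD_SETLK:

	#define	_GNU_SOURCE	/* some systems hide F_OFD_* behind this */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		struct flock fl = { 0 };
		int fd = open("/tmp/lockfile", O_RDWR | O_CREAT, 0644);

		if (fd < 0)
			return (1);

		fl.l_type = F_WRLCK;	/* write lock ... */
		fl.l_whence = SEEK_SET;
		fl.l_start = 0;
		fl.l_len = 0;		/* ... over the whole file */
		fl.l_pid = 0;		/* l_pid is not used for OFD locks */

		/* Lock is tied to this open file description, not the pid. */
		if (fcntl(fd, F_OFD_SETLK, &fl) != 0)
			perror("F_OFD_SETLK");

		/* Closing the last descriptor for the OFD drops the lock. */
		(void) close(fd);
		return (0);
	}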
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2012, Joyent, Inc. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/stat.h> +#include <sys/policy.h> +#include <sys/fs/hyprlofs_info.h> + +static int hldir_make_hlnode(hlnode_t *, hlfsmount_t *, vattr_t *, enum de_op, + vnode_t *, hlnode_t **, cred_t *); +static int hldiraddentry(hlnode_t *, hlnode_t *, char *); + + +#define HL_HASH_SIZE 8192 /* must be power of 2 */ +#define HL_MUTEX_SIZE 64 + +static hldirent_t *hl_hashtable[HL_HASH_SIZE]; +static kmutex_t hl_hashmutex[HL_MUTEX_SIZE]; + +#define HL_HASH_INDEX(a) ((a) & (HL_HASH_SIZE-1)) +#define HL_MUTEX_INDEX(a) ((a) & (HL_MUTEX_SIZE-1)) + +#define HYPRLOFS_HASH(tp, name, hash) \ + { \ + char Xc, *Xcp; \ + hash = (uint_t)(uintptr_t)(tp) >> 8; \ + for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++) \ + hash = (hash << 4) + hash + (uint_t)Xc; \ + } + +void +hyprlofs_hash_init(void) +{ + int ix; + + for (ix = 0; ix < HL_MUTEX_SIZE; ix++) + mutex_init(&hl_hashmutex[ix], NULL, MUTEX_DEFAULT, NULL); +} + +static void +hyprlofs_hash_in(hldirent_t *h) +{ + uint_t hash; + hldirent_t **prevpp; + kmutex_t *hmtx; + + HYPRLOFS_HASH(h->hld_parent, h->hld_name, hash); + h->hld_hash = hash; + prevpp = &hl_hashtable[HL_HASH_INDEX(hash)]; + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + h->hld_link = *prevpp; + *prevpp = h; + mutex_exit(hmtx); +} + +/* Remove hldirent *h from the hash list. */ +static void +hyprlofs_hash_out(hldirent_t *h) +{ + uint_t hash; + hldirent_t **prevpp; + kmutex_t *hmtx; + + hash = h->hld_hash; + prevpp = &hl_hashtable[HL_HASH_INDEX(hash)]; + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + while (*prevpp != h) + prevpp = &(*prevpp)->hld_link; + *prevpp = h->hld_link; + mutex_exit(hmtx); +} + +static hldirent_t * +hyprlofs_hash_lookup(char *name, hlnode_t *parent, uint_t hold, + hlnode_t **found) +{ + hldirent_t *l; + uint_t hash; + kmutex_t *hmtx; + hlnode_t *hnp; + + HYPRLOFS_HASH(parent, name, hash); + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + l = hl_hashtable[HL_HASH_INDEX(hash)]; + while (l) { + if (l->hld_hash == hash && l->hld_parent == parent && + strcmp(l->hld_name, name) == 0) { + /* + * Ensure that the hlnode that we put a hold on is the + * same one that we pass back. Thus the temp. var + * hnp is necessary. + */ + hnp = l->hld_hlnode; + if (hold) { + ASSERT(hnp); + hlnode_hold(hnp); + } + if (found) + *found = hnp; + mutex_exit(hmtx); + return (l); + } else { + l = l->hld_link; + } + } + mutex_exit(hmtx); + return (NULL); +} + +/* + * Search directory 'parent' for entry 'name'. + * + * The calling thread can't hold the write version of the rwlock for the + * directory being searched + * + * On success *foundtp points to the found hlnode with its vnode held. 
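+ *
+ * A minimal sketch of the expected caller pattern (the name passed in is
+ * purely illustrative); the hold taken on a successful lookup is dropped
+ * with hlnode_rele() once the caller is finished with the node:
+ *
+ *	hlnode_t *hp;
+ *
+ *	if (hyprlofs_dirlookup(parent, "somefile", &hp, cr) == 0) {
+ *		vnode_t *vp = HLNTOV(hp);
+ *		...operate on vp...
+ *		hlnode_rele(hp);
+ *	}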
+ */ +int +hyprlofs_dirlookup(hlnode_t *parent, char *name, hlnode_t **foundtp, cred_t *cr) +{ + int error; + + *foundtp = NULL; + if (parent->hln_type != VDIR) + return (ENOTDIR); + + if ((error = hyprlofs_taccess(parent, VEXEC, cr))) + return (error); + + if (*name == '\0') { + hlnode_hold(parent); + *foundtp = parent; + return (0); + } + + /* + * Search the directory for the matching name. We need the lock + * protecting the hln_dir list so that it doesn't change out from + * underneath us. hyprlofs_hash_lookup() will pass back the hlnode + * with a hold on it. + */ + if (hyprlofs_hash_lookup(name, parent, 1, foundtp) != NULL) { + ASSERT(*foundtp); + return (0); + } + + return (ENOENT); +} + +/* + * Enter a directory entry (either a file or subdir, depending on op) for + * 'name' and 'hp' into directory 'dir' + */ +int +hyprlofs_direnter( + hlfsmount_t *hm, + hlnode_t *dir, /* target directory to make entry in */ + char *name, /* name of entry */ + enum de_op op, /* entry operation */ + vnode_t *realvp, /* real vnode */ + vattr_t *va, + hlnode_t **hpp, /* return hlnode */ + cred_t *cr) +{ + hldirent_t *hdp; + hlnode_t *found = NULL; + hlnode_t *hp; + int error = 0; + char *s; + + /* hln_rwlock is held to serialize direnter and dirdeletes */ + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + /* Don't allow '/' characters in pathname component */ + for (s = name; *s; s++) + if (*s == '/') + return (EACCES); + + if (name[0] == '\0') + panic("hyprlofs_direnter: NULL name"); + + /* + * This might be a "dangling detached directory". It could have been + * removed, but a reference to it kept in u_cwd. Don't bother searching + * it, and with any luck the user will get tired of dealing with us and + * cd to some absolute pathway. This is in ufs, too. + */ + if (dir->hln_nlink == 0) { + return (ENOENT); + } + + /* Search for the entry. Return "found" if it exists. */ + hdp = hyprlofs_hash_lookup(name, dir, 1, &found); + + if (hdp) { + ASSERT(found); + switch (op) { + case DE_CREATE: + case DE_MKDIR: + if (hpp) { + *hpp = found; + error = EEXIST; + } else { + hlnode_rele(found); + } + break; + } + } else { + + /* + * The entry does not exist. Check write perms in dir to see if + * entry can be created. + */ + if ((error = hyprlofs_taccess(dir, VWRITE, cr))) + return (error); + + /* Make new hlnode and directory entry as required. */ + if ((error = hldir_make_hlnode(dir, hm, va, op, realvp, &hp, + cr))) + return (error); + + if ((error = hldiraddentry(dir, hp, name))) { + /* Unmake the inode we just made. */ + rw_enter(&hp->hln_rwlock, RW_WRITER); + if ((hp->hln_type) == VDIR) { + ASSERT(hdp == NULL); + /* cleanup allocs made by hyprlofs_dirinit() */ + hyprlofs_dirtrunc(hp); + } + mutex_enter(&hp->hln_tlock); + hp->hln_nlink = 0; + mutex_exit(&hp->hln_tlock); + gethrestime(&hp->hln_ctime); + rw_exit(&hp->hln_rwlock); + hlnode_rele(hp); + hp = NULL; + } else if (hpp) { + *hpp = hp; + } else { + hlnode_rele(hp); + } + } + + return (error); +} + +/* + * Delete entry hp of name "nm" from dir. Free dir entry space and decrement + * link count on hlnode(s). + */ +int +hyprlofs_dirdelete(hlnode_t *dir, hlnode_t *hp, char *nm, enum dr_op op, + cred_t *cr) +{ + hldirent_t *hpdp; + int error; + size_t namelen; + hlnode_t *hnp; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(RW_WRITE_HELD(&hp->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + if (nm[0] == '\0') + panic("hyprlofs_dirdelete: NULL name for %p", (void *)hp); + + /* return error if removing . 
or .. */ + if (nm[0] == '.') { + if (nm[1] == '\0') + return (EINVAL); + if (nm[1] == '.' && nm[2] == '\0') + return (EEXIST); /* thus in ufs */ + } + + if ((error = hyprlofs_taccess(dir, VEXEC|VWRITE, cr)) != 0) + return (error); + + if (dir->hln_dir == NULL) + return (ENOENT); + + hpdp = hyprlofs_hash_lookup(nm, dir, 0, &hnp); + if (hpdp == NULL) { + /* + * If it is gone, some other thread got here first! + * Return error ENOENT. + */ + return (ENOENT); + } + + /* + * If the hlnode in the hldirent changed (shouldn't happen since we + * don't support rename) then original is gone, so return that status + * (same as UFS). + */ + if (hp != hnp) + return (ENOENT); + + hyprlofs_hash_out(hpdp); + + /* Take hpdp out of the directory list. */ + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + if (hpdp->hld_prev) { + hpdp->hld_prev->hld_next = hpdp->hld_next; + } + if (hpdp->hld_next) { + hpdp->hld_next->hld_prev = hpdp->hld_prev; + } + + /* + * If the roving slot pointer happens to match hpdp, point it at the + * previous dirent. + */ + if (dir->hln_dir->hld_prev == hpdp) { + dir->hln_dir->hld_prev = hpdp->hld_prev; + } + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + /* hpdp points to the correct directory entry */ + namelen = strlen(hpdp->hld_name) + 1; + + kmem_free(hpdp, sizeof (hldirent_t) + namelen); + dir->hln_size -= (sizeof (hldirent_t) + namelen); + dir->hln_dirents--; + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + hp->hln_ctime = now; + + ASSERT(hp->hln_nlink > 0); + DECR_COUNT(&hp->hln_nlink, &hp->hln_tlock); + if (op == DR_RMDIR && hp->hln_type == VDIR) { + hyprlofs_dirtrunc(hp); + ASSERT(hp->hln_nlink == 0); + } + return (0); +} + +/* + * hyprlofs_dirinit initializes a dir with '.' and '..' entries without + * checking perms and locking + */ +void +hyprlofs_dirinit( + hlnode_t *parent, /* parent of directory to initialize */ + hlnode_t *dir) /* the new directory */ +{ + hldirent_t *dot, *dotdot; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&parent->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + dot = kmem_zalloc(sizeof (hldirent_t) + 2, KM_SLEEP); + dotdot = kmem_zalloc(sizeof (hldirent_t) + 3, KM_SLEEP); + + /* Initialize the entries */ + dot->hld_hlnode = dir; + dot->hld_offset = 0; + dot->hld_name = (char *)dot + sizeof (hldirent_t); + dot->hld_name[0] = '.'; + dot->hld_parent = dir; + hyprlofs_hash_in(dot); + + dotdot->hld_hlnode = parent; + dotdot->hld_offset = 1; + dotdot->hld_name = (char *)dotdot + sizeof (hldirent_t); + dotdot->hld_name[0] = '.'; + dotdot->hld_name[1] = '.'; + dotdot->hld_parent = dir; + hyprlofs_hash_in(dotdot); + + /* Initialize directory entry list. */ + dot->hld_next = dotdot; + dot->hld_prev = dotdot; + dotdot->hld_next = NULL; + dotdot->hld_prev = dot; + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + /* + * Since hyprlofs_dirinit is called with both dir and parent being the + * same for the root vnode, we need to increment this before we set + * hln_nlink = 2 below. + */ + INCR_COUNT(&parent->hln_nlink, &parent->hln_tlock); + parent->hln_ctime = now; + + dir->hln_dir = dot; + dir->hln_size = 2 * sizeof (hldirent_t) + 5; /* dot and dotdot */ + dir->hln_dirents = 2; + dir->hln_nlink = 2; +} + + +/* + * hyprlofs_dirtrunc removes all dir entries under this dir. 
+ */ +void +hyprlofs_dirtrunc(hlnode_t *dir) +{ + hldirent_t *hdp; + hlnode_t *tp; + size_t namelen; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + if (dir->hln_looped) + return; + + for (hdp = dir->hln_dir; hdp; hdp = dir->hln_dir) { + ASSERT(hdp->hld_next != hdp); + ASSERT(hdp->hld_prev != hdp); + ASSERT(hdp->hld_hlnode); + + dir->hln_dir = hdp->hld_next; + namelen = strlen(hdp->hld_name) + 1; + + /* + * Adjust the link counts to account for this dir entry removal. + */ + tp = hdp->hld_hlnode; + + ASSERT(tp->hln_nlink > 0); + DECR_COUNT(&tp->hln_nlink, &tp->hln_tlock); + + hyprlofs_hash_out(hdp); + + kmem_free(hdp, sizeof (hldirent_t) + namelen); + dir->hln_size -= (sizeof (hldirent_t) + namelen); + dir->hln_dirents--; + } + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + ASSERT(dir->hln_dir == NULL); + ASSERT(dir->hln_size == 0); + ASSERT(dir->hln_dirents == 0); +} + +static int +hldiraddentry( + hlnode_t *dir, /* target directory to make entry in */ + hlnode_t *hp, /* new hlnode */ + char *name) +{ + hldirent_t *hdp, *hpdp; + size_t namelen, alloc_size; + timestruc_t now; + + /* + * Make sure the parent dir wasn't removed from underneath the caller. + */ + if (dir->hln_dir == NULL) + return (ENOENT); + + /* Check that everything is on the same FS. */ + if (hp->hln_vnode->v_vfsp != dir->hln_vnode->v_vfsp) + return (EXDEV); + + /* Alloc and init dir entry */ + namelen = strlen(name) + 1; + alloc_size = namelen + sizeof (hldirent_t); + hdp = kmem_zalloc(alloc_size, KM_NORMALPRI | KM_NOSLEEP); + if (hdp == NULL) + return (ENOSPC); + + dir->hln_size += alloc_size; + dir->hln_dirents++; + hdp->hld_hlnode = hp; + hdp->hld_parent = dir; + + /* The dir entry and its name were allocated sequentially. */ + hdp->hld_name = (char *)hdp + sizeof (hldirent_t); + (void) strcpy(hdp->hld_name, name); + + hyprlofs_hash_in(hdp); + + /* + * Some utilities expect the size of a directory to remain fairly + * static. For example, a routine which unlinks files between calls to + * readdir(); the size of the dir changes from underneath it and so the + * real dir offset in bytes is invalid. To circumvent this problem, we + * initialize a dir entry with a phony offset, and use this offset to + * determine end of file in hyprlofs_readdir. + */ + hpdp = dir->hln_dir->hld_prev; + /* + * Install at first empty "slot" in directory list. + */ + while (hpdp->hld_next != NULL && (hpdp->hld_next->hld_offset - + hpdp->hld_offset) <= 1) { + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + ASSERT(hpdp->hld_next->hld_offset > hpdp->hld_offset); + hpdp = hpdp->hld_next; + } + hdp->hld_offset = hpdp->hld_offset + 1; + + /* + * If we're at the end of the dirent list and the offset (which is + * necessarily the largest offset in this dir) is more than twice the + * number of dirents, that means the dir is 50% holes. At this point + * we reset the slot pointer back to the beginning of the dir so we + * start using the holes. The idea is that if there are N dirents, + * there must also be N holes, so we can satisfy the next N creates by + * walking at most 2N entries; thus the average cost of a create is + * constant. Note that we use the first dirent's hld_prev as the roving + * slot pointer. This saves a word in every dirent. 
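+ *
+ * A worked example with hypothetical offsets: if the list holds "." at 0,
+ * ".." at 1 and files at offsets 2 and 7, and the roving pointer sits at
+ * "..", the scan advances to the entry at offset 2 and stops because the
+ * next entry is more than one slot away, so the new dirent is installed at
+ * offset 3.  Only when the scan runs off the tail and the last offset
+ * exceeds twice the dirent count (say offset 11 with only 4 entries, since
+ * 11 > 2 * 4) is the roving pointer reset toward the head so that later
+ * creates reuse the accumulated holes.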
+ */ + if (hpdp->hld_next == NULL && hpdp->hld_offset > 2 * dir->hln_dirents) + dir->hln_dir->hld_prev = dir->hln_dir->hld_next; + else + dir->hln_dir->hld_prev = hdp; + + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + hdp->hld_next = hpdp->hld_next; + if (hdp->hld_next) { + hdp->hld_next->hld_prev = hdp; + } + hdp->hld_prev = hpdp; + hpdp->hld_next = hdp; + + ASSERT(hdp->hld_next != hdp); + ASSERT(hdp->hld_prev != hdp); + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + return (0); +} + +static int +hldir_make_hlnode(hlnode_t *dir, hlfsmount_t *hm, vattr_t *va, enum de_op op, + vnode_t *realvp, hlnode_t **newnode, cred_t *cr) +{ + hlnode_t *hp; + enum vtype type; + + ASSERT(va != NULL); + ASSERT(op == DE_CREATE || op == DE_MKDIR); + if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) || + ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime))) + return (EOVERFLOW); + type = va->va_type; + hp = kmem_zalloc(sizeof (hlnode_t), KM_SLEEP); + hyprlofs_node_init(hm, hp, va, cr); + + hp->hln_vnode->v_rdev = hp->hln_rdev = NODEV; + hp->hln_vnode->v_type = type; + hp->hln_uid = crgetuid(cr); + + /* + * To determine the gid of the created file: + * If the directory's set-gid bit is set, set the gid to the gid + * of the parent dir, otherwise, use the process's gid. + */ + if (dir->hln_mode & VSGID) + hp->hln_gid = dir->hln_gid; + else + hp->hln_gid = crgetgid(cr); + + /* + * If we're creating a dir and the parent dir has the set-GID bit set, + * set it on the new dir. Otherwise, if the user is neither privileged + * nor a member of the file's new group, clear the file's set-GID bit. + */ + if (dir->hln_mode & VSGID && type == VDIR) + hp->hln_mode |= VSGID; + else { + if ((hp->hln_mode & VSGID) && + secpolicy_vnode_setids_setgids(cr, hp->hln_gid) != 0) + hp->hln_mode &= ~VSGID; + } + + if (va->va_mask & AT_ATIME) + hp->hln_atime = va->va_atime; + if (va->va_mask & AT_MTIME) + hp->hln_mtime = va->va_mtime; + + if (op == DE_MKDIR) { + hyprlofs_dirinit(dir, hp); + hp->hln_looped = 0; + } else { + hp->hln_realvp = realvp; + hp->hln_size = va->va_size; + hp->hln_looped = 1; + } + + *newnode = hp; + return (0); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c new file mode 100644 index 0000000000..1d857309f3 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c @@ -0,0 +1,127 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/debug.h> +#include <sys/time.h> +#include <sys/cmn_err.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/vfs.h> +#include <sys/cred.h> +#include <sys/kmem.h> +#include <sys/atomic.h> +#include <sys/policy.h> +#include <sys/fs/hyprlofs_info.h> + +#define MODESHIFT 3 + +/* Initialize a hlnode and add it to file list under mount point. */ +void +hyprlofs_node_init(hlfsmount_t *hm, hlnode_t *h, vattr_t *vap, cred_t *cr) +{ + vnode_t *vp; + timestruc_t now; + + ASSERT(vap != NULL); + + rw_init(&h->hln_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&h->hln_tlock, NULL, MUTEX_DEFAULT, NULL); + h->hln_mode = MAKEIMODE(vap->va_type, vap->va_mode); + h->hln_mask = 0; + h->hln_type = vap->va_type; + h->hln_nodeid = (ino64_t)(uint32_t)((uintptr_t)h >> 3); + h->hln_nlink = 1; + h->hln_size = 0; + + if (cr == NULL) { + h->hln_uid = vap->va_uid; + h->hln_gid = vap->va_gid; + } else { + h->hln_uid = crgetuid(cr); + h->hln_gid = crgetgid(cr); + } + + h->hln_fsid = hm->hlm_dev; + h->hln_rdev = vap->va_rdev; + h->hln_blksize = PAGESIZE; + h->hln_nblocks = 0; + gethrestime(&now); + h->hln_atime = now; + h->hln_mtime = now; + h->hln_ctime = now; + h->hln_seq = 0; + h->hln_dir = NULL; + + h->hln_vnode = vn_alloc(KM_SLEEP); + vp = HLNTOV(h); + vn_setops(vp, hyprlofs_vnodeops); + vp->v_vfsp = hm->hlm_vfsp; + vp->v_type = vap->va_type; + vp->v_rdev = vap->va_rdev; + vp->v_data = (caddr_t)h; + mutex_enter(&hm->hlm_contents); + /* + * Increment the pseudo generation number for this hlnode. Since + * hlnodes are allocated and freed, there really is no particular + * generation number for a new hlnode. Just fake it by using a + * counter in each file system. + */ + h->hln_gen = hm->hlm_gen++; + + /* + * Add new hlnode to end of linked list of hlnodes for this hyprlofs + * Root dir is handled specially in hyprlofs_mount. + */ + if (hm->hlm_rootnode != (hlnode_t *)NULL) { + h->hln_forw = NULL; + h->hln_back = hm->hlm_rootnode->hln_back; + h->hln_back->hln_forw = hm->hlm_rootnode->hln_back = h; + } + mutex_exit(&hm->hlm_contents); + vn_exists(vp); +} + +int +hyprlofs_taccess(void *vtp, int mode, cred_t *cr) +{ + hlnode_t *hp = vtp; + int shift = 0; + + /* Check access based on owner, group and public perms in hlnode. */ + if (crgetuid(cr) != hp->hln_uid) { + shift += MODESHIFT; + if (groupmember(hp->hln_gid, cr) == 0) + shift += MODESHIFT; + } + + return (secpolicy_vnode_access2(cr, HLNTOV(hp), hp->hln_uid, + hp->hln_mode << shift, mode)); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c new file mode 100644 index 0000000000..c582a8cac2 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c @@ -0,0 +1,614 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +/* + * Hyperlofs is a hybrid file system combining features of the tmpfs(7FS) and + * lofs(7FS) file systems. It is modeled on code from both of these file + * systems. + * + * The purpose is to create a high performance name space for files on which + * applications will compute. Given a large number of data files with various + * owners, we want to construct a view onto those files such that only a subset + * is visible to the applications and such that the view can be changed very + * quickly as compute progresses. Entries in the name space are not mounts and + * thus do not appear in the mnttab. Entries in the name space are allowed to + * refer to files on different backing file systems. Intermediate directories + * in the name space exist only in-memory, ala tmpfs. There are no leaf nodes + * in the name space except for entries that refer to backing files ala lofs. + * + * The name space is managed via ioctls issued on the mounted file system and + * is mostly read-only for the compute applications. That is, applications + * cannot create new files in the name space. If a file is unlinked by an + * application, that only removes the file from the name space, the backing + * file remains in place. It is possible for applications to write-through to + * the backing files if the file system is mounted read-write. + * + * The name space is managed via the HYPRLOFS_ADD_ENTRIES, HYPRLOFS_RM_ENTRIES, + * and HYPRLOFS_RM_ALL ioctls on the top-level mount. + * + * The HYPRLOFS_ADD_ENTRIES ioctl specifies path(s) to the backing file(s) and + * the name(s) for the file(s) in the name space. The name(s) may be path(s) + * which will be relative to the root of the mount and thus cannot begin with + * a /. If the name is a path, it does not have to correspond to any backing + * path. The intermediate directories will only exist in the name space. The + * entry(ies) will be added to the name space. + * + * The HYPRLOFS_RM_ENTRIES ioctl specifies the name(s) of the file(s) in the + * name space which should be removed. The name(s) may be path(s) which will + * be relative to the root of the mount and thus cannot begin with a /. The + * named entry(ies) will be removed. + * + * The HYPRLOFS_RM_ALL ioctl will remove all mappings from the name space. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/time.h> +#include <sys/pathname.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/stat.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/statvfs.h> +#include <sys/mount.h> +#include <sys/debug.h> +#include <sys/systm.h> +#include <sys/mntent.h> +#include <fs/fs_subr.h> +#include <vm/page.h> +#include <vm/anon.h> +#include <sys/model.h> +#include <sys/policy.h> + +#include <sys/fs/swapnode.h> +#include <sys/fs/hyprlofs_info.h> + +static int hyprlofsfstype; + +/* + * hyprlofs vfs operations. 
+ */ +static int hyprlofsinit(int, char *); +static int hyprlofs_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *); +static int hyprlofs_unmount(vfs_t *, int, cred_t *); +static int hyprlofs_root(vfs_t *, vnode_t **); +static int hyprlofs_statvfs(vfs_t *, struct statvfs64 *); +static int hyprlofs_vget(vfs_t *, vnode_t **, struct fid *); + +/* + * Loadable module wrapper + */ +#include <sys/modctl.h> + +static mntopts_t hyprlofs_mntopts; + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "hyprlofs", + hyprlofsinit, + VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT, + &hyprlofs_mntopts +}; + +static mntopts_t hyprlofs_mntopts = { + 0, NULL +}; + +/* + * Module linkage information + */ +static struct modlfs modlfs = { + &mod_fsops, "filesystem for hyprlofs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modlfs, NULL +}; + +int +_init() +{ + return (mod_install(&modlinkage)); +} + +int +_fini() +{ + int error; + + error = mod_remove(&modlinkage); + if (error) + return (error); + /* + * Tear down the operations vectors + */ + (void) vfs_freevfsops_by_type(hyprlofsfstype); + vn_freevnodeops(hyprlofs_vnodeops); + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* + * The following are patchable variables limiting the amount of system + * resources hyprlofs can use. + * + * hyprlofs_maxkmem limits the amount of kernel kmem_alloc memory hyprlofs can + * use for it's data structures (e.g. hlnodes, directory entries). It is set + * as a percentage of physical memory which is determined when hyprlofs is + * first used in the system. + * + * hyprlofs_minfree is the minimum amount of swap space that hyprlofs leaves for + * the rest of the system. If the amount of free swap space in the system + * (i.e. anoninfo.ani_free) drops below hyprlofs_minfree, hyprlofs anon + * allocations will fail. + */ +size_t hyprlofs_maxkmem = 0; +size_t hyprlofs_minfree = 0; +size_t hyprlofs_kmemspace; /* bytes of kernel heap used by all hyprlofs */ + +static major_t hyprlofs_major; +static minor_t hyprlofs_minor; +static kmutex_t hyprlofs_minor_lock; + +/* + * initialize global hyprlofs locks and hashes when loading hyprlofs module + */ +static int +hyprlofsinit(int fstype, char *name) +{ + static const fs_operation_def_t hl_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = hyprlofs_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = hyprlofs_unmount }, + VFSNAME_ROOT, { .vfs_root = hyprlofs_root }, + VFSNAME_STATVFS, { .vfs_statvfs = hyprlofs_statvfs }, + VFSNAME_VGET, { .vfs_vget = hyprlofs_vget }, + NULL, NULL + }; + int error; + extern void hyprlofs_hash_init(); + + hyprlofs_hash_init(); + hyprlofsfstype = fstype; + ASSERT(hyprlofsfstype != 0); + + error = vfs_setfsops(fstype, hl_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "hyprlofsinit: bad vfs ops template"); + return (error); + } + + error = vn_make_ops(name, hyprlofs_vnodeops_template, + &hyprlofs_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "hyprlofsinit: bad vnode ops template"); + return (error); + } + + /* + * hyprlofs_minfree is an absolute limit of swap space which still + * allows other processes to execute. Set it if its not patched. 
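+ *
+ * Because hyprlofs_minfree is patchable, an administrator could override
+ * the default before the module is first used, e.g. with an /etc/system
+ * line along these lines (value in pages; shown only as an illustration):
+ *
+ *	set hyprlofs:hyprlofs_minfree = 0x2000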
+ */ + if (hyprlofs_minfree == 0) + hyprlofs_minfree = btopr(HYPRLOFSMINFREE); + + if ((hyprlofs_major = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, + "hyprlofsinit: Can't get unique device number."); + hyprlofs_major = 0; + } + mutex_init(&hyprlofs_minor_lock, NULL, MUTEX_DEFAULT, NULL); + return (0); +} + +static int +hyprlofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + hlfsmount_t *hm = NULL; + hlnode_t *hp; + struct pathname dpn; + int error; + vattr_t rattr; + int got_attrs; + + if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) + return (error); + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (uap->flags & MS_REMOUNT) + return (EBUSY); + + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* Having the resource be anything but "swap" doesn't make sense. */ + vfs_setresource(vfsp, "swap", 0); + + if ((error = pn_get(uap->dir, + (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, + &dpn)) != 0) + goto out; + + if ((hm = kmem_zalloc(sizeof (hlfsmount_t), + KM_NORMALPRI | KM_NOSLEEP)) == NULL) { + pn_free(&dpn); + error = ENOMEM; + goto out; + } + + /* Get an available minor device number for this mount */ + mutex_enter(&hyprlofs_minor_lock); + do { + hyprlofs_minor = (hyprlofs_minor + 1) & L_MAXMIN32; + hm->hlm_dev = makedevice(hyprlofs_major, hyprlofs_minor); + } while (vfs_devismounted(hm->hlm_dev)); + mutex_exit(&hyprlofs_minor_lock); + + /* + * Set but don't bother entering the mutex since hlfsmount is not on + * the mount list yet. + */ + mutex_init(&hm->hlm_contents, NULL, MUTEX_DEFAULT, NULL); + + hm->hlm_vfsp = vfsp; + + vfsp->vfs_data = (caddr_t)hm; + vfsp->vfs_fstype = hyprlofsfstype; + vfsp->vfs_dev = hm->hlm_dev; + vfsp->vfs_bsize = PAGESIZE; + vfsp->vfs_flag |= VFS_NOTRUNC; + vfs_make_fsid(&vfsp->vfs_fsid, hm->hlm_dev, hyprlofsfstype); + hm->hlm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP); + (void) strcpy(hm->hlm_mntpath, dpn.pn_path); + + /* allocate and initialize root hlnode structure */ + bzero(&rattr, sizeof (vattr_t)); + rattr.va_mode = (mode_t)(S_IFDIR | 0777); + rattr.va_type = VDIR; + rattr.va_rdev = 0; + hp = kmem_zalloc(sizeof (hlnode_t), KM_SLEEP); + hyprlofs_node_init(hm, hp, &rattr, cr); + + /* Get the mode, uid, and gid from the underlying mount point. */ + rattr.va_mask = AT_MODE|AT_UID|AT_GID; + got_attrs = VOP_GETATTR(mvp, &rattr, 0, cr, NULL); + + rw_enter(&hp->hln_rwlock, RW_WRITER); + HLNTOV(hp)->v_flag |= VROOT; + + /* + * If the getattr succeeded, use its results, otherwise allow the + * previously set defaults to prevail. 
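+ *
+ * For example, if the covered directory is owned by uid 100, gid 10 with
+ * mode 0750, the root of the new hyprlofs mount presents those same owner,
+ * group and permissions; only if the VOP_GETATTR() fails does the root
+ * keep the defaults set up above (mode 0777, owned by the mounting
+ * credential).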
+ */ + if (got_attrs == 0) { + hp->hln_mode = rattr.va_mode; + hp->hln_uid = rattr.va_uid; + hp->hln_gid = rattr.va_gid; + } + + /* + * Initialize linked list of hlnodes so that the back pointer of the + * root hlnode always points to the last one on the list and the + * forward pointer of the last node is null + */ + hp->hln_back = hp; + hp->hln_forw = NULL; + hp->hln_nlink = 0; + hm->hlm_rootnode = hp; + + hyprlofs_dirinit(hp, hp); + + rw_exit(&hp->hln_rwlock); + + pn_free(&dpn); + error = 0; + +out: + return (error); +} + +static int +hyprlofs_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hnp, *cancel; + vnode_t *vp; + int error; + + if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) + return (error); + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + /* + * forced unmount is not supported by this file system + * and thus, ENOTSUP, is being returned. + */ + if (flag & MS_FORCE) + return (ENOTSUP); + + mutex_enter(&hm->hlm_contents); + + /* + * If there are no open files, only the root node should have a ref cnt. + * With hlm_contents held, nothing can be added or removed. There may + * be some dirty pages. To prevent fsflush from disrupting the unmount, + * put a hold on each node while scanning. If we find a previously + * referenced node, undo the holds we have placed and fail EBUSY. + */ + hnp = hm->hlm_rootnode; + if (HLNTOV(hnp)->v_count > 1) { + mutex_exit(&hm->hlm_contents); + return (EBUSY); + } + + for (hnp = hnp->hln_forw; hnp; hnp = hnp->hln_forw) { + if ((vp = HLNTOV(hnp))->v_count > 0) { + cancel = hm->hlm_rootnode->hln_forw; + while (cancel != hnp) { + vp = HLNTOV(cancel); + ASSERT(vp->v_count > 0); + VN_RELE(vp); + cancel = cancel->hln_forw; + } + mutex_exit(&hm->hlm_contents); + return (EBUSY); + } + VN_HOLD(vp); + } + + /* We can drop the mutex now because no one can find this mount */ + mutex_exit(&hm->hlm_contents); + + /* + * Free all alloc'd memory associated with this FS. To do this, we go + * through the file list twice, once to remove all the dir entries, and + * then to remove all the files. + */ + + /* Remove all directory entries */ + for (hnp = hm->hlm_rootnode; hnp; hnp = hnp->hln_forw) { + rw_enter(&hnp->hln_rwlock, RW_WRITER); + if (hnp->hln_type == VDIR) + hyprlofs_dirtrunc(hnp); + rw_exit(&hnp->hln_rwlock); + } + + ASSERT(hm->hlm_rootnode); + + /* + * All links are gone, v_count is keeping nodes in place. VN_RELE + * should make the node disappear, unless somebody is holding pages + * against it. Wait and retry until it disappears. + * + * We re-acquire the lock to prevent others who have a HOLD on a hlnode + * from blowing it away (in hyprlofs_inactive) while we're trying to + * get to it here. Once we have a HOLD on it we know it'll stick around. + */ + mutex_enter(&hm->hlm_contents); + + /* Remove all the files (except the rootnode) backwards. */ + while ((hnp = hm->hlm_rootnode->hln_back) != hm->hlm_rootnode) { + mutex_exit(&hm->hlm_contents); + /* Note we handled the link count in pass 2 above. */ + vp = HLNTOV(hnp); + VN_RELE(vp); + mutex_enter(&hm->hlm_contents); + /* + * It's still there after the RELE. Someone else like pageout + * has a hold on it so wait a bit and then try again. 
+ */ + if (hnp == hm->hlm_rootnode->hln_back) { + VN_HOLD(vp); + mutex_exit(&hm->hlm_contents); + delay(hz / 4); + mutex_enter(&hm->hlm_contents); + } + } + mutex_exit(&hm->hlm_contents); + + VN_RELE(HLNTOV(hm->hlm_rootnode)); + + ASSERT(hm->hlm_mntpath); + + kmem_free(hm->hlm_mntpath, strlen(hm->hlm_mntpath) + 1); + + mutex_destroy(&hm->hlm_contents); + kmem_free(hm, sizeof (hlfsmount_t)); + + return (0); +} + +/* Return root hlnode for given vnode */ +static int +hyprlofs_root(vfs_t *vfsp, vnode_t **vpp) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hp = hm->hlm_rootnode; + vnode_t *vp; + + ASSERT(hp); + + vp = HLNTOV(hp); + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +hyprlofs_statvfs(vfs_t *vfsp, struct statvfs64 *sbp) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + ulong_t blocks; + dev32_t d32; + zoneid_t eff_zid; + struct zone *zp; + + /* + * The FS may have been mounted by the GZ on behalf of the NGZ. In + * that case, the hlfsmount zone_id will be the global zone. We want + * to show the swap cap inside the zone in this case, even though the + * FS was mounted by the GZ. + */ + if (curproc->p_zone->zone_id != GLOBAL_ZONEUNIQID) + zp = curproc->p_zone; + else + zp = hm->hlm_vfsp->vfs_zone; + + if (zp == NULL) + eff_zid = GLOBAL_ZONEUNIQID; + else + eff_zid = zp->zone_id; + + sbp->f_bsize = PAGESIZE; + sbp->f_frsize = PAGESIZE; + + /* + * Find the amount of available physical and memory swap + */ + mutex_enter(&anoninfo_lock); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP; + mutex_exit(&anoninfo_lock); + + if (blocks > hyprlofs_minfree) + sbp->f_bfree = blocks - hyprlofs_minfree; + else + sbp->f_bfree = 0; + + sbp->f_bavail = sbp->f_bfree; + + /* + * Total number of blocks is what's available plus what's been used + */ + sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree); + + if (eff_zid != GLOBAL_ZONEUNIQID && + zp->zone_max_swap_ctl != UINT64_MAX) { + /* + * If the fs is used by a NGZ with a swap cap, then report the + * capped size. + */ + rctl_qty_t cap, used; + pgcnt_t pgcap, pgused; + + mutex_enter(&zp->zone_mem_lock); + cap = zp->zone_max_swap_ctl; + used = zp->zone_max_swap; + mutex_exit(&zp->zone_mem_lock); + + pgcap = btop(cap); + pgused = btop(used); + + sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree); + sbp->f_bavail = sbp->f_bfree; + sbp->f_blocks = MIN(pgcap, sbp->f_blocks); + } + + /* + * This is fairly inaccurate since it doesn't take into account the + * names stored in the directory entries. 
+ */ + sbp->f_ffree = sbp->f_files = ptob(availrmem) / + (sizeof (hlnode_t) + sizeof (hldirent_t)); + + sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); + (void) cmpldev(&d32, vfsp->vfs_dev); + sbp->f_fsid = d32; + (void) strcpy(sbp->f_basetype, vfssw[hyprlofsfstype].vsw_name); + (void) strncpy(sbp->f_fstr, hm->hlm_mntpath, sizeof (sbp->f_fstr)); + /* + * ensure null termination + */ + sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0'; + sbp->f_flag = vf_to_stf(vfsp->vfs_flag); + sbp->f_namemax = MAXNAMELEN - 1; + return (0); +} + +static int +hyprlofs_vget(vfs_t *vfsp, vnode_t **vpp, struct fid *fidp) +{ + hlfid_t *hfid; + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hp = NULL; + + hfid = (hlfid_t *)fidp; + *vpp = NULL; + + mutex_enter(&hm->hlm_contents); + for (hp = hm->hlm_rootnode; hp; hp = hp->hln_forw) { + mutex_enter(&hp->hln_tlock); + if (hp->hln_nodeid == hfid->hlfid_ino) { + /* + * If the gen numbers don't match we know the file + * won't be found since only one hlnode can have this + * number at a time. + */ + if (hp->hln_gen != hfid->hlfid_gen || + hp->hln_nlink == 0) { + mutex_exit(&hp->hln_tlock); + mutex_exit(&hm->hlm_contents); + return (0); + } + *vpp = (vnode_t *)HLNTOV(hp); + + VN_HOLD(*vpp); + + if ((hp->hln_mode & S_ISVTX) && + !(hp->hln_mode & (S_IXUSR | S_IFDIR))) { + mutex_enter(&(*vpp)->v_lock); + (*vpp)->v_flag |= VISSWAP; + mutex_exit(&(*vpp)->v_lock); + } + mutex_exit(&hp->hln_tlock); + mutex_exit(&hm->hlm_contents); + return (0); + } + mutex_exit(&hp->hln_tlock); + } + mutex_exit(&hm->hlm_contents); + return (0); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c new file mode 100644 index 0000000000..a2064dfa1f --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c @@ -0,0 +1,1441 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2015 Joyent, Inc. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/user.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/fcntl.h> +#include <sys/flock.h> +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/cred.h> +#include <sys/dirent.h> +#include <sys/pathname.h> +#include <sys/fs/hyprlofs.h> +#include <sys/fs/hyprlofs_info.h> +#include <sys/mman.h> +#include <vm/pvn.h> +#include <sys/cmn_err.h> +#include <sys/buf.h> +#include <sys/policy.h> +#include <fs/fs_subr.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> + +static int hyprlofs_add_entry(vnode_t *, char *, char *, cred_t *, + caller_context_t *); +static int hyprlofs_rm_entry(vnode_t *, char *, cred_t *, caller_context_t *, + int); +static int hyprlofs_rm_all(vnode_t *, cred_t *, caller_context_t *, int); +static int hyprlofs_remove(vnode_t *, char *, cred_t *, caller_context_t *, + int); +static int hyprlofs_get_all(vnode_t *, intptr_t, cred_t *, caller_context_t *, + int); + +/* + * This is a somewhat arbitrary upper limit on the number of entries we can + * pass in on a single add/rm ioctl call. This is only used to validate that + * the input list looks sane. + */ +#define MAX_IOCTL_PARAMS 100000 + +static int +hyprlofs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + vnode_t *rvp; + int error; + + rvp = REALVP(*vpp); + + if (VTOHLN(*vpp)->hln_looped == 0) + return (0); + + /* + * looped back, pass through to real vnode. Need to hold new reference + * to vp since VOP_OPEN() may decide to release it. + */ + VN_HOLD(rvp); + error = VOP_OPEN(&rvp, flag, cr, ct); + ASSERT(rvp->v_count > 1); + VN_RELE(rvp); + + return (error); +} + +static int +hyprlofs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 0) { + cleanlocks(vp, ttoproc(curthread)->p_pid, 0); + cleanshares(vp, ttoproc(curthread)->p_pid); + return (0); + } + + return (VOP_CLOSE(REALVP(vp), flag, count, offset, cr, ct)); +} + +static int +hyprlofs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + if (vp->v_type == VDIR) + return (EISDIR); + return (VOP_READ(REALVP(vp), uiop, ioflag, cr, ct)); +} + +static int +hyprlofs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + /* We don't support writing to non-regular files */ + if (vp->v_type != VREG) + return (EINVAL); + + if (vn_is_readonly(vp)) + return (EROFS); + + return (VOP_WRITE(REALVP(vp), uiop, ioflag, cr, ct)); +} + +/* ARGSUSED */ +static int +hyprlofs_ioctl(vnode_t *vp, int cmd, intptr_t data, int flag, + cred_t *cr, int *rvalp, caller_context_t *ct) +{ + int len, cnt, error; + int i; + model_t model; + char path[MAXPATHLEN]; + char nm[MAXPATHLEN]; + + /* We only support the hyprlofs ioctls on the root vnode */ + if (!(vp->v_flag & VROOT)) + return (ENOTTY); + + /* + * Check if managing hyprlofs is allowed. 
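+ *
+ * As a rough userland sketch, a privileged consumer reaches this handler
+ * with something like the following, using the hle_* fields consumed below
+ * (see sys/fs/hyprlofs.h for the actual declarations; the paths and names
+ * here are purely illustrative):
+ *
+ *	hyprlofs_entry_t e;
+ *	hyprlofs_entries_t ents;
+ *	int fd = open("/my/hyprlofs/mount", O_RDONLY);
+ *
+ *	e.hle_path = "/real/backing/file";
+ *	e.hle_plen = strlen(e.hle_path);
+ *	e.hle_name = "dir/in/namespace/file";
+ *	e.hle_nlen = strlen(e.hle_name);
+ *	ents.hle_entries = &e;
+ *	ents.hle_len = 1;
+ *	(void) ioctl(fd, HYPRLOFS_ADD_ENTRIES, &ents);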
+ */ + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + if (cmd == HYPRLOFS_ADD_ENTRIES || cmd == HYPRLOFS_RM_ENTRIES) { + model = get_udatamodel(); + + if (model == DATAMODEL_NATIVE) { + hyprlofs_entries_t ebuf; + hyprlofs_entry_t *e; + + if (copyin((void *)data, &ebuf, sizeof (ebuf))) + return (EFAULT); + cnt = ebuf.hle_len; + if (cnt > MAX_IOCTL_PARAMS) + return (EINVAL); + len = sizeof (hyprlofs_entry_t) * cnt; + + e = kmem_alloc(len, KM_SLEEP); + if (copyin((void *)(ebuf.hle_entries), e, len)) { + kmem_free(e, len); + return (EFAULT); + } + + for (i = 0; i < cnt; i++) { + if (e[i].hle_nlen == 0 || + e[i].hle_nlen > MAXPATHLEN) + return (EINVAL); + + if (copyin(e[i].hle_name, nm, e[i].hle_nlen) + != 0) { + kmem_free(e, len); + return (EFAULT); + } + nm[e[i].hle_nlen] = '\0'; + + if (cmd == HYPRLOFS_ADD_ENTRIES) { + if (e[i].hle_plen == 0 || + e[i].hle_plen > MAXPATHLEN) + return (EINVAL); + + if (copyin(e[i].hle_path, path, + e[i].hle_plen) != 0) { + kmem_free(e, len); + return (EFAULT); + } + path[e[i].hle_plen] = '\0'; + + if ((error = hyprlofs_add_entry(vp, + path, nm, cr, ct)) != 0) { + kmem_free(e, len); + return (error); + } + } else { + if ((error = hyprlofs_rm_entry(vp, nm, + cr, ct, flag)) != 0) { + kmem_free(e, len); + return (error); + } + } + } + + kmem_free(e, len); + return (0); + + } else { + hyprlofs_entries32_t ebuf32; + hyprlofs_entry32_t *e32; + + if (copyin((void *)data, &ebuf32, sizeof (ebuf32))) + return (EFAULT); + + cnt = ebuf32.hle_len; + if (cnt > MAX_IOCTL_PARAMS) + return (EINVAL); + len = sizeof (hyprlofs_entry32_t) * cnt; + + e32 = kmem_alloc(len, KM_SLEEP); + if (copyin((void *)(unsigned long)(ebuf32.hle_entries), + e32, len)) { + kmem_free(e32, len); + return (EFAULT); + } + + for (i = 0; i < cnt; i++) { + if (e32[i].hle_nlen == 0 || + e32[i].hle_nlen > MAXPATHLEN) + return (EINVAL); + + if (copyin((void *)(unsigned long) + e32[i].hle_name, nm, + e32[i].hle_nlen) != 0) { + kmem_free(e32, len); + return (EFAULT); + } + nm[e32[i].hle_nlen] = '\0'; + + if (cmd == HYPRLOFS_ADD_ENTRIES) { + if (e32[i].hle_plen == 0 || + e32[i].hle_plen > MAXPATHLEN) + return (EINVAL); + + if (copyin((void *)(unsigned long) + e32[i].hle_path, path, + e32[i].hle_plen) != 0) { + kmem_free(e32, len); + return (EFAULT); + } + path[e32[i].hle_plen] = '\0'; + + if ((error = hyprlofs_add_entry(vp, + path, nm, cr, ct)) != 0) { + kmem_free(e32, len); + return (error); + } + } else { + if ((error = hyprlofs_rm_entry(vp, nm, + cr, ct, flag)) != 0) { + kmem_free(e32, len); + return (error); + } + } + } + + kmem_free(e32, len); + return (0); + } + } + + if (cmd == HYPRLOFS_RM_ALL) { + return (hyprlofs_rm_all(vp, cr, ct, flag)); + } + + if (cmd == HYPRLOFS_GET_ENTRIES) { + return (hyprlofs_get_all(vp, data, cr, ct, flag)); + } + + return (ENOTTY); +} + +static int +hyprlofs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + vattr_t tmp_va; + + if (tp->hln_looped == 1) { + int error; + + if ((error = VOP_GETATTR(REALVP(vp), &tmp_va, flags, cr, + ct)) != 0) + return (error); + } + + mutex_enter(&tp->hln_tlock); + vap->va_type = vp->v_type; + vap->va_mode = tp->hln_mode & MODEMASK; + vap->va_uid = tp->hln_uid; + vap->va_gid = tp->hln_gid; + vap->va_fsid = tp->hln_fsid; + vap->va_nodeid = (ino64_t)tp->hln_nodeid; + vap->va_nlink = tp->hln_nlink; + vap->va_size = (u_offset_t)tp->hln_size; + vap->va_atime = tp->hln_atime; + vap->va_mtime = tp->hln_mtime; + vap->va_ctime = tp->hln_ctime; + vap->va_blksize = 
PAGESIZE; + vap->va_rdev = tp->hln_rdev; + vap->va_seq = tp->hln_seq; + + if (tp->hln_looped == 1) { + vap->va_nblocks = tmp_va.va_nblocks; + } else { + vap->va_nblocks = + (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size))); + } + mutex_exit(&tp->hln_tlock); + return (0); +} + +/*ARGSUSED4*/ +static int +hyprlofs_setattr(vnode_t *vp, vattr_t *vap, int flags, + cred_t *cr, caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + int error = 0; + vattr_t *get; + long mask; + + /* + * Cannot set these attributes + */ + if ((vap->va_mask & AT_NOSET) || (vap->va_mask & AT_XVATTR)) + return (EINVAL); + + mutex_enter(&tp->hln_tlock); + + get = &tp->hln_attr; + /* + * Change file access modes. Must be owner or have sufficient + * privileges. + */ + error = secpolicy_vnode_setattr(cr, vp, vap, get, flags, + hyprlofs_taccess, tp); + + if (error) + goto out; + + mask = vap->va_mask; + + if (mask & AT_MODE) { + get->va_mode &= S_IFMT; + get->va_mode |= vap->va_mode & ~S_IFMT; + } + + if (mask & AT_UID) + get->va_uid = vap->va_uid; + if (mask & AT_GID) + get->va_gid = vap->va_gid; + if (mask & AT_ATIME) + get->va_atime = vap->va_atime; + if (mask & AT_MTIME) + get->va_mtime = vap->va_mtime; + + if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME)) + gethrestime(&tp->hln_ctime); + +out: + mutex_exit(&tp->hln_tlock); + return (error); +} + +static int +hyprlofs_access(vnode_t *vp, int mode, int flags, cred_t *cr, + caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + int error; + + if (mode & VWRITE) { + if (vp->v_type == VREG && vn_is_readonly(vp)) + return (EROFS); + } + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_ACCESS(REALVP(vp), mode, flags, cr, ct)); + + mutex_enter(&tp->hln_tlock); + error = hyprlofs_taccess(tp, mode, cr); + mutex_exit(&tp->hln_tlock); + return (error); +} + +/* ARGSUSED3 */ +static int +hyprlofs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(dvp); + hlnode_t *ntp = NULL; + int error; + + if (VTOHLN(dvp)->hln_looped == 1) + return (VOP_LOOKUP(REALVP(dvp), nm, vpp, pnp, flags, rdir, + cr, ct, direntflags, realpnp)); + + if (flags & LOOKUP_XATTR) + return (EINVAL); + + /* Null component name is a synonym for directory being searched. */ + if (*nm == '\0') { + VN_HOLD(dvp); + *vpp = dvp; + return (0); + } + ASSERT(tp); + + if ((error = hyprlofs_dirlookup(tp, nm, &ntp, cr)) == 0) { + ASSERT(ntp); + *vpp = HLNTOV(ntp); + } + return (error); +} + +/* + * Create the loopback from the hyprlofs vnode to the real vnode. + */ +static int +hyprlofs_loopback(vnode_t *dvp, vnode_t *rvp, char *nm, vattr_t *vap, + int mode, cred_t *cr, caller_context_t *ct) +{ + hlnode_t *parent; + hlfsmount_t *tm; + int error; + hlnode_t *oldtp; + vnode_t *vp; + + parent = (hlnode_t *)VTOHLN(dvp); + tm = (hlfsmount_t *)VTOHLM(dvp); + error = 0; + oldtp = NULL; + + if (vap->va_type == VREG && (vap->va_mode & VSVTX)) { + /* we don't support the sticky bit */ + vap->va_mode &= ~VSVTX; + } else if (vap->va_type == VNON) { + return (EINVAL); + } + + /* Null component name is a synonym for directory being searched. 
*/ + if (*nm == '\0') { + VN_HOLD(dvp); + oldtp = parent; + } else { + error = hyprlofs_dirlookup(parent, nm, &oldtp, cr); + } + + if (error == 0) { /* name found */ + ASSERT(oldtp); + + rw_enter(&oldtp->hln_rwlock, RW_WRITER); + + /* + * if create/read-only an existing directory, allow it + */ + if ((oldtp->hln_type == VDIR) && (mode & VWRITE)) + error = EISDIR; + else { + error = hyprlofs_taccess(oldtp, mode, cr); + } + + if (error) { + rw_exit(&oldtp->hln_rwlock); + hlnode_rele(oldtp); + return (error); + } + + vp = HLNTOV(oldtp); + rw_exit(&oldtp->hln_rwlock); + + if (vp->v_type == VREG) { + hlnode_rele(oldtp); + return (EEXIST); + } + + vnevent_create(vp, ct); + return (0); + } + + if (error != ENOENT) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + error = hyprlofs_direnter(tm, parent, nm, DE_CREATE, rvp, vap, NULL, + cr); + rw_exit(&parent->hln_rwlock); + + return (error); +} + +/* + * Create an in-memory directory based on the add-entry ioctl name. + * If the dir exists, return EEXIST but still also return node in vpp. + */ +static int +hyprlofs_mkdir(vnode_t *dvp, char *nm, vattr_t *va, vnode_t **vpp, cred_t *cr) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + hlnode_t *self = NULL; + hlfsmount_t *tm = (hlfsmount_t *)VTOHLM(dvp); + int error; + + /* + * Might be dangling directory. Catch it here, because a ENOENT return + * from hyprlofs_dirlookup() is a valid return. + */ + if (parent->hln_nlink == 0) + return (ENOENT); + + error = hyprlofs_dirlookup(parent, nm, &self, cr); + if (error == 0) { + ASSERT(self); + hlnode_rele(self); + /* We can't loop in under a looped in directory */ + if (self->hln_looped) + return (EACCES); + *vpp = HLNTOV(self); + return (EEXIST); + } + if (error != ENOENT) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + error = hyprlofs_direnter(tm, parent, nm, DE_MKDIR, (vnode_t *)NULL, + va, &self, cr); + rw_exit(&parent->hln_rwlock); + + if (error == 0 || error == EEXIST) { + hlnode_rele(self); + *vpp = HLNTOV(self); + } + + return (error); +} + +/* + * Loop in a file or directory into the namespace. + */ +static int +hyprlofs_add_entry(vnode_t *vp, char *fspath, char *fsname, + cred_t *cr, caller_context_t *ct) +{ + int error; + char *p, *pnm; + vnode_t *realvp, *dvp; + vattr_t va; + + /* + * Get vnode for the real file/dir. We'll have a hold on realvp which + * we won't vn_rele until hyprlofs_inactive. + */ + if ((error = lookupname(fspath, UIO_SYSSPACE, FOLLOW, NULLVPP, + &realvp)) != 0) + return (error); + + /* no devices allowed */ + if (IS_DEVVP(realvp)) { + VN_RELE(realvp); + return (ENODEV); + } + + /* + * realvp may be an AUTOFS node, in which case we perform a VOP_ACCESS + * to trigger the mount of the intended filesystem. This causes a + * loopback mount of the intended filesystem instead of the AUTOFS + * filesystem. + */ + if ((error = VOP_ACCESS(realvp, 0, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + /* + * We're interested in the top most filesystem. This is specially + * important when fspath is a trigger AUTOFS node, since we're really + * interested in mounting the filesystem AUTOFS mounted as result of + * the VOP_ACCESS() call not the AUTOFS node itself. + */ + if (vn_mountedvfs(realvp) != NULL) { + if ((error = traverse(&realvp)) != 0) { + VN_RELE(realvp); + return (error); + } + } + + va.va_type = VNON; + /* + * If the target name is a path, make sure we have all of the + * intermediate directories, creating them if necessary. 
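+ *
+ * For example, adding an entry under the name "a/b/c" creates the
+ * in-memory directories "a" and "a/b" beneath the mount root (reusing
+ * them if they already exist) and then loops the backing vnode in as "c";
+ * only that final component refers to the real backing file.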
+ */ + dvp = vp; + pnm = p = fsname; + + /* path cannot be absolute */ + if (*p == '/') { + VN_RELE(realvp); + return (EINVAL); + } + + for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) { + if (va.va_type == VNON) + /* use the top-level dir as the template va for mkdir */ + if ((error = VOP_GETATTR(vp, &va, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + *p = '\0'; + + /* Path component cannot be empty or relative */ + if (pnm[0] == '\0' || + (pnm[0] == '.' && pnm[1] == '.' && pnm[2] == '\0')) { + VN_RELE(realvp); + return (EINVAL); + } + + if ((error = hyprlofs_mkdir(dvp, pnm, &va, &dvp, cr)) != 0 && + error != EEXIST) { + VN_RELE(realvp); + return (error); + } + + *p = '/'; + pnm = p + 1; + } + + /* The file name is required */ + if (pnm[0] == '\0') { + VN_RELE(realvp); + return (EINVAL); + } + + /* Now use the real file's va as the template va */ + if ((error = VOP_GETATTR(realvp, &va, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + /* Make the vnode */ + error = hyprlofs_loopback(dvp, realvp, pnm, &va, va.va_mode, cr, ct); + if (error != 0) + VN_RELE(realvp); + return (error); +} + +/* + * Remove a looped in file from the namespace. + */ +static int +hyprlofs_rm_entry(vnode_t *dvp, char *fsname, cred_t *cr, caller_context_t *ct, + int flags) +{ + int error; + char *p, *pnm; + hlnode_t *parent; + hlnode_t *fndtp; + + pnm = p = fsname; + + /* path cannot be absolute */ + if (*p == '/') + return (EINVAL); + + /* + * If the target name is a path, get the containing dir and simple + * file name. + */ + parent = (hlnode_t *)VTOHLN(dvp); + for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) { + *p = '\0'; + + /* Path component cannot be empty or relative */ + if (pnm[0] == '\0' || + (pnm[0] == '.' && pnm[1] == '.' && pnm[2] == '\0')) + return (EINVAL); + + if ((error = hyprlofs_dirlookup(parent, pnm, &fndtp, cr)) != 0) + return (error); + + dvp = HLNTOV(fndtp); + parent = fndtp; + pnm = p + 1; + } + + /* The file name is required */ + if (pnm[0] == '\0') + return (EINVAL); + + /* Remove the entry from the parent dir */ + return (hyprlofs_remove(dvp, pnm, cr, ct, flags)); +} + +/* + * Remove all looped in files from the namespace. + */ +static int +hyprlofs_rm_all(vnode_t *dvp, cred_t *cr, caller_context_t *ct, + int flags) +{ + int error = 0; + hlnode_t *hp = (hlnode_t *)VTOHLN(dvp); + hldirent_t *hdp; + + hlnode_hold(hp); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. 
+ */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + goto done; + } + + hdp = hp->hln_dir; + while (hdp) { + hlnode_t *fndhp; + + if (strcmp(hdp->hld_name, ".") == 0 || + strcmp(hdp->hld_name, "..") == 0) { + hdp = hdp->hld_next; + continue; + } + + /* This holds the fndhp vnode */ + error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr); + if (error != 0) + goto done; + hlnode_rele(fndhp); + + if (fndhp->hln_looped == 0) { + /* recursively remove contents of this subdir */ + if (fndhp->hln_type == VDIR) { + vnode_t *tvp = HLNTOV(fndhp); + + error = hyprlofs_rm_all(tvp, cr, ct, flags); + if (error != 0) + goto done; + } + } + + /* remove the entry */ + error = hyprlofs_remove(dvp, hdp->hld_name, cr, ct, flags); + if (error != 0) + goto done; + + hdp = hp->hln_dir; + } + +done: + hlnode_rele(hp); + return (error); +} + +/* + * Get a list of all looped in files in the namespace. + */ +static int +hyprlofs_get_all_entries(vnode_t *dvp, hyprlofs_curr_entry_t *hcp, + char *prefix, int *pcnt, int n_max, + cred_t *cr, caller_context_t *ct, int flags) +{ + int error = 0; + int too_big = 0; + int cnt; + int len; + hlnode_t *hp = (hlnode_t *)VTOHLN(dvp); + hldirent_t *hdp; + char *path; + + cnt = *pcnt; + path = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + hlnode_hold(hp); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. + */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + goto done; + } + + hdp = hp->hln_dir; + while (hdp) { + hlnode_t *fndhp; + vnode_t *tvp; + + if (strcmp(hdp->hld_name, ".") == 0 || + strcmp(hdp->hld_name, "..") == 0) { + hdp = hdp->hld_next; + continue; + } + + /* This holds the fndhp vnode */ + error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr); + if (error != 0) + goto done; + hlnode_rele(fndhp); + + if (fndhp->hln_looped == 0) { + /* recursively get contents of this subdir */ + VERIFY(fndhp->hln_type == VDIR); + tvp = HLNTOV(fndhp); + + if (*prefix == '\0') + (void) strlcpy(path, hdp->hld_name, MAXPATHLEN); + else + (void) snprintf(path, MAXPATHLEN, "%s/%s", + prefix, hdp->hld_name); + + error = hyprlofs_get_all_entries(tvp, hcp, path, + &cnt, n_max, cr, ct, flags); + + if (error == E2BIG) { + too_big = 1; + error = 0; + } + if (error != 0) + goto done; + } else { + if (cnt < n_max) { + char *p; + + if (*prefix == '\0') + (void) strlcpy(path, hdp->hld_name, + MAXPATHLEN); + else + (void) snprintf(path, MAXPATHLEN, + "%s/%s", prefix, hdp->hld_name); + + len = strlen(path); + ASSERT(len <= MAXPATHLEN); + if (copyout(path, (void *)(hcp[cnt].hce_name), + len)) { + error = EFAULT; + goto done; + } + + tvp = REALVP(HLNTOV(fndhp)); + if (tvp->v_path == vn_vpath_empty) { + p = "<unknown>"; + } else { + p = tvp->v_path; + } + len = strlen(p); + ASSERT(len <= MAXPATHLEN); + if (copyout(p, (void *)(hcp[cnt].hce_path), + len)) { + error = EFAULT; + goto done; + } + } + + cnt++; + if (cnt > n_max) + too_big = 1; + } + + hdp = hdp->hld_next; + } + +done: + hlnode_rele(hp); + kmem_free(path, MAXPATHLEN); + + *pcnt = cnt; + if (error == 0 && too_big == 1) + error = E2BIG; + + return (error); +} + +/* + * Return a list of all looped in files in the namespace. 
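+ *
+ * A caller that does not know how many entries exist can use a sketched
+ * two-pass approach: issue HYPRLOFS_GET_ENTRIES with hce_cnt set to 0,
+ * and when E2BIG comes back hce_cnt has been rewritten with the number of
+ * looped-in files, so the hce_entries array can be sized appropriately and
+ * the ioctl reissued.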
+ */ +static int +hyprlofs_get_all(vnode_t *dvp, intptr_t data, cred_t *cr, caller_context_t *ct, + int flags) +{ + int limit, cnt, error; + model_t model; + hyprlofs_curr_entry_t *e; + + model = get_udatamodel(); + + if (model == DATAMODEL_NATIVE) { + hyprlofs_curr_entries_t ebuf; + + if (copyin((void *)data, &ebuf, sizeof (ebuf))) + return (EFAULT); + limit = ebuf.hce_cnt; + e = ebuf.hce_entries; + if (limit > MAX_IOCTL_PARAMS) + return (EINVAL); + + } else { + hyprlofs_curr_entries32_t ebuf32; + + if (copyin((void *)data, &ebuf32, sizeof (ebuf32))) + return (EFAULT); + + limit = ebuf32.hce_cnt; + e = (hyprlofs_curr_entry_t *)(unsigned long) + (ebuf32.hce_entries); + if (limit > MAX_IOCTL_PARAMS) + return (EINVAL); + } + + cnt = 0; + error = hyprlofs_get_all_entries(dvp, e, "", &cnt, limit, cr, ct, + flags); + + if (error == 0 || error == E2BIG) { + if (model == DATAMODEL_NATIVE) { + hyprlofs_curr_entries_t ebuf; + + ebuf.hce_cnt = cnt; + if (copyout(&ebuf, (void *)data, sizeof (ebuf))) + return (EFAULT); + + } else { + hyprlofs_curr_entries32_t ebuf32; + + ebuf32.hce_cnt = cnt; + if (copyout(&ebuf32, (void *)data, sizeof (ebuf32))) + return (EFAULT); + } + } + + return (error); +} + +/* ARGSUSED3 */ +static int +hyprlofs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, + int flags) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + int error; + hlnode_t *hp = NULL; + + /* This holds the hp vnode */ + error = hyprlofs_dirlookup(parent, nm, &hp, cr); + if (error) + return (error); + + ASSERT(hp); + rw_enter(&parent->hln_rwlock, RW_WRITER); + rw_enter(&hp->hln_rwlock, RW_WRITER); + + error = hyprlofs_dirdelete(parent, hp, nm, DR_REMOVE, cr); + + rw_exit(&hp->hln_rwlock); + rw_exit(&parent->hln_rwlock); + vnevent_remove(HLNTOV(hp), dvp, nm, ct); + + /* + * We've now dropped the dir link so by rele-ing our vnode we should + * clean up in hyprlofs_inactive. + */ + hlnode_rele(hp); + + return (error); +} + +/* ARGSUSED4 */ +static int +hyprlofs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr, + caller_context_t *ct, int flags) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + hlnode_t *self = NULL; + vnode_t *vp; + int error = 0; + + /* Return error if removing . or .. */ + if (strcmp(nm, ".") == 0) + return (EINVAL); + if (strcmp(nm, "..") == 0) + return (EEXIST); /* Should be ENOTEMPTY */ + error = hyprlofs_dirlookup(parent, nm, &self, cr); + if (error) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + rw_enter(&self->hln_rwlock, RW_WRITER); + + vp = HLNTOV(self); + if (vp == dvp || vp == cdir) { + error = EINVAL; + goto done1; + } + if (self->hln_type != VDIR) { + error = ENOTDIR; + goto done1; + } + + /* + * When a dir is looped in, we only remove the in-memory dir, not the + * backing dir. + */ + if (self->hln_looped == 0) { + mutex_enter(&self->hln_tlock); + if (self->hln_nlink > 2) { + mutex_exit(&self->hln_tlock); + error = EEXIST; + goto done1; + } + mutex_exit(&self->hln_tlock); + + if (vn_vfswlock(vp)) { + error = EBUSY; + goto done1; + } + if (vn_mountedvfs(vp) != NULL) { + error = EBUSY; + goto done; + } + + /* + * Check for an empty directory, i.e. only includes entries for + * "." and ".." 
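+		 * (hln_dirents counts "." and ".." as well, so anything
+		 * greater than two means real entries remain.)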
+ */ + if (self->hln_dirents > 2) { + error = EEXIST; /* SIGH should be ENOTEMPTY */ + /* + * Update atime because checking hln_dirents is + * equivalent to reading the directory + */ + gethrestime(&self->hln_atime); + goto done; + } + + error = hyprlofs_dirdelete(parent, self, nm, DR_RMDIR, cr); + } else { + error = hyprlofs_dirdelete(parent, self, nm, DR_REMOVE, cr); + } + +done: + if (self->hln_looped == 0) + vn_vfsunlock(vp); +done1: + rw_exit(&self->hln_rwlock); + rw_exit(&parent->hln_rwlock); + vnevent_rmdir(HLNTOV(self), dvp, nm, ct); + + /* + * We've now dropped the dir link so by rele-ing our vnode we should + * clean up in hyprlofs_inactive. + */ + hlnode_rele(self); + + return (error); +} + +static int +hyprlofs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hldirent_t *hdp; + int error = 0; + size_t namelen; + struct dirent64 *dp; + ulong_t offset; + ulong_t total_bytes_wanted; + long outcount = 0; + long bufsize; + int reclen; + caddr_t outbuf; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_READDIR(REALVP(vp), uiop, cr, eofp, ct, flags)); + + if (uiop->uio_loffset >= MAXOFF_T) { + if (eofp) + *eofp = 1; + return (0); + } + /* assuming syscall has already called hln_rwlock */ + ASSERT(RW_READ_HELD(&hp->hln_rwlock)); + + if (uiop->uio_iovcnt != 1) + return (EINVAL); + + if (vp->v_type != VDIR) + return (ENOTDIR); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. + */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + return (0); + } + + /* Get space for multiple dir entries */ + total_bytes_wanted = uiop->uio_iov->iov_len; + bufsize = total_bytes_wanted + sizeof (struct dirent64); + outbuf = kmem_alloc(bufsize, KM_SLEEP); + + dp = (struct dirent64 *)((uintptr_t)outbuf); + + offset = 0; + hdp = hp->hln_dir; + while (hdp) { + namelen = strlen(hdp->hld_name); /* no +1 needed */ + offset = hdp->hld_offset; + if (offset >= uiop->uio_offset) { + reclen = (int)DIRENT64_RECLEN(namelen); + if (outcount + reclen > total_bytes_wanted) { + if (!outcount) + /* Buffer too small for any entries. */ + error = EINVAL; + break; + } + ASSERT(hdp->hld_hlnode != NULL); + + /* zero out uninitialized bytes */ + (void) strncpy(dp->d_name, hdp->hld_name, + DIRENT64_NAMELEN(reclen)); + dp->d_reclen = (ushort_t)reclen; + dp->d_ino = (ino64_t)hdp->hld_hlnode->hln_nodeid; + dp->d_off = (offset_t)hdp->hld_offset + 1; + dp = (struct dirent64 *) + ((uintptr_t)dp + dp->d_reclen); + outcount += reclen; + ASSERT(outcount <= bufsize); + } + hdp = hdp->hld_next; + } + + if (!error) + error = uiomove(outbuf, outcount, UIO_READ, uiop); + + if (!error) { + /* + * If we reached the end of the list our offset should now be + * just past the end. 
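+		 * (offset still holds the hld_offset of the last entry in
+		 * the list, so adding one leaves uio_offset past every
+		 * entry and the next readdir call reports EOF immediately.)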
+ */ + if (!hdp) { + offset += 1; + if (eofp) + *eofp = 1; + } else if (eofp) + *eofp = 0; + uiop->uio_offset = offset; + } + gethrestime(&hp->hln_atime); + kmem_free(outbuf, bufsize); + return (error); +} + +static int +hyprlofs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_FSYNC(REALVP(vp), syncflag, cr, ct)); + return (0); +} + +/* ARGSUSED */ +static void +hyprlofs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vp->v_vfsp); + + rw_enter(&hp->hln_rwlock, RW_WRITER); + + mutex_enter(&hp->hln_tlock); + mutex_enter(&vp->v_lock); + ASSERT(vp->v_count >= 1); + + /* + * If we don't have the last hold or the link count is non-zero, + * there's nothing to do except drop our hold. + */ + if (vp->v_count > 1 || hp->hln_nlink != 0) { + vp->v_count--; + mutex_exit(&vp->v_lock); + mutex_exit(&hp->hln_tlock); + rw_exit(&hp->hln_rwlock); + return; + } + + mutex_exit(&vp->v_lock); + mutex_exit(&hp->hln_tlock); + + /* release hold on the real vnode now */ + if (hp->hln_looped == 1 && hp->hln_realvp != NULL) + VN_RELE(hp->hln_realvp); + + /* Here's our chance to send invalid event while we're between locks */ + vn_invalid(HLNTOV(hp)); + + mutex_enter(&hm->hlm_contents); + if (hp->hln_forw == NULL) + hm->hlm_rootnode->hln_back = hp->hln_back; + else + hp->hln_forw->hln_back = hp->hln_back; + hp->hln_back->hln_forw = hp->hln_forw; + mutex_exit(&hm->hlm_contents); + rw_exit(&hp->hln_rwlock); + rw_destroy(&hp->hln_rwlock); + mutex_destroy(&hp->hln_tlock); + vn_free(HLNTOV(hp)); + kmem_free(hp, sizeof (hlnode_t)); +} + +static int +hyprlofs_fid(vnode_t *vp, struct fid *fidp, caller_context_t *ct) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hlfid_t *hfid; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_FID(REALVP(vp), fidp, ct)); + + if (fidp->fid_len < (sizeof (hlfid_t) - sizeof (ushort_t))) { + fidp->fid_len = sizeof (hlfid_t) - sizeof (ushort_t); + return (ENOSPC); + } + + hfid = (hlfid_t *)fidp; + bzero(hfid, sizeof (hlfid_t)); + hfid->hlfid_len = (int)sizeof (hlfid_t) - sizeof (ushort_t); + + hfid->hlfid_ino = hp->hln_nodeid; + hfid->hlfid_gen = hp->hln_gen; + + return (0); +} + +static int +hyprlofs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, + page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw, + cred_t *cr, caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_GETPAGE(REALVP(vp), off, len, protp, pl, plsz, seg, addr, + rw, cr, ct)); +} + +int +hyprlofs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, + cred_t *cr, caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_PUTPAGE(REALVP(vp), off, len, flags, cr, ct)); +} + +static int +hyprlofs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_MAP(REALVP(vp), off, as, addrp, len, prot, maxprot, flags, + cr, ct)); +} + +static int +hyprlofs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap 
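+	 * (as with getpage/putpage/map above: only looped-in files are
+	 * backed by a real vnode with pages, so mapping operations on
+	 * plain hyprlofs nodes are refused)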
*/ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_ADDMAP(REALVP(vp), off, as, addr, len, prot, maxprot, + flags, cr, ct)); +} + +static int +hyprlofs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_DELMAP(REALVP(vp), off, as, addr, len, prot, maxprot, + flags, cr, ct)); +} + +static int +hyprlofs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, + offset_t offset, cred_t *cr, caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_SPACE(REALVP(vp), cmd, bfp, flag, offset, cr, ct)); +} + +static int +hyprlofs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, + caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 0) + return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); + + return (VOP_SEEK(REALVP(vp), ooff, noffp, ct)); +} + +static int +hyprlofs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + hlnode_t *hp = VTOHLN(vp); + + if (hp->hln_looped == 1) + return (VOP_RWLOCK(REALVP(vp), write_lock, ct)); + + if (write_lock) { + rw_enter(&hp->hln_rwlock, RW_WRITER); + } else { + rw_enter(&hp->hln_rwlock, RW_READER); + } + return (write_lock); +} + +static void +hyprlofs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + hlnode_t *hp = VTOHLN(vp); + + if (hp->hln_looped == 1) { + VOP_RWUNLOCK(REALVP(vp), write_lock, ct); + return; + } + + rw_exit(&hp->hln_rwlock); +} + +static int +hyprlofs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) +{ + int error; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_PATHCONF(REALVP(vp), cmd, valp, cr, ct)); + + switch (cmd) { + case _PC_XATTR_ENABLED: + case _PC_XATTR_EXISTS: + case _PC_SATTR_ENABLED: + case _PC_SATTR_EXISTS: + error = EINVAL; + break; + case _PC_TIMESTAMP_RESOLUTION: + /* nanosecond timestamp resolution */ + *valp = 1L; + error = 0; + break; + default: + error = fs_pathconf(vp, cmd, valp, cr, ct); + } + return (error); +} + + +struct vnodeops *hyprlofs_vnodeops; + +const fs_operation_def_t hyprlofs_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = hyprlofs_open }, + VOPNAME_CLOSE, { .vop_close = hyprlofs_close }, + VOPNAME_READ, { .vop_read = hyprlofs_read }, + VOPNAME_WRITE, { .vop_write = hyprlofs_write }, + VOPNAME_IOCTL, { .vop_ioctl = hyprlofs_ioctl }, + VOPNAME_GETATTR, { .vop_getattr = hyprlofs_getattr }, + VOPNAME_SETATTR, { .vop_setattr = hyprlofs_setattr }, + VOPNAME_ACCESS, { .vop_access = hyprlofs_access }, + VOPNAME_LOOKUP, { .vop_lookup = hyprlofs_lookup }, + VOPNAME_CREATE, { .error = fs_error }, + VOPNAME_REMOVE, { .vop_remove = hyprlofs_remove }, + VOPNAME_LINK, { .error = fs_error }, + VOPNAME_RENAME, { .error = fs_error }, + VOPNAME_MKDIR, { .error = fs_error }, + VOPNAME_RMDIR, { .vop_rmdir = hyprlofs_rmdir }, + VOPNAME_READDIR, { .vop_readdir = hyprlofs_readdir }, + VOPNAME_SYMLINK, { .error = fs_error }, + VOPNAME_READLINK, { .error = fs_error }, + VOPNAME_FSYNC, { .vop_fsync = hyprlofs_fsync }, + VOPNAME_INACTIVE, { .vop_inactive = hyprlofs_inactive }, + VOPNAME_FID, { .vop_fid = hyprlofs_fid }, + VOPNAME_RWLOCK, { .vop_rwlock = hyprlofs_rwlock }, + VOPNAME_RWUNLOCK, { .vop_rwunlock = hyprlofs_rwunlock }, + VOPNAME_SEEK, { .vop_seek = hyprlofs_seek }, + VOPNAME_SPACE, { .vop_space = hyprlofs_space }, + 
VOPNAME_GETPAGE, { .vop_getpage = hyprlofs_getpage }, + VOPNAME_PUTPAGE, { .vop_putpage = hyprlofs_putpage }, + VOPNAME_MAP, { .vop_map = hyprlofs_map }, + VOPNAME_ADDMAP, { .vop_addmap = hyprlofs_addmap }, + VOPNAME_DELMAP, { .vop_delmap = hyprlofs_delmap }, + VOPNAME_PATHCONF, { .vop_pathconf = hyprlofs_pathconf }, + VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, + NULL, NULL +}; diff --git a/usr/src/uts/common/fs/lookup.c b/usr/src/uts/common/fs/lookup.c index 55ffb94805..59ec5d1829 100644 --- a/usr/src/uts/common/fs/lookup.c +++ b/usr/src/uts/common/fs/lookup.c @@ -21,6 +21,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. */ @@ -57,6 +58,7 @@ #include <sys/zone.h> #include <sys/dnlc.h> #include <sys/fs/snode.h> +#include <sys/brand.h> /* Controls whether paths are stored with vnodes. */ int vfs_vnode_path = 1; @@ -977,6 +979,96 @@ localpath(char *path, struct vnode *vrootp, cred_t *cr) } /* + * Clean a stale v_path from a vnode. This is only performed if the v_path has + * not been altered since it was found to be stale + */ +static void +vnode_clear_vpath(vnode_t *vp, char *vpath_old) +{ + mutex_enter(&vp->v_lock); + if (vp->v_path != vn_vpath_empty && vp->v_path == vpath_old) { + vp->v_path = vn_vpath_empty; + mutex_exit(&vp->v_lock); + kmem_free(vpath_old, strlen(vpath_old) + 1); + } else { + mutex_exit(&vp->v_lock); + } +} + +/* + * Validate that a pathname refers to a given vnode. + */ +static int +vnode_valid_pn(vnode_t *vp, vnode_t *vrootp, pathname_t *pn, pathname_t *rpn, + int flags, cred_t *cr) +{ + vnode_t *compvp; + /* + * If we are in a zone or a chroot environment, then we have to + * take additional steps, since the path to the root might not + * be readable with the current credentials, even though the + * process can legitmately access the file. In this case, we + * do the following: + * + * lookuppnvp() with all privileges to get the resolved path. + * call localpath() to get the local portion of the path, and + * continue as normal. + * + * If the the conversion to a local path fails, then we continue + * as normal. This is a heuristic to make process object file + * paths available from within a zone. Because lofs doesn't + * support page operations, the vnode stored in the seg_t is + * actually the underlying real vnode, not the lofs node itself. + * Most of the time, the lofs path is the same as the underlying + * vnode (for example, /usr/lib/libc.so.1). + */ + if (vrootp != rootdir) { + char *local = NULL; + + VN_HOLD(rootdir); + if (lookuppnvp(pn, rpn, FOLLOW, NULL, &compvp, rootdir, + rootdir, kcred) == 0) { + local = localpath(rpn->pn_path, vrootp, kcred); + VN_RELE(compvp); + } + + /* + * The original pn was changed through lookuppnvp(). + * Set it to local for next validation attempt. + */ + if (local) { + (void) pn_set(pn, local); + } else { + return (1); + } + } + + /* + * We should have a local path at this point, so start the search from + * the root of the current process. + */ + VN_HOLD(vrootp); + if (vrootp != rootdir) + VN_HOLD(vrootp); + if (lookuppnvp(pn, rpn, FOLLOW | flags, NULL, &compvp, vrootp, vrootp, + cr) == 0) { + /* + * Check to see if the returned vnode is the same as the one we + * expect. 
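+		 * vn_compare() catches the case of the very same (or
+		 * VOP_CMP-equivalent) vnode; vnode_match() is the
+		 * attribute-based fallback for lookups that return a
+		 * distinct vnode over the same underlying file.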
+ */ + if (vn_compare(vp, compvp) || + vnode_match(vp, compvp, cr)) { + VN_RELE(compvp); + return (0); + } else { + VN_RELE(compvp); + } + } + + return (1); +} + +/* * Given a directory, return the full, resolved path. This looks up "..", * searches for the given vnode in the parent, appends the component, etc. It * is used to implement vnodetopath() and getcwd() when the cached path fails. @@ -995,6 +1087,8 @@ dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags, char *bufloc; size_t dlen = DIRENT64_RECLEN(MAXPATHLEN); refstr_t *mntpt; + char *vpath_cached; + boolean_t vpath_stale; /* Operation only allowed on directories */ ASSERT(vp->v_type == VDIR); @@ -1088,40 +1182,28 @@ dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags, * Shortcut: see if this vnode has correct v_path. If so, * we have the work done. */ + vpath_cached = NULL; + vpath_stale = B_FALSE; mutex_enter(&vp->v_lock); - if (vp->v_path != NULL) { - - if ((err = pn_set(&pn, vp->v_path)) == 0) { - mutex_exit(&vp->v_lock); - rpn.pn_path = rpn.pn_buf; - - /* - * Ensure the v_path pointing to correct vnode - */ - VN_HOLD(vrootp); - if (vrootp != rootdir) - VN_HOLD(vrootp); - if (lookuppnvp(&pn, &rpn, flags, NULL, - &cmpvp, vrootp, vrootp, cr) == 0) { - - if (VN_CMP(vp, cmpvp)) { - VN_RELE(cmpvp); + if (vp->v_path != vn_vpath_empty && + pn_set(&pn, vp->v_path) == 0) { + vpath_cached = vp->v_path; + mutex_exit(&vp->v_lock); + rpn.pn_path = rpn.pn_buf; - complen = strlen(rpn.pn_path); - bufloc -= complen; - if (bufloc < buf) { - err = ERANGE; - goto out; - } - bcopy(rpn.pn_path, bufloc, - complen); - break; - } else { - VN_RELE(cmpvp); - } + /* Ensure the v_path pointing to correct vnode */ + if (vnode_valid_pn(vp, vrootp, &pn, &rpn, flags, + cr) == 0) { + complen = strlen(rpn.pn_path); + bufloc -= complen; + if (bufloc < buf) { + err = ERANGE; + goto out; } + bcopy(rpn.pn_path, bufloc, complen); + break; } else { - mutex_exit(&vp->v_lock); + vpath_stale = B_TRUE; } } else { mutex_exit(&vp->v_lock); @@ -1166,38 +1248,6 @@ dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags, } /* - * Try to obtain the path component from dnlc cache - * before searching through the directory. - */ - if ((cmpvp = dnlc_reverse_lookup(vp, dbuf, dlen)) != NULL) { - /* - * If we got parent vnode as a result, - * then the answered path is correct. - */ - if (VN_CMP(cmpvp, pvp)) { - VN_RELE(cmpvp); - complen = strlen(dbuf); - bufloc -= complen; - if (bufloc <= buf) { - err = ENAMETOOLONG; - goto out; - } - bcopy(dbuf, bufloc, complen); - - /* Prepend a slash to the current path */ - *--bufloc = '/'; - - /* And continue with the next component */ - VN_RELE(vp); - vp = pvp; - pvp = NULL; - continue; - } else { - VN_RELE(cmpvp); - } - } - - /* * Search the parent directory for the entry corresponding to * this vnode. */ @@ -1215,6 +1265,11 @@ dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags, /* Prepend a slash to the current path. */ *--bufloc = '/'; + /* Clear vp->v_path if it was found to be stale. */ + if (vpath_stale == B_TRUE) { + vnode_clear_vpath(vp, vpath_cached); + } + /* And continue with the next component */ VN_RELE(vp); vp = pvp; @@ -1306,144 +1361,49 @@ vnodetopath_common(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, VN_RELE(vp); } - pn_alloc(&pn); /* - * Check to see if we have a cached path in the vnode. + * Check to see if we have a valid cached path in the vnode. 
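+	 * v_lock is held only long enough to copy v_path into the
+	 * pathname_t; validation then proceeds without the lock, since
+	 * the private copy cannot change underneath us.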
*/ + pn_alloc(&pn); mutex_enter(&vp->v_lock); - if (vp->v_path != NULL) { + if (vp->v_path != vn_vpath_empty) { (void) pn_set(&pn, vp->v_path); mutex_exit(&vp->v_lock); - pn_alloc(&rpn); - /* We should only cache absolute paths */ ASSERT(pn.pn_buf[0] == '/'); - /* - * If we are in a zone or a chroot environment, then we have to - * take additional steps, since the path to the root might not - * be readable with the current credentials, even though the - * process can legitmately access the file. In this case, we - * do the following: - * - * lookuppnvp() with all privileges to get the resolved path. - * call localpath() to get the local portion of the path, and - * continue as normal. - * - * If the the conversion to a local path fails, then we continue - * as normal. This is a heuristic to make process object file - * paths available from within a zone. Because lofs doesn't - * support page operations, the vnode stored in the seg_t is - * actually the underlying real vnode, not the lofs node itself. - * Most of the time, the lofs path is the same as the underlying - * vnode (for example, /usr/lib/libc.so.1). - */ - if (vrootp != rootdir) { - char *local = NULL; - VN_HOLD(rootdir); - if (lookuppnvp(&pn, &rpn, FOLLOW, - NULL, &compvp, rootdir, rootdir, kcred) == 0) { - local = localpath(rpn.pn_path, vrootp, - kcred); - VN_RELE(compvp); - } - - /* - * The original pn was changed through lookuppnvp(). - * Set it to local for next validation attempt. - */ - if (local) { - (void) pn_set(&pn, local); - } else { - goto notcached; + pn_alloc(&rpn); + if (vnode_valid_pn(vp, vrootp, &pn, &rpn, flags, cr) == 0) { + /* Return the result, if we're able. */ + if (buflen > rpn.pn_pathlen) { + bcopy(rpn.pn_path, buf, rpn.pn_pathlen + 1); + pn_free(&pn); + pn_free(&rpn); + VN_RELE(vrootp); + if (doclose) { + (void) VOP_CLOSE(vp, FREAD, 1, 0, cr, + NULL); + VN_RELE(vp); + } + return (0); } } - /* - * We should have a local path at this point, so start the - * search from the root of the current process. + * A stale v_path will be purged by the later dirtopath lookup. */ - VN_HOLD(vrootp); - if (vrootp != rootdir) - VN_HOLD(vrootp); - ret = lookuppnvp(&pn, &rpn, FOLLOW | flags, NULL, - &compvp, vrootp, vrootp, cr); - if (ret == 0) { - /* - * Check to see if the returned vnode is the same as - * the one we expect. If not, give up. - */ - if (!vn_compare(vp, compvp) && - !vnode_match(vp, compvp, cr)) { - VN_RELE(compvp); - goto notcached; - } - - VN_RELE(compvp); - - /* - * Return the result. - */ - if (buflen <= rpn.pn_pathlen) - goto notcached; - - bcopy(rpn.pn_path, buf, rpn.pn_pathlen + 1); - pn_free(&pn); - pn_free(&rpn); - VN_RELE(vrootp); - if (doclose) { - (void) VOP_CLOSE(vp, FREAD, 1, 0, cr, NULL); - VN_RELE(vp); - } - return (0); - } - -notcached: pn_free(&rpn); } else { mutex_exit(&vp->v_lock); } - pn_free(&pn); if (vp->v_type != VDIR) { - /* - * If we don't have a directory, try to find it in the dnlc via - * reverse lookup. Once this is found, we can use the regular - * directory search to find the full path. 
- */ - if ((pvp = dnlc_reverse_lookup(vp, path, MAXNAMELEN)) != NULL) { - /* - * Check if we have read privilege so, that - * we can lookup the path in the directory - */ - ret = 0; - if ((flags & LOOKUP_CHECKREAD)) { - ret = VOP_ACCESS(pvp, VREAD, 0, cr, NULL); - } - if (ret == 0) { - ret = dirtopath(vrootp, pvp, buf, buflen, - flags, cr); - } - if (ret == 0) { - len = strlen(buf); - if (len + strlen(path) + 1 >= buflen) { - ret = ENAMETOOLONG; - } else { - if (buf[len - 1] != '/') - buf[len++] = '/'; - bcopy(path, buf + len, - strlen(path) + 1); - } - } - - VN_RELE(pvp); - } else - ret = ENOENT; - } else + ret = ENOENT; + } else { ret = dirtopath(vrootp, vp, buf, buflen, flags, cr); + } VN_RELE(vrootp); if (doclose) { diff --git a/usr/src/uts/common/fs/lxproc/lxpr_subr.c b/usr/src/uts/common/fs/lxproc/lxpr_subr.c new file mode 100644 index 0000000000..3c1405d4af --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_subr.c @@ -0,0 +1,524 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + +#include <sys/varargs.h> +#include <sys/cpuvar.h> +#include <sys/mman.h> +#include <sys/vmsystm.h> +#include <sys/prsystm.h> + +#include "lxproc.h" + +#define LXPRCACHE_NAME "lxpr_cache" + +static int lxpr_node_constructor(void *, void *, int); +static void lxpr_node_destructor(void *, void *); + +static kmem_cache_t *lxpr_node_cache; + +struct lxpr_uiobuf { + uio_t *uiop; + char *buffer; + uint32_t buffsize; + char *pos; + size_t beg; + int error; +}; + +int lxpr_bufsize = 4000; + +struct lxpr_uiobuf * +lxpr_uiobuf_new(uio_t *uiop) +{ + /* Allocate memory for both lxpr_uiobuf and output buffer */ + int bufsize = lxpr_bufsize; + struct lxpr_uiobuf *uiobuf = + kmem_alloc(sizeof (struct lxpr_uiobuf) + bufsize, KM_SLEEP); + + uiobuf->uiop = uiop; + uiobuf->buffer = (char *)&uiobuf[1]; + uiobuf->buffsize = bufsize; + uiobuf->pos = uiobuf->buffer; + uiobuf->beg = 0; + uiobuf->error = 0; + + return (uiobuf); +} + +void +lxpr_uiobuf_free(struct lxpr_uiobuf *uiobuf) +{ + ASSERT(uiobuf != NULL); + ASSERT(uiobuf->pos == uiobuf->buffer); + + kmem_free(uiobuf, sizeof (struct lxpr_uiobuf) + uiobuf->buffsize); +} + +void +lxpr_uiobuf_seek(struct lxpr_uiobuf *uiobuf, offset_t offset) +{ + uiobuf->uiop->uio_offset = (off_t)offset; +} + +void +lxpr_uiobuf_seterr(struct lxpr_uiobuf *uiobuf, int err) +{ + ASSERT(uiobuf->error == 0); + + uiobuf->error = err; +} + +int +lxpr_uiobuf_flush(struct lxpr_uiobuf *uiobuf) +{ + off_t off = uiobuf->uiop->uio_offset; + caddr_t uaddr = uiobuf->buffer; + size_t beg = uiobuf->beg; + size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr; + + if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + ASSERT(off >= beg); + + if (beg + size > off && off >= 0) + uiobuf->error = + uiomove(uaddr + (off - beg), size - (off - beg), + UIO_READ, uiobuf->uiop); + + uiobuf->beg += size; + } + + uiobuf->pos = uaddr; + + return (uiobuf->error); +} + +void +lxpr_uiobuf_write(struct lxpr_uiobuf *uiobuf, const char *buf, size_t size) +{ + /* While we can still carry on */ + while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + uintptr_t remain = (uintptr_t)uiobuf->buffsize - + ((uintptr_t)uiobuf->pos - (uintptr_t)uiobuf->buffer); + + /* Enough space in buffer? */ + if (remain >= size) { + bcopy(buf, uiobuf->pos, size); + uiobuf->pos += size; + return; + } + + /* Not enough space, so copy all we can and try again */ + bcopy(buf, uiobuf->pos, remain); + uiobuf->pos += remain; + (void) lxpr_uiobuf_flush(uiobuf); + buf += remain; + size -= remain; + } +} + +#define TYPBUFFSIZE 256 + +void +lxpr_uiobuf_printf(struct lxpr_uiobuf *uiobuf, const char *fmt, ...) +{ + va_list args; + char buff[TYPBUFFSIZE]; + int len; + char *buffer; + + /* Can we still do any output */ + if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0) + return; + + va_start(args, fmt); + + /* Try using stack allocated buffer */ + len = vsnprintf(buff, TYPBUFFSIZE, fmt, args); + if (len < TYPBUFFSIZE) { + va_end(args); + lxpr_uiobuf_write(uiobuf, buff, len); + return; + } + + /* Not enough space in pre-allocated buffer */ + buffer = kmem_alloc(len + 1, KM_SLEEP); + + /* + * We know we allocated the correct amount of space + * so no check on the return value + */ + (void) vsnprintf(buffer, len+1, fmt, args); + lxpr_uiobuf_write(uiobuf, buffer, len); + va_end(args); + kmem_free(buffer, len+1); +} + +/* + * lxpr_lock(): + * + * Lookup process from pid and return with p_plock and P_PR_LOCK held. 
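+ *
+ * Typical use (error handling here is illustrative):
+ *
+ *	proc_t *p;
+ *
+ *	if ((p = lxpr_lock(pid)) == NULL)
+ *		return (ENOENT);	(process gone, exiting, or in SIDL)
+ *	... inspect or copy what is needed from p ...
+ *	lxpr_unlock(p);
+ *
+ * A pid of 1 is translated to the zone's init process, matching the
+ * view lxproc presents to its consumers.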
+ */ +proc_t * +lxpr_lock(pid_t pid) +{ + proc_t *p; + kmutex_t *mp; + + ASSERT(!MUTEX_HELD(&pidlock)); + + for (;;) { + mutex_enter(&pidlock); + + /* + * If the pid is 1, we really want the zone's init process + */ + p = prfind((pid == 1) ? + curproc->p_zone->zone_proc_initpid : pid); + + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (NULL); + } + + /* + * p_lock is persistent, but p itself is not -- it could + * vanish during cv_wait(). Load p->p_lock now so we can + * drop it after cv_wait() without referencing p. + */ + mp = &p->p_lock; + mutex_enter(mp); + + mutex_exit(&pidlock); + + if (p->p_flag & SEXITING) { + /* + * This process is exiting -- let it go. + */ + mutex_exit(mp); + return (NULL); + } + + if (!(p->p_proc_flag & P_PR_LOCK)) + break; + + cv_wait(&pr_pid_cv[p->p_slot], mp); + mutex_exit(mp); + } + + p->p_proc_flag |= P_PR_LOCK; + THREAD_KPRI_REQUEST(); + return (p); +} + +/* + * lxpr_unlock() + * + * Unlock locked process + */ +void +lxpr_unlock(proc_t *p) +{ + ASSERT(p->p_proc_flag & P_PR_LOCK); + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(!MUTEX_HELD(&pidlock)); + + cv_signal(&pr_pid_cv[p->p_slot]); + p->p_proc_flag &= ~P_PR_LOCK; + mutex_exit(&p->p_lock); + THREAD_KPRI_RELEASE(); +} + +void +lxpr_initnodecache() +{ + lxpr_node_cache = kmem_cache_create(LXPRCACHE_NAME, + sizeof (lxpr_node_t), 0, + lxpr_node_constructor, lxpr_node_destructor, NULL, NULL, NULL, 0); +} + +void +lxpr_fininodecache() +{ + kmem_cache_destroy(lxpr_node_cache); +} + +/* ARGSUSED */ +static int +lxpr_node_constructor(void *buf, void *un, int kmflags) +{ + lxpr_node_t *lxpnp = buf; + vnode_t *vp; + + vp = lxpnp->lxpr_vnode = vn_alloc(kmflags); + if (vp == NULL) + return (-1); + + (void) vn_setops(vp, lxpr_vnodeops); + vp->v_data = lxpnp; + + return (0); +} + +/* ARGSUSED */ +static void +lxpr_node_destructor(void *buf, void *un) +{ + lxpr_node_t *lxpnp = buf; + + vn_free(LXPTOV(lxpnp)); +} + +/* + * Calculate an inode number + * + * This takes various bits of info and munges them + * to give the inode number for an lxproc node + */ +ino_t +lxpr_inode(lxpr_nodetype_t type, pid_t pid, int fd) +{ + if (pid == 1) + pid = curproc->p_zone->zone_proc_initpid; + + switch (type) { + case LXPR_PIDDIR: + return (pid + 1); + case LXPR_PROCDIR: + return (maxpid + 2); + case LXPR_PID_FD_FD: + return (maxpid + 2 + + (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + LXPR_NFILES + fd); + default: + return (maxpid + 2 + + (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + type); + } +} + +/* + * Return inode number of parent (directory) + */ +ino_t +lxpr_parentinode(lxpr_node_t *lxpnp) +{ + /* + * If the input node is the root then the parent inode + * is the mounted on inode so just return our inode number + */ + if (lxpnp->lxpr_type != LXPR_PROCDIR) + return (VTOLXP(lxpnp->lxpr_parent)->lxpr_ino); + else + return (lxpnp->lxpr_ino); +} + +/* + * Allocate a new lxproc node + * + * This also allocates the vnode associated with it + */ +lxpr_node_t * +lxpr_getnode(vnode_t *dp, lxpr_nodetype_t type, proc_t *p, int fd) +{ + lxpr_node_t *lxpnp; + vnode_t *vp; + user_t *up; + timestruc_t now; + + /* + * Allocate a new node. It is deallocated in vop_innactive + */ + lxpnp = kmem_cache_alloc(lxpr_node_cache, KM_SLEEP); + + /* + * Set defaults (may be overridden below) + */ + gethrestime(&now); + lxpnp->lxpr_type = type; + lxpnp->lxpr_realvp = NULL; + lxpnp->lxpr_parent = dp; + VN_HOLD(dp); + if (p != NULL) { + lxpnp->lxpr_pid = ((p->p_pid == + curproc->p_zone->zone_proc_initpid) ? 
1 : p->p_pid); + + lxpnp->lxpr_time = PTOU(p)->u_start; + lxpnp->lxpr_uid = crgetruid(p->p_cred); + lxpnp->lxpr_gid = crgetrgid(p->p_cred); + lxpnp->lxpr_ino = lxpr_inode(type, p->p_pid, fd); + } else { + /* Pretend files without a proc belong to sched */ + lxpnp->lxpr_pid = 0; + lxpnp->lxpr_time = now; + lxpnp->lxpr_uid = lxpnp->lxpr_gid = 0; + lxpnp->lxpr_ino = lxpr_inode(type, 0, 0); + } + + /* initialize the vnode data */ + vp = lxpnp->lxpr_vnode; + vn_reinit(vp); + vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT; + vp->v_vfsp = dp->v_vfsp; + + /* + * Do node specific stuff + */ + switch (type) { + case LXPR_PROCDIR: + vp->v_flag |= VROOT; + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by everyone */ + break; + + case LXPR_PID_CURDIR: + ASSERT(p != NULL); + + /* + * Zombie check. p_stat is officially protected by pidlock, + * but we can't grab pidlock here because we already hold + * p_lock. Luckily if we look at the process exit code + * we see that p_stat only transisions from SRUN to SZOMB + * while p_lock is held. Aside from this, the only other + * p_stat transition that we need to be aware about is + * SIDL to SRUN, but that's not a problem since lxpr_lock() + * ignores nodes in the SIDL state so we'll never get a node + * that isn't already in the SRUN state. + */ + if (p->p_stat == SZOMB) { + lxpnp->lxpr_realvp = NULL; + } else { + up = PTOU(p); + lxpnp->lxpr_realvp = up->u_cdir; + ASSERT(lxpnp->lxpr_realvp != NULL); + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_ROOTDIR: + ASSERT(p != NULL); + /* Zombie check. see locking comment above */ + if (p->p_stat == SZOMB) { + lxpnp->lxpr_realvp = NULL; + } else { + up = PTOU(p); + lxpnp->lxpr_realvp = + up->u_rdir != NULL ? up->u_rdir : rootdir; + ASSERT(lxpnp->lxpr_realvp != NULL); + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_EXE: + ASSERT(p != NULL); + lxpnp->lxpr_realvp = p->p_exec; + if (lxpnp->lxpr_realvp != NULL) { + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; + break; + + case LXPR_SELF: + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_FD_FD: + ASSERT(p != NULL); + /* lxpr_realvp is set after we return */ + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0700; /* read-write-exe owner only */ + break; + + case LXPR_PID_FDDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0500; /* read-search by owner only */ + break; + + case LXPR_PIDDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0511; + break; + + case LXPR_NETDIR: + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by all */ + break; + + case LXPR_PID_ENV: + case LXPR_PID_MEM: + ASSERT(p != NULL); + /*FALLTHRU*/ + case LXPR_KCORE: + vp->v_type = VREG; + lxpnp->lxpr_mode = 0400; /* read-only by owner only */ + break; + + default: + vp->v_type = VREG; + lxpnp->lxpr_mode = 0444; /* read-only by all */ + break; + } + + return (lxpnp); +} + + +/* + * Free the storage obtained from lxpr_getnode(). 
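+ * The vnode itself is not freed here; it was allocated by the cache
+ * constructor and stays attached to the lxpr_node_t, to be vn_free()d
+ * by the cache destructor when the kmem cache releases the object.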
+ */ +void +lxpr_freenode(lxpr_node_t *lxpnp) +{ + ASSERT(lxpnp != NULL); + ASSERT(LXPTOV(lxpnp) != NULL); + + /* + * delete any association with realvp + */ + if (lxpnp->lxpr_realvp != NULL) + VN_RELE(lxpnp->lxpr_realvp); + + /* + * delete any association with parent vp + */ + if (lxpnp->lxpr_parent != NULL) + VN_RELE(lxpnp->lxpr_parent); + + /* + * Release the lxprnode. + */ + kmem_cache_free(lxpr_node_cache, lxpnp); +} diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c new file mode 100644 index 0000000000..1bb7bd3823 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c @@ -0,0 +1,367 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/stat.h> +#include <sys/statvfs.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/mode.h> +#include <sys/signal.h> +#include <sys/user.h> +#include <sys/mount.h> +#include <sys/bitmap.h> +#include <sys/kmem.h> +#include <sys/policy.h> +#include <sys/modctl.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> + +#include "lxproc.h" + +/* Module level parameters */ +static int lxprocfstype; +static dev_t lxprocdev; +static kmutex_t lxpr_mount_lock; + +int nproc_highbit; /* highbit(v.v_nproc) */ + +static int lxpr_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *); +static int lxpr_unmount(vfs_t *, int, cred_t *); +static int lxpr_root(vfs_t *, vnode_t **); +static int lxpr_statvfs(vfs_t *, statvfs64_t *); +static int lxpr_init(int, char *); + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "lxproc", + lxpr_init, + VSW_ZMOUNT, + NULL +}; + +/* + * Module linkage information for the kernel. 
+ */ +extern struct mod_ops mod_fsops; + +static struct modlfs modlfs = { + &mod_fsops, "generic linux procfs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlfs, NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int retval; + + /* + * attempt to unload the module + */ + if ((retval = mod_remove(&modlinkage)) != 0) + goto done; + + /* + * destroy lxpr_node cache + */ + lxpr_fininodecache(); + + /* + * clean out the vfsops and vnodeops + */ + (void) vfs_freevfsops_by_type(lxprocfstype); + vn_freevnodeops(lxpr_vnodeops); + + mutex_destroy(&lxpr_mount_lock); +done: + return (retval); +} + +static int +lxpr_init(int fstype, char *name) +{ + static const fs_operation_def_t lxpr_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = lxpr_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = lxpr_unmount }, + VFSNAME_ROOT, { .vfs_root = lxpr_root }, + VFSNAME_STATVFS, { .vfs_statvfs = lxpr_statvfs }, + NULL, NULL + }; + extern const fs_operation_def_t lxpr_vnodeops_template[]; + int error; + major_t dev; + + nproc_highbit = highbit(v.v_proc); + lxprocfstype = fstype; + ASSERT(lxprocfstype != 0); + + mutex_init(&lxpr_mount_lock, NULL, MUTEX_DEFAULT, NULL); + + /* + * Associate VFS ops vector with this fstype. + */ + error = vfs_setfsops(fstype, lxpr_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "lxpr_init: bad vfs ops template"); + return (error); + } + + /* + * Set up vnode ops vector too. + */ + error = vn_make_ops(name, lxpr_vnodeops_template, &lxpr_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "lxpr_init: bad vnode ops template"); + return (error); + } + + /* + * Assign a unique "device" number (used by stat(2)). 
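+	 * Failure here is not fatal: falling back to major number 0
+	 * merely risks st_dev colliding with another pseudo filesystem.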
+ */ + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "lxpr_init: can't get unique device number"); + dev = 0; + } + + /* + * Make the pseudo device + */ + lxprocdev = makedevice(dev, 0); + + /* + * Initialize cache for lxpr_nodes + */ + lxpr_initnodecache(); + + return (0); +} + +static int +lxpr_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt; + zone_t *zone = curproc->p_zone; + ldi_ident_t li; + int err; + + /* + * must be root to mount + */ + if (secpolicy_fs_mount(cr, mvp, vfsp) != 0) + return (EPERM); + + /* + * mount point must be a directory + */ + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (zone == global_zone) { + zone_t *mntzone; + + mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); + zone_rele(mntzone); + if (zone != mntzone) + return (EBUSY); + } + + /* + * Having the resource be anything but "lxproc" doesn't make sense + */ + vfs_setresource(vfsp, "lxproc", 0); + + lxpr_mnt = kmem_alloc(sizeof (*lxpr_mnt), KM_SLEEP); + + if ((err = ldi_ident_from_mod(&modlinkage, &li)) != 0) { + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + return (err); + } + + lxpr_mnt->lxprm_li = li; + + mutex_enter(&lxpr_mount_lock); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + mutex_exit(&lxpr_mount_lock); + kmem_free(lxpr_mnt, sizeof ((*lxpr_mnt))); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* + * allocate the first vnode + */ + zone_hold(lxpr_mnt->lxprm_zone = zone); + + /* Arbitrarily set the parent vnode to the mounted over directory */ + lxpr_mnt->lxprm_node = lxpr_getnode(mvp, LXPR_PROCDIR, NULL, 0); + + /* Correctly set the fs for the root node */ + lxpr_mnt->lxprm_node->lxpr_vnode->v_vfsp = vfsp; + + vfs_make_fsid(&vfsp->vfs_fsid, lxprocdev, lxprocfstype); + vfsp->vfs_bsize = DEV_BSIZE; + vfsp->vfs_fstype = lxprocfstype; + vfsp->vfs_data = (caddr_t)lxpr_mnt; + vfsp->vfs_dev = lxprocdev; + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt = (lxpr_mnt_t *)vfsp->vfs_data; + vnode_t *vp; + int count; + + ASSERT(lxpr_mnt != NULL); + vp = LXPTOV(lxpr_mnt->lxprm_node); + + mutex_enter(&lxpr_mount_lock); + + /* + * must be root to unmount + */ + if (secpolicy_fs_unmount(cr, vfsp) != 0) { + mutex_exit(&lxpr_mount_lock); + return (EPERM); + } + + /* + * forced unmount is not supported by this file system + */ + if (flag & MS_FORCE) { + mutex_exit(&lxpr_mount_lock); + return (ENOTSUP); + } + + /* + * Ensure that no vnodes are in use on this mount point. 
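+	 * Checking the root vnode's count is sufficient: every
+	 * lxpr_node_t holds its parent (see lxpr_getnode()), so any live
+	 * node elsewhere in the tree keeps the root's v_count above one.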
+ */ + mutex_enter(&vp->v_lock); + count = vp->v_count; + mutex_exit(&vp->v_lock); + if (count > 1) { + mutex_exit(&lxpr_mount_lock); + return (EBUSY); + } + + /* + * purge the dnlc cache for vnode entries + * associated with this file system + */ + count = dnlc_purge_vfsp(vfsp, 0); + + /* + * free up the lxprnode + */ + lxpr_freenode(lxpr_mnt->lxprm_node); + zone_rele(lxpr_mnt->lxprm_zone); + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_root(vfs_t *vfsp, vnode_t **vpp) +{ + lxpr_node_t *lxpnp = ((lxpr_mnt_t *)vfsp->vfs_data)->lxprm_node; + vnode_t *vp = LXPTOV(lxpnp); + + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +lxpr_statvfs(vfs_t *vfsp, statvfs64_t *sp) +{ + int n; + dev32_t d32; + extern uint_t nproc; + + n = v.v_proc - nproc; + + bzero((caddr_t)sp, sizeof (*sp)); + sp->f_bsize = DEV_BSIZE; + sp->f_frsize = DEV_BSIZE; + sp->f_blocks = (fsblkcnt64_t)0; + sp->f_bfree = (fsblkcnt64_t)0; + sp->f_bavail = (fsblkcnt64_t)0; + sp->f_files = (fsfilcnt64_t)v.v_proc + 2; + sp->f_ffree = (fsfilcnt64_t)n; + sp->f_favail = (fsfilcnt64_t)n; + (void) cmpldev(&d32, vfsp->vfs_dev); + sp->f_fsid = d32; + /* It is guaranteed that vsw_name will fit in f_basetype */ + (void) strcpy(sp->f_basetype, vfssw[lxprocfstype].vsw_name); + sp->f_flag = vf_to_stf(vfsp->vfs_flag); + sp->f_namemax = 64; /* quite arbitrary */ + + (void) strcpy(sp->f_fstr, "lxproc"); + + return (0); +} diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vnops.c b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c new file mode 100644 index 0000000000..48f4efc1bf --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c @@ -0,0 +1,3099 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2015 Joyent, Inc. All rights reserved. + */ + +/* + * lxproc -- a loosely Linux-compatible /proc + * + * We have -- confusingly -- two implementations of Linux /proc. One is to + * support the LX brand with a Linux /proc entirely compatible with the Linux + * world view; the other -- this one -- is to support native (but Linux-borne) + * programs that wish to view the native system via the Linux /proc model. So + * the aspiration here is to provide something that sufficiently approximates + * the Linux /proc implementation for purposes of offering some compatibility + * for simple Linux /proc readers (e.g., ps/top/htop). However, it is not + * intended to exactly mimic Linux semantics; when choosing between offering + * compatibility and telling the truth, we emphatically pick the truth. 
A + * particular glaring example of this is the Linux notion of "tasks" (that is, + * threads), which -- due to historical misadventures on Linux -- allocate their + * identifiers from the process identifier space. (That is, each thread has in + * effect a pid.) Some Linux /proc readers have come to depend on this + * attribute, and become confused when threads appear with proper identifiers, + * so we simply opt for the pre-2.6 behavior, and do not present the tasks + * directory at all. Similarly, when choosing between offering compatibility + * and remaining consistent with our broader security model, we (obviously) + * choose security over compatibility. In short, this is meant to be a best + * effort -- no more -- and as such, it should not be unified with the much + * more complete Linux /proc implementation found in the LX brand. + */ + +#include <sys/cpupart.h> +#include <sys/cpuvar.h> +#include <sys/session.h> +#include <sys/vmparam.h> +#include <sys/mman.h> +#include <vm/rm.h> +#include <vm/seg_vn.h> +#include <sys/sdt.h> +#include <sys/strlog.h> +#include <sys/stropts.h> +#include <sys/cmn_err.h> +#include <sys/x86_archext.h> +#include <sys/archsystm.h> +#include <sys/fp.h> +#include <sys/pool_pset.h> +#include <sys/pset.h> +#include <sys/zone.h> +#include <sys/pghw.h> +#include <sys/vfs_opreg.h> + +/* Dependent on procfs */ +extern kthread_t *prchoose(proc_t *); + +#include "lxproc.h" + +extern pgcnt_t swapfs_minfree; +extern time_t boot_time; + +/* + * Pointer to the vnode ops vector for this fs. + * This is instantiated in lxprinit() in lxpr_vfsops.c + */ +vnodeops_t *lxpr_vnodeops; + +static int lxpr_open(vnode_t **, int, cred_t *, caller_context_t *); +static int lxpr_close(vnode_t *, int, int, offset_t, cred_t *, + caller_context_t *); +static int lxpr_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *); +static int lxpr_getattr(vnode_t *, vattr_t *, int, cred_t *, + caller_context_t *); +static int lxpr_access(vnode_t *, int, int, cred_t *, caller_context_t *); +static int lxpr_lookup(vnode_t *, char *, vnode_t **, + pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *, + pathname_t *); +static int lxpr_readdir(vnode_t *, uio_t *, cred_t *, int *, + caller_context_t *, int); +static int lxpr_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *); +static int lxpr_cmp(vnode_t *, vnode_t *, caller_context_t *); +static int lxpr_realvp(vnode_t *, vnode_t **, caller_context_t *); +static int lxpr_sync(void); +static void lxpr_inactive(vnode_t *, cred_t *, caller_context_t *); + +static vnode_t *lxpr_lookup_procdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_piddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_not_a_dir(vnode_t *, char *); +static vnode_t *lxpr_lookup_fddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_netdir(vnode_t *, char *); + +static int lxpr_readdir_procdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_piddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_not_a_dir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_fddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_netdir(lxpr_node_t *, uio_t *, int *); + +static void lxpr_read_invalid(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_empty(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_cpuinfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_isdir(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_fd(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_kmsg(lxpr_node_t *, lxpr_uiobuf_t *, 
ldi_handle_t); +static void lxpr_read_loadavg(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_meminfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_mounts(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_partitions(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_uptime(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_version(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_pid_cmdline(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_maps(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_statm(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_status(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_net_arp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev_mcast(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_igmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_vif(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_mcfilter(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_netstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_raw(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_route(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rpc(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rt_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_sockstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_snmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_tcp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_udp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_unix(lxpr_node_t *, lxpr_uiobuf_t *); + +/* + * Simple conversion + */ +#define btok(x) ((x) >> 10) /* bytes to kbytes */ +#define ptok(x) ((x) << (PAGESHIFT - 10)) /* pages to kbytes */ + +/* + * The lxproc vnode operations vector + */ +const fs_operation_def_t lxpr_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = lxpr_open }, + VOPNAME_CLOSE, { .vop_close = lxpr_close }, + VOPNAME_READ, { .vop_read = lxpr_read }, + VOPNAME_GETATTR, { .vop_getattr = lxpr_getattr }, + VOPNAME_ACCESS, { .vop_access = lxpr_access }, + VOPNAME_LOOKUP, { .vop_lookup = lxpr_lookup }, + VOPNAME_READDIR, { .vop_readdir = lxpr_readdir }, + VOPNAME_READLINK, { .vop_readlink = lxpr_readlink }, + VOPNAME_FSYNC, { .error = lxpr_sync }, + VOPNAME_SEEK, { .error = lxpr_sync }, + VOPNAME_INACTIVE, { .vop_inactive = lxpr_inactive }, + VOPNAME_CMP, { .vop_cmp = lxpr_cmp }, + VOPNAME_REALVP, { .vop_realvp = lxpr_realvp }, + NULL, NULL +}; + +/* + * file contents of an lxproc directory. 
+ */ +static lxpr_dirent_t lxpr_dir[] = { + { LXPR_CMDLINE, "cmdline" }, + { LXPR_CPUINFO, "cpuinfo" }, + { LXPR_DEVICES, "devices" }, + { LXPR_DMA, "dma" }, + { LXPR_FILESYSTEMS, "filesystems" }, + { LXPR_INTERRUPTS, "interrupts" }, + { LXPR_IOPORTS, "ioports" }, + { LXPR_KCORE, "kcore" }, + { LXPR_KMSG, "kmsg" }, + { LXPR_LOADAVG, "loadavg" }, + { LXPR_MEMINFO, "meminfo" }, + { LXPR_MOUNTS, "mounts" }, + { LXPR_NETDIR, "net" }, + { LXPR_PARTITIONS, "partitions" }, + { LXPR_SELF, "self" }, + { LXPR_STAT, "stat" }, + { LXPR_UPTIME, "uptime" }, + { LXPR_VERSION, "version" } +}; + +#define PROCDIRFILES (sizeof (lxpr_dir) / sizeof (lxpr_dir[0])) + +/* + * Contents of an /lxproc/<pid> directory. + */ +static lxpr_dirent_t piddir[] = { + { LXPR_PID_CMDLINE, "cmdline" }, + { LXPR_PID_CPU, "cpu" }, + { LXPR_PID_CURDIR, "cwd" }, + { LXPR_PID_ENV, "environ" }, + { LXPR_PID_EXE, "exe" }, + { LXPR_PID_MAPS, "maps" }, + { LXPR_PID_MEM, "mem" }, + { LXPR_PID_ROOTDIR, "root" }, + { LXPR_PID_STAT, "stat" }, + { LXPR_PID_STATM, "statm" }, + { LXPR_PID_STATUS, "status" }, + { LXPR_PID_FDDIR, "fd" } +}; + +#define PIDDIRFILES (sizeof (piddir) / sizeof (piddir[0])) + +/* + * contents of /lxproc/net directory + */ +static lxpr_dirent_t netdir[] = { + { LXPR_NET_ARP, "arp" }, + { LXPR_NET_DEV, "dev" }, + { LXPR_NET_DEV_MCAST, "dev_mcast" }, + { LXPR_NET_IGMP, "igmp" }, + { LXPR_NET_IP_MR_CACHE, "ip_mr_cache" }, + { LXPR_NET_IP_MR_VIF, "ip_mr_vif" }, + { LXPR_NET_MCFILTER, "mcfilter" }, + { LXPR_NET_NETSTAT, "netstat" }, + { LXPR_NET_RAW, "raw" }, + { LXPR_NET_ROUTE, "route" }, + { LXPR_NET_RPC, "rpc" }, + { LXPR_NET_RT_CACHE, "rt_cache" }, + { LXPR_NET_SOCKSTAT, "sockstat" }, + { LXPR_NET_SNMP, "snmp" }, + { LXPR_NET_STAT, "stat" }, + { LXPR_NET_TCP, "tcp" }, + { LXPR_NET_UDP, "udp" }, + { LXPR_NET_UNIX, "unix" } +}; + +#define NETDIRFILES (sizeof (netdir) / sizeof (netdir[0])) + +/* + * These are the major signal number differences between Linux and native: + * + * ==================================== + * | Number | Linux | Native | + * | ====== | ========= | ========== | + * | 7 | SIGBUS | SIGEMT | + * | 10 | SIGUSR1 | SIGBUS | + * | 12 | SIGUSR2 | SIGSYS | + * | 16 | SIGSTKFLT | SIGUSR1 | + * | 17 | SIGCHLD | SIGUSR2 | + * | 18 | SIGCONT | SIGCHLD | + * | 19 | SIGSTOP | SIGPWR | + * | 20 | SIGTSTP | SIGWINCH | + * | 21 | SIGTTIN | SIGURG | + * | 22 | SIGTTOU | SIGPOLL | + * | 23 | SIGURG | SIGSTOP | + * | 24 | SIGXCPU | SIGTSTP | + * | 25 | SIGXFSZ | SIGCONT | + * | 26 | SIGVTALARM | SIGTTIN | + * | 27 | SIGPROF | SIGTTOU | + * | 28 | SIGWINCH | SIGVTALARM | + * | 29 | SIGPOLL | SIGPROF | + * | 30 | SIGPWR | SIGXCPU | + * | 31 | SIGSYS | SIGXFSZ | + * ==================================== + * + * Not every Linux signal maps to a native signal, nor does every native + * signal map to a Linux counterpart. However, when signals do map, the + * mapping is unique. 
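+ *
+ * lxpr_sigmap[] below is indexed by native signal number and yields the
+ * Linux number, with -1 for native signals that have no Linux
+ * counterpart; the real-time signals map linearly onto LX_SIGRTMIN
+ * through LX_SIGRTMAX.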
+ */ +static int +lxpr_sigmap[NSIG] = { + 0, + LX_SIGHUP, + LX_SIGINT, + LX_SIGQUIT, + LX_SIGILL, + LX_SIGTRAP, + LX_SIGABRT, + LX_SIGSTKFLT, + LX_SIGFPE, + LX_SIGKILL, + LX_SIGBUS, + LX_SIGSEGV, + LX_SIGSYS, + LX_SIGPIPE, + LX_SIGALRM, + LX_SIGTERM, + LX_SIGUSR1, + LX_SIGUSR2, + LX_SIGCHLD, + LX_SIGPWR, + LX_SIGWINCH, + LX_SIGURG, + LX_SIGPOLL, + LX_SIGSTOP, + LX_SIGTSTP, + LX_SIGCONT, + LX_SIGTTIN, + LX_SIGTTOU, + LX_SIGVTALRM, + LX_SIGPROF, + LX_SIGXCPU, + LX_SIGXFSZ, + -1, /* 32: illumos SIGWAITING */ + -1, /* 33: illumos SIGLWP */ + -1, /* 34: illumos SIGFREEZE */ + -1, /* 35: illumos SIGTHAW */ + -1, /* 36: illumos SIGCANCEL */ + -1, /* 37: illumos SIGLOST */ + -1, /* 38: illumos SIGXRES */ + -1, /* 39: illumos SIGJVM1 */ + -1, /* 40: illumos SIGJVM2 */ + -1, /* 41: illumos SIGINFO */ + LX_SIGRTMIN, /* 42: illumos _SIGRTMIN */ + LX_SIGRTMIN + 1, + LX_SIGRTMIN + 2, + LX_SIGRTMIN + 3, + LX_SIGRTMIN + 4, + LX_SIGRTMIN + 5, + LX_SIGRTMIN + 6, + LX_SIGRTMIN + 7, + LX_SIGRTMIN + 8, + LX_SIGRTMIN + 9, + LX_SIGRTMIN + 10, + LX_SIGRTMIN + 11, + LX_SIGRTMIN + 12, + LX_SIGRTMIN + 13, + LX_SIGRTMIN + 14, + LX_SIGRTMIN + 15, + LX_SIGRTMIN + 16, + LX_SIGRTMIN + 17, + LX_SIGRTMIN + 18, + LX_SIGRTMIN + 19, + LX_SIGRTMIN + 20, + LX_SIGRTMIN + 21, + LX_SIGRTMIN + 22, + LX_SIGRTMIN + 23, + LX_SIGRTMIN + 24, + LX_SIGRTMIN + 25, + LX_SIGRTMIN + 26, + LX_SIGRTMIN + 27, + LX_SIGRTMIN + 28, + LX_SIGRTMIN + 29, + LX_SIGRTMIN + 30, + LX_SIGRTMAX +}; + +/* + * lxpr_open(): Vnode operation for VOP_OPEN() + */ +static int +lxpr_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + vnode_t *vp = *vpp; + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + vnode_t *rvp; + int error = 0; + + /* + * We only allow reading in this file systrem + */ + if (flag & FWRITE) + return (EROFS); + + /* + * If we are opening an underlying file only allow regular files + * reject the open for anything but a regular file. + * Just do it if we are opening the current or root directory. + */ + if (lxpnp->lxpr_realvp != NULL) { + rvp = lxpnp->lxpr_realvp; + + if (type == LXPR_PID_FD_FD && rvp->v_type != VREG) + error = EACCES; + else { + /* + * Need to hold rvp since VOP_OPEN() may release it. 
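+		 * (VOP_OPEN() may also substitute a different vnode --
+		 * a device clone, for instance -- which is why *vpp is
+		 * replaced with rvp on success and the lxproc vnode
+		 * itself released.)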
+ */ + VN_HOLD(rvp); + error = VOP_OPEN(&rvp, flag, cr, ct); + if (error) { + VN_RELE(rvp); + } else { + *vpp = rvp; + VN_RELE(vp); + } + } + } + + return (error); +} + + +/* + * lxpr_close(): Vnode operation for VOP_CLOSE() + */ +/* ARGSUSED */ +static int +lxpr_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpr = VTOLXP(vp); + lxpr_nodetype_t type = lxpr->lxpr_type; + + /* + * we should never get here because the close is done on the realvp + * for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR && + type != LXPR_PID_EXE); + + return (0); +} + +static void (*lxpr_read_function[LXPR_NFILES])() = { + lxpr_read_isdir, /* /proc */ + lxpr_read_isdir, /* /proc/<pid> */ + lxpr_read_pid_cmdline, /* /proc/<pid>/cmdline */ + lxpr_read_empty, /* /proc/<pid>/cpu */ + lxpr_read_invalid, /* /proc/<pid>/cwd */ + lxpr_read_empty, /* /proc/<pid>/environ */ + lxpr_read_invalid, /* /proc/<pid>/exe */ + lxpr_read_pid_maps, /* /proc/<pid>/maps */ + lxpr_read_empty, /* /proc/<pid>/mem */ + lxpr_read_invalid, /* /proc/<pid>/root */ + lxpr_read_pid_stat, /* /proc/<pid>/stat */ + lxpr_read_pid_statm, /* /proc/<pid>/statm */ + lxpr_read_pid_status, /* /proc/<pid>/status */ + lxpr_read_isdir, /* /proc/<pid>/fd */ + lxpr_read_fd, /* /proc/<pid>/fd/nn */ + lxpr_read_empty, /* /proc/cmdline */ + lxpr_read_cpuinfo, /* /proc/cpuinfo */ + lxpr_read_empty, /* /proc/devices */ + lxpr_read_empty, /* /proc/dma */ + lxpr_read_empty, /* /proc/filesystems */ + lxpr_read_empty, /* /proc/interrupts */ + lxpr_read_empty, /* /proc/ioports */ + lxpr_read_empty, /* /proc/kcore */ + lxpr_read_invalid, /* /proc/kmsg -- see lxpr_read() */ + lxpr_read_loadavg, /* /proc/loadavg */ + lxpr_read_meminfo, /* /proc/meminfo */ + lxpr_read_mounts, /* /proc/mounts */ + lxpr_read_isdir, /* /proc/net */ + lxpr_read_net_arp, /* /proc/net/arp */ + lxpr_read_net_dev, /* /proc/net/dev */ + lxpr_read_net_dev_mcast, /* /proc/net/dev_mcast */ + lxpr_read_net_igmp, /* /proc/net/igmp */ + lxpr_read_net_ip_mr_cache, /* /proc/net/ip_mr_cache */ + lxpr_read_net_ip_mr_vif, /* /proc/net/ip_mr_vif */ + lxpr_read_net_mcfilter, /* /proc/net/mcfilter */ + lxpr_read_net_netstat, /* /proc/net/netstat */ + lxpr_read_net_raw, /* /proc/net/raw */ + lxpr_read_net_route, /* /proc/net/route */ + lxpr_read_net_rpc, /* /proc/net/rpc */ + lxpr_read_net_rt_cache, /* /proc/net/rt_cache */ + lxpr_read_net_sockstat, /* /proc/net/sockstat */ + lxpr_read_net_snmp, /* /proc/net/snmp */ + lxpr_read_net_stat, /* /proc/net/stat */ + lxpr_read_net_tcp, /* /proc/net/tcp */ + lxpr_read_net_udp, /* /proc/net/udp */ + lxpr_read_net_unix, /* /proc/net/unix */ + lxpr_read_partitions, /* /proc/partitions */ + lxpr_read_invalid, /* /proc/self */ + lxpr_read_stat, /* /proc/stat */ + lxpr_read_uptime, /* /proc/uptime */ + lxpr_read_version, /* /proc/version */ +}; + +/* + * Array of lookup functions, indexed by /lxproc file type. 
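+ *
+ * lxpr_lookup() dispatches through this table using the type of the
+ * directory being searched; for example, looking up "tcp" in a node of
+ * type LXPR_NETDIR is handled by lxpr_lookup_netdir(), while a lookup
+ * in any non-directory node fails via lxpr_lookup_not_a_dir().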
+ */ +static vnode_t *(*lxpr_lookup_function[LXPR_NFILES])() = { + lxpr_lookup_procdir, /* /proc */ + lxpr_lookup_piddir, /* /proc/<pid> */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/environ */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/exe */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/maps */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/mem */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/root */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/stat */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/statm */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/status */ + lxpr_lookup_fddir, /* /proc/<pid>/fd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_lookup_not_a_dir, /* /proc/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/cpuinfo */ + lxpr_lookup_not_a_dir, /* /proc/devices */ + lxpr_lookup_not_a_dir, /* /proc/dma */ + lxpr_lookup_not_a_dir, /* /proc/filesystems */ + lxpr_lookup_not_a_dir, /* /proc/interrupts */ + lxpr_lookup_not_a_dir, /* /proc/ioports */ + lxpr_lookup_not_a_dir, /* /proc/kcore */ + lxpr_lookup_not_a_dir, /* /proc/kmsg */ + lxpr_lookup_not_a_dir, /* /proc/loadavg */ + lxpr_lookup_not_a_dir, /* /proc/meminfo */ + lxpr_lookup_not_a_dir, /* /proc/mounts */ + lxpr_lookup_netdir, /* /proc/net */ + lxpr_lookup_not_a_dir, /* /proc/net/arp */ + lxpr_lookup_not_a_dir, /* /proc/net/dev */ + lxpr_lookup_not_a_dir, /* /proc/net/dev_mcast */ + lxpr_lookup_not_a_dir, /* /proc/net/igmp */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_lookup_not_a_dir, /* /proc/net/mcfilter */ + lxpr_lookup_not_a_dir, /* /proc/net/netstat */ + lxpr_lookup_not_a_dir, /* /proc/net/raw */ + lxpr_lookup_not_a_dir, /* /proc/net/route */ + lxpr_lookup_not_a_dir, /* /proc/net/rpc */ + lxpr_lookup_not_a_dir, /* /proc/net/rt_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/sockstat */ + lxpr_lookup_not_a_dir, /* /proc/net/snmp */ + lxpr_lookup_not_a_dir, /* /proc/net/stat */ + lxpr_lookup_not_a_dir, /* /proc/net/tcp */ + lxpr_lookup_not_a_dir, /* /proc/net/udp */ + lxpr_lookup_not_a_dir, /* /proc/net/unix */ + lxpr_lookup_not_a_dir, /* /proc/partitions */ + lxpr_lookup_not_a_dir, /* /proc/self */ + lxpr_lookup_not_a_dir, /* /proc/stat */ + lxpr_lookup_not_a_dir, /* /proc/uptime */ + lxpr_lookup_not_a_dir, /* /proc/version */ +}; + +/* + * Array of readdir functions, indexed by /proc file type. 
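+ *
+ * As with the lookup table above, lxpr_readdir() selects the handler by
+ * node type; a getdents() on /proc/<pid>/fd ends up in
+ * lxpr_readdir_fddir(), and one on a regular file returns ENOTDIR from
+ * lxpr_readdir_not_a_dir().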
+ */ +static int (*lxpr_readdir_function[LXPR_NFILES])() = { + lxpr_readdir_procdir, /* /proc */ + lxpr_readdir_piddir, /* /proc/<pid> */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/environ */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/exe */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/maps */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/mem */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/root */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/stat */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/statm */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/status */ + lxpr_readdir_fddir, /* /proc/<pid>/fd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_readdir_not_a_dir, /* /proc/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/cpuinfo */ + lxpr_readdir_not_a_dir, /* /proc/devices */ + lxpr_readdir_not_a_dir, /* /proc/dma */ + lxpr_readdir_not_a_dir, /* /proc/filesystems */ + lxpr_readdir_not_a_dir, /* /proc/interrupts */ + lxpr_readdir_not_a_dir, /* /proc/ioports */ + lxpr_readdir_not_a_dir, /* /proc/kcore */ + lxpr_readdir_not_a_dir, /* /proc/kmsg */ + lxpr_readdir_not_a_dir, /* /proc/loadavg */ + lxpr_readdir_not_a_dir, /* /proc/meminfo */ + lxpr_readdir_not_a_dir, /* /proc/mounts */ + lxpr_readdir_netdir, /* /proc/net */ + lxpr_readdir_not_a_dir, /* /proc/net/arp */ + lxpr_readdir_not_a_dir, /* /proc/net/dev */ + lxpr_readdir_not_a_dir, /* /proc/net/dev_mcast */ + lxpr_readdir_not_a_dir, /* /proc/net/igmp */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_readdir_not_a_dir, /* /proc/net/mcfilter */ + lxpr_readdir_not_a_dir, /* /proc/net/netstat */ + lxpr_readdir_not_a_dir, /* /proc/net/raw */ + lxpr_readdir_not_a_dir, /* /proc/net/route */ + lxpr_readdir_not_a_dir, /* /proc/net/rpc */ + lxpr_readdir_not_a_dir, /* /proc/net/rt_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/sockstat */ + lxpr_readdir_not_a_dir, /* /proc/net/snmp */ + lxpr_readdir_not_a_dir, /* /proc/net/stat */ + lxpr_readdir_not_a_dir, /* /proc/net/tcp */ + lxpr_readdir_not_a_dir, /* /proc/net/udp */ + lxpr_readdir_not_a_dir, /* /proc/net/unix */ + lxpr_readdir_not_a_dir, /* /proc/partitions */ + lxpr_readdir_not_a_dir, /* /proc/self */ + lxpr_readdir_not_a_dir, /* /proc/stat */ + lxpr_readdir_not_a_dir, /* /proc/uptime */ + lxpr_readdir_not_a_dir, /* /proc/version */ +}; + + +/* + * lxpr_read(): Vnode operation for VOP_READ() + * + * As the format of all the files that can be read in lxproc is human readable + * and not binary structures there do not have to be different read variants + * depending on whether the reading process model is 32- or 64-bit. + */ +/* ARGSUSED */ +static int +lxpr_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + lxpr_uiobuf_t *uiobuf = lxpr_uiobuf_new(uiop); + int error; + + ASSERT(type < LXPR_NFILES); + + if (type == LXPR_KMSG) { + ldi_ident_t li = VTOLXPM(vp)->lxprm_li; + ldi_handle_t ldih; + struct strioctl str; + int rv; + + /* + * Open the zone's console device using the layered driver + * interface. + */ + if ((error = + ldi_open_by_name("/dev/log", FREAD, cr, &ldih, li)) != 0) + return (error); + + /* + * Send an ioctl to the underlying console device, letting it + * know we're interested in getting console messages. 
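+ * The I_STR ioctl below simply wraps an I_CONSLOG request in a
+ * strioctl; once it succeeds, each ldi_getmsg() call made from
+ * lxpr_read_kmsg() yields one console message for this zone.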
+ */ + str.ic_cmd = I_CONSLOG; + str.ic_timout = 0; + str.ic_len = 0; + str.ic_dp = NULL; + if ((error = ldi_ioctl(ldih, I_STR, + (intptr_t)&str, FKIOCTL, cr, &rv)) != 0) + return (error); + + lxpr_read_kmsg(lxpnp, uiobuf, ldih); + + if ((error = ldi_close(ldih, FREAD, cr)) != 0) + return (error); + } else { + lxpr_read_function[type](lxpnp, uiobuf); + } + + error = lxpr_uiobuf_flush(uiobuf); + lxpr_uiobuf_free(uiobuf); + + return (error); +} + +/* + * lxpr_read_invalid(), lxpr_read_isdir(), lxpr_read_empty() + * + * Various special case reads: + * - trying to read a directory + * - invalid file (used to mean a file that should be implemented, + * but isn't yet) + * - empty file + * - wait to be able to read a file that will never have anything to read + */ +/* ARGSUSED */ +static void +lxpr_read_isdir(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EISDIR); +} + +/* ARGSUSED */ +static void +lxpr_read_invalid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EINVAL); +} + +/* ARGSUSED */ +static void +lxpr_read_empty(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* + * lxpr_read_pid_cmdline(): + * + * This is not precisely compatible with Linux: the Linux cmdline returns argv + * with the correct separation using \0 between the arguments, but we cannot do + * that without copying the real argv from the correct process context. This + * is too difficult to attempt so we pretend that the entire cmdline is just + * argv[0]. This is good enough for ps and htop to display correctly, but might + * cause some other things not to work correctly. + */ +static void +lxpr_read_pid_cmdline(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + char *buf; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_CMDLINE); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + buf = PTOU(p)->u_argv != 0 ? 
PTOU(p)->u_psargs : PTOU(p)->u_comm; + + lxpr_uiobuf_write(uiobuf, buf, strlen(buf) + 1); + lxpr_unlock(p); +} + +/* + * lxpr_read_pid_maps(): memory map file + */ +static void +lxpr_read_pid_maps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + struct seg *seg; + char *buf; + int buflen = MAXPATHLEN; + struct print_data { + caddr_t saddr; + caddr_t eaddr; + int type; + char prot[5]; + uint32_t offset; + vnode_t *vp; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + struct print_data *pbuf; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_MAPS); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + as = p->p_as; + + if (as == &kas) { + lxpr_unlock(p); + return; + } + + mutex_exit(&p->p_lock); + + /* Iterate over all segments in the address space */ + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + vnode_t *vp; + uint_t protbits; + + pbuf = kmem_alloc(sizeof (*pbuf), KM_SLEEP); + + pbuf->saddr = seg->s_base; + pbuf->eaddr = seg->s_base+seg->s_size; + pbuf->type = SEGOP_GETTYPE(seg, seg->s_base); + + /* + * Cheat and only use the protection bits of the first page + * in the segment + */ + (void) strncpy(pbuf->prot, "----", sizeof (pbuf->prot)); + (void) SEGOP_GETPROT(seg, seg->s_base, 0, &protbits); + + if (protbits & PROT_READ) pbuf->prot[0] = 'r'; + if (protbits & PROT_WRITE) pbuf->prot[1] = 'w'; + if (protbits & PROT_EXEC) pbuf->prot[2] = 'x'; + if (pbuf->type & MAP_SHARED) pbuf->prot[3] = 's'; + else if (pbuf->type & MAP_PRIVATE) pbuf->prot[3] = 'p'; + + if (seg->s_ops == &segvn_ops && + SEGOP_GETVP(seg, seg->s_base, &vp) == 0 && + vp != NULL && vp->v_type == VREG) { + VN_HOLD(vp); + pbuf->vp = vp; + } else { + pbuf->vp = NULL; + } + + pbuf->offset = (uint32_t)SEGOP_GETOFFSET(seg, pbuf->saddr); + + pbuf->next = NULL; + *print_tail = pbuf; + print_tail = &pbuf->next; + } + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + buf = kmem_alloc(buflen, KM_SLEEP); + + /* print the data we've extracted */ + pbuf = print_head; + while (pbuf != NULL) { + struct print_data *pbuf_next; + vattr_t vattr; + + int maj = 0; + int min = 0; + u_longlong_t inode = 0; + + *buf = '\0'; + if (pbuf->vp != NULL) { + vattr.va_mask = AT_FSID | AT_NODEID; + if (VOP_GETATTR(pbuf->vp, &vattr, 0, CRED(), + NULL) == 0) { + maj = getmajor(vattr.va_fsid); + min = getminor(vattr.va_fsid); + inode = vattr.va_nodeid; + } + (void) vnodetopath(NULL, pbuf->vp, buf, buflen, CRED()); + VN_RELE(pbuf->vp); + } + + if (*buf != '\0') { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02d:%03d %lld %s\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode, buf); + } else { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02d:%03d %lld\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode); + } + + pbuf_next = pbuf->next; + kmem_free(pbuf, sizeof (*pbuf)); + pbuf = pbuf_next; + } + + kmem_free(buf, buflen); +} + +/* + * lxpr_read_pid_statm(): memory status file + */ +static void +lxpr_read_pid_statm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + size_t vsize; + size_t rss; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATM); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + as = p->p_as; + + mutex_exit(&p->p_lock); + + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + vsize = 
btopr(as->a_resvsize); + rss = rm_asrss(as); + AS_LOCK_EXIT(as, &as->a_lock); + + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, + "%lu %lu %lu %lu %lu %lu %lu\n", + vsize, rss, 0l, rss, 0l, 0l, 0l); +} + +/* + * lxpr_read_pid_status(): status file + */ +static void +lxpr_read_pid_status(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + user_t *up; + cred_t *cr; + const gid_t *groups; + int ngroups; + struct as *as; + char *status; + pid_t pid, ppid; + size_t vsize; + size_t rss; + k_sigset_t current, ignore, handle; + int i, lx_sig; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATUS); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + pid = p->p_pid; + + /* + * Convert pid to the Linux default of 1 if we're the zone's init + * process + */ + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; + ppid = 0; /* parent pid for init is 0 */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) + ? curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + } + + t = prchoose(p); + if (t != NULL) { + switch (t->t_state) { + case TS_SLEEP: + status = "S (sleeping)"; + break; + case TS_RUN: + case TS_ONPROC: + status = "R (running)"; + break; + case TS_ZOMB: + status = "Z (zombie)"; + break; + case TS_STOPPED: + status = "T (stopped)"; + break; + default: + status = "! (unknown)"; + break; + } + thread_unlock(t); + } else { + /* + * there is a hole in the exit code, where a proc can have + * no threads but it is yet to be flagged SZOMB. 
We will + * assume we are about to become a zombie + */ + status = "Z (zombie)"; + } + + up = PTOU(p); + mutex_enter(&p->p_crlock); + crhold(cr = p->p_cred); + mutex_exit(&p->p_crlock); + + lxpr_uiobuf_printf(uiobuf, + "Name:\t%s\n" + "State:\t%s\n" + "Tgid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" + "TracerPid:\t%d\n" + "Uid:\t%u\t%u\t%u\t%u\n" + "Gid:\t%u\t%u\t%u\t%u\n" + "FDSize:\t%d\n" + "Groups:\t", + up->u_comm, + status, + pid, /* thread group id - same as pid */ + pid, + ppid, + 0, + crgetruid(cr), crgetuid(cr), crgetsuid(cr), crgetuid(cr), + crgetrgid(cr), crgetgid(cr), crgetsgid(cr), crgetgid(cr), + p->p_fno_ctl); + + ngroups = crgetngroups(cr); + groups = crgetgroups(cr); + for (i = 0; i < ngroups; i++) { + lxpr_uiobuf_printf(uiobuf, + "%u ", + groups[i]); + } + crfree(cr); + + as = p->p_as; + if ((p->p_stat != SZOMB) && !(p->p_flag & SSYS) && (as != &kas)) { + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + + lxpr_uiobuf_printf(uiobuf, + "\n" + "VmSize:\t%8lu kB\n" + "VmLck:\t%8lu kB\n" + "VmRSS:\t%8lu kB\n" + "VmData:\t%8lu kB\n" + "VmStk:\t%8lu kB\n" + "VmExe:\t%8lu kB\n" + "VmLib:\t%8lu kB", + btok(vsize), + 0l, + ptok(rss), + 0l, + btok(p->p_stksize), + ptok(rss), + 0l); + } + + sigemptyset(¤t); + sigemptyset(&ignore); + sigemptyset(&handle); + + for (i = 1; i < NSIG; i++) { + lx_sig = lxpr_sigmap[i]; + + if ((lx_sig > 0) && (lx_sig <= LX_NSIG)) { + if (sigismember(&p->p_sig, i)) + sigaddset(¤t, lx_sig); + + if (up->u_signal[i - 1] == SIG_IGN) + sigaddset(&ignore, lx_sig); + else if (up->u_signal[i - 1] != SIG_DFL) + sigaddset(&handle, lx_sig); + } + } + + lxpr_uiobuf_printf(uiobuf, + "\n" + "SigPnd:\t%08x%08x\n" + "SigBlk:\t%08x%08x\n" + "SigIgn:\t%08x%08x\n" + "SigCgt:\t%08x%08x\n" + "CapInh:\t%016x\n" + "CapPrm:\t%016x\n" + "CapEff:\t%016x\n", + current.__sigbits[1], current.__sigbits[0], + 0, 0, /* signals blocked on per thread basis */ + ignore.__sigbits[1], ignore.__sigbits[0], + handle.__sigbits[1], handle.__sigbits[0], + /* Can't do anything with linux capabilities */ + 0, + 0, + 0); + + lxpr_unlock(p); +} + + +/* + * lxpr_read_pid_stat(): pid stat file + */ +static void +lxpr_read_pid_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + struct as *as; + char stat; + pid_t pid, ppid, pgpid, spid; + gid_t psgid; + dev_t psdev; + size_t rss, vsize; + int nice, pri; + caddr_t wchan; + processorid_t cpu; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STAT); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + pid = p->p_pid; + + /* + * Set Linux defaults if we're the zone's init process + */ + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; /* PID for init */ + ppid = 0; /* parent PID for init is 0 */ + pgpid = 0; /* process group for init is 0 */ + psgid = (gid_t)-1; /* credential GID for init is -1 */ + spid = 0; /* session id for init is 0 */ + psdev = 0; /* session device for init is 0 */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) ? 
+ curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + + pgpid = p->p_pgrp; + + mutex_enter(&p->p_splock); + mutex_enter(&p->p_sessp->s_lock); + spid = p->p_sessp->s_sid; + psdev = p->p_sessp->s_dev; + if (p->p_sessp->s_cred) + psgid = crgetgid(p->p_sessp->s_cred); + else + psgid = crgetgid(p->p_cred); + + mutex_exit(&p->p_sessp->s_lock); + mutex_exit(&p->p_splock); + } + + t = prchoose(p); + if (t != NULL) { + switch (t->t_state) { + case TS_SLEEP: + stat = 'S'; break; + case TS_RUN: + case TS_ONPROC: + stat = 'R'; break; + case TS_ZOMB: + stat = 'Z'; break; + case TS_STOPPED: + stat = 'T'; break; + default: + stat = '!'; break; + } + + if (CL_DONICE(t, NULL, 0, &nice) != 0) + nice = 0; + + pri = t->t_pri; + wchan = t->t_wchan; + cpu = t->t_cpu->cpu_id; + thread_unlock(t); + } else { + /* Only zombies have no threads */ + stat = 'Z'; + nice = 0; + pri = 0; + wchan = 0; + cpu = 0; + } + as = p->p_as; + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + + lxpr_uiobuf_printf(uiobuf, + "%d (%s) %c %d %d %d %d %d " + "%lu %lu %lu %lu %lu " + "%lu %lu %ld %ld " + "%d %d %d " + "%lu " + "%lu " + "%lu %ld %llu " + "%lu %lu %u " + "%lu %lu " + "%lu %lu %lu %lu " + "%lu " + "%lu %lu " + "%d " + "%d" + "\n", + pid, PTOU(p)->u_comm, stat, ppid, pgpid, spid, psdev, psgid, + 0l, 0l, 0l, 0l, 0l, /* flags, minflt, cminflt, majflt, cmajflt */ + p->p_utime, p->p_stime, p->p_cutime, p->p_cstime, + pri, nice, p->p_lwpcnt, + 0l, /* itrealvalue (time before next SIGALRM) */ + PTOU(p)->u_ticks, + vsize, rss, p->p_vmem_ctl, + 0l, 0l, USRSTACK, /* startcode, endcode, startstack */ + 0l, 0l, /* kstkesp, kstkeip */ + 0l, 0l, 0l, 0l, /* signal, blocked, sigignore, sigcatch */ + wchan, + 0l, 0l, /* nswap, cnswap */ + 0, /* exit_signal */ + cpu); + + lxpr_unlock(p); +} + +/* ARGSUSED */ +static void +lxpr_read_net_arp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, "Inter-| Receive " + " | Transmit\n"); + lxpr_uiobuf_printf(uiobuf, " face |bytes packets errs drop fifo" + " frame compressed multicast|bytes packets errs drop fifo" + " colls carrier compressed\n"); + + /* + * Data about each interface should go here, but that shouldn't be added + * unless there is an lxproc reader that actually makes use of it (and + * doesn't need anything else that we refuse to provide)... 
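+ *
+ * If per-interface data is ever added, each interface would get one
+ * line matching the header above, roughly of the form (names and
+ * counts here are only illustrative):
+ *
+ *	lo0: 12345 67 0 0 0 0 0 0 12345 67 0 0 0 0 0 0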
+ */ +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev_mcast(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_igmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_vif(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_mcfilter(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_netstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_raw(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_rpc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_rt_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_sockstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_snmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_tcp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_udp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_unix(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* + * lxpr_read_kmsg(): read the contents of the kernel message queue. We + * translate this into the reception of console messages for this zone; each + * read copies out a single zone console message, or blocks until the next one + * is produced. + */ + +#define LX_KMSG_PRI "<0>" + +static void +lxpr_read_kmsg(lxpr_node_t *lxpnp, struct lxpr_uiobuf *uiobuf, ldi_handle_t lh) +{ + mblk_t *mp; + + ASSERT(lxpnp->lxpr_type == LXPR_KMSG); + + if (ldi_getmsg(lh, &mp, NULL) == 0) { + /* + * lxproc doesn't like successive reads to the same file + * descriptor unless we do an explicit rewind each time. + */ + lxpr_uiobuf_seek(uiobuf, 0); + + lxpr_uiobuf_printf(uiobuf, "%s%s", LX_KMSG_PRI, + mp->b_cont->b_rptr); + + freemsg(mp); + } +} + +/* + * lxpr_read_loadavg(): read the contents of the "loadavg" file. We do just + * enough for uptime and other simple lxproc readers to work + */ +extern int nthread; + +static void +lxpr_read_loadavg(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ulong_t avenrun1; + ulong_t avenrun5; + ulong_t avenrun15; + ulong_t avenrun1_cs; + ulong_t avenrun5_cs; + ulong_t avenrun15_cs; + int loadavg[3]; + int *loadbuf; + cpupart_t *cp; + zone_t *zone = LXPTOZ(lxpnp); + + uint_t nrunnable = 0; + rctl_qty_t nlwps; + + ASSERT(lxpnp->lxpr_type == LXPR_LOADAVG); + + mutex_enter(&cpu_lock); + + /* + * Need to add up values over all CPU partitions. If pools are active, + * only report the values of the zone's partition, which by definition + * includes the current CPU. 
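+ *
+ * The resulting line mimics the Linux format, e.g.
+ *
+ *	0.12 0.09 0.05 1/123 0
+ *
+ * i.e. the three averages, runnable/nlwps, and a final field (the most
+ * recently created pid on Linux) that we simply report as 0.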
+ */ + if (pool_pset_enabled()) { + psetid_t psetid = zone_pset_get(curproc->p_zone); + + ASSERT(curproc->p_zone != &zone0); + cp = CPU->cpu_part; + + nrunnable = cp->cp_nrunning + cp->cp_nrunnable; + (void) cpupart_get_loadavg(psetid, &loadavg[0], 3); + loadbuf = &loadavg[0]; + } else { + cp = cp_list_head; + do { + nrunnable += cp->cp_nrunning + cp->cp_nrunnable; + } while ((cp = cp->cp_next) != cp_list_head); + + loadbuf = zone == global_zone ? + &avenrun[0] : zone->zone_avenrun; + } + + /* + * If we're in the non-global zone, we'll report the total number of + * LWPs in the zone for the "nproc" parameter of /proc/loadavg, + * otherwise will just use nthread (which will include kernel threads, + * but should be good enough for lxproc). + */ + nlwps = zone == global_zone ? nthread : zone->zone_nlwps; + + mutex_exit(&cpu_lock); + + avenrun1 = loadbuf[0] >> FSHIFT; + avenrun1_cs = ((loadbuf[0] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun5 = loadbuf[1] >> FSHIFT; + avenrun5_cs = ((loadbuf[1] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun15 = loadbuf[2] >> FSHIFT; + avenrun15_cs = ((loadbuf[2] & (FSCALE-1)) * 100) >> FSHIFT; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d %ld.%02d %d/%d %d\n", + avenrun1, avenrun1_cs, + avenrun5, avenrun5_cs, + avenrun15, avenrun15_cs, + nrunnable, nlwps, 0); +} + +/* + * lxpr_read_meminfo(): read the contents of the "meminfo" file. + */ +static void +lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + int global = zone == global_zone; + long total_mem, free_mem, total_swap, used_swap; + + ASSERT(lxpnp->lxpr_type == LXPR_MEMINFO); + + if (global || zone->zone_phys_mem_ctl == UINT64_MAX) { + total_mem = physmem * PAGESIZE; + free_mem = freemem * PAGESIZE; + } else { + total_mem = zone->zone_phys_mem_ctl; + free_mem = zone->zone_phys_mem_ctl - zone->zone_phys_mem; + } + + if (global || zone->zone_max_swap_ctl == UINT64_MAX) { + total_swap = k_anoninfo.ani_max * PAGESIZE; + used_swap = k_anoninfo.ani_phys_resv * PAGESIZE; + } else { + mutex_enter(&zone->zone_mem_lock); + total_swap = zone->zone_max_swap_ctl; + used_swap = zone->zone_max_swap; + mutex_exit(&zone->zone_mem_lock); + } + + lxpr_uiobuf_printf(uiobuf, + " total: used: free: shared: buffers: cached:\n" + "Mem: %8lu %8lu %8lu %8u %8u %8u\n" + "Swap: %8lu %8lu %8lu\n" + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "MemShared: %8u kB\n" + "Buffers: %8u kB\n" + "Cached: %8u kB\n" + "SwapCached:%8u kB\n" + "Active: %8u kB\n" + "Inactive: %8u kB\n" + "HighTotal: %8u kB\n" + "HighFree: %8u kB\n" + "LowTotal: %8u kB\n" + "LowFree: %8u kB\n" + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n", + total_mem, total_mem - free_mem, free_mem, 0, 0, 0, + total_swap, used_swap, total_swap - used_swap, + btok(total_mem), /* MemTotal */ + btok(free_mem), /* MemFree */ + 0, /* MemShared */ + 0, /* Buffers */ + 0, /* Cached */ + 0, /* SwapCached */ + 0, /* Active */ + 0, /* Inactive */ + 0, /* HighTotal */ + 0, /* HighFree */ + btok(total_mem), /* LowTotal */ + btok(free_mem), /* LowFree */ + btok(total_swap), /* SwapTotal */ + btok(total_swap - used_swap)); /* SwapFree */ +} + +/* + * lxpr_read_mounts(): + */ +/* ARGSUSED */ +static void +lxpr_read_mounts(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + struct vfs *vfsp; + struct vfs *vfslist; + zone_t *zone = LXPTOZ(lxpnp); + struct print_data { + refstr_t *vfs_mntpt; + refstr_t *vfs_resource; + uint_t vfs_flag; + int vfs_fstype; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + 
struct print_data *printp; + + vfs_list_read_lock(); + + if (zone == global_zone) { + vfsp = vfslist = rootvfs; + } else { + vfsp = vfslist = zone->zone_vfslist; + /* + * If the zone has a root entry, it will be the first in + * the list. If it doesn't, we conjure one up. + */ + if (vfslist == NULL || strcmp(refstr_value(vfsp->vfs_mntpt), + zone->zone_rootpath) != 0) { + struct vfs *tvfsp; + /* + * The root of the zone is not a mount point. The vfs + * we want to report is that of the zone's root vnode. + */ + tvfsp = zone->zone_rootvp->v_vfsp; + + lxpr_uiobuf_printf(uiobuf, + "/ / %s %s 0 0\n", + vfssw[tvfsp->vfs_fstype].vsw_name, + tvfsp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + + } + if (vfslist == NULL) { + vfs_list_unlock(); + return; + } + } + + /* + * Later on we have to do a lookupname, which can end up causing + * another vfs_list_read_lock() to be called. Which can lead to a + * deadlock. To avoid this, we extract the data we need into a local + * list, then we can run this list without holding vfs_list_read_lock() + * We keep the list in the same order as the vfs_list + */ + do { + /* Skip mounts we shouldn't show */ + if (vfsp->vfs_flag & VFS_NOMNTTAB) { + goto nextfs; + } + + printp = kmem_alloc(sizeof (*printp), KM_SLEEP); + refstr_hold(vfsp->vfs_mntpt); + printp->vfs_mntpt = vfsp->vfs_mntpt; + refstr_hold(vfsp->vfs_resource); + printp->vfs_resource = vfsp->vfs_resource; + printp->vfs_flag = vfsp->vfs_flag; + printp->vfs_fstype = vfsp->vfs_fstype; + printp->next = NULL; + + *print_tail = printp; + print_tail = &printp->next; + +nextfs: + vfsp = (zone == global_zone) ? + vfsp->vfs_next : vfsp->vfs_zone_next; + + } while (vfsp != vfslist); + + vfs_list_unlock(); + + /* + * now we can run through what we've extracted without holding + * vfs_list_read_lock() + */ + printp = print_head; + while (printp != NULL) { + struct print_data *printp_next; + const char *resource; + char *mntpt; + struct vnode *vp; + int error; + + mntpt = (char *)refstr_value(printp->vfs_mntpt); + resource = refstr_value(printp->vfs_resource); + + if (mntpt != NULL && mntpt[0] != '\0') + mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); + else + mntpt = "-"; + + error = lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + + if (error != 0) + goto nextp; + + if (!(vp->v_flag & VROOT)) { + VN_RELE(vp); + goto nextp; + } + VN_RELE(vp); + + if (resource != NULL && resource[0] != '\0') { + if (resource[0] == '/') { + resource = ZONE_PATH_VISIBLE(resource, zone) ? + ZONE_PATH_TRANSLATE(resource, zone) : + mntpt; + } + } else { + resource = "-"; + } + + lxpr_uiobuf_printf(uiobuf, + "%s %s %s %s 0 0\n", + resource, mntpt, vfssw[printp->vfs_fstype].vsw_name, + printp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + +nextp: + printp_next = printp->next; + refstr_rele(printp->vfs_mntpt); + refstr_rele(printp->vfs_resource); + kmem_free(printp, sizeof (*printp)); + printp = printp_next; + + } +} + +/* + * lxpr_read_partitions(): + * + * We don't support partitions in a local zone because it requires access to + * physical devices. But we need to fake up enough of the file to show that we + * have no partitions. + */ +/* ARGSUSED */ +static void +lxpr_read_partitions(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, + "major minor #blocks name rio rmerge rsect ruse " + "wio wmerge wsect wuse running use aveq\n\n"); +} + +/* + * lxpr_read_version(): read the contents of the "version" file. Note that + * we don't lie here -- we don't pretend that we're Linux. 
If lxproc is to + * be used in a Linux-branded zone, there will need to be a mount option to + * indicate that Linux should be more fully mimicked. + */ +/* ARGSUSED */ +static void +lxpr_read_version(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, + "%s version %s (%s version %d.%d.%d) " + "#%s SMP %s\n", + utsname.sysname, utsname.release, +#if defined(__GNUC__) + "gcc", + __GNUC__, + __GNUC_MINOR__, + __GNUC_PATCHLEVEL__, +#else + "Sun C", + __SUNPRO_C / 0x100, + (__SUNPRO_C & 0xff) / 0x10, + __SUNPRO_C & 0xf, +#endif + utsname.version, + "00:00:00 00/00/00"); +} + +/* + * lxpr_read_stat(): read the contents of the "stat" file. + * + */ +/* ARGSUSED */ +static void +lxpr_read_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t sys_cum = 0; + ulong_t user_cum = 0; + ulong_t irq_cum = 0; + ulong_t cpu_nrunnable_cum = 0; + ulong_t w_io_cum = 0; + + ulong_t pgpgin_cum = 0; + ulong_t pgpgout_cum = 0; + ulong_t pgswapout_cum = 0; + ulong_t pgswapin_cum = 0; + ulong_t intr_cum = 0; + ulong_t pswitch_cum = 0; + ulong_t forks_cum = 0; + hrtime_t msnsecs[NCMSTATES]; + + /* temporary variable since scalehrtime modifies data in place */ + hrtime_t tmptime; + + ASSERT(lxpnp->lxpr_type == LXPR_STAT); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + /* Calculate cumulative stats */ + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + int i; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. + */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_cum += NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_cum += NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_cum += NSEC_TO_TICK(msnsecs[CMS_USER]); + + pgpgin_cum += CPU_STATS(cp, vm.pgpgin); + pgpgout_cum += CPU_STATS(cp, vm.pgpgout); + pgswapin_cum += CPU_STATS(cp, vm.pgswapin); + pgswapout_cum += CPU_STATS(cp, vm.pgswapout); + + cpu_nrunnable_cum += cp->cpu_disp->disp_nrunnable; + w_io_cum += CPU_STATS(cp, sys.iowait); + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_cum += NSEC_TO_TICK(tmptime); + } + + for (i = 0; i < PIL_MAX; i++) + intr_cum += CPU_STATS(cp, sys.intr[i]); + + pswitch_cum += CPU_STATS(cp, sys.pswitch); + forks_cum += CPU_STATS(cp, sys.sysfork); + forks_cum += CPU_STATS(cp, sys.sysvfork); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + lxpr_uiobuf_printf(uiobuf, "cpu %lu %lu %lu %lu %lu %lu %lu\n", + user_cum, 0L, sys_cum, idle_cum, 0L, irq_cum, 0L); + + /* Do per processor stats */ + do { + int i; + + ulong_t idle_ticks; + ulong_t sys_ticks; + ulong_t user_ticks; + ulong_t irq_ticks = 0; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. 
+ */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_ticks = NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_ticks = NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_ticks = NSEC_TO_TICK(msnsecs[CMS_USER]); + + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_ticks += NSEC_TO_TICK(tmptime); + } + + lxpr_uiobuf_printf(uiobuf, + "cpu%d %lu %lu %lu %lu %lu %lu %lu\n", + cp->cpu_id, user_ticks, 0L, sys_ticks, idle_ticks, + 0L, irq_ticks, 0L); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); + + lxpr_uiobuf_printf(uiobuf, + "page %lu %lu\n" + "swap %lu %lu\n" + "intr %lu\n" + "ctxt %lu\n" + "btime %lu\n" + "processes %lu\n" + "procs_running %lu\n" + "procs_blocked %lu\n", + pgpgin_cum, pgpgout_cum, + pgswapin_cum, pgswapout_cum, + intr_cum, + pswitch_cum, + boot_time, + forks_cum, + cpu_nrunnable_cum, + w_io_cum); +} + +/* + * lxpr_read_uptime(): read the contents of the "uptime" file. + * + * format is: "%.2lf, %.2lf",uptime_secs, idle_secs + * Use fixed point arithmetic to get 2 decimal places + */ +/* ARGSUSED */ +static void +lxpr_read_uptime(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t cpu_count = 0; + ulong_t idle_s; + ulong_t idle_cs; + ulong_t up_s; + ulong_t up_cs; + hrtime_t birthtime; + hrtime_t centi_sec = 10000000; /* 10^7 */ + + ASSERT(lxpnp->lxpr_type == LXPR_UPTIME); + + /* Calculate cumulative stats */ + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU; + do { + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. + */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + idle_cum += CPU_STATS(cp, sys.cpu_ticks_idle); + idle_cum += CPU_STATS(cp, sys.cpu_ticks_wait); + cpu_count += 1; + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + mutex_exit(&cpu_lock); + + /* Getting the Zone zsched process startup time */ + birthtime = LXPTOZ(lxpnp)->zone_zsched->p_mstart; + up_cs = (gethrtime() - birthtime) / centi_sec; + up_s = up_cs / 100; + up_cs %= 100; + + ASSERT(cpu_count > 0); + idle_cum /= cpu_count; + idle_s = idle_cum / hz; + idle_cs = idle_cum % hz; + idle_cs *= 100; + idle_cs /= hz; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d\n", up_s, up_cs, idle_s, idle_cs); +} + +static const char *amd_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "mp", + "nx", NULL, "mmxext", NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", "3dnowext", "3dnow" +}; + +static const char *amd_x_ecx[] = { + "lahf_lm", NULL, "svm", NULL, + "altmovcr8" +}; + +static const char *tm_x_edx[] = { + "recovery", "longrun", NULL, "lrti" +}; + +/* + * Intel calls no-execute "xd" in its docs, but Linux still reports it as "nx." 
+ */ +static const char *intc_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + "nx", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", NULL, NULL +}; + +static const char *intc_edx[] = { + "fpu", "vme", "de", "pse", + "tsc", "msr", "pae", "mce", + "cx8", "apic", NULL, "sep", + "mtrr", "pge", "mca", "cmov", + "pat", "pse36", "pn", "clflush", + NULL, "dts", "acpi", "mmx", + "fxsr", "sse", "sse2", "ss", + "ht", "tm", "ia64", "pbe" +}; + +/* + * "sse3" on linux is called "pni" (Prescott New Instructions). + */ +static const char *intc_ecx[] = { + "pni", NULL, NULL, "monitor", + "ds_cpl", NULL, NULL, "est", + "tm2", NULL, "cid", NULL, + NULL, "cx16", "xtpr" +}; + +static void +lxpr_read_cpuinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + int i; + uint32_t bits; + cpu_t *cp, *cpstart; + int pools_enabled; + const char **fp; + char brandstr[CPU_IDSTRLEN]; + struct cpuid_regs cpr; + int maxeax; + int std_ecx, std_edx, ext_ecx, ext_edx; + + ASSERT(lxpnp->lxpr_type == LXPR_CPUINFO); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU; + do { + /* + * This returns the maximum eax value for standard cpuid + * functions in eax. + */ + cpr.cp_eax = 0; + (void) cpuid_insn(cp, &cpr); + maxeax = cpr.cp_eax; + + /* + * Get standard x86 feature flags. + */ + cpr.cp_eax = 1; + (void) cpuid_insn(cp, &cpr); + std_ecx = cpr.cp_ecx; + std_edx = cpr.cp_edx; + + /* + * Now get extended feature flags. + */ + cpr.cp_eax = 0x80000001; + (void) cpuid_insn(cp, &cpr); + ext_ecx = cpr.cp_ecx; + ext_edx = cpr.cp_edx; + + (void) cpuid_getbrandstr(cp, brandstr, CPU_IDSTRLEN); + + lxpr_uiobuf_printf(uiobuf, + "processor\t: %d\n" + "vendor_id\t: %s\n" + "cpu family\t: %d\n" + "model\t\t: %d\n" + "model name\t: %s\n" + "stepping\t: %d\n" + "cpu MHz\t\t: %u.%03u\n", + cp->cpu_id, cpuid_getvendorstr(cp), cpuid_getfamily(cp), + cpuid_getmodel(cp), brandstr, cpuid_getstep(cp), + (uint32_t)(cpu_freq_hz / 1000000), + ((uint32_t)(cpu_freq_hz / 1000)) % 1000); + + lxpr_uiobuf_printf(uiobuf, "cache size\t: %u KB\n", + getl2cacheinfo(cp, NULL, NULL, NULL) / 1024); + + if (is_x86_feature(x86_featureset, X86FSET_HTT)) { + /* + * 'siblings' is used for HT-style threads + */ + lxpr_uiobuf_printf(uiobuf, + "physical id\t: %lu\n" + "siblings\t: %u\n", + pg_plat_hw_instance_id(cp, PGHW_CHIP), + cpuid_get_ncpu_per_chip(cp)); + } + + /* + * Since we're relatively picky about running on older hardware, + * we can be somewhat cavalier about the answers to these ones. + * + * In fact, given the hardware we support, we just say: + * + * fdiv_bug : no (if we're on a 64-bit kernel) + * hlt_bug : no + * f00f_bug : no + * coma_bug : no + * wp : yes (write protect in supervsr mode) + */ + lxpr_uiobuf_printf(uiobuf, + "fdiv_bug\t: %s\n" + "hlt_bug \t: no\n" + "f00f_bug\t: no\n" + "coma_bug\t: no\n" + "fpu\t\t: %s\n" + "fpu_exception\t: %s\n" + "cpuid level\t: %d\n" + "flags\t\t:", +#if defined(__i386) + fpu_pentium_fdivbug ? "yes" : "no", +#else + "no", +#endif /* __i386 */ + fpu_exists ? "yes" : "no", fpu_exists ? 
"yes" : "no", + maxeax); + + for (bits = std_edx, fp = intc_edx, i = 0; + i < sizeof (intc_edx) / sizeof (intc_edx[0]); fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + /* + * name additional features where appropriate + */ + switch (x86_vendor) { + case X86_VENDOR_Intel: + for (bits = ext_edx, fp = intc_x_edx, i = 0; + i < sizeof (intc_x_edx) / sizeof (intc_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + + case X86_VENDOR_AMD: + for (bits = ext_edx, fp = amd_x_edx, i = 0; + i < sizeof (amd_x_edx) / sizeof (amd_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + for (bits = ext_ecx, fp = amd_x_ecx, i = 0; + i < sizeof (amd_x_ecx) / sizeof (amd_x_ecx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + + case X86_VENDOR_TM: + for (bits = ext_edx, fp = tm_x_edx, i = 0; + i < sizeof (tm_x_edx) / sizeof (tm_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + default: + break; + } + + for (bits = std_ecx, fp = intc_ecx, i = 0; + i < sizeof (intc_ecx) / sizeof (intc_ecx[0]); fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + lxpr_uiobuf_printf(uiobuf, "\n\n"); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); +} + +/* ARGSUSED */ +static void +lxpr_read_fd(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_PID_FD_FD); + lxpr_uiobuf_seterr(uiobuf, EFAULT); +} + +/* + * lxpr_getattr(): Vnode operation for VOP_GETATTR() + */ +static int +lxpr_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + register lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + extern uint_t nproc; + int error; + + /* + * Return attributes of underlying vnode if ATTR_REAL + * + * but keep fd files with the symlink permissions + */ + if (lxpnp->lxpr_realvp != NULL && (flags & ATTR_REAL)) { + vnode_t *rvp = lxpnp->lxpr_realvp; + + /* + * withold attribute information to owner or root + */ + if ((error = VOP_ACCESS(rvp, 0, 0, cr, ct)) != 0) { + return (error); + } + + /* + * now its attributes + */ + if ((error = VOP_GETATTR(rvp, vap, flags, cr, ct)) != 0) { + return (error); + } + + /* + * if it's a file in lx /proc/pid/fd/xx then set its + * mode and keep it looking like a symlink + */ + if (type == LXPR_PID_FD_FD) { + vap->va_mode = lxpnp->lxpr_mode; + vap->va_type = vp->v_type; + vap->va_size = 0; + vap->va_nlink = 1; + } + return (0); + } + + /* Default attributes, that may be overridden below */ + bzero(vap, sizeof (*vap)); + vap->va_atime = vap->va_mtime = vap->va_ctime = lxpnp->lxpr_time; + vap->va_nlink = 1; + vap->va_type = vp->v_type; + vap->va_mode = lxpnp->lxpr_mode; + vap->va_fsid = vp->v_vfsp->vfs_dev; + vap->va_blksize = DEV_BSIZE; + vap->va_uid = lxpnp->lxpr_uid; + vap->va_gid = lxpnp->lxpr_gid; + vap->va_nodeid = lxpnp->lxpr_ino; + + switch (type) { + case LXPR_PROCDIR: + vap->va_nlink = nproc + 2 + PROCDIRFILES; + vap->va_size = (nproc + 2 + PROCDIRFILES) * LXPR_SDSIZE; + break; + case LXPR_PIDDIR: + vap->va_nlink = PIDDIRFILES; + vap->va_size = PIDDIRFILES * LXPR_SDSIZE; + break; + case LXPR_SELF: + vap->va_uid = crgetruid(curproc->p_cred); + vap->va_gid = crgetrgid(curproc->p_cred); + break; + default: + break; + } + 
+ vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size); + return (0); +} + +/* + * lxpr_access(): Vnode operation for VOP_ACCESS() + */ +static int +lxpr_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + int shift = 0; + proc_t *tp; + + /* lx /proc is a read only file system */ + if (mode & VWRITE) + return (EROFS); + + /* + * If this is a restricted file, check access permissions. + */ + switch (lxpnp->lxpr_type) { + case LXPR_PIDDIR: + return (0); + case LXPR_PID_CURDIR: + case LXPR_PID_ENV: + case LXPR_PID_EXE: + case LXPR_PID_MAPS: + case LXPR_PID_MEM: + case LXPR_PID_ROOTDIR: + case LXPR_PID_FDDIR: + case LXPR_PID_FD_FD: + if ((tp = lxpr_lock(lxpnp->lxpr_pid)) == NULL) + return (ENOENT); + if (tp != curproc && secpolicy_proc_access(cr) != 0 && + priv_proc_cred_perm(cr, tp, NULL, mode) != 0) { + lxpr_unlock(tp); + return (EACCES); + } + lxpr_unlock(tp); + default: + break; + } + + if (lxpnp->lxpr_realvp != NULL) { + /* + * For these we use the underlying vnode's accessibility. + */ + return (VOP_ACCESS(lxpnp->lxpr_realvp, mode, flags, cr, ct)); + } + + /* If user is root allow access regardless of permission bits */ + if (secpolicy_proc_access(cr) == 0) + return (0); + + /* + * Access check is based on only one of owner, group, public. If not + * owner, then check group. If not a member of the group, then check + * public access. + */ + if (crgetuid(cr) != lxpnp->lxpr_uid) { + shift += 3; + if (!groupmember((uid_t)lxpnp->lxpr_gid, cr)) + shift += 3; + } + + mode &= ~(lxpnp->lxpr_mode << shift); + + if (mode == 0) + return (0); + + return (EACCES); +} + +/* ARGSUSED */ +static vnode_t * +lxpr_lookup_not_a_dir(vnode_t *dp, char *comp) +{ + return (NULL); +} + +/* + * lxpr_lookup(): Vnode operation for VOP_LOOKUP() + */ +/* ARGSUSED */ +static int +lxpr_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type < LXPR_NFILES); + + /* + * we should never get here because the lookup + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict lookup permission to owner or root + */ + if ((error = lxpr_access(dp, VEXEC, 0, cr, ct)) != 0) { + return (error); + } + + /* + * Just return the parent vnode if that's where we are trying to go. + */ + if (strcmp(comp, "..") == 0) { + VN_HOLD(lxpnp->lxpr_parent); + *vpp = lxpnp->lxpr_parent; + return (0); + } + + /* + * Special handling for directory searches. Note: null component name + * denotes that the current directory is being searched. + */ + if ((dp->v_type == VDIR) && (*comp == '\0' || strcmp(comp, ".") == 0)) { + VN_HOLD(dp); + *vpp = dp; + return (0); + } + + *vpp = (lxpr_lookup_function[type](dp, comp)); + return ((*vpp == NULL) ? 
ENOENT : 0); +} + +/* + * Do a sequential search on the given directory table + */ +static vnode_t * +lxpr_lookup_common(vnode_t *dp, char *comp, proc_t *p, + lxpr_dirent_t *dirtab, int dirtablen) +{ + lxpr_node_t *lxpnp; + int count; + + for (count = 0; count < dirtablen; count++) { + if (strcmp(dirtab[count].d_name, comp) == 0) { + lxpnp = lxpr_getnode(dp, dirtab[count].d_type, p, 0); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + return (dp); + } + } + return (NULL); +} + +static vnode_t * +lxpr_lookup_piddir(vnode_t *dp, char *comp) +{ + proc_t *p; + + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PIDDIR); + + p = lxpr_lock(VTOLXP(dp)->lxpr_pid); + if (p == NULL) + return (NULL); + + dp = lxpr_lookup_common(dp, comp, p, piddir, PIDDIRFILES); + + lxpr_unlock(p); + + return (dp); +} + +/* + * Lookup one of the process's open files. + */ +static vnode_t * +lxpr_lookup_fddir(vnode_t *dp, char *comp) +{ + lxpr_node_t *dlxpnp = VTOLXP(dp); + lxpr_node_t *lxpnp; + vnode_t *vp = NULL; + proc_t *p; + file_t *fp; + uint_t fd; + int c; + uf_entry_t *ufp; + uf_info_t *fip; + + ASSERT(dlxpnp->lxpr_type == LXPR_PID_FDDIR); + + /* + * convert the string rendition of the filename + * to a file descriptor + */ + fd = 0; + while ((c = *comp++) != '\0') { + int ofd; + if (c < '0' || c > '9') + return (NULL); + + ofd = fd; + fd = 10*fd + c - '0'; + /* integer overflow */ + if (fd / 10 != ofd) + return (NULL); + } + + /* + * get the proc to work with and lock it + */ + p = lxpr_lock(dlxpnp->lxpr_pid); + if ((p == NULL)) + return (NULL); + + /* + * If the process is a zombie or system process + * it can't have any open files. + */ + if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas)) { + lxpr_unlock(p); + return (NULL); + } + + /* + * get us a fresh node/vnode + */ + lxpnp = lxpr_getnode(dp, LXPR_PID_FD_FD, p, fd); + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we dereference into fi_list. + */ + mutex_exit(&p->p_lock); + + /* + * get open file info + */ + fip = (&(p)->p_user.u_finfo); + mutex_enter(&fip->fi_lock); + + if (fd < fip->fi_nfiles) { + UF_ENTER(ufp, fip, fd); + /* + * ensure the fd is still kosher. + * it may have gone between the readdir and + * the lookup + */ + if (fip->fi_list[fd].uf_file == NULL) { + mutex_exit(&fip->fi_lock); + UF_EXIT(ufp); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + lxpr_freenode(lxpnp); + return (NULL); + } + + if ((fp = ufp->uf_file) != NULL) + vp = fp->f_vnode; + UF_EXIT(ufp); + } + mutex_exit(&fip->fi_lock); + + if (vp == NULL) { + mutex_enter(&p->p_lock); + lxpr_unlock(p); + lxpr_freenode(lxpnp); + return (NULL); + } else { + /* + * Fill in the lxpr_node so future references will be able to + * find the underlying vnode. The vnode is held on the realvp. + */ + lxpnp->lxpr_realvp = vp; + VN_HOLD(lxpnp->lxpr_realvp); + } + + mutex_enter(&p->p_lock); + lxpr_unlock(p); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + + return (dp); +} + +static vnode_t * +lxpr_lookup_netdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_NETDIR); + + dp = lxpr_lookup_common(dp, comp, NULL, netdir, NETDIRFILES); + + return (dp); +} + +static vnode_t * +lxpr_lookup_procdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PROCDIR); + + /* + * We know all the names of files & dirs in our file system structure + * except those that are pid names. These change as pids are created/ + * deleted etc., so we just look for a number as the first char to see + * if we are we doing pid lookups. 
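+ * For example, a lookup of "1234" is parsed into pid 1234 below and,
+ * provided that process exists and is visible to the caller, satisfied
+ * with a freshly allocated LXPR_PIDDIR node.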
+ * + * Don't need to check for "self" as it is implemented as a symlink + */ + if (*comp >= '0' && *comp <= '9') { + pid_t pid = 0; + lxpr_node_t *lxpnp = NULL; + proc_t *p; + int c; + + while ((c = *comp++) != '\0') + pid = 10 * pid + c - '0'; + + /* + * Can't continue if the process is still loading or it doesn't + * really exist yet (or maybe it just died!) + */ + p = lxpr_lock(pid); + if (p == NULL) + return (NULL); + + if (secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + lxpr_unlock(p); + return (NULL); + } + + /* + * allocate and fill in a new lxpr node + */ + lxpnp = lxpr_getnode(dp, LXPR_PIDDIR, p, 0); + + lxpr_unlock(p); + + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + + return (dp); + } + + /* Lookup fixed names */ + return (lxpr_lookup_common(dp, comp, NULL, lxpr_dir, PROCDIRFILES)); +} + +/* + * lxpr_readdir(): Vnode operation for VOP_READDIR() + */ +/* ARGSUSED */ +static int +lxpr_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + ssize_t uresid; + off_t uoffset; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type < LXPR_NFILES); + + /* + * we should never get here because the readdir + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict readdir permission to owner or root + */ + if ((error = lxpr_access(dp, VREAD, 0, cr, ct)) != 0) + return (error); + + uoffset = uiop->uio_offset; + uresid = uiop->uio_resid; + + /* can't do negative reads */ + if (uoffset < 0 || uresid <= 0) + return (EINVAL); + + /* can't read directory entries that don't exist! */ + if (uoffset % LXPR_SDSIZE) + return (ENOENT); + + return (lxpr_readdir_function[lxpnp->lxpr_type](lxpnp, uiop, eofp)); +} + +/* ARGSUSED */ +static int +lxpr_readdir_not_a_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + return (ENOTDIR); +} + +/* + * This has the common logic for returning directory entries + */ +static int +lxpr_readdir_common(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp, + lxpr_dirent_t *dirtab, int dirtablen) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + + oresid = uiop->uio_resid; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Satisfy user request + */ + while ((uresid = uiop->uio_resid) > 0) { + int dirindex; + off_t uoffset; + int reclen; + int error; + + uoffset = uiop->uio_offset; + dirindex = (uoffset / LXPR_SDSIZE) - 2; + + if (uoffset == 0) { + + dirent->d_ino = lxpnp->lxpr_ino; + dirent->d_name[0] = '.'; + dirent->d_name[1] = '\0'; + reclen = DIRENT64_RECLEN(1); + + } else if (uoffset == LXPR_SDSIZE) { + + dirent->d_ino = lxpr_parentinode(lxpnp); + dirent->d_name[0] = '.'; + dirent->d_name[1] = '.'; + dirent->d_name[2] = '\0'; + reclen = DIRENT64_RECLEN(2); + + } else if (dirindex >= 0 && dirindex < dirtablen) { + int slen = strlen(dirtab[dirindex].d_name); + + dirent->d_ino = lxpr_inode(dirtab[dirindex].d_type, + lxpnp->lxpr_pid, 0); + + VERIFY(slen < LXPNSIZ); + (void) strcpy(dirent->d_name, dirtab[dirindex].d_name); + reclen = DIRENT64_RECLEN(slen); + + } else { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the 
size of the data to transfer is greater + * that that requested then we can't do it this transfer. + */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) { + return (EINVAL); + } + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + return (error); + + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + /* Have run out of space, but could have just done last table entry */ + if (eofp) { + *eofp = + (uiop->uio_offset >= ((dirtablen+2) * LXPR_SDSIZE)) ? 1 : 0; + } + return (0); +} + + +static int +lxpr_readdir_procdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + zoneid_t zoneid; + pid_t pid; + int error; + int ceof; + + ASSERT(lxpnp->lxpr_type == LXPR_PROCDIR); + + oresid = uiop->uio_resid; + zoneid = LXPTOZ(lxpnp)->zone_id; + + /* + * We return directory entries in the order: "." and ".." then the + * unique lxproc files, then the directories corresponding to the + * running processes. We have defined this as the ordering because + * it allows us to more easily keep track of where we are betwen calls + * to getdents(). If the number of processes changes between calls + * then we can't lose track of where we are in the lxproc files. + */ + + /* Do the fixed entries */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, lxpr_dir, + PROCDIRFILES); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + return (error); + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* Do the process entries */ + while ((uresid = uiop->uio_resid) > 0) { + proc_t *p; + int len; + int reclen; + int i; + + uoffset = uiop->uio_offset; + + /* + * Stop when entire proc table has been examined. + */ + i = (uoffset / LXPR_SDSIZE) - 2 - PROCDIRFILES; + if (i < 0 || i >= v.v_proc) { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + mutex_enter(&pidlock); + + /* + * Skip indices for which there is no pid_entry, PIDs for + * which there is no corresponding process, a PID of 0, + * and anything the security policy doesn't allow + * us to look at. + */ + if ((p = pid_entry(i)) == NULL || p->p_stat == SIDL || + p->p_pid == 0 || + secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + mutex_exit(&pidlock); + goto next; + } + mutex_exit(&pidlock); + + /* + * Convert pid to the Linux default of 1 if we're the zone's + * init process, otherwise use the value from the proc + * structure + */ + pid = ((p->p_pid != curproc->p_zone->zone_proc_initpid) ? + p->p_pid : 1); + + /* + * If this /proc was mounted in the global zone, view + * all procs; otherwise, only view zone member procs. 
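+ * Processes that fail this test are silently skipped rather than
+ * terminating the readdir.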
+ */ + if (zoneid != GLOBAL_ZONEID && p->p_zone->zone_id != zoneid) { + goto next; + } + + ASSERT(p->p_stat != 0); + + dirent->d_ino = lxpr_inode(LXPR_PIDDIR, pid, 0); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", pid); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the size of the data to transfer is greater + * that that requested then we can't do it this transfer. + */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + return (EINVAL); + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, in the increment of this for + * the loop, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + return (error); +next: + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + if (eofp != NULL) { + *eofp = (uiop->uio_offset >= + ((v.v_proc + PROCDIRFILES + 2) * LXPR_SDSIZE)) ? 1 : 0; + } + + return (0); +} + +static int +lxpr_readdir_piddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + proc_t *p; + + ASSERT(lxpnp->lxpr_type == LXPR_PIDDIR); + + /* can't read its contents if it died */ + mutex_enter(&pidlock); + + p = prfind((lxpnp->lxpr_pid == 1) ? + curproc->p_zone->zone_proc_initpid : lxpnp->lxpr_pid); + + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (ENOENT); + } + mutex_exit(&pidlock); + + return (lxpr_readdir_common(lxpnp, uiop, eofp, piddir, PIDDIRFILES)); +} + +static int +lxpr_readdir_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_NETDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, netdir, NETDIRFILES)); +} + +static int +lxpr_readdir_fddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + int error; + int ceof; + proc_t *p; + int fddirsize = -1; + uf_info_t *fip; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_FDDIR); + + oresid = uiop->uio_resid; + + /* can't read its contents if it died */ + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) + return (ENOENT); + + if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas)) + fddirsize = 0; + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we iterate over its fi_list. + */ + mutex_exit(&p->p_lock); + + /* Get open file info */ + fip = (&(p)->p_user.u_finfo); + mutex_enter(&fip->fi_lock); + + if (fddirsize == -1) + fddirsize = fip->fi_nfiles; + + /* Do the fixed entries (in this case just "." & "..") */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, 0, 0); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + goto out; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Loop until user's request is satisfied or until + * all file descriptors have been examined. 
+ */ + for (; (uresid = uiop->uio_resid) > 0; + uiop->uio_offset = uoffset + LXPR_SDSIZE) { + int reclen; + int fd; + int len; + + uoffset = uiop->uio_offset; + + /* + * Stop at the end of the fd list + */ + fd = (uoffset / LXPR_SDSIZE) - 2; + if (fd < 0 || fd >= fddirsize) { + if (eofp) { + *eofp = 1; + } + goto out; + } + + if (fip->fi_list[fd].uf_file == NULL) + continue; + + dirent->d_ino = lxpr_inode(LXPR_PID_FD_FD, lxpnp->lxpr_pid, fd); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", fd); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + error = EINVAL; + goto out; + } + + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + goto out; + } + + if (eofp != NULL) { + *eofp = + (uiop->uio_offset >= ((fddirsize+2) * LXPR_SDSIZE)) ? 1 : 0; + } + +out: + mutex_exit(&fip->fi_lock); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + return (error); +} + + +/* + * lxpr_readlink(): Vnode operation for VOP_READLINK() + */ +/* ARGSUSED */ +static int +lxpr_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct) +{ + char bp[MAXPATHLEN + 1]; + size_t buflen = sizeof (bp); + lxpr_node_t *lxpnp = VTOLXP(vp); + vnode_t *rvp = lxpnp->lxpr_realvp; + pid_t pid; + int error = 0; + + /* must be a symbolic link file */ + if (vp->v_type != VLNK) + return (EINVAL); + + /* Try to produce a symlink name for anything that has a realvp */ + if (rvp != NULL) { + if ((error = lxpr_access(vp, VREAD, 0, CRED(), ct)) != 0) + return (error); + if ((error = vnodetopath(NULL, rvp, bp, buflen, CRED())) != 0) + return (error); + } else { + switch (lxpnp->lxpr_type) { + case LXPR_SELF: + /* + * Convert pid to the Linux default of 1 if we're the + * zone's init process + */ + pid = ((curproc->p_pid != + curproc->p_zone->zone_proc_initpid) + ? curproc->p_pid : 1); + + /* + * Don't need to check result as every possible int + * will fit within MAXPATHLEN bytes. + */ + (void) snprintf(bp, buflen, "%d", pid); + break; + case LXPR_PID_CURDIR: + case LXPR_PID_ROOTDIR: + case LXPR_PID_EXE: + return (EACCES); + default: + /* + * Need to return error so that nothing thinks + * that the symlink is empty and hence "." + */ + return (EINVAL); + } + } + + /* copy the link data to user space */ + return (uiomove(bp, strlen(bp), UIO_READ, uiop)); +} + +/* + * lxpr_inactive(): Vnode operation for VOP_INACTIVE() + * Vnode is no longer referenced, deallocate the file + * and all its resources. 
+ */ +/* ARGSUSED */ +static void +lxpr_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + lxpr_freenode(VTOLXP(vp)); +} + +/* + * lxpr_sync(): Vnode operation for VOP_SYNC() + */ +static int +lxpr_sync() +{ + /* + * Nothing to sync but this function must never fail + */ + return (0); +} + +/* + * lxpr_cmp(): Vnode operation for VOP_CMP() + */ +static int +lxpr_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) +{ + vnode_t *rvp; + + while (vn_matchops(vp1, lxpr_vnodeops) && + (rvp = VTOLXP(vp1)->lxpr_realvp) != NULL) { + vp1 = rvp; + } + + while (vn_matchops(vp2, lxpr_vnodeops) && + (rvp = VTOLXP(vp2)->lxpr_realvp) != NULL) { + vp2 = rvp; + } + + if (vn_matchops(vp1, lxpr_vnodeops) || vn_matchops(vp2, lxpr_vnodeops)) + return (vp1 == vp2); + + return (VOP_CMP(vp1, vp2, ct)); +} + +/* + * lxpr_realvp(): Vnode operation for VOP_REALVP() + */ +static int +lxpr_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) +{ + vnode_t *rvp; + + if ((rvp = VTOLXP(vp)->lxpr_realvp) != NULL) { + vp = rvp; + if (VOP_REALVP(vp, &rvp, ct) == 0) + vp = rvp; + } + + *vpp = vp; + return (0); +} diff --git a/usr/src/uts/common/fs/lxproc/lxproc.h b/usr/src/uts/common/fs/lxproc/lxproc.h new file mode 100644 index 0000000000..e3718372e0 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxproc.h @@ -0,0 +1,277 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2015 Joyent, Inc. All rights reserved. 
+ */ + +#ifdef _LXPROC_BRANDED_H +#error Attempted to include native lxproc.h after branded lx_proc.h +#endif + +#ifndef _LXPROC_H +#define _LXPROC_H +#define _LXPROC_NATIVE_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lxproc.h: declarations, data structures and macros for lxprocfs + */ +#include <sys/types.h> +#include <sys/param.h> +#include <sys/policy.h> +#include <sys/debug.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/user.h> +#include <sys/t_lock.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/priv.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/statvfs.h> +#include <sys/cmn_err.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/utsname.h> +#include <sys/dnlc.h> +#include <sys/atomic.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <vm/as.h> +#include <vm/anon.h> + +#define LX_SIGHUP 1 +#define LX_SIGINT 2 +#define LX_SIGQUIT 3 +#define LX_SIGILL 4 +#define LX_SIGTRAP 5 +#define LX_SIGABRT 6 +#define LX_SIGIOT 6 +#define LX_SIGBUS 7 +#define LX_SIGFPE 8 +#define LX_SIGKILL 9 +#define LX_SIGUSR1 10 +#define LX_SIGSEGV 11 +#define LX_SIGUSR2 12 +#define LX_SIGPIPE 13 +#define LX_SIGALRM 14 +#define LX_SIGTERM 15 +#define LX_SIGSTKFLT 16 +#define LX_SIGCHLD 17 +#define LX_SIGCONT 18 +#define LX_SIGSTOP 19 +#define LX_SIGTSTP 20 +#define LX_SIGTTIN 21 +#define LX_SIGTTOU 22 +#define LX_SIGURG 23 +#define LX_SIGXCPU 24 +#define LX_SIGXFSZ 25 +#define LX_SIGVTALRM 26 +#define LX_SIGPROF 27 +#define LX_SIGWINCH 28 +#define LX_SIGIO 29 +#define LX_SIGPOLL LX_SIGIO +#define LX_SIGPWR 30 +#define LX_SIGSYS 31 +#define LX_SIGUNUSED 31 + +#define LX_NSIG 64 /* Linux _NSIG */ + +#define LX_SIGRTMIN 32 +#define LX_SIGRTMAX LX_NSIG + +/* + * Convert a vnode into an lxpr_mnt_t + */ +#define VTOLXPM(vp) ((lxpr_mnt_t *)(vp)->v_vfsp->vfs_data) + +/* + * convert a vnode into an lxpr_node + */ +#define VTOLXP(vp) ((lxpr_node_t *)(vp)->v_data) + +/* + * convert a lxprnode into a vnode + */ +#define LXPTOV(lxpnp) ((lxpnp)->lxpr_vnode) + +/* + * convert a lxpr_node into zone for fs + */ +#define LXPTOZ(lxpnp) \ + (((lxpr_mnt_t *)(lxpnp)->lxpr_vnode->v_vfsp->vfs_data)->lxprm_zone) + +#define LXPNSIZ 256 /* max size of lx /proc file name entries */ + +/* + * Pretend that a directory entry takes 16 bytes + */ +#define LXPR_SDSIZE 16 + +/* + * Node/file types for lx /proc files + * (directories and files contained therein). 
+ */ +typedef enum lxpr_nodetype { + LXPR_PROCDIR, /* /proc */ + LXPR_PIDDIR, /* /proc/<pid> */ + LXPR_PID_CMDLINE, /* /proc/<pid>/cmdline */ + LXPR_PID_CPU, /* /proc/<pid>/cpu */ + LXPR_PID_CURDIR, /* /proc/<pid>/cwd */ + LXPR_PID_ENV, /* /proc/<pid>/environ */ + LXPR_PID_EXE, /* /proc/<pid>/exe */ + LXPR_PID_MAPS, /* /proc/<pid>/maps */ + LXPR_PID_MEM, /* /proc/<pid>/mem */ + LXPR_PID_ROOTDIR, /* /proc/<pid>/root */ + LXPR_PID_STAT, /* /proc/<pid>/stat */ + LXPR_PID_STATM, /* /proc/<pid>/statm */ + LXPR_PID_STATUS, /* /proc/<pid>/status */ + LXPR_PID_FDDIR, /* /proc/<pid>/fd */ + LXPR_PID_FD_FD, /* /proc/<pid>/fd/nn */ + LXPR_CMDLINE, /* /proc/cmdline */ + LXPR_CPUINFO, /* /proc/cpuinfo */ + LXPR_DEVICES, /* /proc/devices */ + LXPR_DMA, /* /proc/dma */ + LXPR_FILESYSTEMS, /* /proc/filesystems */ + LXPR_INTERRUPTS, /* /proc/interrupts */ + LXPR_IOPORTS, /* /proc/ioports */ + LXPR_KCORE, /* /proc/kcore */ + LXPR_KMSG, /* /proc/kmsg */ + LXPR_LOADAVG, /* /proc/loadavg */ + LXPR_MEMINFO, /* /proc/meminfo */ + LXPR_MOUNTS, /* /proc/mounts */ + LXPR_NETDIR, /* /proc/net */ + LXPR_NET_ARP, /* /proc/net/arp */ + LXPR_NET_DEV, /* /proc/net/dev */ + LXPR_NET_DEV_MCAST, /* /proc/net/dev_mcast */ + LXPR_NET_IGMP, /* /proc/net/igmp */ + LXPR_NET_IP_MR_CACHE, /* /proc/net/ip_mr_cache */ + LXPR_NET_IP_MR_VIF, /* /proc/net/ip_mr_vif */ + LXPR_NET_MCFILTER, /* /proc/net/mcfilter */ + LXPR_NET_NETSTAT, /* /proc/net/netstat */ + LXPR_NET_RAW, /* /proc/net/raw */ + LXPR_NET_ROUTE, /* /proc/net/route */ + LXPR_NET_RPC, /* /proc/net/rpc */ + LXPR_NET_RT_CACHE, /* /proc/net/rt_cache */ + LXPR_NET_SOCKSTAT, /* /proc/net/sockstat */ + LXPR_NET_SNMP, /* /proc/net/snmp */ + LXPR_NET_STAT, /* /proc/net/stat */ + LXPR_NET_TCP, /* /proc/net/tcp */ + LXPR_NET_UDP, /* /proc/net/udp */ + LXPR_NET_UNIX, /* /proc/net/unix */ + LXPR_PARTITIONS, /* /proc/partitions */ + LXPR_SELF, /* /proc/self */ + LXPR_STAT, /* /proc/stat */ + LXPR_UPTIME, /* /proc/uptime */ + LXPR_VERSION, /* /proc/version */ + LXPR_NFILES /* number of lx /proc file types */ +} lxpr_nodetype_t; + +/* + * Number of fds allowed for in the inode number calculation + * per process (if a process has more fds then inode numbers + * may be duplicated) + */ +#define LXPR_FD_PERPROC 2000 + +/* + * external dirent characteristics + */ +#define LXPRMAXNAMELEN 14 +typedef struct { + lxpr_nodetype_t d_type; + char d_name[LXPRMAXNAMELEN]; +} lxpr_dirent_t; + +/* + * This is the lxprocfs private data object + * which is attached to v_data in the vnode structure + */ +typedef struct lxpr_node { + lxpr_nodetype_t lxpr_type; /* type of this node */ + vnode_t *lxpr_vnode; /* vnode for the node */ + vnode_t *lxpr_parent; /* parent directory */ + vnode_t *lxpr_realvp; /* real vnode, file in dirs */ + timestruc_t lxpr_time; /* creation etc time for file */ + mode_t lxpr_mode; /* file mode bits */ + uid_t lxpr_uid; /* file owner */ + gid_t lxpr_gid; /* file group owner */ + pid_t lxpr_pid; /* pid of proc referred to */ + ino_t lxpr_ino; /* node id */ +} lxpr_node_t; + +struct zone; /* forward declaration */ + +/* + * This is the lxprocfs private data object + * which is attached to vfs_data in the vfs structure + */ +typedef struct lxpr_mnt { + lxpr_node_t *lxprm_node; /* node at root of proc mount */ + struct zone *lxprm_zone; /* zone for this mount */ + ldi_ident_t lxprm_li; /* ident for ldi */ +} lxpr_mnt_t; + +extern vnodeops_t *lxpr_vnodeops; +extern int nproc_highbit; /* highbit(v.v_nproc) */ + +typedef struct mounta mounta_t; + +extern void lxpr_initnodecache(); +extern 
void lxpr_fininodecache(); +extern void lxpr_initrootnode(lxpr_node_t **, vfs_t *); +extern ino_t lxpr_inode(lxpr_nodetype_t, pid_t, int); +extern ino_t lxpr_parentinode(lxpr_node_t *); +extern lxpr_node_t *lxpr_getnode(vnode_t *, lxpr_nodetype_t, proc_t *, int); +extern void lxpr_freenode(lxpr_node_t *); + +typedef struct lxpr_uiobuf lxpr_uiobuf_t; +extern lxpr_uiobuf_t *lxpr_uiobuf_new(uio_t *); +extern void lxpr_uiobuf_free(lxpr_uiobuf_t *); +extern int lxpr_uiobuf_flush(lxpr_uiobuf_t *); +extern void lxpr_uiobuf_seek(lxpr_uiobuf_t *, offset_t); +extern void lxpr_uiobuf_write(lxpr_uiobuf_t *, const char *, size_t); +extern void lxpr_uiobuf_printf(lxpr_uiobuf_t *, const char *, ...); +extern void lxpr_uiobuf_seterr(lxpr_uiobuf_t *, int); + +proc_t *lxpr_lock(pid_t); +void lxpr_unlock(proc_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LXPROC_H */ diff --git a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c index 207a708771..2176dcb9de 100644 --- a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c +++ b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, Joyent, Inc. All rights reserved. */ /* diff --git a/usr/src/uts/common/fs/nfs/nfs3_vnops.c b/usr/src/uts/common/fs/nfs/nfs3_vnops.c index b7354c168a..cd1e08d5d7 100644 --- a/usr/src/uts/common/fs/nfs/nfs3_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs3_vnops.c @@ -29,7 +29,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -3353,10 +3353,9 @@ nfs3rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, if (nvp) vnevent_rename_dest(nvp, ndvp, nnm, ct); - if (odvp != ndvp) - vnevent_rename_dest_dir(ndvp, ct); ASSERT(ovp != NULL); vnevent_rename_src(ovp, odvp, onm, ct); + vnevent_rename_dest_dir(ndvp, ovp, nnm, ct); } if (nvp) { diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c b/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c index ce0c9485a6..3ee41939ac 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c +++ b/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c @@ -22,6 +22,7 @@ /* * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. */ #include <sys/systm.h> @@ -178,12 +179,12 @@ pseudo_exportfs(vnode_t *vp, fid_t *fid, struct exp_visible *vis_head, kex = &exi->exi_export; kex->ex_flags = EX_PSEUDO; - vpathlen = vp->v_path ? strlen(vp->v_path) : 0; + vpathlen = strlen(vp->v_path); kex->ex_pathlen = vpathlen + strlen(PSEUDOFS_SUFFIX); kex->ex_path = kmem_alloc(kex->ex_pathlen + 1, KM_SLEEP); if (vpathlen) - (void) strcpy(kex->ex_path, vp->v_path); + (void) strncpy(kex->ex_path, vp->v_path, vpathlen); (void) strcpy(kex->ex_path + vpathlen, PSEUDOFS_SUFFIX); /* Transfer the secinfo data from exdata to this new pseudo node */ diff --git a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c index 151cb62403..55f6c95289 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c +++ b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c @@ -22,6 +22,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, Joyent, Inc. All rights reserved. 
*/ /* diff --git a/usr/src/uts/common/fs/nfs/nfs4_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_vnops.c index d6bf384a8b..2b3fdfdd55 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c @@ -34,7 +34,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ #include <sys/param.h> @@ -8061,8 +8061,9 @@ link_call: * vnode if it already existed. */ if (error == 0) { - vnode_t *tvp; + vnode_t *tvp, *tovp; rnode4_t *trp; + /* * Notify the vnode. Each links is represented by * a different vnode, in nfsv4. @@ -8075,23 +8076,20 @@ link_call: vnevent_rename_dest(tvp, ndvp, nnm, ct); } - /* - * if the source and destination directory are not the - * same notify the destination directory. - */ - if (VTOR4(odvp) != VTOR4(ndvp)) { - trp = VTOR4(ndvp); - tvp = ndvp; - if (IS_SHADOW(ndvp, trp)) - tvp = RTOV4(trp); - vnevent_rename_dest_dir(tvp, ct); - } - trp = VTOR4(ovp); - tvp = ovp; + tovp = ovp; if (IS_SHADOW(ovp, trp)) + tovp = RTOV4(trp); + + vnevent_rename_src(tovp, odvp, onm, ct); + + trp = VTOR4(ndvp); + tvp = ndvp; + + if (IS_SHADOW(ndvp, trp)) tvp = RTOV4(trp); - vnevent_rename_src(tvp, odvp, onm, ct); + + vnevent_rename_dest_dir(tvp, tovp, nnm, ct); } if (nvp) { diff --git a/usr/src/uts/common/fs/nfs/nfs_auth.c b/usr/src/uts/common/fs/nfs/nfs_auth.c index 3410340581..a8bcfdf438 100644 --- a/usr/src/uts/common/fs/nfs/nfs_auth.c +++ b/usr/src/uts/common/fs/nfs/nfs_auth.c @@ -22,6 +22,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 Joyent, Inc. All rights reserved. */ #include <sys/param.h> @@ -559,11 +560,16 @@ retry: *access = res.ares.auth_perm; *srv_uid = res.ares.auth_srv_uid; *srv_gid = res.ares.auth_srv_gid; - *srv_gids_cnt = res.ares.auth_srv_gids.len; - *srv_gids = kmem_alloc(*srv_gids_cnt * sizeof (gid_t), - KM_SLEEP); - bcopy(res.ares.auth_srv_gids.val, *srv_gids, - *srv_gids_cnt * sizeof (gid_t)); + + if ((*srv_gids_cnt = res.ares.auth_srv_gids.len) != 0) { + *srv_gids = kmem_alloc(*srv_gids_cnt * + sizeof (gid_t), KM_SLEEP); + bcopy(res.ares.auth_srv_gids.val, *srv_gids, + *srv_gids_cnt * sizeof (gid_t)); + } else { + *srv_gids = NULL; + } + break; case NFSAUTH_DR_EFAIL: @@ -1114,9 +1120,13 @@ wait: if (gid != NULL) *gid = p->auth_srv_gid; if (ngids != NULL && gids != NULL) { - *ngids = p->auth_srv_ngids; - *gids = kmem_alloc(*ngids * sizeof (gid_t), KM_SLEEP); - bcopy(p->auth_srv_gids, *gids, *ngids * sizeof (gid_t)); + if ((*ngids = p->auth_srv_ngids) != 0) { + size_t sz = *ngids * sizeof (gid_t); + *gids = kmem_alloc(sz, KM_SLEEP); + bcopy(p->auth_srv_gids, *gids, sz); + } else { + *gids = NULL; + } } access = p->auth_access; diff --git a/usr/src/uts/common/fs/nfs/nfs_server.c b/usr/src/uts/common/fs/nfs/nfs_server.c index 7e94c62734..dcc64def59 100644 --- a/usr/src/uts/common/fs/nfs/nfs_server.c +++ b/usr/src/uts/common/fs/nfs/nfs_server.c @@ -23,6 +23,7 @@ * Copyright (c) 2011 Bayard G. Bell. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. 
*/ /* @@ -2570,6 +2571,9 @@ nfs_srvinit(void) { int error; + if (getzoneid() != GLOBAL_ZONEID) + return (EACCES); + error = nfs_exportinit(); if (error != 0) return (error); @@ -3208,7 +3212,7 @@ nfs_getflabel(vnode_t *vp, struct exportinfo *exi) char *path; mutex_enter(&vp->v_lock); - if (vp->v_path != NULL) { + if (vp->v_path != vn_vpath_empty) { zone = zone_find_by_any_path(vp->v_path, B_FALSE); mutex_exit(&vp->v_lock); } else { diff --git a/usr/src/uts/common/fs/nfs/nfs_vfsops.c b/usr/src/uts/common/fs/nfs/nfs_vfsops.c index 57b21778b4..ffd5380a86 100644 --- a/usr/src/uts/common/fs/nfs/nfs_vfsops.c +++ b/usr/src/uts/common/fs/nfs/nfs_vfsops.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, Joyent, Inc. All rights reserved. * * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. * All rights reserved. diff --git a/usr/src/uts/common/fs/nfs/nfs_vnops.c b/usr/src/uts/common/fs/nfs/nfs_vnops.c index 1a1082bcb8..8c321c8aa3 100644 --- a/usr/src/uts/common/fs/nfs/nfs_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs_vnops.c @@ -26,7 +26,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -2688,11 +2688,9 @@ nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, if (nvp) vnevent_rename_dest(nvp, ndvp, nnm, ct); - if (odvp != ndvp) - vnevent_rename_dest_dir(ndvp, ct); - ASSERT(ovp != NULL); vnevent_rename_src(ovp, odvp, onm, ct); + vnevent_rename_dest_dir(ndvp, ovp, nnm, ct); } if (nvp) { diff --git a/usr/src/uts/common/fs/pcfs/pc_dir.c b/usr/src/uts/common/fs/pcfs/pc_dir.c index a9ee604b7c..21a0b1a4bd 100644 --- a/usr/src/uts/common/fs/pcfs/pc_dir.c +++ b/usr/src/uts/common/fs/pcfs/pc_dir.c @@ -24,6 +24,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + #include <sys/param.h> #include <sys/errno.h> #include <sys/systm.h> @@ -822,10 +826,10 @@ top: return (error); } } -out: - vnevent_rename_src(PCTOV(pcp), PCTOV(dp), snm, ctp); - if (dp != tdp) { - vnevent_rename_dest_dir(PCTOV(tdp), ctp); + + if (error == 0) { + vnevent_rename_src(PCTOV(pcp), PCTOV(dp), snm, ctp); + vnevent_rename_dest_dir(PCTOV(tdp), PCTOV(pcp), tnm, ctp); } VN_RELE(PCTOV(pcp)); diff --git a/usr/src/uts/common/fs/portfs/port.c b/usr/src/uts/common/fs/portfs/port.c index 14be8cbbae..11b7386269 100644 --- a/usr/src/uts/common/fs/portfs/port.c +++ b/usr/src/uts/common/fs/portfs/port.c @@ -24,7 +24,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2015 Joyent, Inc. All rights reserved. 
+ */ #include <sys/types.h> #include <sys/systm.h> @@ -1381,12 +1383,18 @@ portnowait: if (model == DATAMODEL_NATIVE) { eventsz = sizeof (port_event_t); - kevp = kmem_alloc(eventsz * nmax, KM_NOSLEEP); - if (kevp == NULL) { - if (nmax > pp->port_max_list) - nmax = pp->port_max_list; - kevp = kmem_alloc(eventsz * nmax, KM_SLEEP); + + if (nmax == 0) { + kevp = NULL; + } else { + kevp = kmem_alloc(eventsz * nmax, KM_NOSLEEP); + if (kevp == NULL) { + if (nmax > pp->port_max_list) + nmax = pp->port_max_list; + kevp = kmem_alloc(eventsz * nmax, KM_SLEEP); + } } + results = kevp; lev = NULL; /* start with first event in the queue */ for (nevents = 0; nevents < nmax; ) { @@ -1423,12 +1431,18 @@ portnowait: port_event32_t *kevp32; eventsz = sizeof (port_event32_t); - kevp32 = kmem_alloc(eventsz * nmax, KM_NOSLEEP); - if (kevp32 == NULL) { - if (nmax > pp->port_max_list) - nmax = pp->port_max_list; - kevp32 = kmem_alloc(eventsz * nmax, KM_SLEEP); + + if (nmax == 0) { + kevp32 = NULL; + } else { + kevp32 = kmem_alloc(eventsz * nmax, KM_NOSLEEP); + if (kevp32 == NULL) { + if (nmax > pp->port_max_list) + nmax = pp->port_max_list; + kevp32 = kmem_alloc(eventsz * nmax, KM_SLEEP); + } } + results = kevp32; lev = NULL; /* start with first event in the queue */ for (nevents = 0; nevents < nmax; ) { diff --git a/usr/src/uts/common/fs/portfs/port_vnops.c b/usr/src/uts/common/fs/portfs/port_vnops.c index b2f5088e06..ab95c0a1f8 100644 --- a/usr/src/uts/common/fs/portfs/port_vnops.c +++ b/usr/src/uts/common/fs/portfs/port_vnops.c @@ -24,6 +24,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + #include <sys/types.h> #include <sys/vnode.h> #include <sys/vfs_opreg.h> @@ -294,14 +298,10 @@ port_poll(vnode_t *vp, short events, int anyyet, short *reventsp, levents |= POLLOUT; levents &= events; *reventsp = levents; - if (levents == 0) { - if (!anyyet) { - *phpp = &pp->port_pollhd; - portq->portq_flags |= - events & POLLIN ? PORTQ_POLLIN : 0; - portq->portq_flags |= - events & POLLOUT ? PORTQ_POLLOUT : 0; - } + if ((levents == 0 && !anyyet) || (events & POLLET)) { + *phpp = &pp->port_pollhd; + portq->portq_flags |= events & POLLIN ? PORTQ_POLLIN : 0; + portq->portq_flags |= events & POLLOUT ? PORTQ_POLLOUT : 0; } mutex_exit(&portq->portq_mutex); return (0); diff --git a/usr/src/uts/common/fs/proc/prargv.c b/usr/src/uts/common/fs/proc/prargv.c new file mode 100644 index 0000000000..a4ad82d661 --- /dev/null +++ b/usr/src/uts/common/fs/proc/prargv.c @@ -0,0 +1,441 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015, Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/sunddi.h> +#include <sys/proc.h> +#include <sys/procfs.h> +#include <sys/sysmacros.h> +#include <vm/as.h> + +/* + * Safely read a contiguous region of memory from 'addr' in the address space + * of a particular process into the supplied kernel buffer (*buf, sz). + * Partially mapped regions will result in a partial read terminating at the + * first hole in the address space. The number of bytes actually read is + * returned to the caller via 'rdsz'. 
+ */ +static int +prreadbuf(proc_t *p, uintptr_t ustart, uint8_t *buf, size_t sz, size_t *rdsz) +{ + int error = 0; + size_t rem = sz; + off_t pos = 0; + + if (rdsz != NULL) + *rdsz = 0; + + while (rem != 0) { + uintptr_t addr = ustart + pos; + size_t len = MIN(rem, PAGESIZE - (addr & PAGEOFFSET)); + + if ((error = uread(p, buf + pos, len, addr)) != 0) { + if (error == ENXIO) { + /* + * ENXIO from uread() indicates that the page + * does not exist. This will simply be a + * partial read. + */ + error = 0; + } + break; + } + + rem -= len; + pos += len; + } + + if (rdsz != NULL) + *rdsz = pos; + + return (error); +} + +/* + * Attempt to read the argument vector (argv) from this process. The caller + * must hold the p_lock mutex, and have marked the process P_PR_LOCK (e.g. via + * prlock or lx_prlock). + * + * The caller must provide a buffer (buf, buflen). We will concatenate each + * argument string (including the NUL terminator) into this buffer. The number + * of characters written to this buffer (including the final NUL terminator) + * will be stored in 'slen'. + */ +int +prreadargv(proc_t *p, char *buf, size_t bufsz, size_t *slen) +{ + int error; + user_t *up; + struct as *as; + size_t pos = 0; + caddr_t *argv = NULL; + size_t argvsz = 0; + int i; + + VERIFY(MUTEX_HELD(&p->p_lock)); + VERIFY(p->p_proc_flag & P_PR_LOCK); + + up = PTOU(p); + as = p->p_as; + + if ((p->p_flag & SSYS) || as == &kas || up->u_argv == NULL) { + /* + * Return the regular psargs string to the caller. + */ + bcopy(up->u_psargs, buf, MIN(bufsz, sizeof (up->u_psargs))); + buf[bufsz - 1] = '\0'; + *slen = strlen(buf) + 1; + + return (0); + } + + /* + * Allocate space to store argv array. + */ + argvsz = up->u_argc * (p->p_model == DATAMODEL_ILP32 ? + sizeof (caddr32_t) : sizeof (caddr_t)); + argv = kmem_alloc(argvsz, KM_SLEEP); + + /* + * Extract the argv array from the target process. Drop p_lock + * while we do I/O to avoid deadlock with the clock thread. + */ + mutex_exit(&p->p_lock); + if ((error = prreadbuf(p, up->u_argv, (uint8_t *)argv, argvsz, + NULL)) != 0) { + kmem_free(argv, argvsz); + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (-1); + } + + /* + * Read each argument string from the pointers in the argv array. + */ + pos = 0; + for (i = 0; i < up->u_argc; i++) { + size_t rdsz, trysz; + uintptr_t arg; + off_t j; + boolean_t found_nul; + boolean_t do_retry = B_TRUE; + +#ifdef _SYSCALL32_IMPL + if (p->p_model == DATAMODEL_ILP32) { + arg = (uintptr_t)((caddr32_t *)argv)[i]; + } else { + arg = (uintptr_t)argv[i]; + } +#else + arg = (uintptr_t)argv[i]; +#endif + + /* + * Stop trying to read arguments if we reach a NULL + * pointer in the vector. + */ + if (arg == NULL) + break; + + /* + * Stop reading if we have read the maximum length + * we can return to the user. + */ + if (pos >= bufsz) + break; + + /* + * Initially we try a short read, on the assumption that + * most individual argument strings are less than 80 + * characters long. + */ + if ((trysz = MIN(80, bufsz - pos - 1)) < 80) { + /* + * We don't have room in the target buffer for even + * an entire short read, so there is no need to retry + * with a longer read. + */ + do_retry = B_FALSE; + } + +retry: + /* + * Read string data for this argument. Leave room + * in the buffer for a final NUL terminator. + */ + if ((error = prreadbuf(p, arg, (uint8_t *)&buf[pos], trysz, + &rdsz)) != 0) { + /* + * There was a problem reading this string + * from the process. Give up. + */ + break; + } + + /* + * Find the NUL terminator. 
+ */ + found_nul = B_FALSE; + for (j = 0; j < rdsz; j++) { + if (buf[pos + j] == '\0') { + found_nul = B_TRUE; + break; + } + } + + if (!found_nul && do_retry) { + /* + * We did not find a NUL terminator, but this + * was a first pass short read. Try once more + * with feeling. + */ + trysz = bufsz - pos - 1; + do_retry = B_FALSE; + goto retry; + } + + /* + * Commit the string we read to the buffer. + */ + pos += j + 1; + if (!found_nul && pos < bufsz) { + /* + * A NUL terminator was not found; add one. + */ + buf[pos++] = '\0'; + } + } + + /* + * Ensure the entire string is NUL-terminated. + */ + buf[bufsz - 1] = '\0'; + + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + kmem_free(argv, argvsz); + + /* + * If the operation was a success, return the copied string length + * to the caller. + */ + *slen = (error == 0) ? pos : 0; + + return (error); +} + +/* + * Similar to prreadargv except reads the env vector. This is slightly more + * complex because there is no count for the env vector that corresponds to + * u_argc. + */ +int +prreadenvv(proc_t *p, char *buf, size_t bufsz, size_t *slen) +{ + int error; + user_t *up; + struct as *as; + size_t pos = 0; + caddr_t *envp = NULL; + uintptr_t tmpp = NULL; + size_t envpsz = 0, rdsz = 0; + int i; + int cnt, bound; + + VERIFY(MUTEX_HELD(&p->p_lock)); + VERIFY(p->p_proc_flag & P_PR_LOCK); + + up = PTOU(p); + as = p->p_as; + + if ((p->p_flag & SSYS) || as == &kas || up->u_envp == NULL) { + /* + * Return empty string. + */ + buf[0] = '\0'; + *slen = 1; + + return (0); + } + + /* + * Drop p_lock while we do I/O to avoid deadlock with the clock thread. + */ + mutex_exit(&p->p_lock); + + /* + * We first have to count how many env entries we have. This is + * somewhat painful. We extract the env entries from the target process + * one entry at a time. Stop trying to read env entries if we reach a + * NULL pointer in the vector or hit our upper bound (which we take + * as the bufsz/4) to ensure we don't run off. + */ + rdsz = (p->p_model == DATAMODEL_ILP32 ? + sizeof (caddr32_t) : sizeof (caddr_t)); + bound = (int)(bufsz / 4); + for (cnt = 0, tmpp = up->u_envp; cnt < bound; cnt++, tmpp += rdsz) { + caddr_t tmp = NULL; + + if ((error = prreadbuf(p, tmpp, (uint8_t *)&tmp, rdsz, + NULL)) != 0) { + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (-1); + } + + if (tmp == NULL) + break; + } + if (cnt == 0) { + /* Return empty string. */ + buf[0] = '\0'; + *slen = 1; + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (0); + } + + /* + * Allocate space to store env array. + */ + envpsz = cnt * (p->p_model == DATAMODEL_ILP32 ? + sizeof (caddr32_t) : sizeof (caddr_t)); + envp = kmem_alloc(envpsz, KM_SLEEP); + + /* + * Extract the env array from the target process. + */ + if ((error = prreadbuf(p, up->u_envp, (uint8_t *)envp, envpsz, + NULL)) != 0) { + kmem_free(envp, envpsz); + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (-1); + } + + /* + * Read each env string from the pointers in the env array. + */ + pos = 0; + for (i = 0; i < cnt; i++) { + size_t rdsz, trysz; + uintptr_t ev; + off_t j; + boolean_t found_nul; + boolean_t do_retry = B_TRUE; + +#ifdef _SYSCALL32_IMPL + if (p->p_model == DATAMODEL_ILP32) { + ev = (uintptr_t)((caddr32_t *)envp)[i]; + } else { + ev = (uintptr_t)envp[i]; + } +#else + ev = (uintptr_t)envp[i]; +#endif + + /* + * Stop trying to read env entries if we reach a NULL + * pointer in the vector. 
+ */ + if (ev == NULL) + break; + + /* + * Stop reading if we have read the maximum length + * we can return to the user. + */ + if (pos >= bufsz) + break; + + /* + * Initially we try a short read, on the assumption that + * most individual env strings are less than 80 + * characters long. + */ + if ((trysz = MIN(80, bufsz - pos - 1)) < 80) { + /* + * We don't have room in the target buffer for even + * an entire short read, so there is no need to retry + * with a longer read. + */ + do_retry = B_FALSE; + } + +retry: + /* + * Read string data for this env var. Leave room + * in the buffer for a final NUL terminator. + */ + if ((error = prreadbuf(p, ev, (uint8_t *)&buf[pos], trysz, + &rdsz)) != 0) { + /* + * There was a problem reading this string + * from the process. Give up. + */ + break; + } + + /* + * Find the NUL terminator. + */ + found_nul = B_FALSE; + for (j = 0; j < rdsz; j++) { + if (buf[pos + j] == '\0') { + found_nul = B_TRUE; + break; + } + } + + if (!found_nul && do_retry) { + /* + * We did not find a NUL terminator, but this + * was a first pass short read. Try once more + * with feeling. + */ + trysz = bufsz - pos - 1; + do_retry = B_FALSE; + goto retry; + } + + /* + * Commit the string we read to the buffer. + */ + pos += j + 1; + if (!found_nul && pos < bufsz) { + /* + * A NUL terminator was not found; add one. + */ + buf[pos++] = '\0'; + } + } + + /* + * Ensure the entire string is NUL-terminated. + */ + buf[bufsz - 1] = '\0'; + + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + kmem_free(envp, envpsz); + + /* + * If the operation was a success, return the copied string length + * to the caller. + */ + *slen = (error == 0) ? pos : 0; + + return (error); +} diff --git a/usr/src/uts/common/fs/proc/prcontrol.c b/usr/src/uts/common/fs/proc/prcontrol.c index a73a64a4a4..7e99d23b97 100644 --- a/usr/src/uts/common/fs/proc/prcontrol.c +++ b/usr/src/uts/common/fs/proc/prcontrol.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #include <sys/types.h> @@ -1481,7 +1481,7 @@ pr_setsig(prnode_t *pnp, siginfo_t *sip) } else if (t->t_state == TS_STOPPED && sig == SIGKILL) { /* If SIGKILL, set stopped lwp running */ p->p_stopsig = 0; - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; t->t_dtrace_stop = 0; setrun_locked(t); } @@ -2276,9 +2276,17 @@ pr_szoneid(proc_t *p, zoneid_t zoneid, cred_t *cr) return (EPERM); if (zoneid != GLOBAL_ZONEID && zoneid != p->p_zone->zone_id) return (EINVAL); - if ((zptr = zone_find_by_id(zoneid)) == NULL) - return (EINVAL); + /* + * We cannot hold p_lock when we call zone_find_by_id since that can + * lead to a deadlock. zone_find_by_id() takes zonehash_lock. + * zone_enter() can hold the zonehash_lock and needs p_lock when it + * calls task_join. + */ mutex_exit(&p->p_lock); + if ((zptr = zone_find_by_id(zoneid)) == NULL) { + mutex_enter(&p->p_lock); + return (EINVAL); + } mutex_enter(&p->p_crlock); oldcred = p->p_cred; crhold(oldcred); diff --git a/usr/src/uts/common/fs/proc/prdata.h b/usr/src/uts/common/fs/proc/prdata.h index 8ea516bf82..72f26b3c05 100644 --- a/usr/src/uts/common/fs/proc/prdata.h +++ b/usr/src/uts/common/fs/proc/prdata.h @@ -27,7 +27,7 @@ /* All Rights Reserved */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. 
*/ #ifndef _SYS_PROC_PRDATA_H @@ -123,6 +123,7 @@ typedef enum prnodetype { #if defined(__i386) || defined(__amd64) PR_LDT, /* /proc/<pid>/ldt */ #endif + PR_ARGV, /* /proc/<pid>/argv */ PR_USAGE, /* /proc/<pid>/usage */ PR_LUSAGE, /* /proc/<pid>/lusage */ PR_PAGEDATA, /* /proc/<pid>/pagedata */ @@ -347,6 +348,8 @@ extern int pr_unset(proc_t *, long); extern void pr_sethold(prnode_t *, sigset_t *); extern void pr_setfault(proc_t *, fltset_t *); extern int prusrio(proc_t *, enum uio_rw, struct uio *, int); +extern int prreadargv(proc_t *, char *, size_t, size_t *); +extern int prreadenvv(proc_t *, char *, size_t, size_t *); extern int prwritectl(vnode_t *, struct uio *, cred_t *); extern int prlock(prnode_t *, int); extern void prunmark(proc_t *); diff --git a/usr/src/uts/common/fs/proc/prsubr.c b/usr/src/uts/common/fs/proc/prsubr.c index 7801fd0ac8..284bf8cb88 100644 --- a/usr/src/uts/common/fs/proc/prsubr.c +++ b/usr/src/uts/common/fs/proc/prsubr.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -201,6 +201,7 @@ prchoose(proc_t *p) case PR_SYSEXIT: case PR_SIGNALLED: case PR_FAULTED: + case PR_BRAND: /* * Make an lwp calling exit() be the * last lwp seen in the process. diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c index 411c9b8b0b..276f54ae3a 100644 --- a/usr/src/uts/common/fs/proc/prvnops.c +++ b/usr/src/uts/common/fs/proc/prvnops.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -96,6 +96,11 @@ struct prdirect { #define PRSDSIZE (sizeof (struct prdirect)) /* + * Maximum length of the /proc/$$/argv file: + */ +int prmaxargvlen = 4096; + +/* * Directory characteristics. */ typedef struct prdirent { @@ -166,6 +171,8 @@ static prdirent_t piddir[] = { { PR_LDT, 27 * sizeof (prdirent_t), sizeof (prdirent_t), "ldt" }, #endif + { PR_ARGV, 28 * sizeof (prdirent_t), sizeof (prdirent_t), + "argv" }, }; #define NPIDDIRFILES (sizeof (piddir) / sizeof (piddir[0]) - 2) @@ -582,6 +589,7 @@ static int pr_read_inval(), pr_read_as(), pr_read_status(), #if defined(__x86) pr_read_ldt(), #endif + pr_read_argv(), pr_read_usage(), pr_read_lusage(), pr_read_pagedata(), pr_read_watch(), pr_read_lwpstatus(), pr_read_lwpsinfo(), pr_read_lwpusage(), pr_read_xregs(), pr_read_priv(), @@ -610,6 +618,7 @@ static int (*pr_read_function[PR_NFILES])() = { #if defined(__x86) pr_read_ldt, /* /proc/<pid>/ldt */ #endif + pr_read_argv, /* /proc/<pid>/argv */ pr_read_usage, /* /proc/<pid>/usage */ pr_read_lusage, /* /proc/<pid>/lusage */ pr_read_pagedata, /* /proc/<pid>/pagedata */ @@ -672,6 +681,41 @@ pr_uioread(void *base, long count, uio_t *uiop) } static int +pr_read_argv(prnode_t *pnp, uio_t *uiop) +{ + char *args; + int error; + size_t asz = prmaxargvlen, sz; + + /* + * Allocate a scratch buffer for collection of the process arguments. 
+ */ + args = kmem_alloc(asz, KM_SLEEP); + + ASSERT(pnp->pr_type == PR_ARGV); + + if ((error = prlock(pnp, ZNO)) != 0) { + kmem_free(args, asz); + return (error); + } + + if ((error = prreadargv(pnp->pr_common->prc_proc, args, asz, + &sz)) != 0) { + prunlock(pnp); + kmem_free(args, asz); + return (error); + } + + prunlock(pnp); + + error = pr_uioread(args, sz, uiop); + + kmem_free(args, asz); + + return (error); +} + +static int pr_read_as(prnode_t *pnp, uio_t *uiop) { int error; @@ -1767,6 +1811,7 @@ static int (*pr_read_function_32[PR_NFILES])() = { #if defined(__x86) pr_read_ldt, /* /proc/<pid>/ldt */ #endif + pr_read_argv, /* /proc/<pid>/argv */ pr_read_usage_32, /* /proc/<pid>/usage */ pr_read_lusage_32, /* /proc/<pid>/lusage */ pr_read_pagedata_32, /* /proc/<pid>/pagedata */ @@ -2686,6 +2731,103 @@ prread(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct) #endif } +/* + * We make pr_write_psinfo_fname() somewhat simpler by asserting at compile + * time that PRFNSZ has the same definition as MAXCOMLEN. + */ +#if PRFNSZ != MAXCOMLEN +#error PRFNSZ/MAXCOMLEN mismatch +#endif + +static int +pr_write_psinfo_fname(prnode_t *pnp, uio_t *uiop) +{ + char fname[PRFNSZ]; + int offset = offsetof(psinfo_t, pr_fname), error; + +#ifdef _SYSCALL32_IMPL + if (curproc->p_model != DATAMODEL_LP64) + offset = offsetof(psinfo32_t, pr_fname); +#endif + + /* + * If this isn't a write to pr_fname (or if the size doesn't match + * PRFNSZ) return. + */ + if (uiop->uio_offset != offset || uiop->uio_resid != PRFNSZ) + return (0); + + if ((error = uiomove(fname, PRFNSZ, UIO_WRITE, uiop)) != 0) + return (error); + + fname[PRFNSZ - 1] = '\0'; + + if ((error = prlock(pnp, ZNO)) != 0) + return (error); + + bcopy(fname, pnp->pr_common->prc_proc->p_user.u_comm, PRFNSZ); + + prunlock(pnp); + + return (0); +} + +/* + * We make pr_write_psinfo_psargs() somewhat simpler by asserting at compile + * time that PRARGSZ has the same definition as PSARGSZ. + */ +#if PRARGSZ != PSARGSZ +#error PRARGSZ/PSARGSZ mismatch +#endif + +static int +pr_write_psinfo_psargs(prnode_t *pnp, uio_t *uiop) +{ + char psargs[PRARGSZ]; + int offset = offsetof(psinfo_t, pr_psargs), error; + +#ifdef _SYSCALL32_IMPL + if (curproc->p_model != DATAMODEL_LP64) + offset = offsetof(psinfo32_t, pr_psargs); +#endif + + /* + * If this isn't a write to pr_psargs (or if the size doesn't match + * PRARGSZ) return. + */ + if (uiop->uio_offset != offset || uiop->uio_resid != PRARGSZ) + return (0); + + if ((error = uiomove(psargs, PRARGSZ, UIO_WRITE, uiop)) != 0) + return (error); + + psargs[PRARGSZ - 1] = '\0'; + + if ((error = prlock(pnp, ZNO)) != 0) + return (error); + + bcopy(psargs, pnp->pr_common->prc_proc->p_user.u_psargs, PRARGSZ); + + prunlock(pnp); + + return (0); +} + +int +pr_write_psinfo(prnode_t *pnp, uio_t *uiop) +{ + int error; + + if ((error = pr_write_psinfo_fname(pnp, uiop)) != 0) + return (error); + + if ((error = pr_write_psinfo_psargs(pnp, uiop)) != 0) + return (error); + + return (0); +} + + /* ARGSUSED */ static int prwrite(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct) @@ -2764,6 +2906,9 @@ prwrite(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct) uiop->uio_resid = resid; return (error); + case PR_PSINFO: + return (pr_write_psinfo(pnp, uiop)); + default: return ((vp->v_type == VDIR)? 
EISDIR : EBADF); } @@ -3047,6 +3192,13 @@ prgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, case PR_AUXV: vap->va_size = __KERN_NAUXV_IMPL * PR_OBJSIZE(auxv32_t, auxv_t); break; + case PR_ARGV: + if ((p->p_flag & SSYS) || p->p_as == &kas) { + vap->va_size = PSARGSZ; + } else { + vap->va_size = prmaxargvlen; + } + break; #if defined(__x86) case PR_LDT: mutex_exit(&p->p_lock); @@ -3222,6 +3374,7 @@ praccess(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) case PR_USAGE: case PR_LUSAGE: case PR_LWPUSAGE: + case PR_ARGV: p = pr_p_lock(pnp); mutex_exit(&pr_pidlock); if (p == NULL) @@ -3307,6 +3460,7 @@ static vnode_t *(*pr_lookup_function[PR_NFILES])() = { #if defined(__x86) pr_lookup_notdir, /* /proc/<pid>/ldt */ #endif + pr_lookup_notdir, /* /proc/<pid>/argv */ pr_lookup_notdir, /* /proc/<pid>/usage */ pr_lookup_notdir, /* /proc/<pid>/lusage */ pr_lookup_notdir, /* /proc/<pid>/pagedata */ @@ -4546,11 +4700,15 @@ prgetnode(vnode_t *dp, prnodetype_t type) break; case PR_PSINFO: + pnp->pr_mode = 0644; /* readable by all + owner can write */ + break; + case PR_LPSINFO: case PR_LWPSINFO: case PR_USAGE: case PR_LUSAGE: case PR_LWPUSAGE: + case PR_ARGV: pnp->pr_mode = 0444; /* read-only by all */ break; @@ -4656,6 +4814,7 @@ static int (*pr_readdir_function[PR_NFILES])() = { #if defined(__x86) pr_readdir_notdir, /* /proc/<pid>/ldt */ #endif + pr_readdir_notdir, /* /proc/<pid>/argv */ pr_readdir_notdir, /* /proc/<pid>/usage */ pr_readdir_notdir, /* /proc/<pid>/lusage */ pr_readdir_notdir, /* /proc/<pid>/pagedata */ @@ -4805,6 +4964,7 @@ pr_readdir_piddir(prnode_t *pnp, uio_t *uiop, int *eofp) case PR_PROCDIR: case PR_PSINFO: case PR_USAGE: + case PR_ARGV: break; default: continue; @@ -6010,7 +6170,7 @@ prpoll(vnode_t *vp, short events, int anyyet, short *reventsp, } *reventsp = revents; - if (!anyyet && revents == 0) { + if ((!anyyet && revents == 0) || (events & POLLET)) { /* * Arrange to wake up the polling lwp when * the target process/lwp stops or terminates diff --git a/usr/src/uts/common/fs/sockfs/sockcommon.c b/usr/src/uts/common/fs/sockfs/sockcommon.c index 703e26ea61..682f1d867b 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #include <sys/types.h> @@ -501,6 +502,9 @@ sonode_constructor(void *buf, void *cdrarg, int kmflags) cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL); cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL); + so->so_krecv_cb = NULL; + so->so_krecv_arg = NULL; + return (0); } @@ -654,6 +658,10 @@ sonode_fini(struct sonode *so) if (so->so_filter_top != NULL) sof_sonode_cleanup(so); + /* Clean up any remnants of krecv callbacks */ + so->so_krecv_cb = NULL; + so->so_krecv_arg = NULL; + ASSERT(list_is_empty(&so->so_acceptq_list)); ASSERT(list_is_empty(&so->so_acceptq_defer)); ASSERT(!list_link_active(&so->so_acceptq_node)); diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c index 0be628f329..40c368736d 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. 
*/ #include <sys/types.h> @@ -128,7 +128,7 @@ so_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen, { int error; - SO_BLOCK_FALLBACK(so, SOP_BIND(so, name, namelen, flags, cr)); + SO_BLOCK_FALLBACK_SAFE(so, SOP_BIND(so, name, namelen, flags, cr)); ASSERT(flags == _SOBIND_XPG4_2 || flags == _SOBIND_SOCKBSD); @@ -586,11 +586,6 @@ so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag, SO_BLOCK_FALLBACK(so, SOP_SENDMBLK(so, msg, fflag, cr, mpp)); - if ((so->so_mode & SM_SENDFILESUPP) == 0) { - SO_UNBLOCK_FALLBACK(so); - return (EOPNOTSUPP); - } - error = so_sendmblk_impl(so, msg, fflag, cr, mpp, so->so_filter_top, B_FALSE); @@ -653,7 +648,7 @@ so_getsockname(struct sonode *so, struct sockaddr *addr, { int error; - SO_BLOCK_FALLBACK(so, SOP_GETSOCKNAME(so, addr, addrlen, cr)); + SO_BLOCK_FALLBACK_SAFE(so, SOP_GETSOCKNAME(so, addr, addrlen, cr)); if (so->so_filter_active == 0 || (error = sof_filter_getsockname(so, addr, addrlen, cr)) < 0) @@ -702,7 +697,7 @@ so_getsockopt(struct sonode *so, int level, int option_name, if (level == SOL_FILTER) return (sof_getsockopt(so, option_name, optval, optlenp, cr)); - SO_BLOCK_FALLBACK(so, + SO_BLOCK_FALLBACK_SAFE(so, SOP_GETSOCKOPT(so, level, option_name, optval, optlenp, flags, cr)); if ((so->so_filter_active == 0 || @@ -791,7 +786,7 @@ so_setsockopt(struct sonode *so, int level, int option_name, if (level == SOL_FILTER) return (sof_setsockopt(so, option_name, optval, optlen, cr)); - SO_BLOCK_FALLBACK(so, + SO_BLOCK_FALLBACK_SAFE(so, SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr)); /* X/Open requires this check */ @@ -953,6 +948,13 @@ so_poll(struct sonode *so, short events, int anyyet, short *reventsp, if (!list_is_empty(&so->so_acceptq_list)) *reventsp |= (POLLIN|POLLRDNORM) & events; + /* + * If we're looking for POLLRDHUP, indicate it if we have sent the + * last rx signal for the socket. 
+ */ + if ((events & POLLRDHUP) && (state & SS_SENTLASTREADSIG)) + *reventsp |= POLLRDHUP; + /* Data */ /* so_downcalls is null for sctp */ if (so->so_downcalls != NULL && so->so_downcalls->sd_poll != NULL) { @@ -988,14 +990,20 @@ so_poll(struct sonode *so, short events, int anyyet, short *reventsp, *reventsp |= POLLHUP; } - if (!*reventsp && !anyyet) { + if ((!*reventsp && !anyyet) || (events & POLLET)) { /* Check for read events again, but this time under lock */ if (events & (POLLIN|POLLRDNORM)) { mutex_enter(&so->so_lock); if (SO_HAVE_DATA(so) || !list_is_empty(&so->so_acceptq_list)) { + if (events & POLLET) { + so->so_pollev |= SO_POLLEV_IN; + *phpp = &so->so_poll_list; + } + mutex_exit(&so->so_lock); *reventsp |= (POLLIN|POLLRDNORM) & events; + return (0); } else { so->so_pollev |= SO_POLLEV_IN; @@ -1316,6 +1324,26 @@ so_queue_msg_impl(struct sonode *so, mblk_t *mp, } } + mutex_enter(&so->so_lock); + if (so->so_krecv_cb != NULL) { + boolean_t cont; + so_krecv_f func = so->so_krecv_cb; + void *arg = so->so_krecv_arg; + + mutex_exit(&so->so_lock); + cont = func(so, mp, msg_size, flags & MSG_OOB, arg); + mutex_enter(&so->so_lock); + if (cont == B_TRUE) { + space_left = so->so_rcvbuf; + } else { + so->so_rcv_queued = so->so_rcvlowat; + *errorp = ENOSPC; + space_left = -1; + } + goto done_unlock; + } + mutex_exit(&so->so_lock); + if (flags & MSG_OOB) { so_queue_oob(so, mp, msg_size); mutex_enter(&so->so_lock); @@ -1594,6 +1622,13 @@ so_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, return (ENOTCONN); } + mutex_enter(&so->so_lock); + if (so->so_krecv_cb != NULL) { + mutex_exit(&so->so_lock); + return (EOPNOTSUPP); + } + mutex_exit(&so->so_lock); + if (msg->msg_flags & MSG_PEEK) msg->msg_flags &= ~MSG_WAITALL; diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c index 957c8f93b4..9604ea5dba 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c @@ -24,6 +24,7 @@ */ /* * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/types.h> @@ -2276,9 +2277,9 @@ so_tpi_fallback(struct sonode *so, struct cred *cr) fbfunc = sp->sp_smod_info->smod_proto_fallback_func; /* - * Cannot fallback if the socket has active filters + * Cannot fallback if the socket has active filters or a krecv callback. */ - if (so->so_filter_active > 0) + if (so->so_filter_active > 0 || so->so_krecv_cb != NULL) return (EINVAL); switch (so->so_family) { @@ -2456,3 +2457,50 @@ out: return (error); } + +int +so_krecv_set(sonode_t *so, so_krecv_f cb, void *arg) +{ + int ret; + + if (cb == NULL && arg != NULL) + return (EINVAL); + + SO_BLOCK_FALLBACK(so, so_krecv_set(so, cb, arg)); + + mutex_enter(&so->so_lock); + if (so->so_state & SS_FALLBACK_COMP) { + mutex_exit(&so->so_lock); + SO_UNBLOCK_FALLBACK(so); + return (ENOTSUP); + } + + ret = so_lock_read(so, 0); + VERIFY(ret == 0); + /* + * Other consumers may actually care about getting extant data delivered + * to them, when they come along, they should figure out the best API + * for that. 
+ */ + so_rcv_flush(so); + + so->so_krecv_cb = cb; + so->so_krecv_arg = arg; + + so_unlock_read(so); + mutex_exit(&so->so_lock); + SO_UNBLOCK_FALLBACK(so); + + return (0); +} + +void +so_krecv_unblock(sonode_t *so) +{ + mutex_enter(&so->so_lock); + VERIFY(so->so_krecv_cb != NULL); + + so->so_rcv_queued = 0; + (void) so_check_flow_control(so); + mutex_exit(&so->so_lock); +} diff --git a/usr/src/uts/common/fs/sockfs/sockfilter.c b/usr/src/uts/common/fs/sockfs/sockfilter.c index 971523945e..7dca6ae6fc 100644 --- a/usr/src/uts/common/fs/sockfs/sockfilter.c +++ b/usr/src/uts/common/fs/sockfs/sockfilter.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/systm.h> @@ -246,6 +247,18 @@ sof_setsockopt_impl(struct sonode *so, int option_name, /* Module loaded OK, so there must be an ops vector */ ASSERT(ent->sofe_mod != NULL); + + /* + * Check again to confirm ATTACH is ok. See if the the module + * is not SOF_ATT_SAFE after an unsafe operation has taken + * place. + */ + if ((ent->sofe_mod->sofm_flags & SOF_ATT_SAFE) == 0 && + so->so_state & SS_FILOP_UNSF) { + sof_instance_destroy(inst); + return (EINVAL); + } + inst->sofi_ops = &ent->sofe_mod->sofm_ops; SOF_STAT_ADD(inst, tot_active_attach, 1); @@ -1444,7 +1457,13 @@ sof_filter_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode, * sof_register(version, name, ops, flags) * * Register a socket filter identified by name `name' and which should use - * the ops vector `ops' for event notification. `flags' should be set to 0. + * the ops vector `ops' for event notification. `flags' should be set to 0 + * by default for "unsafe" modules or SOF_ATT_SAFE for "safe" modules. An + * unsafe filter is one that cannot be attached after any socket operation has + * occured. This is the legacy default. A "safe" filter can be attached even + * after some basic initial socket operations have taken place. This set is + * currently bind, getsockname, getsockopt and setsockopt. The order in which + * a "safe" filter can be attached is more relaxed, and thus more flexible. * On success 0 is returned, otherwise an errno is returned. */ int @@ -1452,14 +1471,13 @@ sof_register(int version, const char *name, const sof_ops_t *ops, int flags) { sof_module_t *mod; - _NOTE(ARGUNUSED(flags)); - if (version != SOF_VERSION) return (EINVAL); mod = kmem_zalloc(sizeof (sof_module_t), KM_SLEEP); mod->sofm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP); (void) strcpy(mod->sofm_name, name); + mod->sofm_flags = flags; mod->sofm_ops = *ops; mutex_enter(&sof_module_lock); diff --git a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h index 7f7aece1f1..cf2ad8b20d 100644 --- a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h +++ b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. 
*/ #ifndef _SOCKFS_SOCKFILTER_H @@ -51,6 +52,7 @@ typedef struct sof_kstat sof_kstat_t; struct sof_module { char *sofm_name; + int sofm_flags; sof_ops_t sofm_ops; uint_t sofm_refcnt; list_node_t sofm_node; diff --git a/usr/src/uts/common/fs/sockfs/socknotify.c b/usr/src/uts/common/fs/sockfs/socknotify.c index 3d5ba2a7e8..3f858afecc 100644 --- a/usr/src/uts/common/fs/sockfs/socknotify.c +++ b/usr/src/uts/common/fs/sockfs/socknotify.c @@ -377,7 +377,7 @@ i_so_notify_last_rx(struct sonode *so, int *pollev, int *sigev) so->so_state |= SS_SENTLASTREADSIG; so->so_pollev &= ~SO_POLLEV_IN; - *pollev |= POLLIN|POLLRDNORM; + *pollev |= POLLIN|POLLRDNORM|POLLRDHUP; *sigev |= SOCKETSIG_READ; return (1); diff --git a/usr/src/uts/common/fs/sockfs/socksubr.c b/usr/src/uts/common/fs/sockfs/socksubr.c index 06d76044e5..095d84fc42 100644 --- a/usr/src/uts/common/fs/sockfs/socksubr.c +++ b/usr/src/uts/common/fs/sockfs/socksubr.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -1879,7 +1880,7 @@ ssize_t soreadfile(file_t *fp, uchar_t *buf, u_offset_t fileoff, int *err, size_t size) { struct uio auio; - struct iovec aiov[MSG_MAXIOVLEN]; + struct iovec aiov[1]; register vnode_t *vp; int ioflag, rwflag; ssize_t cnt; diff --git a/usr/src/uts/common/fs/sockfs/socksyscalls.c b/usr/src/uts/common/fs/sockfs/socksyscalls.c index a86cda937c..067eaedd7c 100644 --- a/usr/src/uts/common/fs/sockfs/socksyscalls.c +++ b/usr/src/uts/common/fs/sockfs/socksyscalls.c @@ -21,10 +21,10 @@ /* * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. */ -/* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */ - #include <sys/types.h> #include <sys/t_lock.h> #include <sys/param.h> @@ -51,6 +51,7 @@ #include <sys/cmn_err.h> #include <sys/vmsystm.h> #include <sys/policy.h> +#include <sys/limits.h> #include <sys/socket.h> #include <sys/socketvar.h> @@ -83,12 +84,6 @@ extern void nl7c_init(void); extern int sockfs_defer_nl7c_init; /* - * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c" - * as there isn't a formal definition of IOV_MAX ??? - */ -#define MSG_MAXIOVLEN 16 - -/* * Kernel component of socket creation. * * The socket library determines which version number to use. @@ -1023,9 +1018,10 @@ recvmsg(int sock, struct nmsghdr *msg, int flags) STRUCT_HANDLE(nmsghdr, umsgptr); struct nmsghdr lmsg; struct uio auio; - struct iovec aiov[MSG_MAXIOVLEN]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + ssize_t iovsize = 0; int iovcnt; - ssize_t len; + ssize_t len, rval; int i; int *flagsp; model_t model; @@ -1068,22 +1064,37 @@ recvmsg(int sock, struct nmsghdr *msg, int flags) iovcnt = lmsg.msg_iovlen; - if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) { + if (iovcnt <= 0 || iovcnt > IOV_MAX) { return (set_errno(EMSGSIZE)); } + if (iovcnt > IOV_MAX_STACK) { + iovsize = iovcnt * sizeof (struct iovec); + aiov = kmem_alloc(iovsize, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, while ensuring * that they can't move more than 2Gbytes of data in a single call. 
*/ if (model == DATAMODEL_ILP32) { - struct iovec32 aiov32[MSG_MAXIOVLEN]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + ssize_t iov32size; ssize32_t count32; - if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32, - iovcnt * sizeof (struct iovec32))) + iov32size = iovcnt * sizeof (struct iovec32); + if (iovsize != 0) + aiov32 = kmem_alloc(iov32size, KM_SLEEP); + + if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { @@ -1091,15 +1102,28 @@ recvmsg(int sock, struct nmsghdr *msg, int flags) iovlen32 = aiov32[i].iov_len; count32 += iovlen32; - if (iovlen32 < 0 || count32 < 0) + if (iovlen32 < 0 || count32 < 0) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + return (set_errno(EINVAL)); + } + aiov[i].iov_len = iovlen32; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + + if (iovsize != 0) + kmem_free(aiov32, iov32size); } else #endif /* _SYSCALL32_IMPL */ if (copyin(lmsg.msg_iov, aiov, iovcnt * sizeof (struct iovec))) { + if (iovsize != 0) + kmem_free(aiov, iovsize); + return (set_errno(EFAULT)); } len = 0; @@ -1107,6 +1131,9 @@ recvmsg(int sock, struct nmsghdr *msg, int flags) ssize_t iovlen = aiov[i].iov_len; len += iovlen; if (iovlen < 0 || len < 0) { + if (iovsize != 0) + kmem_free(aiov, iovsize); + return (set_errno(EINVAL)); } } @@ -1121,12 +1148,20 @@ recvmsg(int sock, struct nmsghdr *msg, int flags) (do_useracc == 0 || useracc(lmsg.msg_control, lmsg.msg_controllen, B_WRITE) != 0)) { + if (iovsize != 0) + kmem_free(aiov, iovsize); + return (set_errno(EFAULT)); } - return (recvit(sock, &lmsg, &auio, flags, + rval = recvit(sock, &lmsg, &auio, flags, STRUCT_FADDR(umsgptr, msg_namelen), - STRUCT_FADDR(umsgptr, msg_controllen), flagsp)); + STRUCT_FADDR(umsgptr, msg_controllen), flagsp); + + if (iovsize != 0) + kmem_free(aiov, iovsize); + + return (rval); } /* @@ -1264,9 +1299,10 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) struct nmsghdr lmsg; STRUCT_DECL(nmsghdr, u_lmsg); struct uio auio; - struct iovec aiov[MSG_MAXIOVLEN]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + ssize_t iovsize = 0; int iovcnt; - ssize_t len; + ssize_t len, rval; int i; model_t model; @@ -1309,7 +1345,7 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) iovcnt = lmsg.msg_iovlen; - if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) { + if (iovcnt <= 0 || iovcnt > IOV_MAX) { /* * Unless this is XPG 4.2 we allow iovcnt == 0 to * be compatible with SunOS 4.X and 4.4BSD. @@ -1318,19 +1354,34 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) return (set_errno(EMSGSIZE)); } + if (iovcnt > IOV_MAX_STACK) { + iovsize = iovcnt * sizeof (struct iovec); + aiov = kmem_alloc(iovsize, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, while ensuring * that they can't move more than 2Gbytes of data in a single call. 
*/ if (model == DATAMODEL_ILP32) { - struct iovec32 aiov32[MSG_MAXIOVLEN]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + ssize_t iov32size; ssize32_t count32; + iov32size = iovcnt * sizeof (struct iovec32); + if (iovsize != 0) + aiov32 = kmem_alloc(iov32size, KM_SLEEP); + if (iovcnt != 0 && - copyin((struct iovec32 *)lmsg.msg_iov, aiov32, - iovcnt * sizeof (struct iovec32))) + copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { @@ -1338,17 +1389,30 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) iovlen32 = aiov32[i].iov_len; count32 += iovlen32; - if (iovlen32 < 0 || count32 < 0) + if (iovlen32 < 0 || count32 < 0) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + return (set_errno(EINVAL)); + } + aiov[i].iov_len = iovlen32; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + + if (iovsize != 0) + kmem_free(aiov32, iov32size); } else #endif /* _SYSCALL32_IMPL */ if (iovcnt != 0 && copyin(lmsg.msg_iov, aiov, (unsigned)iovcnt * sizeof (struct iovec))) { + if (iovsize != 0) + kmem_free(aiov, iovsize); + return (set_errno(EFAULT)); } len = 0; @@ -1356,6 +1420,9 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) ssize_t iovlen = aiov[i].iov_len; len += iovlen; if (iovlen < 0 || len < 0) { + if (iovsize != 0) + kmem_free(aiov, iovsize); + return (set_errno(EINVAL)); } } @@ -1366,7 +1433,12 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) auio.uio_segflg = UIO_USERSPACE; auio.uio_limit = 0; - return (sendit(sock, &lmsg, &auio, flags)); + rval = sendit(sock, &lmsg, &auio, flags); + + if (iovsize != 0) + kmem_free(aiov, iovsize); + + return (rval); } ssize_t diff --git a/usr/src/uts/common/fs/sockfs/socktpi.c b/usr/src/uts/common/fs/sockfs/socktpi.c index d33f53f7a6..485e73eb02 100644 --- a/usr/src/uts/common/fs/sockfs/socktpi.c +++ b/usr/src/uts/common/fs/sockfs/socktpi.c @@ -6272,6 +6272,13 @@ sotpi_poll( if (sti->sti_conn_ind_head != NULL) *reventsp |= (POLLIN|POLLRDNORM) & events; + if (so->so_state & SS_CANTRCVMORE) { + *reventsp |= POLLRDHUP & events; + + if (so->so_state & SS_CANTSENDMORE) + *reventsp |= POLLHUP; + } + if (so->so_state & SS_OOBPEND) *reventsp |= POLLRDBAND & events; diff --git a/usr/src/uts/common/fs/swapfs/swap_subr.c b/usr/src/uts/common/fs/swapfs/swap_subr.c index 74c4302da9..a4d983665b 100644 --- a/usr/src/uts/common/fs/swapfs/swap_subr.c +++ b/usr/src/uts/common/fs/swapfs/swap_subr.c @@ -110,9 +110,11 @@ swapfs_recalc(pgcnt_t pgs) * memory that can be used as swap space should do so by * setting swapfs_desfree at boot time, not swapfs_minfree. * However, swapfs_minfree is tunable by install as a - * workaround for bugid 1147463. + * workaround for bugid 1147463. Note swapfs_minfree is set + * to 1/8th of memory, but clamped at the limit of 256 MB. 
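The recvmsg()/sendmsg() hunks above replace the old fixed MSG_MAXIOVLEN stack arrays with a small on-stack buffer that spills to a kmem_alloc()'d array whenever the caller passes more than IOV_MAX_STACK entries, with every error path now freeing that allocation before returning. A compressed userland analog of the idiom is sketched below; the SMALL_IOV constant and the malloc()/free() calls stand in for IOV_MAX_STACK and the kernel allocator and are illustrative only.

#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <errno.h>
#include <sys/uio.h>

#define	SMALL_IOV	16	/* stand-in for IOV_MAX_STACK */

/*
 * Copy a caller-supplied iovec array, using the provided stack buffer
 * for small counts and falling back to the heap for large ones.
 */
static int
copy_iov(const struct iovec *src, int iovcnt, struct iovec *stackbuf,
    struct iovec **outp)
{
	struct iovec *aiov = stackbuf;

	if (iovcnt <= 0 || iovcnt > IOV_MAX)
		return (EMSGSIZE);

	if (iovcnt > SMALL_IOV) {
		aiov = malloc(iovcnt * sizeof (struct iovec));
		if (aiov == NULL)
			return (ENOMEM);
	}

	(void) memcpy(aiov, src, iovcnt * sizeof (struct iovec));
	*outp = aiov;
	return (0);
}

/* Free only when the heap path was taken. */
static void
free_iov(struct iovec *aiov, const struct iovec *stackbuf)
{
	if (aiov != stackbuf)
		free(aiov);
}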
*/ - new_swapfs_minfree = MAX(btopr(2 * 1024 * 1024), pgs >> 3); + new_swapfs_minfree = MIN(MAX(btopr(2 * 1024 * 1024), pgs >> 3), + btopr(256 * 1024 * 1024)); } /* diff --git a/usr/src/uts/common/fs/tmpfs/tmp_dir.c b/usr/src/uts/common/fs/tmpfs/tmp_dir.c index f6621c8097..387cc6ae54 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_dir.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_dir.c @@ -516,7 +516,7 @@ tdirdelete( */ namelen = strlen(tpdp->td_name) + 1; - tmp_memfree(tpdp, sizeof (struct tdirent) + namelen); + kmem_free(tpdp, sizeof (struct tdirent) + namelen); dir->tn_size -= (sizeof (struct tdirent) + namelen); dir->tn_dirents--; @@ -549,8 +549,8 @@ tdirinit( ASSERT(RW_WRITE_HELD(&parent->tn_rwlock)); ASSERT(dir->tn_type == VDIR); - dot = tmp_memalloc(sizeof (struct tdirent) + 2, TMP_MUSTHAVE); - dotdot = tmp_memalloc(sizeof (struct tdirent) + 3, TMP_MUSTHAVE); + dot = kmem_zalloc(sizeof (struct tdirent) + 2, KM_SLEEP); + dotdot = kmem_zalloc(sizeof (struct tdirent) + 3, KM_SLEEP); /* * Initialize the entries @@ -650,7 +650,7 @@ tdirtrunc(struct tmpnode *dir) tmpfs_hash_out(tdp); - tmp_memfree(tdp, sizeof (struct tdirent) + namelen); + kmem_free(tdp, sizeof (struct tdirent) + namelen); dir->tn_size -= (sizeof (struct tdirent) + namelen); dir->tn_dirents--; } @@ -925,7 +925,7 @@ tdiraddentry( */ namelen = strlen(name) + 1; alloc_size = namelen + sizeof (struct tdirent); - tdp = tmp_memalloc(alloc_size, 0); + tdp = kmem_zalloc(alloc_size, KM_NOSLEEP | KM_NORMALPRI); if (tdp == NULL) return (ENOSPC); @@ -1025,7 +1025,7 @@ tdirmaketnode( ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime))) return (EOVERFLOW); type = va->va_type; - tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE); + tp = kmem_zalloc(sizeof (struct tmpnode), KM_SLEEP); tmpnode_init(tm, tp, va, cred); /* setup normal file/dir's extended attribute directory */ diff --git a/usr/src/uts/common/fs/tmpfs/tmp_subr.c b/usr/src/uts/common/fs/tmpfs/tmp_subr.c index 2e59d28d80..e6e2b392fe 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_subr.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_subr.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/types.h> @@ -40,9 +41,19 @@ #include <sys/policy.h> #include <sys/fs/tmp.h> #include <sys/fs/tmpnode.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> + +#define KILOBYTE 1024 +#define MEGABYTE (1024 * KILOBYTE) +#define GIGABYTE (1024 * MEGABYTE) #define MODESHIFT 3 +#define VALIDMODEBITS 07777 + +extern pgcnt_t swapfs_minfree; + int tmp_taccess(void *vtp, int mode, struct cred *cred) { @@ -71,7 +82,6 @@ tmp_taccess(void *vtp, int mode, struct cred *cred) * a plain file and you have write access to that file. * Function returns 0 if remove access is granted. */ - int tmp_sticky_remove_access(struct tmpnode *dir, struct tmpnode *entry, struct cred *cr) @@ -89,111 +99,122 @@ tmp_sticky_remove_access(struct tmpnode *dir, struct tmpnode *entry, } /* - * Allocate zeroed memory if tmpfs_maxkmem has not been exceeded - * or the 'musthave' flag is set. 'musthave' allocations should - * always be subordinate to normal allocations so that tmpfs_maxkmem - * can't be exceeded by more than a few KB. Example: when creating - * a new directory, the tmpnode is a normal allocation; if that - * succeeds, the dirents for "." and ".." are 'musthave' allocations. 
- */ -void * -tmp_memalloc(size_t size, int musthave) -{ - static time_t last_warning; - time_t now; - - if (atomic_add_long_nv(&tmp_kmemspace, size) < tmpfs_maxkmem || - musthave) - return (kmem_zalloc(size, KM_SLEEP)); - - atomic_add_long(&tmp_kmemspace, -size); - now = gethrestime_sec(); - if (last_warning != now) { - last_warning = now; - cmn_err(CE_WARN, "tmp_memalloc: tmpfs over memory limit"); - } - return (NULL); -} - -void -tmp_memfree(void *cp, size_t size) -{ - kmem_free(cp, size); - atomic_add_long(&tmp_kmemspace, -size); -} - -/* * Convert a string containing a number (number of bytes) to a pgcnt_t, * containing the corresponding number of pages. On 32-bit kernels, the * maximum value encoded in 'str' is PAGESIZE * ULONG_MAX, while the value * returned in 'maxpg' is at most ULONG_MAX. * - * If the number is followed by a "k" or "K", the value is converted from - * kilobytes to bytes. If it is followed by an "m" or "M" it is converted - * from megabytes to bytes. If it is not followed by a character it is - * assumed to be in bytes. Multiple letter options are allowed, so for instance - * '2mk' is interpreted as 2gb. + * The number may be followed by a magnitude suffix: "k" or "K" for kilobytes; + * "m" or "M" for megabytes; "g" or "G" for gigabytes. This interface allows + * for an arguably esoteric interpretation of multiple suffix characters: + * namely, they cascade. For example, the caller may specify "2mk", which is + * interpreted as 2 gigabytes. It would seem, at this late stage, that the + * horse has left not only the barn but indeed the country, and possibly the + * entire planetary system. Alternatively, the number may be followed by a + * single '%' sign, indicating the size is a percentage of either the zone's + * swap limit or the system's overall swap size. * * Parse and overflow errors are detected and a non-zero number returned on * error. */ - int tmp_convnum(char *str, pgcnt_t *maxpg) { - uint64_t num = 0, oldnum; + u_longlong_t num = 0; #ifdef _LP64 - uint64_t max_bytes = ULONG_MAX; + u_longlong_t max_bytes = ULONG_MAX; #else - uint64_t max_bytes = PAGESIZE * (uint64_t)ULONG_MAX; + u_longlong_t max_bytes = PAGESIZE * (uint64_t)ULONG_MAX; #endif char *c; - - if (str == NULL) + const struct convchar { + char *cc_char; + uint64_t cc_factor; + } convchars[] = { + { "kK", KILOBYTE }, + { "mM", MEGABYTE }, + { "gG", GIGABYTE }, + { NULL, 0 } + }; + + if (str == NULL) { return (EINVAL); + } c = str; /* - * Convert str to number + * Convert the initial numeric portion of the input string. */ - while ((*c >= '0') && (*c <= '9')) { - oldnum = num; - num = num * 10 + (*c++ - '0'); - if (oldnum > num) /* overflow */ + if (ddi_strtoull(str, &c, 10, &num) != 0) { + return (EINVAL); + } + + /* + * Handle a size in percent. Anything other than a single percent + * modifier is invalid. We use either the zone's swap limit or the + * system's total available swap size as the initial value. Perform the + * intermediate calculation in pages to avoid overflow. 
+ */ + if (*c == '\%') { + u_longlong_t cap; + + if (*(c + 1) != '\0') + return (EINVAL); + + if (num > 100) return (EINVAL); + + cap = (u_longlong_t)curproc->p_zone->zone_max_swap_ctl; + if (cap == UINT64_MAX) { + /* + * Use the amount of available physical and memory swap + */ + mutex_enter(&anoninfo_lock); + cap = TOTAL_AVAILABLE_SWAP; + mutex_exit(&anoninfo_lock); + } else { + cap = btop(cap); + } + + num = ptob(cap * num / 100); + goto done; } /* - * Terminate on null + * Apply the (potentially cascading) magnitude suffixes until an + * invalid character is found, or the string comes to an end. */ - while (*c != '\0') { - switch (*c++) { + for (; *c != '\0'; c++) { + int i; + + for (i = 0; convchars[i].cc_char != NULL; i++) { + /* + * Check if this character matches this multiplier + * class: + */ + if (strchr(convchars[i].cc_char, *c) != NULL) { + /* + * Check for overflow: + */ + if (num > max_bytes / convchars[i].cc_factor) { + return (EINVAL); + } + + num *= convchars[i].cc_factor; + goto valid_char; + } + } /* - * convert from kilobytes + * This was not a valid multiplier suffix character. */ - case 'k': - case 'K': - if (num > max_bytes / 1024) /* will overflow */ - return (EINVAL); - num *= 1024; - break; + return (EINVAL); - /* - * convert from megabytes - */ - case 'm': - case 'M': - if (num > max_bytes / (1024 * 1024)) /* will overflow */ - return (EINVAL); - num *= 1024 * 1024; - break; - - default: - return (EINVAL); - } +valid_char: + continue; } +done: /* * Since btopr() rounds up to page granularity, this round-up can * cause an overflow only if 'num' is between (max_bytes - PAGESIZE) @@ -204,3 +225,29 @@ tmp_convnum(char *str, pgcnt_t *maxpg) return (EINVAL); return (0); } + +/* + * Parse an octal mode string for use as the permissions set for the root + * of the tmpfs mount. + */ +int +tmp_convmode(char *str, mode_t *mode) +{ + ulong_t num; + char *c; + + if (str == NULL) { + return (EINVAL); + } + + if (ddi_strtoul(str, &c, 8, &num) != 0) { + return (EINVAL); + } + + if ((num & ~VALIDMODEBITS) != 0) { + return (EINVAL); + } + + *mode = VALIDMODEBITS & num; + return (0); +} diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c index f8a36a528f..3c088c442c 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/types.h> @@ -55,6 +56,15 @@ static int tmpfsfstype; /* + * tmpfs_mountcount is used to prevent module unloads while there is still + * state from a former mount hanging around. With forced umount support, the + * filesystem module must not be allowed to go away before the last + * VFS_FREEVFS() call has been made. Since this is just an atomic counter, + * there's no need for locking. + */ +static uint32_t tmpfs_mountcount; + +/* * tmpfs vfs operations. 
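The rewritten tmp_convnum() above accepts cascading k/m/g suffixes and a single trailing '%' that sizes the mount relative to the zone's swap cap or the system swap total, and the new tmp_convmode() parses an octal "mode=" mount option for the root directory (so a mount argument such as size=512m,mode=1777 or size=20% becomes possible). To make the suffix cascade concrete, here is a standalone userland analog of the conversion; it omits the percentage and page-rounding handling and its names are illustrative, not the kernel's.

#include <stdlib.h>
#include <stdint.h>
#include <errno.h>

/*
 * Convert "2m", "2mk" (= 2m * 1k = 2g), etc. to bytes, checking for
 * overflow at each multiplication.
 */
static int
convnum(const char *str, uint64_t *outp)
{
	char *c;
	uint64_t num, factor;

	errno = 0;
	num = strtoull(str, &c, 10);
	if (errno != 0 || c == str)
		return (EINVAL);

	for (; *c != '\0'; c++) {
		switch (*c) {
		case 'k': case 'K':
			factor = 1024ULL;
			break;
		case 'm': case 'M':
			factor = 1024ULL * 1024;
			break;
		case 'g': case 'G':
			factor = 1024ULL * 1024 * 1024;
			break;
		default:
			return (EINVAL);
		}
		if (num > UINT64_MAX / factor)
			return (EINVAL);
		num *= factor;
	}

	*outp = num;
	return (0);
}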
*/ static int tmpfsinit(int, char *); @@ -64,6 +74,7 @@ static int tmp_unmount(struct vfs *, int, struct cred *); static int tmp_root(struct vfs *, struct vnode **); static int tmp_statvfs(struct vfs *, struct statvfs64 *); static int tmp_vget(struct vfs *, struct vnode **, struct fid *); +static void tmp_freevfs(vfs_t *vfsp); /* * Loadable module wrapper @@ -76,7 +87,7 @@ static vfsdef_t vfw = { VFSDEF_VERSION, "tmpfs", tmpfsinit, - VSW_HASPROTO|VSW_STATS|VSW_ZMOUNT, + VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT, &tmpfs_proto_opttbl }; @@ -90,7 +101,8 @@ static mntopt_t tmpfs_options[] = { /* Option name Cancel Opt Arg Flags Data */ { MNTOPT_XATTR, xattr_cancel, NULL, MO_DEFAULT, NULL}, { MNTOPT_NOXATTR, noxattr_cancel, NULL, NULL, NULL}, - { "size", NULL, "0", MO_HASVALUE, NULL} + { "size", NULL, "0", MO_HASVALUE, NULL}, + { "mode", NULL, NULL, MO_HASVALUE, NULL} }; @@ -121,6 +133,14 @@ _fini() { int error; + /* + * If a forceably unmounted instance is still hanging around, we cannot + * allow the module to be unloaded because that would cause panics once + * the VFS framework decides it's time to call into VFS_FREEVFS(). + */ + if (tmpfs_mountcount) + return (EBUSY); + error = mod_remove(&modlinkage); if (error) return (error); @@ -139,14 +159,6 @@ _info(struct modinfo *modinfop) } /* - * The following are patchable variables limiting the amount of system - * resources tmpfs can use. - * - * tmpfs_maxkmem limits the amount of kernel kmem_alloc memory - * tmpfs can use for it's data structures (e.g. tmpnodes, directory entries) - * It is not determined by setting a hard limit but rather as a percentage of - * physical memory which is determined when tmpfs is first used in the system. - * * tmpfs_minfree is the minimum amount of swap space that tmpfs leaves for * the rest of the system. In other words, if the amount of free swap space * in the system (i.e. anoninfo.ani_free) drops below tmpfs_minfree, tmpfs @@ -155,9 +167,7 @@ _info(struct modinfo *modinfop) * There is also a per mount limit on the amount of swap space * (tmount.tm_anonmax) settable via a mount option. 
*/ -size_t tmpfs_maxkmem = 0; size_t tmpfs_minfree = 0; -size_t tmp_kmemspace; /* bytes of kernel heap used by all tmpfs */ static major_t tmpfs_major; static minor_t tmpfs_minor; @@ -176,6 +186,7 @@ tmpfsinit(int fstype, char *name) VFSNAME_ROOT, { .vfs_root = tmp_root }, VFSNAME_STATVFS, { .vfs_statvfs = tmp_statvfs }, VFSNAME_VGET, { .vfs_vget = tmp_vget }, + VFSNAME_FREEVFS, { .vfs_freevfs = tmp_freevfs }, NULL, NULL }; int error; @@ -210,27 +221,17 @@ tmpfsinit(int fstype, char *name) tmpfs_minfree = btopr(TMPMINFREE); } - /* - * The maximum amount of space tmpfs can allocate is - * TMPMAXPROCKMEM percent of kernel memory - */ - if (tmpfs_maxkmem == 0) - tmpfs_maxkmem = MAX(PAGESIZE, kmem_maxavail() / TMPMAXFRACKMEM); - if ((tmpfs_major = getudev()) == (major_t)-1) { cmn_err(CE_WARN, "tmpfsinit: Can't get unique device number."); tmpfs_major = 0; } mutex_init(&tmpfs_minor_lock, NULL, MUTEX_DEFAULT, NULL); + tmpfs_mountcount = 0; return (0); } static int -tmp_mount( - struct vfs *vfsp, - struct vnode *mvp, - struct mounta *uap, - struct cred *cr) +tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) { struct tmount *tm = NULL; struct tmpnode *tp; @@ -239,8 +240,9 @@ tmp_mount( pgcnt_t anonmax; struct vattr rattr; int got_attrs; - - char *sizestr; + boolean_t mode_arg = B_FALSE; + mode_t root_mode = 0777; + char *argstr; if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) return (error); @@ -249,7 +251,7 @@ tmp_mount( return (ENOTDIR); mutex_enter(&mvp->v_lock); - if ((uap->flags & MS_OVERLAY) == 0 && + if ((uap->flags & MS_REMOUNT) == 0 && (uap->flags & MS_OVERLAY) == 0 && (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { mutex_exit(&mvp->v_lock); return (EBUSY); @@ -275,18 +277,45 @@ tmp_mount( * tm_anonmax is set according to the mount arguments * if any. Otherwise, it is set to a maximum value. */ - if (vfs_optionisset(vfsp, "size", &sizestr)) { - if ((error = tmp_convnum(sizestr, &anonmax)) != 0) + if (vfs_optionisset(vfsp, "size", &argstr)) { + if ((error = tmp_convnum(argstr, &anonmax)) != 0) goto out; } else { anonmax = ULONG_MAX; } + /* + * The "mode" mount argument allows the operator to override the + * permissions of the root of the tmpfs mount. + */ + if (vfs_optionisset(vfsp, "mode", &argstr)) { + if ((error = tmp_convmode(argstr, &root_mode)) != 0) { + goto out; + } + mode_arg = B_TRUE; + } + if (error = pn_get(uap->dir, (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn)) goto out; - if ((tm = tmp_memalloc(sizeof (struct tmount), 0)) == NULL) { + if (uap->flags & MS_REMOUNT) { + tm = (struct tmount *)VFSTOTM(vfsp); + + /* + * If we change the size so its less than what is currently + * being used, we allow that. The file system will simply be + * full until enough files have been removed to get below the + * new max. 
+ */ + mutex_enter(&tm->tm_contents); + tm->tm_anonmax = anonmax; + mutex_exit(&tm->tm_contents); + goto out; + } + + if ((tm = kmem_zalloc(sizeof (struct tmount), + KM_NOSLEEP | KM_NORMALPRI)) == NULL) { pn_free(&dpn); error = ENOMEM; goto out; @@ -318,17 +347,17 @@ tmp_mount( vfsp->vfs_bsize = PAGESIZE; vfsp->vfs_flag |= VFS_NOTRUNC; vfs_make_fsid(&vfsp->vfs_fsid, tm->tm_dev, tmpfsfstype); - tm->tm_mntpath = tmp_memalloc(dpn.pn_pathlen + 1, TMP_MUSTHAVE); + tm->tm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP); (void) strcpy(tm->tm_mntpath, dpn.pn_path); /* * allocate and initialize root tmpnode structure */ bzero(&rattr, sizeof (struct vattr)); - rattr.va_mode = (mode_t)(S_IFDIR | 0777); /* XXX modes */ + rattr.va_mode = (mode_t)(S_IFDIR | root_mode); rattr.va_type = VDIR; rattr.va_rdev = 0; - tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE); + tp = kmem_zalloc(sizeof (struct tmpnode), KM_SLEEP); tmpnode_init(tm, tp, &rattr, cr); /* @@ -345,7 +374,14 @@ tmp_mount( * the previously set hardwired defaults to prevail. */ if (got_attrs == 0) { - tp->tn_mode = rattr.va_mode; + if (!mode_arg) { + /* + * Only use the underlying mount point for the + * mode if the "mode" mount argument was not + * provided. + */ + tp->tn_mode = rattr.va_mode; + } tp->tn_uid = rattr.va_uid; tp->tn_gid = rattr.va_gid; } @@ -366,6 +402,7 @@ tmp_mount( pn_free(&dpn); error = 0; + atomic_inc_32(&tmpfs_mountcount); out: if (error == 0) @@ -381,36 +418,107 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) struct tmpnode *tnp, *cancel; struct vnode *vp; int error; + uint_t cnt; + int i; if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) return (error); - /* - * forced unmount is not supported by this file system - * and thus, ENOTSUP, is being returned. - */ - if (flag & MS_FORCE) - return (ENOTSUP); - mutex_enter(&tm->tm_contents); /* - * If there are no open files, only the root node should have - * a reference count. + * In the normal unmount case (non-forced unmount), if there are no + * open files, only the root node should have a reference count. + * * With tm_contents held, nothing can be added or removed. * There may be some dirty pages. To prevent fsflush from * disrupting the unmount, put a hold on each node while scanning. * If we find a previously referenced node, undo the holds we have * placed and fail EBUSY. + * + * However, in the case of a forced umount, things are a bit different. + * An additional VFS_HOLD is added for each outstanding VN_HOLD to + * ensure that the file system is not cleaned up (tmp_freevfs) until + * the last vfs hold is dropped. This happens in tmp_inactive as the + * vnodes are released. Also, we can't add an additional VN_HOLD in + * this case since that would prevent tmp_inactive from ever being + * called. Finally, we do need to drop the zone ref now (zone_rele_ref) + * so that the zone is not blocked waiting for the final file system + * cleanup. */ tnp = tm->tm_rootnode; - if (TNTOV(tnp)->v_count > 1) { + + vp = TNTOV(tnp); + mutex_enter(&vp->v_lock); + cnt = vp->v_count; + if (flag & MS_FORCE) { + vfsp->vfs_flag |= VFS_UNMOUNTED; + /* Extra hold which we rele below when we drop the zone ref */ + VFS_HOLD(vfsp); + + for (i = 1; i < cnt; i++) + VFS_HOLD(vfsp); + + /* drop the mutex now because no one can find this mount */ + mutex_exit(&tm->tm_contents); + } else if (cnt > 1) { + mutex_exit(&vp->v_lock); mutex_exit(&tm->tm_contents); return (EBUSY); } + mutex_exit(&vp->v_lock); + /* + * Check for open files. 
An open file causes everything to unwind + * unless this is a forced umount. + */ for (tnp = tnp->tn_forw; tnp; tnp = tnp->tn_forw) { - if ((vp = TNTOV(tnp))->v_count > 0) { + vp = TNTOV(tnp); + mutex_enter(&vp->v_lock); + cnt = vp->v_count; + if (flag & MS_FORCE) { + for (i = 0; i < cnt; i++) + VFS_HOLD(vfsp); + + /* + * In the case of a forced umount don't add an + * additional VN_HOLD on the already held vnodes, like + * we do in the non-forced unmount case. If the + * cnt > 0, then the vnode already has at least one + * hold and we need tmp_inactive to get called when the + * last pre-existing hold on the node is released so + * that we can VFS_RELE the VFS holds we just added. + */ + if (cnt == 0) { + /* directly add VN_HOLD since have the lock */ + vp->v_count++; + } + + mutex_exit(&vp->v_lock); + + /* + * If the tmpnode has any pages associated with it + * (i.e. if it's a normal file with non-zero size), the + * tmpnode could still be discovered by pageout or + * fsflush via the page vnode pointers. To prevent this + * from interfering with the tmp_freevfs, truncate the + * tmpnode now. + */ + if (tnp->tn_size != 0 && tnp->tn_type == VREG) { + rw_enter(&tnp->tn_rwlock, RW_WRITER); + rw_enter(&tnp->tn_contents, RW_WRITER); + + (void) tmpnode_trunc(tm, tnp, 0); + + rw_exit(&tnp->tn_contents); + rw_exit(&tnp->tn_rwlock); + + ASSERT(tnp->tn_size == 0); + ASSERT(tnp->tn_nblocks == 0); + } + } else if (cnt > 0) { + /* An open file; unwind the holds we've been adding. */ + mutex_exit(&vp->v_lock); cancel = tm->tm_rootnode->tn_forw; while (cancel != tnp) { vp = TNTOV(cancel); @@ -420,14 +528,50 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) } mutex_exit(&tm->tm_contents); return (EBUSY); + } else { + /* directly add a VN_HOLD since we have the lock */ + vp->v_count++; + mutex_exit(&vp->v_lock); } - VN_HOLD(vp); } - /* - * We can drop the mutex now because no one can find this mount - */ - mutex_exit(&tm->tm_contents); + if (flag & MS_FORCE) { + /* + * Drop the zone ref now since we don't know how long it will + * be until the final vfs_rele is called by tmp_inactive. + */ + if (vfsp->vfs_zone) { + zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref, + ZONE_REF_VFS); + vfsp->vfs_zone = 0; + } + /* We can now drop the extra hold we added above. */ + VFS_RELE(vfsp); + } else { + /* + * For the non-forced case, we can drop the mutex now because + * no one can find this mount anymore + */ + vfsp->vfs_flag |= VFS_UNMOUNTED; + mutex_exit(&tm->tm_contents); + } + + return (0); +} + +/* + * Implementation of VFS_FREEVFS() to support forced umounts. This is called by + * the vfs framework after umount and the last VFS_RELE, to trigger the release + * of any resources still associated with the given vfs_t. We only add + * additional VFS_HOLDs during the forced umount case, so this is normally + * called immediately after tmp_umount. + */ +void +tmp_freevfs(vfs_t *vfsp) +{ + struct tmount *tm = (struct tmount *)VFSTOTM(vfsp); + struct tmpnode *tnp; + struct vnode *vp; /* * Free all kmemalloc'd and anonalloc'd memory associated with @@ -437,6 +581,16 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) * tmpnode_free which assumes that the directory entry has been * removed before the file. */ + + /* + * Now that we are tearing ourselves down we need to remove the + * UNMOUNTED flag. If we don't, we'll later hit a VN_RELE when we remove + * files from the system causing us to have a negative value. 
Doing this + * seems a bit better than trying to set a flag on the tmount that says + * we're tearing down. + */ + vfsp->vfs_flag &= ~VFS_UNMOUNTED; + /* * Remove all directory entries */ @@ -503,15 +657,16 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) ASSERT(tm->tm_mntpath); - tmp_memfree(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1); + kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1); ASSERT(tm->tm_anonmem == 0); mutex_destroy(&tm->tm_contents); mutex_destroy(&tm->tm_renamelck); - tmp_memfree(tm, sizeof (struct tmount)); + kmem_free(tm, sizeof (struct tmount)); - return (0); + /* Allow _fini() to succeed now */ + atomic_dec_32(&tmpfs_mountcount); } /* @@ -614,13 +769,7 @@ tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp) * available to tmpfs. This is fairly inaccurate since it doesn't * take into account the names stored in the directory entries. */ - if (tmpfs_maxkmem > tmp_kmemspace) - sbp->f_ffree = (tmpfs_maxkmem - tmp_kmemspace) / - (sizeof (struct tmpnode) + sizeof (struct tdirent)); - else - sbp->f_ffree = 0; - - sbp->f_files = tmpfs_maxkmem / + sbp->f_ffree = sbp->f_files = ptob(availrmem) / (sizeof (struct tmpnode) + sizeof (struct tdirent)); sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); (void) cmpldev(&d32, vfsp->vfs_dev); diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c index 61607a6dcc..464d638db0 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -584,6 +584,10 @@ tmp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred, struct tmount *tm = (struct tmount *)VTOTM(vp); int error; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + /* * We don't currently support reading non-regular files */ @@ -613,6 +617,10 @@ tmp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred, struct tmount *tm = (struct tmount *)VTOTM(vp); int error; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + /* * We don't currently support writing to non-regular files */ @@ -833,6 +841,9 @@ tmp_lookup( struct tmpnode *ntp = NULL; int error; + /* If the filesystem was umounted by force, return immediately. */ + if (dvp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); /* allow cd into @ dir */ if (flags & LOOKUP_XATTR) { @@ -871,8 +882,7 @@ tmp_lookup( return (error); } - xdp = tmp_memalloc(sizeof (struct tmpnode), - TMP_MUSTHAVE); + xdp = kmem_zalloc(sizeof (struct tmpnode), KM_SLEEP); tm = VTOTM(dvp); tmpnode_init(tm, xdp, &tp->tn_attr, NULL); /* @@ -1186,7 +1196,7 @@ tmp_rename( struct tmpnode *toparent; struct tmpnode *fromtp = NULL; /* source tmpnode */ struct tmount *tm = (struct tmount *)VTOTM(odvp); - int error; + int error, err; int samedir = 0; /* set if odvp == ndvp */ struct vnode *realvp; @@ -1262,15 +1272,6 @@ tmp_rename( error = 0; goto done; } - vnevent_rename_src(TNTOV(fromtp), odvp, onm, ct); - - /* - * Notify the target directory if not same as - * source directory. - */ - if (ndvp != odvp) { - vnevent_rename_dest_dir(ndvp, ct); - } /* * Unlink from source. 
@@ -1278,7 +1279,7 @@ tmp_rename( rw_enter(&fromparent->tn_rwlock, RW_WRITER); rw_enter(&fromtp->tn_rwlock, RW_WRITER); - error = tdirdelete(fromparent, fromtp, onm, DR_RENAME, cred); + error = err = tdirdelete(fromparent, fromtp, onm, DR_RENAME, cred); /* * The following handles the case where our source tmpnode was @@ -1293,6 +1294,12 @@ tmp_rename( rw_exit(&fromtp->tn_rwlock); rw_exit(&fromparent->tn_rwlock); + + if (err == 0) { + vnevent_rename_src(TNTOV(fromtp), odvp, onm, ct); + vnevent_rename_dest_dir(ndvp, TNTOV(fromtp), nnm, ct); + } + done: tmpnode_rele(fromtp); mutex_exit(&tm->tm_renamelck); @@ -1459,6 +1466,10 @@ tmp_readdir( int reclen; caddr_t outbuf; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + if (uiop->uio_loffset >= MAXOFF_T) { if (eofp) *eofp = 1; @@ -1597,7 +1608,7 @@ tmp_symlink( return (error); } len = strlen(tnm) + 1; - cp = tmp_memalloc(len, 0); + cp = kmem_alloc(len, KM_NOSLEEP | KM_NORMALPRI); if (cp == NULL) { tmpnode_rele(self); return (ENOSPC); @@ -1662,10 +1673,27 @@ top: * there's little to do -- just drop our hold. */ if (vp->v_count > 1 || tp->tn_nlink != 0) { - vp->v_count--; + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) { + /* + * Since the file system was forcibly unmounted, we can + * have a case (v_count == 1, tn_nlink != 0) where this + * file was open so we didn't add an extra hold on the + * file in tmp_unmount. We are counting on the + * interaction of the hold made in tmp_unmount and + * rele-ed in tmp_vfsfree so we need to be sure we + * don't decrement in this case. + */ + if (vp->v_count > 1) + vp->v_count--; + } else { + vp->v_count--; + } mutex_exit(&vp->v_lock); mutex_exit(&tp->tn_tlock); rw_exit(&tp->tn_rwlock); + /* If the filesystem was umounted by force, rele the vfs ref */ + if (tm->tm_vfsp->vfs_flag & VFS_UNMOUNTED) + VFS_RELE(tm->tm_vfsp); return; } @@ -1690,7 +1718,7 @@ top: goto top; } if (tp->tn_type == VLNK) - tmp_memfree(tp->tn_symlink, tp->tn_size + 1); + kmem_free(tp->tn_symlink, tp->tn_size + 1); } /* @@ -1724,7 +1752,11 @@ top: rw_destroy(&tp->tn_rwlock); mutex_destroy(&tp->tn_tlock); vn_free(TNTOV(tp)); - tmp_memfree(tp, sizeof (struct tmpnode)); + kmem_free(tp, sizeof (struct tmpnode)); + + /* If the filesystem was umounted by force, rele the vfs ref */ + if (tm->tm_vfsp->vfs_flag & VFS_UNMOUNTED) + VFS_RELE(tm->tm_vfsp); } /* ARGSUSED2 */ @@ -1846,6 +1878,10 @@ tmp_getapage( struct vnode *pvp; u_offset_t poff; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + if (protp != NULL) *protp = PROT_ALL; again: @@ -2067,6 +2103,10 @@ tmp_putapage( u_offset_t offset; u_offset_t tmpoff; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + ASSERT(PAGE_LOCKED(pp)); /* Kluster in tmp_klustsize chunks */ diff --git a/usr/src/uts/common/fs/udfs/udf_dir.c b/usr/src/uts/common/fs/udfs/udf_dir.c index c1e2c74a87..def046a0bf 100644 --- a/usr/src/uts/common/fs/udfs/udf_dir.c +++ b/usr/src/uts/common/fs/udfs/udf_dir.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
*/ #include <sys/types.h> @@ -562,9 +563,8 @@ out: namep, ctp); } - if (sdp != tdp) { - vnevent_rename_dest_dir(ITOV(tdp), ctp); - } + vnevent_rename_dest_dir(ITOV(tdp), ITOV(tip), + namep, ctp); } /* diff --git a/usr/src/uts/common/fs/udfs/udf_vnops.c b/usr/src/uts/common/fs/udfs/udf_vnops.c index 307d3987ed..2d8de23399 100644 --- a/usr/src/uts/common/fs/udfs/udf_vnops.c +++ b/usr/src/uts/common/fs/udfs/udf_vnops.c @@ -911,7 +911,7 @@ udf_rename( caller_context_t *ct, int flags) { - int32_t error = 0; + int32_t error = 0, err; struct udf_vfs *udf_vfsp; struct ud_inode *sip; /* source inode */ struct ud_inode *sdp, *tdp; /* source and target parent inode */ @@ -995,7 +995,6 @@ udf_rename( rw_exit(&tdp->i_rwlock); goto errout; } - vnevent_rename_src(ITOV(sip), sdvp, snm, ct); rw_exit(&tdp->i_rwlock); rw_enter(&sdp->i_rwlock, RW_WRITER); @@ -1006,11 +1005,15 @@ udf_rename( * If the entry has changed just forget about it. Release * the source inode. */ - if ((error = ud_dirremove(sdp, snm, sip, (struct vnode *)0, + if ((error = err = ud_dirremove(sdp, snm, sip, (struct vnode *)0, DR_RENAME, cr, ct)) == ENOENT) { error = 0; } rw_exit(&sdp->i_rwlock); + + if (err == 0) + vnevent_rename_src(ITOV(sip), sdvp, snm, ct); + errout: ITIMES(sdp); ITIMES(tdp); diff --git a/usr/src/uts/common/fs/ufs/ufs_vnops.c b/usr/src/uts/common/fs/ufs/ufs_vnops.c index fcffd952ed..c77872b11d 100644 --- a/usr/src/uts/common/fs/ufs/ufs_vnops.c +++ b/usr/src/uts/common/fs/ufs/ufs_vnops.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1984, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -3367,11 +3367,10 @@ ufs_rename( struct inode *ip = NULL; /* check inode */ struct inode *sdp; /* old (source) parent inode */ struct inode *tdp; /* new (target) parent inode */ - struct vnode *svp = NULL; /* source vnode */ struct vnode *tvp = NULL; /* target vnode, if it exists */ struct vnode *realvp; struct ufsvfs *ufsvfsp; - struct ulockfs *ulp; + struct ulockfs *ulp = NULL; struct ufs_slot slot; timestruc_t now; int error; @@ -3380,7 +3379,7 @@ ufs_rename( krwlock_t *first_lock; krwlock_t *second_lock; krwlock_t *reverse_lock; - int serr, terr; + int terr; sdp = VTOI(sdvp); slot.fbp = NULL; @@ -3389,34 +3388,13 @@ ufs_rename( if (VOP_REALVP(tdvp, &realvp, ct) == 0) tdvp = realvp; + /* Must do this before taking locks in case of DNLC miss */ terr = ufs_eventlookup(tdvp, tnm, cr, &tvp); - serr = ufs_eventlookup(sdvp, snm, cr, &svp); - - if ((serr == 0) && ((terr == 0) || (terr == ENOENT))) { - if (tvp != NULL) - vnevent_rename_dest(tvp, tdvp, tnm, ct); - - /* - * Notify the target directory of the rename event - * if source and target directories are not the same. - */ - if (sdvp != tdvp) - vnevent_rename_dest_dir(tdvp, ct); - - if (svp != NULL) - vnevent_rename_src(svp, sdvp, snm, ct); - } - - if (tvp != NULL) - VN_RELE(tvp); - - if (svp != NULL) - VN_RELE(svp); retry_rename: error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RENAME_MASK); if (error) - goto out; + goto unlock; if (ulp) TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_RENAME, @@ -3712,6 +3690,9 @@ retry_firstlock: goto errout; } + if (terr == 0 && tvp != NULL) + vnevent_rename_dest(tvp, tdvp, tnm, ct); + /* * Unlink the source. * Remove the source entry. 
ufs_dirremove() checks that the entry @@ -3723,6 +3704,9 @@ retry_firstlock: DR_RENAME, cr)) == ENOENT) error = 0; + vnevent_rename_src(ITOV(sip), sdvp, snm, ct); + vnevent_rename_dest_dir(tdvp, ITOV(sip), tnm, ct); + errout: if (slot.fbp) fbrelse(slot.fbp, S_OTHER); @@ -3732,15 +3716,17 @@ errout: rw_exit(&sdp->i_rwlock); } - VN_RELE(ITOV(sip)); - unlock: + if (tvp != NULL) + VN_RELE(tvp); + if (sip != NULL) + VN_RELE(ITOV(sip)); + if (ulp) { TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME, trans_size); ufs_lockfs_end(ulp); } -out: return (error); } diff --git a/usr/src/uts/common/fs/vfs.c b/usr/src/uts/common/fs/vfs.c index 6e93d056df..b680b1168c 100644 --- a/usr/src/uts/common/fs/vfs.c +++ b/usr/src/uts/common/fs/vfs.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -235,7 +235,8 @@ fsop_root(vfs_t *vfsp, vnode_t **vpp) * Make sure this root has a path. With lofs, it is possible to have * a NULL mountpoint. */ - if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) { + if (ret == 0 && vfsp->vfs_mntpt != NULL && + (*vpp)->v_path == vn_vpath_empty) { mntpt = vfs_getmntpoint(vfsp); vn_setpath_str(*vpp, refstr_value(mntpt), strlen(refstr_value(mntpt))); @@ -905,6 +906,7 @@ vfs_mountroot(void) vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab"); vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile"); vfs_mountfs("objfs", "objfs", OBJFS_ROOT); + vfs_mountfs("bootfs", "bootfs", "/system/boot"); if (getzoneid() == GLOBAL_ZONEID) { vfs_mountfs("sharefs", "sharefs", "/etc/dfs/sharetab"); @@ -3899,6 +3901,8 @@ vfs_to_modname(const char *vfstype) vfstype = "fdfs"; } else if (strncmp(vfstype, "nfs", 3) == 0) { vfstype = "nfs"; + } else if (strcmp(vfstype, "lxproc") == 0) { + vfstype = "lxprocfs"; } return (vfstype); diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c index 4abb040de0..9fb1ea702a 100644 --- a/usr/src/uts/common/fs/vnode.c +++ b/usr/src/uts/common/fs/vnode.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -66,6 +66,7 @@ #include <fs/fs_subr.h> #include <sys/taskq.h> #include <fs/fs_reparse.h> +#include <sys/time.h> /* Determine if this vnode is a file that is read-only */ #define ISROFILE(vp) \ @@ -102,6 +103,9 @@ kmutex_t vskstat_tree_lock; /* Global variable which enables/disables the vopstats collection */ int vopstats_enabled = 1; +/* Global used for empty/invalid v_path */ +char *vn_vpath_empty = ""; + /* * forward declarations for internal vnode specific data (vsd) */ @@ -200,6 +204,11 @@ static void (**vsd_destructor)(void *); cr = crgetmapped(cr); \ } +#define VOP_LATENCY_10MS 10000000 +#define VOP_LATENCY_100MS 100000000 +#define VOP_LATENCY_1S 1000000000 +#define VOP_LATENCY_10S 10000000000 + /* * Convert stat(2) formats to vnode types and vice versa. (Knows about * numerical order of S_IFMT and vnode types.) 
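The vnode.c hunks that follow retire NULL as the "no path" state for v_path in favor of the shared vn_vpath_empty sentinel, so consumers may always dereference the field and emptiness is tested by comparing against the sentinel pointer. The generic shape of that pattern, sketched in standalone C (the struct and function names here are illustrative, not the kernel's):

#include <stdlib.h>
#include <string.h>

/* One shared sentinel; the field is never NULL and is always printable. */
static char path_empty[] = "";

struct node {
	char *path;
};

static void
node_init(struct node *np)
{
	np->path = path_empty;
}

static void
node_set_path(struct node *np, const char *path)
{
	/* Only heap-allocated paths are freed; the sentinel never is. */
	if (np->path != path_empty)
		free(np->path);
	np->path = strdup(path);
	if (np->path == NULL)
		np->path = path_empty;
}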
@@ -2284,7 +2293,7 @@ vn_cache_constructor(void *buf, void *cdrarg, int kmflags) cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL); rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL); vp->v_femhead = NULL; /* Must be done before vn_reinit() */ - vp->v_path = NULL; + vp->v_path = vn_vpath_empty; vp->v_mpssdata = NULL; vp->v_vsd = NULL; vp->v_fopdata = NULL; @@ -2331,6 +2340,7 @@ void vn_recycle(vnode_t *vp) { ASSERT(vp->v_pages == NULL); + VERIFY(vp->v_path != NULL); /* * XXX - This really belongs in vn_reinit(), but we have some issues @@ -2353,9 +2363,9 @@ vn_recycle(vnode_t *vp) kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead))); vp->v_femhead = NULL; } - if (vp->v_path) { + if (vp->v_path != vn_vpath_empty) { kmem_free(vp->v_path, strlen(vp->v_path) + 1); - vp->v_path = NULL; + vp->v_path = vn_vpath_empty; } if (vp->v_fopdata != NULL) { @@ -2427,9 +2437,10 @@ vn_free(vnode_t *vp) */ ASSERT((vp->v_count == 0) || (vp->v_count == 1)); ASSERT(vp->v_count_dnlc == 0); - if (vp->v_path != NULL) { + VERIFY(vp->v_path != NULL); + if (vp->v_path != vn_vpath_empty) { kmem_free(vp->v_path, strlen(vp->v_path) + 1); - vp->v_path = NULL; + vp->v_path = vn_vpath_empty; } /* If FEM was in use, make sure everything gets cleaned up */ @@ -2516,6 +2527,7 @@ vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct) if (vp == NULL || vp->v_femhead == NULL) { return; } + (void) VOP_VNEVENT(dvp, VE_RENAME_SRC_DIR, vp, name, ct); (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct); } @@ -2530,12 +2542,13 @@ vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name, } void -vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct) +vnevent_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name, + caller_context_t *ct) { if (vp == NULL || vp->v_femhead == NULL) { return; } - (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct); + (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, nvp, name, ct); } void @@ -2951,7 +2964,7 @@ vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp, * the potential for deadlock. 
*/ mutex_enter(&base->v_lock); - if (base->v_path == NULL) { + if (base->v_path == vn_vpath_empty) { mutex_exit(&base->v_lock); return; } @@ -2978,7 +2991,8 @@ vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp, rpath = kmem_alloc(rpathalloc, KM_SLEEP); mutex_enter(&base->v_lock); - if (base->v_path == NULL || strlen(base->v_path) != rpathlen) { + if (base->v_path == vn_vpath_empty || + strlen(base->v_path) != rpathlen) { mutex_exit(&base->v_lock); kmem_free(rpath, rpathalloc); return; @@ -2992,7 +3006,7 @@ vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp, rpath[rpathlen + plen] = '\0'; mutex_enter(&vp->v_lock); - if (vp->v_path != NULL) { + if (vp->v_path != vn_vpath_empty) { mutex_exit(&vp->v_lock); kmem_free(rpath, rpathalloc); } else { @@ -3012,7 +3026,7 @@ vn_setpath_str(struct vnode *vp, const char *str, size_t len) char *buf = kmem_alloc(len + 1, KM_SLEEP); mutex_enter(&vp->v_lock); - if (vp->v_path != NULL) { + if (vp->v_path != vn_vpath_empty) { mutex_exit(&vp->v_lock); kmem_free(buf, len + 1); return; @@ -3036,10 +3050,10 @@ vn_renamepath(vnode_t *dvp, vnode_t *vp, const char *nm, size_t len) mutex_enter(&vp->v_lock); tmp = vp->v_path; - vp->v_path = NULL; + vp->v_path = vn_vpath_empty; mutex_exit(&vp->v_lock); vn_setpath(rootdir, dvp, vp, nm, len); - if (tmp != NULL) + if (tmp != vn_vpath_empty) kmem_free(tmp, strlen(tmp) + 1); } @@ -3054,7 +3068,7 @@ vn_copypath(struct vnode *src, struct vnode *dst) int alloc; mutex_enter(&src->v_lock); - if (src->v_path == NULL) { + if (src->v_path == vn_vpath_empty) { mutex_exit(&src->v_lock); return; } @@ -3064,7 +3078,7 @@ vn_copypath(struct vnode *src, struct vnode *dst) mutex_exit(&src->v_lock); buf = kmem_alloc(alloc, KM_SLEEP); mutex_enter(&src->v_lock); - if (src->v_path == NULL || strlen(src->v_path) + 1 != alloc) { + if (src->v_path == vn_vpath_empty || strlen(src->v_path) + 1 != alloc) { mutex_exit(&src->v_lock); kmem_free(buf, alloc); return; @@ -3073,7 +3087,7 @@ vn_copypath(struct vnode *src, struct vnode *dst) mutex_exit(&src->v_lock); mutex_enter(&dst->v_lock); - if (dst->v_path != NULL) { + if (dst->v_path != vn_vpath_empty) { mutex_exit(&dst->v_lock); kmem_free(buf, alloc); return; @@ -3231,14 +3245,57 @@ fop_read( cred_t *cr, caller_context_t *ct) { - int err; ssize_t resid_start = uiop->uio_resid; + zone_t *zonep = curzone; + zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats; + + hrtime_t start, lat; + ssize_t len; + int err; + + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + start = gethrtime(); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_runq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + } VOPXID_MAP_CR(vp, cr); err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct); - VOPSTATS_UPDATE_IO(vp, read, - read_bytes, (resid_start - uiop->uio_resid)); + len = resid_start - uiop->uio_resid; + + VOPSTATS_UPDATE_IO(vp, read, read_bytes, len); + + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.reads++; + zonep->zone_vfs_rwstats.nread += len; + kstat_runq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + if (lat < VOP_LATENCY_100MS) + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + 
atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + } + return (err); } @@ -3250,14 +3307,62 @@ fop_write( cred_t *cr, caller_context_t *ct) { - int err; ssize_t resid_start = uiop->uio_resid; + zone_t *zonep = curzone; + zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats; + + hrtime_t start, lat; + ssize_t len; + int err; + + /* + * For the purposes of VFS kstat consumers, the "waitq" calculation is + * repurposed as the active queue for VFS write operations. There's no + * actual wait queue for VFS operations. + */ + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + start = gethrtime(); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_waitq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + } VOPXID_MAP_CR(vp, cr); err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct); - VOPSTATS_UPDATE_IO(vp, write, - write_bytes, (resid_start - uiop->uio_resid)); + len = resid_start - uiop->uio_resid; + + VOPSTATS_UPDATE_IO(vp, write, write_bytes, len); + + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.writes++; + zonep->zone_vfs_rwstats.nwritten += len; + kstat_waitq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + if (lat < VOP_LATENCY_100MS) + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + } + return (err); } @@ -3421,7 +3526,7 @@ fop_lookup( } if (ret == 0 && *vpp) { VOPSTATS_UPDATE(*vpp, lookup); - if ((*vpp)->v_path == NULL) { + if ((*vpp)->v_path == vn_vpath_empty) { vn_setpath(rootdir, dvp, *vpp, nm, strlen(nm)); } } @@ -3463,7 +3568,7 @@ fop_create( (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp); if (ret == 0 && *vpp) { VOPSTATS_UPDATE(*vpp, create); - if ((*vpp)->v_path == NULL) { + if ((*vpp)->v_path == vn_vpath_empty) { vn_setpath(rootdir, dvp, *vpp, name, strlen(name)); } } @@ -3585,7 +3690,7 @@ fop_mkdir( (dvp, dirname, vap, vpp, cr, ct, flags, vsecp); if (ret == 0 && *vpp) { VOPSTATS_UPDATE(*vpp, mkdir); - if ((*vpp)->v_path == NULL) { + if ((*vpp)->v_path == vn_vpath_empty) { vn_setpath(rootdir, dvp, *vpp, dirname, strlen(dirname)); } diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 09d2e1dd8f..d2b584cf02 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -129,6 +129,7 @@ #include <sys/vdev.h> #include <sys/vdev_impl.h> #include <sys/dsl_pool.h> +#include <sys/zfs_zone.h> #include <sys/multilist.h> #ifdef _KERNEL #include <sys/vmsystm.h> @@ -4277,6 +4278,14 @@ top: rzio = zio_read(pio, spa, bp, buf->b_data, size, arc_read_done, buf, priority, zio_flags, zb); + /* + * At this point, this read I/O has 
already missed in the ARC + * and will be going through to the disk. The I/O throttle + * should delay this I/O if this zone is using more than its I/O + * priority allows. + */ + zfs_zone_io_throttle(ZFS_ZONE_IOP_READ); + if (*arc_flags & ARC_FLAG_WAIT) return (zio_wait(rzio)); diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index 465dfd08b2..4b644f7479 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -617,8 +617,19 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); if (bonuslen < DN_MAX_BONUSLEN) bzero(db->db.db_data, DN_MAX_BONUSLEN); - if (bonuslen) - bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); + + if (bonuslen) { + /* + * Absent byzantine on-disk corruption, we fully expect + * our bonuslen to be no more than DN_MAX_BONUSLEN -- + * but we nonetheless explicitly clamp it on the bcopy() + * to prevent any on-disk corruption from becoming + * rampant in-kernel corruption. + */ + bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, + MIN(bonuslen, DN_MAX_BONUSLEN)); + } + DB_DNODE_EXIT(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index 43bcfee91c..bfe5cda5e4 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -1799,7 +1799,6 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) if (!zio_checksum_table[checksum].ci_dedup) dedup_verify = B_TRUE; } - /* * Enable nopwrite if we have a cryptographically secure * checksum that has no known collisions (i.e. SHA-256) diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c index 56ce2f0c27..c2b1dca1c0 100644 --- a/usr/src/uts/common/fs/zfs/dmu_tx.c +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c @@ -38,11 +38,11 @@ #include <sys/sa_impl.h> #include <sys/zfs_context.h> #include <sys/varargs.h> +#include <sys/zfs_zone.h> typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, uint64_t arg1, uint64_t arg2); - dmu_tx_t * dmu_tx_create_dd(dsl_dir_t *dd) { @@ -223,6 +223,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) if (len == 0) return; + zfs_zone_io_throttle(ZFS_ZONE_IOP_LOGICAL_WRITE); + min_bs = SPA_MINBLOCKSHIFT; max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1; min_ibs = DN_MIN_INDBLKSHIFT; diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c index 6a20ab3ca2..1222b8933a 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dir.c +++ b/usr/src/uts/common/fs/zfs/dsl_dir.c @@ -42,6 +42,7 @@ #include <sys/zio.h> #include <sys/arc.h> #include <sys/sunddi.h> +#include <sys/zfs_zone.h> #include <sys/zfeature.h> #include <sys/policy.h> #include <sys/zfs_znode.h> @@ -1266,7 +1267,7 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, * locks are held. 
*/ txg_delay(dd->dd_pool, tx->tx_txg, - MSEC2NSEC(10), MSEC2NSEC(10)); + zfs_zone_txg_delay(), MSEC2NSEC(10)); err = SET_ERROR(ERESTART); } } diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c index 8c5d820ede..8ad7c789d3 100644 --- a/usr/src/uts/common/fs/zfs/dsl_pool.c +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -42,6 +42,7 @@ #include <sys/zfs_znode.h> #include <sys/spa_impl.h> #include <sys/dsl_deadlist.h> +#include <sys/zfs_zone.h> #include <sys/bptree.h> #include <sys/zfeature.h> #include <sys/zil_impl.h> diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 9ba9fd3841..02cccae29d 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -63,6 +63,11 @@ uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ int zfs_condense_pct = 200; /* + * Never condense any space map. This is for debugging/recovery only. + */ +int zfs_condense_never = 0; + +/* * Condensing a metaslab is not guaranteed to actually reduce the amount of * space used on disk. In particular, a space map uses data in increments of * MAX(1 << ashift, space_map_blksize), so a metaslab might use the @@ -1656,6 +1661,9 @@ metaslab_should_condense(metaslab_t *msp) ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); + if (zfs_condense_never != 0) + return (B_FALSE); + /* * Use the ms_size_tree range tree, which is ordered by size, to * obtain the largest segment in the free tree. We always condense diff --git a/usr/src/uts/common/fs/zfs/sa.c b/usr/src/uts/common/fs/zfs/sa.c index 0b5b37f5fb..5a2602271b 100644 --- a/usr/src/uts/common/fs/zfs/sa.c +++ b/usr/src/uts/common/fs/zfs/sa.c @@ -24,6 +24,7 @@ * Portions Copyright 2011 iXsystems, Inc * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2015 Joyent, Inc. All rights reserved. */ #include <sys/zfs_context.h> @@ -406,15 +407,18 @@ sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, { sa_os_t *sa = os->os_sa; sa_lot_t *tb, *findtb; - int i; + int i, size; avl_index_t loc; ASSERT(MUTEX_HELD(&sa->sa_lock)); tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP); tb->lot_attr_count = attr_count; - tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, - KM_SLEEP); - bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count); + + if ((size = sizeof (sa_attr_type_t) * attr_count) != 0) { + tb->lot_attrs = kmem_alloc(size, KM_SLEEP); + bcopy(attrs, tb->lot_attrs, size); + } + tb->lot_num = lot_num; tb->lot_hash = hash; tb->lot_instance = 0; diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 17a18a3199..bde9b0a7ab 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -116,6 +116,7 @@ struct vdev_queue { avl_tree_t vq_read_offset_tree; avl_tree_t vq_write_offset_tree; uint64_t vq_last_offset; + zoneid_t vq_last_zone_id; hrtime_t vq_io_complete_ts; /* time last i/o completed */ kmutex_t vq_lock; }; diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_zone.h b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h new file mode 100644 index 0000000000..f1431b3f55 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2015, Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_FS_ZFS_ZONE_H +#define _SYS_FS_ZFS_ZONE_H + +#ifdef _KERNEL +#include <sys/isa_defs.h> +#include <sys/types32.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + ZFS_ZONE_IOP_READ = 0, + ZFS_ZONE_IOP_WRITE, + ZFS_ZONE_IOP_LOGICAL_WRITE, +} zfs_zone_iop_type_t; + +extern void zfs_zone_io_throttle(zfs_zone_iop_type_t); + +extern void zfs_zone_zio_init(zio_t *); +extern void zfs_zone_zio_start(zio_t *); +extern void zfs_zone_zio_done(zio_t *); +extern void zfs_zone_zio_dequeue(zio_t *); +extern void zfs_zone_zio_enqueue(zio_t *); +extern void zfs_zone_report_txg_sync(void *); +extern hrtime_t zfs_zone_txg_delay(); +#ifdef _KERNEL +extern zio_t *zfs_zone_schedule(vdev_queue_t *, zio_priority_t, avl_index_t, + avl_tree_t *); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_ZONE_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index 198dc92387..78c9355438 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -421,7 +421,8 @@ struct zio { const zio_vsd_ops_t *io_vsd_ops; uint64_t io_offset; - hrtime_t io_timestamp; + hrtime_t io_timestamp; /* time I/O entered zio pipeline */ + hrtime_t io_dispatched; /* time I/O was dispatched to disk */ avl_node_t io_queue_node; avl_node_t io_offset_node; @@ -450,6 +451,7 @@ struct zio { zio_cksum_report_t *io_cksum_report; uint64_t io_ena; + zoneid_t io_zoneid; /* zone which originated this I/O */ /* Taskq dispatching state */ taskq_ent_t io_tqent; }; diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c index 191259e75b..915c9bb4b2 100644 --- a/usr/src/uts/common/fs/zfs/txg.c +++ b/usr/src/uts/common/fs/zfs/txg.c @@ -31,6 +31,7 @@ #include <sys/dsl_pool.h> #include <sys/dsl_scan.h> #include <sys/callb.h> +#include <sys/zfs_zone.h> /* * ZFS Transaction Groups @@ -506,6 +507,8 @@ txg_sync_thread(dsl_pool_t *dp) txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); mutex_exit(&tx->tx_sync_lock); + zfs_zone_report_txg_sync(dp); + start = ddi_get_lbolt(); spa_sync(spa, txg); delta = ddi_get_lbolt() - start; diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index ed4a8b773b..e4cc42452a 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -21,11 +21,13 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 Joyent, Inc. All rights reserved. 
*/ #include <sys/zfs_context.h> +#include <sys/zfs_zone.h> #include <sys/spa_impl.h> #include <sys/refcount.h> #include <sys/vdev_disk.h> @@ -44,6 +46,11 @@ extern ldi_ident_t zfs_li; static void vdev_disk_close(vdev_t *); +typedef struct vdev_disk_buf { + buf_t vdb_buf; + zio_t *vdb_io; +} vdev_disk_buf_t; + typedef struct vdev_disk_ldi_cb { list_node_t lcb_next; ldi_callback_id_t lcb_id; @@ -127,6 +134,8 @@ vdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie, int ldi_result, void *arg, void *ev_data) { vdev_t *vd = (vdev_t *)arg; + vdev_disk_t *dvd = vd->vdev_tsd; + vdev_disk_ldi_cb_t *lcb; /* * Ignore events other than offline. @@ -586,6 +595,7 @@ static void vdev_disk_close(vdev_t *vd) { vdev_disk_t *dvd = vd->vdev_tsd; + vdev_disk_ldi_cb_t *lcb; if (vd->vdev_reopening || dvd == NULL) return; @@ -812,6 +822,8 @@ vdev_disk_io_start(zio_t *zio) bp->b_bufsize = zio->io_size; bp->b_iodone = (int (*)())vdev_disk_io_intr; + zfs_zone_zio_start(zio); + /* ldi_strategy() will return non-zero only on programming errors */ VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0); } @@ -821,6 +833,8 @@ vdev_disk_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; + zfs_zone_zio_done(zio); + /* * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if * the device has been removed. If this is the case, then we trigger an diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index 79d6e13b3b..4c02013405 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2013, Joyent, Inc. All rights reserved. */ /* @@ -33,6 +34,7 @@ #include <sys/zio.h> #include <sys/avl.h> #include <sys/dsl_pool.h> +#include <sys/zfs_zone.h> /* * ZFS I/O Scheduler @@ -141,7 +143,7 @@ uint32_t zfs_vdev_sync_write_min_active = 10; uint32_t zfs_vdev_sync_write_max_active = 10; uint32_t zfs_vdev_async_read_min_active = 1; uint32_t zfs_vdev_async_read_max_active = 3; -uint32_t zfs_vdev_async_write_min_active = 1; +uint32_t zfs_vdev_async_write_min_active = 3; uint32_t zfs_vdev_async_write_max_active = 10; uint32_t zfs_vdev_scrub_min_active = 1; uint32_t zfs_vdev_scrub_max_active = 2; @@ -237,6 +239,8 @@ vdev_queue_init(vdev_t *vd) vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); + vq->vq_last_zone_id = 0; + for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { int (*compfn) (const void *, const void *); @@ -274,6 +278,7 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + zfs_zone_zio_enqueue(zio); avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); avl_add(vdev_queue_type_tree(vq, zio->io_type), zio); @@ -289,6 +294,7 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + zfs_zone_zio_dequeue(zio); avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio); @@ -648,7 +654,11 @@ again: search.io_timestamp = 0; search.io_offset = vq->vq_last_offset + 1; VERIFY3P(avl_find(tree, &search, &idx), ==, NULL); +#ifdef _KERNEL + zio = zfs_zone_schedule(vq, p, idx, tree); +#else zio = avl_nearest(tree, idx, AVL_AFTER); +#endif if (zio == NULL) zio = avl_first(tree); ASSERT3U(zio->io_priority, ==, p); diff --git a/usr/src/uts/common/fs/zfs/zfs_dir.c 
b/usr/src/uts/common/fs/zfs/zfs_dir.c index bd7424b55b..2adb297937 100644 --- a/usr/src/uts/common/fs/zfs/zfs_dir.c +++ b/usr/src/uts/common/fs/zfs/zfs_dir.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. */ #include <sys/types.h> @@ -847,9 +848,9 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, if (zp->z_links <= zp_is_dir) { zfs_panic_recover("zfs: link count on %s is %u, " "should be at least %u", - zp->z_vnode->v_path ? zp->z_vnode->v_path : - "<unknown>", (int)zp->z_links, - zp_is_dir + 1); + zp->z_vnode->v_path != vn_vpath_empty ? + zp->z_vnode->v_path : "<unknown>", + (int)zp->z_links, zp_is_dir + 1); zp->z_links = zp_is_dir + 1; } if (--zp->z_links == zp_is_dir) { diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index 692b49611d..47b0b6f650 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -22,8 +22,8 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska - * Copyright 2015 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. @@ -611,9 +611,10 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, * Check permissions for special properties. */ switch (prop) { + case ZFS_PROP_DEDUP: case ZFS_PROP_ZONED: /* - * Disallow setting of 'zoned' from within a local zone. + * Disallow setting these properties from within a local zone. 
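+ * (Dedup is grouped with 'zoned' here most likely because enabling
+ * deduplication has pool-wide memory and performance costs that a
+ * non-global zone should not be able to impose on the rest of the system.)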
*/ if (!INGLOBALZONE(curproc)) return (SET_ERROR(EPERM)); @@ -943,6 +944,9 @@ zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; + if (secpolicy_fs_import(cr) != 0) + return (set_errno(EPERM)); + if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_RECEIVE, cr)) != 0) return (error); @@ -2034,7 +2038,8 @@ zfs_ioc_vdev_setfru(zfs_cmd_t *zc) } static int -zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) +zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os, + boolean_t cachedpropsonly) { int error = 0; nvlist_t *nv; @@ -2052,7 +2057,8 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) * XXX reading with out owning */ if (!zc->zc_objset_stats.dds_inconsistent && - dmu_objset_type(os) == DMU_OST_ZVOL) { + dmu_objset_type(os) == DMU_OST_ZVOL && + !cachedpropsonly) { error = zvol_get_stats(os, nv); if (error == EIO) return (error); @@ -2079,11 +2085,24 @@ static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { objset_t *os; + nvlist_t *nvl = NULL; + boolean_t cachedpropsonly = B_FALSE; int error; + if (zc->zc_nvlist_src != NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl) != 0)) + return (error); + + if (nvl != NULL) { + (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly", + &cachedpropsonly); + nvlist_free(nvl); + } + error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error == 0) { - error = zfs_ioc_objset_stats_impl(zc, os); + error = zfs_ioc_objset_stats_impl(zc, os, cachedpropsonly); dmu_objset_rele(os, FTAG); } @@ -2277,8 +2296,21 @@ static int zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) { objset_t *os; + nvlist_t *nvl = NULL; + boolean_t cachedpropsonly = B_FALSE; int error; + if (zc->zc_nvlist_src != NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl) != 0)) + return (error); + + if (nvl != NULL) { + (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly", + &cachedpropsonly); + nvlist_free(nvl); + } + error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) { return (error == ENOENT ? 
ESRCH : error); @@ -2307,8 +2339,10 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) objset_t *ossnap; error = dmu_objset_from_ds(ds, &ossnap); - if (error == 0) - error = zfs_ioc_objset_stats_impl(zc, ossnap); + if (error == 0) { + error = zfs_ioc_objset_stats_impl(zc, + ossnap, cachedpropsonly); + } dsl_dataset_rele(ds, FTAG); } } else if (error == ENOENT) { @@ -3018,6 +3052,7 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; + int error; ASSERT(zplprops != NULL); @@ -3061,8 +3096,9 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); - if (norm == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0); + if (norm == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); @@ -3071,13 +3107,15 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, */ if (norm) u8 = 1; - if (u8 == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0); + if (u8 == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); - if (sense == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0); + if (sense == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c index 8cf83b399a..f9b7986c56 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -1950,6 +1951,17 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) if (zfsvfs->z_ctldir != NULL) zfsctl_destroy(zfsvfs); + /* + * If we're doing a forced unmount on a dataset which still has + * references and is in a zone, then we need to cleanup the zone + * reference at this point or else the zone will never be able to + * shutdown. + */ + if ((fflag & MS_FORCE) && vfsp->vfs_count > 1 && vfsp->vfs_zone) { + zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref, ZONE_REF_VFS); + vfsp->vfs_zone = NULL; + } + return (0); } diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c index bf75097d2a..ddec59ffc9 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vnops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c @@ -22,11 +22,16 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. All rights reserved. */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ +/* + * Copyright (c) 2013, Joyent, Inc. All rights reserved. 
+ */ + #include <sys/types.h> #include <sys/param.h> #include <sys/time.h> @@ -662,6 +667,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) rl_t *rl; int max_blksz = zfsvfs->z_max_blksz; int error = 0; + int prev_error; arc_buf_t *abuf; iovec_t *aiov = NULL; xuio_t *xuio = NULL; @@ -683,6 +689,17 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) limit = MAXOFFSET_T; + /* + * Pre-fault the pages to ensure slow (eg NFS) pages + * don't hold up txg. + * Skip this if uio contains loaned arc_buf. + */ + if ((uio->uio_extflg == UIO_XUIO) && + (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) + xuio = (xuio_t *)uio; + else + uio_prefaultpages(n, uio); + ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); @@ -735,17 +752,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) } /* - * Pre-fault the pages to ensure slow (eg NFS) pages - * don't hold up txg. - * Skip this if uio contains loaned arc_buf. - */ - if ((uio->uio_extflg == UIO_XUIO) && - (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) - xuio = (xuio_t *)uio; - else - uio_prefaultpages(MIN(n, max_blksz), uio); - - /* * If in append mode, set the io offset pointer to eof. */ if (ioflag & FAPPEND) { @@ -966,7 +972,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) while ((end_size = zp->z_size) < uio->uio_loffset) { (void) atomic_cas_64(&zp->z_size, end_size, uio->uio_loffset); - ASSERT(error == 0); } /* * If we are replaying and eof is non zero then force @@ -976,18 +981,20 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) zp->z_size = zfsvfs->z_replay_eof; + /* + * Keep track of a possible pre-existing error from a partial + * write via dmu_write_uio_dbuf above. + */ + prev_error = error; error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); dmu_tx_commit(tx); - if (error != 0) + if (prev_error != 0 || error != 0) break; ASSERT(tx_bytes == nbytes); n -= nbytes; - - if (!xuio && n > 0) - uio_prefaultpages(MIN(n, max_blksz), uio); } zfs_range_unlock(rl); @@ -3659,18 +3666,6 @@ top: } } - vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); - if (tzp) - vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct); - - /* - * notify the target directory if it is not the same - * as source directory. 
- */ - if (tdvp != sdvp) { - vnevent_rename_dest_dir(tdvp, ct); - } - tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); @@ -3711,8 +3706,12 @@ top: return (error); } - if (tzp) /* Attempt to remove the existing target */ + if (tzp) { + /* Attempt to remove the existing target */ error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); + if (error == 0) + vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct); + } if (error == 0) { error = zfs_link_create(tdl, szp, tx, ZRENAMING); @@ -3754,6 +3753,11 @@ top: } dmu_tx_commit(tx); + + if (error == 0) { + vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); + vnevent_rename_dest_dir(tdvp, ZTOV(szp), tnm, ct); + } out: if (zl != NULL) zfs_rename_unlock(&zl); @@ -4244,6 +4248,8 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); + err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); } dmu_tx_commit(tx); @@ -4779,10 +4785,6 @@ zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages); atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages); - if ((flags & MAP_SHARED) && (prot & PROT_WRITE) && - vn_has_cached_data(vp)) - (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct); - return (0); } diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c new file mode 100644 index 0000000000..4861c64f8e --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs_zone.c @@ -0,0 +1,1336 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015, Joyent, Inc. All rights reserved. + */ + +/* + * The ZFS/Zone I/O throttle and scheduler attempts to ensure fair access to + * ZFS I/O resources for each zone. + * + * I/O contention can be major pain point on a multi-tenant system. A single + * zone can issue a stream of I/O operations, usually synchronous writes, which + * disrupt I/O performance for all other zones. This problem is further + * exacerbated by ZFS, which buffers all asynchronous writes in a single TXG, + * a set of blocks which are atomically synced to disk. The process of + * syncing a TXG can occupy all of a device's I/O bandwidth, thereby starving + * out any pending read operations. + * + * There are two facets to this capability; the throttle and the scheduler. + * + * Throttle + * + * The requirements on the throttle are: + * + * 1) Ensure consistent and predictable I/O latency across all zones. + * 2) Sequential and random workloads have very different characteristics, + * so it is a non-starter to track IOPS or throughput. + * 3) A zone should be able to use the full disk bandwidth if no other zone + * is actively using the disk. + * + * The throttle has two components: one to track and account for each zone's + * I/O requests, and another to throttle each zone's operations when it + * exceeds its fair share of disk I/O. 
When the throttle detects that a zone is + * consuming more than is appropriate, each read or write system call is + * delayed by up to 100 microseconds, which we've found is sufficient to allow + * other zones to interleave I/O requests during those delays. + * + * Note: The throttle will delay each logical I/O (as opposed to the physical + * I/O which will likely be issued asynchronously), so it may be easier to + * think of the I/O throttle delaying each read/write syscall instead of the + * actual I/O operation. For each zone, the throttle tracks an ongoing average + * of read and write operations performed to determine the overall I/O + * utilization for each zone. + * + * The throttle calculates a I/O utilization metric for each zone using the + * following formula: + * + * (# of read syscalls) x (Average read latency) + + * (# of write syscalls) x (Average write latency) + * + * Once each zone has its utilization metric, the I/O throttle will compare I/O + * utilization across all zones, and if a zone has a higher-than-average I/O + * utilization, system calls from that zone are throttled. That is, if one + * zone has a much higher utilization, that zone's delay is increased by 5 + * microseconds, up to a maximum of 100 microseconds. Conversely, if a zone is + * already throttled and has a lower utilization than average, its delay will + * be lowered by 5 microseconds. + * + * The throttle calculation is driven by IO activity, but since IO does not + * happen at fixed intervals, timestamps are used to track when the last update + * was made and to drive recalculation. + * + * The throttle recalculates each zone's I/O usage and throttle delay (if any) + * on the zfs_zone_adjust_time interval. Overall I/O latency is maintained as + * a decayed average which is updated on the zfs_zone_sys_avg_cycle interval. + * + * Scheduler + * + * The I/O scheduler manages the vdev queues – the queues of pending I/Os to + * issue to the disks. It only makes scheduling decisions for the two + * synchronous I/O queues (read & write). + * + * The scheduler maintains how many I/Os in the queue are from each zone, and + * if one zone has a disproportionately large number of I/Os in the queue, the + * scheduler will allow certain I/Os from the underutilized zones to be "bumped" + * and pulled from the middle of the queue. This bump allows zones with a small + * number of I/Os (so small they may not even be taken into account by the + * throttle) to complete quickly instead of waiting behind dozens of I/Os from + * other zones. + */ + +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zfs_zone.h> + +#ifndef _KERNEL + +/* + * Stubs for when compiling for user-land. + */ + +void +zfs_zone_io_throttle(zfs_zone_iop_type_t type) +{ +} + +void +zfs_zone_zio_init(zio_t *zp) +{ +} + +void +zfs_zone_zio_start(zio_t *zp) +{ +} + +void +zfs_zone_zio_done(zio_t *zp) +{ +} + +void +zfs_zone_zio_dequeue(zio_t *zp) +{ +} + +void +zfs_zone_zio_enqueue(zio_t *zp) +{ +} + +/*ARGSUSED*/ +void +zfs_zone_report_txg_sync(void *dp) +{ +} + +hrtime_t +zfs_zone_txg_delay() +{ + return (MSEC2NSEC(10)); +} + +#else + +/* + * The real code. 
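+ *
+ * As a rough worked example of the utilization metric described in the block
+ * comment above (hypothetical numbers, not taken from this change): a zone
+ * that issued 1000 reads averaging 2000 usec and 200 writes averaging 500 usec
+ * over the last window scores (1000 x 2000) + (200 x 500) = 2,100,000. If that
+ * is above the zone's fair share while the disk is busy, its per-syscall delay
+ * is stepped up by zfs_zone_delay_step (5 usec) toward zfs_zone_delay_ceiling
+ * (100 usec); otherwise it is stepped back down.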
+ */ + +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/atomic.h> +#include <sys/zio.h> +#include <sys/zone.h> +#include <sys/avl.h> +#include <sys/sdt.h> +#include <sys/ddi.h> + +/* + * The zone throttle delays read and write operations from certain zones based + * on each zone's IO utilitzation. Once a cycle (defined by zfs_zone_cycle_time + * below), the delays for each zone are recalculated based on the utilization + * over the previous window. + */ +boolean_t zfs_zone_delay_enable = B_TRUE; /* enable IO throttle */ +uint16_t zfs_zone_delay_step = 5; /* usec amnt to change delay */ +uint16_t zfs_zone_delay_ceiling = 100; /* usec delay max */ + +boolean_t zfs_zone_priority_enable = B_TRUE; /* enable IO priority */ + +/* + * For certain workloads, one zone may be issuing primarily sequential I/O and + * another primarily random I/O. The sequential I/O will complete much more + * quickly than the random I/O, driving the average system latency for those + * operations way down. As a result, the random I/O may be throttled back, even + * though the sequential I/O should be throttled to allow the random I/O more + * access to the disk. + * + * This tunable limits the discrepancy between the read and write system + * latency. If one becomes excessively high, this tunable prevents the I/O + * throttler from exacerbating the imbalance. + */ +uint_t zfs_zone_rw_lat_limit = 10; + +/* + * The I/O throttle will only start delaying zones when it detects disk + * utilization has reached a certain level. This tunable controls the + * threshold at which the throttle will start delaying zones. When the number + * of vdevs is small, the calculation should correspond closely with the %b + * column from iostat -- but as the number of vdevs becomes large, it will + * correlate less and less to any single device (therefore making it a poor + * approximation for the actual I/O utilization on such systems). We + * therefore use our derived utilization conservatively: we know that low + * derived utilization does indeed correlate to low I/O use -- but that a high + * rate of derived utilization does not necesarily alone denote saturation; + * where we see a high rate of utilization, we also look for laggard I/Os to + * attempt to detect saturation. + */ +uint_t zfs_zone_util_threshold = 80; +uint_t zfs_zone_underutil_threshold = 60; + +/* + * There are three important tunables here: zfs_zone_laggard_threshold denotes + * the threshold at which an I/O is considered to be of notably high latency; + * zfs_zone_laggard_recent denotes the number of microseconds before the + * current time after which the last laggard is considered to be sufficiently + * recent to merit increasing the throttle; zfs_zone_laggard_ancient denotes + * the microseconds before the current time before which the last laggard is + * considered to be sufficiently old to merit decreasing the throttle. The + * most important tunable of these three is the zfs_zone_laggard_threshold: in + * modeling data from a large public cloud, this tunable was found to have a + * much greater effect on the throttle than the two time-based thresholds. + * This must be set high enough to not result in spurious throttling, but not + * so high as to allow pathological I/O to persist in the system. 
+ */ +uint_t zfs_zone_laggard_threshold = 50000; /* 50 ms */ +uint_t zfs_zone_laggard_recent = 1000000; /* 1000 ms */ +uint_t zfs_zone_laggard_ancient = 5000000; /* 5000 ms */ + +/* + * Throughout this subsystem, our timestamps are in microseconds. Our system + * average cycle is one second or 1 million microseconds. Our zone counter + * update cycle is two seconds or 2 million microseconds. We use a longer + * duration for that cycle because some ops can see a little over two seconds of + * latency when they are being starved by another zone. + */ +uint_t zfs_zone_sys_avg_cycle = 1000000; /* 1 s */ +uint_t zfs_zone_cycle_time = 2000000; /* 2 s */ + +/* + * How often the I/O throttle will reevaluate each zone's utilization, in + * microseconds. Default is 1/4 sec. + */ +uint_t zfs_zone_adjust_time = 250000; /* 250 ms */ + +typedef struct { + hrtime_t cycle_start; + int cycle_cnt; + hrtime_t cycle_lat; + hrtime_t sys_avg_lat; +} sys_lat_cycle_t; + +typedef struct { + hrtime_t zi_now; + uint_t zi_avgrlat; + uint_t zi_avgwlat; + uint64_t zi_totpri; + uint64_t zi_totutil; + int zi_active; + uint_t zi_diskutil; + boolean_t zi_underutil; + boolean_t zi_overutil; +} zoneio_stats_t; + +static sys_lat_cycle_t rd_lat; +static sys_lat_cycle_t wr_lat; + +/* + * Some basic disk stats to determine disk utilization. The utilization info + * for all disks on the system is aggregated into these values. + * + * Overall disk utilization for the current cycle is calculated as: + * + * ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) + * ---------------------------------------------- + * ((now - zfs_zone_last_checked) * 1000); + */ +kmutex_t zfs_disk_lock; /* protects the following: */ +uint_t zfs_disk_rcnt; /* Number of outstanding IOs */ +hrtime_t zfs_disk_rtime = 0; /* cummulative sum of time performing IO */ +hrtime_t zfs_disk_rlastupdate = 0; /* time last IO dispatched */ + +hrtime_t zfs_disk_last_rtime = 0; /* prev. cycle's zfs_disk_rtime val */ +/* time that we last updated per-zone throttle info */ +hrtime_t zfs_zone_last_checked = 0; +hrtime_t zfs_disk_last_laggard = 0; + +/* + * Data used to keep track of how often txg sync is running. + */ +extern int zfs_txg_timeout; +static uint_t txg_last_check; +static uint_t txg_cnt; +static uint_t txg_sync_rate; + +boolean_t zfs_zone_schedule_enable = B_TRUE; /* enable IO sched. */ +/* + * Threshold for when zio scheduling should kick in. + * + * This threshold is based on the zfs_vdev_sync_read_max_active value for the + * number of I/Os that can be pending on a device. If there are more than the + * max_active ops already queued up, beyond those already issued to the vdev, + * then use zone-based scheduling to get the next synchronous zio. + */ +uint32_t zfs_zone_schedule_thresh = 10; + +/* + * On each pass of the scheduler we increment the zone's weight (up to this + * maximum). The weight is used by the scheduler to prevent starvation so + * that zones which haven't been able to do any IO over many iterations + * will max out thier weight to this value. + */ +#define SCHED_WEIGHT_MAX 20 + +/* + * Tunables for delay throttling when TXG sync is occurring. + * + * If the zone is performing a write and we're doing above normal TXG syncing, + * then throttle for longer than normal. The zone's wait time is multiplied + * by the scale (zfs_zone_txg_throttle_scale). 
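+ * For instance (hypothetical figures): a zone currently being delayed 40 usec
+ * per write syscall would instead wait 40 * zfs_zone_txg_throttle_scale =
+ * 80 usec for as long as TXGs are syncing faster than the normal
+ * zfs_txg_timeout cadence.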
+ */ +int zfs_zone_txg_throttle_scale = 2; +hrtime_t zfs_zone_txg_delay_nsec = MSEC2NSEC(20); + +typedef struct { + int zq_qdepth; + zio_priority_t zq_queue; + int zq_priority; + int zq_wt; + zoneid_t zq_zoneid; +} zone_q_bump_t; + +/* + * This uses gethrtime() but returns a value in usecs. + */ +#define GET_USEC_TIME (gethrtime() / 1000) +#define NANO_TO_MICRO(x) (x / (NANOSEC / MICROSEC)) + +/* + * Keep track of the zone's ZFS IOPs. + * + * See the comment on the zfs_zone_io_throttle function for which/how IOPs are + * accounted for. + * + * If the number of ops is >1 then we can just use that value. However, + * if the number of ops is <2 then we might have a zone which is trying to do + * IO but is not able to get any ops through the system. We don't want to lose + * track of this zone so we factor in its decayed count into the current count. + * + * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed count. + * However, since this calculation is driven by IO activity and since IO does + * not happen at fixed intervals, we use a timestamp to see when the last update + * was made. If it was more than one cycle ago, then we need to decay the + * historical count by the proper number of additional cycles in which no IO was + * performed. + * + * Return a time delta indicating how far into the current cycle we are or 0 + * if the last IO was more than a cycle ago. + */ +static hrtime_t +compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp) +{ + hrtime_t delta; + int gen_cnt; + + /* + * Check if its time to recompute a new zone count. + * If we're still collecting data for the current cycle, return false. + */ + delta = unow - cp->cycle_start; + if (delta < zfs_zone_cycle_time) + return (delta); + + /* A previous cycle is past, compute the new zone count. */ + + /* + * Figure out how many generations we have to decay the historical + * count, since multiple cycles may have elapsed since our last IO. + * We depend on int rounding here. + */ + gen_cnt = (int)(delta / zfs_zone_cycle_time); + + /* If more than 5 cycles since last the IO, reset count. */ + if (gen_cnt > 5) { + cp->zone_avg_cnt = 0; + } else { + /* Update the count. */ + int i; + + /* + * If the zone did more than 1 IO, just use its current count + * as the historical value, otherwise decay the historical + * count and factor that into the new historical count. We + * pick a threshold > 1 so that we don't lose track of IO due + * to int rounding. + */ + if (cp->cycle_cnt > 1) + cp->zone_avg_cnt = cp->cycle_cnt; + else + cp->zone_avg_cnt = cp->cycle_cnt + + (cp->zone_avg_cnt / 2); + + /* + * If more than one generation has elapsed since the last + * update, decay the values further. + */ + for (i = 1; i < gen_cnt; i++) + cp->zone_avg_cnt = cp->zone_avg_cnt / 2; + } + + /* A new cycle begins. */ + cp->cycle_start = unow; + cp->cycle_cnt = 0; + + return (0); +} + +/* + * Add IO op data to the zone. + */ +static void +add_zone_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op) +{ + switch (op) { + case ZFS_ZONE_IOP_READ: + (void) compute_historical_zone_cnt(unow, &zonep->zone_rd_ops); + zonep->zone_rd_ops.cycle_cnt++; + break; + case ZFS_ZONE_IOP_WRITE: + (void) compute_historical_zone_cnt(unow, &zonep->zone_wr_ops); + zonep->zone_wr_ops.cycle_cnt++; + break; + case ZFS_ZONE_IOP_LOGICAL_WRITE: + (void) compute_historical_zone_cnt(unow, &zonep->zone_lwr_ops); + zonep->zone_lwr_ops.cycle_cnt++; + break; + } +} + +/* + * Use a decaying average to keep track of the overall system latency. 
+ * + * We want to have the recent activity heavily weighted, but if the + * activity decreases or stops, then the average should quickly decay + * down to the new value. + * + * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed average. + * However, since this calculation is driven by IO activity and since IO does + * not happen at fixed intervals, we use a timestamp to see when the last + * update was made. If it was more than one cycle ago, then we need to decay + * the average by the proper number of additional cycles in which no IO was + * performed. + * + * Return true if we actually computed a new system average. + * If we're still within an active cycle there is nothing to do, return false. + */ +static boolean_t +compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp) +{ + hrtime_t delta; + int gen_cnt; + + /* + * Check if its time to recompute a new average. + * If we're still collecting data for the current cycle, return false. + */ + delta = unow - cp->cycle_start; + if (delta < zfs_zone_sys_avg_cycle) + return (B_FALSE); + + /* A previous cycle is past, compute a new system average. */ + + /* + * Figure out how many generations we have to decay, since multiple + * cycles may have elapsed since our last IO. + * We count on int rounding here. + */ + gen_cnt = (int)(delta / zfs_zone_sys_avg_cycle); + + /* If more than 5 cycles since last the IO, reset average. */ + if (gen_cnt > 5) { + cp->sys_avg_lat = 0; + } else { + /* Update the average. */ + int i; + + cp->sys_avg_lat = + (cp->sys_avg_lat + cp->cycle_lat) / (1 + cp->cycle_cnt); + + /* + * If more than one generation has elapsed since the last + * update, decay the values further. + */ + for (i = 1; i < gen_cnt; i++) + cp->sys_avg_lat = cp->sys_avg_lat / 2; + } + + /* A new cycle begins. */ + cp->cycle_start = unow; + cp->cycle_cnt = 0; + cp->cycle_lat = 0; + + return (B_TRUE); +} + +static void +add_sys_iop(hrtime_t unow, int op, int lat) +{ + switch (op) { + case ZFS_ZONE_IOP_READ: + (void) compute_new_sys_avg(unow, &rd_lat); + rd_lat.cycle_cnt++; + rd_lat.cycle_lat += lat; + break; + case ZFS_ZONE_IOP_WRITE: + (void) compute_new_sys_avg(unow, &wr_lat); + wr_lat.cycle_cnt++; + wr_lat.cycle_lat += lat; + break; + } +} + +/* + * Get the zone IO counts. + */ +static uint_t +calc_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp) +{ + hrtime_t delta; + uint_t cnt; + + if ((delta = compute_historical_zone_cnt(unow, cp)) == 0) { + /* + * No activity in the current cycle, we already have the + * historical data so we'll use that. + */ + cnt = cp->zone_avg_cnt; + } else { + /* + * If we're less than half way through the cycle then use + * the current count plus half the historical count, otherwise + * just use the current count. + */ + if (delta < (zfs_zone_cycle_time / 2)) + cnt = cp->cycle_cnt + (cp->zone_avg_cnt / 2); + else + cnt = cp->cycle_cnt; + } + + return (cnt); +} + +/* + * Get the average read/write latency in usecs for the system. + */ +static uint_t +calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp) +{ + if (compute_new_sys_avg(unow, cp)) { + /* + * No activity in the current cycle, we already have the + * historical data so we'll use that. + */ + return (cp->sys_avg_lat); + } else { + /* + * We're within a cycle; weight the current activity higher + * compared to the historical data and use that. 
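+ * For example (hypothetical numbers): with a stale average of 4000 usec and
+ * 10 ops totalling 20000 usec of latency so far in this cycle, the weighted
+ * result is (4000 + 20000 * 8) / (1 + 10 * 8) = ~2024 usec, so in-cycle
+ * activity quickly dominates the old average.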
+ */ + DTRACE_PROBE3(zfs__zone__calc__wt__avg, + uintptr_t, cp->sys_avg_lat, + uintptr_t, cp->cycle_lat, + uintptr_t, cp->cycle_cnt); + + return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) / + (1 + (cp->cycle_cnt * 8))); + } +} + +/* + * Account for the current IOP on the zone and for the system as a whole. + * The latency parameter is in usecs. + */ +static void +add_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat) +{ + /* Add op to zone */ + add_zone_iop(zonep, unow, op); + + /* Track system latency */ + if (op != ZFS_ZONE_IOP_LOGICAL_WRITE) + add_sys_iop(unow, op, lat); +} + +/* + * Calculate and return the total number of read ops, write ops and logical + * write ops for the given zone. If the zone has issued operations of any type + * return a non-zero value, otherwise return 0. + */ +static int +get_zone_io_cnt(hrtime_t unow, zone_t *zonep, uint_t *rops, uint_t *wops, + uint_t *lwops) +{ + *rops = calc_zone_cnt(unow, &zonep->zone_rd_ops); + *wops = calc_zone_cnt(unow, &zonep->zone_wr_ops); + *lwops = calc_zone_cnt(unow, &zonep->zone_lwr_ops); + + DTRACE_PROBE4(zfs__zone__io__cnt, uintptr_t, zonep->zone_id, + uintptr_t, *rops, uintptr_t, *wops, uintptr_t, *lwops); + + return (*rops | *wops | *lwops); +} + +/* + * Get the average read/write latency in usecs for the system. + */ +static void +get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat) +{ + *rlat = calc_avg_lat(unow, &rd_lat); + *wlat = calc_avg_lat(unow, &wr_lat); + + /* + * In an attempt to improve the accuracy of the throttling algorithm, + * assume that IO operations can't have zero latency. Instead, assume + * a reasonable lower bound for each operation type. If the actual + * observed latencies are non-zero, use those latency values instead. + */ + if (*rlat == 0) + *rlat = 1000; + if (*wlat == 0) + *wlat = 1000; + + DTRACE_PROBE2(zfs__zone__sys__avg__lat, uintptr_t, *rlat, + uintptr_t, *wlat); +} + +/* + * Find disk utilization for each zone and average utilization for all active + * zones. + */ +static int +zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg) +{ + zoneio_stats_t *sp = arg; + uint_t rops, wops, lwops; + + if (zonep->zone_id == GLOBAL_ZONEID || + get_zone_io_cnt(sp->zi_now, zonep, &rops, &wops, &lwops) == 0) { + zonep->zone_io_util = 0; + return (0); + } + + zonep->zone_io_util = (rops * sp->zi_avgrlat) + + (wops * sp->zi_avgwlat) + (lwops * sp->zi_avgwlat); + sp->zi_totutil += zonep->zone_io_util; + + if (zonep->zone_io_util > 0) { + sp->zi_active++; + sp->zi_totpri += zonep->zone_zfs_io_pri; + } + + /* + * sdt:::zfs-zone-utilization + * + * arg0: zone ID + * arg1: read operations observed during time window + * arg2: physical write operations observed during time window + * arg3: logical write ops observed during time window + * arg4: calculated utilization given read and write ops + * arg5: I/O priority assigned to this zone + */ + DTRACE_PROBE6(zfs__zone__utilization, uint_t, zonep->zone_id, + uint_t, rops, uint_t, wops, uint_t, lwops, + uint_t, zonep->zone_io_util, uint_t, zonep->zone_zfs_io_pri); + + return (0); +} + +static void +zfs_zone_delay_inc(zone_t *zonep) +{ + if (zonep->zone_io_delay < zfs_zone_delay_ceiling) + zonep->zone_io_delay += zfs_zone_delay_step; +} + +static void +zfs_zone_delay_dec(zone_t *zonep) +{ + if (zonep->zone_io_delay > 0) + zonep->zone_io_delay -= zfs_zone_delay_step; +} + +/* + * For all zones "far enough" away from the average utilization, increase that + * zones delay. Otherwise, reduce its delay. 
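+ *
+ * To illustrate the fair-share arithmetic (numbers are hypothetical): with two
+ * active zones at I/O priorities 100 and 50 and a combined utilization of
+ * 3,000,000, the fair shares work out to (3,000,000 * 100) / 150 = 2,000,000
+ * and 1,000,000 respectively. A zone above its share while the disk is
+ * overutilized has zfs_zone_delay_step added to its delay; a zone below its
+ * share, or any zone when the disk is underutilized, has it subtracted.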
+ */ +static int +zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg) +{ + zoneio_stats_t *sp = arg; + uint16_t delay = zonep->zone_io_delay; + uint_t fairutil = 0; + + zonep->zone_io_util_above_avg = B_FALSE; + + /* + * Given the calculated total utilitzation for all zones, calculate the + * fair share of I/O for this zone. + */ + if (zfs_zone_priority_enable && sp->zi_totpri > 0) { + fairutil = (sp->zi_totutil * zonep->zone_zfs_io_pri) / + sp->zi_totpri; + } else if (sp->zi_active > 0) { + fairutil = sp->zi_totutil / sp->zi_active; + } + + /* + * Adjust each IO's delay. If the overall delay becomes too high, avoid + * increasing beyond the ceiling value. + */ + if (zonep->zone_io_util > fairutil && sp->zi_overutil) { + zonep->zone_io_util_above_avg = B_TRUE; + + if (sp->zi_active > 1) + zfs_zone_delay_inc(zonep); + } else if (zonep->zone_io_util < fairutil || sp->zi_underutil || + sp->zi_active <= 1) { + zfs_zone_delay_dec(zonep); + } + + /* + * sdt:::zfs-zone-throttle + * + * arg0: zone ID + * arg1: old delay for this zone + * arg2: new delay for this zone + * arg3: calculated fair I/O utilization + * arg4: actual I/O utilization + */ + DTRACE_PROBE5(zfs__zone__throttle, uintptr_t, zonep->zone_id, + uintptr_t, delay, uintptr_t, zonep->zone_io_delay, + uintptr_t, fairutil, uintptr_t, zonep->zone_io_util); + + return (0); +} + +/* + * Examine the utilization between different zones, and adjust the delay for + * each zone appropriately. + */ +static void +zfs_zone_wait_adjust(hrtime_t unow, hrtime_t last_checked) +{ + zoneio_stats_t stats; + hrtime_t laggard_udelta = 0; + + (void) bzero(&stats, sizeof (stats)); + + stats.zi_now = unow; + get_sys_avg_lat(unow, &stats.zi_avgrlat, &stats.zi_avgwlat); + + if (stats.zi_avgrlat > stats.zi_avgwlat * zfs_zone_rw_lat_limit) + stats.zi_avgrlat = stats.zi_avgwlat * zfs_zone_rw_lat_limit; + else if (stats.zi_avgrlat * zfs_zone_rw_lat_limit < stats.zi_avgwlat) + stats.zi_avgwlat = stats.zi_avgrlat * zfs_zone_rw_lat_limit; + + if (zone_walk(zfs_zone_wait_adjust_calculate_cb, &stats) != 0) + return; + + /* + * Calculate disk utilization for the most recent period. + */ + if (zfs_disk_last_rtime == 0 || unow - last_checked <= 0) { + stats.zi_diskutil = 0; + } else { + stats.zi_diskutil = + ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) / + ((unow - last_checked) * 1000); + } + zfs_disk_last_rtime = zfs_disk_rtime; + + if (unow > zfs_disk_last_laggard) + laggard_udelta = unow - zfs_disk_last_laggard; + + /* + * To minimize porpoising, we have three separate states for our + * assessment of I/O performance: overutilized, underutilized, and + * neither overutilized nor underutilized. We will increment the + * throttle if a zone is using more than its fair share _and_ I/O + * is overutilized; we will decrement the throttle if a zone is using + * less than its fair share _or_ I/O is underutilized. 
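+ *
+ * Concretely (hypothetical figures against the default tunables): a derived
+ * disk utilization of 90% with a laggard I/O seen within the last second is
+ * treated as overutilized; a utilization of 40%, or no laggard for more than
+ * five seconds, is treated as underutilized; in between, delays are only ever
+ * relaxed, and only for zones already below their fair share.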
+ */ + stats.zi_underutil = stats.zi_diskutil < zfs_zone_underutil_threshold || + laggard_udelta > zfs_zone_laggard_ancient; + + stats.zi_overutil = stats.zi_diskutil > zfs_zone_util_threshold && + laggard_udelta < zfs_zone_laggard_recent; + + /* + * sdt:::zfs-zone-stats + * + * Statistics observed over the last period: + * + * arg0: average system read latency + * arg1: average system write latency + * arg2: number of active zones + * arg3: total I/O 'utilization' for all zones + * arg4: total I/O priority of all active zones + * arg5: calculated disk utilization + */ + DTRACE_PROBE6(zfs__zone__stats, uintptr_t, stats.zi_avgrlat, + uintptr_t, stats.zi_avgwlat, uintptr_t, stats.zi_active, + uintptr_t, stats.zi_totutil, uintptr_t, stats.zi_totpri, + uintptr_t, stats.zi_diskutil); + + (void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats); +} + +/* + * Callback used to calculate a zone's IO schedule priority. + * + * We scan the zones looking for ones with ops in the queue. Out of those, + * we pick the one that calculates to the highest schedule priority. + */ +static int +get_sched_pri_cb(zone_t *zonep, void *arg) +{ + int pri; + uint_t cnt; + zone_q_bump_t *qbp = arg; + zio_priority_t p = qbp->zq_queue; + + cnt = zonep->zone_zfs_queued[p]; + if (cnt == 0) { + zonep->zone_zfs_weight = 0; + return (0); + } + + /* + * On each pass, increment the zone's weight. We use this as input + * to the calculation to prevent starvation. The value is reset + * each time we issue an IO for this zone so zones which haven't + * done any IO over several iterations will see their weight max + * out. + */ + if (zonep->zone_zfs_weight < SCHED_WEIGHT_MAX) + zonep->zone_zfs_weight++; + + /* + * This zone's IO priority is the inverse of the number of IOs + * the zone has enqueued * zone's configured priority * weight. + * The queue depth has already been scaled by 10 to avoid problems + * with int rounding. + * + * This means that zones with fewer IOs in the queue will get + * preference unless other zone's assigned priority pulls them + * ahead. The weight is factored in to help ensure that zones + * which haven't done IO in a while aren't getting starved. + */ + pri = (qbp->zq_qdepth / cnt) * + zonep->zone_zfs_io_pri * zonep->zone_zfs_weight; + + /* + * If this zone has a higher priority than what we found so far, + * it becomes the new leading contender. + */ + if (pri > qbp->zq_priority) { + qbp->zq_zoneid = zonep->zone_id; + qbp->zq_priority = pri; + qbp->zq_wt = zonep->zone_zfs_weight; + } + return (0); +} + +/* + * See if we need to bump a zone's zio to the head of the queue. This is only + * done on the two synchronous I/O queues (see the block comment on the + * zfs_zone_schedule function). We get the correct vdev_queue_class_t and + * queue depth from our caller. + * + * For single-threaded synchronous processes a zone cannot get more than + * 1 op into the queue at a time unless the zone is running multiple processes + * in parallel. This can cause an imbalance in performance if there are zones + * with many parallel processes (and ops in the queue) vs. other zones which + * are doing simple single-threaded processes, such as interactive tasks in the + * shell. These zones can get backed up behind a deep queue and their IO + * performance will appear to be very poor as a result. This can make the + * zone work badly for interactive behavior. + * + * The scheduling algorithm kicks in once we start to get a deeper queue. 
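+ * (As a hypothetical example of the weighting done in get_sched_pri_cb: with
+ * 40 I/Os queued -- a scaled depth of 400 -- a zone holding 2 of them at
+ * configured priority 100 and weight 3 scores (400 / 2) * 100 * 3 = 60,000,
+ * while a zone holding 20 at the same priority and weight 1 scores
+ * (400 / 20) * 100 * 1 = 2,000, so the lightly queued zone is bumped first.)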
+ * Once that occurs, we look at all of the zones to see which one calculates + * to the highest priority. We bump that zone's first zio to the head of the + * queue. + * + * We use a counter on the zone so that we can quickly find how many ops each + * zone has in the queue without having to search the entire queue itself. + * This scales better since the number of zones is expected to be on the + * order of 10-100 whereas the queue depth can be in the range of 50-2000. + * In addition, since the zio's in the queue only have the zoneid, we would + * have to look up the zone for each zio enqueued and that means the overhead + * for scanning the queue each time would be much higher. + * + * In all cases, we fall back to simply pulling the next op off the queue + * if something should go wrong. + */ +static zio_t * +get_next_zio(vdev_queue_class_t *vqc, int qdepth, zio_priority_t p, + avl_tree_t *tree) +{ + zone_q_bump_t qbump; + zio_t *zp = NULL, *zphead; + int cnt = 0; + + /* To avoid problems with int rounding, scale the queue depth by 10 */ + qbump.zq_qdepth = qdepth * 10; + qbump.zq_priority = 0; + qbump.zq_zoneid = 0; + qbump.zq_queue = p; + (void) zone_walk(get_sched_pri_cb, &qbump); + + zphead = avl_first(tree); + + /* Check if the scheduler didn't pick a zone for some reason!? */ + if (qbump.zq_zoneid != 0) { + for (zp = avl_first(tree); zp != NULL; + zp = avl_walk(tree, zp, AVL_AFTER)) { + if (zp->io_zoneid == qbump.zq_zoneid) + break; + cnt++; + } + } + + if (zp == NULL) { + zp = zphead; + } else if (zp != zphead) { + /* + * Only fire the probe if we actually picked a different zio + * than the one already at the head of the queue. + */ + DTRACE_PROBE4(zfs__zone__sched__bump, uint_t, zp->io_zoneid, + uint_t, cnt, int, qbump.zq_priority, int, qbump.zq_wt); + } + + return (zp); +} + +/* + * Add our zone ID to the zio so we can keep track of which zones are doing + * what, even when the current thread processing the zio is not associated + * with the zone (e.g. the kernel taskq which pushes out TX groups). + */ +void +zfs_zone_zio_init(zio_t *zp) +{ + zone_t *zonep = curzone; + + zp->io_zoneid = zonep->zone_id; +} + +/* + * Track and throttle IO operations per zone. Called from: + * - dmu_tx_count_write for (logical) write ops (both dataset and zvol writes + * go through this path) + * - arc_read for read ops that miss the ARC (both dataset and zvol) + * For each operation, increment that zone's counter based on the type of + * operation, then delay the operation, if necessary. + * + * There are three basic ways that we can see write ops: + * 1) An application does write syscalls. Those ops go into a TXG which + * we'll count here. Sometime later a kernel taskq thread (we'll see the + * vdev IO as zone 0) will perform some number of physical writes to commit + * the TXG to disk. Those writes are not associated with the zone which + * made the write syscalls and the number of operations is not correlated + * between the taskq and the zone. We only see logical writes in this + * function, we see the physcial writes in the zfs_zone_zio_start and + * zfs_zone_zio_done functions. + * 2) An application opens a file with O_SYNC. Each write will result in + * an operation which we'll see here plus a low-level vdev write from + * that zone. + * 3) An application does write syscalls followed by an fsync(). We'll + * count the writes going into a TXG here. 
We'll also see some number + * (usually much smaller, maybe only 1) of low-level vdev writes from this + * zone when the fsync is performed, plus some other low-level vdev writes + * from the taskq in zone 0 (are these metadata writes?). + * + * 4) In addition to the above, there are misc. system-level writes, such as + * writing out dirty pages to swap, or sync(2) calls, which will be handled + * by the global zone and which we count but don't generally worry about. + * + * Because of the above, we can see writes twice; first because this function + * is always called by a zone thread for logical writes, but then we also will + * count the physical writes that are performed at a low level via + * zfs_zone_zio_start. Without this, it can look like a non-global zone never + * writes (case 1). Depending on when the TXG is synced, the counts may be in + * the same sample bucket or in a different one. + * + * Tracking read operations is simpler due to their synchronous semantics. The + * zfs_read function -- called as a result of a read(2) syscall -- will always + * retrieve the data to be read through arc_read and we only come into this + * function when we have an arc miss. + */ +void +zfs_zone_io_throttle(zfs_zone_iop_type_t type) +{ + zone_t *zonep = curzone; + hrtime_t unow, last_checked; + uint16_t wait; + + unow = GET_USEC_TIME; + + /* + * Only bump the counter for logical writes here. The counters for + * tracking physical IO operations are handled in zfs_zone_zio_done. + */ + if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) { + mutex_enter(&zonep->zone_stg_io_lock); + add_iop(zonep, unow, type, 0); + mutex_exit(&zonep->zone_stg_io_lock); + } + + if (!zfs_zone_delay_enable) + return; + + /* + * If the zone's I/O priority is set to zero, don't throttle that zone's + * operations at all. + */ + if (zonep->zone_zfs_io_pri == 0) + return; + + /* + * XXX There's a potential race here in that more than one thread may + * update the zone delays concurrently. The worst outcome is corruption + * of our data to track each zone's IO, so the algorithm may make + * incorrect throttling decisions until the data is refreshed. + */ + last_checked = zfs_zone_last_checked; + if ((unow - last_checked) > zfs_zone_adjust_time) { + zfs_zone_last_checked = unow; + zfs_zone_wait_adjust(unow, last_checked); + } + + if ((wait = zonep->zone_io_delay) > 0) { + /* + * If this is a write and we're doing above normal TXG + * syncing, then throttle for longer than normal. + */ + if (type == ZFS_ZONE_IOP_LOGICAL_WRITE && + (txg_cnt > 1 || txg_sync_rate > 1)) + wait *= zfs_zone_txg_throttle_scale; + + /* + * sdt:::zfs-zone-wait + * + * arg0: zone ID + * arg1: type of IO operation + * arg2: time to delay (in us) + */ + DTRACE_PROBE3(zfs__zone__wait, uintptr_t, zonep->zone_id, + uintptr_t, type, uintptr_t, wait); + + drv_usecwait(wait); + + if (zonep->zone_vfs_stats != NULL) { + atomic_inc_64(&zonep->zone_vfs_stats-> + zv_delay_cnt.value.ui64); + atomic_add_64(&zonep->zone_vfs_stats-> + zv_delay_time.value.ui64, wait); + } + } +} + +/* + * XXX Ignore the pool pointer parameter for now. + * + * Keep track to see if the TXG sync rate is running above the expected rate. + * If so, this implies that we are filling TXG's at a high rate due to a heavy + * write workload. We use this as input into the zone throttle. + * + * This function is called every 5 seconds (zfs_txg_timeout) under a normal + * write load. In this case, the sync rate is going to be 1. 
When there + * is a heavy write load, TXG's fill up fast and the sync thread will write + * the TXG more frequently (perhaps once a second). In this case the rate + * will be > 1. The sync rate is a lagging indicator since it can be up + * to 5 seconds old. We use the txg_cnt to keep track of the rate in the + * current 5 second interval and txg_sync_rate to keep track of the previous + * 5 second interval. In that way we don't have a period (1 or more seconds) + * where the txg_cnt == 0 and we cut back on throttling even though the rate + * is still high. + */ +/*ARGSUSED*/ +void +zfs_zone_report_txg_sync(void *dp) +{ + uint_t now; + + txg_cnt++; + now = (uint_t)(gethrtime() / NANOSEC); + if ((now - txg_last_check) >= zfs_txg_timeout) { + txg_sync_rate = txg_cnt / 2; + txg_cnt = 0; + txg_last_check = now; + } +} + +hrtime_t +zfs_zone_txg_delay() +{ + if (curzone->zone_io_util_above_avg) + return (zfs_zone_txg_delay_nsec); + + return (MSEC2NSEC(10)); +} + +/* + * Called from vdev_disk_io_start when an IO hits the end of the zio pipeline + * and is issued. + * Keep track of start time for latency calculation in zfs_zone_zio_done. + */ +void +zfs_zone_zio_start(zio_t *zp) +{ + zone_t *zonep; + + /* + * I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not for + * an actual I/O operation. Ignore those operations as they relate to + * throttling and scheduling. + */ + if (zp->io_type == ZIO_TYPE_IOCTL) + return; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_zfs_lock); + if (zp->io_type == ZIO_TYPE_READ) + kstat_runq_enter(&zonep->zone_zfs_rwstats); + zonep->zone_zfs_weight = 0; + mutex_exit(&zonep->zone_zfs_lock); + + mutex_enter(&zfs_disk_lock); + zp->io_dispatched = gethrtime(); + + if (zfs_disk_rcnt++ != 0) + zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate); + zfs_disk_rlastupdate = zp->io_dispatched; + mutex_exit(&zfs_disk_lock); + + zone_rele(zonep); +} + +/* + * Called from vdev_disk_io_done when an IO completes. + * Increment our counter for zone ops. + * Calculate the IO latency avg. for this zone. + */ +void +zfs_zone_zio_done(zio_t *zp) +{ + zone_t *zonep; + hrtime_t now, unow, udelta; + + if (zp->io_type == ZIO_TYPE_IOCTL) + return; + + if (zp->io_dispatched == 0) + return; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + now = gethrtime(); + unow = NANO_TO_MICRO(now); + udelta = unow - NANO_TO_MICRO(zp->io_dispatched); + + mutex_enter(&zonep->zone_zfs_lock); + + /* + * To calculate the wsvc_t average, keep a cumulative sum of all the + * wait time before each I/O was dispatched. Since most writes are + * asynchronous, only track the wait time for read I/Os. + */ + if (zp->io_type == ZIO_TYPE_READ) { + zonep->zone_zfs_rwstats.reads++; + zonep->zone_zfs_rwstats.nread += zp->io_size; + + zonep->zone_zfs_stats->zz_waittime.value.ui64 += + zp->io_dispatched - zp->io_timestamp; + + kstat_runq_exit(&zonep->zone_zfs_rwstats); + } else { + zonep->zone_zfs_rwstats.writes++; + zonep->zone_zfs_rwstats.nwritten += zp->io_size; + } + + mutex_exit(&zonep->zone_zfs_lock); + + mutex_enter(&zfs_disk_lock); + zfs_disk_rcnt--; + zfs_disk_rtime += (now - zfs_disk_rlastupdate); + zfs_disk_rlastupdate = now; + + if (udelta > zfs_zone_laggard_threshold) + zfs_disk_last_laggard = unow; + + mutex_exit(&zfs_disk_lock); + + if (zfs_zone_delay_enable) { + mutex_enter(&zonep->zone_stg_io_lock); + add_iop(zonep, unow, zp->io_type == ZIO_TYPE_READ ? 
+ ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta); + mutex_exit(&zonep->zone_stg_io_lock); + } + + zone_rele(zonep); + + /* + * sdt:::zfs-zone-latency + * + * arg0: zone ID + * arg1: type of I/O operation + * arg2: I/O latency (in us) + */ + DTRACE_PROBE3(zfs__zone__latency, uintptr_t, zp->io_zoneid, + uintptr_t, zp->io_type, uintptr_t, udelta); +} + +void +zfs_zone_zio_dequeue(zio_t *zp) +{ + zio_priority_t p; + zone_t *zonep; + + p = zp->io_priority; + if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) + return; + + /* We depend on p being defined as either 0 or 1 */ + ASSERT(p < 2); + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_stg_io_lock); + ASSERT(zonep->zone_zfs_queued[p] > 0); + if (zonep->zone_zfs_queued[p] == 0) + cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0"); + else + zonep->zone_zfs_queued[p]--; + mutex_exit(&zonep->zone_stg_io_lock); + zone_rele(zonep); +} + +void +zfs_zone_zio_enqueue(zio_t *zp) +{ + zio_priority_t p; + zone_t *zonep; + + p = zp->io_priority; + if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) + return; + + /* We depend on p being defined as either 0 or 1 */ + ASSERT(p < 2); + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_stg_io_lock); + zonep->zone_zfs_queued[p]++; + mutex_exit(&zonep->zone_stg_io_lock); + zone_rele(zonep); +} + +/* + * Called from vdev_queue_io_to_issue. That function is where zio's are listed + * in FIFO order on one of the sync queues, then pulled off (by + * vdev_queue_io_remove) and issued. We potentially do zone-based scheduling + * here to find a zone's zio deeper in the sync queue and issue that instead + * of simply doing FIFO. + * + * We only do zone-based zio scheduling for the two synchronous I/O queues + * (read & write). These queues are normally serviced in FIFO order but we + * may decide to move a zone's zio to the head of the line. A typical I/O + * load will be mostly synchronous reads and some asynchronous writes (which + * are scheduled differently due to transaction groups). There will also be + * some synchronous writes for those apps which want to ensure their data is on + * disk. We want to make sure that a zone with a single-threaded app (e.g. the + * shell) that is doing synchronous I/O (typically reads) isn't penalized by + * other zones which are doing lots of synchronous I/O because they have many + * running threads. + * + * The vq->vq_lock mutex is held when we're executing this function so we + * can safely access the "last zone" variable on the queue. + */ +zio_t * +zfs_zone_schedule(vdev_queue_t *vq, zio_priority_t p, avl_index_t idx, + avl_tree_t *tree) +{ + vdev_queue_class_t *vqc = &vq->vq_class[p]; + uint_t cnt; + zoneid_t last_zone; + zio_t *zio; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + + /* Don't change the order on the LBA ordered queues. */ + if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) + return (avl_nearest(tree, idx, AVL_AFTER)); + + /* We depend on p being defined as either 0 or 1 */ + ASSERT(p < 2); + + cnt = avl_numnodes(tree); + last_zone = vq->vq_last_zone_id; + + /* + * If there are only a few zios in the queue then just issue the head. + * If there are more than a few zios already queued up, then use + * scheduling to get the next zio. 
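+ * ("A few" means fewer than zfs_zone_schedule_thresh entries, 10 by default to
+ * track the sync queue's max_active limit; below that depth plain FIFO order
+ * is assumed to be fair enough on its own.)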
+ */ + if (!zfs_zone_schedule_enable || cnt < zfs_zone_schedule_thresh) + zio = avl_nearest(tree, idx, AVL_AFTER); + else + zio = get_next_zio(vqc, cnt, p, tree); + + vq->vq_last_zone_id = zio->io_zoneid; + + /* + * Probe with 4 args; the number of IOs in the queue, the zone that + * was last scheduled off this queue, the zone that was associated + * with the next IO that is scheduled, and which queue (priority). + */ + DTRACE_PROBE4(zfs__zone__sched, uint_t, cnt, uint_t, last_zone, + uint_t, zio->io_zoneid, uint_t, p); + + return (zio); +} + +#endif diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c index 38b57e0123..f9f93999db 100644 --- a/usr/src/uts/common/fs/zfs/zil.c +++ b/usr/src/uts/common/fs/zfs/zil.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -1801,9 +1802,18 @@ zil_close(zilog_t *zilog) if (lwb != NULL) txg = lwb->lwb_max_txg; mutex_exit(&zilog->zl_lock); - if (txg) + + if (zilog_is_dirty(zilog)) { + /* + * If we're dirty, always wait for the current transaction -- + * our lwb_max_txg may be in the past. + */ + txg_wait_synced(zilog->zl_dmu_pool, 0); + } else if (txg) { txg_wait_synced(zilog->zl_dmu_pool, txg); - ASSERT(!zilog_is_dirty(zilog)); + } + + VERIFY(!zilog_is_dirty(zilog)); taskq_destroy(zilog->zl_clean_taskq); zilog->zl_clean_taskq = NULL; diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index aa0f2945dd..2fa3213be0 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ #include <sys/sysmacros.h> @@ -39,6 +40,7 @@ #include <sys/ddt.h> #include <sys/blkptr.h> #include <sys/zfeature.h> +#include <sys/zfs_zone.h> /* * ========================================================================== @@ -557,11 +559,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_bookmark = *zb; if (pio != NULL) { + zio->io_zoneid = pio->io_zoneid; if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child(pio, zio); + } else { + zfs_zone_zio_init(zio); } return (zio); diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c index 80888103fe..f681b1dc65 100644 --- a/usr/src/uts/common/fs/zfs/zvol.c +++ b/usr/src/uts/common/fs/zfs/zvol.c @@ -25,7 +25,7 @@ * * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
*/ /* @@ -83,6 +83,7 @@ #include <sys/zvol.h> #include <sys/dumphdr.h> #include <sys/zil_impl.h> +#include <sys/sdt.h> #include <sys/dbuf.h> #include <sys/dmu_tx.h> #include <sys/zfeature.h> @@ -137,6 +138,11 @@ typedef struct zvol_state { #define ZVOL_EXCL 0x4 #define ZVOL_WCE 0x8 +#define VOP_LATENCY_10MS 10000000 +#define VOP_LATENCY_100MS 100000000 +#define VOP_LATENCY_1S 1000000000 +#define VOP_LATENCY_10S 10000000000 + /* * zvol maximum transfer in one DMU tx. */ @@ -1378,6 +1384,9 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) uint64_t volsize; rl_t *rl; int error = 0; + zone_t *zonep = curzone; + uint64_t tot_bytes; + hrtime_t start, lat; zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); if (zv == NULL) @@ -1394,6 +1403,14 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) return (error); } + DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 0); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_runq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + start = gethrtime(); + tot_bytes = 0; + rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, RL_READER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { @@ -1403,6 +1420,7 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) if (bytes > volsize - uio->uio_loffset) bytes = volsize - uio->uio_loffset; + tot_bytes += bytes; error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes); if (error) { /* convert checksum errors into IO errors */ @@ -1412,6 +1430,39 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) } } zfs_range_unlock(rl); + + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.reads++; + zonep->zone_vfs_rwstats.nread += tot_bytes; + kstat_runq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + zone_vfs_kstat_t *zvp; + + zvp = zonep->zone_vfs_stats; + if (lat < VOP_LATENCY_100MS) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + + DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 0, int, + error); + return (error); } @@ -1425,6 +1476,9 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) rl_t *rl; int error = 0; boolean_t sync; + zone_t *zonep = curzone; + uint64_t tot_bytes; + hrtime_t start, lat; zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); if (zv == NULL) @@ -1441,6 +1495,19 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) return (error); } + DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 1); + + /* + * For the purposes of VFS kstat consumers, the "waitq" calculation is + * repurposed as the active queue for zvol write operations. There's no + * actual wait queue for zvol operations. 
+ */ + mutex_enter(&zonep->zone_vfs_lock); + kstat_waitq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + start = gethrtime(); + tot_bytes = 0; + sync = !(zv->zv_flags & ZVOL_WCE) || (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); @@ -1454,6 +1521,7 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) if (bytes > volsize - off) /* don't write past the end */ bytes = volsize - off; + tot_bytes += bytes; dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { @@ -1471,6 +1539,39 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) zfs_range_unlock(rl); if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); + + DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 1, int, + error); + + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.writes++; + zonep->zone_vfs_rwstats.nwritten += tot_bytes; + kstat_waitq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + zone_vfs_kstat_t *zvp; + + zvp = zonep->zone_vfs_stats; + if (lat < VOP_LATENCY_100MS) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + return (error); } |
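The rest of this page adds a few standalone userland sketches of mechanisms introduced in the hunks above; they are illustrations only, any name not present in the diff is a placeholder, and none of them is the kernel implementation. The first mirrors the TXG sync-rate window kept by zfs_zone_report_txg_sync(): a per-window count is folded into txg_sync_rate every zfs_txg_timeout seconds, so the throttle always has a (lagging) figure for the previous interval even while the current count is still small. Here a hard-coded 5-second window stands in for zfs_txg_timeout and time(2) stands in for gethrtime():

/*
 * Userland sketch of the TXG sync-rate window kept above.  Illustration
 * only: TXG_TIMEOUT stands in for zfs_txg_timeout, time(2) stands in for
 * gethrtime(), and there is no locking.
 */
#include <stdio.h>
#include <time.h>

#define	TXG_TIMEOUT	5	/* seconds per observation window */

static unsigned int txg_cnt;		/* syncs seen in the current window */
static unsigned int txg_sync_rate;	/* figure kept for the previous window */
static time_t txg_last_check;

/* Called once per TXG sync, as zfs_zone_report_txg_sync() is. */
static void
report_txg_sync(time_t now)
{
	txg_cnt++;
	if (now - txg_last_check >= TXG_TIMEOUT) {
		/* Mirror the kernel's txg_cnt / 2 for the completed window. */
		txg_sync_rate = txg_cnt / 2;
		txg_cnt = 0;
		txg_last_check = now;
	}
}

int
main(void)
{
	time_t base = time(NULL);

	txg_last_check = base;

	/* Simulate ten seconds of heavy load with one sync per second. */
	for (int t = 1; t <= 10; t++)
		report_txg_sync(base + t);

	(void) printf("previous-window figure: %u\n", txg_sync_rate);
	return (0);
}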
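zfs_zone_zio_start() and zfs_zone_zio_done() also maintain a global disk busy figure: zfs_disk_rtime advances only while at least one I/O is outstanding, in the usual kstat run-time style. A minimal sketch of that bookkeeping, with explicit timestamps in place of gethrtime() and no locking (the kernel holds zfs_disk_lock):

/*
 * Sketch of the global disk run-time accounting in zfs_zone_zio_start()
 * and zfs_zone_zio_done(): zfs_disk_rtime grows only while at least one
 * I/O is outstanding (the usual kstat "busy time" convention).  Explicit
 * timestamps and the lack of locking are simplifications.
 */
#include <stdio.h>

static unsigned int disk_rcnt;		/* I/Os currently outstanding */
static long long disk_rtime;		/* accumulated busy time */
static long long disk_rlastupdate;	/* time of the last state change */

static void
disk_io_start(long long now)
{
	/* Only accumulate if the disk was already busy before this I/O. */
	if (disk_rcnt++ != 0)
		disk_rtime += now - disk_rlastupdate;
	disk_rlastupdate = now;
}

static void
disk_io_done(long long now)
{
	disk_rcnt--;
	disk_rtime += now - disk_rlastupdate;
	disk_rlastupdate = now;
}

int
main(void)
{
	/*
	 * Two overlapping I/Os, 0-10 and 5-20 (arbitrary units): the disk
	 * is busy from 0 through 20, so the accumulated busy time is 20.
	 */
	disk_io_start(0);
	disk_io_start(5);
	disk_io_done(10);
	disk_io_done(20);

	(void) printf("busy time: %lld, outstanding: %u\n",
	    disk_rtime, disk_rcnt);
	return (0);
}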
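zfs_zone_schedule() departs from FIFO only on the two synchronous queues, and only once the queue is at least zfs_zone_schedule_thresh entries deep. The sketch below restates that gate with a plain array in place of the AVL tree; the zone-aware picker shown is a hypothetical stand-in for get_next_zio(), whose actual selection logic is not part of the hunks above:

/*
 * Sketch of the queue-depth gate in zfs_zone_schedule(): shallow queues
 * (or the feature being disabled) are serviced strictly FIFO; deeper
 * queues may hand the decision to a zone-aware picker.  The array queue
 * and pick_zone_aware() below are stand-ins; the kernel walks an AVL
 * tree and uses get_next_zio(), whose logic is not shown in this diff.
 */
#include <stdio.h>
#include <stddef.h>

typedef struct fake_zio {
	int	fz_zoneid;
} fake_zio_t;

/* Hypothetical picker: prefer the first entry from a zone other than last. */
static fake_zio_t *
pick_zone_aware(fake_zio_t *q, size_t cnt, int last_zone)
{
	for (size_t i = 0; i < cnt; i++) {
		if (q[i].fz_zoneid != last_zone)
			return (&q[i]);
	}
	return (&q[0]);
}

static fake_zio_t *
pick_next(fake_zio_t *q, size_t cnt, int last_zone, int enabled,
    size_t thresh)
{
	/* Shallow queue, or zone scheduling disabled: plain FIFO. */
	if (!enabled || cnt < thresh)
		return (&q[0]);
	return (pick_zone_aware(q, cnt, last_zone));
}

int
main(void)
{
	fake_zio_t q[] = {
		{ .fz_zoneid = 7 }, { .fz_zoneid = 7 }, { .fz_zoneid = 12 }
	};
	fake_zio_t *zp;

	/* Queue is at the threshold, so skip past zone 7, which just ran. */
	zp = pick_next(q, 3, 7, 1, 3);
	(void) printf("picked zio from zone %d\n", zp->fz_zoneid);
	return (0);
}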
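Finally, the zvol_read()/zvol_write() changes bucket each operation's latency cumulatively: anything at or above 10ms bumps zv_10ms_ops, anything at or above 100ms also bumps zv_100ms_ops, and so on, so a 12-second write increments all four counters. The helper below is an assumed refactoring of that if/else ladder, using plain counters rather than the per-zone kstats updated with atomic_inc_64():

/*
 * Sketch of the cumulative latency buckets used in zvol_read() and
 * zvol_write().  The struct and field names are illustrative; the kernel
 * keeps these as per-zone kstat values.
 */
#include <stdio.h>
#include <stdint.h>

#define	VOP_LATENCY_10MS	10000000LL	/* nanoseconds */
#define	VOP_LATENCY_100MS	100000000LL
#define	VOP_LATENCY_1S		1000000000LL
#define	VOP_LATENCY_10S		10000000000LL

typedef struct latency_buckets {
	uint64_t lb_10ms_ops;	/* ops that took at least 10ms */
	uint64_t lb_100ms_ops;	/* ops that took at least 100ms */
	uint64_t lb_1s_ops;	/* ops that took at least 1s */
	uint64_t lb_10s_ops;	/* ops that took at least 10s */
} latency_buckets_t;

static void
bucket_latency(latency_buckets_t *lb, long long lat)
{
	/* Buckets are cumulative: a slow op counts in every lower bucket. */
	if (lat < VOP_LATENCY_10MS)
		return;
	lb->lb_10ms_ops++;
	if (lat < VOP_LATENCY_100MS)
		return;
	lb->lb_100ms_ops++;
	if (lat < VOP_LATENCY_1S)
		return;
	lb->lb_1s_ops++;
	if (lat < VOP_LATENCY_10S)
		return;
	lb->lb_10s_ops++;
}

int
main(void)
{
	latency_buckets_t lb = { 0 };

	bucket_latency(&lb, 5000000LL);		/* 5ms: no bucket */
	bucket_latency(&lb, 250000000LL);	/* 250ms: 10ms and 100ms */
	bucket_latency(&lb, 12000000000LL);	/* 12s: all four buckets */

	(void) printf("10ms=%llu 100ms=%llu 1s=%llu 10s=%llu\n",
	    (unsigned long long)lb.lb_10ms_ops,
	    (unsigned long long)lb.lb_100ms_ops,
	    (unsigned long long)lb.lb_1s_ops,
	    (unsigned long long)lb.lb_10s_ops);
	return (0);
}

Keeping the buckets cumulative means a consumer can read "operations slower than N" straight from a single counter instead of summing the tail of a histogram.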