Diffstat (limited to 'usr/src/uts/common/fs')
39 files changed, 8813 insertions, 99 deletions
diff --git a/usr/src/uts/common/fs/dev/sdev_vnops.c b/usr/src/uts/common/fs/dev/sdev_vnops.c index fb1d93d06b..89c5decbf0 100644 --- a/usr/src/uts/common/fs/dev/sdev_vnops.c +++ b/usr/src/uts/common/fs/dev/sdev_vnops.c @@ -1142,9 +1142,21 @@ sdev_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, int *eofp, struct sdev_node *parent = VTOSDEV(dvp); int error; - /* execute access is required to search the directory */ - if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0) - return (error); + /* + * We must check that we have execute access to search the directory -- + * but because our sdev_contents lock is already held as a reader (the + * caller must have done a VOP_RWLOCK()), we call directly into the + * underlying access routine if sdev_attr is non-NULL. + */ + if (parent->sdev_attr != NULL) { + VERIFY(RW_READ_HELD(&parent->sdev_contents)); + + if (sdev_unlocked_access(parent, VEXEC, cred) != 0) + return (EACCES); + } else { + if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0) + return (error); + } ASSERT(parent); if (!SDEV_IS_GLOBAL(parent)) diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c new file mode 100644 index 0000000000..f7f2944a2e --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c @@ -0,0 +1,640 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2012, Joyent, Inc. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/stat.h> +#include <sys/policy.h> +#include <sys/fs/hyprlofs_info.h> + +static int hldir_make_hlnode(hlnode_t *, hlfsmount_t *, vattr_t *, enum de_op, + vnode_t *, hlnode_t **, cred_t *); +static int hldiraddentry(hlnode_t *, hlnode_t *, char *); + + +#define HL_HASH_SIZE 8192 /* must be power of 2 */ +#define HL_MUTEX_SIZE 64 + +static hldirent_t *hl_hashtable[HL_HASH_SIZE]; +static kmutex_t hl_hashmutex[HL_MUTEX_SIZE]; + +#define HL_HASH_INDEX(a) ((a) & (HL_HASH_SIZE-1)) +#define HL_MUTEX_INDEX(a) ((a) & (HL_MUTEX_SIZE-1)) + +#define HYPRLOFS_HASH(tp, name, hash) \ + { \ + char Xc, *Xcp; \ + hash = (uint_t)(uintptr_t)(tp) >> 8; \ + for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++) \ + hash = (hash << 4) + hash + (uint_t)Xc; \ + } + +void +hyprlofs_hash_init(void) +{ + int ix; + + for (ix = 0; ix < HL_MUTEX_SIZE; ix++) + mutex_init(&hl_hashmutex[ix], NULL, MUTEX_DEFAULT, NULL); +} + +static void +hyprlofs_hash_in(hldirent_t *h) +{ + uint_t hash; + hldirent_t **prevpp; + kmutex_t *hmtx; + + HYPRLOFS_HASH(h->hld_parent, h->hld_name, hash); + h->hld_hash = hash; + prevpp = &hl_hashtable[HL_HASH_INDEX(hash)]; + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + h->hld_link = *prevpp; + *prevpp = h; + mutex_exit(hmtx); +} + +/* Remove hldirent *h from the hash list. */ +static void +hyprlofs_hash_out(hldirent_t *h) +{ + uint_t hash; + hldirent_t **prevpp; + kmutex_t *hmtx; + + hash = h->hld_hash; + prevpp = &hl_hashtable[HL_HASH_INDEX(hash)]; + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + while (*prevpp != h) + prevpp = &(*prevpp)->hld_link; + *prevpp = h->hld_link; + mutex_exit(hmtx); +} + +static hldirent_t * +hyprlofs_hash_lookup(char *name, hlnode_t *parent, uint_t hold, + hlnode_t **found) +{ + hldirent_t *l; + uint_t hash; + kmutex_t *hmtx; + hlnode_t *hnp; + + HYPRLOFS_HASH(parent, name, hash); + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + l = hl_hashtable[HL_HASH_INDEX(hash)]; + while (l) { + if (l->hld_hash == hash && l->hld_parent == parent && + strcmp(l->hld_name, name) == 0) { + /* + * Ensure that the hlnode that we put a hold on is the + * same one that we pass back. Thus the temp. var + * hnp is necessary. + */ + hnp = l->hld_hlnode; + if (hold) { + ASSERT(hnp); + hlnode_hold(hnp); + } + if (found) + *found = hnp; + mutex_exit(hmtx); + return (l); + } else { + l = l->hld_link; + } + } + mutex_exit(hmtx); + return (NULL); +} + +/* + * Search directory 'parent' for entry 'name'. + * + * The calling thread can't hold the write version of the rwlock for the + * directory being searched + * + * On success *foundtp points to the found hlnode with its vnode held. + */ +int +hyprlofs_dirlookup(hlnode_t *parent, char *name, hlnode_t **foundtp, cred_t *cr) +{ + int error; + + *foundtp = NULL; + if (parent->hln_type != VDIR) + return (ENOTDIR); + + if ((error = hyprlofs_taccess(parent, VEXEC, cr))) + return (error); + + if (*name == '\0') { + hlnode_hold(parent); + *foundtp = parent; + return (0); + } + + /* + * Search the directory for the matching name. We need the lock + * protecting the hln_dir list so that it doesn't change out from + * underneath us. hyprlofs_hash_lookup() will pass back the hlnode + * with a hold on it. 
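+ *
+ * The hold taken here is the one handed back to our caller via *foundtp;
+ * callers are expected to drop it with hlnode_rele() once they are done
+ * with the node, roughly (an illustrative sketch, not part of this change):
+ *
+ *	hlnode_t *hp;
+ *
+ *	if (hyprlofs_dirlookup(parent, name, &hp, cr) == 0) {
+ *		... operate on HLNTOV(hp) ...
+ *		hlnode_rele(hp);
+ *	}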
+ */ + if (hyprlofs_hash_lookup(name, parent, 1, foundtp) != NULL) { + ASSERT(*foundtp); + return (0); + } + + return (ENOENT); +} + +/* + * Enter a directory entry (either a file or subdir, depending on op) for + * 'name' and 'hp' into directory 'dir' + */ +int +hyprlofs_direnter( + hlfsmount_t *hm, + hlnode_t *dir, /* target directory to make entry in */ + char *name, /* name of entry */ + enum de_op op, /* entry operation */ + vnode_t *realvp, /* real vnode */ + vattr_t *va, + hlnode_t **hpp, /* return hlnode */ + cred_t *cr) +{ + hldirent_t *hdp; + hlnode_t *found = NULL; + hlnode_t *hp; + int error = 0; + char *s; + + /* hln_rwlock is held to serialize direnter and dirdeletes */ + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + /* Don't allow '/' characters in pathname component */ + for (s = name; *s; s++) + if (*s == '/') + return (EACCES); + + if (name[0] == '\0') + panic("hyprlofs_direnter: NULL name"); + + /* + * This might be a "dangling detached directory". It could have been + * removed, but a reference to it kept in u_cwd. Don't bother searching + * it, and with any luck the user will get tired of dealing with us and + * cd to some absolute pathway. This is in ufs, too. + */ + if (dir->hln_nlink == 0) { + return (ENOENT); + } + + /* Search for the entry. Return "found" if it exists. */ + hdp = hyprlofs_hash_lookup(name, dir, 1, &found); + + if (hdp) { + ASSERT(found); + switch (op) { + case DE_CREATE: + case DE_MKDIR: + if (hpp) { + *hpp = found; + error = EEXIST; + } else { + hlnode_rele(found); + } + break; + } + } else { + + /* + * The entry does not exist. Check write perms in dir to see if + * entry can be created. + */ + if ((error = hyprlofs_taccess(dir, VWRITE, cr))) + return (error); + + /* Make new hlnode and directory entry as required. */ + if ((error = hldir_make_hlnode(dir, hm, va, op, realvp, &hp, + cr))) + return (error); + + if ((error = hldiraddentry(dir, hp, name))) { + /* Unmake the inode we just made. */ + rw_enter(&hp->hln_rwlock, RW_WRITER); + if ((hp->hln_type) == VDIR) { + ASSERT(hdp == NULL); + /* cleanup allocs made by hyprlofs_dirinit() */ + hyprlofs_dirtrunc(hp); + } + mutex_enter(&hp->hln_tlock); + hp->hln_nlink = 0; + mutex_exit(&hp->hln_tlock); + gethrestime(&hp->hln_ctime); + rw_exit(&hp->hln_rwlock); + hlnode_rele(hp); + hp = NULL; + } else if (hpp) { + *hpp = hp; + } else { + hlnode_rele(hp); + } + } + + return (error); +} + +/* + * Delete entry hp of name "nm" from dir. Free dir entry space and decrement + * link count on hlnode(s). + */ +int +hyprlofs_dirdelete(hlnode_t *dir, hlnode_t *hp, char *nm, enum dr_op op, + cred_t *cr) +{ + hldirent_t *hpdp; + int error; + size_t namelen; + hlnode_t *hnp; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(RW_WRITE_HELD(&hp->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + if (nm[0] == '\0') + panic("hyprlofs_dirdelete: NULL name for %p", (void *)hp); + + /* return error if removing . or .. */ + if (nm[0] == '.') { + if (nm[1] == '\0') + return (EINVAL); + if (nm[1] == '.' && nm[2] == '\0') + return (EEXIST); /* thus in ufs */ + } + + if (error = hyprlofs_taccess(dir, VEXEC|VWRITE, cr)) + return (error); + + if (dir->hln_dir == NULL) + return (ENOENT); + + hpdp = hyprlofs_hash_lookup(nm, dir, 0, &hnp); + if (hpdp == NULL) { + /* + * If it is gone, some other thread got here first! + * Return error ENOENT. 
+ */ + return (ENOENT); + } + + /* + * If the hlnode in the hldirent changed (shouldn't happen since we + * don't support rename) then original is gone, so return that status + * (same as UFS). + */ + if (hp != hnp) + return (ENOENT); + + hyprlofs_hash_out(hpdp); + + /* Take hpdp out of the directory list. */ + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + if (hpdp->hld_prev) { + hpdp->hld_prev->hld_next = hpdp->hld_next; + } + if (hpdp->hld_next) { + hpdp->hld_next->hld_prev = hpdp->hld_prev; + } + + /* + * If the roving slot pointer happens to match hpdp, point it at the + * previous dirent. + */ + if (dir->hln_dir->hld_prev == hpdp) { + dir->hln_dir->hld_prev = hpdp->hld_prev; + } + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + /* hpdp points to the correct directory entry */ + namelen = strlen(hpdp->hld_name) + 1; + + hyprlofs_memfree(hpdp, sizeof (hldirent_t) + namelen); + dir->hln_size -= (sizeof (hldirent_t) + namelen); + dir->hln_dirents--; + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + hp->hln_ctime = now; + + ASSERT(hp->hln_nlink > 0); + DECR_COUNT(&hp->hln_nlink, &hp->hln_tlock); + if (op == DR_RMDIR && hp->hln_type == VDIR) { + hyprlofs_dirtrunc(hp); + ASSERT(hp->hln_nlink == 0); + } + return (0); +} + +/* + * hyprlofs_dirinit initializes a dir with '.' and '..' entries without + * checking perms and locking + */ +void +hyprlofs_dirinit( + hlnode_t *parent, /* parent of directory to initialize */ + hlnode_t *dir) /* the new directory */ +{ + hldirent_t *dot, *dotdot; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&parent->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + dot = hyprlofs_memalloc(sizeof (hldirent_t) + 2, HL_MUSTHAVE); + dotdot = hyprlofs_memalloc(sizeof (hldirent_t) + 3, HL_MUSTHAVE); + + /* Initialize the entries */ + dot->hld_hlnode = dir; + dot->hld_offset = 0; + dot->hld_name = (char *)dot + sizeof (hldirent_t); + dot->hld_name[0] = '.'; + dot->hld_parent = dir; + hyprlofs_hash_in(dot); + + dotdot->hld_hlnode = parent; + dotdot->hld_offset = 1; + dotdot->hld_name = (char *)dotdot + sizeof (hldirent_t); + dotdot->hld_name[0] = '.'; + dotdot->hld_name[1] = '.'; + dotdot->hld_parent = dir; + hyprlofs_hash_in(dotdot); + + /* Initialize directory entry list. */ + dot->hld_next = dotdot; + dot->hld_prev = dotdot; + dotdot->hld_next = NULL; + dotdot->hld_prev = dot; + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + /* + * Since hyprlofs_dirinit is called with both dir and parent being the + * same for the root vnode, we need to increment this before we set + * hln_nlink = 2 below. + */ + INCR_COUNT(&parent->hln_nlink, &parent->hln_tlock); + parent->hln_ctime = now; + + dir->hln_dir = dot; + dir->hln_size = 2 * sizeof (hldirent_t) + 5; /* dot and dotdot */ + dir->hln_dirents = 2; + dir->hln_nlink = 2; +} + + +/* + * hyprlofs_dirtrunc removes all dir entries under this dir. + */ +void +hyprlofs_dirtrunc(hlnode_t *dir) +{ + hldirent_t *hdp; + hlnode_t *tp; + size_t namelen; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + if (dir->hln_looped) + return; + + for (hdp = dir->hln_dir; hdp; hdp = dir->hln_dir) { + ASSERT(hdp->hld_next != hdp); + ASSERT(hdp->hld_prev != hdp); + ASSERT(hdp->hld_hlnode); + + dir->hln_dir = hdp->hld_next; + namelen = strlen(hdp->hld_name) + 1; + + /* + * Adjust the link counts to account for this dir entry removal. 
+ */ + tp = hdp->hld_hlnode; + + ASSERT(tp->hln_nlink > 0); + DECR_COUNT(&tp->hln_nlink, &tp->hln_tlock); + + hyprlofs_hash_out(hdp); + + hyprlofs_memfree(hdp, sizeof (hldirent_t) + namelen); + dir->hln_size -= (sizeof (hldirent_t) + namelen); + dir->hln_dirents--; + } + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + ASSERT(dir->hln_dir == NULL); + ASSERT(dir->hln_size == 0); + ASSERT(dir->hln_dirents == 0); +} + +static int +hldiraddentry( + hlnode_t *dir, /* target directory to make entry in */ + hlnode_t *hp, /* new hlnode */ + char *name) +{ + hldirent_t *hdp, *hpdp; + size_t namelen, alloc_size; + timestruc_t now; + + /* + * Make sure the parent dir wasn't removed from underneath the caller. + */ + if (dir->hln_dir == NULL) + return (ENOENT); + + /* Check that everything is on the same FS. */ + if (hp->hln_vnode->v_vfsp != dir->hln_vnode->v_vfsp) + return (EXDEV); + + /* Alloc and init dir entry */ + namelen = strlen(name) + 1; + alloc_size = namelen + sizeof (hldirent_t); + hdp = hyprlofs_memalloc(alloc_size, 0); + if (hdp == NULL) + return (ENOSPC); + + dir->hln_size += alloc_size; + dir->hln_dirents++; + hdp->hld_hlnode = hp; + hdp->hld_parent = dir; + + /* The dir entry and its name were allocated sequentially. */ + hdp->hld_name = (char *)hdp + sizeof (hldirent_t); + (void) strcpy(hdp->hld_name, name); + + hyprlofs_hash_in(hdp); + + /* + * Some utilities expect the size of a directory to remain fairly + * static. For example, a routine which unlinks files between calls to + * readdir(); the size of the dir changes from underneath it and so the + * real dir offset in bytes is invalid. To circumvent this problem, we + * initialize a dir entry with a phony offset, and use this offset to + * determine end of file in hyprlofs_readdir. + */ + hpdp = dir->hln_dir->hld_prev; + /* + * Install at first empty "slot" in directory list. + */ + while (hpdp->hld_next != NULL && (hpdp->hld_next->hld_offset - + hpdp->hld_offset) <= 1) { + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + ASSERT(hpdp->hld_next->hld_offset > hpdp->hld_offset); + hpdp = hpdp->hld_next; + } + hdp->hld_offset = hpdp->hld_offset + 1; + + /* + * If we're at the end of the dirent list and the offset (which is + * necessarily the largest offset in this dir) is more than twice the + * number of dirents, that means the dir is 50% holes. At this point + * we reset the slot pointer back to the beginning of the dir so we + * start using the holes. The idea is that if there are N dirents, + * there must also be N holes, so we can satisfy the next N creates by + * walking at most 2N entries; thus the average cost of a create is + * constant. Note that we use the first dirent's hld_prev as the roving + * slot pointer. This saves a word in every dirent. 
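+ *
+ * As a made-up illustration: with hln_dirents == 4 and a largest offset of
+ * 10, the check below (10 > 2 * 4) resets the roving pointer back toward the
+ * head of the list, so the next few creates fill the existing holes instead
+ * of pushing the largest offset any higher.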
+ */ + if (hpdp->hld_next == NULL && hpdp->hld_offset > 2 * dir->hln_dirents) + dir->hln_dir->hld_prev = dir->hln_dir->hld_next; + else + dir->hln_dir->hld_prev = hdp; + + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + hdp->hld_next = hpdp->hld_next; + if (hdp->hld_next) { + hdp->hld_next->hld_prev = hdp; + } + hdp->hld_prev = hpdp; + hpdp->hld_next = hdp; + + ASSERT(hdp->hld_next != hdp); + ASSERT(hdp->hld_prev != hdp); + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + return (0); +} + +static int +hldir_make_hlnode(hlnode_t *dir, hlfsmount_t *hm, vattr_t *va, enum de_op op, + vnode_t *realvp, hlnode_t **newnode, cred_t *cr) +{ + hlnode_t *hp; + enum vtype type; + + ASSERT(va != NULL); + ASSERT(op == DE_CREATE || op == DE_MKDIR); + if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) || + ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime))) + return (EOVERFLOW); + type = va->va_type; + hp = hyprlofs_memalloc(sizeof (hlnode_t), HL_MUSTHAVE); + hyprlofs_node_init(hm, hp, va, cr); + + hp->hln_vnode->v_rdev = hp->hln_rdev = NODEV; + hp->hln_vnode->v_type = type; + hp->hln_uid = crgetuid(cr); + + /* + * To determine the gid of the created file: + * If the directory's set-gid bit is set, set the gid to the gid + * of the parent dir, otherwise, use the process's gid. + */ + if (dir->hln_mode & VSGID) + hp->hln_gid = dir->hln_gid; + else + hp->hln_gid = crgetgid(cr); + + /* + * If we're creating a dir and the parent dir has the set-GID bit set, + * set it on the new dir. Otherwise, if the user is neither privileged + * nor a member of the file's new group, clear the file's set-GID bit. + */ + if (dir->hln_mode & VSGID && type == VDIR) + hp->hln_mode |= VSGID; + else { + if ((hp->hln_mode & VSGID) && + secpolicy_vnode_setids_setgids(cr, hp->hln_gid) != 0) + hp->hln_mode &= ~VSGID; + } + + if (va->va_mask & AT_ATIME) + hp->hln_atime = va->va_atime; + if (va->va_mask & AT_MTIME) + hp->hln_mtime = va->va_mtime; + + if (op == DE_MKDIR) { + hyprlofs_dirinit(dir, hp); + hp->hln_looped = 0; + } else { + hp->hln_realvp = realvp; + hp->hln_size = va->va_size; + hp->hln_looped = 1; + } + + *newnode = hp; + return (0); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c new file mode 100644 index 0000000000..bf71b2bfcb --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c @@ -0,0 +1,154 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/debug.h> +#include <sys/time.h> +#include <sys/cmn_err.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/vfs.h> +#include <sys/cred.h> +#include <sys/kmem.h> +#include <sys/atomic.h> +#include <sys/policy.h> +#include <sys/fs/hyprlofs_info.h> + +#define MODESHIFT 3 + +/* Initialize a hlnode and add it to file list under mount point. */ +void +hyprlofs_node_init(hlfsmount_t *hm, hlnode_t *h, vattr_t *vap, cred_t *cr) +{ + vnode_t *vp; + timestruc_t now; + + ASSERT(vap != NULL); + + rw_init(&h->hln_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&h->hln_tlock, NULL, MUTEX_DEFAULT, NULL); + h->hln_mode = MAKEIMODE(vap->va_type, vap->va_mode); + h->hln_mask = 0; + h->hln_type = vap->va_type; + h->hln_nodeid = (ino64_t)(uint32_t)((uintptr_t)h >> 3); + h->hln_nlink = 1; + h->hln_size = 0; + + if (cr == NULL) { + h->hln_uid = vap->va_uid; + h->hln_gid = vap->va_gid; + } else { + h->hln_uid = crgetuid(cr); + h->hln_gid = crgetgid(cr); + } + + h->hln_fsid = hm->hlm_dev; + h->hln_rdev = vap->va_rdev; + h->hln_blksize = PAGESIZE; + h->hln_nblocks = 0; + gethrestime(&now); + h->hln_atime = now; + h->hln_mtime = now; + h->hln_ctime = now; + h->hln_seq = 0; + h->hln_dir = NULL; + + h->hln_vnode = vn_alloc(KM_SLEEP); + vp = HLNTOV(h); + vn_setops(vp, hyprlofs_vnodeops); + vp->v_vfsp = hm->hlm_vfsp; + vp->v_type = vap->va_type; + vp->v_rdev = vap->va_rdev; + vp->v_data = (caddr_t)h; + mutex_enter(&hm->hlm_contents); + /* + * Increment the pseudo generation number for this hlnode. Since + * hlnodes are allocated and freed, there really is no particular + * generation number for a new hlnode. Just fake it by using a + * counter in each file system. + */ + h->hln_gen = hm->hlm_gen++; + + /* + * Add new hlnode to end of linked list of hlnodes for this hyprlofs + * Root dir is handled specially in hyprlofs_mount. + */ + if (hm->hlm_rootnode != (hlnode_t *)NULL) { + h->hln_forw = NULL; + h->hln_back = hm->hlm_rootnode->hln_back; + h->hln_back->hln_forw = hm->hlm_rootnode->hln_back = h; + } + mutex_exit(&hm->hlm_contents); + vn_exists(vp); +} + +int +hyprlofs_taccess(void *vtp, int mode, cred_t *cr) +{ + hlnode_t *hp = vtp; + int shift = 0; + + /* Check access based on owner, group and public perms in hlnode. */ + if (crgetuid(cr) != hp->hln_uid) { + shift += MODESHIFT; + if (groupmember(hp->hln_gid, cr) == 0) + shift += MODESHIFT; + } + + return (secpolicy_vnode_access2(cr, HLNTOV(hp), hp->hln_uid, + hp->hln_mode << shift, mode)); +} + +/* + * Allocate zeroed memory if hyprlofs_maxkmem has not been exceeded or the + * 'musthave' flag is set. 'musthave' allocations should always be subordinate + * to normal allocations so that hyprlofs_maxkmem can't be exceeded by more + * than a few KB. E.g. when creating a new dir, the hlnode is a normal + * allocation; if that succeeds, the dirents for "." and ".." are 'musthave' + * allocations. 
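+ *
+ * Note that every successful allocation must eventually be paired with a
+ * hyprlofs_memfree() of the same size, since hyprlofs_kmemspace is adjusted
+ * by 'size' on both paths; e.g. (illustrative only):
+ *
+ *	hdp = hyprlofs_memalloc(sizeof (hldirent_t) + namelen, 0);
+ *	...
+ *	hyprlofs_memfree(hdp, sizeof (hldirent_t) + namelen);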
+ */ +void * +hyprlofs_memalloc(size_t size, int musthave) +{ + if (atomic_add_long_nv(&hyprlofs_kmemspace, size) < hyprlofs_maxkmem || + musthave) + return (kmem_zalloc(size, KM_SLEEP)); + + atomic_add_long(&hyprlofs_kmemspace, -size); + cmn_err(CE_WARN, "hyprlofs over memory limit"); + return (NULL); +} + +void +hyprlofs_memfree(void *cp, size_t size) +{ + kmem_free(cp, size); + atomic_add_long(&hyprlofs_kmemspace, -size); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c new file mode 100644 index 0000000000..afe76d0629 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c @@ -0,0 +1,625 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +/* + * Hyperlofs is a hybrid file system combining features of the tmpfs(7FS) and + * lofs(7FS) file systems. It is modeled on code from both of these file + * systems. + * + * The purpose is to create a high performance name space for files on which + * applications will compute. Given a large number of data files with various + * owners, we want to construct a view onto those files such that only a subset + * is visible to the applications and such that the view can be changed very + * quickly as compute progresses. Entries in the name space are not mounts and + * thus do not appear in the mnttab. Entries in the name space are allowed to + * refer to files on different backing file systems. Intermediate directories + * in the name space exist only in-memory, ala tmpfs. There are no leaf nodes + * in the name space except for entries that refer to backing files ala lofs. + * + * The name space is managed via ioctls issued on the mounted file system and + * is mostly read-only for the compute applications. That is, applications + * cannot create new files in the name space. If a file is unlinked by an + * application, that only removes the file from the name space, the backing + * file remains in place. It is possible for applications to write-through to + * the backing files if the file system is mounted read-write. + * + * The name space is managed via the HYPRLOFS_ADD_ENTRIES, HYPRLOFS_RM_ENTRIES, + * and HYPRLOFS_RM_ALL ioctls on the top-level mount. + * + * The HYPRLOFS_ADD_ENTRIES ioctl specifies path(s) to the backing file(s) and + * the name(s) for the file(s) in the name space. The name(s) may be path(s) + * which will be relative to the root of the mount and thus cannot begin with + * a /. If the name is a path, it does not have to correspond to any backing + * path. The intermediate directories will only exist in the name space. The + * entry(ies) will be added to the name space. 
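+ *
+ * As an illustrative userland sketch (field names taken from the ioctl
+ * handler below; struct layout and error handling omitted), looping
+ * /data/file1 into the name space as "a/b/file1" through a descriptor fd
+ * opened on the mount root might look like:
+ *
+ *	hyprlofs_entry_t e = {
+ *		.hle_name = "a/b/file1",
+ *		.hle_nlen = strlen("a/b/file1"),
+ *		.hle_path = "/data/file1",
+ *		.hle_plen = strlen("/data/file1")
+ *	};
+ *	hyprlofs_entries_t eb = { .hle_len = 1, .hle_entries = &e };
+ *
+ *	(void) ioctl(fd, HYPRLOFS_ADD_ENTRIES, &eb);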
+ * + * The HYPRLOFS_RM_ENTRIES ioctl specifies the name(s) of the file(s) in the + * name space which should be removed. The name(s) may be path(s) which will + * be relative to the root of the mount and thus cannot begin with a /. The + * named entry(ies) will be removed. + * + * The HYPRLOFS_RM_ALL ioctl will remove all mappings from the name space. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/time.h> +#include <sys/pathname.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/stat.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/statvfs.h> +#include <sys/mount.h> +#include <sys/debug.h> +#include <sys/systm.h> +#include <sys/mntent.h> +#include <fs/fs_subr.h> +#include <vm/page.h> +#include <vm/anon.h> +#include <sys/model.h> +#include <sys/policy.h> + +#include <sys/fs/swapnode.h> +#include <sys/fs/hyprlofs_info.h> + +static int hyprlofsfstype; + +/* + * hyprlofs vfs operations. + */ +static int hyprlofsinit(int, char *); +static int hyprlofs_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *); +static int hyprlofs_unmount(vfs_t *, int, cred_t *); +static int hyprlofs_root(vfs_t *, vnode_t **); +static int hyprlofs_statvfs(vfs_t *, struct statvfs64 *); +static int hyprlofs_vget(vfs_t *, vnode_t **, struct fid *); + +/* + * Loadable module wrapper + */ +#include <sys/modctl.h> + +static mntopts_t hyprlofs_mntopts; + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "hyprlofs", + hyprlofsinit, + VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT, + &hyprlofs_mntopts +}; + +static mntopts_t hyprlofs_mntopts = { + 0, NULL +}; + +/* + * Module linkage information + */ +static struct modlfs modlfs = { + &mod_fsops, "filesystem for hyprlofs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modlfs, NULL +}; + +int +_init() +{ + return (mod_install(&modlinkage)); +} + +int +_fini() +{ + int error; + + error = mod_remove(&modlinkage); + if (error) + return (error); + /* + * Tear down the operations vectors + */ + (void) vfs_freevfsops_by_type(hyprlofsfstype); + vn_freevnodeops(hyprlofs_vnodeops); + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* + * The following are patchable variables limiting the amount of system + * resources hyprlofs can use. + * + * hyprlofs_maxkmem limits the amount of kernel kmem_alloc memory hyprlofs can + * use for it's data structures (e.g. hlnodes, directory entries). It is set + * as a percentage of physical memory which is determined when hyprlofs is + * first used in the system. + * + * hyprlofs_minfree is the minimum amount of swap space that hyprlofs leaves for + * the rest of the system. If the amount of free swap space in the system + * (i.e. anoninfo.ani_free) drops below hyprlofs_minfree, hyprlofs anon + * allocations will fail. 
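+ *
+ * For example, hyprlofs_minfree pages are held back from the space reported
+ * by hyprlofs_statvfs(): f_bfree is the currently available swap minus
+ * hyprlofs_minfree, or zero if less than hyprlofs_minfree remains.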
+ */ +size_t hyprlofs_maxkmem = 0; +size_t hyprlofs_minfree = 0; +size_t hyprlofs_kmemspace; /* bytes of kernel heap used by all hyprlofs */ + +static major_t hyprlofs_major; +static minor_t hyprlofs_minor; +static kmutex_t hyprlofs_minor_lock; + +/* + * initialize global hyprlofs locks and hashes when loading hyprlofs module + */ +static int +hyprlofsinit(int fstype, char *name) +{ + static const fs_operation_def_t hl_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = hyprlofs_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = hyprlofs_unmount }, + VFSNAME_ROOT, { .vfs_root = hyprlofs_root }, + VFSNAME_STATVFS, { .vfs_statvfs = hyprlofs_statvfs }, + VFSNAME_VGET, { .vfs_vget = hyprlofs_vget }, + NULL, NULL + }; + int error; + extern void hyprlofs_hash_init(); + + hyprlofs_hash_init(); + hyprlofsfstype = fstype; + ASSERT(hyprlofsfstype != 0); + + error = vfs_setfsops(fstype, hl_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "hyprlofsinit: bad vfs ops template"); + return (error); + } + + error = vn_make_ops(name, hyprlofs_vnodeops_template, + &hyprlofs_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "hyprlofsinit: bad vnode ops template"); + return (error); + } + + /* + * hyprlofs_minfree is an absolute limit of swap space which still + * allows other processes to execute. Set it if its not patched. + */ + if (hyprlofs_minfree == 0) + hyprlofs_minfree = btopr(HYPRLOFSMINFREE); + + /* + * The maximum amount of space hyprlofs can allocate is + * HYPRLOFSMAXPROCKMEM percent of kernel memory + */ + if (hyprlofs_maxkmem == 0) + hyprlofs_maxkmem = + MAX(PAGESIZE, kmem_maxavail() / HYPRLOFSMAXFRACKMEM); + + if ((hyprlofs_major = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, + "hyprlofsinit: Can't get unique device number."); + hyprlofs_major = 0; + } + mutex_init(&hyprlofs_minor_lock, NULL, MUTEX_DEFAULT, NULL); + return (0); +} + +static int +hyprlofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + hlfsmount_t *hm = NULL; + hlnode_t *hp; + struct pathname dpn; + int error; + vattr_t rattr; + int got_attrs; + + if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) + return (error); + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (uap->flags & MS_REMOUNT) + return (EBUSY); + + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* Having the resource be anything but "swap" doesn't make sense. */ + vfs_setresource(vfsp, "swap", 0); + + if (error = pn_get(uap->dir, + (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn)) + goto out; + + if ((hm = hyprlofs_memalloc(sizeof (hlfsmount_t), 0)) == NULL) { + pn_free(&dpn); + error = ENOMEM; + goto out; + } + + /* Get an available minor device number for this mount */ + mutex_enter(&hyprlofs_minor_lock); + do { + hyprlofs_minor = (hyprlofs_minor + 1) & L_MAXMIN32; + hm->hlm_dev = makedevice(hyprlofs_major, hyprlofs_minor); + } while (vfs_devismounted(hm->hlm_dev)); + mutex_exit(&hyprlofs_minor_lock); + + /* + * Set but don't bother entering the mutex since hlfsmount is not on + * the mount list yet. 
+ */ + mutex_init(&hm->hlm_contents, NULL, MUTEX_DEFAULT, NULL); + + hm->hlm_vfsp = vfsp; + + vfsp->vfs_data = (caddr_t)hm; + vfsp->vfs_fstype = hyprlofsfstype; + vfsp->vfs_dev = hm->hlm_dev; + vfsp->vfs_bsize = PAGESIZE; + vfsp->vfs_flag |= VFS_NOTRUNC; + vfs_make_fsid(&vfsp->vfs_fsid, hm->hlm_dev, hyprlofsfstype); + hm->hlm_mntpath = hyprlofs_memalloc(dpn.pn_pathlen + 1, HL_MUSTHAVE); + (void) strcpy(hm->hlm_mntpath, dpn.pn_path); + + /* allocate and initialize root hlnode structure */ + bzero(&rattr, sizeof (vattr_t)); + rattr.va_mode = (mode_t)(S_IFDIR | 0777); + rattr.va_type = VDIR; + rattr.va_rdev = 0; + hp = hyprlofs_memalloc(sizeof (hlnode_t), HL_MUSTHAVE); + hyprlofs_node_init(hm, hp, &rattr, cr); + + /* Get the mode, uid, and gid from the underlying mount point. */ + rattr.va_mask = AT_MODE|AT_UID|AT_GID; + got_attrs = VOP_GETATTR(mvp, &rattr, 0, cr, NULL); + + rw_enter(&hp->hln_rwlock, RW_WRITER); + HLNTOV(hp)->v_flag |= VROOT; + + /* + * If the getattr succeeded, use its results, otherwise allow the + * previously set defaults to prevail. + */ + if (got_attrs == 0) { + hp->hln_mode = rattr.va_mode; + hp->hln_uid = rattr.va_uid; + hp->hln_gid = rattr.va_gid; + } + + /* + * Initialize linked list of hlnodes so that the back pointer of the + * root hlnode always points to the last one on the list and the + * forward pointer of the last node is null + */ + hp->hln_back = hp; + hp->hln_forw = NULL; + hp->hln_nlink = 0; + hm->hlm_rootnode = hp; + + hyprlofs_dirinit(hp, hp); + + rw_exit(&hp->hln_rwlock); + + pn_free(&dpn); + error = 0; + +out: + return (error); +} + +static int +hyprlofs_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hnp, *cancel; + vnode_t *vp; + int error; + + if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) + return (error); + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + /* + * forced unmount is not supported by this file system + * and thus, ENOTSUP, is being returned. + */ + if (flag & MS_FORCE) + return (ENOTSUP); + + mutex_enter(&hm->hlm_contents); + + /* + * If there are no open files, only the root node should have a ref cnt. + * With hlm_contents held, nothing can be added or removed. There may + * be some dirty pages. To prevent fsflush from disrupting the unmount, + * put a hold on each node while scanning. If we find a previously + * referenced node, undo the holds we have placed and fail EBUSY. + */ + hnp = hm->hlm_rootnode; + if (HLNTOV(hnp)->v_count > 1) { + mutex_exit(&hm->hlm_contents); + return (EBUSY); + } + + for (hnp = hnp->hln_forw; hnp; hnp = hnp->hln_forw) { + if ((vp = HLNTOV(hnp))->v_count > 0) { + cancel = hm->hlm_rootnode->hln_forw; + while (cancel != hnp) { + vp = HLNTOV(cancel); + ASSERT(vp->v_count > 0); + VN_RELE(vp); + cancel = cancel->hln_forw; + } + mutex_exit(&hm->hlm_contents); + return (EBUSY); + } + VN_HOLD(vp); + } + + /* We can drop the mutex now because no one can find this mount */ + mutex_exit(&hm->hlm_contents); + + /* + * Free all alloc'd memory associated with this FS. To do this, we go + * through the file list twice, once to remove all the dir entries, and + * then to remove all the files. + */ + + /* Remove all directory entries */ + for (hnp = hm->hlm_rootnode; hnp; hnp = hnp->hln_forw) { + rw_enter(&hnp->hln_rwlock, RW_WRITER); + if (hnp->hln_type == VDIR) + hyprlofs_dirtrunc(hnp); + rw_exit(&hnp->hln_rwlock); + } + + ASSERT(hm->hlm_rootnode); + + /* + * All links are gone, v_count is keeping nodes in place. 
VN_RELE + * should make the node disappear, unless somebody is holding pages + * against it. Wait and retry until it disappears. + * + * We re-acquire the lock to prevent others who have a HOLD on a hlnode + * from blowing it away (in hyprlofs_inactive) while we're trying to + * get to it here. Once we have a HOLD on it we know it'll stick around. + */ + mutex_enter(&hm->hlm_contents); + + /* Remove all the files (except the rootnode) backwards. */ + while ((hnp = hm->hlm_rootnode->hln_back) != hm->hlm_rootnode) { + mutex_exit(&hm->hlm_contents); + /* Note we handled the link count in pass 2 above. */ + vp = HLNTOV(hnp); + VN_RELE(vp); + mutex_enter(&hm->hlm_contents); + /* + * It's still there after the RELE. Someone else like pageout + * has a hold on it so wait a bit and then try again. + */ + if (hnp == hm->hlm_rootnode->hln_back) { + VN_HOLD(vp); + mutex_exit(&hm->hlm_contents); + delay(hz / 4); + mutex_enter(&hm->hlm_contents); + } + } + mutex_exit(&hm->hlm_contents); + + VN_RELE(HLNTOV(hm->hlm_rootnode)); + + ASSERT(hm->hlm_mntpath); + + hyprlofs_memfree(hm->hlm_mntpath, strlen(hm->hlm_mntpath) + 1); + + mutex_destroy(&hm->hlm_contents); + hyprlofs_memfree(hm, sizeof (hlfsmount_t)); + + return (0); +} + +/* Return root hlnode for given vnode */ +static int +hyprlofs_root(vfs_t *vfsp, vnode_t **vpp) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hp = hm->hlm_rootnode; + vnode_t *vp; + + ASSERT(hp); + + vp = HLNTOV(hp); + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +hyprlofs_statvfs(vfs_t *vfsp, struct statvfs64 *sbp) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + ulong_t blocks; + dev32_t d32; + zoneid_t eff_zid; + struct zone *zp; + + /* + * The FS may have been mounted by the GZ on behalf of the NGZ. In + * that case, the hlfsmount zone_id will be the global zone. We want + * to show the swap cap inside the zone in this case, even though the + * FS was mounted by the GZ. + */ + if (curproc->p_zone->zone_id != GLOBAL_ZONEUNIQID) + zp = curproc->p_zone; + else + zp = hm->hlm_vfsp->vfs_zone; + + if (zp == NULL) + eff_zid = GLOBAL_ZONEUNIQID; + else + eff_zid = zp->zone_id; + + sbp->f_bsize = PAGESIZE; + sbp->f_frsize = PAGESIZE; + + /* + * Find the amount of available physical and memory swap + */ + mutex_enter(&anoninfo_lock); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP; + mutex_exit(&anoninfo_lock); + + if (blocks > hyprlofs_minfree) + sbp->f_bfree = blocks - hyprlofs_minfree; + else + sbp->f_bfree = 0; + + sbp->f_bavail = sbp->f_bfree; + + /* + * Total number of blocks is what's available plus what's been used + */ + sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree); + + if (eff_zid != GLOBAL_ZONEUNIQID && + zp->zone_max_swap_ctl != UINT64_MAX) { + /* + * If the fs is used by a NGZ with a swap cap, then report the + * capped size. + */ + rctl_qty_t cap, used; + pgcnt_t pgcap, pgused; + + mutex_enter(&zp->zone_mem_lock); + cap = zp->zone_max_swap_ctl; + used = zp->zone_max_swap; + mutex_exit(&zp->zone_mem_lock); + + pgcap = btop(cap); + pgused = btop(used); + + sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree); + sbp->f_bavail = sbp->f_bfree; + sbp->f_blocks = MIN(pgcap, sbp->f_blocks); + } + + /* + * This is fairly inaccurate since it doesn't take into account the + * names stored in the directory entries. 
+ */ + if (hyprlofs_maxkmem > hyprlofs_kmemspace) + sbp->f_ffree = (hyprlofs_maxkmem - hyprlofs_kmemspace) / + (sizeof (hlnode_t) + sizeof (hldirent_t)); + else + sbp->f_ffree = 0; + + sbp->f_files = hyprlofs_maxkmem / + (sizeof (hlnode_t) + sizeof (hldirent_t)); + sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); + (void) cmpldev(&d32, vfsp->vfs_dev); + sbp->f_fsid = d32; + (void) strcpy(sbp->f_basetype, vfssw[hyprlofsfstype].vsw_name); + (void) strncpy(sbp->f_fstr, hm->hlm_mntpath, sizeof (sbp->f_fstr)); + /* + * ensure null termination + */ + sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0'; + sbp->f_flag = vf_to_stf(vfsp->vfs_flag); + sbp->f_namemax = MAXNAMELEN - 1; + return (0); +} + +static int +hyprlofs_vget(vfs_t *vfsp, vnode_t **vpp, struct fid *fidp) +{ + hlfid_t *hfid; + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hp = NULL; + + hfid = (hlfid_t *)fidp; + *vpp = NULL; + + mutex_enter(&hm->hlm_contents); + for (hp = hm->hlm_rootnode; hp; hp = hp->hln_forw) { + mutex_enter(&hp->hln_tlock); + if (hp->hln_nodeid == hfid->hlfid_ino) { + /* + * If the gen numbers don't match we know the file + * won't be found since only one hlnode can have this + * number at a time. + */ + if (hp->hln_gen != hfid->hlfid_gen || + hp->hln_nlink == 0) { + mutex_exit(&hp->hln_tlock); + mutex_exit(&hm->hlm_contents); + return (0); + } + *vpp = (vnode_t *)HLNTOV(hp); + + VN_HOLD(*vpp); + + if ((hp->hln_mode & S_ISVTX) && + !(hp->hln_mode & (S_IXUSR | S_IFDIR))) { + mutex_enter(&(*vpp)->v_lock); + (*vpp)->v_flag |= VISSWAP; + mutex_exit(&(*vpp)->v_lock); + } + mutex_exit(&hp->hln_tlock); + mutex_exit(&hm->hlm_contents); + return (0); + } + mutex_exit(&hp->hln_tlock); + } + mutex_exit(&hm->hlm_contents); + return (0); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c new file mode 100644 index 0000000000..45b7d4db87 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c @@ -0,0 +1,1375 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2012 Joyent, Inc. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/user.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/fcntl.h> +#include <sys/flock.h> +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/cred.h> +#include <sys/dirent.h> +#include <sys/pathname.h> +#include <sys/fs/hyprlofs.h> +#include <sys/fs/hyprlofs_info.h> +#include <sys/mman.h> +#include <vm/pvn.h> +#include <sys/cmn_err.h> +#include <sys/buf.h> +#include <sys/policy.h> +#include <fs/fs_subr.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> + +static int hyprlofs_add_entry(vnode_t *, char *, char *, cred_t *, + caller_context_t *); +static int hyprlofs_rm_entry(vnode_t *, char *, cred_t *, caller_context_t *, + int); +static int hyprlofs_rm_all(vnode_t *, cred_t *, caller_context_t *, int); +static int hyprlofs_remove(vnode_t *, char *, cred_t *, caller_context_t *, + int); +static int hyprlofs_get_all(vnode_t *, intptr_t, cred_t *, caller_context_t *, + int); + +/* + * This is a somewhat arbitrary upper limit on the number of entries we can + * pass in on a single add/rm ioctl call. This is only used to validate that + * the input list looks sane. + */ +#define MAX_IOCTL_PARAMS 100000 + +static int +hyprlofs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + vnode_t *rvp; + int error; + + rvp = REALVP(*vpp); + + if (VTOHLN(*vpp)->hln_looped == 0) + return (0); + + /* + * looped back, pass through to real vnode. Need to hold new reference + * to vp since VOP_OPEN() may decide to release it. + */ + VN_HOLD(rvp); + error = VOP_OPEN(&rvp, flag, cr, ct); + ASSERT(rvp->v_count > 1); + VN_RELE(rvp); + + return (error); +} + +static int +hyprlofs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 0) { + cleanlocks(vp, ttoproc(curthread)->p_pid, 0); + cleanshares(vp, ttoproc(curthread)->p_pid); + return (0); + } + + return (VOP_CLOSE(REALVP(vp), flag, count, offset, cr, ct)); +} + +static int +hyprlofs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + return (VOP_READ(REALVP(vp), uiop, ioflag, cr, ct)); +} + +static int +hyprlofs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + /* We don't support writing to non-regular files */ + if (vp->v_type != VREG) + return (EINVAL); + + if (vn_is_readonly(vp)) + return (EROFS); + + return (VOP_WRITE(REALVP(vp), uiop, ioflag, cr, ct)); +} + +/* ARGSUSED */ +static int +hyprlofs_ioctl(vnode_t *vp, int cmd, intptr_t data, int flag, + cred_t *cr, int *rvalp, caller_context_t *ct) +{ + int len, cnt, error; + int i; + model_t model; + char path[MAXPATHLEN]; + char nm[MAXPATHLEN]; + + /* We only support the hyprlofs ioctls on the root vnode */ + if (!(vp->v_flag & VROOT)) + return (ENOTTY); + + /* + * Check if managing hyprlofs is allowed. 
+ */ + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + if (cmd == HYPRLOFS_ADD_ENTRIES || cmd == HYPRLOFS_RM_ENTRIES) { + model = get_udatamodel(); + + if (model == DATAMODEL_NATIVE) { + hyprlofs_entries_t ebuf; + hyprlofs_entry_t *e; + + if (copyin((void *)data, &ebuf, sizeof (ebuf))) + return (EFAULT); + cnt = ebuf.hle_len; + if (cnt > MAX_IOCTL_PARAMS) + return (EINVAL); + len = sizeof (hyprlofs_entry_t) * cnt; + + e = kmem_alloc(len, KM_SLEEP); + if (copyin((void *)(ebuf.hle_entries), e, len)) { + kmem_free(e, len); + return (EFAULT); + } + + for (i = 0; i < cnt; i++) { + if (e[i].hle_nlen == 0 || + e[i].hle_nlen > MAXPATHLEN) + return (EINVAL); + + if (copyin(e[i].hle_name, nm, e[i].hle_nlen) + != 0) { + kmem_free(e, len); + return (EFAULT); + } + nm[e[i].hle_nlen] = '\0'; + + if (cmd == HYPRLOFS_ADD_ENTRIES) { + if (e[i].hle_plen == 0 || + e[i].hle_plen > MAXPATHLEN) + return (EINVAL); + + if (copyin(e[i].hle_path, path, + e[i].hle_plen) != 0) { + kmem_free(e, len); + return (EFAULT); + } + path[e[i].hle_plen] = '\0'; + + if ((error = hyprlofs_add_entry(vp, + path, nm, cr, ct)) != 0) { + kmem_free(e, len); + return (error); + } + } else { + if ((error = hyprlofs_rm_entry(vp, nm, + cr, ct, flag)) != 0) { + kmem_free(e, len); + return (error); + } + } + } + + kmem_free(e, len); + return (0); + + } else { + hyprlofs_entries32_t ebuf32; + hyprlofs_entry32_t *e32; + + if (copyin((void *)data, &ebuf32, sizeof (ebuf32))) + return (EFAULT); + + cnt = ebuf32.hle_len; + if (cnt > MAX_IOCTL_PARAMS) + return (EINVAL); + len = sizeof (hyprlofs_entry32_t) * cnt; + + e32 = kmem_alloc(len, KM_SLEEP); + if (copyin((void *)(unsigned long)(ebuf32.hle_entries), + e32, len)) { + kmem_free(e32, len); + return (EFAULT); + } + + for (i = 0; i < cnt; i++) { + if (e32[i].hle_nlen == 0 || + e32[i].hle_nlen > MAXPATHLEN) + return (EINVAL); + + if (copyin((void *)(unsigned long) + e32[i].hle_name, nm, + e32[i].hle_nlen) != 0) { + kmem_free(e32, len); + return (EFAULT); + } + nm[e32[i].hle_nlen] = '\0'; + + if (cmd == HYPRLOFS_ADD_ENTRIES) { + if (e32[i].hle_plen == 0 || + e32[i].hle_plen > MAXPATHLEN) + return (EINVAL); + + if (copyin((void *)(unsigned long) + e32[i].hle_path, path, + e32[i].hle_plen) != 0) { + kmem_free(e32, len); + return (EFAULT); + } + path[e32[i].hle_plen] = '\0'; + + if ((error = hyprlofs_add_entry(vp, + path, nm, cr, ct)) != 0) { + kmem_free(e32, len); + return (error); + } + } else { + if ((error = hyprlofs_rm_entry(vp, nm, + cr, ct, flag)) != 0) { + kmem_free(e32, len); + return (error); + } + } + } + + kmem_free(e32, len); + return (0); + } + } + + if (cmd == HYPRLOFS_RM_ALL) { + return (hyprlofs_rm_all(vp, cr, ct, flag)); + } + + if (cmd == HYPRLOFS_GET_ENTRIES) { + return (hyprlofs_get_all(vp, data, cr, ct, flag)); + } + + return (ENOTTY); +} + +/*ARGSUSED2*/ +static int +hyprlofs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + + mutex_enter(&tp->hln_tlock); + vap->va_type = vp->v_type; + vap->va_mode = tp->hln_mode & MODEMASK; + vap->va_uid = tp->hln_uid; + vap->va_gid = tp->hln_gid; + vap->va_fsid = tp->hln_fsid; + vap->va_nodeid = (ino64_t)tp->hln_nodeid; + vap->va_nlink = tp->hln_nlink; + vap->va_size = (u_offset_t)tp->hln_size; + vap->va_atime = tp->hln_atime; + vap->va_mtime = tp->hln_mtime; + vap->va_ctime = tp->hln_ctime; + vap->va_blksize = PAGESIZE; + vap->va_rdev = tp->hln_rdev; + vap->va_seq = tp->hln_seq; + + vap->va_nblocks = (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size))); + 
mutex_exit(&tp->hln_tlock); + return (0); +} + +/*ARGSUSED4*/ +static int +hyprlofs_setattr(vnode_t *vp, vattr_t *vap, int flags, + cred_t *cr, caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + int error = 0; + vattr_t *get; + long mask; + + /* + * Cannot set these attributes + */ + if ((vap->va_mask & AT_NOSET) || (vap->va_mask & AT_XVATTR)) + return (EINVAL); + + mutex_enter(&tp->hln_tlock); + + get = &tp->hln_attr; + /* + * Change file access modes. Must be owner or have sufficient + * privileges. + */ + error = secpolicy_vnode_setattr(cr, vp, vap, get, flags, + hyprlofs_taccess, tp); + + if (error) + goto out; + + mask = vap->va_mask; + + if (mask & AT_MODE) { + get->va_mode &= S_IFMT; + get->va_mode |= vap->va_mode & ~S_IFMT; + } + + if (mask & AT_UID) + get->va_uid = vap->va_uid; + if (mask & AT_GID) + get->va_gid = vap->va_gid; + if (mask & AT_ATIME) + get->va_atime = vap->va_atime; + if (mask & AT_MTIME) + get->va_mtime = vap->va_mtime; + + if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME)) + gethrestime(&tp->hln_ctime); + +out: + mutex_exit(&tp->hln_tlock); + return (error); +} + +static int +hyprlofs_access(vnode_t *vp, int mode, int flags, cred_t *cr, + caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + int error; + + if (mode & VWRITE) { + if (vp->v_type == VREG && vn_is_readonly(vp)) + return (EROFS); + } + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_ACCESS(REALVP(vp), mode, flags, cr, ct)); + + mutex_enter(&tp->hln_tlock); + error = hyprlofs_taccess(tp, mode, cr); + mutex_exit(&tp->hln_tlock); + return (error); +} + +/* ARGSUSED3 */ +static int +hyprlofs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(dvp); + hlnode_t *ntp = NULL; + int error; + + if (VTOHLN(dvp)->hln_looped == 1) + return (VOP_LOOKUP(REALVP(dvp), nm, vpp, pnp, flags, rdir, + cr, ct, direntflags, realpnp)); + + if (flags & LOOKUP_XATTR) + return (EINVAL); + + /* Null component name is a synonym for directory being searched. */ + if (*nm == '\0') { + VN_HOLD(dvp); + *vpp = dvp; + return (0); + } + ASSERT(tp); + + if ((error = hyprlofs_dirlookup(tp, nm, &ntp, cr)) == 0) { + ASSERT(ntp); + *vpp = HLNTOV(ntp); + } + return (error); +} + +/* + * Create the loopback from the hyprlofs vnode to the real vnode. + */ +static int +hyprlofs_loopback(vnode_t *dvp, vnode_t *rvp, char *nm, vattr_t *vap, + int mode, cred_t *cr, caller_context_t *ct) +{ + hlnode_t *parent; + hlfsmount_t *tm; + int error; + hlnode_t *oldtp; + vnode_t *vp; + + parent = (hlnode_t *)VTOHLN(dvp); + tm = (hlfsmount_t *)VTOHLM(dvp); + error = 0; + oldtp = NULL; + + if (vap->va_type == VREG && (vap->va_mode & VSVTX)) { + /* we don't support the sticky bit */ + vap->va_mode &= ~VSVTX; + } else if (vap->va_type == VNON) { + return (EINVAL); + } + + /* Null component name is a synonym for directory being searched. 
*/ + if (*nm == '\0') { + VN_HOLD(dvp); + oldtp = parent; + } else { + error = hyprlofs_dirlookup(parent, nm, &oldtp, cr); + } + + if (error == 0) { /* name found */ + ASSERT(oldtp); + + rw_enter(&oldtp->hln_rwlock, RW_WRITER); + + /* + * if create/read-only an existing directory, allow it + */ + if ((oldtp->hln_type == VDIR) && (mode & VWRITE)) + error = EISDIR; + else { + error = hyprlofs_taccess(oldtp, mode, cr); + } + + if (error) { + rw_exit(&oldtp->hln_rwlock); + hlnode_rele(oldtp); + return (error); + } + + vp = HLNTOV(oldtp); + rw_exit(&oldtp->hln_rwlock); + + if (vp->v_type == VREG) { + hlnode_rele(oldtp); + return (EEXIST); + } + + vnevent_create(vp, ct); + return (0); + } + + if (error != ENOENT) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + error = hyprlofs_direnter(tm, parent, nm, DE_CREATE, rvp, vap, NULL, + cr); + rw_exit(&parent->hln_rwlock); + + return (error); +} + +/* + * Create an in-memory directory based on the add-entry ioctl name. + * If the dir exists, return EEXIST but still also return node in vpp. + */ +static int +hyprlofs_mkdir(vnode_t *dvp, char *nm, vattr_t *va, vnode_t **vpp, cred_t *cr) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + hlnode_t *self = NULL; + hlfsmount_t *tm = (hlfsmount_t *)VTOHLM(dvp); + int error; + + /* + * Might be dangling directory. Catch it here, because a ENOENT return + * from hyprlofs_dirlookup() is a valid return. + */ + if (parent->hln_nlink == 0) + return (ENOENT); + + error = hyprlofs_dirlookup(parent, nm, &self, cr); + if (error == 0) { + ASSERT(self); + hlnode_rele(self); + /* We can't loop in under a looped in directory */ + if (self->hln_looped) + return (EACCES); + *vpp = HLNTOV(self); + return (EEXIST); + } + if (error != ENOENT) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + error = hyprlofs_direnter(tm, parent, nm, DE_MKDIR, (vnode_t *)NULL, + va, &self, cr); + rw_exit(&parent->hln_rwlock); + + if (error == 0 || error == EEXIST) { + hlnode_rele(self); + *vpp = HLNTOV(self); + } + + return (error); +} + +/* + * Loop in a file or directory into the namespace. + */ +static int +hyprlofs_add_entry(vnode_t *vp, char *fspath, char *fsname, + cred_t *cr, caller_context_t *ct) +{ + int error; + char *p, *pnm; + vnode_t *realvp, *dvp; + vattr_t va; + + /* Get vnode for the real file/dir. */ + if (error = lookupname(fspath, UIO_SYSSPACE, FOLLOW, NULLVPP, &realvp)) + return (error); + + /* no devices allowed */ + if (IS_DEVVP(realvp)) + return (ENODEV); + + /* + * realvp may be an AUTOFS node, in which case we perform a VOP_ACCESS + * to trigger the mount of the intended filesystem. This causes a + * loopback mount of the intended filesystem instead of the AUTOFS + * filesystem. + */ + if ((error = VOP_ACCESS(realvp, 0, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + /* + * We're interested in the top most filesystem. This is specially + * important when fspath is a trigger AUTOFS node, since we're really + * interested in mounting the filesystem AUTOFS mounted as result of + * the VOP_ACCESS() call not the AUTOFS node itself. + */ + if (vn_mountedvfs(realvp) != NULL) { + if (error = traverse(&realvp)) { + VN_RELE(realvp); + return (error); + } + } + + va.va_type = VNON; + /* + * If the target name is a path, make sure we have all of the + * intermediate directories, creating them if necessary. 
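+ *
+ * For example, with a target name of "a/b/c" the loop below creates the
+ * in-memory directories "a" and "a/b" (if they do not already exist) and
+ * leaves dvp pointing at "a/b", so that "c" is the simple name which is
+ * finally looped back to realvp.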
+ */ + dvp = vp; + pnm = p = fsname; + + /* path cannot be absolute */ + if (*p == '/') + return (EINVAL); + + for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) { + if (va.va_type == VNON) + /* use the top-level dir as the template va for mkdir */ + if ((error = VOP_GETATTR(vp, &va, 0, cr, NULL)) != 0) + return (error); + + *p = '\0'; + + /* Path component cannot be empty or relative */ + if (pnm[0] == '\0' || (pnm[0] == '.' && pnm[1] == '.')) + return (EINVAL); + + if ((error = hyprlofs_mkdir(dvp, pnm, &va, &dvp, cr)) != 0 && + error != EEXIST) + return (error); + + *p = '/'; + pnm = p + 1; + } + + /* The file name is required */ + if (pnm[0] == '\0') + return (EINVAL); + + /* Now use the real file's va as the template va */ + if ((error = VOP_GETATTR(realvp, &va, 0, cr, NULL)) != 0) + return (error); + + /* Make the vnode */ + return (hyprlofs_loopback(dvp, realvp, pnm, &va, va.va_mode, cr, ct)); +} + +/* + * Remove a looped in file from the namespace. + */ +static int +hyprlofs_rm_entry(vnode_t *dvp, char *fsname, cred_t *cr, caller_context_t *ct, + int flags) +{ + int error; + char *p, *pnm; + hlnode_t *parent; + hlnode_t *fndtp; + + pnm = p = fsname; + + /* path cannot be absolute */ + if (*p == '/') + return (EINVAL); + + /* + * If the target name is a path, get the containing dir and simple + * file name. + */ + parent = (hlnode_t *)VTOHLN(dvp); + for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) { + *p = '\0'; + + /* Path component cannot be empty or relative */ + if (pnm[0] == '\0' || (pnm[0] == '.' && pnm[1] == '.')) + return (EINVAL); + + if ((error = hyprlofs_dirlookup(parent, pnm, &fndtp, cr)) != 0) + return (error); + + dvp = HLNTOV(fndtp); + parent = fndtp; + pnm = p + 1; + } + + /* The file name is required */ + if (pnm[0] == '\0') + return (EINVAL); + + /* Remove the entry from the parent dir */ + return (hyprlofs_remove(dvp, pnm, cr, ct, flags)); +} + +/* + * Remove all looped in files from the namespace. + */ +static int +hyprlofs_rm_all(vnode_t *dvp, cred_t *cr, caller_context_t *ct, + int flags) +{ + int error = 0; + hlnode_t *hp = (hlnode_t *)VTOHLN(dvp); + hldirent_t *hdp; + + hlnode_hold(hp); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. + */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + goto done; + } + + hdp = hp->hln_dir; + while (hdp) { + hlnode_t *fndhp; + + if (strcmp(hdp->hld_name, ".") == 0 || + strcmp(hdp->hld_name, "..") == 0) { + hdp = hdp->hld_next; + continue; + } + + /* This holds the fndhp vnode */ + error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr); + if (error != 0) + goto done; + hlnode_rele(fndhp); + + if (fndhp->hln_looped == 0) { + /* recursively remove contents of this subdir */ + if (fndhp->hln_type == VDIR) { + vnode_t *tvp = HLNTOV(fndhp); + + error = hyprlofs_rm_all(tvp, cr, ct, flags); + if (error != 0) + goto done; + } + } + + /* remove the entry */ + error = hyprlofs_remove(dvp, hdp->hld_name, cr, ct, flags); + if (error != 0) + goto done; + + hdp = hp->hln_dir; + } + +done: + hlnode_rele(hp); + return (error); +} + +/* + * Get a list of all looped in files in the namespace. 
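+ *
+ * Names are reported relative to the mount root, built up from the prefix
+ * passed down through the recursion; directories which exist only in-memory
+ * (hln_looped == 0) are descended into rather than reported. Counting
+ * continues past n_max so that the total entry count can still be returned
+ * (along with E2BIG) when the caller's buffer is too small.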
+ */ +static int +hyprlofs_get_all_entries(vnode_t *dvp, hyprlofs_curr_entry_t *hcp, + char *prefix, int *pcnt, int n_max, + cred_t *cr, caller_context_t *ct, int flags) +{ + int error = 0; + int too_big = 0; + int cnt; + int len; + hlnode_t *hp = (hlnode_t *)VTOHLN(dvp); + hldirent_t *hdp; + char *path; + + cnt = *pcnt; + path = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + hlnode_hold(hp); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. + */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + goto done; + } + + hdp = hp->hln_dir; + while (hdp) { + hlnode_t *fndhp; + vnode_t *tvp; + + if (strcmp(hdp->hld_name, ".") == 0 || + strcmp(hdp->hld_name, "..") == 0) { + hdp = hdp->hld_next; + continue; + } + + /* This holds the fndhp vnode */ + error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr); + if (error != 0) + goto done; + hlnode_rele(fndhp); + + if (fndhp->hln_looped == 0) { + /* recursively get contents of this subdir */ + VERIFY(fndhp->hln_type == VDIR); + tvp = HLNTOV(fndhp); + + if (*prefix == '\0') + (void) strlcpy(path, hdp->hld_name, MAXPATHLEN); + else + (void) snprintf(path, MAXPATHLEN, "%s/%s", + prefix, hdp->hld_name); + + error = hyprlofs_get_all_entries(tvp, hcp, path, + &cnt, n_max, cr, ct, flags); + + if (error == E2BIG) { + too_big = 1; + error = 0; + } + if (error != 0) + goto done; + } else { + if (cnt < n_max) { + char *p; + + if (*prefix == '\0') + (void) strlcpy(path, hdp->hld_name, + MAXPATHLEN); + else + (void) snprintf(path, MAXPATHLEN, + "%s/%s", prefix, hdp->hld_name); + + len = strlen(path); + ASSERT(len <= MAXPATHLEN); + if (copyout(path, (void *)(hcp[cnt].hce_name), + len)) { + error = EFAULT; + goto done; + } + + tvp = REALVP(HLNTOV(fndhp)); + if (tvp->v_path == NULL) { + p = "<unknown>"; + } else { + p = tvp->v_path; + } + len = strlen(p); + ASSERT(len <= MAXPATHLEN); + if (copyout(p, (void *)(hcp[cnt].hce_path), + len)) { + error = EFAULT; + goto done; + } + } + + cnt++; + if (cnt > n_max) + too_big = 1; + } + + hdp = hdp->hld_next; + } + +done: + hlnode_rele(hp); + kmem_free(path, MAXPATHLEN); + + *pcnt = cnt; + if (error == 0 && too_big == 1) + error = E2BIG; + + return (error); +} + +/* + * Return a list of all looped in files in the namespace. 
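hyprlofs_get_all() has to accept its ioctl argument from both 32-bit and 64-bit callers, so it copies in a model-specific structure and widens the 32-bit pointer by hand before walking the tree. A kernel-style sketch of that pattern, using hypothetical structure names (get_udatamodel() and copyin() are the same interfaces used below):

        #include <sys/types.h>
        #include <sys/errno.h>
        #include <sys/systm.h>
        #include <sys/model.h>

        typedef struct foo_args {
                uint_t  fa_cnt;
                void    *fa_entries;            /* native caller's pointer */
        } foo_args_t;

        typedef struct foo_args32 {
                uint32_t fa_cnt;
                uint32_t fa_entries;            /* ILP32 caller's pointer */
        } foo_args32_t;

        static int
        foo_args_copyin(intptr_t data, uint_t *cntp, void **entriesp)
        {
                if (get_udatamodel() == DATAMODEL_NATIVE) {
                        foo_args_t a;

                        if (copyin((void *)data, &a, sizeof (a)) != 0)
                                return (EFAULT);
                        *cntp = a.fa_cnt;
                        *entriesp = a.fa_entries;
                } else {
                        foo_args32_t a32;

                        if (copyin((void *)data, &a32, sizeof (a32)) != 0)
                                return (EFAULT);
                        *cntp = a32.fa_cnt;
                        *entriesp = (void *)(uintptr_t)a32.fa_entries;
                }
                return (0);
        }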
+ */ +static int +hyprlofs_get_all(vnode_t *dvp, intptr_t data, cred_t *cr, caller_context_t *ct, + int flags) +{ + int limit, cnt, error; + model_t model; + hyprlofs_curr_entry_t *e; + + model = get_udatamodel(); + + if (model == DATAMODEL_NATIVE) { + hyprlofs_curr_entries_t ebuf; + + if (copyin((void *)data, &ebuf, sizeof (ebuf))) + return (EFAULT); + limit = ebuf.hce_cnt; + e = ebuf.hce_entries; + if (limit > MAX_IOCTL_PARAMS) + return (EINVAL); + + } else { + hyprlofs_curr_entries32_t ebuf32; + + if (copyin((void *)data, &ebuf32, sizeof (ebuf32))) + return (EFAULT); + + limit = ebuf32.hce_cnt; + e = (hyprlofs_curr_entry_t *)(unsigned long) + (ebuf32.hce_entries); + if (limit > MAX_IOCTL_PARAMS) + return (EINVAL); + } + + cnt = 0; + error = hyprlofs_get_all_entries(dvp, e, "", &cnt, limit, cr, ct, + flags); + + if (error == 0 || error == E2BIG) { + if (model == DATAMODEL_NATIVE) { + hyprlofs_curr_entries_t ebuf; + + ebuf.hce_cnt = cnt; + if (copyout(&ebuf, (void *)data, sizeof (ebuf))) + return (EFAULT); + + } else { + hyprlofs_curr_entries32_t ebuf32; + + ebuf32.hce_cnt = cnt; + if (copyout(&ebuf32, (void *)data, sizeof (ebuf32))) + return (EFAULT); + } + } + + return (error); +} + +/* ARGSUSED3 */ +static int +hyprlofs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, + int flags) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + int error; + hlnode_t *hp = NULL; + + /* This holds the hp vnode */ + error = hyprlofs_dirlookup(parent, nm, &hp, cr); + if (error) + return (error); + + ASSERT(hp); + rw_enter(&parent->hln_rwlock, RW_WRITER); + rw_enter(&hp->hln_rwlock, RW_WRITER); + + error = hyprlofs_dirdelete(parent, hp, nm, DR_REMOVE, cr); + + rw_exit(&hp->hln_rwlock); + rw_exit(&parent->hln_rwlock); + vnevent_remove(HLNTOV(hp), dvp, nm, ct); + hlnode_rele(hp); + + return (error); +} + +/* ARGSUSED4 */ +static int +hyprlofs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr, + caller_context_t *ct, int flags) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + hlnode_t *self = NULL; + vnode_t *vp; + int error = 0; + + /* Return error if removing . or .. */ + if (strcmp(nm, ".") == 0) + return (EINVAL); + if (strcmp(nm, "..") == 0) + return (EEXIST); /* Should be ENOTEMPTY */ + error = hyprlofs_dirlookup(parent, nm, &self, cr); + if (error) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + rw_enter(&self->hln_rwlock, RW_WRITER); + + vp = HLNTOV(self); + if (vp == dvp || vp == cdir) { + error = EINVAL; + goto done1; + } + if (self->hln_type != VDIR) { + error = ENOTDIR; + goto done1; + } + + /* + * When a dir is looped in, we only remove the in-memory dir, not the + * backing dir. + */ + if (self->hln_looped == 0) { + mutex_enter(&self->hln_tlock); + if (self->hln_nlink > 2) { + mutex_exit(&self->hln_tlock); + error = EEXIST; + goto done1; + } + mutex_exit(&self->hln_tlock); + + if (vn_vfswlock(vp)) { + error = EBUSY; + goto done1; + } + if (vn_mountedvfs(vp) != NULL) { + error = EBUSY; + goto done; + } + + /* + * Check for an empty directory, i.e. only includes entries for + * "." and ".." 
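Every hyprlofs directory carries entries for "." and ".." once it has been set up, so a hln_dirents count of exactly two is the empty case; the check just below treats anything larger as not-empty (reported as EEXIST). An illustrative predicate, not part of the patch:

        static boolean_t
        hldir_is_empty(const hlnode_t *hp)
        {
                return (hp->hln_dirents == 2);
        }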
+ */ + if (self->hln_dirents > 2) { + error = EEXIST; /* SIGH should be ENOTEMPTY */ + /* + * Update atime because checking hln_dirents is + * equivalent to reading the directory + */ + gethrestime(&self->hln_atime); + goto done; + } + + error = hyprlofs_dirdelete(parent, self, nm, DR_RMDIR, cr); + } else { + error = hyprlofs_dirdelete(parent, self, nm, DR_REMOVE, cr); + } + +done: + if (self->hln_looped == 0) + vn_vfsunlock(vp); +done1: + rw_exit(&self->hln_rwlock); + rw_exit(&parent->hln_rwlock); + vnevent_rmdir(HLNTOV(self), dvp, nm, ct); + hlnode_rele(self); + + return (error); +} + +static int +hyprlofs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hldirent_t *hdp; + int error = 0; + size_t namelen; + struct dirent64 *dp; + ulong_t offset; + ulong_t total_bytes_wanted; + long outcount = 0; + long bufsize; + int reclen; + caddr_t outbuf; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_READDIR(REALVP(vp), uiop, cr, eofp, ct, flags)); + + if (uiop->uio_loffset >= MAXOFF_T) { + if (eofp) + *eofp = 1; + return (0); + } + /* assuming syscall has already called hln_rwlock */ + ASSERT(RW_READ_HELD(&hp->hln_rwlock)); + + if (uiop->uio_iovcnt != 1) + return (EINVAL); + + if (vp->v_type != VDIR) + return (ENOTDIR); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. + */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + return (0); + } + + /* Get space for multiple dir entries */ + total_bytes_wanted = uiop->uio_iov->iov_len; + bufsize = total_bytes_wanted + sizeof (struct dirent64); + outbuf = kmem_alloc(bufsize, KM_SLEEP); + + dp = (struct dirent64 *)((uintptr_t)outbuf); + + offset = 0; + hdp = hp->hln_dir; + while (hdp) { + namelen = strlen(hdp->hld_name); /* no +1 needed */ + offset = hdp->hld_offset; + if (offset >= uiop->uio_offset) { + reclen = (int)DIRENT64_RECLEN(namelen); + if (outcount + reclen > total_bytes_wanted) { + if (!outcount) + /* Buffer too small for any entries. */ + error = EINVAL; + break; + } + ASSERT(hdp->hld_hlnode != NULL); + + /* zero out uninitialized bytes */ + (void) strncpy(dp->d_name, hdp->hld_name, + DIRENT64_NAMELEN(reclen)); + dp->d_reclen = (ushort_t)reclen; + dp->d_ino = (ino64_t)hdp->hld_hlnode->hln_nodeid; + dp->d_off = (offset_t)hdp->hld_offset + 1; + dp = (struct dirent64 *) + ((uintptr_t)dp + dp->d_reclen); + outcount += reclen; + ASSERT(outcount <= bufsize); + } + hdp = hdp->hld_next; + } + + if (!error) + error = uiomove(outbuf, outcount, UIO_READ, uiop); + + if (!error) { + /* + * If we reached the end of the list our offset should now be + * just past the end. 
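Each matching name in the loop above is packed into the staging buffer as a struct dirent64: the record length comes from DIRENT64_RECLEN() on the name length, and d_off is set one past the entry's offset so a subsequent read resumes after it. A compact user-level sketch of that packing step (the structure and the 8-byte rounding here are illustrative, not the kernel's exact layout):

        #include <stddef.h>
        #include <stdint.h>
        #include <string.h>

        typedef struct mini_dirent {
                uint64_t d_ino;
                int64_t  d_off;
                uint16_t d_reclen;
                char     d_name[1];
        } mini_dirent_t;

        /* Pack one entry into buf; return bytes used, or 0 if it will not fit. */
        static size_t
        pack_entry(char *buf, size_t space, uint64_t ino, int64_t off,
            const char *name)
        {
                size_t namelen = strlen(name);
                size_t reclen = (offsetof(mini_dirent_t, d_name) + namelen + 1 + 7) &
                    ~(size_t)7;
                mini_dirent_t *dp = (mini_dirent_t *)(void *)buf;

                if (reclen > space)
                        return (0);
                dp->d_ino = ino;
                dp->d_off = off + 1;            /* resume point for the next read */
                dp->d_reclen = (uint16_t)reclen;
                (void) memcpy(dp->d_name, name, namelen + 1);
                return (reclen);
        }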
+ */ + if (!hdp) { + offset += 1; + if (eofp) + *eofp = 1; + } else if (eofp) + *eofp = 0; + uiop->uio_offset = offset; + } + gethrestime(&hp->hln_atime); + kmem_free(outbuf, bufsize); + return (error); +} + +static int +hyprlofs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_FSYNC(REALVP(vp), syncflag, cr, ct)); + return (0); +} + +/* ARGSUSED */ +static void +hyprlofs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vp->v_vfsp); + + rw_enter(&hp->hln_rwlock, RW_WRITER); + + mutex_enter(&hp->hln_tlock); + mutex_enter(&vp->v_lock); + ASSERT(vp->v_count >= 1); + + /* + * If we don't have the last hold or the link count is non-zero, + * there's nothing to do except drop our hold. + */ + if (vp->v_count > 1 || hp->hln_nlink != 0) { + vp->v_count--; + mutex_exit(&vp->v_lock); + mutex_exit(&hp->hln_tlock); + rw_exit(&hp->hln_rwlock); + return; + } + + mutex_exit(&vp->v_lock); + mutex_exit(&hp->hln_tlock); + + /* Here's our chance to send invalid event while we're between locks */ + vn_invalid(HLNTOV(hp)); + + mutex_enter(&hm->hlm_contents); + if (hp->hln_forw == NULL) + hm->hlm_rootnode->hln_back = hp->hln_back; + else + hp->hln_forw->hln_back = hp->hln_back; + hp->hln_back->hln_forw = hp->hln_forw; + mutex_exit(&hm->hlm_contents); + rw_exit(&hp->hln_rwlock); + rw_destroy(&hp->hln_rwlock); + mutex_destroy(&hp->hln_tlock); + vn_free(HLNTOV(hp)); + hyprlofs_memfree(hp, sizeof (hlnode_t)); +} + +static int +hyprlofs_fid(vnode_t *vp, struct fid *fidp, caller_context_t *ct) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hlfid_t *hfid; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_FID(REALVP(vp), fidp, ct)); + + if (fidp->fid_len < (sizeof (hlfid_t) - sizeof (ushort_t))) { + fidp->fid_len = sizeof (hlfid_t) - sizeof (ushort_t); + return (ENOSPC); + } + + hfid = (hlfid_t *)fidp; + bzero(hfid, sizeof (hlfid_t)); + hfid->hlfid_len = (int)sizeof (hlfid_t) - sizeof (ushort_t); + + hfid->hlfid_ino = hp->hln_nodeid; + hfid->hlfid_gen = hp->hln_gen; + + return (0); +} + +static int +hyprlofs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, + page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw, + cred_t *cr, caller_context_t *ct) +{ + ASSERT(VTOHLN(vp)->hln_looped == 1); + return (VOP_GETPAGE(REALVP(vp), off, len, protp, pl, plsz, seg, addr, + rw, cr, ct)); +} + +int +hyprlofs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, + cred_t *cr, caller_context_t *ct) +{ + ASSERT(VTOHLN(vp)->hln_looped == 1); + return (VOP_PUTPAGE(REALVP(vp), off, len, flags, cr, ct)); +} + +static int +hyprlofs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + ASSERT(VTOHLN(vp)->hln_looped == 1); + return (VOP_MAP(REALVP(vp), off, as, addrp, len, prot, maxprot, flags, + cr, ct)); +} + +static int +hyprlofs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + ASSERT(VTOHLN(vp)->hln_looped == 1); + return (VOP_ADDMAP(REALVP(vp), off, as, addr, len, prot, maxprot, + flags, cr, ct)); +} + +static int +hyprlofs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + ASSERT(VTOHLN(vp)->hln_looped == 1); + 
return (VOP_DELMAP(REALVP(vp), off, as, addr, len, prot, maxprot, + flags, cr, ct)); +} + +static int +hyprlofs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, + offset_t offset, cred_t *cr, caller_context_t *ct) +{ + ASSERT(VTOHLN(vp)->hln_looped == 1); + return (VOP_SPACE(REALVP(vp), cmd, bfp, flag, offset, cr, ct)); +} + +static int +hyprlofs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, + caller_context_t *ct) +{ + ASSERT(VTOHLN(vp)->hln_looped == 1); + return (VOP_SEEK(REALVP(vp), ooff, noffp, ct)); +} + +static int +hyprlofs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + hlnode_t *hp = VTOHLN(vp); + + if (hp->hln_looped == 1) + return (VOP_RWLOCK(REALVP(vp), write_lock, ct)); + + if (write_lock) { + rw_enter(&hp->hln_rwlock, RW_WRITER); + } else { + rw_enter(&hp->hln_rwlock, RW_READER); + } + return (write_lock); +} + +static void +hyprlofs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + hlnode_t *hp = VTOHLN(vp); + + if (hp->hln_looped == 1) { + VOP_RWUNLOCK(REALVP(vp), write_lock, ct); + return; + } + + rw_exit(&hp->hln_rwlock); +} + +static int +hyprlofs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) +{ + int error; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_PATHCONF(REALVP(vp), cmd, valp, cr, ct)); + + switch (cmd) { + case _PC_XATTR_ENABLED: + case _PC_XATTR_EXISTS: + case _PC_SATTR_ENABLED: + case _PC_SATTR_EXISTS: + error = EINVAL; + break; + case _PC_TIMESTAMP_RESOLUTION: + /* nanosecond timestamp resolution */ + *valp = 1L; + error = 0; + break; + default: + error = fs_pathconf(vp, cmd, valp, cr, ct); + } + return (error); +} + + +struct vnodeops *hyprlofs_vnodeops; + +const fs_operation_def_t hyprlofs_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = hyprlofs_open }, + VOPNAME_CLOSE, { .vop_close = hyprlofs_close }, + VOPNAME_READ, { .vop_read = hyprlofs_read }, + VOPNAME_WRITE, { .vop_write = hyprlofs_write }, + VOPNAME_IOCTL, { .vop_ioctl = hyprlofs_ioctl }, + VOPNAME_GETATTR, { .vop_getattr = hyprlofs_getattr }, + VOPNAME_SETATTR, { .vop_setattr = hyprlofs_setattr }, + VOPNAME_ACCESS, { .vop_access = hyprlofs_access }, + VOPNAME_LOOKUP, { .vop_lookup = hyprlofs_lookup }, + VOPNAME_CREATE, { .error = fs_error }, + VOPNAME_REMOVE, { .vop_remove = hyprlofs_remove }, + VOPNAME_LINK, { .error = fs_error }, + VOPNAME_RENAME, { .error = fs_error }, + VOPNAME_MKDIR, { .error = fs_error }, + VOPNAME_RMDIR, { .vop_rmdir = hyprlofs_rmdir }, + VOPNAME_READDIR, { .vop_readdir = hyprlofs_readdir }, + VOPNAME_SYMLINK, { .error = fs_error }, + VOPNAME_READLINK, { .error = fs_error }, + VOPNAME_FSYNC, { .vop_fsync = hyprlofs_fsync }, + VOPNAME_INACTIVE, { .vop_inactive = hyprlofs_inactive }, + VOPNAME_FID, { .vop_fid = hyprlofs_fid }, + VOPNAME_RWLOCK, { .vop_rwlock = hyprlofs_rwlock }, + VOPNAME_RWUNLOCK, { .vop_rwunlock = hyprlofs_rwunlock }, + VOPNAME_SEEK, { .vop_seek = hyprlofs_seek }, + VOPNAME_SPACE, { .vop_space = hyprlofs_space }, + VOPNAME_GETPAGE, { .vop_getpage = hyprlofs_getpage }, + VOPNAME_PUTPAGE, { .vop_putpage = hyprlofs_putpage }, + VOPNAME_MAP, { .vop_map = hyprlofs_map }, + VOPNAME_ADDMAP, { .vop_addmap = hyprlofs_addmap }, + VOPNAME_DELMAP, { .vop_delmap = hyprlofs_delmap }, + VOPNAME_PATHCONF, { .vop_pathconf = hyprlofs_pathconf }, + VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, + NULL, NULL +}; diff --git a/usr/src/uts/common/fs/lxproc/lxpr_subr.c b/usr/src/uts/common/fs/lxproc/lxpr_subr.c new file mode 100644 index 0000000000..c6cfdd9d6a --- /dev/null +++ 
b/usr/src/uts/common/fs/lxproc/lxpr_subr.c @@ -0,0 +1,515 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +#include <sys/varargs.h> +#include <sys/cpuvar.h> +#include <sys/mman.h> +#include <sys/vmsystm.h> +#include <sys/prsystm.h> + +#include "lxproc.h" + +#define LXPRCACHE_NAME "lxpr_cache" + +static int lxpr_node_constructor(void *, void *, int); +static void lxpr_node_destructor(void *, void *); + +static kmem_cache_t *lxpr_node_cache; + +struct lxpr_uiobuf { + uio_t *uiop; + char *buffer; + uint32_t buffsize; + char *pos; + size_t beg; + int error; +}; + +#define BUFSIZE 4000 + +struct lxpr_uiobuf * +lxpr_uiobuf_new(uio_t *uiop) +{ + /* Allocate memory for both lxpr_uiobuf and output buffer */ + struct lxpr_uiobuf *uiobuf = + kmem_alloc(sizeof (struct lxpr_uiobuf) + BUFSIZE, KM_SLEEP); + + uiobuf->uiop = uiop; + uiobuf->buffer = (char *)&uiobuf[1]; + uiobuf->buffsize = BUFSIZE; + uiobuf->pos = uiobuf->buffer; + uiobuf->beg = 0; + uiobuf->error = 0; + + return (uiobuf); +} + +void +lxpr_uiobuf_free(struct lxpr_uiobuf *uiobuf) +{ + ASSERT(uiobuf != NULL); + ASSERT(uiobuf->pos == uiobuf->buffer); + + kmem_free(uiobuf, sizeof (struct lxpr_uiobuf) + uiobuf->buffsize); +} + +void +lxpr_uiobuf_seek(struct lxpr_uiobuf *uiobuf, offset_t offset) +{ + uiobuf->uiop->uio_offset = (off_t)offset; +} + +void +lxpr_uiobuf_seterr(struct lxpr_uiobuf *uiobuf, int err) +{ + ASSERT(uiobuf->error == 0); + + uiobuf->error = err; +} + +int +lxpr_uiobuf_flush(struct lxpr_uiobuf *uiobuf) +{ + off_t off = uiobuf->uiop->uio_offset; + caddr_t uaddr = uiobuf->buffer; + size_t beg = uiobuf->beg; + size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr; + + if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + ASSERT(off >= beg); + + if (beg + size > off && off >= 0) + uiobuf->error = + uiomove(uaddr + (off - beg), size - (off - beg), + UIO_READ, uiobuf->uiop); + + uiobuf->beg += size; + } + + uiobuf->pos = uaddr; + + return (uiobuf->error); +} + +void +lxpr_uiobuf_write(struct lxpr_uiobuf *uiobuf, const char *buf, size_t size) +{ + /* While we can still carry on */ + while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + uintptr_t remain = (uintptr_t)uiobuf->buffsize - + (uintptr_t)uiobuf->pos - (uintptr_t)uiobuf->buffer; + + /* Enough space in buffer? 
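The staging buffer here decouples formatted output from uiomove(): remain is the free space left in the buffer (its size less the bytes already written at pos), and a full buffer is flushed to the caller's uio before writing continues. lxpr_read() in lxpr_vnops.c wraps every per-file handler in roughly this sequence (handlers do the lxpr_uiobuf_printf() calls; pid and comm here are placeholders):

        lxpr_uiobuf_t *ub = lxpr_uiobuf_new(uiop);
        lxpr_uiobuf_printf(ub, "%d (%s)\n", (int)pid, comm);
        error = lxpr_uiobuf_flush(ub);
        lxpr_uiobuf_free(ub);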
*/ + if (remain >= size) { + bcopy(buf, uiobuf->pos, size); + uiobuf->pos += size; + return; + } + + /* Not enough space, so copy all we can and try again */ + bcopy(buf, uiobuf->pos, remain); + uiobuf->pos += remain; + (void) lxpr_uiobuf_flush(uiobuf); + buf += remain; + size -= remain; + } +} + +#define TYPBUFFSIZE 256 + +void +lxpr_uiobuf_printf(struct lxpr_uiobuf *uiobuf, const char *fmt, ...) +{ + va_list args; + char buff[TYPBUFFSIZE]; + int len; + char *buffer; + + /* Can we still do any output */ + if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0) + return; + + va_start(args, fmt); + + /* Try using stack allocated buffer */ + len = vsnprintf(buff, TYPBUFFSIZE, fmt, args); + if (len < TYPBUFFSIZE) { + va_end(args); + lxpr_uiobuf_write(uiobuf, buff, len); + return; + } + + /* Not enough space in pre-allocated buffer */ + buffer = kmem_alloc(len + 1, KM_SLEEP); + + /* + * We know we allocated the correct amount of space + * so no check on the return value + */ + (void) vsnprintf(buffer, len+1, fmt, args); + lxpr_uiobuf_write(uiobuf, buffer, len); + va_end(args); + kmem_free(buffer, len+1); +} + +/* + * lxpr_lock(): + * + * Lookup process from pid and return with p_plock and P_PR_LOCK held. + */ +proc_t * +lxpr_lock(pid_t pid) +{ + proc_t *p; + kmutex_t *mp; + + ASSERT(!MUTEX_HELD(&pidlock)); + + for (;;) { + mutex_enter(&pidlock); + + /* + * If the pid is 1, we really want the zone's init process + */ + p = prfind((pid == 1) ? + curproc->p_zone->zone_proc_initpid : pid); + + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (NULL); + } + + /* + * p_lock is persistent, but p itself is not -- it could + * vanish during cv_wait(). Load p->p_lock now so we can + * drop it after cv_wait() without referencing p. + */ + mp = &p->p_lock; + mutex_enter(mp); + + mutex_exit(&pidlock); + + if (!(p->p_proc_flag & P_PR_LOCK)) + break; + + cv_wait(&pr_pid_cv[p->p_slot], mp); + mutex_exit(mp); + } + + p->p_proc_flag |= P_PR_LOCK; + THREAD_KPRI_REQUEST(); + return (p); +} + +/* + * lxpr_unlock() + * + * Unlock locked process + */ +void +lxpr_unlock(proc_t *p) +{ + ASSERT(p->p_proc_flag & P_PR_LOCK); + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(!MUTEX_HELD(&pidlock)); + + cv_signal(&pr_pid_cv[p->p_slot]); + p->p_proc_flag &= ~P_PR_LOCK; + mutex_exit(&p->p_lock); + THREAD_KPRI_RELEASE(); +} + +void +lxpr_initnodecache() +{ + lxpr_node_cache = kmem_cache_create(LXPRCACHE_NAME, + sizeof (lxpr_node_t), 0, + lxpr_node_constructor, lxpr_node_destructor, NULL, NULL, NULL, 0); +} + +void +lxpr_fininodecache() +{ + kmem_cache_destroy(lxpr_node_cache); +} + +/* ARGSUSED */ +static int +lxpr_node_constructor(void *buf, void *un, int kmflags) +{ + lxpr_node_t *lxpnp = buf; + vnode_t *vp; + + vp = lxpnp->lxpr_vnode = vn_alloc(kmflags); + if (vp == NULL) + return (-1); + + (void) vn_setops(vp, lxpr_vnodeops); + vp->v_data = lxpnp; + + return (0); +} + +/* ARGSUSED */ +static void +lxpr_node_destructor(void *buf, void *un) +{ + lxpr_node_t *lxpnp = buf; + + vn_free(LXPTOV(lxpnp)); +} + +/* + * Calculate an inode number + * + * This takes various bits of info and munges them + * to give the inode number for an lxproc node + */ +ino_t +lxpr_inode(lxpr_nodetype_t type, pid_t pid, int fd) +{ + if (pid == 1) + pid = curproc->p_zone->zone_proc_initpid; + + switch (type) { + case LXPR_PIDDIR: + return (pid + 1); + case LXPR_PROCDIR: + return (maxpid + 2); + case LXPR_PID_FD_FD: + return (maxpid + 2 + + (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + LXPR_NFILES + fd); + default: + return (maxpid + 2 + + 
(pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + type); + } +} + +/* + * Return inode number of parent (directory) + */ +ino_t +lxpr_parentinode(lxpr_node_t *lxpnp) +{ + /* + * If the input node is the root then the parent inode + * is the mounted on inode so just return our inode number + */ + if (lxpnp->lxpr_type != LXPR_PROCDIR) + return (VTOLXP(lxpnp->lxpr_parent)->lxpr_ino); + else + return (lxpnp->lxpr_ino); +} + +/* + * Allocate a new lxproc node + * + * This also allocates the vnode associated with it + */ +lxpr_node_t * +lxpr_getnode(vnode_t *dp, lxpr_nodetype_t type, proc_t *p, int fd) +{ + lxpr_node_t *lxpnp; + vnode_t *vp; + user_t *up; + timestruc_t now; + + /* + * Allocate a new node. It is deallocated in vop_innactive + */ + lxpnp = kmem_cache_alloc(lxpr_node_cache, KM_SLEEP); + + /* + * Set defaults (may be overridden below) + */ + gethrestime(&now); + lxpnp->lxpr_type = type; + lxpnp->lxpr_realvp = NULL; + lxpnp->lxpr_parent = dp; + VN_HOLD(dp); + if (p != NULL) { + lxpnp->lxpr_pid = ((p->p_pid == + curproc->p_zone->zone_proc_initpid) ? 1 : p->p_pid); + + lxpnp->lxpr_time = PTOU(p)->u_start; + lxpnp->lxpr_uid = crgetruid(p->p_cred); + lxpnp->lxpr_gid = crgetrgid(p->p_cred); + lxpnp->lxpr_ino = lxpr_inode(type, p->p_pid, fd); + } else { + /* Pretend files without a proc belong to sched */ + lxpnp->lxpr_pid = 0; + lxpnp->lxpr_time = now; + lxpnp->lxpr_uid = lxpnp->lxpr_gid = 0; + lxpnp->lxpr_ino = lxpr_inode(type, 0, 0); + } + + /* initialize the vnode data */ + vp = lxpnp->lxpr_vnode; + vn_reinit(vp); + vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT; + vp->v_vfsp = dp->v_vfsp; + + /* + * Do node specific stuff + */ + switch (type) { + case LXPR_PROCDIR: + vp->v_flag |= VROOT; + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by everyone */ + break; + + case LXPR_PID_CURDIR: + ASSERT(p != NULL); + + /* + * Zombie check. p_stat is officially protected by pidlock, + * but we can't grab pidlock here because we already hold + * p_lock. Luckily if we look at the process exit code + * we see that p_stat only transisions from SRUN to SZOMB + * while p_lock is held. Aside from this, the only other + * p_stat transition that we need to be aware about is + * SIDL to SRUN, but that's not a problem since lxpr_lock() + * ignores nodes in the SIDL state so we'll never get a node + * that isn't already in the SRUN state. + */ + if (p->p_stat == SZOMB) { + lxpnp->lxpr_realvp = NULL; + } else { + up = PTOU(p); + lxpnp->lxpr_realvp = up->u_cdir; + ASSERT(lxpnp->lxpr_realvp != NULL); + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_ROOTDIR: + ASSERT(p != NULL); + /* Zombie check. see locking comment above */ + if (p->p_stat == SZOMB) { + lxpnp->lxpr_realvp = NULL; + } else { + up = PTOU(p); + lxpnp->lxpr_realvp = + up->u_rdir != NULL ? up->u_rdir : rootdir; + ASSERT(lxpnp->lxpr_realvp != NULL); + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_EXE: + ASSERT(p != NULL); + lxpnp->lxpr_realvp = p->p_exec; + if (lxpnp->lxpr_realvp != NULL) { + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; + break; + + case LXPR_SELF: + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! 
*/ + break; + + case LXPR_PID_FD_FD: + ASSERT(p != NULL); + /* lxpr_realvp is set after we return */ + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0700; /* read-write-exe owner only */ + break; + + case LXPR_PID_FDDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0500; /* read-search by owner only */ + break; + + case LXPR_PIDDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0511; + break; + + case LXPR_NETDIR: + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by all */ + break; + + case LXPR_PID_ENV: + case LXPR_PID_MEM: + ASSERT(p != NULL); + /*FALLTHRU*/ + case LXPR_KCORE: + vp->v_type = VREG; + lxpnp->lxpr_mode = 0400; /* read-only by owner only */ + break; + + default: + vp->v_type = VREG; + lxpnp->lxpr_mode = 0444; /* read-only by all */ + break; + } + + return (lxpnp); +} + + +/* + * Free the storage obtained from lxpr_getnode(). + */ +void +lxpr_freenode(lxpr_node_t *lxpnp) +{ + ASSERT(lxpnp != NULL); + ASSERT(LXPTOV(lxpnp) != NULL); + + /* + * delete any association with realvp + */ + if (lxpnp->lxpr_realvp != NULL) + VN_RELE(lxpnp->lxpr_realvp); + + /* + * delete any association with parent vp + */ + if (lxpnp->lxpr_parent != NULL) + VN_RELE(lxpnp->lxpr_parent); + + /* + * Release the lxprnode. + */ + kmem_cache_free(lxpr_node_cache, lxpnp); +} diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c new file mode 100644 index 0000000000..1bb7bd3823 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c @@ -0,0 +1,367 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/stat.h> +#include <sys/statvfs.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/mode.h> +#include <sys/signal.h> +#include <sys/user.h> +#include <sys/mount.h> +#include <sys/bitmap.h> +#include <sys/kmem.h> +#include <sys/policy.h> +#include <sys/modctl.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> + +#include "lxproc.h" + +/* Module level parameters */ +static int lxprocfstype; +static dev_t lxprocdev; +static kmutex_t lxpr_mount_lock; + +int nproc_highbit; /* highbit(v.v_nproc) */ + +static int lxpr_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *); +static int lxpr_unmount(vfs_t *, int, cred_t *); +static int lxpr_root(vfs_t *, vnode_t **); +static int lxpr_statvfs(vfs_t *, statvfs64_t *); +static int lxpr_init(int, char *); + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "lxproc", + lxpr_init, + VSW_ZMOUNT, + NULL +}; + +/* + * Module linkage information for the kernel. + */ +extern struct mod_ops mod_fsops; + +static struct modlfs modlfs = { + &mod_fsops, "generic linux procfs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlfs, NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int retval; + + /* + * attempt to unload the module + */ + if ((retval = mod_remove(&modlinkage)) != 0) + goto done; + + /* + * destroy lxpr_node cache + */ + lxpr_fininodecache(); + + /* + * clean out the vfsops and vnodeops + */ + (void) vfs_freevfsops_by_type(lxprocfstype); + vn_freevnodeops(lxpr_vnodeops); + + mutex_destroy(&lxpr_mount_lock); +done: + return (retval); +} + +static int +lxpr_init(int fstype, char *name) +{ + static const fs_operation_def_t lxpr_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = lxpr_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = lxpr_unmount }, + VFSNAME_ROOT, { .vfs_root = lxpr_root }, + VFSNAME_STATVFS, { .vfs_statvfs = lxpr_statvfs }, + NULL, NULL + }; + extern const fs_operation_def_t lxpr_vnodeops_template[]; + int error; + major_t dev; + + nproc_highbit = highbit(v.v_proc); + lxprocfstype = fstype; + ASSERT(lxprocfstype != 0); + + mutex_init(&lxpr_mount_lock, NULL, MUTEX_DEFAULT, NULL); + + /* + * Associate VFS ops vector with this fstype. + */ + error = vfs_setfsops(fstype, lxpr_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "lxpr_init: bad vfs ops template"); + return (error); + } + + /* + * Set up vnode ops vector too. + */ + error = vn_make_ops(name, lxpr_vnodeops_template, &lxpr_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "lxpr_init: bad vnode ops template"); + return (error); + } + + /* + * Assign a unique "device" number (used by stat(2)). 
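The pseudo device number created below ends up as vfs_dev on each lxproc mount (see lxpr_mount()) and, via cmpldev() in lxpr_statvfs(), as the f_fsid reported by statvfs(2); getudev() simply hands out an unused major number so that lxproc files do not alias another filesystem's st_dev.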
+ */ + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "lxpr_init: can't get unique device number"); + dev = 0; + } + + /* + * Make the pseudo device + */ + lxprocdev = makedevice(dev, 0); + + /* + * Initialize cache for lxpr_nodes + */ + lxpr_initnodecache(); + + return (0); +} + +static int +lxpr_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt; + zone_t *zone = curproc->p_zone; + ldi_ident_t li; + int err; + + /* + * must be root to mount + */ + if (secpolicy_fs_mount(cr, mvp, vfsp) != 0) + return (EPERM); + + /* + * mount point must be a directory + */ + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (zone == global_zone) { + zone_t *mntzone; + + mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); + zone_rele(mntzone); + if (zone != mntzone) + return (EBUSY); + } + + /* + * Having the resource be anything but "lxproc" doesn't make sense + */ + vfs_setresource(vfsp, "lxproc", 0); + + lxpr_mnt = kmem_alloc(sizeof (*lxpr_mnt), KM_SLEEP); + + if ((err = ldi_ident_from_mod(&modlinkage, &li)) != 0) { + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + return (err); + } + + lxpr_mnt->lxprm_li = li; + + mutex_enter(&lxpr_mount_lock); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + mutex_exit(&lxpr_mount_lock); + kmem_free(lxpr_mnt, sizeof ((*lxpr_mnt))); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* + * allocate the first vnode + */ + zone_hold(lxpr_mnt->lxprm_zone = zone); + + /* Arbitrarily set the parent vnode to the mounted over directory */ + lxpr_mnt->lxprm_node = lxpr_getnode(mvp, LXPR_PROCDIR, NULL, 0); + + /* Correctly set the fs for the root node */ + lxpr_mnt->lxprm_node->lxpr_vnode->v_vfsp = vfsp; + + vfs_make_fsid(&vfsp->vfs_fsid, lxprocdev, lxprocfstype); + vfsp->vfs_bsize = DEV_BSIZE; + vfsp->vfs_fstype = lxprocfstype; + vfsp->vfs_data = (caddr_t)lxpr_mnt; + vfsp->vfs_dev = lxprocdev; + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt = (lxpr_mnt_t *)vfsp->vfs_data; + vnode_t *vp; + int count; + + ASSERT(lxpr_mnt != NULL); + vp = LXPTOV(lxpr_mnt->lxprm_node); + + mutex_enter(&lxpr_mount_lock); + + /* + * must be root to unmount + */ + if (secpolicy_fs_unmount(cr, vfsp) != 0) { + mutex_exit(&lxpr_mount_lock); + return (EPERM); + } + + /* + * forced unmount is not supported by this file system + */ + if (flag & MS_FORCE) { + mutex_exit(&lxpr_mount_lock); + return (ENOTSUP); + } + + /* + * Ensure that no vnodes are in use on this mount point. 
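Every lxpr_node allocated under this mount holds its parent vnode (see lxpr_getnode()), and those references chain back to the root node created in lxpr_mount(); the root's v_count therefore exceeds 1 whenever any other lxproc vnode, or an open handle on the root itself, is still live, which is why a count above 1 is reported as EBUSY below.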
+ */ + mutex_enter(&vp->v_lock); + count = vp->v_count; + mutex_exit(&vp->v_lock); + if (count > 1) { + mutex_exit(&lxpr_mount_lock); + return (EBUSY); + } + + /* + * purge the dnlc cache for vnode entries + * associated with this file system + */ + count = dnlc_purge_vfsp(vfsp, 0); + + /* + * free up the lxprnode + */ + lxpr_freenode(lxpr_mnt->lxprm_node); + zone_rele(lxpr_mnt->lxprm_zone); + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_root(vfs_t *vfsp, vnode_t **vpp) +{ + lxpr_node_t *lxpnp = ((lxpr_mnt_t *)vfsp->vfs_data)->lxprm_node; + vnode_t *vp = LXPTOV(lxpnp); + + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +lxpr_statvfs(vfs_t *vfsp, statvfs64_t *sp) +{ + int n; + dev32_t d32; + extern uint_t nproc; + + n = v.v_proc - nproc; + + bzero((caddr_t)sp, sizeof (*sp)); + sp->f_bsize = DEV_BSIZE; + sp->f_frsize = DEV_BSIZE; + sp->f_blocks = (fsblkcnt64_t)0; + sp->f_bfree = (fsblkcnt64_t)0; + sp->f_bavail = (fsblkcnt64_t)0; + sp->f_files = (fsfilcnt64_t)v.v_proc + 2; + sp->f_ffree = (fsfilcnt64_t)n; + sp->f_favail = (fsfilcnt64_t)n; + (void) cmpldev(&d32, vfsp->vfs_dev); + sp->f_fsid = d32; + /* It is guaranteed that vsw_name will fit in f_basetype */ + (void) strcpy(sp->f_basetype, vfssw[lxprocfstype].vsw_name); + sp->f_flag = vf_to_stf(vfsp->vfs_flag); + sp->f_namemax = 64; /* quite arbitrary */ + + (void) strcpy(sp->f_fstr, "lxproc"); + + return (0); +} diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vnops.c b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c new file mode 100644 index 0000000000..bd45e84311 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c @@ -0,0 +1,3077 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +/* + * lxproc -- a loosely Linux-compatible /proc + * + * The aspiration here is to provide something that sufficiently approximates + * the Linux /proc implementation for purposes of offering some compatibility + * for simple Linux /proc readers (e.g., ps/top/htop). However, it is not + * intended to exactly mimic Linux semantics; when choosing between offering + * compatibility and telling the truth, we emphatically pick the truth. A + * particular glaring example of this is the Linux notion of "tasks" (that is, + * threads), which -- due to historical misadventures on Linux -- allocate their + * identifiers from the process identifier space. (That is, each thread has in + * effect a pid.) 
Some Linux /proc readers have come to depend on this + * attribute, and become confused when threads appear with proper identifiers, + * so we simply opt for the pre-2.6 behavior, and do not present the tasks + * directory at all. Similarly, when choosing between offering compatibility + * and remaining consistent with our broader security model, we (obviously) + * choose security over compatibility. In short, this is meant to be a best + * effort -- no more. + */ + +#include <sys/cpupart.h> +#include <sys/cpuvar.h> +#include <sys/session.h> +#include <sys/vmparam.h> +#include <sys/mman.h> +#include <vm/rm.h> +#include <vm/seg_vn.h> +#include <sys/sdt.h> +#include <sys/strlog.h> +#include <sys/stropts.h> +#include <sys/cmn_err.h> +#include <sys/x86_archext.h> +#include <sys/archsystm.h> +#include <sys/fp.h> +#include <sys/pool_pset.h> +#include <sys/pset.h> +#include <sys/zone.h> +#include <sys/pghw.h> +#include <sys/vfs_opreg.h> + +/* Dependent on procfs */ +extern kthread_t *prchoose(proc_t *); + +#include "lxproc.h" + +extern pgcnt_t swapfs_minfree; +extern time_t boot_time; + +/* + * Pointer to the vnode ops vector for this fs. + * This is instantiated in lxprinit() in lxpr_vfsops.c + */ +vnodeops_t *lxpr_vnodeops; + +static int lxpr_open(vnode_t **, int, cred_t *, caller_context_t *); +static int lxpr_close(vnode_t *, int, int, offset_t, cred_t *, + caller_context_t *); +static int lxpr_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *); +static int lxpr_getattr(vnode_t *, vattr_t *, int, cred_t *, + caller_context_t *); +static int lxpr_access(vnode_t *, int, int, cred_t *, caller_context_t *); +static int lxpr_lookup(vnode_t *, char *, vnode_t **, + pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *, + pathname_t *); +static int lxpr_readdir(vnode_t *, uio_t *, cred_t *, int *, + caller_context_t *, int); +static int lxpr_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *); +static int lxpr_cmp(vnode_t *, vnode_t *, caller_context_t *); +static int lxpr_realvp(vnode_t *, vnode_t **, caller_context_t *); +static int lxpr_sync(void); +static void lxpr_inactive(vnode_t *, cred_t *, caller_context_t *); + +static vnode_t *lxpr_lookup_procdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_piddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_not_a_dir(vnode_t *, char *); +static vnode_t *lxpr_lookup_fddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_netdir(vnode_t *, char *); + +static int lxpr_readdir_procdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_piddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_not_a_dir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_fddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_netdir(lxpr_node_t *, uio_t *, int *); + +static void lxpr_read_invalid(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_empty(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_cpuinfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_isdir(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_fd(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_kmsg(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_loadavg(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_meminfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_mounts(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_partitions(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_uptime(lxpr_node_t *, lxpr_uiobuf_t *); +static void 
lxpr_read_version(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_pid_cmdline(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_maps(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_statm(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_status(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_net_arp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev_mcast(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_igmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_vif(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_mcfilter(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_netstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_raw(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_route(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rpc(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rt_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_sockstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_snmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_tcp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_udp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_unix(lxpr_node_t *, lxpr_uiobuf_t *); + +/* + * Simple conversion + */ +#define btok(x) ((x) >> 10) /* bytes to kbytes */ +#define ptok(x) ((x) << (PAGESHIFT - 10)) /* pages to kbytes */ + +/* + * The lxproc vnode operations vector + */ +const fs_operation_def_t lxpr_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = lxpr_open }, + VOPNAME_CLOSE, { .vop_close = lxpr_close }, + VOPNAME_READ, { .vop_read = lxpr_read }, + VOPNAME_GETATTR, { .vop_getattr = lxpr_getattr }, + VOPNAME_ACCESS, { .vop_access = lxpr_access }, + VOPNAME_LOOKUP, { .vop_lookup = lxpr_lookup }, + VOPNAME_READDIR, { .vop_readdir = lxpr_readdir }, + VOPNAME_READLINK, { .vop_readlink = lxpr_readlink }, + VOPNAME_FSYNC, { .error = lxpr_sync }, + VOPNAME_SEEK, { .error = lxpr_sync }, + VOPNAME_INACTIVE, { .vop_inactive = lxpr_inactive }, + VOPNAME_CMP, { .vop_cmp = lxpr_cmp }, + VOPNAME_REALVP, { .vop_realvp = lxpr_realvp }, + NULL, NULL +}; + +/* + * file contents of an lxproc directory. + */ +static lxpr_dirent_t lxpr_dir[] = { + { LXPR_CMDLINE, "cmdline" }, + { LXPR_CPUINFO, "cpuinfo" }, + { LXPR_DEVICES, "devices" }, + { LXPR_DMA, "dma" }, + { LXPR_FILESYSTEMS, "filesystems" }, + { LXPR_INTERRUPTS, "interrupts" }, + { LXPR_IOPORTS, "ioports" }, + { LXPR_KCORE, "kcore" }, + { LXPR_KMSG, "kmsg" }, + { LXPR_LOADAVG, "loadavg" }, + { LXPR_MEMINFO, "meminfo" }, + { LXPR_MOUNTS, "mounts" }, + { LXPR_NETDIR, "net" }, + { LXPR_PARTITIONS, "partitions" }, + { LXPR_SELF, "self" }, + { LXPR_STAT, "stat" }, + { LXPR_UPTIME, "uptime" }, + { LXPR_VERSION, "version" } +}; + +#define PROCDIRFILES (sizeof (lxpr_dir) / sizeof (lxpr_dir[0])) + +/* + * Contents of an /lxproc/<pid> directory. 
+ */ +static lxpr_dirent_t piddir[] = { + { LXPR_PID_CMDLINE, "cmdline" }, + { LXPR_PID_CPU, "cpu" }, + { LXPR_PID_CURDIR, "cwd" }, + { LXPR_PID_ENV, "environ" }, + { LXPR_PID_EXE, "exe" }, + { LXPR_PID_MAPS, "maps" }, + { LXPR_PID_MEM, "mem" }, + { LXPR_PID_ROOTDIR, "root" }, + { LXPR_PID_STAT, "stat" }, + { LXPR_PID_STATM, "statm" }, + { LXPR_PID_STATUS, "status" }, + { LXPR_PID_FDDIR, "fd" } +}; + +#define PIDDIRFILES (sizeof (piddir) / sizeof (piddir[0])) + +/* + * contents of /lxproc/net directory + */ +static lxpr_dirent_t netdir[] = { + { LXPR_NET_ARP, "arp" }, + { LXPR_NET_DEV, "dev" }, + { LXPR_NET_DEV_MCAST, "dev_mcast" }, + { LXPR_NET_IGMP, "igmp" }, + { LXPR_NET_IP_MR_CACHE, "ip_mr_cache" }, + { LXPR_NET_IP_MR_VIF, "ip_mr_vif" }, + { LXPR_NET_MCFILTER, "mcfilter" }, + { LXPR_NET_NETSTAT, "netstat" }, + { LXPR_NET_RAW, "raw" }, + { LXPR_NET_ROUTE, "route" }, + { LXPR_NET_RPC, "rpc" }, + { LXPR_NET_RT_CACHE, "rt_cache" }, + { LXPR_NET_SOCKSTAT, "sockstat" }, + { LXPR_NET_SNMP, "snmp" }, + { LXPR_NET_STAT, "stat" }, + { LXPR_NET_TCP, "tcp" }, + { LXPR_NET_UDP, "udp" }, + { LXPR_NET_UNIX, "unix" } +}; + +#define NETDIRFILES (sizeof (netdir) / sizeof (netdir[0])) + +/* + * These are the major signal number differences between Linux and native: + * + * ==================================== + * | Number | Linux | Native | + * | ====== | ========= | ========== | + * | 7 | SIGBUS | SIGEMT | + * | 10 | SIGUSR1 | SIGBUS | + * | 12 | SIGUSR2 | SIGSYS | + * | 16 | SIGSTKFLT | SIGUSR1 | + * | 17 | SIGCHLD | SIGUSR2 | + * | 18 | SIGCONT | SIGCHLD | + * | 19 | SIGSTOP | SIGPWR | + * | 20 | SIGTSTP | SIGWINCH | + * | 21 | SIGTTIN | SIGURG | + * | 22 | SIGTTOU | SIGPOLL | + * | 23 | SIGURG | SIGSTOP | + * | 24 | SIGXCPU | SIGTSTP | + * | 25 | SIGXFSZ | SIGCONT | + * | 26 | SIGVTALARM | SIGTTIN | + * | 27 | SIGPROF | SIGTTOU | + * | 28 | SIGWINCH | SIGVTALARM | + * | 29 | SIGPOLL | SIGPROF | + * | 30 | SIGPWR | SIGXCPU | + * | 31 | SIGSYS | SIGXFSZ | + * ==================================== + * + * Not every Linux signal maps to a native signal, nor does every native + * signal map to a Linux counterpart. However, when signals do map, the + * mapping is unique. 
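lxpr_sigmap[] below is indexed by the native signal number and yields the corresponding Linux number, with -1 marking native signals that have no Linux equivalent. A hypothetical lookup helper, just to make the direction of the table explicit:

        static int
        lxpr_sig_to_linux(int sig)
        {
                if (sig <= 0 || sig >= NSIG)
                        return (-1);
                return (lxpr_sigmap[sig]);
        }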
+ */ +static int +lxpr_sigmap[NSIG] = { + 0, + LX_SIGHUP, + LX_SIGINT, + LX_SIGQUIT, + LX_SIGILL, + LX_SIGTRAP, + LX_SIGABRT, + LX_SIGSTKFLT, + LX_SIGFPE, + LX_SIGKILL, + LX_SIGBUS, + LX_SIGSEGV, + LX_SIGSYS, + LX_SIGPIPE, + LX_SIGALRM, + LX_SIGTERM, + LX_SIGUSR1, + LX_SIGUSR2, + LX_SIGCHLD, + LX_SIGPWR, + LX_SIGWINCH, + LX_SIGURG, + LX_SIGPOLL, + LX_SIGSTOP, + LX_SIGTSTP, + LX_SIGCONT, + LX_SIGTTIN, + LX_SIGTTOU, + LX_SIGVTALRM, + LX_SIGPROF, + LX_SIGXCPU, + LX_SIGXFSZ, + -1, /* 32: illumos SIGWAITING */ + -1, /* 33: illumos SIGLWP */ + -1, /* 34: illumos SIGFREEZE */ + -1, /* 35: illumos SIGTHAW */ + -1, /* 36: illumos SIGCANCEL */ + -1, /* 37: illumos SIGLOST */ + -1, /* 38: illumos SIGXRES */ + -1, /* 39: illumos SIGJVM1 */ + -1, /* 40: illumos SIGJVM2 */ + LX_SIGRTMIN, /* 41: illumos _SIGRTMIN */ + LX_SIGRTMIN + 1, + LX_SIGRTMIN + 2, + LX_SIGRTMIN + 3, + LX_SIGRTMIN + 4, + LX_SIGRTMIN + 5, + LX_SIGRTMIN + 6, + LX_SIGRTMIN + 7, + LX_SIGRTMIN + 8, + LX_SIGRTMIN + 9, + LX_SIGRTMIN + 10, + LX_SIGRTMIN + 11, + LX_SIGRTMIN + 12, + LX_SIGRTMIN + 13, + LX_SIGRTMIN + 14, + LX_SIGRTMIN + 15, + LX_SIGRTMIN + 16, + LX_SIGRTMIN + 17, + LX_SIGRTMIN + 18, + LX_SIGRTMIN + 19, + LX_SIGRTMIN + 20, + LX_SIGRTMIN + 21, + LX_SIGRTMIN + 22, + LX_SIGRTMIN + 23, + LX_SIGRTMIN + 24, + LX_SIGRTMIN + 25, + LX_SIGRTMIN + 26, + LX_SIGRTMIN + 27, + LX_SIGRTMIN + 28, + LX_SIGRTMIN + 29, + LX_SIGRTMIN + 30, + LX_SIGRTMAX, +}; + +/* + * lxpr_open(): Vnode operation for VOP_OPEN() + */ +static int +lxpr_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + vnode_t *vp = *vpp; + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + vnode_t *rvp; + int error = 0; + + /* + * We only allow reading in this file systrem + */ + if (flag & FWRITE) + return (EROFS); + + /* + * If we are opening an underlying file only allow regular files + * reject the open for anything but a regular file. + * Just do it if we are opening the current or root directory. + */ + if (lxpnp->lxpr_realvp != NULL) { + rvp = lxpnp->lxpr_realvp; + + if (type == LXPR_PID_FD_FD && rvp->v_type != VREG) + error = EACCES; + else { + /* + * Need to hold rvp since VOP_OPEN() may release it. + */ + VN_HOLD(rvp); + error = VOP_OPEN(&rvp, flag, cr, ct); + if (error) { + VN_RELE(rvp); + } else { + *vpp = rvp; + VN_RELE(vp); + } + } + } + + if (type == LXPR_KMSG) { + ldi_ident_t li = VTOLXPM(vp)->lxprm_li; + struct strioctl str; + int rv; + + /* + * Open the zone's console device using the layered driver + * interface. + */ + if ((error = ldi_open_by_name("/dev/log", FREAD, cr, + &lxpnp->lxpr_cons_ldih, li)) != 0) + return (error); + + /* + * Send an ioctl to the underlying console device, letting it + * know we're interested in getting console messages. 
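I_STR is the STREAMS pass-through ioctl: the strioctl structure below wraps the real command, I_CONSLOG, along with its timeout and optional payload so the log driver sees it as a directly issued ioctl; with ic_len of zero there is no payload to copy in or back.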
+ */ + str.ic_cmd = I_CONSLOG; + str.ic_timout = 0; + str.ic_len = 0; + str.ic_dp = NULL; + if ((error = ldi_ioctl(lxpnp->lxpr_cons_ldih, I_STR, + (intptr_t)&str, FKIOCTL, cr, &rv)) != 0) + return (error); + } + + return (error); +} + + +/* + * lxpr_close(): Vnode operation for VOP_CLOSE() + */ +/* ARGSUSED */ +static int +lxpr_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpr = VTOLXP(vp); + lxpr_nodetype_t type = lxpr->lxpr_type; + int err; + + /* + * we should never get here because the close is done on the realvp + * for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR && + type != LXPR_PID_EXE); + + if (type == LXPR_KMSG) { + if ((err = ldi_close(lxpr->lxpr_cons_ldih, 0, cr)) != 0) + return (err); + } + + return (0); +} + +static void (*lxpr_read_function[LXPR_NFILES])() = { + lxpr_read_isdir, /* /proc */ + lxpr_read_isdir, /* /proc/<pid> */ + lxpr_read_pid_cmdline, /* /proc/<pid>/cmdline */ + lxpr_read_empty, /* /proc/<pid>/cpu */ + lxpr_read_invalid, /* /proc/<pid>/cwd */ + lxpr_read_empty, /* /proc/<pid>/environ */ + lxpr_read_invalid, /* /proc/<pid>/exe */ + lxpr_read_pid_maps, /* /proc/<pid>/maps */ + lxpr_read_empty, /* /proc/<pid>/mem */ + lxpr_read_invalid, /* /proc/<pid>/root */ + lxpr_read_pid_stat, /* /proc/<pid>/stat */ + lxpr_read_pid_statm, /* /proc/<pid>/statm */ + lxpr_read_pid_status, /* /proc/<pid>/status */ + lxpr_read_isdir, /* /proc/<pid>/fd */ + lxpr_read_fd, /* /proc/<pid>/fd/nn */ + lxpr_read_empty, /* /proc/cmdline */ + lxpr_read_cpuinfo, /* /proc/cpuinfo */ + lxpr_read_empty, /* /proc/devices */ + lxpr_read_empty, /* /proc/dma */ + lxpr_read_empty, /* /proc/filesystems */ + lxpr_read_empty, /* /proc/interrupts */ + lxpr_read_empty, /* /proc/ioports */ + lxpr_read_empty, /* /proc/kcore */ + lxpr_read_kmsg, /* /proc/kmsg */ + lxpr_read_loadavg, /* /proc/loadavg */ + lxpr_read_meminfo, /* /proc/meminfo */ + lxpr_read_mounts, /* /proc/mounts */ + lxpr_read_isdir, /* /proc/net */ + lxpr_read_net_arp, /* /proc/net/arp */ + lxpr_read_net_dev, /* /proc/net/dev */ + lxpr_read_net_dev_mcast, /* /proc/net/dev_mcast */ + lxpr_read_net_igmp, /* /proc/net/igmp */ + lxpr_read_net_ip_mr_cache, /* /proc/net/ip_mr_cache */ + lxpr_read_net_ip_mr_vif, /* /proc/net/ip_mr_vif */ + lxpr_read_net_mcfilter, /* /proc/net/mcfilter */ + lxpr_read_net_netstat, /* /proc/net/netstat */ + lxpr_read_net_raw, /* /proc/net/raw */ + lxpr_read_net_route, /* /proc/net/route */ + lxpr_read_net_rpc, /* /proc/net/rpc */ + lxpr_read_net_rt_cache, /* /proc/net/rt_cache */ + lxpr_read_net_sockstat, /* /proc/net/sockstat */ + lxpr_read_net_snmp, /* /proc/net/snmp */ + lxpr_read_net_stat, /* /proc/net/stat */ + lxpr_read_net_tcp, /* /proc/net/tcp */ + lxpr_read_net_udp, /* /proc/net/udp */ + lxpr_read_net_unix, /* /proc/net/unix */ + lxpr_read_partitions, /* /proc/partitions */ + lxpr_read_invalid, /* /proc/self */ + lxpr_read_stat, /* /proc/stat */ + lxpr_read_uptime, /* /proc/uptime */ + lxpr_read_version, /* /proc/version */ +}; + +/* + * Array of lookup functions, indexed by /lxproc file type. 
+ */ +static vnode_t *(*lxpr_lookup_function[LXPR_NFILES])() = { + lxpr_lookup_procdir, /* /proc */ + lxpr_lookup_piddir, /* /proc/<pid> */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/environ */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/exe */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/maps */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/mem */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/root */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/stat */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/statm */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/status */ + lxpr_lookup_fddir, /* /proc/<pid>/fd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_lookup_not_a_dir, /* /proc/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/cpuinfo */ + lxpr_lookup_not_a_dir, /* /proc/devices */ + lxpr_lookup_not_a_dir, /* /proc/dma */ + lxpr_lookup_not_a_dir, /* /proc/filesystems */ + lxpr_lookup_not_a_dir, /* /proc/interrupts */ + lxpr_lookup_not_a_dir, /* /proc/ioports */ + lxpr_lookup_not_a_dir, /* /proc/kcore */ + lxpr_lookup_not_a_dir, /* /proc/kmsg */ + lxpr_lookup_not_a_dir, /* /proc/loadavg */ + lxpr_lookup_not_a_dir, /* /proc/meminfo */ + lxpr_lookup_not_a_dir, /* /proc/mounts */ + lxpr_lookup_netdir, /* /proc/net */ + lxpr_lookup_not_a_dir, /* /proc/net/arp */ + lxpr_lookup_not_a_dir, /* /proc/net/dev */ + lxpr_lookup_not_a_dir, /* /proc/net/dev_mcast */ + lxpr_lookup_not_a_dir, /* /proc/net/igmp */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_lookup_not_a_dir, /* /proc/net/mcfilter */ + lxpr_lookup_not_a_dir, /* /proc/net/netstat */ + lxpr_lookup_not_a_dir, /* /proc/net/raw */ + lxpr_lookup_not_a_dir, /* /proc/net/route */ + lxpr_lookup_not_a_dir, /* /proc/net/rpc */ + lxpr_lookup_not_a_dir, /* /proc/net/rt_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/sockstat */ + lxpr_lookup_not_a_dir, /* /proc/net/snmp */ + lxpr_lookup_not_a_dir, /* /proc/net/stat */ + lxpr_lookup_not_a_dir, /* /proc/net/tcp */ + lxpr_lookup_not_a_dir, /* /proc/net/udp */ + lxpr_lookup_not_a_dir, /* /proc/net/unix */ + lxpr_lookup_not_a_dir, /* /proc/partitions */ + lxpr_lookup_not_a_dir, /* /proc/self */ + lxpr_lookup_not_a_dir, /* /proc/stat */ + lxpr_lookup_not_a_dir, /* /proc/uptime */ + lxpr_lookup_not_a_dir, /* /proc/version */ +}; + +/* + * Array of readdir functions, indexed by /proc file type. 
+ */ +static int (*lxpr_readdir_function[LXPR_NFILES])() = { + lxpr_readdir_procdir, /* /proc */ + lxpr_readdir_piddir, /* /proc/<pid> */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/environ */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/exe */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/maps */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/mem */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/root */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/stat */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/statm */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/status */ + lxpr_readdir_fddir, /* /proc/<pid>/fd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_readdir_not_a_dir, /* /proc/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/cpuinfo */ + lxpr_readdir_not_a_dir, /* /proc/devices */ + lxpr_readdir_not_a_dir, /* /proc/dma */ + lxpr_readdir_not_a_dir, /* /proc/filesystems */ + lxpr_readdir_not_a_dir, /* /proc/interrupts */ + lxpr_readdir_not_a_dir, /* /proc/ioports */ + lxpr_readdir_not_a_dir, /* /proc/kcore */ + lxpr_readdir_not_a_dir, /* /proc/kmsg */ + lxpr_readdir_not_a_dir, /* /proc/loadavg */ + lxpr_readdir_not_a_dir, /* /proc/meminfo */ + lxpr_readdir_not_a_dir, /* /proc/mounts */ + lxpr_readdir_netdir, /* /proc/net */ + lxpr_readdir_not_a_dir, /* /proc/net/arp */ + lxpr_readdir_not_a_dir, /* /proc/net/dev */ + lxpr_readdir_not_a_dir, /* /proc/net/dev_mcast */ + lxpr_readdir_not_a_dir, /* /proc/net/igmp */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_readdir_not_a_dir, /* /proc/net/mcfilter */ + lxpr_readdir_not_a_dir, /* /proc/net/netstat */ + lxpr_readdir_not_a_dir, /* /proc/net/raw */ + lxpr_readdir_not_a_dir, /* /proc/net/route */ + lxpr_readdir_not_a_dir, /* /proc/net/rpc */ + lxpr_readdir_not_a_dir, /* /proc/net/rt_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/sockstat */ + lxpr_readdir_not_a_dir, /* /proc/net/snmp */ + lxpr_readdir_not_a_dir, /* /proc/net/stat */ + lxpr_readdir_not_a_dir, /* /proc/net/tcp */ + lxpr_readdir_not_a_dir, /* /proc/net/udp */ + lxpr_readdir_not_a_dir, /* /proc/net/unix */ + lxpr_readdir_not_a_dir, /* /proc/partitions */ + lxpr_readdir_not_a_dir, /* /proc/self */ + lxpr_readdir_not_a_dir, /* /proc/stat */ + lxpr_readdir_not_a_dir, /* /proc/uptime */ + lxpr_readdir_not_a_dir, /* /proc/version */ +}; + + +/* + * lxpr_read(): Vnode operation for VOP_READ() + * + * As the format of all the files that can be read in lxproc is human readable + * and not binary structures there do not have to be different read variants + * depending on whether the reading process model is 32- or 64-bit. 
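lxpr_read() below is a pure dispatcher: it allocates a uiobuf, calls lxpr_read_function[type] for the node being read, and flushes whatever that handler formatted. Adding another read-only file thus amounts to a new node type, a directory entry, and one handler of this shape (a hypothetical example, not part of the patch):

        static void
        lxpr_read_example(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
        {
                lxpr_uiobuf_printf(uiobuf, "pid %d\n", (int)lxpnp->lxpr_pid);
        }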
+ */ +/* ARGSUSED */ +static int +lxpr_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + lxpr_uiobuf_t *uiobuf = lxpr_uiobuf_new(uiop); + int error; + + ASSERT(type >= 0 && type < LXPR_NFILES); + + lxpr_read_function[type](lxpnp, uiobuf); + + error = lxpr_uiobuf_flush(uiobuf); + lxpr_uiobuf_free(uiobuf); + + return (error); +} + +/* + * lxpr_read_invalid(), lxpr_read_isdir(), lxpr_read_empty() + * + * Various special case reads: + * - trying to read a directory + * - invalid file (used to mean a file that should be implemented, + * but isn't yet) + * - empty file + * - wait to be able to read a file that will never have anything to read + */ +/* ARGSUSED */ +static void +lxpr_read_isdir(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EISDIR); +} + +/* ARGSUSED */ +static void +lxpr_read_invalid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EINVAL); +} + +/* ARGSUSED */ +static void +lxpr_read_empty(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* + * lxpr_read_pid_cmdline(): + * + * This is not precisely compatible with Linux: the Linux cmdline returns argv + * with the correct separation using \0 between the arguments, but we cannot do + * that without copying the real argv from the correct process context. This + * is too difficult to attempt so we pretend that the entire cmdline is just + * argv[0]. This is good enough for ps and htop to display correctly, but might + * cause some other things not to work correctly. + */ +static void +lxpr_read_pid_cmdline(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + char *buf; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_CMDLINE); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + buf = PTOU(p)->u_argv != 0 ? 
PTOU(p)->u_psargs : PTOU(p)->u_comm; + + lxpr_uiobuf_write(uiobuf, buf, strlen(buf) + 1); + lxpr_unlock(p); +} + +/* + * lxpr_read_pid_maps(): memory map file + */ +static void +lxpr_read_pid_maps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + struct seg *seg; + char *buf; + int buflen = MAXPATHLEN; + struct print_data { + caddr_t saddr; + caddr_t eaddr; + int type; + char prot[5]; + uint32_t offset; + vnode_t *vp; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + struct print_data *pbuf; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_MAPS); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + as = p->p_as; + + if (as == &kas) { + lxpr_unlock(p); + return; + } + + mutex_exit(&p->p_lock); + + /* Iterate over all segments in the address space */ + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + vnode_t *vp; + uint_t protbits; + + pbuf = kmem_alloc(sizeof (*pbuf), KM_SLEEP); + + pbuf->saddr = seg->s_base; + pbuf->eaddr = seg->s_base+seg->s_size; + pbuf->type = SEGOP_GETTYPE(seg, seg->s_base); + + /* + * Cheat and only use the protection bits of the first page + * in the segment + */ + (void) strncpy(pbuf->prot, "----", sizeof (pbuf->prot)); + (void) SEGOP_GETPROT(seg, seg->s_base, 0, &protbits); + + if (protbits & PROT_READ) pbuf->prot[0] = 'r'; + if (protbits & PROT_WRITE) pbuf->prot[1] = 'w'; + if (protbits & PROT_EXEC) pbuf->prot[2] = 'x'; + if (pbuf->type & MAP_SHARED) pbuf->prot[3] = 's'; + else if (pbuf->type & MAP_PRIVATE) pbuf->prot[3] = 'p'; + + if (seg->s_ops == &segvn_ops && + SEGOP_GETVP(seg, seg->s_base, &vp) == 0 && + vp != NULL && vp->v_type == VREG) { + VN_HOLD(vp); + pbuf->vp = vp; + } else { + pbuf->vp = NULL; + } + + pbuf->offset = (uint32_t)SEGOP_GETOFFSET(seg, pbuf->saddr); + + pbuf->next = NULL; + *print_tail = pbuf; + print_tail = &pbuf->next; + } + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + buf = kmem_alloc(buflen, KM_SLEEP); + + /* print the data we've extracted */ + pbuf = print_head; + while (pbuf != NULL) { + struct print_data *pbuf_next; + vattr_t vattr; + + int maj = 0; + int min = 0; + u_longlong_t inode = 0; + + *buf = '\0'; + if (pbuf->vp != NULL) { + vattr.va_mask = AT_FSID | AT_NODEID; + if (VOP_GETATTR(pbuf->vp, &vattr, 0, CRED(), + NULL) == 0) { + maj = getmajor(vattr.va_fsid); + min = getminor(vattr.va_fsid); + inode = vattr.va_nodeid; + } + (void) vnodetopath(NULL, pbuf->vp, buf, buflen, CRED()); + VN_RELE(pbuf->vp); + } + + if (*buf != '\0') { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02d:%03d %lld %s\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode, buf); + } else { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02d:%03d %lld\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode); + } + + pbuf_next = pbuf->next; + kmem_free(pbuf, sizeof (*pbuf)); + pbuf = pbuf_next; + } + + kmem_free(buf, buflen); +} + +/* + * lxpr_read_pid_statm(): memory status file + */ +static void +lxpr_read_pid_statm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + size_t vsize; + size_t rss; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATM); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + as = p->p_as; + + mutex_exit(&p->p_lock); + + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + vsize = 
btopr(as->a_resvsize); + rss = rm_asrss(as); + AS_LOCK_EXIT(as, &as->a_lock); + + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, + "%lu %lu %lu %lu %lu %lu %lu\n", + vsize, rss, 0l, rss, 0l, 0l, 0l); +} + +/* + * lxpr_read_pid_status(): status file + */ +static void +lxpr_read_pid_status(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + user_t *up; + cred_t *cr; + const gid_t *groups; + int ngroups; + struct as *as; + char *status; + pid_t pid, ppid; + size_t vsize; + size_t rss; + k_sigset_t current, ignore, handle; + int i, lx_sig; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATUS); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + pid = p->p_pid; + + /* + * Convert pid to the Linux default of 1 if we're the zone's init + * process + */ + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; + ppid = 0; /* parent pid for init is 0 */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) + ? curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + } + + t = prchoose(p); + if (t != NULL) { + switch (t->t_state) { + case TS_SLEEP: + status = "S (sleeping)"; + break; + case TS_RUN: + case TS_ONPROC: + status = "R (running)"; + break; + case TS_ZOMB: + status = "Z (zombie)"; + break; + case TS_STOPPED: + status = "T (stopped)"; + break; + default: + status = "! (unknown)"; + break; + } + thread_unlock(t); + } else { + /* + * there is a hole in the exit code, where a proc can have + * no threads but it is yet to be flagged SZOMB. 
We will + * assume we are about to become a zombie + */ + status = "Z (zombie)"; + } + + up = PTOU(p); + mutex_enter(&p->p_crlock); + crhold(cr = p->p_cred); + mutex_exit(&p->p_crlock); + + lxpr_uiobuf_printf(uiobuf, + "Name:\t%s\n" + "State:\t%s\n" + "Tgid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" + "TracerPid:\t%d\n" + "Uid:\t%u\t%u\t%u\t%u\n" + "Gid:\t%u\t%u\t%u\t%u\n" + "FDSize:\t%d\n" + "Groups:\t", + up->u_comm, + status, + pid, /* thread group id - same as pid */ + pid, + ppid, + 0, + crgetruid(cr), crgetuid(cr), crgetsuid(cr), crgetuid(cr), + crgetrgid(cr), crgetgid(cr), crgetsgid(cr), crgetgid(cr), + p->p_fno_ctl); + + ngroups = crgetngroups(cr); + groups = crgetgroups(cr); + for (i = 0; i < ngroups; i++) { + lxpr_uiobuf_printf(uiobuf, + "%u ", + groups[i]); + } + crfree(cr); + + as = p->p_as; + if ((p->p_stat != SZOMB) && !(p->p_flag & SSYS) && (as != &kas)) { + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + + lxpr_uiobuf_printf(uiobuf, + "\n" + "VmSize:\t%8lu kB\n" + "VmLck:\t%8lu kB\n" + "VmRSS:\t%8lu kB\n" + "VmData:\t%8lu kB\n" + "VmStk:\t%8lu kB\n" + "VmExe:\t%8lu kB\n" + "VmLib:\t%8lu kB", + btok(vsize), + 0l, + ptok(rss), + 0l, + btok(p->p_stksize), + ptok(rss), + 0l); + } + + sigemptyset(¤t); + sigemptyset(&ignore); + sigemptyset(&handle); + + for (i = 1; i < NSIG; i++) { + lx_sig = lxpr_sigmap[i]; + + if ((lx_sig > 0) && (lx_sig < LX_NSIG)) { + if (sigismember(&p->p_sig, i)) + sigaddset(¤t, lx_sig); + + if (up->u_signal[i - 1] == SIG_IGN) + sigaddset(&ignore, lx_sig); + else if (up->u_signal[i - 1] != SIG_DFL) + sigaddset(&handle, lx_sig); + } + } + + lxpr_uiobuf_printf(uiobuf, + "\n" + "SigPnd:\t%08x%08x\n" + "SigBlk:\t%08x%08x\n" + "SigIgn:\t%08x%08x\n" + "SigCgt:\t%08x%08x\n" + "CapInh:\t%016x\n" + "CapPrm:\t%016x\n" + "CapEff:\t%016x\n", + current.__sigbits[1], current.__sigbits[0], + 0, 0, /* signals blocked on per thread basis */ + ignore.__sigbits[1], ignore.__sigbits[0], + handle.__sigbits[1], handle.__sigbits[0], + /* Can't do anything with linux capabilities */ + 0, + 0, + 0); + + lxpr_unlock(p); +} + + +/* + * lxpr_read_pid_stat(): pid stat file + */ +static void +lxpr_read_pid_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + struct as *as; + char stat; + pid_t pid, ppid, pgpid, spid; + gid_t psgid; + dev_t psdev; + size_t rss, vsize; + int nice, pri; + caddr_t wchan; + processorid_t cpu; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STAT); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + pid = p->p_pid; + + /* + * Set Linux defaults if we're the zone's init process + */ + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; /* PID for init */ + ppid = 0; /* parent PID for init is 0 */ + pgpid = 0; /* process group for init is 0 */ + psgid = (gid_t)-1; /* credential GID for init is -1 */ + spid = 0; /* session id for init is 0 */ + psdev = 0; /* session device for init is 0 */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) ? 
+ curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + + pgpid = p->p_pgrp; + + mutex_enter(&p->p_splock); + mutex_enter(&p->p_sessp->s_lock); + spid = p->p_sessp->s_sid; + psdev = p->p_sessp->s_dev; + if (p->p_sessp->s_cred) + psgid = crgetgid(p->p_sessp->s_cred); + else + psgid = crgetgid(p->p_cred); + + mutex_exit(&p->p_sessp->s_lock); + mutex_exit(&p->p_splock); + } + + t = prchoose(p); + if (t != NULL) { + switch (t->t_state) { + case TS_SLEEP: + stat = 'S'; break; + case TS_RUN: + case TS_ONPROC: + stat = 'R'; break; + case TS_ZOMB: + stat = 'Z'; break; + case TS_STOPPED: + stat = 'T'; break; + default: + stat = '!'; break; + } + + if (CL_DONICE(t, NULL, 0, &nice) != 0) + nice = 0; + + pri = t->t_pri; + wchan = t->t_wchan; + cpu = t->t_cpu->cpu_id; + thread_unlock(t); + } else { + /* Only zombies have no threads */ + stat = 'Z'; + nice = 0; + pri = 0; + wchan = 0; + cpu = 0; + } + as = p->p_as; + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + + lxpr_uiobuf_printf(uiobuf, + "%d (%s) %c %d %d %d %d %d " + "%lu %lu %lu %lu %lu " + "%lu %lu %ld %ld " + "%d %d %d " + "%lu " + "%lu " + "%lu %ld %llu " + "%lu %lu %u " + "%lu %lu " + "%lu %lu %lu %lu " + "%lu " + "%lu %lu " + "%d " + "%d" + "\n", + pid, PTOU(p)->u_comm, stat, ppid, pgpid, spid, psdev, psgid, + 0l, 0l, 0l, 0l, 0l, /* flags, minflt, cminflt, majflt, cmajflt */ + p->p_utime, p->p_stime, p->p_cutime, p->p_cstime, + pri, nice, p->p_lwpcnt, + 0l, /* itrealvalue (time before next SIGALRM) */ + PTOU(p)->u_ticks, + vsize, rss, p->p_vmem_ctl, + 0l, 0l, USRSTACK, /* startcode, endcode, startstack */ + 0l, 0l, /* kstkesp, kstkeip */ + 0l, 0l, 0l, 0l, /* signal, blocked, sigignore, sigcatch */ + wchan, + 0l, 0l, /* nswap, cnswap */ + 0, /* exit_signal */ + cpu); + + lxpr_unlock(p); +} + +/* ARGSUSED */ +static void +lxpr_read_net_arp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, "Inter-| Receive " + " | Transmit\n"); + lxpr_uiobuf_printf(uiobuf, " face |bytes packets errs drop fifo" + " frame compressed multicast|bytes packets errs drop fifo" + " colls carrier compressed\n"); + + /* + * Data about each interface should go here, but that shouldn't be added + * unless there is an lxproc reader that actually makes use of it (and + * doesn't need anything else that we refuse to provide)... 
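
/*
 * Illustrative sketch (editorial, userland C, hypothetical reader): the stat
 * line produced by lxpr_read_pid_stat() above is consumed by tools such as ps,
 * which parse the leading fields positionally.  A minimal parse of the first
 * four fields, assuming the comm field contains no ')' character.
 */
#include <stdio.h>

int
main(void)
{
	const char *line = "1 (init) S 0 1 1 0 -1 ...";
	char comm[64], state;
	int pid, ppid;

	if (sscanf(line, "%d (%63[^)]) %c %d", &pid, comm, &state, &ppid) == 4)
		printf("pid=%d comm=%s state=%c ppid=%d\n",
		    pid, comm, state, ppid);
	return (0);
}
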
+ */ +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev_mcast(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_igmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_vif(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_mcfilter(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_netstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_raw(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_rpc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_rt_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_sockstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_snmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_tcp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_udp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_unix(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* + * lxpr_read_kmsg(): read the contents of the kernel message queue. We + * translate this into the reception of console messages for this zone; each + * read copies out a single zone console message, or blocks until the next one + * is produced. + */ + +#define LX_KMSG_PRI "<0>" + +static void +lxpr_read_kmsg(lxpr_node_t *lxpnp, struct lxpr_uiobuf *uiobuf) +{ + ldi_handle_t lh = lxpnp->lxpr_cons_ldih; + mblk_t *mp; + + if (ldi_getmsg(lh, &mp, NULL) == 0) { + /* + * lxproc doesn't like successive reads to the same file + * descriptor unless we do an explicit rewind each time. + */ + lxpr_uiobuf_seek(uiobuf, 0); + + lxpr_uiobuf_printf(uiobuf, "%s%s", LX_KMSG_PRI, + mp->b_cont->b_rptr); + + freemsg(mp); + } +} + +/* + * lxpr_read_loadavg(): read the contents of the "loadavg" file. We do just + * enough for uptime and other simple lxproc readers to work + */ +extern int nthread; + +static void +lxpr_read_loadavg(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ulong_t avenrun1; + ulong_t avenrun5; + ulong_t avenrun15; + ulong_t avenrun1_cs; + ulong_t avenrun5_cs; + ulong_t avenrun15_cs; + int loadavg[3]; + int *loadbuf; + cpupart_t *cp; + zone_t *zone = LXPTOZ(lxpnp); + + uint_t nrunnable = 0; + rctl_qty_t nlwps; + + ASSERT(lxpnp->lxpr_type == LXPR_LOADAVG); + + mutex_enter(&cpu_lock); + + /* + * Need to add up values over all CPU partitions. If pools are active, + * only report the values of the zone's partition, which by definition + * includes the current CPU. 
+ */ + if (pool_pset_enabled()) { + psetid_t psetid = zone_pset_get(curproc->p_zone); + + ASSERT(curproc->p_zone != &zone0); + cp = CPU->cpu_part; + + nrunnable = cp->cp_nrunning + cp->cp_nrunnable; + (void) cpupart_get_loadavg(psetid, &loadavg[0], 3); + loadbuf = &loadavg[0]; + } else { + cp = cp_list_head; + do { + nrunnable += cp->cp_nrunning + cp->cp_nrunnable; + } while ((cp = cp->cp_next) != cp_list_head); + + loadbuf = zone == global_zone ? + &avenrun[0] : zone->zone_avenrun; + } + + /* + * If we're in the non-global zone, we'll report the total number of + * LWPs in the zone for the "nproc" parameter of /proc/loadavg, + * otherwise will just use nthread (which will include kernel threads, + * but should be good enough for lxproc). + */ + nlwps = zone == global_zone ? nthread : zone->zone_nlwps; + + mutex_exit(&cpu_lock); + + avenrun1 = loadbuf[0] >> FSHIFT; + avenrun1_cs = ((loadbuf[0] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun5 = loadbuf[1] >> FSHIFT; + avenrun5_cs = ((loadbuf[1] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun15 = loadbuf[2] >> FSHIFT; + avenrun15_cs = ((loadbuf[2] & (FSCALE-1)) * 100) >> FSHIFT; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d %ld.%02d %d/%d %d\n", + avenrun1, avenrun1_cs, + avenrun5, avenrun5_cs, + avenrun15, avenrun15_cs, + nrunnable, nlwps, 0); +} + +/* + * lxpr_read_meminfo(): read the contents of the "meminfo" file. + */ +static void +lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + int global = zone == global_zone; + long total_mem, free_mem, total_swap, used_swap; + + ASSERT(lxpnp->lxpr_type == LXPR_MEMINFO); + + if (global || zone->zone_phys_mem_ctl == UINT64_MAX) { + total_mem = physmem * PAGESIZE; + free_mem = freemem * PAGESIZE; + } else { + total_mem = zone->zone_phys_mem_ctl; + free_mem = zone->zone_phys_mem_ctl - zone->zone_phys_mem; + } + + if (global || zone->zone_max_swap_ctl == UINT64_MAX) { + total_swap = k_anoninfo.ani_max * PAGESIZE; + used_swap = k_anoninfo.ani_phys_resv * PAGESIZE; + } else { + mutex_enter(&zone->zone_mem_lock); + total_swap = zone->zone_max_swap_ctl; + used_swap = zone->zone_max_swap; + mutex_exit(&zone->zone_mem_lock); + } + + lxpr_uiobuf_printf(uiobuf, + " total: used: free: shared: buffers: cached:\n" + "Mem: %8lu %8lu %8lu %8u %8u %8u\n" + "Swap: %8lu %8lu %8lu\n" + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "MemShared: %8u kB\n" + "Buffers: %8u kB\n" + "Cached: %8u kB\n" + "SwapCached:%8u kB\n" + "Active: %8u kB\n" + "Inactive: %8u kB\n" + "HighTotal: %8u kB\n" + "HighFree: %8u kB\n" + "LowTotal: %8u kB\n" + "LowFree: %8u kB\n" + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n", + total_mem, total_mem - free_mem, free_mem, 0, 0, 0, + total_swap, used_swap, total_swap - used_swap, + btok(total_mem), /* MemTotal */ + btok(free_mem), /* MemFree */ + 0, /* MemShared */ + 0, /* Buffers */ + 0, /* Cached */ + 0, /* SwapCached */ + 0, /* Active */ + 0, /* Inactive */ + 0, /* HighTotal */ + 0, /* HighFree */ + btok(total_mem), /* LowTotal */ + btok(free_mem), /* LowFree */ + btok(total_swap), /* SwapTotal */ + btok(total_swap - used_swap)); /* SwapFree */ +} + +/* + * lxpr_read_mounts(): + */ +/* ARGSUSED */ +static void +lxpr_read_mounts(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + struct vfs *vfsp; + struct vfs *vfslist; + zone_t *zone = LXPTOZ(lxpnp); + struct print_data { + refstr_t *vfs_mntpt; + refstr_t *vfs_resource; + uint_t vfs_flag; + int vfs_fstype; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + 
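
/*
 * Illustrative sketch (editorial, userland C): the load averages above are
 * fixed-point values scaled by FSCALE (1 << FSHIFT), and the two decimal
 * places are produced with integer arithmetic only.  FSHIFT is taken to be 8
 * here purely for the example.
 */
#include <stdio.h>

#define	FSHIFT	8
#define	FSCALE	(1 << FSHIFT)

int
main(void)
{
	unsigned long avenrun = 437;	/* 437/256, i.e. about 1.70 */
	unsigned long whole = avenrun >> FSHIFT;
	unsigned long cents = ((avenrun & (FSCALE - 1)) * 100) >> FSHIFT;

	printf("%lu.%02lu\n", whole, cents);	/* prints 1.70 */
	return (0);
}
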
struct print_data *printp; + + vfs_list_read_lock(); + + if (zone == global_zone) { + vfsp = vfslist = rootvfs; + } else { + vfsp = vfslist = zone->zone_vfslist; + /* + * If the zone has a root entry, it will be the first in + * the list. If it doesn't, we conjure one up. + */ + if (vfslist == NULL || strcmp(refstr_value(vfsp->vfs_mntpt), + zone->zone_rootpath) != 0) { + struct vfs *tvfsp; + /* + * The root of the zone is not a mount point. The vfs + * we want to report is that of the zone's root vnode. + */ + tvfsp = zone->zone_rootvp->v_vfsp; + + lxpr_uiobuf_printf(uiobuf, + "/ / %s %s 0 0\n", + vfssw[tvfsp->vfs_fstype].vsw_name, + tvfsp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + + } + if (vfslist == NULL) { + vfs_list_unlock(); + return; + } + } + + /* + * Later on we have to do a lookupname, which can end up causing + * another vfs_list_read_lock() to be called. Which can lead to a + * deadlock. To avoid this, we extract the data we need into a local + * list, then we can run this list without holding vfs_list_read_lock() + * We keep the list in the same order as the vfs_list + */ + do { + /* Skip mounts we shouldn't show */ + if (vfsp->vfs_flag & VFS_NOMNTTAB) { + goto nextfs; + } + + printp = kmem_alloc(sizeof (*printp), KM_SLEEP); + refstr_hold(vfsp->vfs_mntpt); + printp->vfs_mntpt = vfsp->vfs_mntpt; + refstr_hold(vfsp->vfs_resource); + printp->vfs_resource = vfsp->vfs_resource; + printp->vfs_flag = vfsp->vfs_flag; + printp->vfs_fstype = vfsp->vfs_fstype; + printp->next = NULL; + + *print_tail = printp; + print_tail = &printp->next; + +nextfs: + vfsp = (zone == global_zone) ? + vfsp->vfs_next : vfsp->vfs_zone_next; + + } while (vfsp != vfslist); + + vfs_list_unlock(); + + /* + * now we can run through what we've extracted without holding + * vfs_list_read_lock() + */ + printp = print_head; + while (printp != NULL) { + struct print_data *printp_next; + const char *resource; + char *mntpt; + struct vnode *vp; + int error; + + mntpt = (char *)refstr_value(printp->vfs_mntpt); + resource = refstr_value(printp->vfs_resource); + + if (mntpt != NULL && mntpt[0] != '\0') + mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); + else + mntpt = "-"; + + error = lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + + if (error != 0) + goto nextp; + + if (!(vp->v_flag & VROOT)) { + VN_RELE(vp); + goto nextp; + } + VN_RELE(vp); + + if (resource != NULL && resource[0] != '\0') { + if (resource[0] == '/') { + resource = ZONE_PATH_VISIBLE(resource, zone) ? + ZONE_PATH_TRANSLATE(resource, zone) : + mntpt; + } + } else { + resource = "-"; + } + + lxpr_uiobuf_printf(uiobuf, + "%s %s %s %s 0 0\n", + resource, mntpt, vfssw[printp->vfs_fstype].vsw_name, + printp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + +nextp: + printp_next = printp->next; + refstr_rele(printp->vfs_mntpt); + refstr_rele(printp->vfs_resource); + kmem_free(printp, sizeof (*printp)); + printp = printp_next; + + } +} + +/* + * lxpr_read_partitions(): + * + * We don't support partitions in a local zone because it requires access to + * physical devices. But we need to fake up enough of the file to show that we + * have no partitions. + */ +/* ARGSUSED */ +static void +lxpr_read_partitions(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, + "major minor #blocks name rio rmerge rsect ruse " + "wio wmerge wsect wuse running use aveq\n\n"); +} + +/* + * lxpr_read_version(): read the contents of the "version" file. Note that + * we don't lie here -- we don't pretend that we're Linux. 
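
/*
 * Illustrative sketch (editorial, userland C with pthreads, made-up names):
 * lxpr_read_mounts() above snapshots what it needs from the vfs list while
 * vfs_list_read_lock() is held, then drops the lock before doing the slow
 * lookupname() work, which could otherwise re-take the same lock and
 * deadlock.  The general shape of that collect-then-process pattern:
 */
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

typedef struct snap {
	char		name[64];
	struct snap	*next;
} snap_t;

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static const char *shared_list[] = { "/", "/tmp", "/proc", NULL };

int
main(void)
{
	snap_t *head = NULL, **tail = &head, *s;
	int i;

	/* Pass 1: copy the fields we need while the lock is held. */
	(void) pthread_mutex_lock(&list_lock);
	for (i = 0; shared_list[i] != NULL; i++) {
		s = calloc(1, sizeof (*s));
		(void) snprintf(s->name, sizeof (s->name), "%s",
		    shared_list[i]);
		*tail = s;
		tail = &s->next;
	}
	(void) pthread_mutex_unlock(&list_lock);

	/* Pass 2: do the slow work without holding the lock. */
	while ((s = head) != NULL) {
		head = s->next;
		printf("%s\n", s->name);	/* stand-in for lookupname() */
		free(s);
	}
	return (0);
}
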
If lxproc is to + * be used in a Linux-branded zone, there will need to be a mount option to + * indicate that Linux should be more fully mimicked. + */ +/* ARGSUSED */ +static void +lxpr_read_version(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, + "%s version %s (%s version %d.%d.%d) " + "#%s SMP %s\n", + utsname.sysname, utsname.release, +#if defined(__GNUC__) + "gcc", + __GNUC__, + __GNUC_MINOR__, + __GNUC_PATCHLEVEL__, +#else + "Sun C", + __SUNPRO_C / 0x100, + (__SUNPRO_C & 0xff) / 0x10, + __SUNPRO_C & 0xf, +#endif + utsname.version, + "00:00:00 00/00/00"); +} + +/* + * lxpr_read_stat(): read the contents of the "stat" file. + * + */ +/* ARGSUSED */ +static void +lxpr_read_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t sys_cum = 0; + ulong_t user_cum = 0; + ulong_t irq_cum = 0; + uint_t cpu_nrunnable_cum = 0; + uint_t w_io_cum = 0; + + ulong_t pgpgin_cum = 0; + ulong_t pgpgout_cum = 0; + ulong_t pgswapout_cum = 0; + ulong_t pgswapin_cum = 0; + ulong_t intr_cum = 0; + ulong_t pswitch_cum = 0; + ulong_t forks_cum = 0; + hrtime_t msnsecs[NCMSTATES]; + + /* temporary variable since scalehrtime modifies data in place */ + hrtime_t tmptime; + + ASSERT(lxpnp->lxpr_type == LXPR_STAT); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + /* Calculate cumulative stats */ + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + int i; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. + */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_cum += NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_cum += NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_cum += NSEC_TO_TICK(msnsecs[CMS_USER]); + + pgpgin_cum += CPU_STATS(cp, vm.pgpgin); + pgpgout_cum += CPU_STATS(cp, vm.pgpgout); + pgswapin_cum += CPU_STATS(cp, vm.pgswapin); + pgswapout_cum += CPU_STATS(cp, vm.pgswapout); + + cpu_nrunnable_cum += cp->cpu_disp->disp_nrunnable; + w_io_cum += CPU_STATS(cp, sys.iowait); + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_cum += NSEC_TO_TICK(tmptime); + } + + for (i = 0; i < PIL_MAX; i++) + intr_cum += CPU_STATS(cp, sys.intr[i]); + + pswitch_cum += CPU_STATS(cp, sys.pswitch); + forks_cum += CPU_STATS(cp, sys.sysfork); + forks_cum += CPU_STATS(cp, sys.sysvfork); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + lxpr_uiobuf_printf(uiobuf, "cpu %ld %ld %ld %ld %ld %ld %ld\n", + user_cum, 0, sys_cum, idle_cum, 0, irq_cum, 0); + + /* Do per processor stats */ + do { + int i; + + ulong_t idle_ticks; + ulong_t sys_ticks; + ulong_t user_ticks; + ulong_t irq_ticks = 0; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. 
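
/*
 * Illustrative sketch (editorial, userland C): the CPU microstate times
 * accumulated above are kept in nanoseconds and converted to clock ticks
 * before being reported, since that is what Linux /proc/stat readers expect.
 * The macro below is a local stand-in for the kernel's NSEC_TO_TICK(),
 * assuming hz = 100.
 */
#include <stdio.h>

#define	NANOSEC			1000000000LL
#define	HZ			100
#define	NSEC_TO_TICK(ns)	((ns) / (NANOSEC / HZ))

int
main(void)
{
	long long idle_nsec = 2500000000LL;	/* 2.5 seconds idle */

	printf("%lld ticks\n", NSEC_TO_TICK(idle_nsec));	/* 250 */
	return (0);
}
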
+ */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_ticks = NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_ticks = NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_ticks = NSEC_TO_TICK(msnsecs[CMS_USER]); + + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_ticks += NSEC_TO_TICK(tmptime); + } + + lxpr_uiobuf_printf(uiobuf, + "cpu%d %ld %ld %ld %ld %ld %ld %ld\n", + cp->cpu_id, user_ticks, 0, sys_ticks, idle_ticks, + 0, irq_ticks, 0); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); + + lxpr_uiobuf_printf(uiobuf, + "page %lu %lu\n" + "swap %lu %lu\n" + "intr %lu\n" + "ctxt %lu\n" + "btime %lu\n" + "processes %lu\n" + "procs_running %lu\n" + "procs_blocked %lu\n", + pgpgin_cum, pgpgout_cum, + pgswapin_cum, pgswapout_cum, + intr_cum, + pswitch_cum, + boot_time, + forks_cum, + cpu_nrunnable_cum, + w_io_cum); +} + +/* + * lxpr_read_uptime(): read the contents of the "uptime" file. + * + * format is: "%.2lf, %.2lf",uptime_secs, idle_secs + * Use fixed point arithmetic to get 2 decimal places + */ +/* ARGSUSED */ +static void +lxpr_read_uptime(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t cpu_count = 0; + ulong_t idle_s; + ulong_t idle_cs; + ulong_t up_s; + ulong_t up_cs; + hrtime_t birthtime; + hrtime_t centi_sec = 10000000; /* 10^7 */ + + ASSERT(lxpnp->lxpr_type == LXPR_UPTIME); + + /* Calculate cumulative stats */ + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU; + do { + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. + */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + idle_cum += CPU_STATS(cp, sys.cpu_ticks_idle); + idle_cum += CPU_STATS(cp, sys.cpu_ticks_wait); + cpu_count += 1; + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + mutex_exit(&cpu_lock); + + /* Getting the Zone zsched process startup time */ + birthtime = LXPTOZ(lxpnp)->zone_zsched->p_mstart; + up_cs = (gethrtime() - birthtime) / centi_sec; + up_s = up_cs / 100; + up_cs %= 100; + + ASSERT(cpu_count > 0); + idle_cum /= cpu_count; + idle_s = idle_cum / hz; + idle_cs = idle_cum % hz; + idle_cs *= 100; + idle_cs /= hz; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d\n", up_s, up_cs, idle_s, idle_cs); +} + +static const char *amd_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "mp", + "nx", NULL, "mmxext", NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", "3dnowext", "3dnow" +}; + +static const char *amd_x_ecx[] = { + "lahf_lm", NULL, "svm", NULL, + "altmovcr8" +}; + +static const char *tm_x_edx[] = { + "recovery", "longrun", NULL, "lrti" +}; + +/* + * Intel calls no-execute "xd" in its docs, but Linux still reports it as "nx." 
+ */ +static const char *intc_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + "nx", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", NULL, NULL +}; + +static const char *intc_edx[] = { + "fpu", "vme", "de", "pse", + "tsc", "msr", "pae", "mce", + "cx8", "apic", NULL, "sep", + "mtrr", "pge", "mca", "cmov", + "pat", "pse36", "pn", "clflush", + NULL, "dts", "acpi", "mmx", + "fxsr", "sse", "sse2", "ss", + "ht", "tm", "ia64", "pbe" +}; + +/* + * "sse3" on linux is called "pni" (Prescott New Instructions). + */ +static const char *intc_ecx[] = { + "pni", NULL, NULL, "monitor", + "ds_cpl", NULL, NULL, "est", + "tm2", NULL, "cid", NULL, + NULL, "cx16", "xtpr" +}; + +static void +lxpr_read_cpuinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + int i; + uint32_t bits; + cpu_t *cp, *cpstart; + int pools_enabled; + const char **fp; + char brandstr[CPU_IDSTRLEN]; + struct cpuid_regs cpr; + int maxeax; + int std_ecx, std_edx, ext_ecx, ext_edx; + + ASSERT(lxpnp->lxpr_type == LXPR_CPUINFO); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU; + do { + /* + * This returns the maximum eax value for standard cpuid + * functions in eax. + */ + cpr.cp_eax = 0; + (void) cpuid_insn(cp, &cpr); + maxeax = cpr.cp_eax; + + /* + * Get standard x86 feature flags. + */ + cpr.cp_eax = 1; + (void) cpuid_insn(cp, &cpr); + std_ecx = cpr.cp_ecx; + std_edx = cpr.cp_edx; + + /* + * Now get extended feature flags. + */ + cpr.cp_eax = 0x80000001; + (void) cpuid_insn(cp, &cpr); + ext_ecx = cpr.cp_ecx; + ext_edx = cpr.cp_edx; + + (void) cpuid_getbrandstr(cp, brandstr, CPU_IDSTRLEN); + + lxpr_uiobuf_printf(uiobuf, + "processor\t: %d\n" + "vendor_id\t: %s\n" + "cpu family\t: %d\n" + "model\t\t: %d\n" + "model name\t: %s\n" + "stepping\t: %d\n" + "cpu MHz\t\t: %u.%03u\n", + cp->cpu_id, cpuid_getvendorstr(cp), cpuid_getfamily(cp), + cpuid_getmodel(cp), brandstr, cpuid_getstep(cp), + (uint32_t)(cpu_freq_hz / 1000000), + ((uint32_t)(cpu_freq_hz / 1000)) % 1000); + + lxpr_uiobuf_printf(uiobuf, "cache size\t: %u KB\n", + getl2cacheinfo(cp, NULL, NULL, NULL) / 1024); + + if (is_x86_feature(x86_featureset, X86FSET_HTT)) { + /* + * 'siblings' is used for HT-style threads + */ + lxpr_uiobuf_printf(uiobuf, + "physical id\t: %lu\n" + "siblings\t: %u\n", + pg_plat_hw_instance_id(cp, PGHW_CHIP), + cpuid_get_ncpu_per_chip(cp)); + } + + /* + * Since we're relatively picky about running on older hardware, + * we can be somewhat cavalier about the answers to these ones. + * + * In fact, given the hardware we support, we just say: + * + * fdiv_bug : no (if we're on a 64-bit kernel) + * hlt_bug : no + * f00f_bug : no + * coma_bug : no + * wp : yes (write protect in supervsr mode) + */ + lxpr_uiobuf_printf(uiobuf, + "fdiv_bug\t: %s\n" + "hlt_bug \t: no\n" + "f00f_bug\t: no\n" + "coma_bug\t: no\n" + "fpu\t\t: %s\n" + "fpu_exception\t: %s\n" + "cpuid level\t: %d\n" + "flags\t\t:", +#if defined(__i386) + fpu_pentium_fdivbug ? "yes" : "no", +#else + "no", +#endif /* __i386 */ + fpu_exists ? "yes" : "no", fpu_exists ? 
"yes" : "no", + maxeax); + + for (bits = std_edx, fp = intc_edx, i = 0; + i < sizeof (intc_edx) / sizeof (intc_edx[0]); fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + /* + * name additional features where appropriate + */ + switch (x86_vendor) { + case X86_VENDOR_Intel: + for (bits = ext_edx, fp = intc_x_edx, i = 0; + i < sizeof (intc_x_edx) / sizeof (intc_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + + case X86_VENDOR_AMD: + for (bits = ext_edx, fp = amd_x_edx, i = 0; + i < sizeof (amd_x_edx) / sizeof (amd_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + for (bits = ext_ecx, fp = amd_x_ecx, i = 0; + i < sizeof (amd_x_ecx) / sizeof (amd_x_ecx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + + case X86_VENDOR_TM: + for (bits = ext_edx, fp = tm_x_edx, i = 0; + i < sizeof (tm_x_edx) / sizeof (tm_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + default: + break; + } + + for (bits = std_ecx, fp = intc_ecx, i = 0; + i < sizeof (intc_ecx) / sizeof (intc_ecx[0]); fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + lxpr_uiobuf_printf(uiobuf, "\n\n"); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); +} + +/* ARGSUSED */ +static void +lxpr_read_fd(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_PID_FD_FD); + lxpr_uiobuf_seterr(uiobuf, EFAULT); +} + +/* + * lxpr_getattr(): Vnode operation for VOP_GETATTR() + */ +static int +lxpr_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + register lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + extern uint_t nproc; + int error; + + /* + * Return attributes of underlying vnode if ATTR_REAL + * + * but keep fd files with the symlink permissions + */ + if (lxpnp->lxpr_realvp != NULL && (flags & ATTR_REAL)) { + vnode_t *rvp = lxpnp->lxpr_realvp; + + /* + * withold attribute information to owner or root + */ + if ((error = VOP_ACCESS(rvp, 0, 0, cr, ct)) != 0) { + return (error); + } + + /* + * now its attributes + */ + if ((error = VOP_GETATTR(rvp, vap, flags, cr, ct)) != 0) { + return (error); + } + + /* + * if it's a file in lx /proc/pid/fd/xx then set its + * mode and keep it looking like a symlink + */ + if (type == LXPR_PID_FD_FD) { + vap->va_mode = lxpnp->lxpr_mode; + vap->va_type = vp->v_type; + vap->va_size = 0; + vap->va_nlink = 1; + } + return (0); + } + + /* Default attributes, that may be overridden below */ + bzero(vap, sizeof (*vap)); + vap->va_atime = vap->va_mtime = vap->va_ctime = lxpnp->lxpr_time; + vap->va_nlink = 1; + vap->va_type = vp->v_type; + vap->va_mode = lxpnp->lxpr_mode; + vap->va_fsid = vp->v_vfsp->vfs_dev; + vap->va_blksize = DEV_BSIZE; + vap->va_uid = lxpnp->lxpr_uid; + vap->va_gid = lxpnp->lxpr_gid; + vap->va_nodeid = lxpnp->lxpr_ino; + + switch (type) { + case LXPR_PROCDIR: + vap->va_nlink = nproc + 2 + PROCDIRFILES; + vap->va_size = (nproc + 2 + PROCDIRFILES) * LXPR_SDSIZE; + break; + case LXPR_PIDDIR: + vap->va_nlink = PIDDIRFILES; + vap->va_size = PIDDIRFILES * LXPR_SDSIZE; + break; + case LXPR_SELF: + vap->va_uid = crgetruid(curproc->p_cred); + vap->va_gid = crgetrgid(curproc->p_cred); + break; + default: + break; + } + 
+ vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size); + return (0); +} + +/* + * lxpr_access(): Vnode operation for VOP_ACCESS() + */ +static int +lxpr_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + int shift = 0; + proc_t *tp; + + /* lx /proc is a read only file system */ + if (mode & VWRITE) + return (EROFS); + + /* + * If this is a restricted file, check access permissions. + */ + switch (lxpnp->lxpr_type) { + case LXPR_PIDDIR: + return (0); + case LXPR_PID_CURDIR: + case LXPR_PID_ENV: + case LXPR_PID_EXE: + case LXPR_PID_MAPS: + case LXPR_PID_MEM: + case LXPR_PID_ROOTDIR: + case LXPR_PID_FDDIR: + case LXPR_PID_FD_FD: + if ((tp = lxpr_lock(lxpnp->lxpr_pid)) == NULL) + return (ENOENT); + if (tp != curproc && secpolicy_proc_access(cr) != 0 && + priv_proc_cred_perm(cr, tp, NULL, mode) != 0) { + lxpr_unlock(tp); + return (EACCES); + } + lxpr_unlock(tp); + default: + break; + } + + if (lxpnp->lxpr_realvp != NULL) { + /* + * For these we use the underlying vnode's accessibility. + */ + return (VOP_ACCESS(lxpnp->lxpr_realvp, mode, flags, cr, ct)); + } + + /* If user is root allow access regardless of permission bits */ + if (secpolicy_proc_access(cr) == 0) + return (0); + + /* + * Access check is based on only one of owner, group, public. If not + * owner, then check group. If not a member of the group, then check + * public access. + */ + if (crgetuid(cr) != lxpnp->lxpr_uid) { + shift += 3; + if (!groupmember((uid_t)lxpnp->lxpr_gid, cr)) + shift += 3; + } + + mode &= ~(lxpnp->lxpr_mode << shift); + + if (mode == 0) + return (0); + + return (EACCES); +} + +/* ARGSUSED */ +static vnode_t * +lxpr_lookup_not_a_dir(vnode_t *dp, char *comp) +{ + return (NULL); +} + +/* + * lxpr_lookup(): Vnode operation for VOP_LOOKUP() + */ +/* ARGSUSED */ +static int +lxpr_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type >= 0 && type < LXPR_NFILES); + + /* + * we should never get here because the lookup + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict lookup permission to owner or root + */ + if ((error = lxpr_access(dp, VEXEC, 0, cr, ct)) != 0) { + return (error); + } + + /* + * Just return the parent vnode if that's where we are trying to go. + */ + if (strcmp(comp, "..") == 0) { + VN_HOLD(lxpnp->lxpr_parent); + *vpp = lxpnp->lxpr_parent; + return (0); + } + + /* + * Special handling for directory searches. Note: null component name + * denotes that the current directory is being searched. + */ + if ((dp->v_type == VDIR) && (*comp == '\0' || strcmp(comp, ".") == 0)) { + VN_HOLD(dp); + *vpp = dp; + return (0); + } + + *vpp = (lxpr_lookup_function[type](dp, comp)); + return ((*vpp == NULL) ? 
ENOENT : 0); +} + +/* + * Do a sequential search on the given directory table + */ +static vnode_t * +lxpr_lookup_common(vnode_t *dp, char *comp, proc_t *p, + lxpr_dirent_t *dirtab, int dirtablen) +{ + lxpr_node_t *lxpnp; + int count; + + for (count = 0; count < dirtablen; count++) { + if (strcmp(dirtab[count].d_name, comp) == 0) { + lxpnp = lxpr_getnode(dp, dirtab[count].d_type, p, 0); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + return (dp); + } + } + return (NULL); +} + +static vnode_t * +lxpr_lookup_piddir(vnode_t *dp, char *comp) +{ + proc_t *p; + + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PIDDIR); + + p = lxpr_lock(VTOLXP(dp)->lxpr_pid); + if (p == NULL) + return (NULL); + + dp = lxpr_lookup_common(dp, comp, p, piddir, PIDDIRFILES); + + lxpr_unlock(p); + + return (dp); +} + +/* + * Lookup one of the process's open files. + */ +static vnode_t * +lxpr_lookup_fddir(vnode_t *dp, char *comp) +{ + lxpr_node_t *dlxpnp = VTOLXP(dp); + lxpr_node_t *lxpnp; + vnode_t *vp = NULL; + proc_t *p; + file_t *fp; + uint_t fd; + int c; + uf_entry_t *ufp; + uf_info_t *fip; + + ASSERT(dlxpnp->lxpr_type == LXPR_PID_FDDIR); + + /* + * convert the string rendition of the filename + * to a file descriptor + */ + fd = 0; + while ((c = *comp++) != '\0') { + int ofd; + if (c < '0' || c > '9') + return (NULL); + + ofd = fd; + fd = 10*fd + c - '0'; + /* integer overflow */ + if (fd / 10 != ofd) + return (NULL); + } + + /* + * get the proc to work with and lock it + */ + p = lxpr_lock(dlxpnp->lxpr_pid); + if ((p == NULL)) + return (NULL); + + /* + * If the process is a zombie or system process + * it can't have any open files. + */ + if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas)) { + lxpr_unlock(p); + return (NULL); + } + + /* + * get us a fresh node/vnode + */ + lxpnp = lxpr_getnode(dp, LXPR_PID_FD_FD, p, fd); + + /* + * get open file info + */ + fip = (&(p)->p_user.u_finfo); + mutex_enter(&fip->fi_lock); + + /* + * got the fd data so now done with this proc + */ + lxpr_unlock(p); + + if (fd < fip->fi_nfiles) { + UF_ENTER(ufp, fip, fd); + /* + * ensure the fd is still kosher. + * it may have gone between the readdir and + * the lookup + */ + if (fip->fi_list[fd].uf_file == NULL) { + mutex_exit(&fip->fi_lock); + UF_EXIT(ufp); + lxpr_freenode(lxpnp); + return (NULL); + } + + if ((fp = ufp->uf_file) != NULL) + vp = fp->f_vnode; + UF_EXIT(ufp); + } + mutex_exit(&fip->fi_lock); + + if (vp == NULL) { + lxpr_freenode(lxpnp); + return (NULL); + } else { + /* + * Fill in the lxpr_node so future references will be able to + * find the underlying vnode. The vnode is held on the realvp. + */ + lxpnp->lxpr_realvp = vp; + VN_HOLD(lxpnp->lxpr_realvp); + } + + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + + return (dp); +} + +static vnode_t * +lxpr_lookup_netdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_NETDIR); + + dp = lxpr_lookup_common(dp, comp, NULL, netdir, NETDIRFILES); + + return (dp); +} + +static vnode_t * +lxpr_lookup_procdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PROCDIR); + + /* + * We know all the names of files & dirs in our file system structure + * except those that are pid names. These change as pids are created/ + * deleted etc., so we just look for a number as the first char to see + * if we are we doing pid lookups. 
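
/*
 * Illustrative sketch (editorial, userland C): lxpr_lookup_fddir() converts
 * the name component back into a file descriptor by hand, rejecting
 * non-digits and detecting overflow by checking that each new value still
 * divides back to the previous one.  The same check in isolation:
 */
#include <stdio.h>

static int
parse_fd(const char *comp, unsigned int *fdp)
{
	unsigned int fd = 0, ofd;
	int c;

	while ((c = *comp++) != '\0') {
		if (c < '0' || c > '9')
			return (-1);		/* not a pure number */
		ofd = fd;
		fd = 10 * fd + (c - '0');
		if (fd / 10 != ofd)
			return (-1);		/* overflowed */
	}
	*fdp = fd;
	return (0);
}

int
main(void)
{
	unsigned int fd;

	if (parse_fd("42", &fd) == 0)
		printf("fd %u\n", fd);
	if (parse_fd("99999999999999999999", &fd) != 0)
		printf("overflow rejected\n");
	return (0);
}
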
+ * + * Don't need to check for "self" as it is implemented as a symlink + */ + if (*comp >= '0' && *comp <= '9') { + pid_t pid = 0; + lxpr_node_t *lxpnp = NULL; + proc_t *p; + int c; + + while ((c = *comp++) != '\0') + pid = 10 * pid + c - '0'; + + /* + * Can't continue if the process is still loading or it doesn't + * really exist yet (or maybe it just died!) + */ + p = lxpr_lock(pid); + if (p == NULL) + return (NULL); + + if (secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + lxpr_unlock(p); + return (NULL); + } + + /* + * allocate and fill in a new lxpr node + */ + lxpnp = lxpr_getnode(dp, LXPR_PIDDIR, p, 0); + + lxpr_unlock(p); + + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + + return (dp); + } + + /* Lookup fixed names */ + return (lxpr_lookup_common(dp, comp, NULL, lxpr_dir, PROCDIRFILES)); +} + +/* + * lxpr_readdir(): Vnode operation for VOP_READDIR() + */ +/* ARGSUSED */ +static int +lxpr_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + ssize_t uresid; + off_t uoffset; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type >= 0 && type < LXPR_NFILES); + + /* + * we should never get here because the readdir + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict readdir permission to owner or root + */ + if ((error = lxpr_access(dp, VREAD, 0, cr, ct)) != 0) + return (error); + + uoffset = uiop->uio_offset; + uresid = uiop->uio_resid; + + /* can't do negative reads */ + if (uoffset < 0 || uresid <= 0) + return (EINVAL); + + /* can't read directory entries that don't exist! */ + if (uoffset % LXPR_SDSIZE) + return (ENOENT); + + return (lxpr_readdir_function[lxpnp->lxpr_type](lxpnp, uiop, eofp)); +} + +/* ARGSUSED */ +static int +lxpr_readdir_not_a_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + return (ENOTDIR); +} + +/* + * This has the common logic for returning directory entries + */ +static int +lxpr_readdir_common(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp, + lxpr_dirent_t *dirtab, int dirtablen) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + + oresid = uiop->uio_resid; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Satisfy user request + */ + while ((uresid = uiop->uio_resid) > 0) { + int dirindex; + off_t uoffset; + int reclen; + int error; + + uoffset = uiop->uio_offset; + dirindex = (uoffset / LXPR_SDSIZE) - 2; + + if (uoffset == 0) { + + dirent->d_ino = lxpnp->lxpr_ino; + dirent->d_name[0] = '.'; + dirent->d_name[1] = '\0'; + reclen = DIRENT64_RECLEN(1); + + } else if (uoffset == LXPR_SDSIZE) { + + dirent->d_ino = lxpr_parentinode(lxpnp); + dirent->d_name[0] = '.'; + dirent->d_name[1] = '.'; + dirent->d_name[2] = '\0'; + reclen = DIRENT64_RECLEN(2); + + } else if (dirindex < dirtablen) { + int slen = strlen(dirtab[dirindex].d_name); + + dirent->d_ino = lxpr_inode(dirtab[dirindex].d_type, + lxpnp->lxpr_pid, 0); + + ASSERT(slen < LXPNSIZ); + (void) strcpy(dirent->d_name, dirtab[dirindex].d_name); + reclen = DIRENT64_RECLEN(slen); + + } else { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the size 
of the data to transfer is greater + * that that requested then we can't do it this transfer. + */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) { + return (EINVAL); + } + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, ignoring what uiomove() does. + */ + if (error = uiomove((caddr_t)dirent, reclen, UIO_READ, uiop)) { + return (error); + } + + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + /* Have run out of space, but could have just done last table entry */ + if (eofp) { + *eofp = + (uiop->uio_offset >= ((dirtablen+2) * LXPR_SDSIZE)) ? 1 : 0; + } + return (0); +} + + +static int +lxpr_readdir_procdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + zoneid_t zoneid; + pid_t pid; + int error; + int ceof; + + ASSERT(lxpnp->lxpr_type == LXPR_PROCDIR); + + oresid = uiop->uio_resid; + zoneid = LXPTOZ(lxpnp)->zone_id; + + /* + * We return directory entries in the order: "." and ".." then the + * unique lxproc files, then the directories corresponding to the + * running processes. We have defined this as the ordering because + * it allows us to more easily keep track of where we are betwen calls + * to getdents(). If the number of processes changes between calls + * then we can't lose track of where we are in the lxproc files. + */ + + /* Do the fixed entries */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, lxpr_dir, + PROCDIRFILES); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + return (error); + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* Do the process entries */ + while ((uresid = uiop->uio_resid) > 0) { + proc_t *p; + int len; + int reclen; + int i; + + uoffset = uiop->uio_offset; + + /* + * Stop when entire proc table has been examined. + */ + i = (uoffset / LXPR_SDSIZE) - 2 - PROCDIRFILES; + if (i >= v.v_proc) { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + mutex_enter(&pidlock); + + /* + * Skip indices for which there is no pid_entry, PIDs for + * which there is no corresponding process, a PID of 0, + * and anything the security policy doesn't allow + * us to look at. + */ + if ((p = pid_entry(i)) == NULL || p->p_stat == SIDL || + p->p_pid == 0 || + secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + mutex_exit(&pidlock); + goto next; + } + mutex_exit(&pidlock); + + /* + * Convert pid to the Linux default of 1 if we're the zone's + * init process, otherwise use the value from the proc + * structure + */ + pid = ((p->p_pid != curproc->p_zone->zone_proc_initpid) ? + p->p_pid : 1); + + /* + * If this /proc was mounted in the global zone, view + * all procs; otherwise, only view zone member procs. 
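
/*
 * Illustrative sketch (editorial, userland C): directory offsets in lxproc
 * advance in fixed LXPR_SDSIZE (16-byte) steps regardless of how large the
 * returned dirent actually is -- offset 0 is ".", 16 is "..", and
 * (i + 2) * 16 is entry i of the directory table.  The offset-to-entry
 * mapping on its own, with a made-up table:
 */
#include <stdio.h>

#define	SDSIZE		16

static const char *dirtab[] = { "cmdline", "cpuinfo", "meminfo" };
#define	DIRTABLEN	(sizeof (dirtab) / sizeof (dirtab[0]))

static const char *
entry_at(long uoffset)
{
	long dirindex = (uoffset / SDSIZE) - 2;

	if (uoffset % SDSIZE != 0)
		return (NULL);			/* ENOENT in the driver */
	if (uoffset == 0)
		return (".");
	if (uoffset == SDSIZE)
		return ("..");
	if (dirindex < (long)DIRTABLEN)
		return (dirtab[dirindex]);
	return (NULL);				/* past the last entry */
}

int
main(void)
{
	long off;

	for (off = 0; off <= (long)(DIRTABLEN + 2) * SDSIZE; off += SDSIZE)
		printf("%3ld -> %s\n", off,
		    entry_at(off) != NULL ? entry_at(off) : "(eof)");
	return (0);
}
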
+ */ + if (zoneid != GLOBAL_ZONEID && p->p_zone->zone_id != zoneid) { + goto next; + } + + ASSERT(p->p_stat != 0); + + dirent->d_ino = lxpr_inode(LXPR_PIDDIR, pid, 0); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", pid); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the size of the data to transfer is greater + * that that requested then we can't do it this transfer. + */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + return (EINVAL); + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, in the increment of this for + * the loop, ignoring what uiomove() does. + */ + if (error = uiomove((caddr_t)dirent, reclen, UIO_READ, uiop)) + return (error); +next: + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + if (eofp != NULL) { + *eofp = (uiop->uio_offset >= + ((v.v_proc + PROCDIRFILES + 2) * LXPR_SDSIZE)) ? 1 : 0; + } + + return (0); +} + +static int +lxpr_readdir_piddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + proc_t *p; + + ASSERT(lxpnp->lxpr_type == LXPR_PIDDIR); + + /* can't read its contents if it died */ + mutex_enter(&pidlock); + + p = prfind((lxpnp->lxpr_pid == 1) ? + curproc->p_zone->zone_proc_initpid : lxpnp->lxpr_pid); + + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (ENOENT); + } + mutex_exit(&pidlock); + + return (lxpr_readdir_common(lxpnp, uiop, eofp, piddir, PIDDIRFILES)); +} + +static int +lxpr_readdir_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_NETDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, netdir, NETDIRFILES)); +} + +static int +lxpr_readdir_fddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + int error; + int ceof; + proc_t *p; + int fddirsize; + uf_info_t *fip; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_FDDIR); + + oresid = uiop->uio_resid; + + /* can't read its contents if it died */ + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) + return (ENOENT); + + /* Get open file info */ + fip = (&(p)->p_user.u_finfo); + + if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas)) { + fddirsize = 0; + } else { + fddirsize = fip->fi_nfiles; + } + + mutex_enter(&fip->fi_lock); + lxpr_unlock(p); + + /* Do the fixed entries (in this case just "." & "..") */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, 0, 0); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + return (error); + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Loop until user's request is satisfied or until + * all file descriptors have been examined. 
+ */ + for (; (uresid = uiop->uio_resid) > 0; + uiop->uio_offset = uoffset + LXPR_SDSIZE) { + int reclen; + int fd; + int len; + + uoffset = uiop->uio_offset; + + /* + * Stop at the end of the fd list + */ + fd = (uoffset / LXPR_SDSIZE) - 2; + if (fd >= fddirsize) { + if (eofp) { + *eofp = 1; + } + goto out; + } + + if (fip->fi_list[fd].uf_file == NULL) + continue; + + dirent->d_ino = lxpr_inode(LXPR_PID_FD_FD, lxpnp->lxpr_pid, fd); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", fd); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + error = EINVAL; + goto out; + } + + if (error = uiomove((caddr_t)dirent, reclen, UIO_READ, uiop)) + goto out; + } + + if (eofp != NULL) { + *eofp = + (uiop->uio_offset >= ((fddirsize+2) * LXPR_SDSIZE)) ? 1 : 0; + } + +out: + mutex_exit(&fip->fi_lock); + return (error); +} + + +/* + * lxpr_readlink(): Vnode operation for VOP_READLINK() + */ +/* ARGSUSED */ +static int +lxpr_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct) +{ + char bp[MAXPATHLEN + 1]; + size_t buflen = sizeof (bp); + lxpr_node_t *lxpnp = VTOLXP(vp); + vnode_t *rvp = lxpnp->lxpr_realvp; + pid_t pid; + int error = 0; + + /* must be a symbolic link file */ + if (vp->v_type != VLNK) + return (EINVAL); + + /* Try to produce a symlink name for anything that has a realvp */ + if (rvp != NULL) { + if ((error = lxpr_access(vp, VREAD, 0, CRED(), ct)) != 0) + return (error); + if ((error = vnodetopath(NULL, rvp, bp, buflen, CRED())) != 0) + return (error); + } else { + switch (lxpnp->lxpr_type) { + case LXPR_SELF: + /* + * Convert pid to the Linux default of 1 if we're the + * zone's init process + */ + pid = ((curproc->p_pid != + curproc->p_zone->zone_proc_initpid) + ? curproc->p_pid : 1); + + /* + * Don't need to check result as every possible int + * will fit within MAXPATHLEN bytes. + */ + (void) snprintf(bp, buflen, "%d", pid); + break; + case LXPR_PID_CURDIR: + case LXPR_PID_ROOTDIR: + case LXPR_PID_EXE: + return (EACCES); + default: + /* + * Need to return error so that nothing thinks + * that the symlink is empty and hence "." + */ + return (EINVAL); + } + } + + /* copy the link data to user space */ + return (uiomove(bp, strlen(bp), UIO_READ, uiop)); +} + +/* + * lxpr_inactive(): Vnode operation for VOP_INACTIVE() + * Vnode is no longer referenced, deallocate the file + * and all its resources. 
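
/*
 * Illustrative sketch (editorial, userland C): VOP_READLINK() above is what a
 * plain readlink(2) of an lxproc symlink ends up calling; for /proc/self the
 * returned target is just the reader's pid rendered as a decimal string (or
 * "1" for the zone's init process).  Assuming lxproc is mounted at /proc
 * inside the zone:
 */
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char target[64];
	ssize_t n;

	n = readlink("/proc/self", target, sizeof (target) - 1);
	if (n < 0) {
		perror("readlink");
		return (1);
	}
	target[n] = '\0';
	printf("/proc/self -> %s (my pid is %ld)\n", target, (long)getpid());
	return (0);
}
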
+ */ +/* ARGSUSED */ +static void +lxpr_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + lxpr_freenode(VTOLXP(vp)); +} + +/* + * lxpr_sync(): Vnode operation for VOP_SYNC() + */ +static int +lxpr_sync() +{ + /* + * Nothing to sync but this function must never fail + */ + return (0); +} + +/* + * lxpr_cmp(): Vnode operation for VOP_CMP() + */ +static int +lxpr_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) +{ + vnode_t *rvp; + + while (vn_matchops(vp1, lxpr_vnodeops) && + (rvp = VTOLXP(vp1)->lxpr_realvp) != NULL) { + vp1 = rvp; + } + + while (vn_matchops(vp2, lxpr_vnodeops) && + (rvp = VTOLXP(vp2)->lxpr_realvp) != NULL) { + vp2 = rvp; + } + + if (vn_matchops(vp1, lxpr_vnodeops) || vn_matchops(vp2, lxpr_vnodeops)) + return (vp1 == vp2); + + return (VOP_CMP(vp1, vp2, ct)); +} + +/* + * lxpr_realvp(): Vnode operation for VOP_REALVP() + */ +static int +lxpr_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) +{ + vnode_t *rvp; + + if ((rvp = VTOLXP(vp)->lxpr_realvp) != NULL) { + vp = rvp; + if (VOP_REALVP(vp, &rvp, ct) == 0) + vp = rvp; + } + + *vpp = vp; + return (0); +} diff --git a/usr/src/uts/common/fs/lxproc/lxproc.h b/usr/src/uts/common/fs/lxproc/lxproc.h new file mode 100644 index 0000000000..a06bef1570 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxproc.h @@ -0,0 +1,275 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _LXPROC_H +#define _LXPROC_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lxproc.h: declarations, data structures and macros for lxprocfs + */ +#include <sys/types.h> +#include <sys/param.h> +#include <sys/policy.h> +#include <sys/debug.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/user.h> +#include <sys/t_lock.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/priv.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/statvfs.h> +#include <sys/cmn_err.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/utsname.h> +#include <sys/dnlc.h> +#include <sys/atomic.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <vm/as.h> +#include <vm/anon.h> + +#define LX_SIGHUP 1 +#define LX_SIGINT 2 +#define LX_SIGQUIT 3 +#define LX_SIGILL 4 +#define LX_SIGTRAP 5 +#define LX_SIGABRT 6 +#define LX_SIGIOT 6 +#define LX_SIGBUS 7 +#define LX_SIGFPE 8 +#define LX_SIGKILL 9 +#define LX_SIGUSR1 10 +#define LX_SIGSEGV 11 +#define LX_SIGUSR2 12 +#define LX_SIGPIPE 13 +#define LX_SIGALRM 14 +#define LX_SIGTERM 15 +#define LX_SIGSTKFLT 16 +#define LX_SIGCHLD 17 +#define LX_SIGCONT 18 +#define LX_SIGSTOP 19 +#define LX_SIGTSTP 20 +#define LX_SIGTTIN 21 +#define LX_SIGTTOU 22 +#define LX_SIGURG 23 +#define LX_SIGXCPU 24 +#define LX_SIGXFSZ 25 +#define LX_SIGVTALRM 26 +#define LX_SIGPROF 27 +#define LX_SIGWINCH 28 +#define LX_SIGIO 29 +#define LX_SIGPOLL LX_SIGIO +#define LX_SIGPWR 30 +#define LX_SIGSYS 31 +#define LX_SIGUNUSED 31 + +#define LX_NSIG_WORDS 2 +#define LX_NBPW 32 +#define LX_NSIG ((LX_NBPW * LX_NSIG_WORDS) + 1) + +#define LX_SIGRTMIN 32 +#define LX_SIGRTMAX LX_NSIG - 1 + +/* + * Convert a vnode into an lxpr_mnt_t + */ +#define VTOLXPM(vp) ((lxpr_mnt_t *)(vp)->v_vfsp->vfs_data) + +/* + * convert a vnode into an lxpr_node + */ +#define VTOLXP(vp) ((lxpr_node_t *)(vp)->v_data) + +/* + * convert a lxprnode into a vnode + */ +#define LXPTOV(lxpnp) ((lxpnp)->lxpr_vnode) + +/* + * convert a lxpr_node into zone for fs + */ +#define LXPTOZ(lxpnp) \ + (((lxpr_mnt_t *)(lxpnp)->lxpr_vnode->v_vfsp->vfs_data)->lxprm_zone) + +#define LXPNSIZ 256 /* max size of lx /proc file name entries */ + +/* + * Pretend that a directory entry takes 16 bytes + */ +#define LXPR_SDSIZE 16 + +/* + * Node/file types for lx /proc files + * (directories and files contained therein). 
+ */ +typedef enum lxpr_nodetype { + LXPR_PROCDIR, /* /proc */ + LXPR_PIDDIR, /* /proc/<pid> */ + LXPR_PID_CMDLINE, /* /proc/<pid>/cmdline */ + LXPR_PID_CPU, /* /proc/<pid>/cpu */ + LXPR_PID_CURDIR, /* /proc/<pid>/cwd */ + LXPR_PID_ENV, /* /proc/<pid>/environ */ + LXPR_PID_EXE, /* /proc/<pid>/exe */ + LXPR_PID_MAPS, /* /proc/<pid>/maps */ + LXPR_PID_MEM, /* /proc/<pid>/mem */ + LXPR_PID_ROOTDIR, /* /proc/<pid>/root */ + LXPR_PID_STAT, /* /proc/<pid>/stat */ + LXPR_PID_STATM, /* /proc/<pid>/statm */ + LXPR_PID_STATUS, /* /proc/<pid>/status */ + LXPR_PID_FDDIR, /* /proc/<pid>/fd */ + LXPR_PID_FD_FD, /* /proc/<pid>/fd/nn */ + LXPR_CMDLINE, /* /proc/cmdline */ + LXPR_CPUINFO, /* /proc/cpuinfo */ + LXPR_DEVICES, /* /proc/devices */ + LXPR_DMA, /* /proc/dma */ + LXPR_FILESYSTEMS, /* /proc/filesystems */ + LXPR_INTERRUPTS, /* /proc/interrupts */ + LXPR_IOPORTS, /* /proc/ioports */ + LXPR_KCORE, /* /proc/kcore */ + LXPR_KMSG, /* /proc/kmsg */ + LXPR_LOADAVG, /* /proc/loadavg */ + LXPR_MEMINFO, /* /proc/meminfo */ + LXPR_MOUNTS, /* /proc/mounts */ + LXPR_NETDIR, /* /proc/net */ + LXPR_NET_ARP, /* /proc/net/arp */ + LXPR_NET_DEV, /* /proc/net/dev */ + LXPR_NET_DEV_MCAST, /* /proc/net/dev_mcast */ + LXPR_NET_IGMP, /* /proc/net/igmp */ + LXPR_NET_IP_MR_CACHE, /* /proc/net/ip_mr_cache */ + LXPR_NET_IP_MR_VIF, /* /proc/net/ip_mr_vif */ + LXPR_NET_MCFILTER, /* /proc/net/mcfilter */ + LXPR_NET_NETSTAT, /* /proc/net/netstat */ + LXPR_NET_RAW, /* /proc/net/raw */ + LXPR_NET_ROUTE, /* /proc/net/route */ + LXPR_NET_RPC, /* /proc/net/rpc */ + LXPR_NET_RT_CACHE, /* /proc/net/rt_cache */ + LXPR_NET_SOCKSTAT, /* /proc/net/sockstat */ + LXPR_NET_SNMP, /* /proc/net/snmp */ + LXPR_NET_STAT, /* /proc/net/stat */ + LXPR_NET_TCP, /* /proc/net/tcp */ + LXPR_NET_UDP, /* /proc/net/udp */ + LXPR_NET_UNIX, /* /proc/net/unix */ + LXPR_PARTITIONS, /* /proc/partitions */ + LXPR_SELF, /* /proc/self */ + LXPR_STAT, /* /proc/stat */ + LXPR_UPTIME, /* /proc/uptime */ + LXPR_VERSION, /* /proc/version */ + LXPR_NFILES /* number of lx /proc file types */ +} lxpr_nodetype_t; + +/* + * Number of fds allowed for in the inode number calculation + * per process (if a process has more fds then inode numbers + * may be duplicated) + */ +#define LXPR_FD_PERPROC 2000 + +/* + * external dirent characteristics + */ +#define LXPRMAXNAMELEN 14 +typedef struct { + lxpr_nodetype_t d_type; + char d_name[LXPRMAXNAMELEN]; +} lxpr_dirent_t; + +/* + * This is the lxprocfs private data object + * which is attached to v_data in the vnode structure + */ +typedef struct lxpr_node { + lxpr_nodetype_t lxpr_type; /* type of this node */ + vnode_t *lxpr_vnode; /* vnode for the node */ + vnode_t *lxpr_parent; /* parent directory */ + vnode_t *lxpr_realvp; /* real vnode, file in dirs */ + timestruc_t lxpr_time; /* creation etc time for file */ + mode_t lxpr_mode; /* file mode bits */ + uid_t lxpr_uid; /* file owner */ + gid_t lxpr_gid; /* file group owner */ + pid_t lxpr_pid; /* pid of proc referred to */ + ino_t lxpr_ino; /* node id */ + ldi_handle_t lxpr_cons_ldih; /* ldi handle for console device */ +} lxpr_node_t; + +struct zone; /* forward declaration */ + +/* + * This is the lxprocfs private data object + * which is attached to vfs_data in the vfs structure + */ +typedef struct lxpr_mnt { + lxpr_node_t *lxprm_node; /* node at root of proc mount */ + struct zone *lxprm_zone; /* zone for this mount */ + ldi_ident_t lxprm_li; /* ident for ldi */ +} lxpr_mnt_t; + +extern vnodeops_t *lxpr_vnodeops; +extern int nproc_highbit; /* highbit(v.v_nproc) */ + +typedef 
struct mounta mounta_t; + +extern void lxpr_initnodecache(); +extern void lxpr_fininodecache(); +extern void lxpr_initrootnode(lxpr_node_t **, vfs_t *); +extern ino_t lxpr_inode(lxpr_nodetype_t, pid_t, int); +extern ino_t lxpr_parentinode(lxpr_node_t *); +extern lxpr_node_t *lxpr_getnode(vnode_t *, lxpr_nodetype_t, proc_t *, int); +extern void lxpr_freenode(lxpr_node_t *); + +typedef struct lxpr_uiobuf lxpr_uiobuf_t; +extern lxpr_uiobuf_t *lxpr_uiobuf_new(uio_t *); +extern void lxpr_uiobuf_free(lxpr_uiobuf_t *); +extern int lxpr_uiobuf_flush(lxpr_uiobuf_t *); +extern void lxpr_uiobuf_seek(lxpr_uiobuf_t *, offset_t); +extern void lxpr_uiobuf_write(lxpr_uiobuf_t *, const char *, size_t); +extern void lxpr_uiobuf_printf(lxpr_uiobuf_t *, const char *, ...); +extern void lxpr_uiobuf_seterr(lxpr_uiobuf_t *, int); + +proc_t *lxpr_lock(pid_t); +void lxpr_unlock(proc_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LXPROC_H */ diff --git a/usr/src/uts/common/fs/nfs/nfs_server.c b/usr/src/uts/common/fs/nfs/nfs_server.c index ad2fed01dc..8473788d8a 100644 --- a/usr/src/uts/common/fs/nfs/nfs_server.c +++ b/usr/src/uts/common/fs/nfs/nfs_server.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Bayard G. Bell. All rights reserved. + * Copyright (c) 2012 Joyent, Inc. All rights reserved. */ /* @@ -2520,6 +2521,9 @@ nfs_srvinit(void) { int error; + if (getzoneid() != GLOBAL_ZONEID) + return (EACCES); + error = nfs_exportinit(); if (error != 0) return (error); diff --git a/usr/src/uts/common/fs/proc/prcontrol.c b/usr/src/uts/common/fs/proc/prcontrol.c index 55a48bb2cc..53709139cc 100644 --- a/usr/src/uts/common/fs/proc/prcontrol.c +++ b/usr/src/uts/common/fs/proc/prcontrol.c @@ -24,6 +24,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + #include <sys/types.h> #include <sys/uio.h> #include <sys/param.h> @@ -935,7 +939,7 @@ pr_control32(int32_t cmd, arg32_t *argp, prnode_t *pnp, cred_t *cr) case PCREAD: /* read from the address space */ case PCWRITE: /* write to the address space */ - if (PROCESS_NOT_32BIT(p)) + if (PROCESS_NOT_32BIT(p) || (pnp->pr_flags & PR_OFFMAX)) error = EOVERFLOW; else { enum uio_rw rw = (cmd == PCREAD)? UIO_READ : UIO_WRITE; diff --git a/usr/src/uts/common/fs/proc/prdata.h b/usr/src/uts/common/fs/proc/prdata.h index 1294421f9f..ce925778f2 100644 --- a/usr/src/uts/common/fs/proc/prdata.h +++ b/usr/src/uts/common/fs/proc/prdata.h @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ @@ -183,6 +187,7 @@ typedef struct prnode { #define PR_INVAL 0x01 /* vnode is invalidated */ #define PR_ISSELF 0x02 /* vnode is a self-open */ #define PR_AOUT 0x04 /* vnode is for an a.out path */ +#define PR_OFFMAX 0x08 /* vnode is a large file open */ /* * Conversion macros. diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c index a3e95a60fc..7831c1f9ea 100644 --- a/usr/src/uts/common/fs/proc/prvnops.c +++ b/usr/src/uts/common/fs/proc/prvnops.c @@ -23,6 +23,10 @@ * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ @@ -337,6 +341,15 @@ propen(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) } /* + * If this is a large file open, indicate that in our flags -- some + * procfs structures are not off_t-neutral (e.g., priovec_t), and + * the open will need to be differentiated where 32-bit processes + * pass these structures across the user/kernel boundary. + */ + if (flag & FOFFMAX) + pnp->pr_flags |= PR_OFFMAX; + + /* * Do file-specific things. */ switch (type) { diff --git a/usr/src/uts/common/fs/swapfs/swap_subr.c b/usr/src/uts/common/fs/swapfs/swap_subr.c index 74c4302da9..a4d983665b 100644 --- a/usr/src/uts/common/fs/swapfs/swap_subr.c +++ b/usr/src/uts/common/fs/swapfs/swap_subr.c @@ -110,9 +110,11 @@ swapfs_recalc(pgcnt_t pgs) * memory that can be used as swap space should do so by * setting swapfs_desfree at boot time, not swapfs_minfree. * However, swapfs_minfree is tunable by install as a - * workaround for bugid 1147463. + * workaround for bugid 1147463. Note swapfs_minfree is set + * to 1/8th of memory, but clamped at the limit of 256 MB. */ - new_swapfs_minfree = MAX(btopr(2 * 1024 * 1024), pgs >> 3); + new_swapfs_minfree = MIN(MAX(btopr(2 * 1024 * 1024), pgs >> 3), + btopr(256 * 1024 * 1024)); } /* diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c index f8a36a528f..f22cc3ecf0 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -76,7 +77,7 @@ static vfsdef_t vfw = { VFSDEF_VERSION, "tmpfs", tmpfsinit, - VSW_HASPROTO|VSW_STATS|VSW_ZMOUNT, + VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT, &tmpfs_proto_opttbl }; @@ -249,7 +250,7 @@ tmp_mount( return (ENOTDIR); mutex_enter(&mvp->v_lock); - if ((uap->flags & MS_OVERLAY) == 0 && + if ((uap->flags & MS_REMOUNT) == 0 && (uap->flags & MS_OVERLAY) == 0 && (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { mutex_exit(&mvp->v_lock); return (EBUSY); @@ -286,6 +287,21 @@ tmp_mount( (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn)) goto out; + if (uap->flags & MS_REMOUNT) { + tm = (struct tmount *)VFSTOTM(vfsp); + + /* + * If we change the size so its less than what is currently + * being used, we allow that. The file system will simply be + * full until enough files have been removed to get below the + * new max. + */ + mutex_enter(&tm->tm_contents); + tm->tm_anonmax = anonmax; + mutex_exit(&tm->tm_contents); + goto out; + } + if ((tm = tmp_memalloc(sizeof (struct tmount), 0)) == NULL) { pn_free(&dpn); error = ENOMEM; diff --git a/usr/src/uts/common/fs/vfs.c b/usr/src/uts/common/fs/vfs.c index e24f2d3b32..8442894ecd 100644 --- a/usr/src/uts/common/fs/vfs.c +++ b/usr/src/uts/common/fs/vfs.c @@ -3877,6 +3877,8 @@ vfs_to_modname(const char *vfstype) vfstype = "fdfs"; } else if (strncmp(vfstype, "nfs", 3) == 0) { vfstype = "nfs"; + } else if (strcmp(vfstype, "lxproc") == 0) { + vfstype = "lxprocfs"; } return (vfstype); diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c index 382369c7fc..67f21866ec 100644 --- a/usr/src/uts/common/fs/vnode.c +++ b/usr/src/uts/common/fs/vnode.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent Inc. All rights reserved. 
*/ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -65,6 +66,7 @@ #include <fs/fs_subr.h> #include <sys/taskq.h> #include <fs/fs_reparse.h> +#include <sys/time.h> /* Determine if this vnode is a file that is read-only */ #define ISROFILE(vp) \ @@ -199,6 +201,11 @@ static void (**vsd_destructor)(void *); cr = crgetmapped(cr); \ } +#define VOP_LATENCY_10MS 10000000 +#define VOP_LATENCY_100MS 100000000 +#define VOP_LATENCY_1S 1000000000 +#define VOP_LATENCY_10S 10000000000 + /* * Convert stat(2) formats to vnode types and vice versa. (Knows about * numerical order of S_IFMT and vnode types.) @@ -3220,14 +3227,57 @@ fop_read( cred_t *cr, caller_context_t *ct) { - int err; ssize_t resid_start = uiop->uio_resid; + zone_t *zonep = curzone; + zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats; + + hrtime_t start, lat; + ssize_t len; + int err; + + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + start = gethrtime(); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_runq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + } VOPXID_MAP_CR(vp, cr); err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct); - VOPSTATS_UPDATE_IO(vp, read, - read_bytes, (resid_start - uiop->uio_resid)); + len = resid_start - uiop->uio_resid; + + VOPSTATS_UPDATE_IO(vp, read, read_bytes, len); + + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.reads++; + zonep->zone_vfs_rwstats.nread += len; + kstat_runq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + if (lat < VOP_LATENCY_100MS) + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + } + return (err); } @@ -3239,14 +3289,62 @@ fop_write( cred_t *cr, caller_context_t *ct) { - int err; ssize_t resid_start = uiop->uio_resid; + zone_t *zonep = curzone; + zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats; + + hrtime_t start, lat; + ssize_t len; + int err; + + /* + * For the purposes of VFS kstat consumers, the "waitq" calculation is + * repurposed as the active queue for VFS write operations. There's no + * actual wait queue for VFS operations. 
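+	 *
+	 * (Reads are accounted analogously via kstat_runq_enter() and
+	 * kstat_runq_exit() in fop_read() above, so consumers of the
+	 * per-zone VFS kstats can still tell the read and write paths apart.)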
+ */ + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + start = gethrtime(); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_waitq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + } VOPXID_MAP_CR(vp, cr); err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct); - VOPSTATS_UPDATE_IO(vp, write, - write_bytes, (resid_start - uiop->uio_resid)); + len = resid_start - uiop->uio_resid; + + VOPSTATS_UPDATE_IO(vp, write, write_bytes, len); + + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.writes++; + zonep->zone_vfs_rwstats.nwritten += len; + kstat_waitq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + if (lat < VOP_LATENCY_100MS) + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + } + return (err); } diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index e039b9cac4..98aad58025 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* @@ -125,6 +126,7 @@ #include <sys/refcount.h> #include <sys/vdev.h> #include <sys/vdev_impl.h> +#include <sys/zfs_zone.h> #ifdef _KERNEL #include <sys/vmsystm.h> #include <vm/anon.h> @@ -2017,6 +2019,16 @@ arc_reclaim_needed(void) if (availrmem < swapfs_minfree + swapfs_reserve + extra) return (1); + /* + * Check that we have enough availrmem that memory locking (e.g., via + * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum + * stores the number of pages that cannot be locked; when availrmem + * drops below pages_pp_maximum, page locking mechanisms such as + * page_pp_lock() will fail.) + */ + if (availrmem <= pages_pp_maximum) + return (1); + #if defined(__i386) /* * If we're on an i386 platform, it's possible that we'll exhaust the @@ -2917,6 +2929,14 @@ top: rzio = zio_read(pio, spa, bp, buf->b_data, size, arc_read_done, buf, priority, zio_flags, zb); + /* + * At this point, this read I/O has already missed in the ARC + * and will be going through to the disk. The I/O throttle + * should delay this I/O if this zone is using more than its I/O + * priority allows. 
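+	 * (Logical writes are throttled at an analogous choke point in
+	 * dmu_tx_count_write(), which calls
+	 * zfs_zone_io_throttle(ZFS_ZONE_IOP_LOGICAL_WRITE).)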
+ */ + zfs_zone_io_throttle(ZFS_ZONE_IOP_READ); + if (*arc_flags & ARC_WAIT) return (zio_wait(rzio)); @@ -3484,9 +3504,6 @@ arc_init(void) if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) arc_meta_limit = zfs_arc_meta_limit; - if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) - arc_c_min = arc_meta_limit / 2; - if (zfs_arc_grow_retry > 0) arc_grow_retry = zfs_arc_grow_retry; diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index 7a0abd22b5..16e42b951a 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -2703,7 +2703,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr->dt.dl.dr_copies); mutex_exit(&db->db_mtx); } else if (db->db_state == DB_NOFILL) { - ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF); + ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || + zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); dr->dr_zio = zio_write(zio, os->os_spa, txg, db->db_blkptr, NULL, db->db.db_size, &zp, dbuf_write_nofill_ready, dbuf_write_nofill_done, db, diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index 39234eba53..743f5c4656 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011, Joyent, Inc. All rights reserved. */ #include <sys/dmu.h> @@ -950,6 +951,7 @@ xuio_stat_wbuf_nocopy() } #ifdef _KERNEL + int dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) { @@ -1562,7 +1564,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) if (wp & WP_NOFILL) { ASSERT(!ismd && level == 0); - checksum = ZIO_CHECKSUM_OFF; + checksum = ZIO_CHECKSUM_NOPARITY; compress = ZIO_COMPRESS_OFF; dedup = B_FALSE; } diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c index b4579e278c..2301942907 100644 --- a/usr/src/uts/common/fs/zfs/dmu_tx.c +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c @@ -39,11 +39,11 @@ #include <sys/sa_impl.h> #include <sys/zfs_context.h> #include <sys/varargs.h> +#include <sys/zfs_zone.h> typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, uint64_t arg1, uint64_t arg2); - dmu_tx_t * dmu_tx_create_dd(dsl_dir_t *dd) { @@ -223,6 +223,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) if (len == 0) return; + zfs_zone_io_throttle(ZFS_ZONE_IOP_LOGICAL_WRITE); + min_bs = SPA_MINBLOCKSHIFT; max_bs = SPA_MAXBLOCKSHIFT; min_ibs = DN_MIN_INDBLKSHIFT; diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c index 5ef7f54af1..aeeefd178e 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dataset.c +++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
*/ #include <sys/dmu_objset.h> @@ -4133,9 +4134,13 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, dsl_dataset_t *snap; uint64_t used, comp, uncomp; - err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); - if (err != 0) - break; + if (snapobj == new->ds_object) { + snap = new; + } else { + err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); + if (err != 0) + break; + } if (snap->ds_phys->ds_prev_snap_txg == oldsnap->ds_phys->ds_creation_txg) { @@ -4164,7 +4169,8 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, * was not a snapshot of/before new. */ snapobj = snap->ds_phys->ds_prev_snap_obj; - dsl_dataset_rele(snap, FTAG); + if (snap != new) + dsl_dataset_rele(snap, FTAG); if (snapobj == 0) { err = EINVAL; break; diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c index 1cd49c8274..b6af7598e2 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dir.c +++ b/usr/src/uts/common/fs/zfs/dsl_dir.c @@ -36,6 +36,7 @@ #include <sys/zio.h> #include <sys/arc.h> #include <sys/sunddi.h> +#include <sys/zfs_zone.h> #include "zfs_namecheck.h" static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); @@ -839,7 +840,8 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx); } else { if (err == EAGAIN) { - txg_delay(dd->dd_pool, tx->tx_txg, 1); + txg_delay(dd->dd_pool, tx->tx_txg, + zfs_zone_txg_delay()); err = ERESTART; } dsl_pool_memory_pressure(dd->dd_pool); diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c index 418a04c7c2..316b37cebd 100644 --- a/usr/src/uts/common/fs/zfs/dsl_pool.c +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -40,6 +40,7 @@ #include <sys/zfs_znode.h> #include <sys/spa_impl.h> #include <sys/dsl_deadlist.h> +#include <sys/zfs_zone.h> int zfs_no_write_throttle = 0; int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ @@ -529,11 +530,11 @@ dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx) /* * If this transaction group is over 7/8ths capacity, delay - * the caller 1 clock tick. This will slow down the "fill" - * rate until the sync process can catch up with us. + * the caller some number of clock ticks. This will slow down the + * "fill" rate until the sync process can catch up with us. */ if (reserved && reserved > (write_limit - (write_limit >> 3))) - txg_delay(dp, tx->tx_txg, 1); + txg_delay(dp, tx->tx_txg, zfs_zone_txg_delay()); return (0); } diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_disk.h b/usr/src/uts/common/fs/zfs/sys/vdev_disk.h index b748571ea0..ffca0a7dcb 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_disk.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_disk.h @@ -21,13 +21,12 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ #ifndef _SYS_VDEV_DISK_H #define _SYS_VDEV_DISK_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/vdev.h> #ifdef _KERNEL #include <sys/buf.h> @@ -40,14 +39,22 @@ extern "C" { #endif +#ifdef _KERNEL typedef struct vdev_disk { ddi_devid_t vd_devid; char *vd_minor; ldi_handle_t vd_lh; } vdev_disk_t; +#endif +extern int vdev_disk_physio(vdev_t *, caddr_t, size_t, uint64_t, int); + +/* + * Since vdev_disk.c is not compiled into libzpool, this function should only be + * defined in the zfs kernel module. 
+ */ #ifdef _KERNEL -extern int vdev_disk_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int); +extern int vdev_disk_ldi_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int); #endif #ifdef __cplusplus } diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 1df61a587d..c297ae165c 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -102,6 +102,7 @@ struct vdev_queue { avl_tree_t vq_read_tree; avl_tree_t vq_write_tree; avl_tree_t vq_pending_tree; + zoneid_t vq_last_zone_id; kmutex_t vq_lock; }; diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h b/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h new file mode 100644 index 0000000000..496b718bd6 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h @@ -0,0 +1,49 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_VDEV_RAIDZ_H +#define _SYS_VDEV_RAIDZ_H + +#include <sys/vdev.h> +#include <sys/semaphore.h> +#include <sys/buf.h> +#ifdef _KERNEL +#include <sys/ddi.h> +#include <sys/sunldi.h> +#include <sys/sunddi.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL +extern int vdev_raidz_physio(vdev_t *, + caddr_t, size_t, uint64_t, uint64_t, boolean_t); +#endif +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_RAIDZ_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_zone.h b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h new file mode 100644 index 0000000000..069ec004f3 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h @@ -0,0 +1,62 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _SYS_FS_ZFS_ZONE_H +#define _SYS_FS_ZFS_ZONE_H + +#ifdef _KERNEL +#include <sys/isa_defs.h> +#include <sys/types32.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + ZFS_ZONE_IOP_READ = 0, + ZFS_ZONE_IOP_WRITE, + ZFS_ZONE_IOP_LOGICAL_WRITE, +} zfs_zone_iop_type_t; + +extern void zfs_zone_io_throttle(zfs_zone_iop_type_t); + +extern void zfs_zone_zio_init(zio_t *); +extern void zfs_zone_zio_start(zio_t *); +extern void zfs_zone_zio_done(zio_t *); +extern void zfs_zone_zio_dequeue(zio_t *); +extern void zfs_zone_zio_enqueue(zio_t *); +extern void zfs_zone_report_txg_sync(void *); +extern int zfs_zone_txg_delay(); +#ifdef _KERNEL +extern zio_t *zfs_zone_schedule(vdev_queue_t *); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_ZONE_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index f6cf3f5349..032b77715f 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -24,6 +24,7 @@ */ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2011 Joyent, Inc. All rights reserved. */ #ifndef _ZIO_H @@ -79,6 +80,8 @@ enum zio_checksum { ZIO_CHECKSUM_FLETCHER_4, ZIO_CHECKSUM_SHA256, ZIO_CHECKSUM_ZILOG2, + ZIO_CHECKSUM_SHA256_MAC, + ZIO_CHECKSUM_NOPARITY, ZIO_CHECKSUM_FUNCTIONS }; @@ -421,6 +424,9 @@ struct zio { zio_cksum_report_t *io_cksum_report; uint64_t io_ena; + zoneid_t io_zoneid; /* zone which originated this I/O */ + hrtime_t io_start; /* time I/O entered zio pipeline */ + hrtime_t io_dispatched; /* time I/O was dispatched to disk */ /* Taskq dispatching state */ taskq_ent_t io_tqent; }; diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c index 55b1f3884b..2269ef271e 100644 --- a/usr/src/uts/common/fs/zfs/txg.c +++ b/usr/src/uts/common/fs/zfs/txg.c @@ -30,6 +30,7 @@ #include <sys/dsl_pool.h> #include <sys/dsl_scan.h> #include <sys/callb.h> +#include <sys/zfs_zone.h> /* * Pool-wide transaction groups. @@ -411,6 +412,8 @@ txg_sync_thread(dsl_pool_t *dp) txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); mutex_exit(&tx->tx_sync_lock); + zfs_zone_report_txg_sync(dp); + start = ddi_get_lbolt(); spa_sync(spa, txg); delta = ddi_get_lbolt() - start; diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index d7417736b4..f78580d0f1 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -20,9 +20,11 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent, Inc. All rights reserved. 
*/ #include <sys/zfs_context.h> +#include <sys/zfs_zone.h> #include <sys/spa_impl.h> #include <sys/refcount.h> #include <sys/vdev_disk.h> @@ -325,8 +327,18 @@ vdev_disk_close(vdev_t *vd) } int -vdev_disk_physio(ldi_handle_t vd_lh, caddr_t data, size_t size, - uint64_t offset, int flags) +vdev_disk_physio(vdev_t *vd, caddr_t data, + size_t size, uint64_t offset, int flags) +{ + vdev_disk_t *dvd = vd->vdev_tsd; + + ASSERT(vd->vdev_ops == &vdev_disk_ops); + return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags)); +} + +int +vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data, + size_t size, uint64_t offset, int flags) { buf_t *bp; int error = 0; @@ -479,6 +491,8 @@ vdev_disk_io_start(zio_t *zio) bp->b_bufsize = zio->io_size; bp->b_iodone = (int (*)())vdev_disk_io_intr; + zfs_zone_zio_start(zio); + /* ldi_strategy() will return non-zero only on programming errors */ VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0); @@ -490,6 +504,8 @@ vdev_disk_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; + zfs_zone_zio_done(zio); + /* * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if * the device has been removed. If this is the case, then we trigger an @@ -574,7 +590,7 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) /* read vdev label */ offset = vdev_label_offset(size, l, 0); - if (vdev_disk_physio(vd_lh, (caddr_t)label, + if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label, VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0) continue; diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index 5a0d3ee970..4ea958a9f6 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -21,12 +21,14 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright (c) 2011, Joyent, Inc. All rights reserved. */ #include <sys/zfs_context.h> #include <sys/vdev_impl.h> #include <sys/zio.h> #include <sys/avl.h> +#include <sys/zfs_zone.h> /* * These tunables are for performance analysis. @@ -120,6 +122,8 @@ vdev_queue_init(vdev_t *vd) avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); + + vq->vq_last_zone_id = 0; } void @@ -139,6 +143,7 @@ static void vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { avl_add(&vq->vq_deadline_tree, zio); + zfs_zone_zio_enqueue(zio); avl_add(zio->io_vdev_tree, zio); } @@ -146,6 +151,7 @@ static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { avl_remove(&vq->vq_deadline_tree, zio); + zfs_zone_zio_dequeue(zio); avl_remove(zio->io_vdev_tree, zio); } @@ -188,7 +194,11 @@ again: avl_numnodes(&vq->vq_deadline_tree) == 0) return (NULL); +#ifdef _KERNEL + fio = lio = zfs_zone_schedule(vq); +#else fio = lio = avl_first(&vq->vq_deadline_tree); +#endif t = fio->io_vdev_tree; flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT; diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c index 4b0f5602c1..6094e01876 100644 --- a/usr/src/uts/common/fs/zfs/vdev_raidz.c +++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c @@ -21,11 +21,15 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Joyent, Inc. All rights reserved. 
*/ #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/vdev_impl.h> +#include <sys/vdev_disk.h> +#include <sys/vdev_file.h> +#include <sys/vdev_raidz.h> #include <sys/zio.h> #include <sys/zio_checksum.h> #include <sys/fs/zfs.h> @@ -152,6 +156,8 @@ typedef struct raidz_map { VDEV_RAIDZ_64MUL_2((x), mask); \ } +#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE) + /* * Force reconstruction to use the general purpose method. */ @@ -431,12 +437,12 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = { }; static raidz_map_t * -vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, - uint64_t nparity) +vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, + uint64_t unit_shift, uint64_t dcols, uint64_t nparity) { raidz_map_t *rm; - uint64_t b = zio->io_offset >> unit_shift; - uint64_t s = zio->io_size >> unit_shift; + uint64_t b = offset >> unit_shift; + uint64_t s = size >> unit_shift; uint64_t f = b % dcols; uint64_t o = (b / dcols) << unit_shift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; @@ -506,7 +512,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, for (c = 0; c < rm->rm_firstdatacol; c++) rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); - rm->rm_col[c].rc_data = zio->io_data; + rm->rm_col[c].rc_data = data; for (c = c + 1; c < acols; c++) rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + @@ -535,7 +541,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, ASSERT(rm->rm_cols >= 2); ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); - if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { + if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { devidx = rm->rm_col[0].rc_devidx; o = rm->rm_col[0].rc_offset; rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; @@ -547,8 +553,6 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, rm->rm_skipstart = 1; } - zio->io_vsd = rm; - zio->io_vsd_ops = &vdev_raidz_vsd_ops; return (rm); } @@ -1491,6 +1495,104 @@ vdev_raidz_close(vdev_t *vd) vdev_close(vd->vdev_child[c]); } +/* + * Handle a read or write request to a RAID-Z dump device. + * + * Unlike the normal RAID-Z codepath in vdev_raidz_io_start(), reads and writes + * to the dump zvol are written across a full 128Kb block. As a result, an + * individual I/O may not span all columns in the RAID-Z map; moreover, a small + * I/O may only span a single column. + * + * Note that since there are no parity bits calculated or written, this format + * remains the same no matter how many parity bits are used in a normal RAID-Z + * stripe. + */ +int +vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size, + uint64_t offset, uint64_t origoffset, boolean_t doread) +{ + vdev_t *tvd = vd->vdev_top; + vdev_t *cvd; + raidz_map_t *rm; + raidz_col_t *rc; + int c, err = 0; + + uint64_t start, end, colstart, colend; + uint64_t coloffset, colsize, colskip; + + int flags = doread ? B_READ : B_WRITE; + +#ifdef _KERNEL + + /* + * Don't write past the end of the block + */ + VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE); + + /* + * Even if this I/O operation doesn't span the full block size, let's + * treat the on-disk format as if the only blocks are the complete 128k + * size. + */ + start = offset; + end = start + size; + + /* + * Allocate a RAID-Z map for this block. Note that this block starts + * from the "original" offset, this is, the offset of the extent which + * contains the requisite offset of the data being read or written. 
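+	 *
+	 * The data pointer is backed up by (offset - origoffset) so that the
+	 * column layout computed for the full SPA_MAXBLOCKSIZE block lines up
+	 * with the caller's buffer; the per-column loop below then clips each
+	 * child I/O to the [start, end) range that was actually requested.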
+ */ + rm = vdev_raidz_map_alloc(data - (offset - origoffset), + SPA_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift, vd->vdev_children, + vd->vdev_nparity); + + coloffset = origoffset; + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; + c++, coloffset += rc->rc_size) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + + /* + * Find the start and end of this column in the RAID-Z matrix, + * keeping in mind that the stated size and offset of the + * operation may not fill the entire column for this vdev. + * + * If any portion of the data being read or written spans this + * column, issue the appropriate operation to the child vdev. + */ + if (coloffset + rc->rc_size <= start) + continue; + if (coloffset >= end) + continue; + + colstart = MAX(coloffset, start); + colend = MIN(end, coloffset + rc->rc_size); + colsize = colend - colstart; + colskip = colstart - coloffset; + + VERIFY3U(colsize, <=, rc->rc_size); + VERIFY3U(colskip, <=, rc->rc_size); + + /* + * Note that the child vdev will have a vdev label at the start + * of its range of offsets, hence the need for + * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another + * example of why this calculation is needed. + */ + if ((err = vdev_disk_physio(cvd, + ((char *)rc->rc_data) + colskip, colsize, + VDEV_LABEL_OFFSET(rc->rc_offset) + colskip, + flags)) != 0) + break; + } + + vdev_raidz_map_free(rm); +#endif /* KERNEL */ + + return (err); +} + static uint64_t vdev_raidz_asize(vdev_t *vd, uint64_t psize) { @@ -1526,9 +1628,13 @@ vdev_raidz_io_start(zio_t *zio) raidz_col_t *rc; int c, i; - rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, + rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset, + tvd->vdev_ashift, vd->vdev_children, vd->vdev_nparity); + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; + ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); if (zio->io_type == ZIO_TYPE_WRITE) { @@ -1659,6 +1765,13 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) int c, ret = 0; raidz_col_t *rc; + blkptr_t *bp = zio->io_bp; + uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : + (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); + + if (checksum == ZIO_CHECKSUM_NOPARITY) + return (ret); + for (c = 0; c < rm->rm_firstdatacol; c++) { rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index 929fc06296..baffc223a3 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -23,6 +23,7 @@ * Portions Copyright 2011 Martin Matuska * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
*/ #include <sys/types.h> @@ -1757,7 +1758,8 @@ zfs_ioc_vdev_setfru(zfs_cmd_t *zc) } static int -zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) +zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os, + boolean_t cachedpropsonly) { int error = 0; nvlist_t *nv; @@ -1775,7 +1777,8 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) * XXX reading with out owning */ if (!zc->zc_objset_stats.dds_inconsistent && - dmu_objset_type(os) == DMU_OST_ZVOL) { + dmu_objset_type(os) == DMU_OST_ZVOL && + !cachedpropsonly) { error = zvol_get_stats(os, nv); if (error == EIO) return (error); @@ -1802,13 +1805,25 @@ static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { objset_t *os = NULL; + nvlist_t *nvl = NULL; + boolean_t cachedpropsonly = B_FALSE; int error; - if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) + if (zc->zc_nvlist_src != NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl) != 0)) return (error); - error = zfs_ioc_objset_stats_impl(zc, os); + if (nvl != NULL) { + (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly", + &cachedpropsonly); + nvlist_free(nvl); + } + + if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) + return (error); + error = zfs_ioc_objset_stats_impl(zc, os, cachedpropsonly); dmu_objset_rele(os, FTAG); return (error); @@ -2022,8 +2037,21 @@ static int zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) { objset_t *os; + nvlist_t *nvl = NULL; + boolean_t cachedpropsonly = B_FALSE; int error; + if (zc->zc_nvlist_src != NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl) != 0)) + return (error); + + if (nvl != NULL) { + (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly", + &cachedpropsonly); + nvlist_free(nvl); + } + top: if (zc->zc_cookie == 0) (void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch, @@ -2072,8 +2100,10 @@ top: objset_t *ossnap; error = dmu_objset_from_ds(ds, &ossnap); - if (error == 0) - error = zfs_ioc_objset_stats_impl(zc, ossnap); + if (error == 0) { + error = zfs_ioc_objset_stats_impl(zc, + ossnap, cachedpropsonly); + } dsl_dataset_rele(ds, FTAG); } } else if (error == ENOENT) { @@ -2789,6 +2819,7 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; + int error; ASSERT(zplprops != NULL); @@ -2832,8 +2863,9 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); - if (norm == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0); + if (norm == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); @@ -2842,13 +2874,15 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, */ if (norm) u8 = 1; - if (u8 == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0); + if (u8 == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); - if (sense == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0); + if (sense == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); diff --git 
a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c index 0c39274caf..9fae31fa6b 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vnops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c @@ -25,6 +25,10 @@ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + #include <sys/types.h> #include <sys/param.h> #include <sys/time.h> @@ -4145,6 +4149,8 @@ top: &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); + err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); } dmu_tx_commit(tx); @@ -4655,27 +4661,6 @@ zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, return (0); } -/* - * The reason we push dirty pages as part of zfs_delmap() is so that we get a - * more accurate mtime for the associated file. Since we don't have a way of - * detecting when the data was actually modified, we have to resort to - * heuristics. If an explicit msync() is done, then we mark the mtime when the - * last page is pushed. The problem occurs when the msync() call is omitted, - * which by far the most common case: - * - * open() - * mmap() - * <modify memory> - * munmap() - * close() - * <time lapse> - * putpage() via fsflush - * - * If we wait until fsflush to come along, we can have a modification time that - * is some arbitrary point in the future. In order to prevent this in the - * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is - * torn down. - */ /* ARGSUSED */ static int zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, @@ -4687,10 +4672,6 @@ zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages); atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages); - if ((flags & MAP_SHARED) && (prot & PROT_WRITE) && - vn_has_cached_data(vp)) - (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct); - return (0); } diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c new file mode 100644 index 0000000000..08f4f38e04 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs_zone.c @@ -0,0 +1,1179 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zfs_zone.h> + +#ifndef _KERNEL + +/* + * Stubs for when compiling for user-land. 
+ */ + +void +zfs_zone_io_throttle(zfs_zone_iop_type_t type) +{ +} + +void +zfs_zone_zio_init(zio_t *zp) +{ +} + +void +zfs_zone_zio_start(zio_t *zp) +{ +} + +void +zfs_zone_zio_done(zio_t *zp) +{ +} + +void +zfs_zone_zio_dequeue(zio_t *zp) +{ +} + +void +zfs_zone_zio_enqueue(zio_t *zp) +{ +} + +/*ARGSUSED*/ +void +zfs_zone_report_txg_sync(void *dp) +{ +} + +int +zfs_zone_txg_delay() +{ + return (1); +} + +#else + +/* + * The real code. + */ + +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/atomic.h> +#include <sys/zio.h> +#include <sys/zone.h> +#include <sys/avl.h> +#include <sys/sdt.h> +#include <sys/ddi.h> + +/* + * The zone throttle delays read and write operations from certain zones based + * on each zone's IO utilitzation. Once a cycle (defined by zfs_zone_cycle_time + * below), the delays for each zone are recalculated based on the utilization + * over the previous window. + */ +boolean_t zfs_zone_delay_enable = B_TRUE; /* enable IO throttle */ +uint16_t zfs_zone_delay_step = 5; /* amount to change delay */ +uint16_t zfs_zone_delay_ceiling = 100; /* longest possible delay */ + +hrtime_t zfs_zone_last_checked = 0; + +boolean_t zfs_zone_priority_enable = B_TRUE; /* enable IO priority */ + +/* + * For certain workloads, one zone may be issuing primarily sequential I/O and + * another primarily random I/O. The sequential I/O will complete much more + * quickly than the random I/O, driving the average system latency for those + * operations way down. As a result, the random I/O may be throttled back, even + * though the sequential I/O should be throttled to allow the random I/O more + * access to the disk. + * + * This tunable limits the discrepancy between the read and write system + * latency. If one becomes excessively high, this tunable prevents the I/O + * throttler from exacerbating the imbalance. + */ +uint_t zfs_zone_rw_lat_limit = 10; + + +/* + * The I/O throttle will only start delaying zones when it detects disk + * utilization has reached a certain level. This tunable controls the threshold + * at which the throttle will start delaying zones. The calculation should + * correspond closely with the %b column from iostat. + */ +uint_t zfs_zone_util_threshold = 80; + +/* + * Throughout this subsystem, our timestamps are in microseconds. Our system + * average cycle is one second or 1 million microseconds. Our zone counter + * update cycle is two seconds or 2 million microseconds. We use a longer + * duration for that cycle because some ops can see a little over two seconds of + * latency when they are being starved by another zone. + */ +uint_t zfs_zone_sys_avg_cycle = 1000000; /* 1 s */ +uint_t zfs_zone_cycle_time = 2000000; /* 2 s */ + +uint_t zfs_zone_adjust_time = 250000; /* 250 ms */ + +typedef struct { + hrtime_t cycle_start; + int cycle_cnt; + hrtime_t cycle_lat; + hrtime_t sys_avg_lat; +} sys_lat_cycle_t; + +typedef struct { + hrtime_t zi_now; + uint_t zi_avgrlat; + uint_t zi_avgwlat; + uint64_t zi_totpri; + uint64_t zi_totutil; + int zi_active; + uint_t zi_diskutil; +} zoneio_stats_t; + +static sys_lat_cycle_t rd_lat; +static sys_lat_cycle_t wr_lat; + +/* + * Some basic disk stats to determine disk utilization. + */ +kmutex_t zfs_disk_lock; +uint_t zfs_disk_rcnt; +hrtime_t zfs_disk_rtime = 0; +hrtime_t zfs_disk_rlastupdate = 0; + +hrtime_t zfs_disk_last_rtime = 0; + +/* + * Data used to keep track of how often txg flush is running. 
+ */ +extern int zfs_txg_timeout; +static uint_t txg_last_check; +static uint_t txg_cnt; +static uint_t txg_flush_rate; + +boolean_t zfs_zone_schedule_enable = B_TRUE; /* enable IO sched. */ +/* + * Threshold for when zio scheduling should kick in. + * + * This threshold is based on 1/2 of the zfs_vdev_max_pending value for the + * number of I/Os that can be pending on a device. If there are more than a + * few ops already queued up, beyond those already issued to the vdev, then + * use scheduling to get the next zio. + */ +int zfs_zone_schedule_thresh = 5; + +/* + * Tunables for delay throttling when TxG flush is occurring. + */ +int zfs_zone_txg_throttle_scale = 2; +int zfs_zone_txg_delay_ticks = 2; + +typedef struct { + int zq_qdepth; + int zq_priority; + int zq_wt; + zoneid_t zq_zoneid; +} zone_q_bump_t; + +/* + * This uses gethrtime() but returns a value in usecs. + */ +#define GET_USEC_TIME (gethrtime() / 1000) +#define NANO_TO_MICRO(x) (x / (NANOSEC / MICROSEC)) + +/* + * Keep track of the zone's ZFS IOPs. + * + * If the number of ops is >1 then we can just use that value. However, + * if the number of ops is <2 then we might have a zone which is trying to do + * IO but is not able to get any ops through the system. We don't want to lose + * track of this zone so we factor in its decayed count into the current count. + * + * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed count. + * However, since this calculation is driven by IO activity and since IO does + * not happen at fixed intervals, we use a timestamp to see when the last update + * was made. If it was more than one cycle ago, then we need to decay the + * historical count by the proper number of additional cycles in which no IO was + * performed. + * + * Return true if we actually computed a new historical count. + * If we're still within an active cycle there is nothing to do, return false. + */ +static hrtime_t +compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp) +{ + hrtime_t delta; + int gen_cnt; + + /* + * Check if its time to recompute a new zone count. + * If we're still collecting data for the current cycle, return false. + */ + delta = unow - cp->cycle_start; + if (delta < zfs_zone_cycle_time) + return (delta); + + /* A previous cycle is past, compute the new zone count. */ + + /* + * Figure out how many generations we have to decay the historical + * count, since multiple cycles may have elapsed since our last IO. + * We depend on int rounding here. + */ + gen_cnt = (int)(delta / zfs_zone_cycle_time); + + /* If more than 5 cycles since last the IO, reset count. */ + if (gen_cnt > 5) { + cp->zone_avg_cnt = 0; + } else { + /* Update the count. */ + int i; + + /* + * If the zone did more than 1 IO, just use its current count + * as the historical value, otherwise decay the historical + * count and factor that into the new historical count. We + * pick a threshold > 1 so that we don't lose track of IO due + * to int rounding. + */ + if (cp->cycle_cnt > 1) + cp->zone_avg_cnt = cp->cycle_cnt; + else + cp->zone_avg_cnt = cp->cycle_cnt + + (cp->zone_avg_cnt / 2); + + /* + * If more than one generation has elapsed since the last + * update, decay the values further. + */ + for (i = 1; i < gen_cnt; i++) + cp->zone_avg_cnt = cp->zone_avg_cnt / 2; + } + + /* A new cycle begins. */ + cp->cycle_start = unow; + cp->cycle_cnt = 0; + + return (0); +} + +/* + * Add IO op data to the zone. 
+ */
+static void
+add_zone_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op)
+{
+ switch (op) {
+ case ZFS_ZONE_IOP_READ:
+ (void) compute_historical_zone_cnt(unow, &zonep->zone_rd_ops);
+ zonep->zone_rd_ops.cycle_cnt++;
+ break;
+ case ZFS_ZONE_IOP_WRITE:
+ (void) compute_historical_zone_cnt(unow, &zonep->zone_wr_ops);
+ zonep->zone_wr_ops.cycle_cnt++;
+ break;
+ case ZFS_ZONE_IOP_LOGICAL_WRITE:
+ (void) compute_historical_zone_cnt(unow, &zonep->zone_lwr_ops);
+ zonep->zone_lwr_ops.cycle_cnt++;
+ break;
+ }
+}
+
+/*
+ * Use a decaying average to keep track of the overall system latency.
+ *
+ * We want to have the recent activity heavily weighted, but if the
+ * activity decreases or stops, then the average should quickly decay
+ * down to the new value.
+ *
+ * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed average.
+ * However, since this calculation is driven by IO activity and since IO does
+ * not happen at fixed intervals, we use a timestamp to see when the last
+ * update was made. If it was more than one cycle ago, then we need to decay
+ * the average by the proper number of additional cycles in which no IO was
+ * performed.
+ *
+ * Return true if we actually computed a new system average.
+ * If we're still within an active cycle there is nothing to do, return false.
+ */
+static int
+compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp)
+{
+ hrtime_t delta;
+ int gen_cnt;
+
+ /*
+ * Check if it's time to recompute a new average.
+ * If we're still collecting data for the current cycle, return false.
+ */
+ delta = unow - cp->cycle_start;
+ if (delta < zfs_zone_sys_avg_cycle)
+ return (0);
+
+ /* A previous cycle is past, compute a new system average. */
+
+ /*
+ * Figure out how many generations we have to decay, since multiple
+ * cycles may have elapsed since our last IO.
+ * We count on int rounding here.
+ */
+ gen_cnt = (int)(delta / zfs_zone_sys_avg_cycle);
+
+ /* If more than 5 cycles since the last IO, reset the average. */
+ if (gen_cnt > 5) {
+ cp->sys_avg_lat = 0;
+ } else {
+ /* Update the average. */
+ int i;
+
+ cp->sys_avg_lat =
+ (cp->sys_avg_lat + cp->cycle_lat) / (1 + cp->cycle_cnt);
+
+ /*
+ * If more than one generation has elapsed since the last
+ * update, decay the values further.
+ */
+ for (i = 1; i < gen_cnt; i++)
+ cp->sys_avg_lat = cp->sys_avg_lat / 2;
+ }
+
+ /* A new cycle begins. */
+ cp->cycle_start = unow;
+ cp->cycle_cnt = 0;
+ cp->cycle_lat = 0;
+
+ return (1);
+}
+
+static void
+add_sys_iop(hrtime_t unow, int op, int lat)
+{
+ switch (op) {
+ case ZFS_ZONE_IOP_READ:
+ (void) compute_new_sys_avg(unow, &rd_lat);
+ rd_lat.cycle_cnt++;
+ rd_lat.cycle_lat += lat;
+ break;
+ case ZFS_ZONE_IOP_WRITE:
+ (void) compute_new_sys_avg(unow, &wr_lat);
+ wr_lat.cycle_cnt++;
+ wr_lat.cycle_lat += lat;
+ break;
+ }
+}
+
+/*
+ * Get the zone IO counts.
+ */
+static uint_t
+calc_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
+{
+ hrtime_t delta;
+ uint_t cnt;
+
+ if ((delta = compute_historical_zone_cnt(unow, cp)) == 0) {
+ /*
+ * No activity in the current cycle, we already have the
+ * historical data so we'll use that.
+ */
+ cnt = cp->zone_avg_cnt;
+ } else {
+ /*
+ * If we're less than half way through the cycle then use
+ * the current count plus half the historical count, otherwise
+ * just use the current count.
+ */
+ if (delta < (zfs_zone_cycle_time / 2))
+ cnt = cp->cycle_cnt + (cp->zone_avg_cnt / 2);
+ else
+ cnt = cp->cycle_cnt;
+ }
+
+ return (cnt);
+}
+
+/*
+ * Get the average read/write latency in usecs for the system.
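+ *
+ * Within an active cycle, calc_avg_lat() below weights each current-cycle
+ * sample 8:1 against the stored decayed average, so that recent activity
+ * dominates without discarding history entirely.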
+ */ +static uint_t +calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp) +{ + if (compute_new_sys_avg(unow, cp)) { + /* + * No activity in the current cycle, we already have the + * historical data so we'll use that. + */ + return (cp->sys_avg_lat); + } else { + /* + * We're within a cycle; weight the current activity higher + * compared to the historical data and use that. + */ + extern void __dtrace_probe_zfs__zone__calc__wt__avg(uintptr_t, + uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__calc__wt__avg( + (uintptr_t)cp->sys_avg_lat, + (uintptr_t)cp->cycle_lat, + (uintptr_t)cp->cycle_cnt); + + return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) / + (1 + (cp->cycle_cnt * 8))); + } +} + +/* + * Account for the current IOP on the zone and for the system as a whole. + * The latency parameter is in usecs. + */ +static void +add_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat) +{ + /* Add op to zone */ + add_zone_iop(zonep, unow, op); + + /* Track system latency */ + if (op != ZFS_ZONE_IOP_LOGICAL_WRITE) + add_sys_iop(unow, op, lat); +} + +/* + * Calculate and return the total number of read ops, write ops and logical + * write ops for the given zone. If the zone has issued operations of any type + * return a non-zero value, otherwise return 0. + */ +static int +get_zone_io_cnt(hrtime_t unow, zone_t *zonep, uint_t *rops, uint_t *wops, + uint_t *lwops) +{ + *rops = calc_zone_cnt(unow, &zonep->zone_rd_ops); + *wops = calc_zone_cnt(unow, &zonep->zone_wr_ops); + *lwops = calc_zone_cnt(unow, &zonep->zone_lwr_ops); + + extern void __dtrace_probe_zfs__zone__io__cnt(uintptr_t, + uintptr_t, uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__io__cnt((uintptr_t)zonep->zone_id, + (uintptr_t)(*rops), (uintptr_t)*wops, (uintptr_t)*lwops); + + return (*rops | *wops | *lwops); +} + +/* + * Get the average read/write latency in usecs for the system. + */ +static void +get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat) +{ + *rlat = calc_avg_lat(unow, &rd_lat); + *wlat = calc_avg_lat(unow, &wr_lat); + + /* + * In an attempt to improve the accuracy of the throttling algorithm, + * assume that IO operations can't have zero latency. Instead, assume + * a reasonable lower bound for each operation type. If the actual + * observed latencies are non-zero, use those latency values instead. + */ + if (*rlat == 0) + *rlat = 1000; + if (*wlat == 0) + *wlat = 1000; + + extern void __dtrace_probe_zfs__zone__sys__avg__lat(uintptr_t, + uintptr_t); + + __dtrace_probe_zfs__zone__sys__avg__lat((uintptr_t)(*rlat), + (uintptr_t)*wlat); +} + +/* + * Find disk utilization for each zone and average utilization for all active + * zones. 
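+ *
+ * A zone's "utilization" is its recent op counts weighted by the current
+ * system-wide average latencies:
+ *
+ *   util = rops * avgrlat + (wops + lwops) * avgwlat
+ *
+ * Purely illustrative numbers: 100 reads against a 1000 usec average read
+ * latency plus 250 combined writes against a 2000 usec average write
+ * latency give a utilization of 100000 + 500000 = 600000. The value only
+ * has meaning relative to the other zones measured over the same interval.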
+ */
+static int
+zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg)
+{
+ zoneio_stats_t *sp = arg;
+ uint_t rops, wops, lwops;
+
+ if (zonep->zone_id == GLOBAL_ZONEID ||
+ get_zone_io_cnt(sp->zi_now, zonep, &rops, &wops, &lwops) == 0) {
+ zonep->zone_io_util = 0;
+ return (0);
+ }
+
+ zonep->zone_io_util = (rops * sp->zi_avgrlat) +
+ (wops * sp->zi_avgwlat) + (lwops * sp->zi_avgwlat);
+ sp->zi_totutil += zonep->zone_io_util;
+
+ if (zonep->zone_io_util > 0) {
+ sp->zi_active++;
+ sp->zi_totpri += zonep->zone_zfs_io_pri;
+ }
+
+ /*
+ * sdt:::zfs-zone-utilization
+ *
+ * arg0: zone ID
+ * arg1: read operations observed during time window
+ * arg2: physical write operations observed during time window
+ * arg3: logical write ops observed during time window
+ * arg4: calculated utilization given read and write ops
+ * arg5: I/O priority assigned to this zone
+ */
+ extern void __dtrace_probe_zfs__zone__utilization(
+ uint_t, uint_t, uint_t, uint_t, uint_t, uint_t);
+
+ __dtrace_probe_zfs__zone__utilization((uint_t)(zonep->zone_id),
+ (uint_t)rops, (uint_t)wops, (uint_t)lwops,
+ (uint_t)zonep->zone_io_util, (uint_t)zonep->zone_zfs_io_pri);
+
+ return (0);
+}
+
+static void
+zfs_zone_delay_inc(zone_t *zonep)
+{
+ if (zonep->zone_io_delay < zfs_zone_delay_ceiling)
+ zonep->zone_io_delay += zfs_zone_delay_step;
+}
+
+static void
+zfs_zone_delay_dec(zone_t *zonep)
+{
+ if (zonep->zone_io_delay > 0)
+ zonep->zone_io_delay -= zfs_zone_delay_step;
+}
+
+/*
+ * For all zones "far enough" away from the average utilization, increase that
+ * zone's delay. Otherwise, reduce its delay.
+ */
+static int
+zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg)
+{
+ zoneio_stats_t *sp = arg;
+ uint16_t delay = zonep->zone_io_delay;
+ uint_t fairutil = 0;
+
+ zonep->zone_io_util_above_avg = B_FALSE;
+
+ /*
+ * Given the calculated total utilization for all zones, calculate the
+ * fair share of I/O for this zone.
+ */
+ if (zfs_zone_priority_enable && sp->zi_totpri > 0) {
+ fairutil = (sp->zi_totutil * zonep->zone_zfs_io_pri) /
+ sp->zi_totpri;
+ } else if (sp->zi_active > 0) {
+ fairutil = sp->zi_totutil / sp->zi_active;
+ }
+
+ /*
+ * Adjust each IO's delay. If the overall delay becomes too high, avoid
+ * increasing beyond the ceiling value.
+ */
+ if (zonep->zone_io_util > fairutil &&
+ sp->zi_diskutil > zfs_zone_util_threshold) {
+ zonep->zone_io_util_above_avg = B_TRUE;
+
+ if (sp->zi_active > 1)
+ zfs_zone_delay_inc(zonep);
+ } else if (zonep->zone_io_util < fairutil || sp->zi_active <= 1) {
+ zfs_zone_delay_dec(zonep);
+ }
+
+ /*
+ * sdt:::zfs-zone-throttle
+ *
+ * arg0: zone ID
+ * arg1: old delay for this zone
+ * arg2: new delay for this zone
+ * arg3: calculated fair I/O utilization
+ * arg4: actual I/O utilization
+ */
+ extern void __dtrace_probe_zfs__zone__throttle(
+ uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
+
+ __dtrace_probe_zfs__zone__throttle(
+ (uintptr_t)zonep->zone_id, (uintptr_t)delay,
+ (uintptr_t)zonep->zone_io_delay, (uintptr_t)fairutil,
+ (uintptr_t)zonep->zone_io_util);
+
+ return (0);
+}
+
+/*
+ * Examine the utilization between different zones, and adjust the delay for
+ * each zone appropriately.
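+ *
+ * Each zone is compared against its "fair" share of the total utilization
+ * computed above:
+ *
+ *   fairutil = zi_totutil * zone_zfs_io_pri / zi_totpri
+ *
+ * (or an even split across the active zones when priorities are not in
+ * play). Using made-up numbers: a total utilization of 600000 split across
+ * two zones with priorities 100 and 50 yields fair shares of 400000 and
+ * 200000. Roughly speaking, a zone running above its share while overall
+ * disk utilization exceeds zfs_zone_util_threshold has its delay stepped
+ * up (bounded by zfs_zone_delay_ceiling), and a zone below its share has
+ * its delay stepped back down.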
+ */ +static void +zfs_zone_wait_adjust(hrtime_t unow) +{ + zoneio_stats_t stats; + + (void) bzero(&stats, sizeof (stats)); + + stats.zi_now = unow; + get_sys_avg_lat(unow, &stats.zi_avgrlat, &stats.zi_avgwlat); + + if (stats.zi_avgrlat > stats.zi_avgwlat * zfs_zone_rw_lat_limit) + stats.zi_avgrlat = stats.zi_avgwlat * zfs_zone_rw_lat_limit; + else if (stats.zi_avgrlat * zfs_zone_rw_lat_limit < stats.zi_avgwlat) + stats.zi_avgwlat = stats.zi_avgrlat * zfs_zone_rw_lat_limit; + + if (zone_walk(zfs_zone_wait_adjust_calculate_cb, &stats) != 0) + return; + + /* + * Calculate disk utilization for the most recent period. + */ + if (zfs_disk_last_rtime == 0 || unow - zfs_zone_last_checked <= 0) { + stats.zi_diskutil = 0; + } else { + stats.zi_diskutil = + ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) / + ((unow - zfs_zone_last_checked) * 1000); + } + zfs_disk_last_rtime = zfs_disk_rtime; + + /* + * sdt:::zfs-zone-stats + * + * Statistics observed over the last period: + * + * arg0: average system read latency + * arg1: average system write latency + * arg2: number of active zones + * arg3: total I/O 'utilization' for all zones + * arg4: total I/O priority of all active zones + * arg5: calculated disk utilization + */ + extern void __dtrace_probe_zfs__zone__stats( + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__stats((uintptr_t)(stats.zi_avgrlat), + (uintptr_t)(stats.zi_avgwlat), + (uintptr_t)(stats.zi_active), + (uintptr_t)(stats.zi_totutil), + (uintptr_t)(stats.zi_totpri), + (uintptr_t)(stats.zi_diskutil)); + + (void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats); +} + +/* + * Callback used to calculate a zone's IO schedule priority. + * + * We scan the zones looking for ones with ops in the queue. Out of those, + * we pick the one that calculates to the highest schedule priority. + */ +static int +get_sched_pri_cb(zone_t *zonep, void *arg) +{ + int pri; + zone_q_bump_t *qbp = arg; + + extern void __dtrace_probe_zfs__zone__enqueued(uintptr_t, uintptr_t); + __dtrace_probe_zfs__zone__enqueued((uintptr_t)(zonep->zone_id), + (uintptr_t)(zonep->zone_zfs_queued)); + + if (zonep->zone_zfs_queued == 0) { + zonep->zone_zfs_weight = 0; + return (0); + } + + /* + * On each pass, increment the zone's weight. We use this as input + * to the calculation to prevent starvation. The value is reset + * each time we issue an IO for this zone so zones which haven't + * done any IO over several iterations will see their weight max + * out. + */ + if (zonep->zone_zfs_weight < 20) + zonep->zone_zfs_weight++; + + /* + * This zone's IO priority is the inverse of the number of IOs + * the zone has enqueued * zone's configured priority * weight. + * The queue depth has already been scaled by 10 to avoid problems + * with int rounding. + * + * This means that zones with fewer IOs in the queue will get + * preference unless other zone's assigned priority pulls them + * ahead. The weight is factored in to help ensure that zones + * which haven't done IO in a while aren't getting starved. + */ + pri = (qbp->zq_qdepth / zonep->zone_zfs_queued) * + zonep->zone_zfs_io_pri * zonep->zone_zfs_weight; + + /* + * If this zone has a higher priority than what we found so far, + * schedule it next. + */ + if (pri > qbp->zq_priority) { + qbp->zq_zoneid = zonep->zone_id; + qbp->zq_priority = pri; + qbp->zq_wt = zonep->zone_zfs_weight; + } + return (0); +} + +/* + * See if we need to bump a zone's zio to the head of the queue. 
+ * + * For single-threaded synchronous workloads a zone cannot get more than + * 1 op into the queue at a time unless the zone is running multiple workloads + * in parallel. This can cause an imbalance in performance if there are zones + * with many parallel workloads (and ops in the queue) vs. other zones which + * are doing simple single-threaded workloads, such as interactive tasks in the + * shell. These zones can get backed up behind a deep queue and their IO + * performance will appear to be very poor as a result. This can make the + * zone work badly for interactive behavior. + * + * The scheduling algorithm kicks in once we start to get a deeper queue. + * Once that occurs, we look at all of the zones to see which one calculates + * to the highest priority. We bump that zone's first zio to the head of the + * queue. + * + * We use a counter on the zone so that we can quickly find how many ops each + * zone has in the queue without having to search the entire queue itself. + * This scales better since the number of zones is expected to be on the + * order of 10-100 whereas the queue depth can be in the range of 50-2000. + * In addition, since the zio's in the queue only have the zoneid, we would + * have to look up the zone for each zio enqueued and that means the overhead + * for scanning the queue each time would be much higher. + * + * In all cases, we fall back to simply pulling the next op off the queue + * if something should go wrong. + */ +static zio_t * +get_next_zio(vdev_queue_t *vq, int qdepth) +{ + zone_q_bump_t qbump; + zio_t *zp = NULL, *zphead; + int cnt = 0; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + + /* To avoid problems with int rounding, scale the queue depth by 10 */ + qbump.zq_qdepth = qdepth * 10; + qbump.zq_priority = 0; + qbump.zq_zoneid = 0; + (void) zone_walk(get_sched_pri_cb, &qbump); + + zphead = avl_first(&vq->vq_deadline_tree); + + /* Check if the scheduler didn't pick a zone for some reason!? */ + if (qbump.zq_zoneid != 0) { + for (zp = avl_first(&vq->vq_deadline_tree); zp != NULL; + zp = avl_walk(&vq->vq_deadline_tree, zp, AVL_AFTER)) { + if (zp->io_zoneid == qbump.zq_zoneid) + break; + cnt++; + } + } + + if (zp == NULL) { + zp = zphead; + } else if (zp != zphead) { + /* + * Only fire the probe if we actually picked a different zio + * than the one already at the head of the queue. + */ + extern void __dtrace_probe_zfs__zone__sched__bump(uintptr_t, + uintptr_t, uintptr_t, uintptr_t); + __dtrace_probe_zfs__zone__sched__bump( + (uintptr_t)(zp->io_zoneid), (uintptr_t)(cnt), + (uintptr_t)(qbump.zq_priority), (uintptr_t)(qbump.zq_wt)); + } + + return (zp); +} + +/* + * Add our zone ID to the zio so we can keep track of which zones are doing + * what, even when the current thread processing the zio is not associated + * with the zone (e.g. the kernel taskq which pushes out RX groups). + */ +void +zfs_zone_zio_init(zio_t *zp) +{ + zone_t *zonep = curzone; + + zp->io_zoneid = zonep->zone_id; +} + +/* + * Track IO operations per zone. Called from dmu_tx_count_write for write ops + * and dmu_read_uio for read ops. For each operation, increment that zone's + * counter based on the type of operation. + * + * There are three basic ways that we can see write ops: + * 1) An application does write syscalls. Those ops go into a TXG which + * we'll count here. Sometime later a kernel taskq thread (we'll see the + * vdev IO as zone 0) will perform some number of physical writes to commit + * the TXG to disk. 
Those writes are not associated with the zone which + * made the write syscalls and the number of operations is not correlated + * between the taskq and the zone. + * 2) An application opens a file with O_SYNC. Each write will result in + * an operation which we'll see here plus a low-level vdev write from + * that zone. + * 3) An application does write syscalls followed by an fsync(). We'll + * count the writes going into a TXG here. We'll also see some number + * (usually much smaller, maybe only 1) of low-level vdev writes from this + * zone when the fsync is performed, plus some other low-level vdev writes + * from the taskq in zone 0 (are these metadata writes?). + * + * 4) In addition to the above, there are misc. system-level writes, such as + * writing out dirty pages to swap, or sync(2) calls, which will be handled + * by the global zone and which we count but don't generally worry about. + * + * Because of the above, we can see writes twice because this is called + * at a high level by a zone thread, but we also will count the phys. writes + * that are performed at a low level via zfs_zone_zio_start. + * + * Without this, it can look like a non-global zone never writes (case 1). + * Depending on when the TXG is flushed, the counts may be in the same sample + * bucket or in a different one. + * + * Tracking read operations is simpler due to their synchronous semantics. The + * zfs_read function -- called as a result of a read(2) syscall -- will always + * retrieve the data to be read through dmu_read_uio. + */ +void +zfs_zone_io_throttle(zfs_zone_iop_type_t type) +{ + zone_t *zonep = curzone; + hrtime_t unow; + uint16_t wait; + + unow = GET_USEC_TIME; + + /* + * Only bump the counters for logical operations here. The counters for + * tracking physical IO operations are handled in zfs_zone_zio_done. + */ + if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) { + mutex_enter(&zonep->zone_stg_io_lock); + add_iop(zonep, unow, type, 0); + mutex_exit(&zonep->zone_stg_io_lock); + } + + if (!zfs_zone_delay_enable) + return; + + /* + * XXX There's a potential race here in that more than one thread may + * update the zone delays concurrently. The worst outcome is corruption + * of our data to track each zone's IO, so the algorithm may make + * incorrect throttling decisions until the data is refreshed. + */ + if ((unow - zfs_zone_last_checked) > zfs_zone_adjust_time) { + zfs_zone_wait_adjust(unow); + zfs_zone_last_checked = unow; + } + + if ((wait = zonep->zone_io_delay) > 0) { + /* + * If this is a write and we're doing above normal TxG + * flushing, then throttle for longer than normal. + */ + if (type == ZFS_ZONE_IOP_LOGICAL_WRITE && + (txg_cnt > 1 || txg_flush_rate > 1)) + wait *= zfs_zone_txg_throttle_scale; + + /* + * sdt:::zfs-zone-wait + * + * arg0: zone ID + * arg1: type of IO operation + * arg2: time to delay (in us) + */ + extern void __dtrace_probe_zfs__zone__wait( + uintptr_t, uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__wait((uintptr_t)(zonep->zone_id), + (uintptr_t)type, (uintptr_t)wait); + + drv_usecwait(wait); + + if (zonep->zone_vfs_stats != NULL) { + atomic_inc_64(&zonep->zone_vfs_stats-> + zv_delay_cnt.value.ui64); + atomic_add_64(&zonep->zone_vfs_stats-> + zv_delay_time.value.ui64, wait); + } + } +} + +/* + * XXX Ignore the pool pointer parameter for now. + * + * Keep track to see if the TxG flush rate is running above the expected rate. + * If so, this implies that we are filling TxG's at a high rate due to a heavy + * write workload. 
We use this as input into the zone throttle. + * + * This function is called every 5 seconds (zfs_txg_timeout) under a normal + * write load. In this case, the flush rate is going to be 1. When there + * is a heavy write load, TxG's fill up fast and the sync thread will write + * the TxG more frequently (perhaps once a second). In this case the rate + * will be > 1. The flush rate is a lagging indicator since it can be up + * to 5 seconds old. We use the txg_cnt to keep track of the rate in the + * current 5 second interval and txg_flush_rate to keep track of the previous + * 5 second interval. In that way we don't have a period (1 or more seconds) + * where the txg_cnt == 0 and we cut back on throttling even though the rate + * is still high. + */ +/*ARGSUSED*/ +void +zfs_zone_report_txg_sync(void *dp) +{ + uint_t now; + + txg_cnt++; + now = (uint_t)(gethrtime() / NANOSEC); + if ((now - txg_last_check) >= zfs_txg_timeout) { + txg_flush_rate = txg_cnt / 2; + txg_cnt = 0; + txg_last_check = now; + } +} + +int +zfs_zone_txg_delay() +{ + zone_t *zonep = curzone; + int delay = 1; + + if (zonep->zone_io_util_above_avg) + delay = zfs_zone_txg_delay_ticks; + + extern void __dtrace_probe_zfs__zone__txg__delay(uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__txg__delay((uintptr_t)(zonep->zone_id), + (uintptr_t)delay); + + return (delay); +} + +/* + * Called from zio_vdev_io_start when an IO hits the end of the zio pipeline + * and is issued. + * Keep track of start time for latency calculation in zfs_zone_zio_done. + */ +void +zfs_zone_zio_start(zio_t *zp) +{ + zone_t *zonep; + + /* + * I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not for + * an actual I/O operation. Ignore those operations as they relate to + * throttling and scheduling. + */ + if (zp->io_type == ZIO_TYPE_IOCTL) + return; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_zfs_lock); + if (zp->io_type == ZIO_TYPE_READ) + kstat_runq_enter(&zonep->zone_zfs_rwstats); + zonep->zone_zfs_weight = 0; + mutex_exit(&zonep->zone_zfs_lock); + + mutex_enter(&zfs_disk_lock); + zp->io_dispatched = gethrtime(); + + if (zfs_disk_rcnt++ != 0) + zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate); + zfs_disk_rlastupdate = zp->io_dispatched; + mutex_exit(&zfs_disk_lock); + + zone_rele(zonep); +} + +/* + * Called from vdev_queue_io_done when an IO completes. + * Increment our counter for zone ops. + * Calculate the IO latency avg. for this zone. + */ +void +zfs_zone_zio_done(zio_t *zp) +{ + zone_t *zonep; + hrtime_t now, unow, udelta; + + if (zp->io_type == ZIO_TYPE_IOCTL) + return; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + now = gethrtime(); + unow = NANO_TO_MICRO(now); + udelta = unow - NANO_TO_MICRO(zp->io_dispatched); + + mutex_enter(&zonep->zone_zfs_lock); + + /* + * To calculate the wsvc_t average, keep a cumulative sum of all the + * wait time before each I/O was dispatched. Since most writes are + * asynchronous, only track the wait time for read I/Os. 
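+ *
+ * As a sketch with hypothetical values: a read zio created at io_start
+ * and dispatched 500 usecs later adds that interval (as a gethrtime()
+ * delta) to the zone's zz_waittime kstat; a consumer can divide the
+ * cumulative wait by the read count to recover the average wait per
+ * read over a sampling interval.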
+ */ + if (zp->io_type == ZIO_TYPE_READ) { + zonep->zone_zfs_rwstats.reads++; + zonep->zone_zfs_rwstats.nread += zp->io_size; + + zonep->zone_zfs_stats->zz_waittime.value.ui64 += + zp->io_dispatched - zp->io_start; + + kstat_runq_exit(&zonep->zone_zfs_rwstats); + } else { + zonep->zone_zfs_rwstats.writes++; + zonep->zone_zfs_rwstats.nwritten += zp->io_size; + } + + mutex_exit(&zonep->zone_zfs_lock); + + mutex_enter(&zfs_disk_lock); + zfs_disk_rcnt--; + zfs_disk_rtime += (now - zfs_disk_rlastupdate); + zfs_disk_rlastupdate = now; + mutex_exit(&zfs_disk_lock); + + if (zfs_zone_delay_enable) { + mutex_enter(&zonep->zone_stg_io_lock); + add_iop(zonep, unow, zp->io_type == ZIO_TYPE_READ ? + ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta); + mutex_exit(&zonep->zone_stg_io_lock); + } + + zone_rele(zonep); + + /* + * sdt:::zfs-zone-latency + * + * arg0: zone ID + * arg1: type of I/O operation + * arg2: I/O latency (in us) + */ + extern void __dtrace_probe_zfs__zone__latency( + uintptr_t, uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__latency((uintptr_t)(zp->io_zoneid), + (uintptr_t)(zp->io_type), (uintptr_t)(udelta)); +} + +void +zfs_zone_zio_dequeue(zio_t *zp) +{ + zone_t *zonep; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_stg_io_lock); + ASSERT(zonep->zone_zfs_queued > 0); + if (zonep->zone_zfs_queued == 0) + cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0"); + else + zonep->zone_zfs_queued--; + mutex_exit(&zonep->zone_stg_io_lock); + zone_rele(zonep); +} + +void +zfs_zone_zio_enqueue(zio_t *zp) +{ + zone_t *zonep; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_stg_io_lock); + zonep->zone_zfs_queued++; + mutex_exit(&zonep->zone_stg_io_lock); + zone_rele(zonep); +} + +/* + * Called from vdev_queue_io_to_issue. This function is where zio's are found + * at the head of the queue (by avl_first), then pulled off (by + * vdev_queue_io_remove) and issued. We do our scheduling here to find the + * next zio to issue. + * + * The vq->vq_lock mutex is held when we're executing this function so we + * can safely access the "last zone" variable on the queue. + */ +zio_t * +zfs_zone_schedule(vdev_queue_t *vq) +{ + int cnt; + zoneid_t last_zone; + zio_t *zp; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + + cnt = avl_numnodes(&vq->vq_deadline_tree); + last_zone = vq->vq_last_zone_id; + + /* + * If there are only a few ops in the queue then just issue the head. + * If there are more than a few ops already queued up, then use + * scheduling to get the next zio. + */ + if (!zfs_zone_schedule_enable || cnt < zfs_zone_schedule_thresh) + zp = avl_first(&vq->vq_deadline_tree); + else + zp = get_next_zio(vq, cnt); + + vq->vq_last_zone_id = zp->io_zoneid; + + /* + * Probe with 3 args; the number of IOs in the queue, the zone that + * was last scheduled off this queue, and the zone that was associated + * with the next IO that is scheduled. 
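+ *
+ * The probe can be observed from the global zone with an ordinary DTrace
+ * invocation, for example (illustrative one-liner only):
+ *
+ *   dtrace -n 'sdt:::zfs-zone-sched
+ *       { printf("depth %d last %d next %d", arg0, arg1, arg2); }'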
+ */ + extern void __dtrace_probe_zfs__zone__sched(uintptr_t, uintptr_t, + uintptr_t); + + __dtrace_probe_zfs__zone__sched((uintptr_t)(cnt), + (uintptr_t)(last_zone), (uintptr_t)(zp->io_zoneid)); + + return (zp); +} + +#endif diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 64e9acbae1..89c88bc181 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -36,6 +36,7 @@ #include <sys/dmu_objset.h> #include <sys/arc.h> #include <sys/ddt.h> +#include <sys/zfs_zone.h> /* * ========================================================================== @@ -501,6 +502,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio = kmem_cache_alloc(zio_cache, KM_SLEEP); bzero(zio, sizeof (zio_t)); + zio->io_start = gethrtime(); + mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); @@ -552,11 +555,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_bookmark = *zb; if (pio != NULL) { + zio->io_zoneid = pio->io_zoneid; if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child(pio, zio); + } else { + zfs_zone_zio_init(zio); } return (zio); @@ -894,6 +900,8 @@ zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; + zio->io_start = gethrtime(); + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_child_type == ZIO_CHILD_LOGICAL && !(zio->io_flags & ZIO_FLAG_RAW)) { @@ -2279,6 +2287,9 @@ zio_vdev_io_start(zio_t *zio) ASSERT(zio->io_error == 0); ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); + if (zio->io_type == ZIO_TYPE_WRITE) + zio->io_start = gethrtime(); + if (vd == NULL) { if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_enter(spa, SCL_ZIO, zio, RW_READER); diff --git a/usr/src/uts/common/fs/zfs/zio_checksum.c b/usr/src/uts/common/fs/zfs/zio_checksum.c index c8fe20f2eb..c7dd90c45d 100644 --- a/usr/src/uts/common/fs/zfs/zio_checksum.c +++ b/usr/src/uts/common/fs/zfs/zio_checksum.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Joyent, Inc. All rights reserved. 
*/ #include <sys/zfs_context.h> @@ -77,6 +78,8 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"}, {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256"}, {{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"}, + {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256_mac"}, + {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "noparity"}, }; enum zio_checksum diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c index df9a16bccb..4dc63888fd 100644 --- a/usr/src/uts/common/fs/zfs/zvol.c +++ b/usr/src/uts/common/fs/zfs/zvol.c @@ -76,9 +76,11 @@ #include <sys/zfs_rlock.h> #include <sys/vdev_disk.h> #include <sys/vdev_impl.h> +#include <sys/vdev_raidz.h> #include <sys/zvol.h> #include <sys/dumphdr.h> #include <sys/zil_impl.h> +#include <sys/sdt.h> #include "zfs_namecheck.h" @@ -1059,27 +1061,28 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid, } static int -zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size, - boolean_t doread, boolean_t isdump) +zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset, + uint64_t size, boolean_t doread, boolean_t isdump) { vdev_disk_t *dvd; int c; int numerrors = 0; - for (c = 0; c < vd->vdev_children; c++) { - ASSERT(vd->vdev_ops == &vdev_mirror_ops || - vd->vdev_ops == &vdev_replacing_ops || - vd->vdev_ops == &vdev_spare_ops); - int err = zvol_dumpio_vdev(vd->vdev_child[c], - addr, offset, size, doread, isdump); - if (err != 0) { - numerrors++; - } else if (doread) { - break; + if (vd->vdev_ops == &vdev_mirror_ops || + vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops) { + for (c = 0; c < vd->vdev_children; c++) { + int err = zvol_dumpio_vdev(vd->vdev_child[c], + addr, offset, origoffset, size, doread, isdump); + if (err != 0) { + numerrors++; + } else if (doread) { + break; + } } } - if (!vd->vdev_ops->vdev_op_leaf) + if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops) return (numerrors < vd->vdev_children ? 0 : EIO); if (doread && !vdev_readable(vd)) @@ -1087,19 +1090,27 @@ zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size, else if (!doread && !vdev_writeable(vd)) return (EIO); - dvd = vd->vdev_tsd; - ASSERT3P(dvd, !=, NULL); + if (vd->vdev_ops == &vdev_raidz_ops) { + return (vdev_raidz_physio(vd, + addr, size, offset, origoffset, doread)); + } + offset += VDEV_LABEL_START_SIZE; if (ddi_in_panic() || isdump) { ASSERT(!doread); if (doread) return (EIO); + dvd = vd->vdev_tsd; + ASSERT3P(dvd, !=, NULL); return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset), lbtodb(size))); } else { - return (vdev_disk_physio(dvd->vd_lh, addr, size, offset, - doread ? B_READ : B_WRITE)); + dvd = vd->vdev_tsd; + ASSERT3P(dvd, !=, NULL); + + return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size, + offset, doread ? 
B_READ : B_WRITE)); } } @@ -1131,7 +1142,8 @@ zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size, vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva)); offset += DVA_GET_OFFSET(&ze->ze_dva); - error = zvol_dumpio_vdev(vd, addr, offset, size, doread, isdump); + error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva), + size, doread, isdump); if (!ddi_in_panic()) spa_config_exit(spa, SCL_STATE, FTAG); @@ -1322,6 +1334,8 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) return (error); } + DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 0); + rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, RL_READER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { @@ -1340,6 +1354,10 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) } } zfs_range_unlock(rl); + + DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 0, int, + error); + return (error); } @@ -1369,6 +1387,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) return (error); } + DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 1); + sync = !(zv->zv_flags & ZVOL_WCE) || (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); @@ -1399,6 +1419,10 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) zfs_range_unlock(rl); if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); + + DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 1, int, + error); + return (error); } @@ -1852,7 +1876,7 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize) ZIO_COMPRESS_OFF) == 0); VERIFY(nvlist_add_uint64(nv, zfs_prop_to_name(ZFS_PROP_CHECKSUM), - ZIO_CHECKSUM_OFF) == 0); + ZIO_CHECKSUM_NOPARITY) == 0); if (version >= SPA_VERSION_DEDUP) { VERIFY(nvlist_add_uint64(nv, zfs_prop_to_name(ZFS_PROP_DEDUP), |