summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/fs
diff options
context:
space:
mode:
authorahrens <none@none>2005-10-31 11:33:35 -0800
committerahrens <none@none>2005-10-31 11:33:35 -0800
commitfa9e4066f08beec538e775443c5be79dd423fcab (patch)
tree576d99665e57bb7cb70584431adb08c14d47e3ce /usr/src/uts/common/fs
parentf1b64740276f67fc6914c1d855f2af601efe99ac (diff)
downloadillumos-gate-fa9e4066f08beec538e775443c5be79dd423fcab.tar.gz
PSARC 2002/240 ZFS
6338653 Integrate ZFS PSARC 2004/652 - DKIOCFLUSH 5096886 Write caching disks need mechanism to flush cache to physical media
Diffstat (limited to 'usr/src/uts/common/fs')
-rw-r--r--usr/src/uts/common/fs/ctfs/ctfs_all.c4
-rw-r--r--usr/src/uts/common/fs/ctfs/ctfs_ctl.c4
-rw-r--r--usr/src/uts/common/fs/ctfs/ctfs_event.c4
-rw-r--r--usr/src/uts/common/fs/ctfs/ctfs_tdir.c4
-rw-r--r--usr/src/uts/common/fs/devfs/devfs_subr.c32
-rw-r--r--usr/src/uts/common/fs/devfs/devfs_vnops.c10
-rw-r--r--usr/src/uts/common/fs/fs_subr.c84
-rw-r--r--usr/src/uts/common/fs/fs_subr.h3
-rw-r--r--usr/src/uts/common/fs/lookup.c41
-rw-r--r--usr/src/uts/common/fs/nfs/nfs4_acl.c203
-rw-r--r--usr/src/uts/common/fs/nfs/nfs4_srv_attr.c6
-rw-r--r--usr/src/uts/common/fs/nfs/nfs4_vnops.c4
-rw-r--r--usr/src/uts/common/fs/nfs/nfs_acl_srv.c42
-rw-r--r--usr/src/uts/common/fs/proc/prioctl.c18
-rw-r--r--usr/src/uts/common/fs/proc/prvnops.c22
-rw-r--r--usr/src/uts/common/fs/vnode.c45
-rw-r--r--usr/src/uts/common/fs/zfs/arc.c1998
-rw-r--r--usr/src/uts/common/fs/zfs/bplist.c239
-rw-r--r--usr/src/uts/common/fs/zfs/dbuf.c2022
-rw-r--r--usr/src/uts/common/fs/zfs/dmu.c1761
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_object.c149
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_objset.c727
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_traverse.c792
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_tx.c801
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_zfetch.c603
-rw-r--r--usr/src/uts/common/fs/zfs/dnode.c1304
-rw-r--r--usr/src/uts/common/fs/zfs/dnode_sync.c560
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_dataset.c1463
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_dir.c1217
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_pool.c233
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_prop.c367
-rw-r--r--usr/src/uts/common/fs/zfs/fletcher.c100
-rw-r--r--usr/src/uts/common/fs/zfs/lzjb.c125
-rw-r--r--usr/src/uts/common/fs/zfs/metaslab.c796
-rw-r--r--usr/src/uts/common/fs/zfs/refcount.c194
-rw-r--r--usr/src/uts/common/fs/zfs/sha256.c131
-rw-r--r--usr/src/uts/common/fs/zfs/spa.c1784
-rw-r--r--usr/src/uts/common/fs/zfs/spa_config.c308
-rw-r--r--usr/src/uts/common/fs/zfs/spa_misc.c848
-rw-r--r--usr/src/uts/common/fs/zfs/space_map.c406
-rw-r--r--usr/src/uts/common/fs/zfs/sys/arc.h90
-rw-r--r--usr/src/uts/common/fs/zfs/sys/bplist.h83
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dbuf.h302
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dmu.h635
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dmu_impl.h230
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dmu_objset.h122
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dmu_traverse.h125
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dmu_tx.h154
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h76
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dnode.h301
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dsl_dataset.h164
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dsl_dir.h143
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dsl_pool.h82
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dsl_prop.h75
-rw-r--r--usr/src/uts/common/fs/zfs/sys/metaslab.h73
-rw-r--r--usr/src/uts/common/fs/zfs/sys/metaslab_impl.h125
-rw-r--r--usr/src/uts/common/fs/zfs/sys/refcount.h105
-rw-r--r--usr/src/uts/common/fs/zfs/sys/spa.h406
-rw-r--r--usr/src/uts/common/fs/zfs/sys/spa_impl.h118
-rw-r--r--usr/src/uts/common/fs/zfs/sys/space_map.h144
-rw-r--r--usr/src/uts/common/fs/zfs/sys/txg.h120
-rw-r--r--usr/src/uts/common/fs/zfs/sys/txg_impl.h77
-rw-r--r--usr/src/uts/common/fs/zfs/sys/uberblock.h50
-rw-r--r--usr/src/uts/common/fs/zfs/sys/uberblock_impl.h76
-rw-r--r--usr/src/uts/common/fs/zfs/sys/unique.h56
-rw-r--r--usr/src/uts/common/fs/zfs/sys/vdev.h135
-rw-r--r--usr/src/uts/common/fs/zfs/sys/vdev_disk.h52
-rw-r--r--usr/src/uts/common/fs/zfs/sys/vdev_file.h46
-rw-r--r--usr/src/uts/common/fs/zfs/sys/vdev_impl.h287
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zap.h353
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zap_impl.h190
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zap_leaf.h204
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zfs_acl.h113
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zfs_context.h71
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zfs_ctldir.h74
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zfs_debug.h73
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zfs_dir.h70
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h187
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h116
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zfs_znode.h283
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zil.h242
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zil_impl.h111
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zio.h298
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zio_checksum.h74
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zio_compress.h75
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zio_impl.h208
-rw-r--r--usr/src/uts/common/fs/zfs/txg.c583
-rw-r--r--usr/src/uts/common/fs/zfs/uberblock.c67
-rw-r--r--usr/src/uts/common/fs/zfs/unique.c107
-rw-r--r--usr/src/uts/common/fs/zfs/vdev.c1738
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_cache.c374
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_disk.c307
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_file.c223
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_label.c848
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_mirror.c414
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_missing.c89
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_queue.c286
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_raidz.c599
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_root.c98
-rw-r--r--usr/src/uts/common/fs/zfs/zap.c1010
-rw-r--r--usr/src/uts/common/fs/zfs/zap_leaf.c883
-rw-r--r--usr/src/uts/common/fs/zfs/zap_micro.c823
-rw-r--r--usr/src/uts/common/fs/zfs/zfs.conf28
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_acl.c1537
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_byteswap.c99
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_ctldir.c936
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_dir.c853
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_ioctl.c1323
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_log.c337
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_replay.c337
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_vfsops.c1072
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_vnops.c3663
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_znode.c1286
-rw-r--r--usr/src/uts/common/fs/zfs/zil.c1242
-rw-r--r--usr/src/uts/common/fs/zfs/zio.c1698
-rw-r--r--usr/src/uts/common/fs/zfs/zio_checksum.c174
-rw-r--r--usr/src/uts/common/fs/zfs/zio_compress.c134
-rw-r--r--usr/src/uts/common/fs/zfs/zvol.c793
118 files changed, 50463 insertions, 146 deletions
diff --git a/usr/src/uts/common/fs/ctfs/ctfs_all.c b/usr/src/uts/common/fs/ctfs/ctfs_all.c
index dd3eeb15b6..4933edd960 100644
--- a/usr/src/uts/common/fs/ctfs/ctfs_all.c
+++ b/usr/src/uts/common/fs/ctfs/ctfs_all.c
@@ -99,7 +99,7 @@ ctfs_adir_do_lookup(vnode_t *vp, const char *nm, vnode_t **vpp, ino64_t *inop)
if (*nm != '\0')
return (ENOENT);
- ct = contract_ptr(i, VTOZ(vp)->zone_uniqid);
+ ct = contract_ptr(i, VTOZONE(vp)->zone_uniqid);
if (ct == NULL)
return (ENOENT);
@@ -118,7 +118,7 @@ ctfs_adir_do_readdir(vnode_t *vp, struct dirent64 *dp, int *eofp,
uint64_t zuniqid;
ctid_t next;
- zuniqid = VTOZ(vp)->zone_uniqid;
+ zuniqid = VTOZONE(vp)->zone_uniqid;
next = contract_lookup(zuniqid, *offp);
if (next == -1) {
diff --git a/usr/src/uts/common/fs/ctfs/ctfs_ctl.c b/usr/src/uts/common/fs/ctfs/ctfs_ctl.c
index a13091826c..f4980d4a97 100644
--- a/usr/src/uts/common/fs/ctfs/ctfs_ctl.c
+++ b/usr/src/uts/common/fs/ctfs/ctfs_ctl.c
@@ -249,11 +249,11 @@ ctfs_stat_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr,
detail = STRUCT_FGET(st, ctst_detail);
if (detail == CTD_COMMON) {
mutex_enter(&ct->ct_lock);
- contract_status_common(ct, VTOZ(vp), STRUCT_BUF(st), mdl);
+ contract_status_common(ct, VTOZONE(vp), STRUCT_BUF(st), mdl);
mutex_exit(&ct->ct_lock);
} else if (detail <= CTD_ALL) {
VERIFY(nvlist_alloc(&foo, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- type->ct_type_ops->contop_status(ct, VTOZ(vp), detail, foo,
+ type->ct_type_ops->contop_status(ct, VTOZONE(vp), detail, foo,
STRUCT_BUF(st), mdl);
VERIFY(nvlist_pack(foo, &bufp, &len, NV_ENCODE_NATIVE,
KM_SLEEP) == 0);
diff --git a/usr/src/uts/common/fs/ctfs/ctfs_event.c b/usr/src/uts/common/fs/ctfs/ctfs_event.c
index afb08a7cfc..7fa7cfb608 100644
--- a/usr/src/uts/common/fs/ctfs/ctfs_event.c
+++ b/usr/src/uts/common/fs/ctfs/ctfs_event.c
@@ -287,7 +287,7 @@ ctfs_ev_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr,
ctfs_evnode_t *evnode = vp->v_data;
return (ctfs_endpoint_ioctl(&evnode->ctfs_ev_listener, cmd, arg, cr,
- VTOZ(vp), 0));
+ VTOZONE(vp), 0));
}
/*
@@ -430,7 +430,7 @@ ctfs_bu_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr,
ctfs_bunode_t *bunode = vp->v_data;
return (ctfs_endpoint_ioctl(&bunode->ctfs_bu_listener, cmd, arg, cr,
- VTOZ(vp), bunode->ctfs_bu_queue->ctq_listno == CTEL_BUNDLE));
+ VTOZONE(vp), bunode->ctfs_bu_queue->ctq_listno == CTEL_BUNDLE));
}
/*
diff --git a/usr/src/uts/common/fs/ctfs/ctfs_tdir.c b/usr/src/uts/common/fs/ctfs/ctfs_tdir.c
index 479f64b064..1f5dd42370 100644
--- a/usr/src/uts/common/fs/ctfs/ctfs_tdir.c
+++ b/usr/src/uts/common/fs/ctfs/ctfs_tdir.c
@@ -108,7 +108,7 @@ ctfs_tdir_do_readdir(vnode_t *vp, struct dirent64 *dp, int *eofp,
ctid_t next;
ct_type_t *ty = ct_types[gfs_file_index(vp)];
- zuniqid = VTOZ(vp)->zone_uniqid;
+ zuniqid = VTOZONE(vp)->zone_uniqid;
next = contract_type_lookup(ty, zuniqid, *offp);
if (next == -1) {
@@ -135,7 +135,7 @@ ctfs_tdir_do_lookup(vnode_t *vp, const char *nm, vnode_t **vpp, ino64_t *inop)
return (ENOENT);
ct = contract_type_ptr(ct_types[gfs_file_index(vp)], i,
- VTOZ(vp)->zone_uniqid);
+ VTOZONE(vp)->zone_uniqid);
if (ct == NULL)
return (ENOENT);
diff --git a/usr/src/uts/common/fs/devfs/devfs_subr.c b/usr/src/uts/common/fs/devfs/devfs_subr.c
index 0f53a24ca0..864ed2ad60 100644
--- a/usr/src/uts/common/fs/devfs/devfs_subr.c
+++ b/usr/src/uts/common/fs/devfs/devfs_subr.c
@@ -569,20 +569,6 @@ dv_vattr_merge(struct dv_node *dv, struct vattr *vap)
}
/*
- * Free a vsecattr
- */
-static void
-dv_free_vsa(struct vsecattr *vsap)
-{
- if (vsap->vsa_aclcnt > 0 && vsap->vsa_aclentp)
- kmem_free(vsap->vsa_aclentp,
- vsap->vsa_aclcnt * sizeof (aclent_t));
- if (vsap->vsa_dfaclcnt > 0 && vsap->vsa_dfaclentp)
- kmem_free(vsap->vsa_dfaclentp,
- vsap->vsa_dfaclcnt * sizeof (aclent_t));
-}
-
-/*
* dv_shadow_node
*
* Given a VDIR dv_node, find/create the associated VDIR
@@ -623,7 +609,6 @@ dv_shadow_node(
int create_tried;
int error;
mperm_t mp;
- struct vsecattr vsa;
ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
dv = VTODV(vp);
@@ -678,19 +663,14 @@ lookup:
dv->dv_attrvp = rvp; /* with one hold */
/*
- * Determine if we have (non-trivial) ACLs on this node.
- * NB: This should be changed call fs_acl_nontrivial for
- * new ACE flavor ACLs.
+ * Determine if we have non-trivial ACLs on this node.
+ * It is not necessary to VOP_RWLOCK since fs_acl_nontrivial
+ * only does VOP_GETSECATTR.
*/
- vsa.vsa_mask = VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT;
- error = VOP_GETSECATTR(rvp, &vsa, 0, cred);
dv->dv_flags &= ~DV_ACL;
- if (error == 0) {
- if (vsa.vsa_aclcnt > MIN_ACL_ENTRIES) {
- dv->dv_flags |= DV_ACL; /* non-trivial ACL */
- }
- dv_free_vsa(&vsa);
- }
+
+ if (fs_acl_nontrivial(rvp, cred))
+ dv->dv_flags |= DV_ACL;
/*
* If we have synced out the memory attributes, free
diff --git a/usr/src/uts/common/fs/devfs/devfs_vnops.c b/usr/src/uts/common/fs/devfs/devfs_vnops.c
index 7a3d4c1c04..b8dfce5448 100644
--- a/usr/src/uts/common/fs/devfs/devfs_vnops.c
+++ b/usr/src/uts/common/fs/devfs/devfs_vnops.c
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -621,7 +621,6 @@ devfs_getsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
error = VOP_GETSECATTR(avp, vsap, flags, cr);
dsysdebug(error, ("vop_getsecattr %s %d\n", VTODV(vp)->dv_name, error));
-
rw_exit(&dv->dv_contents);
return (error);
}
@@ -678,10 +677,11 @@ devfs_setsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
VOP_RWUNLOCK(avp, V_WRITELOCK_TRUE, NULL);
/*
- * NB: This code should call fs_acl_nontrivial when available so that
- * DV_ACL is only set on nontrivial ACLs.
+ * Set DV_ACL if we have a non-trivial set of ACLs. It is not
+ * necessary to hold VOP_RWLOCK since fs_acl_nontrivial only does
+ * VOP_GETSECATTR calls.
*/
- if (error == 0)
+ if (fs_acl_nontrivial(avp, cr))
dv->dv_flags |= DV_ACL;
return (error);
}
diff --git a/usr/src/uts/common/fs/fs_subr.c b/usr/src/uts/common/fs/fs_subr.c
index 7fc9dc4277..3466db3832 100644
--- a/usr/src/uts/common/fs/fs_subr.c
+++ b/usr/src/uts/common/fs/fs_subr.c
@@ -24,7 +24,7 @@
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -57,6 +57,7 @@
#include <sys/kmem.h>
#include <sys/file.h>
#include <sys/nbmlock.h>
+#include <acl/acl_common.h>
static callb_cpr_t *frlock_serialize_blocked(flk_cb_when_t, void *);
@@ -632,3 +633,84 @@ fs_vnevent_support(vnode_t *vp, vnevent_t vnevent)
ASSERT(vp != NULL);
return (0);
}
+
+/*
+ * return 1 for non-trivial ACL.
+ *
+ * NB: It is not necessary for the caller to VOP_RWLOCK since
+ * we only issue VOP_GETSECATTR.
+ *
+ * Returns 0 == trivial
+ * 1 == NOT Trivial
+ * <0 could not determine.
+ */
+int
+fs_acl_nontrivial(vnode_t *vp, cred_t *cr)
+{
+ ulong_t acl_styles;
+ ulong_t acl_flavor;
+ vsecattr_t vsecattr;
+ int error;
+ int isnontrivial;
+
+ /* determine the forms of ACLs maintained */
+ error = VOP_PATHCONF(vp, _PC_ACL_ENABLED, &acl_styles, cr);
+
+ /* clear bits we don't understand and establish default acl_style */
+ acl_styles &= (_ACL_ACLENT_ENABLED | _ACL_ACE_ENABLED);
+ if (error || (acl_styles == 0))
+ acl_styles = _ACL_ACLENT_ENABLED;
+
+ vsecattr.vsa_aclentp = NULL;
+ vsecattr.vsa_dfaclentp = NULL;
+ vsecattr.vsa_aclcnt = 0;
+ vsecattr.vsa_dfaclcnt = 0;
+
+ while (acl_styles) {
+ /* select one of the styles as current flavor */
+ acl_flavor = 0;
+ if (acl_styles & _ACL_ACLENT_ENABLED) {
+ acl_flavor = _ACL_ACLENT_ENABLED;
+ vsecattr.vsa_mask = VSA_ACLCNT | VSA_DFACLCNT;
+ } else if (acl_styles & _ACL_ACE_ENABLED) {
+ acl_flavor = _ACL_ACE_ENABLED;
+ vsecattr.vsa_mask = VSA_ACECNT | VSA_ACE;
+ }
+
+ ASSERT(vsecattr.vsa_mask && acl_flavor);
+ error = VOP_GETSECATTR(vp, &vsecattr, 0, cr);
+ if (error == 0)
+ break;
+
+ /* that flavor failed */
+ acl_styles &= ~acl_flavor;
+ }
+
+ /* if all styles fail then assume trivial */
+ if (acl_styles == 0)
+ return (0);
+
+ /* process the flavor that worked */
+ isnontrivial = 0;
+ if (acl_flavor & _ACL_ACLENT_ENABLED) {
+ if (vsecattr.vsa_aclcnt > MIN_ACL_ENTRIES)
+ isnontrivial = 1;
+ if (vsecattr.vsa_aclcnt && vsecattr.vsa_aclentp != NULL)
+ kmem_free(vsecattr.vsa_aclentp,
+ vsecattr.vsa_aclcnt * sizeof (aclent_t));
+ if (vsecattr.vsa_dfaclcnt && vsecattr.vsa_dfaclentp != NULL)
+ kmem_free(vsecattr.vsa_dfaclentp,
+ vsecattr.vsa_dfaclcnt * sizeof (aclent_t));
+ }
+ if (acl_flavor & _ACL_ACE_ENABLED) {
+
+ isnontrivial = ace_trivial(vsecattr.vsa_aclentp,
+ vsecattr.vsa_aclcnt);
+
+ if (vsecattr.vsa_aclcnt && vsecattr.vsa_aclentp != NULL)
+ kmem_free(vsecattr.vsa_aclentp,
+ vsecattr.vsa_aclcnt * sizeof (ace_t));
+ /* ACE has no vsecattr.vsa_dfaclcnt */
+ }
+ return (isnontrivial);
+}
diff --git a/usr/src/uts/common/fs/fs_subr.h b/usr/src/uts/common/fs/fs_subr.h
index 27fc845718..8cd453edba 100644
--- a/usr/src/uts/common/fs/fs_subr.h
+++ b/usr/src/uts/common/fs/fs_subr.h
@@ -23,7 +23,7 @@
/* All Rights Reserved */
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -81,6 +81,7 @@ extern int fs_shrlock(struct vnode *, int, struct shrlock *, int,
cred_t *);
extern int fs_vnevent_nosupport(vnode_t *, vnevent_t);
extern int fs_vnevent_support(vnode_t *, vnevent_t);
+extern int fs_acl_nontrivial(struct vnode *vp, struct cred *cr);
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/fs/lookup.c b/usr/src/uts/common/fs/lookup.c
index 7fd7f66510..b7fdf996e2 100644
--- a/usr/src/uts/common/fs/lookup.c
+++ b/usr/src/uts/common/fs/lookup.c
@@ -789,7 +789,7 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf,
size_t dbuflen;
struct iovec iov;
struct uio uio;
- int err;
+ int error;
int eof;
vnode_t *cmpvp;
struct dirent64 *dp;
@@ -811,8 +811,8 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf,
uio.uio_extflg = UIO_COPY_CACHED;
uio.uio_loffset = 0;
- if ((err = VOP_ACCESS(dvp, VREAD, 0, cr)) != 0)
- return (err);
+ if ((error = VOP_ACCESS(dvp, VREAD, 0, cr)) != 0)
+ return (error);
while (!eof) {
uio.uio_resid = dlen;
@@ -820,12 +820,12 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf,
iov.iov_len = dlen;
(void) VOP_RWLOCK(dvp, V_WRITELOCK_FALSE, NULL);
- err = VOP_READDIR(dvp, &uio, cr, &eof);
+ error = VOP_READDIR(dvp, &uio, cr, &eof);
VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL);
dbuflen = dlen - uio.uio_resid;
- if (err || dbuflen == 0)
+ if (error || dbuflen == 0)
break;
dp = (dirent64_t *)dbuf;
@@ -840,7 +840,7 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf,
continue;
}
- err = VOP_LOOKUP(dvp, dp->d_name, &cmpvp, &pnp, 0,
+ error = VOP_LOOKUP(dvp, dp->d_name, &cmpvp, &pnp, 0,
vrootp, cr);
/*
@@ -849,7 +849,7 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf,
* just removed an entry since the readdir() call, and
* the entry we want is further on in the directory.
*/
- if (err == 0) {
+ if (error == 0) {
if (vnode_match(tvp, cmpvp, cr)) {
VN_RELE(cmpvp);
*rdp = dp;
@@ -857,8 +857,8 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf,
}
VN_RELE(cmpvp);
- } else if (err != ENOENT) {
- return (err);
+ } else if (error != ENOENT) {
+ return (error);
}
dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
@@ -868,13 +868,26 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf,
/*
* Something strange has happened, this directory does not contain the
* specified vnode. This should never happen in the normal case, since
- * we ensured that dvp is the parent of vp. This may be possible in
- * some race conditions, so fail gracefully.
+ * we ensured that dvp is the parent of vp. This is possible in some
+ * rare conditions (races and the special .zfs directory).
*/
- if (err == 0)
- err = ENOENT;
+ if (error == 0) {
+ error = VOP_LOOKUP(dvp, ".zfs", &cmpvp, &pnp, 0, vrootp, cr);
+ if (error == 0) {
+ if (vnode_match(tvp, cmpvp, cr)) {
+ (void) strcpy(dp->d_name, ".zfs");
+ dp->d_reclen = strlen(".zfs");
+ dp->d_off = 2;
+ dp->d_ino = 1;
+ *rdp = dp;
+ } else {
+ error = ENOENT;
+ }
+ VN_RELE(cmpvp);
+ }
+ }
- return (err);
+ return (error);
}
/*
diff --git a/usr/src/uts/common/fs/nfs/nfs4_acl.c b/usr/src/uts/common/fs/nfs/nfs4_acl.c
index 9b584f6256..96aa1756e9 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_acl.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_acl.c
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -80,10 +80,15 @@ static int ace4_list_to_aent(ace4_list_t *, aclent_t **, int *, uid_t, gid_t,
static int ln_ace4_to_aent(nfsace4 *ace4, int n, uid_t, gid_t,
aclent_t **, int *, aclent_t **, int *, int, int, int);
static int ace4_cmp(nfsace4 *, nfsace4 *);
-static int acet_to_ace4(ace_t *, nfsace4 *, int, int);
-static int ace4_to_acet(nfsace4 *, ace_t *, uid_t, gid_t, int, int, int);
+static int acet_to_ace4(ace_t *, nfsace4 *, int);
+static int ace4_to_acet(nfsace4 *, ace_t *, uid_t, gid_t, int, int);
static int validate_idmapping(utf8string *, uid_t, int, int, int);
static int u8s_mapped_to_nobody(utf8string *, uid_t, int);
+static void ace4_mask_to_acet_mask(acemask4, uint32_t *);
+static void acet_mask_to_ace4_mask(uint32_t, acemask4 *);
+static void ace4_flags_to_acet_flags(aceflag4, uint16_t *);
+static void acet_flags_to_ace4_flags(uint16_t, aceflag4 *);
+
/*
* The following two functions check and set ACE4_SYNCRONIZE, ACE4_WRITE_OWNER,
* ACE4_DELETE and ACE4_WRITE_ATTRIBUTES.
@@ -1651,7 +1656,7 @@ ln_ace4_cmp(nfsace4 *a, nfsace4* b, int n)
* strings versus integer uid/gids.
*/
static int
-acet_to_ace4(ace_t *ace, nfsace4 *nfsace4, int isdir, int isserver)
+acet_to_ace4(ace_t *ace, nfsace4 *nfsace4, int isserver)
{
int error = 0;
@@ -1669,44 +1674,45 @@ acet_to_ace4(ace_t *ace, nfsace4 *nfsace4, int isdir, int isserver)
}
switch (ace->a_type) {
- case ALLOW:
+ case ACE_ACCESS_ALLOWED_ACE_TYPE:
nfsace4->type = ACE4_ACCESS_ALLOWED_ACE_TYPE;
break;
- case DENY:
+ case ACE_ACCESS_DENIED_ACE_TYPE:
nfsace4->type = ACE4_ACCESS_DENIED_ACE_TYPE;
break;
default:
+ NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE,
+ "acet_to_ace4: unsupported type: %x", ace->a_type));
error = ENOTSUP;
break;
}
if (error != 0)
goto out;
- nfsace4->access_mask = mode_to_ace4_access(ace->a_access_mask,
- isdir, ace->a_flags & ACE_OWNER, ace->a_type == ALLOW, isserver);
+ acet_mask_to_ace4_mask(ace->a_access_mask, &nfsace4->access_mask);
+ acet_flags_to_ace4_flags(ace->a_flags, &nfsace4->flag);
- nfsace4->flag = (ace->a_flags & ACE_NFSV4_SUP_FLAGS);
- if (ace->a_flags & ACE_GROUPS) {
+ if (ace->a_flags & ACE_GROUP) {
+ nfsace4->flag |= ACE4_IDENTIFIER_GROUP;
+ (void) str_to_utf8(ACE4_WHO_GROUP, &nfsace4->who);
+ } else if (ace->a_flags & ACE_IDENTIFIER_GROUP) {
nfsace4->flag |= ACE4_IDENTIFIER_GROUP;
error = nfs_idmap_gid_str(ace->a_who, &nfsace4->who, isserver);
- } else if (ace->a_flags & ACE_USER) {
- error = nfs_idmap_uid_str(ace->a_who, &nfsace4->who, isserver);
+ if (error != 0)
+ NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE,
+ "acet_to_ace4: idmap failed with %d", error));
} else if (ace->a_flags & ACE_OWNER) {
(void) str_to_utf8(ACE4_WHO_OWNER, &nfsace4->who);
- } else if (ace->a_flags & ACE_GROUP) {
- nfsace4->flag |= ACE4_IDENTIFIER_GROUP;
- (void) str_to_utf8(ACE4_WHO_GROUP, &nfsace4->who);
- } else if (ace->a_flags & ACE_OTHER) {
+ } else if (ace->a_flags & ACE_EVERYONE) {
(void) str_to_utf8(ACE4_WHO_EVERYONE, &nfsace4->who);
+ } else {
+ error = nfs_idmap_uid_str(ace->a_who, &nfsace4->who, isserver);
+ if (error != 0)
+ NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE,
+ "acet_to_ace4: idmap failed with %d", error));
}
out:
-#ifdef DEBUG
- if (error != 0)
- NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE,
- "acet_to_ace4: idmap failed with %d", error));
-#endif
-
return (error);
}
@@ -1716,10 +1722,9 @@ out:
*/
static int
ace4_to_acet(nfsace4 *nfsace4, ace_t *ace, uid_t owner, gid_t group,
- int isdir, int isserver, int just_count)
+ int isserver, int just_count)
{
int error = 0;
- o_mode_t mode;
if (nfsace4 == NULL) {
NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE,
@@ -1734,12 +1739,14 @@ ace4_to_acet(nfsace4 *nfsace4, ace_t *ace, uid_t owner, gid_t group,
switch (nfsace4->type) {
case ACE4_ACCESS_ALLOWED_ACE_TYPE:
- ace->a_type = ALLOW;
+ ace->a_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
break;
case ACE4_ACCESS_DENIED_ACE_TYPE:
- ace->a_type = DENY;
+ ace->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
break;
default:
+ NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE,
+ "ace4_to_acet: unsupported type: %x", nfsace4->type));
error = ENOTSUP;
break;
}
@@ -1761,16 +1768,15 @@ ace4_to_acet(nfsace4 *nfsace4, ace_t *ace, uid_t owner, gid_t group,
goto out;
}
- ace->a_access_mask = nfsace4->access_mask;
- error = ace4_mask_to_mode(nfsace4->access_mask, &mode, isdir);
- if (error != 0)
- goto out;
- ace->a_access_mask = mode;
- if (nfsace4->flag & ~(ACE_NFSV4_SUP_FLAGS | ACE4_IDENTIFIER_GROUP)) {
+ ace4_mask_to_acet_mask(nfsace4->access_mask, &ace->a_access_mask);
+
+ if (nfsace4->flag & ~ACE_NFSV4_SUP_FLAGS) {
+ NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE,
+ "ace4_to_acet: unsupported flags: %x", nfsace4->flag));
error = ENOTSUP;
goto out;
}
- ace->a_flags = (nfsace4->flag & ACE_NFSV4_SUP_FLAGS);
+ ace4_flags_to_acet_flags(nfsace4->flag, &ace->a_flags);
if (nfsace4->flag & ACE4_IDENTIFIER_GROUP) {
if ((nfsace4->who.utf8string_len == 6) &&
@@ -1780,7 +1786,7 @@ ace4_to_acet(nfsace4 *nfsace4, ace_t *ace, uid_t owner, gid_t group,
ace->a_flags |= ACE_GROUP;
error = 0;
} else {
- ace->a_flags |= ACE_GROUPS;
+ ace->a_flags |= ACE_IDENTIFIER_GROUP;
error = nfs_idmap_str_gid(&nfsace4->who,
&ace->a_who, isserver);
if (error != 0) {
@@ -1807,10 +1813,9 @@ ace4_to_acet(nfsace4 *nfsace4, ace_t *ace, uid_t owner, gid_t group,
} else if ((nfsace4->who.utf8string_len == 9) &&
(bcmp(ACE4_WHO_EVERYONE,
nfsace4->who.utf8string_val, 9) == 0)) {
- ace->a_flags |= ACE_OTHER;
+ ace->a_flags |= ACE_EVERYONE;
ace->a_who = 0;
} else {
- ace->a_flags |= ACE_USER;
error = nfs_idmap_str_uid(&nfsace4->who,
&ace->a_who, isserver);
if (error != 0) {
@@ -1830,18 +1835,124 @@ ace4_to_acet(nfsace4 *nfsace4, ace_t *ace, uid_t owner, gid_t group,
}
out:
-#ifdef DEBUG
- if (error != 0)
- NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE,
- "ace4_to_acet: idmap failed with %d", error));
-#endif
-
return (error);
}
+static void
+ace4_mask_to_acet_mask(acemask4 ace4_mask, uint32_t *acet_mask)
+{
+ *acet_mask = 0;
+
+ if (ace4_mask & ACE4_READ_DATA)
+ *acet_mask |= ACE_READ_DATA;
+ if (ace4_mask & ACE4_WRITE_DATA)
+ *acet_mask |= ACE_WRITE_DATA;
+ if (ace4_mask & ACE4_APPEND_DATA)
+ *acet_mask |= ACE_APPEND_DATA;
+ if (ace4_mask & ACE4_READ_NAMED_ATTRS)
+ *acet_mask |= ACE_READ_NAMED_ATTRS;
+ if (ace4_mask & ACE4_WRITE_NAMED_ATTRS)
+ *acet_mask |= ACE_WRITE_NAMED_ATTRS;
+ if (ace4_mask & ACE4_EXECUTE)
+ *acet_mask |= ACE_EXECUTE;
+ if (ace4_mask & ACE4_DELETE_CHILD)
+ *acet_mask |= ACE_DELETE_CHILD;
+ if (ace4_mask & ACE4_READ_ATTRIBUTES)
+ *acet_mask |= ACE_READ_ATTRIBUTES;
+ if (ace4_mask & ACE4_WRITE_ATTRIBUTES)
+ *acet_mask |= ACE_WRITE_ATTRIBUTES;
+ if (ace4_mask & ACE4_DELETE)
+ *acet_mask |= ACE_DELETE;
+ if (ace4_mask & ACE4_READ_ACL)
+ *acet_mask |= ACE_READ_ACL;
+ if (ace4_mask & ACE4_WRITE_ACL)
+ *acet_mask |= ACE_WRITE_ACL;
+ if (ace4_mask & ACE4_WRITE_OWNER)
+ *acet_mask |= ACE_WRITE_OWNER;
+ if (ace4_mask & ACE4_SYNCHRONIZE)
+ *acet_mask |= ACE_SYNCHRONIZE;
+}
+
+static void
+acet_mask_to_ace4_mask(uint32_t acet_mask, acemask4 *ace4_mask)
+{
+ *ace4_mask = 0;
+
+ if (acet_mask & ACE_READ_DATA)
+ *ace4_mask |= ACE4_READ_DATA;
+ if (acet_mask & ACE_WRITE_DATA)
+ *ace4_mask |= ACE4_WRITE_DATA;
+ if (acet_mask & ACE_APPEND_DATA)
+ *ace4_mask |= ACE4_APPEND_DATA;
+ if (acet_mask & ACE_READ_NAMED_ATTRS)
+ *ace4_mask |= ACE4_READ_NAMED_ATTRS;
+ if (acet_mask & ACE_WRITE_NAMED_ATTRS)
+ *ace4_mask |= ACE4_WRITE_NAMED_ATTRS;
+ if (acet_mask & ACE_EXECUTE)
+ *ace4_mask |= ACE4_EXECUTE;
+ if (acet_mask & ACE_DELETE_CHILD)
+ *ace4_mask |= ACE4_DELETE_CHILD;
+ if (acet_mask & ACE_READ_ATTRIBUTES)
+ *ace4_mask |= ACE4_READ_ATTRIBUTES;
+ if (acet_mask & ACE_WRITE_ATTRIBUTES)
+ *ace4_mask |= ACE4_WRITE_ATTRIBUTES;
+ if (acet_mask & ACE_DELETE)
+ *ace4_mask |= ACE4_DELETE;
+ if (acet_mask & ACE_READ_ACL)
+ *ace4_mask |= ACE4_READ_ACL;
+ if (acet_mask & ACE_WRITE_ACL)
+ *ace4_mask |= ACE4_WRITE_ACL;
+ if (acet_mask & ACE_WRITE_OWNER)
+ *ace4_mask |= ACE4_WRITE_OWNER;
+ if (acet_mask & ACE_SYNCHRONIZE)
+ *ace4_mask |= ACE4_SYNCHRONIZE;
+}
+
+static void
+ace4_flags_to_acet_flags(aceflag4 ace4_flags, uint16_t *acet_flags)
+{
+ *acet_flags = 0;
+
+ if (ace4_flags & ACE4_FILE_INHERIT_ACE)
+ *acet_flags |= ACE_FILE_INHERIT_ACE;
+ if (ace4_flags & ACE4_DIRECTORY_INHERIT_ACE)
+ *acet_flags |= ACE_DIRECTORY_INHERIT_ACE;
+ if (ace4_flags & ACE4_NO_PROPAGATE_INHERIT_ACE)
+ *acet_flags |= ACE_NO_PROPAGATE_INHERIT_ACE;
+ if (ace4_flags & ACE4_INHERIT_ONLY_ACE)
+ *acet_flags |= ACE_INHERIT_ONLY_ACE;
+ if (ace4_flags & ACE4_SUCCESSFUL_ACCESS_ACE_FLAG)
+ *acet_flags |= ACE_SUCCESSFUL_ACCESS_ACE_FLAG;
+ if (ace4_flags & ACE4_FAILED_ACCESS_ACE_FLAG)
+ *acet_flags |= ACE_FAILED_ACCESS_ACE_FLAG;
+ if (ace4_flags & ACE4_IDENTIFIER_GROUP)
+ *acet_flags |= ACE_IDENTIFIER_GROUP;
+}
+
+static void
+acet_flags_to_ace4_flags(uint16_t acet_flags, aceflag4 *ace4_flags)
+{
+ *ace4_flags = 0;
+
+ if (acet_flags & ACE_FILE_INHERIT_ACE)
+ *ace4_flags |= ACE4_FILE_INHERIT_ACE;
+ if (acet_flags & ACE_DIRECTORY_INHERIT_ACE)
+ *ace4_flags |= ACE4_DIRECTORY_INHERIT_ACE;
+ if (acet_flags & ACE_NO_PROPAGATE_INHERIT_ACE)
+ *ace4_flags |= ACE4_NO_PROPAGATE_INHERIT_ACE;
+ if (acet_flags & ACE_INHERIT_ONLY_ACE)
+ *ace4_flags |= ACE4_INHERIT_ONLY_ACE;
+ if (acet_flags & ACE_SUCCESSFUL_ACCESS_ACE_FLAG)
+ *ace4_flags |= ACE4_SUCCESSFUL_ACCESS_ACE_FLAG;
+ if (acet_flags & ACE_FAILED_ACCESS_ACE_FLAG)
+ *ace4_flags |= ACE4_FAILED_ACCESS_ACE_FLAG;
+ if (acet_flags & ACE_IDENTIFIER_GROUP)
+ *ace4_flags |= ACE4_IDENTIFIER_GROUP;
+}
+
int
vs_ace4_to_acet(vsecattr_t *vs_ace4, vsecattr_t *vs_acet,
- uid_t owner, gid_t group, int isdir, int isserver, int just_count)
+ uid_t owner, gid_t group, int isserver, int just_count)
{
int error;
int i;
@@ -1865,7 +1976,7 @@ vs_ace4_to_acet(vsecattr_t *vs_ace4, vsecattr_t *vs_acet,
for (i = 0; i < vs_ace4->vsa_aclcnt; i++) {
error = ace4_to_acet((nfsace4 *)(vs_ace4->vsa_aclentp) + i,
(ace_t *)(vs_acet->vsa_aclentp) + i, owner, group,
- isdir, isserver, just_count);
+ isserver, just_count);
if (error != 0)
goto out;
}
@@ -1879,7 +1990,7 @@ out:
int
vs_acet_to_ace4(vsecattr_t *vs_acet, vsecattr_t *vs_ace4,
- int isdir, int isserver)
+ int isserver)
{
int error = 0;
int i;
@@ -1900,7 +2011,7 @@ vs_acet_to_ace4(vsecattr_t *vs_acet, vsecattr_t *vs_ace4,
for (i = 0; i < vs_acet->vsa_aclcnt; i++) {
error = acet_to_ace4((ace_t *)(vs_acet->vsa_aclentp) + i,
- (nfsace4 *)(vs_ace4->vsa_aclentp) + i, isdir, isserver);
+ (nfsace4 *)(vs_ace4->vsa_aclentp) + i, isserver);
if (error != 0)
goto out;
}
diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c b/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c
index 6ef0000ea3..6169621a73 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c
@@ -887,8 +887,7 @@ rfs4_fattr4_acl(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
if (error != 0)
break;
if (whichacl & _ACL_ACE_ENABLED) {
- error = vs_acet_to_ace4(&vs_native, &vs_ace4,
- vp->v_type == VDIR, TRUE);
+ error = vs_acet_to_ace4(&vs_native, &vs_ace4, TRUE);
vs_acet_destroy(&vs_native);
} else {
error = vs_aent_to_ace4(&vs_native, &vs_ace4,
@@ -968,8 +967,7 @@ rfs4_fattr4_acl(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
if (whichacl & _ACL_ACE_ENABLED) {
error = vs_ace4_to_acet(&vs_ace4, &vs_native,
- vap->va_uid, vap->va_gid, vp->v_type == VDIR, TRUE,
- FALSE);
+ vap->va_uid, vap->va_gid, TRUE, FALSE);
if (error != 0)
break;
(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
diff --git a/usr/src/uts/common/fs/nfs/nfs4_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_vnops.c
index d07cedb514..9ae1d0a56c 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c
@@ -11982,7 +11982,7 @@ nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr)
* These are ace_t type entries.
*/
error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap,
- vp->v_type == VDIR, FALSE);
+ FALSE);
if (error)
return (error);
}
@@ -12151,7 +12151,7 @@ nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap,
if (orig_mask & (VSA_ACE | VSA_ACECNT)) {
error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid,
- isdir, FALSE, ((orig_mask & VSA_ACE) ? FALSE : TRUE));
+ FALSE, ((orig_mask & VSA_ACE) ? FALSE : TRUE));
if (error)
return (error);
diff --git a/usr/src/uts/common/fs/nfs/nfs_acl_srv.c b/usr/src/uts/common/fs/nfs/nfs_acl_srv.c
index 836297350a..1242f94e10 100644
--- a/usr/src/uts/common/fs/nfs/nfs_acl_srv.c
+++ b/usr/src/uts/common/fs/nfs/nfs_acl_srv.c
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc.
+ * Copyright 2005 Sun Microsystems, Inc.
* All rights reserved.
* Use is subject to license terms.
*/
@@ -68,6 +68,8 @@
#include <nfs/nfs_clnt.h>
#include <nfs/nfs_acl.h>
+#include <fs/fs_subr.h>
+
/*
* These are the interface routines for the server side of the
* NFS ACL server. See the NFS ACL protocol specification
@@ -95,6 +97,25 @@ acl2_getacl(GETACL2args *args, GETACL2res *resp, struct exportinfo *exi,
error = VOP_GETSECATTR(vp, &resp->resok.acl, 0, cr);
+ if (error == ENOSYS) {
+ /*
+ * If the underlying file system doesn't support
+ * aclent_t type acls, fabricate an acl. This is
+	 * required in order to support existing clients
+ * that require the call to VOP_GETSECATTR to
+ * succeed while making the assumption that all
+ * file systems support aclent_t type acls. This
+ * causes problems for servers exporting ZFS file
+ * systems because ZFS supports ace_t type acls,
+ * and fails (with ENOSYS) when asked for aclent_t
+ * type acls.
+ *
+ * Note: if the fs_fab_acl() fails, we have other problems.
+ * This error should be returned to the caller.
+ */
+ error = fs_fab_acl(vp, &resp->resok.acl, 0, cr);
+ }
+
if (error) {
VN_RELE(vp);
resp->status = puterrno(error);
@@ -454,6 +475,25 @@ acl3_getacl(GETACL3args *args, GETACL3res *resp, struct exportinfo *exi,
error = VOP_GETSECATTR(vp, &resp->resok.acl, 0, cr);
+ if (error == ENOSYS) {
+ /*
+ * If the underlying file system doesn't support
+ * aclent_t type acls, fabricate an acl. This is
+	 * required in order to support existing clients
+ * that require the call to VOP_GETSECATTR to
+ * succeed while making the assumption that all
+ * file systems support aclent_t type acls. This
+ * causes problems for servers exporting ZFS file
+ * systems because ZFS supports ace_t type acls,
+ * and fails (with ENOSYS) when asked for aclent_t
+ * type acls.
+ *
+ * Note: if the fs_fab_acl() fails, we have other problems.
+ * This error should be returned to the caller.
+ */
+ error = fs_fab_acl(vp, &resp->resok.acl, 0, cr);
+ }
+
if (error)
goto out;
diff --git a/usr/src/uts/common/fs/proc/prioctl.c b/usr/src/uts/common/fs/proc/prioctl.c
index 79f486e9b1..844a3b7bb1 100644
--- a/usr/src/uts/common/fs/proc/prioctl.c
+++ b/usr/src/uts/common/fs/proc/prioctl.c
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -498,7 +498,7 @@ startover:
*/
t = pr_thread(pnp); /* returns locked thread */
thread_unlock(t);
- oprgetstatus(t, &un.prstat, VTOZ(vp));
+ oprgetstatus(t, &un.prstat, VTOZONE(vp));
prunlock(pnp);
if (copyout(&un.prstat, cmaddr, sizeof (un.prstat)))
error = EFAULT;
@@ -835,7 +835,7 @@ startover:
break;
case PIOCSTATUS: /* get process/lwp status */
- oprgetstatus(t, &un.prstat, VTOZ(vp));
+ oprgetstatus(t, &un.prstat, VTOZONE(vp));
prunlock(pnp);
if (copyout(&un.prstat, cmaddr, sizeof (un.prstat)))
error = EFAULT;
@@ -866,13 +866,13 @@ startover:
Bprsp = thing;
thing = NULL;
prsp = Bprsp;
- oprgetstatus(t, prsp, VTOZ(vp));
+ oprgetstatus(t, prsp, VTOZONE(vp));
t = p->p_tlist;
do {
ASSERT(!(t->t_proc_flag & TP_LWPEXIT));
ASSERT(nlwp > 0);
--nlwp;
- oprgetstatus(t, ++prsp, VTOZ(vp));
+ oprgetstatus(t, ++prsp, VTOZONE(vp));
} while ((t = t->t_forw) != p->p_tlist);
ASSERT(nlwp == 0);
prunlock(pnp);
@@ -2053,7 +2053,7 @@ startover:
*/
t = pr_thread(pnp); /* returns locked thread */
thread_unlock(t);
- oprgetstatus32(t, &un32.prstat, VTOZ(vp));
+ oprgetstatus32(t, &un32.prstat, VTOZONE(vp));
prunlock(pnp);
if (copyout(&un32.prstat, cmaddr, sizeof (un32.prstat)))
error = EFAULT;
@@ -2430,7 +2430,7 @@ startover:
error = EOVERFLOW;
break;
}
- oprgetstatus32(t, &un32.prstat, VTOZ(vp));
+ oprgetstatus32(t, &un32.prstat, VTOZONE(vp));
prunlock(pnp);
if (copyout(&un32.prstat, cmaddr, sizeof (un32.prstat)))
error = EFAULT;
@@ -2471,13 +2471,13 @@ startover:
Bprsp = (prstatus32_t *)thing;
thing = NULL;
prsp = Bprsp;
- oprgetstatus32(t, prsp, VTOZ(vp));
+ oprgetstatus32(t, prsp, VTOZONE(vp));
t = p->p_tlist;
do {
ASSERT(!(t->t_proc_flag & TP_LWPEXIT));
ASSERT(nlwp > 0);
--nlwp;
- oprgetstatus32(t, ++prsp, VTOZ(vp));
+ oprgetstatus32(t, ++prsp, VTOZONE(vp));
} while ((t = t->t_forw) != p->p_tlist);
ASSERT(nlwp == 0);
prunlock(pnp);
diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c
index dea54056c6..d12ee64e8c 100644
--- a/usr/src/uts/common/fs/proc/prvnops.c
+++ b/usr/src/uts/common/fs/proc/prvnops.c
@@ -709,7 +709,7 @@ pr_read_status(prnode_t *pnp, uio_t *uiop)
*/
sp = kmem_alloc(sizeof (*sp), KM_SLEEP);
if ((error = prlock(pnp, ZNO)) == 0) {
- prgetstatus(pnp->pr_common->prc_proc, sp, VTOZ(PTOV(pnp)));
+ prgetstatus(pnp->pr_common->prc_proc, sp, VTOZONE(PTOV(pnp)));
prunlock(pnp);
error = pr_uioread(sp, sizeof (*sp), uiop);
}
@@ -753,7 +753,7 @@ pr_read_lstatus(prnode_t *pnp, uio_t *uiop)
if (ldp->ld_entry == NULL ||
(t = ldp->ld_entry->le_thread) == NULL)
continue;
- prgetlwpstatus(t, sp, VTOZ(PTOV(pnp)));
+ prgetlwpstatus(t, sp, VTOZONE(PTOV(pnp)));
sp = (lwpstatus_t *)((caddr_t)sp + LSPAN(lwpstatus_t));
}
prunlock(pnp);
@@ -1426,7 +1426,7 @@ pr_read_lwpstatus(prnode_t *pnp, uio_t *uiop)
goto out;
}
- prgetlwpstatus(pnp->pr_common->prc_thread, sp, VTOZ(PTOV(pnp)));
+ prgetlwpstatus(pnp->pr_common->prc_thread, sp, VTOZONE(PTOV(pnp)));
prunlock(pnp);
error = pr_uioread(sp, sizeof (*sp), uiop);
@@ -1799,7 +1799,7 @@ pr_read_status_32(prnode_t *pnp, uio_t *uiop)
error = EOVERFLOW;
} else {
prgetstatus32(pnp->pr_common->prc_proc, sp,
- VTOZ(PTOV(pnp)));
+ VTOZONE(PTOV(pnp)));
prunlock(pnp);
error = pr_uioread(sp, sizeof (*sp), uiop);
}
@@ -1852,7 +1852,7 @@ pr_read_lstatus_32(prnode_t *pnp, uio_t *uiop)
if (ldp->ld_entry == NULL ||
(t = ldp->ld_entry->le_thread) == NULL)
continue;
- prgetlwpstatus32(t, sp, VTOZ(PTOV(pnp)));
+ prgetlwpstatus32(t, sp, VTOZONE(PTOV(pnp)));
sp = (lwpstatus32_t *)((caddr_t)sp + LSPAN32(lwpstatus32_t));
}
prunlock(pnp);
@@ -2471,7 +2471,7 @@ pr_read_lwpstatus_32(prnode_t *pnp, uio_t *uiop)
goto out;
}
- prgetlwpstatus32(pnp->pr_common->prc_thread, sp, VTOZ(PTOV(pnp)));
+ prgetlwpstatus32(pnp->pr_common->prc_thread, sp, VTOZONE(PTOV(pnp)));
prunlock(pnp);
error = pr_uioread(sp, sizeof (*sp), uiop);
@@ -4281,9 +4281,9 @@ pr_lookup_ctdir(vnode_t *dp, char *comp)
* outside the zone. (see logic in contract_status_common)
*/
if ((ct->ct_owner != p) &&
- !(p == VTOZ(dp)->zone_zsched && ct->ct_state < CTS_ORPHAN &&
- VTOZ(dp)->zone_uniqid == contract_getzuniqid(ct) &&
- VTOZ(dp)->zone_uniqid != GLOBAL_ZONEUNIQID &&
+ !(p == VTOZONE(dp)->zone_zsched && ct->ct_state < CTS_ORPHAN &&
+ VTOZONE(dp)->zone_uniqid == contract_getzuniqid(ct) &&
+ VTOZONE(dp)->zone_uniqid != GLOBAL_ZONEUNIQID &&
ct->ct_czuniqid == GLOBAL_ZONEUNIQID)) {
prunlock(dpnp);
prfreenode(pnp);
@@ -4668,7 +4668,7 @@ pr_readdir_procdir(prnode_t *pnp, uio_t *uiop, int *eofp)
ASSERT(pnp->pr_type == PR_PROCDIR);
- zoneid = VTOZ(PTOV(pnp))->zone_id;
+ zoneid = VTOZONE(PTOV(pnp))->zone_id;
if ((error = gfs_readdir_init(&gstate, PNSIZ, PRSDSIZE, uiop,
PRROOTINO, PRROOTINO)) != 0)
@@ -5453,7 +5453,7 @@ pr_readdir_ctdir(prnode_t *pnp, uio_t *uiop, int *eofp)
return (error);
}
- zid = VTOZ(pnp->pr_vnode)->zone_uniqid;
+ zid = VTOZONE(pnp->pr_vnode)->zone_uniqid;
while ((error = gfs_readdir_pred(&gstate, uiop, &n)) == 0) {
id_t next = contract_plookup(p, n, zid);
if (next == -1) {
diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c
index 1e7793ba39..4d562852af 100644
--- a/usr/src/uts/common/fs/vnode.c
+++ b/usr/src/uts/common/fs/vnode.c
@@ -720,28 +720,37 @@ top:
vsec.vsa_dfaclcnt = 0;
vsec.vsa_dfaclentp = NULL;
vsec.vsa_mask = VSA_DFACLCNT;
- if (error = VOP_GETSECATTR(dvp, &vsec, 0, CRED())) {
+ error = VOP_GETSECATTR(dvp, &vsec, 0, CRED());
+ /*
+ * If error is ENOSYS then treat it as no error
+ * Don't want to force all file systems to support
+ * aclent_t style of ACL's.
+ */
+ if (error == ENOSYS)
+ error = 0;
+ if (error) {
if (*vpp != NULL)
VN_RELE(*vpp);
goto out;
- }
-
- /*
- * Apply the umask if no default ACLs.
- */
- if (vsec.vsa_dfaclcnt == 0)
- vap->va_mode &= ~umask;
+ } else {
+ /*
+ * Apply the umask if no default ACLs.
+ */
+ if (vsec.vsa_dfaclcnt == 0)
+ vap->va_mode &= ~umask;
- /*
- * VOP_GETSECATTR() may have allocated memory for ACLs we
- * didn't request, so double-check and free it if necessary.
- */
- if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
- kmem_free((caddr_t)vsec.vsa_aclentp,
- vsec.vsa_aclcnt * sizeof (aclent_t));
- if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
- kmem_free((caddr_t)vsec.vsa_dfaclentp,
- vsec.vsa_dfaclcnt * sizeof (aclent_t));
+ /*
+ * VOP_GETSECATTR() may have allocated memory for
+ * ACLs we didn't request, so double-check and
+ * free it if necessary.
+ */
+ if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
+ kmem_free((caddr_t)vsec.vsa_aclentp,
+ vsec.vsa_aclcnt * sizeof (aclent_t));
+ if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
+ kmem_free((caddr_t)vsec.vsa_dfaclentp,
+ vsec.vsa_dfaclcnt * sizeof (aclent_t));
+ }
}
/*
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
new file mode 100644
index 0000000000..0a6cc7b658
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -0,0 +1,1998 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * DVA-based Adjustable Replacement Cache
+ *
+ * While much of the theory of operation and algorithms used here
+ * are based on the self-tuning, low overhead replacement cache
+ * presented by Megiddo and Modha at FAST 2003, there are some
+ * significant differences:
+ *
+ * 1. The Megiddo and Modha model assumes any page is evictable.
+ * Pages in its cache cannot be "locked" into memory. This makes
+ * the eviction algorithm simple: evict the last page in the list.
+ * This also make the performance characteristics easy to reason
+ * about. Our cache is not so simple. At any given moment, some
+ * subset of the blocks in the cache are un-evictable because we
+ * have handed out a reference to them. Blocks are only evictable
+ * when there are no external references active. This makes
+ * eviction far more problematic: we choose to evict the evictable
+ * blocks that are the "lowest" in the list.
+ *
+ * There are times when it is not possible to evict the requested
+ * space. In these circumstances we are unable to adjust the cache
+ * size. To prevent the cache growing unbounded at these times we
+ * implement a "cache throttle" that slows the flow of new data
+ * into the cache until we can make space available.
+ *
+ * 2. The Megiddo and Modha model assumes a fixed cache size.
+ * Pages are evicted when the cache is full and there is a cache
+ * miss. Our model has a variable sized cache. It grows with
+ * high use, but also tries to react to memory pressure from the
+ * operating system: decreasing its size when system memory is
+ * tight.
+ *
+ * 3. The Megiddo and Modha model assumes a fixed page size. All
+ * elements of the cache are therefore exactly the same size. So
+ * when adjusting the cache size following a cache miss, it's simply
+ * a matter of choosing a single page to evict. In our model, we
+ * have variable sized cache blocks (ranging from 512 bytes to
+ * 128K bytes). We therefore choose a set of blocks to evict to make
+ * space for a cache miss that approximates as closely as possible
+ * the space used by the new block.
+ *
+ * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
+ * by N. Megiddo & D. Modha, FAST 2003
+ */
+
+/*
+ * The locking model:
+ *
+ * A new reference to a cache buffer can be obtained in two
+ * ways: 1) via a hash table lookup using the DVA as a key,
+ * or 2) via one of the ARC lists. The arc_read() interface
+ * uses method 1, while the internal arc algorithms for
+ * adjusting the cache use method 2. We therefore provide two
+ * types of locks: 1) the hash table lock array, and 2) the
+ * arc list locks.
+ *
+ * Buffers do not have their own mutexes, rather they rely on the
+ * hash table mutexes for the bulk of their protection (i.e. most
+ * fields in the arc_buf_hdr_t are protected by these mutexes).
+ *
+ * buf_hash_find() returns the appropriate mutex (held) when it
+ * locates the requested buffer in the hash table. It returns
+ * NULL for the mutex if the buffer was not in the table.
+ *
+ * buf_hash_remove() expects the appropriate hash mutex to be
+ * already held before it is invoked.
+ *
+ * Each arc state also has a mutex which is used to protect the
+ * buffer list associated with the state. When attempting to
+ * obtain a hash table lock while holding an arc list lock you
+ * must use: mutex_tryenter() to avoid deadlock. Also note that
+ * the "top" state mutex must be held before the "bot" state mutex.
+ *
+ * Note that the majority of the performance stats are manipulated
+ * with atomic operations.
+ */
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/arc.h>
+#include <sys/refcount.h>
+#ifdef _KERNEL
+#include <sys/vmsystm.h>
+#include <vm/anon.h>
+#include <sys/fs/swapnode.h>
+#endif
+#include <sys/callb.h>
+
+static kmutex_t arc_reclaim_thr_lock;
+static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
+static uint8_t arc_thread_exit;
+
+typedef enum arc_reclaim_strategy {
+ ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
+ ARC_RECLAIM_CONS /* Conservative reclaim strategy */
+} arc_reclaim_strategy_t;
+
+/* number of seconds before growing cache again */
+static int arc_grow_retry = 60;
+
+static kmutex_t arc_reclaim_lock;
+static int arc_dead;
+
+/*
+ * Note that buffers can be on one of 5 states:
+ * ARC_anon - anonymous (discussed below)
+ * ARC_mru_top - recently used, currently cached
+ * ARC_mru_bot - recently used, no longer in cache
+ * ARC_mfu_top - frequently used, currently cached
+ * ARC_mfu_bot - frequently used, no longer in cache
+ * When there are no active references to the buffer, they
+ * are linked onto one of the lists in arc. These are the
+ * only buffers that can be evicted or deleted.
+ *
+ * Anonymous buffers are buffers that are not associated with
+ * a DVA. These are buffers that hold dirty block copies
+ * before they are written to stable storage. By definition,
+ * they are "ref'd" and are considered part of arc_mru_top
+ * that cannot be freed. Generally, they will acquire a DVA
+ * as they are written and migrate onto the arc_mru_top list.
+ */
+
+typedef struct arc_state {
+ list_t list; /* linked list of evictable buffer in state */
+ uint64_t lsize; /* total size of buffers in the linked list */
+ uint64_t size; /* total size of all buffers in this state */
+ uint64_t hits;
+ kmutex_t mtx;
+} arc_state_t;
+
+/* The 5 states: */
+static arc_state_t ARC_anon;
+static arc_state_t ARC_mru_top;
+static arc_state_t ARC_mru_bot;
+static arc_state_t ARC_mfu_top;
+static arc_state_t ARC_mfu_bot;
+
+static struct arc {
+ arc_state_t *anon;
+ arc_state_t *mru_top;
+ arc_state_t *mru_bot;
+ arc_state_t *mfu_top;
+ arc_state_t *mfu_bot;
+ uint64_t size; /* Actual total arc size */
+ uint64_t p; /* Target size (in bytes) of mru_top */
+ uint64_t c; /* Target size of cache (in bytes) */
+ uint64_t c_min; /* Minimum target cache size */
+ uint64_t c_max; /* Maximum target cache size */
+ uint64_t incr; /* Size by which to increment arc.c */
+ int64_t size_check;
+
+ /* performance stats */
+ uint64_t hits;
+ uint64_t misses;
+ uint64_t deleted;
+ uint64_t skipped;
+ uint64_t hash_elements;
+ uint64_t hash_elements_max;
+ uint64_t hash_collisions;
+ uint64_t hash_chains;
+ uint32_t hash_chain_max;
+
+ int no_grow; /* Don't try to grow cache size */
+} arc;
+
+/* Default amount to grow arc.incr */
+static int64_t arc_incr_size = 1024;
+
+/* > 0 ==> time to increment arc.c */
+static int64_t arc_size_check_default = -1000;
+
+static uint64_t arc_tempreserve;
+
+typedef struct arc_callback arc_callback_t;
+
+struct arc_callback {
+ arc_done_func_t *acb_done;
+ void *acb_private;
+ arc_byteswap_func_t *acb_byteswap;
+ arc_buf_t *acb_buf;
+ zio_t *acb_zio_dummy;
+ arc_callback_t *acb_next;
+};
+
+struct arc_buf_hdr {
+ /* immutable */
+ uint64_t b_size;
+ spa_t *b_spa;
+
+ /* protected by hash lock */
+ dva_t b_dva;
+ uint64_t b_birth;
+ uint64_t b_cksum0;
+
+ arc_buf_hdr_t *b_hash_next;
+ arc_buf_t *b_buf;
+ uint32_t b_flags;
+
+ kcondvar_t b_cv;
+ arc_callback_t *b_acb;
+
+ /* protected by arc state mutex */
+ arc_state_t *b_state;
+ list_node_t b_arc_node;
+
+ /* updated atomically */
+ clock_t b_arc_access;
+
+ /* self protecting */
+ refcount_t b_refcnt;
+};
+
+/*
+ * Private ARC flags. These flags are private ARC only flags that will show up
+ * in b_flags in the arc_buf_hdr_t. Some flags are publicly declared, and can
+ * be passed in as arc_flags in things like arc_read. However, these flags
+ * should never be passed and should only be set by ARC code. When adding new
+ * public flags, make sure not to smash the private ones.
+ */
+
+#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
+#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
+#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
+
+#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
+#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
+#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
+
+/*
+ * Hash table routines
+ */
+
+#define HT_LOCK_PAD 64
+
+struct ht_lock {
+ kmutex_t ht_lock;
+#ifdef _KERNEL
+ unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
+#endif
+};
+
+#define BUF_LOCKS 256
+typedef struct buf_hash_table {
+ uint64_t ht_mask;
+ arc_buf_hdr_t **ht_table;
+ struct ht_lock ht_locks[BUF_LOCKS];
+} buf_hash_table_t;
+
+static buf_hash_table_t buf_hash_table;
+
+#define BUF_HASH_INDEX(spa, dva, birth) \
+ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
+#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
+#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
+#define HDR_LOCK(buf) \
+ (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
+
+uint64_t zfs_crc64_table[256];
+
+static uint64_t
+buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
+{
+ uintptr_t spav = (uintptr_t)spa;
+ uint8_t *vdva = (uint8_t *)dva;
+ uint64_t crc = -1ULL;
+ int i;
+
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+
+ for (i = 0; i < sizeof (dva_t); i++)
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
+
+ crc ^= (spav>>8) ^ birth;
+
+ return (crc);
+}
+
+#define BUF_EMPTY(buf) \
+ ((buf)->b_dva.dva_word[0] == 0 && \
+ (buf)->b_dva.dva_word[1] == 0 && \
+ (buf)->b_birth == 0)
+
+#define BUF_EQUAL(spa, dva, birth, buf) \
+ ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
+ ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
+ ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
+
+static arc_buf_hdr_t *
+buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp)
+{
+ uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
+ kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
+ arc_buf_hdr_t *buf;
+
+ mutex_enter(hash_lock);
+ for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
+ buf = buf->b_hash_next) {
+ if (BUF_EQUAL(spa, dva, birth, buf)) {
+ *lockp = hash_lock;
+ return (buf);
+ }
+ }
+ mutex_exit(hash_lock);
+ *lockp = NULL;
+ return (NULL);
+}
+
+/*
+ * Insert an entry into the hash table. If there is already an element
+ * equal to elem in the hash table, then the already existing element
+ * will be returned and the new element will not be inserted.
+ * Otherwise returns NULL.
+ */
+static arc_buf_hdr_t *fbufs[4]; /* XXX to find 6341326 */
+static kthread_t *fbufs_lastthread;
+static arc_buf_hdr_t *
+buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
+{
+ uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
+ kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
+ arc_buf_hdr_t *fbuf;
+ uint32_t max, i;
+
+ fbufs_lastthread = curthread;
+ *lockp = hash_lock;
+ mutex_enter(hash_lock);
+ for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
+ fbuf = fbuf->b_hash_next, i++) {
+ if (i < sizeof (fbufs) / sizeof (fbufs[0]))
+ fbufs[i] = fbuf;
+ if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
+ return (fbuf);
+ }
+
+ buf->b_hash_next = buf_hash_table.ht_table[idx];
+ buf_hash_table.ht_table[idx] = buf;
+
+ /* collect some hash table performance data */
+ if (i > 0) {
+ atomic_add_64(&arc.hash_collisions, 1);
+ if (i == 1)
+ atomic_add_64(&arc.hash_chains, 1);
+ }
+ while (i > (max = arc.hash_chain_max) &&
+ max != atomic_cas_32(&arc.hash_chain_max, max, i)) {
+ continue;
+ }
+ atomic_add_64(&arc.hash_elements, 1);
+ if (arc.hash_elements > arc.hash_elements_max)
+ atomic_add_64(&arc.hash_elements_max, 1);
+
+ return (NULL);
+}
+
+static void
+buf_hash_remove(arc_buf_hdr_t *buf)
+{
+ arc_buf_hdr_t *fbuf, **bufp;
+ uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
+
+ ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
+
+ bufp = &buf_hash_table.ht_table[idx];
+ while ((fbuf = *bufp) != buf) {
+ ASSERT(fbuf != NULL);
+ bufp = &fbuf->b_hash_next;
+ }
+ *bufp = buf->b_hash_next;
+ buf->b_hash_next = NULL;
+
+ /* collect some hash table performance data */
+ atomic_add_64(&arc.hash_elements, -1);
+ if (buf_hash_table.ht_table[idx] &&
+ buf_hash_table.ht_table[idx]->b_hash_next == NULL)
+ atomic_add_64(&arc.hash_chains, -1);
+}
+
+/*
+ * Global data structures and functions for the buf kmem cache.
+ */
+static kmem_cache_t *hdr_cache;
+static kmem_cache_t *buf_cache;
+
+static void
+buf_fini(void)
+{
+ int i;
+
+ kmem_free(buf_hash_table.ht_table,
+ (buf_hash_table.ht_mask + 1) * sizeof (void *));
+ for (i = 0; i < BUF_LOCKS; i++)
+ mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
+ kmem_cache_destroy(hdr_cache);
+ kmem_cache_destroy(buf_cache);
+}
+
+/*
+ * Constructor callback - called when the cache is empty
+ * and a new buf is requested.
+ */
+/* ARGSUSED */
+static int
+hdr_cons(void *vbuf, void *unused, int kmflag)
+{
+ arc_buf_hdr_t *buf = vbuf;
+
+ bzero(buf, sizeof (arc_buf_hdr_t));
+ refcount_create(&buf->b_refcnt);
+ cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
+ return (0);
+}
+
+/*
+ * Destructor callback - called when a cached buf is
+ * no longer required.
+ */
+/* ARGSUSED */
+static void
+hdr_dest(void *vbuf, void *unused)
+{
+ arc_buf_hdr_t *buf = vbuf;
+
+ refcount_destroy(&buf->b_refcnt);
+ cv_destroy(&buf->b_cv);
+}
+
+void arc_kmem_reclaim(void);
+
+/*
+ * Reclaim callback -- invoked when memory is low.
+ */
+/* ARGSUSED */
+static void
+hdr_recl(void *unused)
+{
+ dprintf("hdr_recl called\n");
+ arc_kmem_reclaim();
+}
+
+static void
+buf_init(void)
+{
+ uint64_t *ct;
+ uint64_t hsize = 1ULL << 10;
+ int i, j;
+
+ /*
+ * The hash table is big enough to fill all of physical memory
+ * with an average 4k block size. The table will take up
+ * totalmem*sizeof(void*)/4k bytes (eg. 2MB/GB with 8-byte
+ * pointers).
+ */
+ while (hsize * 4096 < physmem * PAGESIZE)
+ hsize <<= 1;
+
+ buf_hash_table.ht_mask = hsize - 1;
+ buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
+
+ hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
+ 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
+ buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ for (i = 0; i < 256; i++)
+ for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
+ *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
+
+ for (i = 0; i < BUF_LOCKS; i++) {
+ mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+ }
+}
+
+#define ARC_MINTIME (hz>>4) /* 62 ms */
+
+#define ARC_TAG (void *)0x05201962
+
+static void
+add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
+{
+ ASSERT(MUTEX_HELD(hash_lock));
+
+ if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
+ (ab->b_state != arc.anon)) {
+
+ ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
+ mutex_enter(&ab->b_state->mtx);
+ ASSERT(!refcount_is_zero(&ab->b_refcnt));
+ ASSERT(list_link_active(&ab->b_arc_node));
+ list_remove(&ab->b_state->list, ab);
+ ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
+ ab->b_state->lsize -= ab->b_size;
+ mutex_exit(&ab->b_state->mtx);
+ }
+}
+
+static int
+remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
+{
+ int cnt;
+
+ ASSERT(MUTEX_HELD(hash_lock));
+
+ if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
+ (ab->b_state != arc.anon)) {
+
+ ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
+ mutex_enter(&ab->b_state->mtx);
+ ASSERT(!list_link_active(&ab->b_arc_node));
+ list_insert_head(&ab->b_state->list, ab);
+ ASSERT(ab->b_buf != NULL);
+ ab->b_state->lsize += ab->b_size;
+ mutex_exit(&ab->b_state->mtx);
+ }
+ return (cnt);
+}
+
+/*
+ * Move the supplied buffer to the indicated state. The mutex
+ * for the buffer must be held by the caller.
+ */
+static void
+arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab,
+ kmutex_t *hash_lock)
+{
+ arc_buf_t *buf;
+
+ ASSERT(MUTEX_HELD(hash_lock));
+
+ /*
+ * If this buffer is evictable, transfer it from the
+ * old state list to the new state list.
+ */
+ if (refcount_is_zero(&ab->b_refcnt)) {
+ if (ab->b_state != arc.anon) {
+ int drop_mutex = FALSE;
+
+ if (!MUTEX_HELD(&ab->b_state->mtx)) {
+ mutex_enter(&ab->b_state->mtx);
+ drop_mutex = TRUE;
+ }
+ ASSERT(list_link_active(&ab->b_arc_node));
+ list_remove(&ab->b_state->list, ab);
+ ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
+ ab->b_state->lsize -= ab->b_size;
+ if (drop_mutex)
+ mutex_exit(&ab->b_state->mtx);
+ }
+ if (new_state != arc.anon) {
+ int drop_mutex = FALSE;
+
+ if (!MUTEX_HELD(&new_state->mtx)) {
+ mutex_enter(&new_state->mtx);
+ drop_mutex = TRUE;
+ }
+ list_insert_head(&new_state->list, ab);
+ ASSERT(ab->b_buf != NULL);
+ new_state->lsize += ab->b_size;
+ if (drop_mutex)
+ mutex_exit(&new_state->mtx);
+ }
+ }
+
+ ASSERT(!BUF_EMPTY(ab));
+ if (new_state == arc.anon && ab->b_state != arc.anon) {
+ buf_hash_remove(ab);
+ }
+
+ /*
+ * If this buffer isn't being transferred to the MRU-top
+ * state, it's safe to clear its prefetch flag
+ */
+ if ((new_state != arc.mru_top) && (new_state != arc.mru_bot)) {
+ ab->b_flags &= ~ARC_PREFETCH;
+ }
+
+ buf = ab->b_buf;
+ if (buf == NULL) {
+ ASSERT3U(ab->b_state->size, >=, ab->b_size);
+ atomic_add_64(&ab->b_state->size, -ab->b_size);
+ /* we should only be here if we are deleting state */
+ ASSERT(new_state == arc.anon &&
+ (ab->b_state == arc.mru_bot || ab->b_state == arc.mfu_bot));
+ } else while (buf) {
+ ASSERT3U(ab->b_state->size, >=, ab->b_size);
+ atomic_add_64(&ab->b_state->size, -ab->b_size);
+ atomic_add_64(&new_state->size, ab->b_size);
+ buf = buf->b_next;
+ }
+ ab->b_state = new_state;
+}
+
+arc_buf_t *
+arc_buf_alloc(spa_t *spa, int size, void *tag)
+{
+ arc_buf_hdr_t *hdr;
+ arc_buf_t *buf;
+
+ ASSERT3U(size, >, 0);
+ hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
+ ASSERT(BUF_EMPTY(hdr));
+ hdr->b_size = size;
+ hdr->b_spa = spa;
+ hdr->b_state = arc.anon;
+ hdr->b_arc_access = 0;
+ buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+ buf->b_hdr = hdr;
+ buf->b_next = NULL;
+ buf->b_data = zio_buf_alloc(size);
+ hdr->b_buf = buf;
+ hdr->b_flags = 0;
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+ (void) refcount_add(&hdr->b_refcnt, tag);
+
+ atomic_add_64(&arc.size, size);
+ atomic_add_64(&arc.anon->size, size);
+
+ return (buf);
+}
+
+static void
+arc_hdr_free(arc_buf_hdr_t *hdr)
+{
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+ ASSERT3P(hdr->b_state, ==, arc.anon);
+
+ if (!BUF_EMPTY(hdr)) {
+ /*
+ * We can be called with an arc state lock held,
+ * so we can't hold a hash lock here.
+ * ASSERT(not in hash table)
+ */
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ bzero(&hdr->b_dva, sizeof (dva_t));
+ hdr->b_birth = 0;
+ hdr->b_cksum0 = 0;
+ }
+ if (hdr->b_buf) {
+ arc_buf_t *buf = hdr->b_buf;
+
+ ASSERT3U(hdr->b_size, >, 0);
+ zio_buf_free(buf->b_data, hdr->b_size);
+ atomic_add_64(&arc.size, -hdr->b_size);
+ ASSERT3U(arc.anon->size, >=, hdr->b_size);
+ atomic_add_64(&arc.anon->size, -hdr->b_size);
+ ASSERT3P(buf->b_next, ==, NULL);
+ kmem_cache_free(buf_cache, buf);
+ hdr->b_buf = NULL;
+ }
+ ASSERT(!list_link_active(&hdr->b_arc_node));
+ ASSERT3P(hdr->b_hash_next, ==, NULL);
+ ASSERT3P(hdr->b_acb, ==, NULL);
+ kmem_cache_free(hdr_cache, hdr);
+}
+
+void
+arc_buf_free(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+ int freeable;
+
+ mutex_enter(hash_lock);
+ if (remove_reference(hdr, hash_lock, tag) > 0) {
+ arc_buf_t **bufp = &hdr->b_buf;
+ arc_state_t *state = hdr->b_state;
+ uint64_t size = hdr->b_size;
+
+ ASSERT(hdr->b_state != arc.anon || HDR_IO_ERROR(hdr));
+ while (*bufp != buf) {
+ ASSERT(*bufp);
+ bufp = &(*bufp)->b_next;
+ }
+ *bufp = buf->b_next;
+ mutex_exit(hash_lock);
+ zio_buf_free(buf->b_data, size);
+ atomic_add_64(&arc.size, -size);
+ kmem_cache_free(buf_cache, buf);
+ ASSERT3U(state->size, >=, size);
+ atomic_add_64(&state->size, -size);
+ return;
+ }
+
+ /* don't free buffers that are in the middle of an async write */
+ freeable = (hdr->b_state == arc.anon && hdr->b_acb == NULL);
+ mutex_exit(hash_lock);
+
+ if (freeable)
+ arc_hdr_free(hdr);
+}
+
+int
+arc_buf_size(arc_buf_t *buf)
+{
+ return (buf->b_hdr->b_size);
+}
+
+/*
+ * Evict buffers from list until we've removed the specified number of
+ * bytes. Move the removed buffers to the appropriate evict state.
+ */
+static uint64_t
+arc_evict_state(arc_state_t *state, int64_t bytes)
+{
+ arc_state_t *evicted_state;
+ uint64_t bytes_evicted = 0;
+ arc_buf_hdr_t *ab, *ab_prev;
+ kmutex_t *hash_lock;
+
+ ASSERT(state == arc.mru_top || state == arc.mfu_top);
+
+ if (state == arc.mru_top)
+ evicted_state = arc.mru_bot;
+ else
+ evicted_state = arc.mfu_bot;
+
+ mutex_enter(&state->mtx);
+ mutex_enter(&evicted_state->mtx);
+
+ for (ab = list_tail(&state->list); ab; ab = ab_prev) {
+ ab_prev = list_prev(&state->list, ab);
+ hash_lock = HDR_LOCK(ab);
+ if (mutex_tryenter(hash_lock)) {
+ ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
+ arc_change_state(evicted_state, ab, hash_lock);
+ zio_buf_free(ab->b_buf->b_data, ab->b_size);
+ atomic_add_64(&arc.size, -ab->b_size);
+ ASSERT3P(ab->b_buf->b_next, ==, NULL);
+ kmem_cache_free(buf_cache, ab->b_buf);
+ ab->b_buf = NULL;
+ DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
+ bytes_evicted += ab->b_size;
+ mutex_exit(hash_lock);
+ if (bytes_evicted >= bytes)
+ break;
+ } else {
+ atomic_add_64(&arc.skipped, 1);
+ }
+ }
+ mutex_exit(&evicted_state->mtx);
+ mutex_exit(&state->mtx);
+
+ if (bytes_evicted < bytes)
+ dprintf("only evicted %lld bytes from %x",
+ (longlong_t)bytes_evicted, state);
+
+ return (bytes_evicted);
+}
+
+/*
+ * Remove buffers from list until we've removed the specified number of
+ * bytes.  Destroy the buffers that are removed.
+ *
+ * A negative 'bytes' (e.g. -1 from arc_flush()) means "drain the list
+ * completely": instead of skipping a buffer whose hash lock is busy, we
+ * drop the list lock, block until that hash lock is released, and then
+ * restart the scan from the tail.
+ */
+static void
+arc_delete_state(arc_state_t *state, int64_t bytes)
+{
+ uint_t bufs_skipped = 0;
+ uint64_t bytes_deleted = 0;
+ arc_buf_hdr_t *ab, *ab_prev;
+ kmutex_t *hash_lock;
+
+top:
+ mutex_enter(&state->mtx);
+ for (ab = list_tail(&state->list); ab; ab = ab_prev) {
+ ab_prev = list_prev(&state->list, ab);
+ hash_lock = HDR_LOCK(ab);
+ if (mutex_tryenter(hash_lock)) {
+ /* take it off all lists, then destroy the header */
+ arc_change_state(arc.anon, ab, hash_lock);
+ mutex_exit(hash_lock);
+ atomic_add_64(&arc.deleted, 1);
+ DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
+ bytes_deleted += ab->b_size;
+ arc_hdr_free(ab);
+ if (bytes >= 0 && bytes_deleted >= bytes)
+ break;
+ } else {
+ if (bytes < 0) {
+ /*
+ * Full drain: wait for whoever holds the hash
+ * lock to finish, then rescan the whole list.
+ */
+ mutex_exit(&state->mtx);
+ mutex_enter(hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+ bufs_skipped += 1;
+ }
+ }
+ mutex_exit(&state->mtx);
+
+ if (bufs_skipped) {
+ atomic_add_64(&arc.skipped, bufs_skipped);
+ ASSERT(bytes >= 0);
+ }
+
+ /*
+ * NOTE(review): when 'bytes' is negative it is promoted to a huge
+ * unsigned value in this comparison, so the dprintf also fires
+ * after a full drain -- presumably harmless debug output; confirm.
+ */
+ if (bytes_deleted < bytes)
+ dprintf("only deleted %lld bytes from %p",
+ (longlong_t)bytes_deleted, state);
+}
+
+/*
+ * Bring the ARC lists back within their target sizes:
+ *  - evict from mru_top until anon + mru_top fits within target p,
+ *  - trim the mru ghost list so anon + mru_top + mru_bot fits within c,
+ *  - evict from mfu_top while total arc.size exceeds c,
+ *  - trim the mfu ghost list so the whole directory (resident data
+ *    plus both ghost lists) fits within 2c.
+ */
+static void
+arc_adjust(void)
+{
+ int64_t top_sz, mru_over, arc_over;
+
+ top_sz = arc.anon->size + arc.mru_top->size;
+
+ if (top_sz > arc.p && arc.mru_top->lsize > 0) {
+ /* the MRU side is over its target; evict the excess */
+ int64_t toevict = MIN(arc.mru_top->lsize, top_sz-arc.p);
+ (void) arc_evict_state(arc.mru_top, toevict);
+ top_sz = arc.anon->size + arc.mru_top->size;
+ }
+
+ mru_over = top_sz + arc.mru_bot->size - arc.c;
+
+ if (mru_over > 0) {
+ if (arc.mru_bot->lsize > 0) {
+ /* shrink the MRU ghost list */
+ int64_t todelete = MIN(arc.mru_bot->lsize, mru_over);
+ arc_delete_state(arc.mru_bot, todelete);
+ }
+ }
+
+ if ((arc_over = arc.size - arc.c) > 0) {
+ int64_t table_over;
+
+ if (arc.mfu_top->lsize > 0) {
+ /* total cache is over target; evict from the MFU side */
+ int64_t toevict = MIN(arc.mfu_top->lsize, arc_over);
+ (void) arc_evict_state(arc.mfu_top, toevict);
+ }
+
+ table_over = arc.size + arc.mru_bot->lsize + arc.mfu_bot->lsize
+ - arc.c*2;
+
+ if (table_over > 0 && arc.mfu_bot->lsize > 0) {
+ /* keep the directory (data + ghosts) within 2c */
+ int64_t todelete = MIN(arc.mfu_bot->lsize, table_over);
+ arc_delete_state(arc.mfu_bot, todelete);
+ }
+ }
+}
+
+/*
+ * Flush all *evictable* data from the cache.
+ * NOTE: this will not touch "active" (i.e. referenced) data.
+ *
+ * Each call drains the given list completely (bytes == -1), including
+ * the ghost lists, which hold only headers.
+ */
+void
+arc_flush(void)
+{
+ arc_delete_state(arc.mru_top, -1);
+ arc_delete_state(arc.mfu_top, -1);
+
+ arc_delete_state(arc.mru_bot, -1);
+ arc_delete_state(arc.mfu_bot, -1);
+}
+
+/*
+ * Shrink the cache in response to memory pressure: reduce the target
+ * cache size (arc.c) and target MRU size (arc.p) by 1/16th (6.25%)
+ * each, evict down to the new targets, and suppress growth for a
+ * while.  Called from the kmem reclaim callback.
+ */
+void
+arc_kmem_reclaim(void)
+{
+ /*
+ * We need arc_reclaim_lock because we don't want multiple
+ * threads trying to reclaim concurrently.
+ */
+
+ /*
+ * umem calls the reclaim func when we destroy the buf cache,
+ * which is after we do arc_fini(). So we set a flag to prevent
+ * accessing the destroyed mutexes and lists.
+ */
+ if (arc_dead)
+ return;
+
+ mutex_enter(&arc_reclaim_lock);
+
+ /* remove 6.25% from each target, clamping c at c_min */
+ atomic_add_64(&arc.c, -(arc.c >> 4));
+ if (arc.c < arc.c_min)
+ arc.c = arc.c_min;
+ atomic_add_64(&arc.p, -(arc.p >> 4));
+
+ arc_adjust();
+
+ /* Cool it for a while */
+ arc.incr = 0;
+ arc.size_check = arc_size_check_default << 3;
+
+ mutex_exit(&arc_reclaim_lock);
+}
+
+/*
+ * Decide whether the system is under enough memory pressure that the
+ * ARC should shrink.  In userland builds (no _KERNEL) this instead
+ * returns true randomly ~1% of the time, for test coverage.
+ */
+static int
+arc_reclaim_needed(void)
+{
+ uint64_t extra;
+
+#ifdef _KERNEL
+ /*
+ * take 'desfree' extra pages, so we reclaim sooner, rather than later
+ */
+ extra = desfree;
+
+ /*
+ * check that we're out of range of the pageout scanner. It starts to
+ * schedule paging if freemem is less than lotsfree and needfree.
+ * lotsfree is the high-water mark for pageout, and needfree is the
+ * number of needed free pages. We add extra pages here to make sure
+ * the scanner doesn't start up while we're freeing memory.
+ */
+ if (freemem < lotsfree + needfree + extra)
+ return (1);
+
+ /*
+ * check to make sure that swapfs has enough space so that anon
+ * reservations can still succeed. anon_resvmem() checks that the
+ * availrmem is greater than swapfs_minfree, and the number of reserved
+ * swap pages. We also add a bit of extra here just to prevent
+ * circumstances from getting really dire.
+ */
+ if (availrmem < swapfs_minfree + swapfs_reserve + extra)
+ return (1);
+
+ /*
+ * If we're on an i386 platform, it's possible that we'll exhaust the
+ * kernel heap space before we ever run out of available physical
+ * memory. Most checks of the size of the heap_area compare against
+ * tune.t_minarmem, which is the minimum available real memory that we
+ * can have in the system. However, this is generally fixed at 25 pages
+ * which is so low that it's useless. In this comparison, we seek to
+ * calculate the total heap-size, and reclaim if more than 3/4ths of the
+ * heap is allocated. (Or, in the calculation, if less than 1/4th is
+ * free)
+ */
+#if defined(__i386)
+ if (btop(vmem_size(heap_arena, VMEM_FREE)) <
+ (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
+ return (1);
+#endif
+
+#else
+ if (spa_get_random(100) == 0)
+ return (1);
+#endif
+ return (0);
+}
+
+/*
+ * Reap free buffers from the kmem caches backing the ARC.  An
+ * aggressive strategy also reaps the header cache, whose reclaim
+ * callback (arc_kmem_reclaim) shrinks the cache targets as well.
+ */
+static void
+arc_kmem_reap_now(arc_reclaim_strategy_t strat)
+{
+ size_t i;
+ kmem_cache_t *prev_cache = NULL;
+ extern kmem_cache_t *zio_buf_cache[];
+
+ /*
+ * an aggressive reclamation will shrink the cache size as well as reap
+ * free kmem buffers. The arc_kmem_reclaim function is called when the
+ * header-cache is reaped, so we only reap the header cache if we're
+ * performing an aggressive reclaim. If we're not, just clean the kmem
+ * buffer caches.
+ */
+ if (strat == ARC_RECLAIM_AGGR)
+ kmem_cache_reap_now(hdr_cache);
+
+ kmem_cache_reap_now(buf_cache);
+
+ /* adjacent sizes may share a cache; skip duplicates */
+ for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
+ if (zio_buf_cache[i] != prev_cache) {
+ prev_cache = zio_buf_cache[i];
+ kmem_cache_reap_now(zio_buf_cache[i]);
+ }
+ }
+}
+
+/*
+ * Reclaim daemon.  Wakes at least once a second (or when signalled on
+ * arc_reclaim_thr_cv), reaps the kmem caches while memory is tight --
+ * alternating conservative and aggressive passes -- and re-enables
+ * cache growth once arc_grow_retry seconds have passed without
+ * pressure.  Exits when arc_fini() sets arc_thread_exit.
+ */
+static void
+arc_reclaim_thread(void)
+{
+ clock_t growtime = 0;
+ arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
+ callb_cpr_t cpr;
+
+ CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
+
+ mutex_enter(&arc_reclaim_thr_lock);
+ while (arc_thread_exit == 0) {
+ if (arc_reclaim_needed()) {
+
+ if (arc.no_grow) {
+ /* already throttled: alternate strategies */
+ if (last_reclaim == ARC_RECLAIM_CONS) {
+ last_reclaim = ARC_RECLAIM_AGGR;
+ } else {
+ last_reclaim = ARC_RECLAIM_CONS;
+ }
+ } else {
+ /* first sign of pressure: stop growing */
+ arc.no_grow = TRUE;
+ last_reclaim = ARC_RECLAIM_AGGR;
+ membar_producer();
+ }
+
+ /* reset the growth delay for every reclaim */
+ growtime = lbolt + (arc_grow_retry * hz);
+
+ arc_kmem_reap_now(last_reclaim);
+
+ } else if ((growtime > 0) && ((growtime - lbolt) <= 0)) {
+ /* pressure has subsided long enough; allow growth */
+ arc.no_grow = FALSE;
+ }
+
+ /* block until needed, or one second, whichever is shorter */
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_timedwait(&arc_reclaim_thr_cv,
+ &arc_reclaim_thr_lock, (lbolt + hz));
+ CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
+ }
+
+ /* let arc_fini() know we're gone */
+ arc_thread_exit = 0;
+ cv_broadcast(&arc_reclaim_thr_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
+ thread_exit();
+}
+
+/*
+ * Possibly grow the target cache size (arc.c) in response to a new
+ * allocation of 'bytes' bytes.  We grow if we're within
+ * (2 * maxblocksize) of the target, or already over it; we don't grow
+ * if the system needs memory (in which case the reclaim thread is
+ * poked instead) or growth has been disabled by the reclaim thread.
+ */
+static void
+arc_try_grow(int64_t bytes)
+{
+	/* count this allocation attempt */
+	atomic_add_64((uint64_t *)&arc.size_check, 1);
+
+	if (arc_reclaim_needed()) {
+		/* memory is tight: reclaim instead of growing */
+		cv_signal(&arc_reclaim_thr_cv);
+		return;
+	}
+
+	if (arc.no_grow)
+		return;
+
+	/*
+	 * Grow when we're close to (or past) the target cache size.
+	 * The two conditions formerly had duplicated, byte-identical
+	 * branch bodies; they are merged here.  Note that if
+	 * arc.size > arc.c the unsigned subtraction wraps and the
+	 * first test is false, so the second test is what fires.
+	 */
+	if ((arc.c - arc.size) <= (2ULL << SPA_MAXBLOCKSHIFT) ||
+	    arc.size > arc.c) {
+		if (arc.size_check > 0) {
+			arc.size_check = arc_size_check_default;
+			atomic_add_64(&arc.incr, arc_incr_size);
+		}
+		/* bump c (clamped at c_max); track p along with it */
+		atomic_add_64(&arc.c, MIN(bytes, arc.incr));
+		if (arc.c > arc.c_max)
+			arc.c = arc.c_max;
+		else
+			atomic_add_64(&arc.p, MIN(bytes, arc.incr));
+	}
+}
+
+/*
+ * check if the cache has reached its limits and eviction is required
+ * prior to insert.  In this situation, we want to evict if no_grow is
+ * set.  Otherwise, the cache is either big enough that we can insert,
+ * or an arc_try_grow will result in more space being made available.
+ */
+
+static int
+arc_evict_needed()
+{
+	if (arc_reclaim_needed())
+		return (1);
+
+	/* evict when growth is disabled or we're already over target */
+	return (arc.no_grow || (arc.c > arc.c_max) || (arc.size > arc.c));
+}
+
+/*
+ * The state, supplied as the first argument, is going to have something
+ * inserted on its behalf. So, determine which cache must be victimized to
+ * satisfy an insertion for this state. We have the following cases:
+ *
+ * 1. Insert for MRU, p > sizeof(arc.anon + arc.mru_top) ->
+ * In this situation if we're out of space, but the resident size of the MFU is
+ * under the limit, victimize the MFU cache to satisfy this insertion request.
+ *
+ * 2. Insert for MRU, p <= sizeof(arc.anon + arc.mru_top) ->
+ * Here, we've used up all of the available space for the MRU, so we need to
+ * evict from our own cache instead.  Evict from the set of resident MRU
+ * entries.
+ *
+ * 3. Insert for MFU (c - p) > sizeof(arc.mfu_top) ->
+ * c minus p represents the MFU space in the cache, since p is the size of the
+ * cache that is dedicated to the MRU.  In this situation there's still space on
+ * the MFU side, so the MRU side needs to be victimized.
+ *
+ * 4. Insert for MFU (c - p) < sizeof(arc.mfu_top) ->
+ * MFU's resident set is consuming more space than it has been allotted.  In
+ * this situation, we must victimize our own cache, the MFU, for this insertion.
+ */
+static void
+arc_evict_for_state(arc_state_t *state, uint64_t bytes)
+{
+	arc_state_t *victim;
+	uint64_t evicted;
+
+	ASSERT(state == arc.mru_top || state == arc.mfu_top);
+
+	/*
+	 * The four cases above reduce to choosing a victim state; the
+	 * eviction itself (and the fallback arc_adjust() when the
+	 * victim couldn't supply enough bytes) is identical for all.
+	 */
+	if (state == arc.mru_top) {
+		uint64_t mru_used = arc.anon->size + arc.mru_top->size;
+
+		/* case 1 if the MRU still fits in p, else case 2 */
+		victim = (arc.p > mru_used) ? arc.mfu_top : arc.mru_top;
+	} else {
+		uint64_t mfu_space = arc.c - arc.p;
+
+		/* case 3 if the MFU still fits in c - p, else case 4 */
+		victim = (mfu_space > arc.mfu_top->size) ?
+		    arc.mru_top : arc.mfu_top;
+	}
+
+	evicted = arc_evict_state(victim, bytes);
+	if (evicted < bytes) {
+		arc_adjust();
+	}
+}
+
+/*
+ * This routine is called whenever a buffer is accessed.
+ *
+ * It implements the ARC state transitions: anon -> mru_top on first
+ * insert, mru_top -> mfu_top on re-access, and resurrection from the
+ * ghost lists (mru_bot/mfu_bot) back to a resident state, adapting
+ * the MRU target size p as ghost hits occur.  Called with the
+ * buffer's hash lock held.
+ */
+static void
+arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
+{
+ int blksz, mult;
+
+ ASSERT(MUTEX_HELD(hash_lock));
+
+ blksz = buf->b_size;
+
+ if (buf->b_state == arc.anon) {
+ /*
+ * This buffer is not in the cache, and does not
+ * appear in our "ghost" list. Add the new buffer
+ * to the MRU state.
+ */
+
+ arc_try_grow(blksz);
+ if (arc_evict_needed()) {
+ arc_evict_for_state(arc.mru_top, blksz);
+ }
+
+ ASSERT(buf->b_arc_access == 0);
+ buf->b_arc_access = lbolt;
+ DTRACE_PROBE1(new_state__mru_top, arc_buf_hdr_t *,
+ buf);
+ arc_change_state(arc.mru_top, buf, hash_lock);
+
+ /*
+ * If we are using less than 2/3 of our total target
+ * cache size, bump up the target size for the MRU
+ * list.
+ */
+ if (arc.size < arc.c*2/3) {
+ arc.p = arc.anon->size + arc.mru_top->size + arc.c/6;
+ }
+
+ } else if (buf->b_state == arc.mru_top) {
+ /*
+ * If this buffer is in the MRU-top state and has the prefetch
+ * flag, the first read was actually part of a prefetch. In
+ * this situation, we simply want to clear the flag and return.
+ * A subsequent access should bump this into the MFU state.
+ */
+ if ((buf->b_flags & ARC_PREFETCH) != 0) {
+ buf->b_flags &= ~ARC_PREFETCH;
+ atomic_add_64(&arc.mru_top->hits, 1);
+ return;
+ }
+
+ /*
+ * This buffer has been "accessed" only once so far,
+ * but it is still in the cache. Move it to the MFU
+ * state.
+ */
+ if (lbolt > buf->b_arc_access + ARC_MINTIME) {
+ /*
+ * More than 125ms have passed since we
+ * instantiated this buffer. Move it to the
+ * most frequently used state.
+ */
+ buf->b_arc_access = lbolt;
+ DTRACE_PROBE1(new_state__mfu_top,
+ arc_buf_hdr_t *, buf);
+ arc_change_state(arc.mfu_top, buf, hash_lock);
+ }
+ atomic_add_64(&arc.mru_top->hits, 1);
+ } else if (buf->b_state == arc.mru_bot) {
+ arc_state_t *new_state;
+ /*
+ * This buffer has been "accessed" recently, but
+ * was evicted from the cache. Move it to the
+ * MFU state.
+ */
+
+ if (buf->b_flags & ARC_PREFETCH) {
+ new_state = arc.mru_top;
+ DTRACE_PROBE1(new_state__mru_top,
+ arc_buf_hdr_t *, buf);
+ } else {
+ new_state = arc.mfu_top;
+ DTRACE_PROBE1(new_state__mfu_top,
+ arc_buf_hdr_t *, buf);
+ }
+
+ arc_try_grow(blksz);
+ if (arc_evict_needed()) {
+ arc_evict_for_state(new_state, blksz);
+ }
+
+ /*
+ * Bump up the target size of the MRU list.
+ * The divisor is safe here: this buffer resides on mru_bot,
+ * so mru_bot->size should be nonzero -- TODO confirm the
+ * size accounting guarantees this.
+ */
+ mult = ((arc.mru_bot->size >= arc.mfu_bot->size) ?
+ 1 : (arc.mfu_bot->size/arc.mru_bot->size));
+ arc.p = MIN(arc.c, arc.p + blksz * mult);
+
+ buf->b_arc_access = lbolt;
+ arc_change_state(new_state, buf, hash_lock);
+
+ atomic_add_64(&arc.mru_bot->hits, 1);
+ } else if (buf->b_state == arc.mfu_top) {
+ /*
+ * This buffer has been accessed more than once and is
+ * still in the cache. Keep it in the MFU state.
+ *
+ * NOTE: the add_reference() that occurred when we did
+ * the arc_read() should have kicked this off the list,
+ * so even if it was a prefetch, it will be put back at
+ * the head of the list when we remove_reference().
+ */
+ atomic_add_64(&arc.mfu_top->hits, 1);
+ } else if (buf->b_state == arc.mfu_bot) {
+ /*
+ * This buffer has been accessed more than once but has
+ * been evicted from the cache. Move it back to the
+ * MFU state.
+ */
+
+ arc_try_grow(blksz);
+ if (arc_evict_needed()) {
+ arc_evict_for_state(arc.mfu_top, blksz);
+ }
+
+ /* Bump up the target size for the MFU list (shrink p) */
+ mult = ((arc.mfu_bot->size >= arc.mru_bot->size) ?
+ 1 : (arc.mru_bot->size/arc.mfu_bot->size));
+ arc.p = MAX(0, (int64_t)arc.p - blksz * mult);
+
+ buf->b_arc_access = lbolt;
+ DTRACE_PROBE1(new_state__mfu_top,
+ arc_buf_hdr_t *, buf);
+ arc_change_state(arc.mfu_top, buf, hash_lock);
+
+ atomic_add_64(&arc.mfu_bot->hits, 1);
+ } else {
+ ASSERT(!"invalid arc state");
+ }
+
+}
+
+/* a generic arc_done_func_t which you can use */
+/* ARGSUSED */
+void
+arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+	uint64_t len = buf->b_hdr->b_size;
+
+	/* copy the data into the caller's buffer, then drop our hold */
+	bcopy(buf->b_data, arg, len);
+	arc_buf_free(buf, arg);
+}
+
+/* a generic arc_done_func_t which you can use */
+void
+arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+	arc_buf_t **bufp = arg;
+
+	if (zio == NULL || zio->io_error == 0) {
+		/* success (or a cache hit, where zio is NULL) */
+		*bufp = buf;
+	} else {
+		/* the read failed: release the buf, hand back NULL */
+		arc_buf_free(buf, arg);
+		*bufp = NULL;
+	}
+}
+
+/*
+ * zio completion callback for arc_read().  Byteswaps the data if
+ * necessary, hands a buffer to each callback queued on the header
+ * (copying the data for all but the first), moves the header to its
+ * new state, and frees the header if the block was arc_free()d while
+ * the read was in flight.
+ */
+static void
+arc_read_done(zio_t *zio)
+{
+ arc_buf_hdr_t *hdr;
+ arc_buf_t *buf;
+ arc_buf_t *abuf; /* buffer we're assigning to callback */
+ kmutex_t *hash_lock;
+ arc_callback_t *callback_list, *acb;
+ int freeable = FALSE;
+
+ buf = zio->io_private;
+ hdr = buf->b_hdr;
+
+ if (!HDR_FREED_IN_READ(hdr)) {
+ arc_buf_hdr_t *found;
+
+ found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
+ &hash_lock);
+
+ /*
+ * Buffer was inserted into hash-table and removed from lists
+ * prior to starting I/O. We should find this header, since
+ * it's in the hash table, and it should be legit since it's
+ * not possible to evict it during the I/O.
+ */
+
+ ASSERT(found);
+ ASSERT(DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)));
+ }
+
+ /* byteswap if necessary */
+ callback_list = hdr->b_acb;
+ ASSERT(callback_list != NULL);
+ if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
+ callback_list->acb_byteswap(buf->b_data, hdr->b_size);
+
+ /* create copies of the data buffer for the callers */
+ abuf = buf;
+ for (acb = callback_list; acb; acb = acb->acb_next) {
+ if (acb->acb_done) {
+ /* the first caller gets 'buf'; the rest get copies */
+ if (abuf == NULL) {
+ abuf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+ abuf->b_data = zio_buf_alloc(hdr->b_size);
+ atomic_add_64(&arc.size, hdr->b_size);
+ bcopy(buf->b_data, abuf->b_data, hdr->b_size);
+ abuf->b_hdr = hdr;
+ abuf->b_next = hdr->b_buf;
+ hdr->b_buf = abuf;
+ atomic_add_64(&hdr->b_state->size, hdr->b_size);
+ }
+ acb->acb_buf = abuf;
+ abuf = NULL;
+ } else {
+ /*
+ * The caller did not provide a callback function.
+ * In this case, we should just remove the reference.
+ */
+ if (HDR_FREED_IN_READ(hdr)) {
+ ASSERT3P(hdr->b_state, ==, arc.anon);
+ (void) refcount_remove(&hdr->b_refcnt,
+ acb->acb_private);
+ } else {
+ (void) remove_reference(hdr, hash_lock,
+ acb->acb_private);
+ }
+ }
+ }
+ hdr->b_acb = NULL;
+ hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+
+ ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
+
+ if (zio->io_error != 0) {
+ /* failed read: pull the header out of the cache */
+ hdr->b_flags |= ARC_IO_ERROR;
+ if (hdr->b_state != arc.anon)
+ arc_change_state(arc.anon, hdr, hash_lock);
+ freeable = refcount_is_zero(&hdr->b_refcnt);
+ }
+
+ if (!HDR_FREED_IN_READ(hdr)) {
+ /*
+ * Only call arc_access on anonymous buffers. This is because
+ * if we've issued an I/O for an evicted buffer, we've already
+ * called arc_access (to prevent any simultaneous readers from
+ * getting confused).
+ */
+ if (zio->io_error == 0 && hdr->b_state == arc.anon)
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+ } else {
+ /*
+ * This block was freed while we waited for the read to
+ * complete. It has been removed from the hash table and
+ * moved to the anonymous state (so that it won't show up
+ * in the cache).
+ */
+ ASSERT3P(hdr->b_state, ==, arc.anon);
+ freeable = refcount_is_zero(&hdr->b_refcnt);
+ }
+
+ /* wake up any ARC_WAIT readers blocked in arc_read() */
+ cv_broadcast(&hdr->b_cv);
+
+ /* execute each callback and free its structure */
+ while ((acb = callback_list) != NULL) {
+ if (acb->acb_done)
+ acb->acb_done(zio, acb->acb_buf, acb->acb_private);
+
+ if (acb->acb_zio_dummy != NULL) {
+ /* propagate our error to the waiting dummy zio */
+ acb->acb_zio_dummy->io_error = zio->io_error;
+ zio_nowait(acb->acb_zio_dummy);
+ }
+
+ callback_list = acb->acb_next;
+ kmem_free(acb, sizeof (arc_callback_t));
+ }
+
+ if (freeable)
+ arc_hdr_free(hdr);
+}
+
+/*
+ * "Read" the block at the specified DVA (in bp) via the
+ * cache. If the block is found in the cache, invoke the provided
+ * callback immediately and return. Note that the `zio' parameter
+ * in the callback will be NULL in this case, since no IO was
+ * required. If the block is not in the cache pass the read request
+ * on to the spa with a substitute callback function, so that the
+ * requested block will be added to the cache.
+ *
+ * If a read request arrives for a block that has a read in-progress,
+ * either wait for the in-progress read to complete (and return the
+ * results); or, if this is a read with a "done" func, add a record
+ * to the read to invoke the "done" func when the read completes,
+ * and return; or just return.
+ *
+ * arc_read_done() will invoke all the requested "done" functions
+ * for readers of this block.
+ */
+int
+arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
+ arc_done_func_t *done, void *private, int priority, int flags,
+ uint32_t arc_flags)
+{
+ arc_buf_hdr_t *hdr;
+ arc_buf_t *buf;
+ kmutex_t *hash_lock;
+ zio_t *rzio;
+
+top:
+ hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+ if (hdr && hdr->b_buf) {
+
+ /* cache hit (possibly with the read still in flight) */
+ ASSERT((hdr->b_state == arc.mru_top) ||
+ (hdr->b_state == arc.mfu_top) ||
+ ((hdr->b_state == arc.anon) &&
+ (HDR_IO_IN_PROGRESS(hdr))));
+
+ if (HDR_IO_IN_PROGRESS(hdr)) {
+
+ if ((arc_flags & ARC_NOWAIT) && done) {
+ /* queue our callback on the in-flight read */
+ arc_callback_t *acb = NULL;
+
+ acb = kmem_zalloc(sizeof (arc_callback_t),
+ KM_SLEEP);
+ acb->acb_done = done;
+ acb->acb_private = private;
+ acb->acb_byteswap = swap;
+ if (pio != NULL)
+ acb->acb_zio_dummy = zio_null(pio,
+ spa, NULL, NULL, flags);
+
+ ASSERT(acb->acb_done != NULL);
+ acb->acb_next = hdr->b_acb;
+ hdr->b_acb = acb;
+ add_reference(hdr, hash_lock, private);
+ mutex_exit(hash_lock);
+ return (0);
+ } else if (arc_flags & ARC_WAIT) {
+ /* block until the read completes, then retry */
+ cv_wait(&hdr->b_cv, hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+
+ mutex_exit(hash_lock);
+ return (0);
+ }
+
+ /*
+ * If there is already a reference on this block, create
+ * a new copy of the data so that we will be guaranteed
+ * that arc_release() will always succeed.
+ */
+
+ if (done)
+ add_reference(hdr, hash_lock, private);
+ if (done && refcount_count(&hdr->b_refcnt) > 1) {
+ buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+ buf->b_data = zio_buf_alloc(hdr->b_size);
+ ASSERT3U(refcount_count(&hdr->b_refcnt), >, 1);
+ atomic_add_64(&arc.size, hdr->b_size);
+ bcopy(hdr->b_buf->b_data, buf->b_data, hdr->b_size);
+ buf->b_hdr = hdr;
+ buf->b_next = hdr->b_buf;
+ hdr->b_buf = buf;
+ atomic_add_64(&hdr->b_state->size, hdr->b_size);
+ } else {
+ buf = hdr->b_buf;
+ }
+ DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+ atomic_add_64(&arc.hits, 1);
+ if (done)
+ done(NULL, buf, private);
+ } else {
+ /* cache miss (or ghost hit: header only, no data) */
+ uint64_t size = BP_GET_LSIZE(bp);
+ arc_callback_t *acb;
+
+ if (hdr == NULL) {
+ /* this block is not in the cache */
+ arc_buf_hdr_t *exists;
+
+ buf = arc_buf_alloc(spa, size, private);
+ hdr = buf->b_hdr;
+ hdr->b_dva = *BP_IDENTITY(bp);
+ hdr->b_birth = bp->blk_birth;
+ hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
+ exists = buf_hash_insert(hdr, &hash_lock);
+ if (exists) {
+ /* somebody beat us to the hash insert */
+ mutex_exit(hash_lock);
+ bzero(&hdr->b_dva, sizeof (dva_t));
+ hdr->b_birth = 0;
+ hdr->b_cksum0 = 0;
+ arc_buf_free(buf, private);
+ goto top; /* restart the IO request */
+ }
+
+ } else {
+ /* this block is in the ghost cache */
+ ASSERT((hdr->b_state == arc.mru_bot) ||
+ (hdr->b_state == arc.mfu_bot));
+ add_reference(hdr, hash_lock, private);
+
+ /* reattach a data buffer to the ghost header */
+ buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+ buf->b_data = zio_buf_alloc(hdr->b_size);
+ atomic_add_64(&arc.size, hdr->b_size);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
+ buf->b_hdr = hdr;
+ buf->b_next = NULL;
+ hdr->b_buf = buf;
+ }
+
+ acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
+ acb->acb_done = done;
+ acb->acb_private = private;
+ acb->acb_byteswap = swap;
+
+ ASSERT(hdr->b_acb == NULL);
+ hdr->b_acb = acb;
+
+ /*
+ * If this DVA is part of a prefetch, mark the buf
+ * header with the prefetch flag
+ */
+ if (arc_flags & ARC_PREFETCH)
+ hdr->b_flags |= ARC_PREFETCH;
+ hdr->b_flags |= ARC_IO_IN_PROGRESS;
+
+ /*
+ * If the buffer has been evicted, migrate it to a present state
+ * before issuing the I/O. Once we drop the hash-table lock,
+ * the header will be marked as I/O in progress and have an
+ * attached buffer. At this point, anybody who finds this
+ * buffer ought to notice that it's legit but has a pending I/O.
+ */
+
+ if ((hdr->b_state == arc.mru_bot) ||
+ (hdr->b_state == arc.mfu_bot))
+ arc_access(hdr, hash_lock);
+
+ mutex_exit(hash_lock);
+
+ ASSERT3U(hdr->b_size, ==, size);
+ DTRACE_PROBE2(arc__miss, blkptr_t *, bp,
+ uint64_t, size);
+ atomic_add_64(&arc.misses, 1);
+ rzio = zio_read(pio, spa, bp, buf->b_data, size,
+ arc_read_done, buf, priority, flags);
+
+ if (arc_flags & ARC_WAIT)
+ return (zio_wait(rzio));
+
+ ASSERT(arc_flags & ARC_NOWAIT);
+ zio_nowait(rzio);
+ }
+ return (0);
+}
+
+/*
+ * arc_read() variant to support pool traversal.  If the block is already
+ * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
+ * The idea is that we don't want pool traversal filling up memory, but
+ * if the ARC already has the data anyway, we shouldn't pay for the I/O.
+ */
+int
+arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
+{
+	arc_buf_hdr_t *hdr;
+	kmutex_t *hash_mtx;
+	int error = ENOENT;
+
+	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
+
+	/* only serve the data if it's fully present in the cache */
+	if (hdr != NULL && hdr->b_buf != NULL && !HDR_IO_IN_PROGRESS(hdr)) {
+		bcopy(hdr->b_buf->b_data, data, hdr->b_size);
+		error = 0;
+	}
+
+	if (hash_mtx != NULL)
+		mutex_exit(hash_mtx);
+
+	return (error);
+}
+
+/*
+ * Release this buffer from the cache. This must be done
+ * after a read and prior to modifying the buffer contents.
+ * If the buffer has more than one reference, we must
+ * make a new hdr for the buffer.
+ */
+void
+arc_release(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+
+ /* this buffer is not on any list */
+ ASSERT(refcount_count(&hdr->b_refcnt) > 0);
+
+ if (hdr->b_state == arc.anon) {
+ /* this buffer is already released */
+ ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
+ ASSERT(BUF_EMPTY(hdr));
+ return;
+ }
+
+ mutex_enter(hash_lock);
+
+ if (refcount_count(&hdr->b_refcnt) > 1) {
+ arc_buf_hdr_t *nhdr;
+ arc_buf_t **bufp;
+ uint64_t blksz = hdr->b_size;
+ spa_t *spa = hdr->b_spa;
+
+ /*
+ * Pull the data off of this buf and attach it to
+ * a new anonymous buf.
+ */
+ bufp = &hdr->b_buf;
+ while (*bufp != buf) {
+ ASSERT(*bufp);
+ bufp = &(*bufp)->b_next;
+ }
+ /* unlink 'buf' from the old header's buffer list */
+ *bufp = (*bufp)->b_next;
+ (void) refcount_remove(&hdr->b_refcnt, tag);
+ ASSERT3U(hdr->b_state->size, >=, hdr->b_size);
+ atomic_add_64(&hdr->b_state->size, -hdr->b_size);
+ mutex_exit(hash_lock);
+
+ /* build a fresh anonymous header for this buf */
+ nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
+ nhdr->b_size = blksz;
+ nhdr->b_spa = spa;
+ nhdr->b_buf = buf;
+ nhdr->b_state = arc.anon;
+ nhdr->b_arc_access = 0;
+ nhdr->b_flags = 0;
+ buf->b_hdr = nhdr;
+ buf->b_next = NULL;
+ (void) refcount_add(&nhdr->b_refcnt, tag);
+ atomic_add_64(&arc.anon->size, blksz);
+
+ hdr = nhdr;
+ } else {
+ /* sole reference: just move the header to anon */
+ ASSERT(!list_link_active(&hdr->b_arc_node));
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ arc_change_state(arc.anon, hdr, hash_lock);
+ hdr->b_arc_access = 0;
+ mutex_exit(hash_lock);
+ /* clear the identity; anon buffers aren't in the hash table */
+ bzero(&hdr->b_dva, sizeof (dva_t));
+ hdr->b_birth = 0;
+ hdr->b_cksum0 = 0;
+ }
+}
+
+/*
+ * Return nonzero if the buffer has been released (is anonymous).
+ */
+int
+arc_released(arc_buf_t *buf)
+{
+	arc_state_t *state = buf->b_hdr->b_state;
+
+	return (state == arc.anon);
+}
+
+/*
+ * zio completion callback for arc_write().  Fills in the header's
+ * identity from the resulting block pointer and inserts it into the
+ * hash table (destroying any existing identical header, which can
+ * only exist due to sync-to-convergence overwrite), then invokes the
+ * caller's done callback.
+ */
+static void
+arc_write_done(zio_t *zio)
+{
+ arc_buf_t *buf;
+ arc_buf_hdr_t *hdr;
+ arc_callback_t *acb;
+
+ buf = zio->io_private;
+ hdr = buf->b_hdr;
+ acb = hdr->b_acb;
+ hdr->b_acb = NULL;
+
+ /* this buffer is on no lists and is not in the hash table */
+ ASSERT3P(hdr->b_state, ==, arc.anon);
+
+ hdr->b_dva = *BP_IDENTITY(zio->io_bp);
+ hdr->b_birth = zio->io_bp->blk_birth;
+ hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
+ /* clear the "in-write" flag */
+ hdr->b_hash_next = NULL;
+ /* This write may be all-zero */
+ if (!BUF_EMPTY(hdr)) {
+ arc_buf_hdr_t *exists;
+ kmutex_t *hash_lock;
+
+ exists = buf_hash_insert(hdr, &hash_lock);
+ if (exists) {
+ /*
+ * This can only happen if we overwrite for
+ * sync-to-convergence, because we remove
+ * buffers from the hash table when we arc_free().
+ */
+ ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
+ BP_IDENTITY(zio->io_bp)));
+ ASSERT3U(zio->io_bp_orig.blk_birth, ==,
+ zio->io_bp->blk_birth);
+
+ /* destroy the stale header and insert ours */
+ ASSERT(refcount_is_zero(&exists->b_refcnt));
+ arc_change_state(arc.anon, exists, hash_lock);
+ mutex_exit(hash_lock);
+ arc_hdr_free(exists);
+ exists = buf_hash_insert(hdr, &hash_lock);
+ ASSERT3P(exists, ==, NULL);
+ }
+ /* move the freshly-written buffer into the cache proper */
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+ }
+ if (acb && acb->acb_done) {
+ ASSERT(!refcount_is_zero(&hdr->b_refcnt));
+ acb->acb_done(zio, buf, acb->acb_private);
+ }
+
+ if (acb)
+ kmem_free(acb, sizeof (arc_callback_t));
+}
+
+/*
+ * Write the given (released/anonymous) buffer out via the zio
+ * pipeline.  arc_write_done() will insert the result into the cache
+ * and invoke 'done'.  Honors ARC_WAIT/ARC_NOWAIT.
+ */
+int
+arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+    uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
+    arc_done_func_t *done, void *private, int priority, int flags,
+    uint32_t arc_flags)
+{
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+	arc_callback_t *acb;
+	zio_t *wzio;
+
+	/* this is a private buffer - no locking required */
+	ASSERT3P(hdr->b_state, ==, arc.anon);
+	ASSERT(BUF_EMPTY(hdr));
+	ASSERT(!HDR_IO_ERROR(hdr));
+
+	/* record the caller's callback for arc_write_done() */
+	acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
+	acb->acb_byteswap = (arc_byteswap_func_t *)-1;
+	acb->acb_private = private;
+	acb->acb_done = done;
+	hdr->b_acb = acb;
+
+	wzio = zio_write(pio, spa, checksum, compress, txg, bp,
+	    buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags);
+
+	if (arc_flags & ARC_WAIT)
+		return (zio_wait(wzio));
+
+	ASSERT(arc_flags & ARC_NOWAIT);
+	zio_nowait(wzio);
+
+	return (0);
+}
+
+/*
+ * Free the block described by bp.  If the block is cached, detach it
+ * from the cache: destroy the header if it is unreferenced, or mark it
+ * anonymous (and FREED_IN_READ if a read is in flight) otherwise.
+ * Then issue the zio free, honoring ARC_WAIT/ARC_NOWAIT.
+ */
+int
+arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ zio_done_func_t *done, void *private, uint32_t arc_flags)
+{
+ arc_buf_hdr_t *ab;
+ kmutex_t *hash_lock;
+ zio_t *zio;
+
+ /*
+ * If this buffer is in the cache, release it, so it
+ * can be re-used.
+ */
+ ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+ if (ab != NULL) {
+ /*
+ * The checksum of blocks to free is not always
+ * preserved (eg. on the deadlist). However, if it is
+ * nonzero, it should match what we have in the cache.
+ */
+ ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
+ ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
+ arc_change_state(arc.anon, ab, hash_lock);
+ if (refcount_is_zero(&ab->b_refcnt)) {
+ /* no holders: destroy the header now */
+ mutex_exit(hash_lock);
+ arc_hdr_free(ab);
+ atomic_add_64(&arc.deleted, 1);
+ } else {
+ /* still referenced: strip its identity instead */
+ ASSERT3U(refcount_count(&ab->b_refcnt), ==, 1);
+ if (HDR_IO_IN_PROGRESS(ab))
+ ab->b_flags |= ARC_FREED_IN_READ;
+ ab->b_arc_access = 0;
+ bzero(&ab->b_dva, sizeof (dva_t));
+ ab->b_birth = 0;
+ ab->b_cksum0 = 0;
+ mutex_exit(hash_lock);
+ }
+ }
+
+ zio = zio_free(pio, spa, txg, bp, done, private);
+
+ if (arc_flags & ARC_WAIT)
+ return (zio_wait(zio));
+
+ ASSERT(arc_flags & ARC_NOWAIT);
+ zio_nowait(zio);
+
+ return (0);
+}
+
+/*
+ * Release a reservation previously taken with arc_tempreserve_space().
+ */
+void
+arc_tempreserve_clear(uint64_t tempreserve)
+{
+ atomic_add_64(&arc_tempreserve, -tempreserve);
+ ASSERT((int64_t)arc_tempreserve >= 0);
+}
+
+/*
+ * Reserve 'tempreserve' bytes of anonymous (dirty) buffer space,
+ * failing with ERESTART if the reservation would push anonymous data
+ * past 1/4 of the target cache size.  Pair with
+ * arc_tempreserve_clear().
+ */
+int
+arc_tempreserve_space(uint64_t tempreserve)
+{
+#ifdef ZFS_DEBUG
+ /*
+ * Once in a while, fail for no reason. Everything should cope.
+ */
+ if (spa_get_random(10000) == 0) {
+ dprintf("forcing random failure\n");
+ return (ERESTART);
+ }
+#endif
+ /*
+ * XXX This is kind of hacky. The limit should be adjusted
+ * dynamically to keep the time to sync a dataset fixed (around
+ * 1-5 seconds?).
+ * Maybe should have some sort of locking? If two requests come
+ * in concurrently, we might let them both succeed, when one of
+ * them should fail. Not a huge deal.
+ */
+
+ ASSERT3U(tempreserve, <, arc.c/4); /* otherwise we'll loop forever */
+
+ /*
+ * NOTE(review): the message below reports arc.anon->lsize while
+ * the check itself uses arc.anon->size -- confirm which is
+ * intended.
+ */
+ if (arc_tempreserve + tempreserve + arc.anon->size > arc.c / 4) {
+ dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
+ "tempreserve=%lluK arc.c=%lluK\n",
+ arc_tempreserve>>10, arc.anon->lsize>>10,
+ tempreserve>>10, arc.c>>10);
+ return (ERESTART);
+ }
+ atomic_add_64(&arc_tempreserve, tempreserve);
+ return (0);
+}
+
+/*
+ * Initialize the ARC: compute the cache size targets from physical
+ * memory, set up the state lists and kmem caches, and start the
+ * reclaim thread.
+ */
+void
+arc_init(void)
+{
+ mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
+
+ /* Start out with 1/8 of all memory */
+ arc.c = physmem * PAGESIZE / 8;
+
+#ifdef _KERNEL
+ /*
+ * On architectures where the physical memory can be larger
+ * than the addressable space (intel in 32-bit mode), we may
+ * need to limit the cache to 1/8 of VM size.
+ */
+ arc.c = MIN(arc.c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
+#endif
+
+ /* use at least 1/32 of all memory, or 64MB, whichever is more */
+ arc.c_min = MAX(arc.c / 4, 64<<20);
+ /* use at most 3/4 of all memory, or all but 1GB, whichever is more */
+ if (arc.c * 8 >= 1<<30)
+ arc.c_max = (arc.c * 8) - (1<<30);
+ else
+ arc.c_max = arc.c_min;
+ arc.c_max = MAX(arc.c * 6, arc.c_max);
+ arc.c = arc.c_max;
+ /* p (the MRU target) starts at half the cache */
+ arc.p = (arc.c >> 1);
+
+ /* if kmem_flags are set, lets try to use less memory */
+ if (kmem_debugging())
+ arc.c = arc.c / 2;
+ if (arc.c < arc.c_min)
+ arc.c = arc.c_min;
+
+ arc.anon = &ARC_anon;
+ arc.mru_top = &ARC_mru_top;
+ arc.mru_bot = &ARC_mru_bot;
+ arc.mfu_top = &ARC_mfu_top;
+ arc.mfu_bot = &ARC_mfu_bot;
+
+ /* anon buffers live on no list; the other four states each get one */
+ list_create(&arc.mru_top->list, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc.mru_bot->list, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc.mfu_top->list, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc.mfu_bot->list, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_arc_node));
+
+ buf_init();
+
+ arc_thread_exit = 0;
+
+ (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
+ TS_RUN, minclsyspri);
+}
+
+/*
+ * Tear down the ARC: stop the reclaim thread, flush all evictable
+ * buffers, and destroy the locks, lists, and buf/hdr caches.
+ */
+void
+arc_fini(void)
+{
+ /* ask the reclaim thread to exit and wait until it has */
+ mutex_enter(&arc_reclaim_thr_lock);
+ arc_thread_exit = 1;
+ while (arc_thread_exit != 0)
+ cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
+ mutex_exit(&arc_reclaim_thr_lock);
+
+ arc_flush();
+
+ /* tell any late reclaim callbacks that we're gone */
+ arc_dead = TRUE;
+
+ mutex_destroy(&arc_reclaim_lock);
+ mutex_destroy(&arc_reclaim_thr_lock);
+ cv_destroy(&arc_reclaim_thr_cv);
+
+ list_destroy(&arc.mru_top->list);
+ list_destroy(&arc.mru_bot->list);
+ list_destroy(&arc.mfu_top->list);
+ list_destroy(&arc.mfu_bot->list);
+
+ buf_fini();
+}
diff --git a/usr/src/uts/common/fs/zfs/bplist.c b/usr/src/uts/common/fs/zfs/bplist.c
new file mode 100644
index 0000000000..68f79ac5a2
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/bplist.c
@@ -0,0 +1,239 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/bplist.h>
+#include <sys/zfs_context.h>
+
+/*
+ * Ensure bpl_dbuf/bpl_phys are valid: on first use, take a tagged hold
+ * on the list object's bonus buffer and read it in.  Caller must hold
+ * bpl_lock.
+ */
+static void
+bplist_hold(bplist_t *bpl)
+{
+	ASSERT(MUTEX_HELD(&bpl->bpl_lock));
+	if (bpl->bpl_dbuf == NULL) {
+		bpl->bpl_dbuf = dmu_bonus_hold_tag(bpl->bpl_mos,
+		    bpl->bpl_object, bpl);
+		dmu_buf_read(bpl->bpl_dbuf);
+		bpl->bpl_phys = bpl->bpl_dbuf->db_data;
+	}
+}
+
+/*
+ * Allocate a new on-disk bplist object in the given meta-objset and
+ * return its object number.  The header is stored in the bonus buffer.
+ */
+uint64_t
+bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx)
+{
+	return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize,
+	    DMU_OT_BPLIST_HDR, sizeof (bplist_phys_t), tx));
+}
+
+/*
+ * Free the on-disk object backing a bplist.  The list must be closed.
+ */
+void
+bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx)
+{
+	VERIFY(dmu_object_free(mos, object, tx) == 0);
+}
+
+/*
+ * Bind an in-core bplist to an existing on-disk object, computing the
+ * shifts used to turn an entry index into a (block, offset) pair.  The
+ * dbuf and phys pointers are filled in lazily by bplist_hold().
+ */
+void
+bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
+{
+	dmu_object_info_t doi;
+
+	VERIFY(dmu_object_info(mos, object, &doi) == 0);
+
+	mutex_enter(&bpl->bpl_lock);
+
+	ASSERT(bpl->bpl_dbuf == NULL);
+	ASSERT(bpl->bpl_phys == NULL);
+	ASSERT(bpl->bpl_cached_dbuf == NULL);
+	ASSERT(bpl->bpl_queue == NULL);
+	ASSERT(object != 0);
+
+	bpl->bpl_mos = mos;
+	bpl->bpl_object = object;
+	/* log2 of the data block size (assumes it is a power of 2) */
+	bpl->bpl_blockshift = highbit(doi.doi_data_block_size - 1);
+	/* log2 of the number of blkptrs per data block */
+	bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT;
+
+	mutex_exit(&bpl->bpl_lock);
+}
+
+/*
+ * Release the holds taken on the list's bonus buffer and cached data
+ * block.  Any deferred entries must already have been written out by
+ * bplist_sync().
+ */
+void
+bplist_close(bplist_t *bpl)
+{
+	mutex_enter(&bpl->bpl_lock);
+
+	ASSERT(bpl->bpl_queue == NULL);
+
+	if (bpl->bpl_cached_dbuf) {
+		dmu_buf_rele(bpl->bpl_cached_dbuf);
+		bpl->bpl_cached_dbuf = NULL;
+	}
+	if (bpl->bpl_dbuf) {
+		/* matches the tagged hold taken in bplist_hold() */
+		dmu_buf_rele_tag(bpl->bpl_dbuf, bpl);
+		bpl->bpl_dbuf = NULL;
+		bpl->bpl_phys = NULL;
+	}
+
+	mutex_exit(&bpl->bpl_lock);
+}
+
+/*
+ * Return B_TRUE if the list has no entries.  A list that was never
+ * bound to an object (bpl_object == 0) is trivially empty.
+ */
+boolean_t
+bplist_empty(bplist_t *bpl)
+{
+	boolean_t rv;
+
+	if (bpl->bpl_object == 0)
+		return (B_TRUE);
+
+	mutex_enter(&bpl->bpl_lock);
+	bplist_hold(bpl);
+	rv = (bpl->bpl_phys->bpl_entries == 0);
+	mutex_exit(&bpl->bpl_lock);
+
+	return (rv);
+}
+
+/*
+ * Copy entry *itorp into *bp and advance the iterator.  Returns ENOENT
+ * once the iterator is past the last entry.  A hold on the most recently
+ * used data block is cached in bpl_cached_dbuf (released in
+ * bplist_close()) so sequential iteration re-reads each block only once.
+ */
+int
+bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
+{
+	uint64_t blk, off;
+	blkptr_t *bparray;
+	dmu_buf_t *db;
+
+	mutex_enter(&bpl->bpl_lock);
+	bplist_hold(bpl);
+
+	if (*itorp >= bpl->bpl_phys->bpl_entries) {
+		mutex_exit(&bpl->bpl_lock);
+		return (ENOENT);
+	}
+
+	/* split the entry index into a block number and an intra-block index */
+	blk = *itorp >> bpl->bpl_bpshift;
+	off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
+	db = bpl->bpl_cached_dbuf;
+
+	/* swap the cached dbuf if the entry lives in a different block */
+	if (db == NULL || db->db_offset != (blk << bpl->bpl_blockshift)) {
+		if (db != NULL)
+			dmu_buf_rele(db);
+		bpl->bpl_cached_dbuf = db = dmu_buf_hold(bpl->bpl_mos,
+		    bpl->bpl_object, blk << bpl->bpl_blockshift);
+	}
+
+	ASSERT3U(db->db_size, ==, 1ULL << bpl->bpl_blockshift);
+
+	dmu_buf_read(db);
+	bparray = db->db_data;
+	*bp = bparray[off];
+	(*itorp)++;
+	mutex_exit(&bpl->bpl_lock);
+	return (0);
+}
+
+/*
+ * Append *bp to the on-disk list, dirtying both the data block that
+ * receives the entry and the header (entry/byte counts) in tx.
+ */
+void
+bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx)
+{
+	uint64_t blk, off;
+	blkptr_t *bparray;
+	dmu_buf_t *db;
+
+	ASSERT(!BP_IS_HOLE(bp));
+	mutex_enter(&bpl->bpl_lock);
+	bplist_hold(bpl);
+
+	/* the next free slot is entry number bpl_entries */
+	blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
+	off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);
+	db = bpl->bpl_cached_dbuf;
+
+	/* swap the cached dbuf if the slot lives in a different block */
+	if (db == NULL || db->db_offset != (blk << bpl->bpl_blockshift)) {
+		if (db != NULL)
+			dmu_buf_rele(db);
+		bpl->bpl_cached_dbuf = db = dmu_buf_hold(bpl->bpl_mos,
+		    bpl->bpl_object, blk << bpl->bpl_blockshift);
+	}
+
+	ASSERT3U(db->db_size, ==, 1ULL << bpl->bpl_blockshift);
+
+	dmu_buf_will_dirty(db, tx);
+	bparray = db->db_data;
+	bparray[off] = *bp;
+
+	/* We never need the fill count. */
+	bparray[off].blk_fill = 0;
+
+	/* The bplist will compress better if we can leave off the checksum */
+	bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum));
+
+	dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
+	bpl->bpl_phys->bpl_entries++;
+	bpl->bpl_phys->bpl_bytes += BP_GET_ASIZE(bp);
+	mutex_exit(&bpl->bpl_lock);
+}
+
+/*
+ * Record a block pointer to be appended to the on-disk list later by
+ * bplist_sync(); entries are stacked on an in-core singly linked queue.
+ */
+void
+bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp)
+{
+	bplist_q_t *entry = kmem_alloc(sizeof (*entry), KM_SLEEP);
+
+	ASSERT(!BP_IS_HOLE(bp));
+	/* entry is still private to us, so it can be filled in unlocked */
+	entry->bpq_blk = *bp;
+	mutex_enter(&bpl->bpl_lock);
+	entry->bpq_next = bpl->bpl_queue;
+	bpl->bpl_queue = entry;
+	mutex_exit(&bpl->bpl_lock);
+}
+
+/*
+ * Flush the deferred queue built by bplist_enqueue_deferred() to disk.
+ * The lock is dropped around each bplist_enqueue() call, which retakes
+ * it internally.
+ */
+void
+bplist_sync(bplist_t *bpl, dmu_tx_t *tx)
+{
+	bplist_q_t *bpq;
+
+	mutex_enter(&bpl->bpl_lock);
+	while ((bpq = bpl->bpl_queue) != NULL) {
+		bpl->bpl_queue = bpq->bpq_next;
+		mutex_exit(&bpl->bpl_lock);
+		bplist_enqueue(bpl, &bpq->bpq_blk, tx);
+		kmem_free(bpq, sizeof (*bpq));
+		mutex_enter(&bpl->bpl_lock);
+	}
+	mutex_exit(&bpl->bpl_lock);
+}
+
+/*
+ * Discard every entry: free all of the list's data blocks and zero the
+ * header counts.  The deferred queue must already be empty.
+ */
+void
+bplist_vacate(bplist_t *bpl, dmu_tx_t *tx)
+{
+	mutex_enter(&bpl->bpl_lock);
+	ASSERT3P(bpl->bpl_queue, ==, NULL);
+	bplist_hold(bpl);
+	dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
+	/* free the entire range of the object's data blocks */
+	dmu_free_range(bpl->bpl_mos, bpl->bpl_object, 0, -1ULL, tx);
+	bpl->bpl_phys->bpl_entries = 0;
+	bpl->bpl_phys->bpl_bytes = 0;
+	mutex_exit(&bpl->bpl_lock);
+}
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
new file mode 100644
index 0000000000..e4b2d7f9e6
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -0,0 +1,2022 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_zfetch.h>
+
+static void dbuf_destroy(dmu_buf_impl_t *db);
+static void dbuf_verify(dmu_buf_impl_t *db);
+static void dbuf_evict_user(dmu_buf_impl_t *db);
+static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+static arc_done_func_t dbuf_read_done;
+static arc_done_func_t dbuf_write_done;
+
+/*
+ * Global data structures and functions for the dbuf cache.
+ */
+taskq_t *dbuf_tq;
+static kmem_cache_t *dbuf_cache;
+
+/* ARGSUSED */
+/*
+ * kmem cache constructor: zero the dbuf and initialize its mutex,
+ * condition variable, and hold refcount.
+ */
+static int
+dbuf_cons(void *vdb, void *unused, int kmflag)
+{
+	dmu_buf_impl_t *db = vdb;
+	bzero(db, sizeof (dmu_buf_impl_t));
+
+	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
+	refcount_create(&db->db_holds);
+	return (0);
+}
+
+/* ARGSUSED */
+/*
+ * kmem cache destructor: undo dbuf_cons().
+ */
+static void
+dbuf_dest(void *vdb, void *unused)
+{
+	dmu_buf_impl_t *db = vdb;
+	mutex_destroy(&db->db_mtx);
+	cv_destroy(&db->db_changed);
+	refcount_destroy(&db->db_holds);
+}
+
+/*
+ * dbuf hash table routines
+ */
+static dbuf_hash_table_t dbuf_hash_table;
+
+static uint64_t dbuf_hash_count;
+
+/*
+ * Hash an (objset, object, level, blkid) tuple to a 64-bit value using
+ * the ZFS CRC-64 table.  Only the low-order bytes of each field are fed
+ * through the CRC; higher-order bits are folded in with the final xor.
+ */
+static uint64_t
+dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
+{
+	uintptr_t osv = (uintptr_t)os;
+	uint64_t crc = -1ULL;
+
+	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
+
+	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
+
+	return (crc);
+}
+
+/*
+ * NB: no trailing semicolon in the DBUF_HASH expansion.  The macro is
+ * used in expression context ("uint64_t hv = DBUF_HASH(...);"); a
+ * semicolon inside the expansion only compiled by accident of being in
+ * initializer position, and would break any other expression use.
+ */
+#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
+
+/* true iff the dbuf identifies the same (objset, object, level, blkid) */
+#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
+	((dbuf)->db.db_object == (obj) &&		\
+	(dbuf)->db_objset == (os) &&			\
+	(dbuf)->db_level == (level) &&			\
+	(dbuf)->db_blkid == (blkid))
+
+/*
+ * Look up a dbuf in the global hash table.  If found with active holds,
+ * it is returned with its db_mtx HELD; otherwise NULL is returned.
+ * Entries with a zero hold count are skipped (they are being evicted).
+ */
+dmu_buf_impl_t *
+dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
+{
+	dbuf_hash_table_t *h = &dbuf_hash_table;
+	objset_impl_t *os = dn->dn_objset;
+	uint64_t obj = dn->dn_object;
+	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+	uint64_t idx = hv & h->hash_table_mask;
+	dmu_buf_impl_t *db;
+
+	mutex_enter(DBUF_HASH_MUTEX(h, idx));
+	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
+		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
+			mutex_enter(&db->db_mtx);
+			if (!refcount_is_zero(&db->db_holds)) {
+				mutex_exit(DBUF_HASH_MUTEX(h, idx));
+				return (db);
+			}
+			mutex_exit(&db->db_mtx);
+		}
+	}
+	mutex_exit(DBUF_HASH_MUTEX(h, idx));
+	return (NULL);
+}
+
+/*
+ * Insert an entry into the hash table.  If there is already an element
+ * equal to elem in the hash table, then the already existing element
+ * will be returned and the new element will not be inserted.
+ * Otherwise returns NULL.
+ *
+ * In either case the returned/inserted dbuf's db_mtx is HELD on return:
+ * the existing element's on a collision, db's on a successful insert.
+ */
+static dmu_buf_impl_t *
+dbuf_hash_insert(dmu_buf_impl_t *db)
+{
+	dbuf_hash_table_t *h = &dbuf_hash_table;
+	objset_impl_t *os = db->db_objset;
+	uint64_t obj = db->db.db_object;
+	int level = db->db_level;
+	uint64_t blkid = db->db_blkid;
+	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+	uint64_t idx = hv & h->hash_table_mask;
+	dmu_buf_impl_t *dbf;
+
+	mutex_enter(DBUF_HASH_MUTEX(h, idx));
+	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
+		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
+			mutex_enter(&dbf->db_mtx);
+			/* zero-hold entries are dying; ignore them */
+			if (!refcount_is_zero(&dbf->db_holds)) {
+				mutex_exit(DBUF_HASH_MUTEX(h, idx));
+				return (dbf);
+			}
+			mutex_exit(&dbf->db_mtx);
+		}
+	}
+
+	mutex_enter(&db->db_mtx);
+	db->db_hash_next = h->hash_table[idx];
+	h->hash_table[idx] = db;
+	mutex_exit(DBUF_HASH_MUTEX(h, idx));
+	atomic_add_64(&dbuf_hash_count, 1);
+
+	return (NULL);
+}
+
+/*
+ * Remove an entry from the hash table.  This operation will
+ * fail if there are any existing holds on the db.
+ */
+static void
+dbuf_hash_remove(dmu_buf_impl_t *db)
+{
+	dbuf_hash_table_t *h = &dbuf_hash_table;
+	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
+	    db->db_level, db->db_blkid);
+	uint64_t idx = hv & h->hash_table_mask;
+	dmu_buf_impl_t *dbf, **dbp;
+
+	/*
+	 * We mustn't hold db_mtx, to maintain lock ordering:
+	 * DBUF_HASH_MUTEX > db_mtx.
+	 */
+	ASSERT(refcount_is_zero(&db->db_holds));
+	ASSERT(db->db_dnode != NULL);
+	ASSERT(!MUTEX_HELD(&db->db_mtx));
+
+	/* unlink db from its chain; it must be present */
+	mutex_enter(DBUF_HASH_MUTEX(h, idx));
+	dbp = &h->hash_table[idx];
+	while ((dbf = *dbp) != db) {
+		dbp = &dbf->db_hash_next;
+		ASSERT(dbf != NULL);
+	}
+	*dbp = db->db_hash_next;
+	db->db_hash_next = NULL;
+	mutex_exit(DBUF_HASH_MUTEX(h, idx));
+	atomic_add_64(&dbuf_hash_count, -1);
+}
+
+static int dbuf_evictable(dmu_buf_impl_t *db);
+static void dbuf_clear(dmu_buf_impl_t *db);
+
+/*
+ * Evict a dbuf that the caller has verified holds no references.
+ * Caller must hold db_mtx; the dbuf is clear()ed and then destroyed.
+ */
+void
+dbuf_evict(dmu_buf_impl_t *db)
+{
+	int err;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	err = dbuf_evictable(db);
+	ASSERT(err == TRUE);
+	dbuf_clear(db);
+	dbuf_destroy(db);
+}
+
+/*
+ * Invoke the user's eviction callback on a level-0 dbuf, first giving
+ * the user's data pointer a final consistent view of db_data, then
+ * clear the callback state so the callback fires at most once.
+ */
+static void
+dbuf_evict_user(dmu_buf_impl_t *db)
+{
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+
+	/* only level-0 buffers can carry user eviction state */
+	if (db->db_level != 0 || db->db_d.db_evict_func == NULL)
+		return;
+
+	if (db->db_d.db_user_data_ptr_ptr)
+		*db->db_d.db_user_data_ptr_ptr = db->db.db_data;
+	db->db_d.db_evict_func(&db->db, db->db_d.db_user_ptr);
+	db->db_d.db_user_ptr = NULL;
+	db->db_d.db_user_data_ptr_ptr = NULL;
+	db->db_d.db_evict_func = NULL;
+}
+
+/*
+ * Set up the global dbuf state: the hash table, the dmu_buf_impl_t kmem
+ * cache, the dbuf taskq, and the hash chain mutexes.
+ */
+void
+dbuf_init(void)
+{
+	uint64_t hsize = 1;
+	dbuf_hash_table_t *h = &dbuf_hash_table;
+	int i;
+
+	/*
+	 * The hash table is big enough to fill all of physical memory
+	 * with an average 64k block size.  The table will take up
+	 * totalmem*sizeof(void*)/64k bytes (i.e. 128KB/GB with 8-byte
+	 * pointers).
+	 */
+	while (hsize * 65536 < physmem * PAGESIZE)
+		hsize <<= 1;
+
+	/* hsize is a power of 2, so (hsize - 1) is a valid index mask */
+	h->hash_table_mask = hsize - 1;
+	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
+
+	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
+	    sizeof (dmu_buf_impl_t),
+	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
+	dbuf_tq = taskq_create("dbuf_tq", 8, maxclsyspri, 50, INT_MAX,
+	    TASKQ_PREPOPULATE);
+
+	for (i = 0; i < DBUF_MUTEXES; i++)
+		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
+}
+
+/*
+ * Tear down the global dbuf state created by dbuf_init().
+ */
+void
+dbuf_fini(void)
+{
+	dbuf_hash_table_t *h = &dbuf_hash_table;
+	int i;
+
+	taskq_destroy(dbuf_tq);
+	dbuf_tq = NULL;
+
+	for (i = 0; i < DBUF_MUTEXES; i++)
+		mutex_destroy(&h->hash_mutexes[i]);
+	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
+	kmem_cache_destroy(dbuf_cache);
+}
+
+/*
+ * Other stuff.
+ */
+
+/*
+ * Debug-build consistency checks on a dbuf, gated by the
+ * ZFS_DEBUG_DBUF_VERIFY flag.  Caller must hold db_mtx.  A compiled-out
+ * no-op unless ZFS_DEBUG is defined.
+ */
+static void
+dbuf_verify(dmu_buf_impl_t *db)
+{
+#ifdef ZFS_DEBUG
+	int i;
+	dnode_t *dn = db->db_dnode;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+
+	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
+		return;
+
+	ASSERT(db->db_objset != NULL);
+	if (dn == NULL) {
+		/* detached from its dnode: no parent or blkptr either */
+		ASSERT(db->db_parent == NULL);
+		ASSERT(db->db_blkptr == NULL);
+	} else {
+		ASSERT3U(db->db.db_object, ==, dn->dn_object);
+		ASSERT3P(db->db_objset, ==, dn->dn_objset);
+		ASSERT(list_head(&dn->dn_dbufs));
+		ASSERT3U(db->db_level, <, dn->dn_nlevels);
+	}
+	if (db->db_blkid == DB_BONUS_BLKID) {
+		ASSERT(dn != NULL);
+		ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
+		ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
+	} else {
+		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
+	}
+
+	if (db->db_level == 0) {
+		void **udpp = db->db_d.db_user_data_ptr_ptr;
+		/* we can be momentarily larger in dnode_set_blksz() */
+		if (db->db_blkid != DB_BONUS_BLKID && dn) {
+			ASSERT3U(db->db.db_size, >=, dn->dn_datablksz);
+		}
+		/* the user's data pointer must track db_data while held */
+		if (udpp) {
+			ASSERT((refcount_is_zero(&db->db_holds) &&
+			    *udpp == NULL) ||
+			    (!refcount_is_zero(&db->db_holds) &&
+			    *udpp == db->db.db_data));
+		}
+
+		if (IS_DNODE_DNODE(db->db.db_object)) {
+			for (i = 0; i < TXG_SIZE; i++) {
+				/*
+				 * it should only be modified in syncing
+				 * context, so make sure we only have
+				 * one copy of the data.
+				 */
+				ASSERT(db->db_d.db_data_old[i] == NULL ||
+				    db->db_d.db_data_old[i] == db->db_buf);
+			}
+		}
+	}
+
+	/* verify db->db_blkptr */
+	if (db->db_blkptr) {
+		if (db->db_parent == dn->dn_dbuf) {
+			/* db is pointed to by the dnode */
+			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
+			if (IS_DNODE_DNODE(db->db.db_object))
+				ASSERT(db->db_parent == NULL);
+			else
+				ASSERT(db->db_parent != NULL);
+			ASSERT3P(db->db_blkptr, ==,
+			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
+		} else {
+			/* db is pointed to by an indirect block */
+			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
+			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
+			ASSERT3U(db->db_parent->db.db_object, ==,
+			    db->db.db_object);
+			/*
+			 * dnode_grow_indblksz() can make this fail if we don't
+			 * have the struct_rwlock.  XXX indblksz no longer
+			 * grows.  safe to do this now?
+			 */
+			if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
+				ASSERT3P(db->db_blkptr, ==,
+				    ((blkptr_t *)db->db_parent->db.db_data +
+				    db->db_blkid % epb));
+			}
+		}
+	}
+	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
+	    db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
+	    db->db_state != DB_FILL && !dn->dn_free_txg) {
+		/*
+		 * If the blkptr isn't set but they have nonzero data,
+		 * it had better be dirty, otherwise we'll lose that
+		 * data when we evict this buffer.
+		 */
+		if (db->db_dirtycnt == 0) {
+			uint64_t *buf = db->db.db_data;
+			int i;
+
+			for (i = 0; i < db->db.db_size >> 3; i++) {
+				ASSERT(buf[i] == 0);
+			}
+		}
+	}
+#endif
+}
+
+/*
+ * Keep the user's data pointer (if registered) in sync with db_data.
+ * Caller must hold db_mtx.
+ */
+static void
+dbuf_update_data(dmu_buf_impl_t *db)
+{
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	if (db->db_level == 0 && db->db_d.db_user_data_ptr_ptr) {
+		ASSERT(!refcount_is_zero(&db->db_holds));
+		*db->db_d.db_user_data_ptr_ptr = db->db.db_data;
+	}
+}
+
+/*
+ * Attach an arc buffer to the dbuf as its current data and propagate
+ * the new data pointer to the user.  Caller must hold db_mtx.
+ */
+static void
+dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
+{
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	ASSERT(buf->b_data != NULL);
+	db->db_buf = buf;
+	db->db.db_data = buf->b_data;
+	dbuf_update_data(db);
+}
+
+/*
+ * Return the block number containing byte `offset` of the dnode's data.
+ * A zero datablkshift means the object has a single (possibly odd-sized)
+ * block, so any in-range offset maps to block 0.
+ */
+uint64_t
+dbuf_whichblock(dnode_t *dn, uint64_t offset)
+{
+	if (dn->dn_datablkshift == 0) {
+		ASSERT3U(offset, <, dn->dn_datablksz);
+		return (0);
+	}
+	return (offset >> dn->dn_datablkshift);
+}
+
+/*
+ * arc_read() completion callback: on success attach the arc buffer and
+ * mark the dbuf cached; on I/O error free the buffer and return to
+ * DB_UNCACHED; if the block was freed while the read was in flight,
+ * substitute zeroed contents.  Always wakes any waiters on db_changed.
+ */
+static void
+dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+	dmu_buf_impl_t *db = vdb;
+
+	mutex_enter(&db->db_mtx);
+	ASSERT3U(db->db_state, ==, DB_READ);
+	/*
+	 * All reads are synchronous, so we must have a hold on the dbuf
+	 */
+	ASSERT(refcount_count(&db->db_holds) > 0);
+	ASSERT(db->db.db_data == NULL);
+	if (db->db_level == 0 && db->db_d.db_freed_in_flight) {
+		/* we were freed in flight; disregard any error */
+		arc_release(buf, db);
+		bzero(buf->b_data, db->db.db_size);
+		db->db_d.db_freed_in_flight = FALSE;
+		dbuf_set_data(db, buf);
+		db->db_state = DB_CACHED;
+	} else if (zio == NULL || zio->io_error == 0) {
+		dbuf_set_data(db, buf);
+		db->db_state = DB_CACHED;
+	} else {
+		ASSERT(db->db_blkid != DB_BONUS_BLKID);
+		arc_buf_free(buf, db);
+		db->db_state = DB_UNCACHED;
+		ASSERT3P(db->db_buf, ==, NULL);
+	}
+	cv_broadcast(&db->db_changed);
+	mutex_exit(&db->db_mtx);
+}
+
+/*
+ * Start filling an uncached dbuf: satisfy bonus buffers from the dnode
+ * phys, holes and freed blocks with zeroed data, and everything else via
+ * an asynchronous arc_read() (completed by dbuf_read_done()).  The
+ * caller must hold dn_struct_rwlock and a hold on the dbuf.
+ */
+void
+dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
+{
+	arc_buf_t *buf;
+	blkptr_t *bp;
+
+	ASSERT(!refcount_is_zero(&db->db_holds));
+	/* We need the struct_rwlock to prevent db_blkptr from changing. */
+	ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
+
+	/*
+	 * prefetch only data blocks (level 0) -- don't prefetch indirect
+	 * blocks
+	 */
+	if ((db->db_level > 0) || (db->db_blkid == DB_BONUS_BLKID)) {
+		flags |= DB_RF_NOPREFETCH;
+	}
+
+	if (((flags & DB_RF_NOPREFETCH) == 0) && (db->db_dnode != NULL)) {
+		dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+		    db->db.db_size);
+	}
+
+	/* unlocked check is OK: a cached buffer stays cached while held */
+	if (db->db_state == DB_CACHED) {
+		ASSERT(db->db.db_data != NULL);
+		return;
+	}
+
+	mutex_enter(&db->db_mtx);
+
+	/* someone else already started (or finished) the read */
+	if (db->db_state != DB_UNCACHED) {
+		mutex_exit(&db->db_mtx);
+		return;
+	}
+
+	ASSERT3U(db->db_state, ==, DB_UNCACHED);
+
+	if (db->db_blkid == DB_BONUS_BLKID) {
+		/* bonus data is copied out of the dnode, not read from disk */
+		ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size);
+		buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+		    DN_MAX_BONUSLEN, db);
+		if (db->db.db_size < DN_MAX_BONUSLEN)
+			bzero(buf->b_data, DN_MAX_BONUSLEN);
+		bcopy(DN_BONUS(db->db_dnode->dn_phys), buf->b_data,
+		    db->db.db_size);
+		dbuf_set_data(db, buf);
+		db->db_state = DB_CACHED;
+		mutex_exit(&db->db_mtx);
+		return;
+	}
+
+	/* a block freed in this txg reads back as a hole */
+	if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid))
+		bp = NULL;
+	else
+		bp = db->db_blkptr;
+
+	if (bp == NULL)
+		dprintf_dbuf(db, "blkptr: %s\n", "NULL");
+	else
+		dprintf_dbuf_bp(db, bp, "%s", "blkptr:");
+
+	if (bp == NULL || BP_IS_HOLE(bp)) {
+		ASSERT(bp == NULL || BP_IS_HOLE(bp));
+		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+		    db->db.db_size, db));
+		bzero(db->db.db_data, db->db.db_size);
+		db->db_state = DB_CACHED;
+		mutex_exit(&db->db_mtx);
+		return;
+	}
+
+	db->db_state = DB_READ;
+	mutex_exit(&db->db_mtx);
+
+	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
+	(void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp,
+	    db->db_level > 0 ? byteswap_uint64_array :
+	    dmu_ot[db->db_dnode->dn_type].ot_byteswap,
+	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
+	    (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
+	    ARC_NOWAIT);
+}
+
+/*
+ * Common synchronous read path: issue a read if the buffer is uncached,
+ * then wait until it reaches DB_CACHED.  Returns the zio error when
+ * DB_RF_CANFAIL is set, 0 otherwise.
+ */
+static int
+dbuf_read_generic(dmu_buf_impl_t *db, uint32_t flags)
+{
+	zio_t *zio;
+	int err;
+
+	/*
+	 * We don't have to hold the mutex to check db_state because it
+	 * can't be freed while we have a hold on the buffer.
+	 */
+	ASSERT(!refcount_is_zero(&db->db_holds));
+	if (db->db_state == DB_CACHED)
+		return (0);
+
+	if (db->db_state == DB_UNCACHED) {
+		/* a root zio collects the error from the child arc_read */
+		zio = zio_root(db->db_dnode->dn_objset->os_spa, NULL, NULL,
+		    ZIO_FLAG_CANFAIL);
+		if ((flags & DB_RF_HAVESTRUCT) == 0)
+			rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+		dbuf_read_impl(db, zio, flags);
+		if ((flags & DB_RF_HAVESTRUCT) == 0)
+			rw_exit(&db->db_dnode->dn_struct_rwlock);
+		err = zio_wait(zio);
+		if (err)
+			return (err);
+	}
+
+	/* wait out any read or fill in progress on another thread */
+	mutex_enter(&db->db_mtx);
+	while (db->db_state == DB_READ || db->db_state == DB_FILL) {
+		ASSERT(db->db_state == DB_READ ||
+		    (flags & DB_RF_HAVESTRUCT) == 0);
+		cv_wait(&db->db_changed, &db->db_mtx);
+	}
+	ASSERT3U(db->db_state, ==, DB_CACHED);
+	mutex_exit(&db->db_mtx);
+
+	return (0);
+}
+
+#pragma weak dmu_buf_read = dbuf_read
+/*
+ * Read a dbuf that must succeed; I/O failure trips the assert.
+ */
+void
+dbuf_read(dmu_buf_impl_t *db)
+{
+	int err;
+
+	err = dbuf_read_generic(db, DB_RF_MUST_SUCCEED);
+	ASSERT(err == 0);
+}
+
+#pragma weak dmu_buf_read_canfail = dbuf_read_canfail
+/*
+ * Read a dbuf, returning any I/O error to the caller.
+ */
+int
+dbuf_read_canfail(dmu_buf_impl_t *db)
+{
+	return (dbuf_read_generic(db, DB_RF_CANFAIL));
+}
+
+/*
+ * Read a dbuf while the caller already holds dn_struct_rwlock; no
+ * prefetch is issued and the read must succeed.
+ */
+void
+dbuf_read_havestruct(dmu_buf_impl_t *db)
+{
+	int err;
+
+	ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
+	err = dbuf_read_generic(db, (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH));
+	ASSERT(err == 0);
+}
+
+/*
+ * Prepare a dbuf for being completely overwritten without reading its
+ * old contents from disk: wait out any in-flight read/fill, then if it
+ * is uncached allocate a fresh arc buffer and enter DB_FILL.
+ */
+static void
+dbuf_noread(dmu_buf_impl_t *db)
+{
+	ASSERT(!refcount_is_zero(&db->db_holds));
+	mutex_enter(&db->db_mtx);
+	while (db->db_state == DB_READ || db->db_state == DB_FILL)
+		cv_wait(&db->db_changed, &db->db_mtx);
+	if (db->db_state == DB_UNCACHED) {
+		/* bonus buffers are always allocated at full bonus size */
+		int blksz = (db->db_blkid == DB_BONUS_BLKID) ?
+		    DN_MAX_BONUSLEN : db->db.db_size;
+		ASSERT(db->db.db_data == NULL);
+		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+		    blksz, db));
+		db->db_state = DB_FILL;
+	} else {
+		ASSERT3U(db->db_state, ==, DB_CACHED);
+	}
+	mutex_exit(&db->db_mtx);
+}
+
+/*
+ * This is our just-in-time copy function.  It makes a copy of
+ * buffers, that have been modified in a previous transaction
+ * group, before we modify them in the current active group.
+ *
+ * This function is used in two places: when we are dirtying a
+ * buffer for the first time in a txg, and when we are freeing
+ * a range in a dnode that includes this buffer.
+ *
+ * Note that when we are called from dbuf_free_range() we do
+ * not put a hold on the buffer, we just traverse the active
+ * dbuf list for the dnode.
+ */
+static void
+dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
+{
+	arc_buf_t **quiescing, **syncing;
+	int size = (db->db_blkid == DB_BONUS_BLKID) ?
+	    DN_MAX_BONUSLEN : db->db.db_size;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	ASSERT(db->db.db_data != NULL);
+
+	/* txg-1 is quiescing and txg-2 is syncing relative to the open txg */
+	quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK];
+	syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK];
+
+	/*
+	 * If this buffer is referenced from the current quiescing
+	 * transaction group: either make a copy and reset the reference
+	 * to point to the copy, or (if there a no active holders) just
+	 * null out the current db_data pointer.
+	 */
+	if (*quiescing == db->db_buf) {
+		/*
+		 * If the quiescing txg is "dirty", then we better not
+		 * be referencing the same buffer from the syncing txg.
+		 */
+		ASSERT(*syncing != db->db_buf);
+		if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+			*quiescing = arc_buf_alloc(
+			    db->db_dnode->dn_objset->os_spa, size, db);
+			bcopy(db->db.db_data, (*quiescing)->b_data, size);
+		} else {
+			db->db.db_data = NULL;
+			db->db_buf = NULL;
+			db->db_state = DB_UNCACHED;
+		}
+		return;
+	}
+
+	/*
+	 * If this buffer is referenced from the current syncing
+	 * transaction group: either
+	 *	1 - make a copy and reset the reference, or
+	 *	2 - if there are no holders, just null the current db_data.
+	 */
+	if (*syncing == db->db_buf) {
+		ASSERT3P(*quiescing, ==, NULL);
+		ASSERT3U(db->db_dirtycnt, ==, 1);
+		if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+			/* we can't copy if we have already started a write */
+			ASSERT(*syncing != db->db_data_pending);
+			*syncing = arc_buf_alloc(
+			    db->db_dnode->dn_objset->os_spa, size, db);
+			bcopy(db->db.db_data, (*syncing)->b_data, size);
+		} else {
+			db->db.db_data = NULL;
+			db->db_buf = NULL;
+			db->db_state = DB_UNCACHED;
+		}
+	}
+}
+
+/*
+ * Undo a dmu_sync()-style block override for the given txg: either clear
+ * the in-progress marker (IN_DMU_SYNC), or free the already-written
+ * block, discard the override blkptr, and release the buffer so it will
+ * be written out through the normal path.  Caller must hold db_mtx.
+ */
+void
+dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg)
+{
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	if (db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC) {
+		db->db_d.db_overridden_by[txg&TXG_MASK] = NULL;
+	} else if (db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
+		/* free this block */
+		ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK]) ||
+		    db->db_dnode->dn_free_txg == txg);
+		if (!BP_IS_HOLE(db->db_d.db_overridden_by[txg&TXG_MASK])) {
+			/* XXX can get silent EIO here */
+			(void) arc_free(NULL, db->db_dnode->dn_objset->os_spa,
+			    txg, db->db_d.db_overridden_by[txg&TXG_MASK],
+			    NULL, NULL, ARC_WAIT);
+		}
+		kmem_free(db->db_d.db_overridden_by[txg&TXG_MASK],
+		    sizeof (blkptr_t));
+		db->db_d.db_overridden_by[txg&TXG_MASK] = NULL;
+		/* release the already-written buffer */
+		arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
+	}
+}
+
+/*
+ * Handle a free of blocks [blkid, blkid+nblks) for this dnode: for every
+ * cached level-0 dbuf in the range, undirty it, preserve any old-txg
+ * copies via dbuf_fix_old_data(), and zero the current contents.  Reads
+ * or fills in flight are flagged (db_freed_in_flight) and zeroed when
+ * they complete.
+ */
+void
+dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db, *db_next;
+	uint64_t txg = tx->tx_txg;
+
+	dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks);
+	mutex_enter(&dn->dn_dbufs_mtx);
+	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
+		/* grab next before we potentially change this entry's state */
+		db_next = list_next(&dn->dn_dbufs, db);
+		if ((db->db_level != 0) || (db->db_blkid == DB_BONUS_BLKID))
+			continue;
+		dprintf_dbuf(db, "found buf %s\n", "");
+		if (db->db_blkid < blkid ||
+		    db->db_blkid >= blkid+nblks)
+			continue;
+
+		/* found a level 0 buffer in the range */
+		if (dbuf_undirty(db, tx))
+			continue;
+
+		mutex_enter(&db->db_mtx);
+		if (db->db_state == DB_UNCACHED) {
+			ASSERT(db->db.db_data == NULL);
+			mutex_exit(&db->db_mtx);
+			continue;
+		}
+		if (db->db_state == DB_READ) {
+			/* this will be handled in dbuf_read_done() */
+			db->db_d.db_freed_in_flight = TRUE;
+			mutex_exit(&db->db_mtx);
+			continue;
+		}
+		if (db->db_state == DB_FILL) {
+			/* this will be handled in dbuf_rele() */
+			db->db_d.db_freed_in_flight = TRUE;
+			mutex_exit(&db->db_mtx);
+			continue;
+		}
+
+		/* make a copy of the data if necessary */
+		dbuf_fix_old_data(db, txg);
+
+		if (db->db.db_data) {
+			/* fill in with appropriate data */
+			arc_release(db->db_buf, db);
+			bzero(db->db.db_data, db->db.db_size);
+		}
+		mutex_exit(&db->db_mtx);
+	}
+	mutex_exit(&dn->dn_dbufs_mtx);
+}
+
+/*
+ * Return TRUE if writing this buffer will allocate a new block on disk,
+ * i.e. its current block (if any) is not freeable in this tx's context.
+ * Buffers belonging to the meta-objset (no dsl_dataset) return FALSE.
+ */
+static int
+dbuf_new_block(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
+	uint64_t birth_txg = 0;
+
+	/* Don't count meta-objects */
+	if (ds == NULL)
+		return (FALSE);
+
+	/*
+	 * We don't need any locking to protect db_blkptr:
+	 * If it's syncing, then db_dirtied will be set so we'll
+	 * ignore db_blkptr.
+	 */
+	ASSERT(MUTEX_HELD(&db->db_mtx)); /* XXX strictly necessary? */
+	/* If we have been dirtied since the last snapshot, its not new */
+	if (db->db_dirtied)
+		birth_txg = db->db_dirtied;
+	else if (db->db_blkptr)
+		birth_txg = db->db_blkptr->blk_birth;
+
+	if (birth_txg)
+		return (!dsl_dataset_block_freeable(ds, birth_txg, tx));
+	else
+		return (TRUE);
+}
+
+/*
+ * Grow the dbuf to `size` bytes (sizes only increase here): dirty it,
+ * allocate a larger arc buffer, copy the old contents, zero-fill the
+ * remainder, and charge the growth against the transaction.  Caller
+ * must hold dn_struct_rwlock as writer.
+ */
+void
+dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
+{
+	arc_buf_t *buf, *obuf;
+	int osize = db->db.db_size;
+
+	/* XXX does *this* func really need the lock? */
+	ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
+
+	ASSERT3U(osize, <=, size);
+	if (osize == size)
+		return;
+
+	/*
+	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
+	 * is OK, because there can be no other references to the db
+	 * when we are changing its size, so no concurrent DB_FILL can
+	 * be happening.
+	 */
+	/* Make a copy of the data if necessary */
+	dbuf_will_dirty(db, tx);
+
+	/* create the data buffer for the new block */
+	buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db);
+
+	/* copy old block data to the new block */
+	obuf = db->db_buf;
+	bcopy(obuf->b_data, buf->b_data, osize);
+	/* zero the remainder */
+	bzero((uint8_t *)buf->b_data + osize, size - osize);
+
+	mutex_enter(&db->db_mtx);
+	/* ASSERT3U(refcount_count(&db->db_holds), ==, 1); */
+	dbuf_set_data(db, buf);
+	arc_buf_free(obuf, db);
+	db->db.db_size = size;
+
+	/* fix up the dirty info */
+	if (db->db_level == 0)
+		db->db_d.db_data_old[tx->tx_txg&TXG_MASK] = buf;
+	mutex_exit(&db->db_mtx);
+
+	/* account for the additional space this buffer will consume */
+	dnode_willuse_space(db->db_dnode, size-osize, tx);
+}
+
+void
+dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ dnode_t *dn = db->db_dnode;
+ objset_impl_t *os = dn->dn_objset;
+ int drop_struct_lock = FALSE;
+ int txgoff = tx->tx_txg & TXG_MASK;
+
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(!refcount_is_zero(&db->db_holds));
+ dmu_tx_dirty_buf(tx, db);
+
+ /*
+ * Shouldn't dirty a regular buffer in syncing context. Private
+ * objects may be dirtied in syncing context, but only if they
+ * were already pre-dirtied in open context.
+ * XXX We may want to prohibit dirtying in syncing context even
+ * if they did pre-dirty.
+ */
+ ASSERT(!(dmu_tx_is_syncing(tx) &&
+ !BP_IS_HOLE(&dn->dn_objset->os_rootbp) &&
+ !(dn->dn_object & DMU_PRIVATE_OBJECT) &&
+ dn->dn_objset->os_dsl_dataset != NULL &&
+ !dsl_dir_is_private(
+ dn->dn_objset->os_dsl_dataset->ds_dir)));
+
+ /*
+ * We make this assert for private objects as well, but after we
+ * check if we're already dirty. They are allowed to re-dirty
+ * in syncing context.
+ */
+ ASSERT(dn->dn_object & DMU_PRIVATE_OBJECT ||
+ dn->dn_dirtyctx == DN_UNDIRTIED ||
+ dn->dn_dirtyctx ==
+ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
+
+ mutex_enter(&db->db_mtx);
+ /* XXX make this true for indirects too? */
+ ASSERT(db->db_level != 0 || db->db_state == DB_CACHED ||
+ db->db_state == DB_FILL);
+
+ /*
+ * If this buffer is currently part of an "overridden" region,
+ * we now need to remove it from that region.
+ */
+ if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
+ db->db_d.db_overridden_by[txgoff] != NULL) {
+ dbuf_unoverride(db, tx->tx_txg);
+ }
+
+ mutex_enter(&dn->dn_mtx);
+ /*
+ * Don't set dirtyctx to SYNC if we're just modifying this as we
+ * initialize the objset.
+ */
+ if (dn->dn_dirtyctx == DN_UNDIRTIED &&
+ !BP_IS_HOLE(&dn->dn_objset->os_rootbp)) {
+ dn->dn_dirtyctx =
+ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
+ ASSERT(dn->dn_dirtyctx_firstset == NULL);
+ dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
+ }
+ mutex_exit(&dn->dn_mtx);
+
+ /*
+ * If this buffer is already dirty, we're done.
+ */
+ if (list_link_active(&db->db_dirty_node[txgoff])) {
+ mutex_exit(&db->db_mtx);
+ return;
+ }
+
+ /*
+ * Only valid if not already dirty.
+ */
+ ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
+ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
+
+ ASSERT3U(dn->dn_nlevels, >, db->db_level);
+ ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
+ dn->dn_phys->dn_nlevels > db->db_level ||
+ dn->dn_next_nlevels[txgoff] > db->db_level ||
+ dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
+ dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
+
+ /*
+ * We should only be dirtying in syncing context if it's the
+ * mos, a spa os, or we're initializing the os. However, we are
+ * allowed to dirty in syncing context provided we already
+ * dirtied it in open context. Hence we must make this
+ * assertion only if we're not already dirty.
+ */
+ ASSERT(!dmu_tx_is_syncing(tx) ||
+ os->os_dsl_dataset == NULL ||
+ !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) ||
+ !BP_IS_HOLE(&os->os_rootbp));
+ ASSERT(db->db.db_size != 0);
+
+ dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+
+ if (db->db_level == 0) {
+ /*
+ * Release the data buffer from the cache so that we
+ * can modify it without impacting possible other users
+ * of this cached data block. Note that indirect blocks
+ * and private objects are not released until the syncing
+ * state (since they are only modified then).
+ *
+ * If this buffer is dirty in an old transaction group we need
+ * to make a copy of it so that the changes we make in this
+ * transaction group won't leak out when we sync the older txg.
+ */
+ ASSERT(db->db_buf != NULL);
+ ASSERT(db->db.db_data != NULL);
+ ASSERT(db->db_d.db_data_old[txgoff] == NULL);
+ if (!(db->db.db_object & DMU_PRIVATE_OBJECT)) {
+ arc_release(db->db_buf, db);
+ dbuf_fix_old_data(db, tx->tx_txg);
+ ASSERT(db->db_buf != NULL);
+ }
+ db->db_d.db_data_old[txgoff] = db->db_buf;
+ }
+
+ mutex_enter(&dn->dn_mtx);
+ /*
+ * We could have been freed_in_flight between the dbuf_noread
+ * and dbuf_dirty. We win, as though the dbuf_noread() had
+ * happened after the free.
+ */
+ if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
+ dnode_clear_range(dn, db->db_blkid, 1, tx);
+ db->db_d.db_freed_in_flight = FALSE;
+ }
+
+ db->db_dirtied = tx->tx_txg;
+ list_insert_tail(&dn->dn_dirty_dbufs[txgoff], db);
+ mutex_exit(&dn->dn_mtx);
+
+ /*
+ * If writting this buffer will consume a new block on disk,
+ * then update the accounting.
+ */
+ if (db->db_blkid != DB_BONUS_BLKID) {
+ if (!dbuf_new_block(db, tx) && db->db_blkptr) {
+ /*
+ * This is only a guess -- if the dbuf is dirty
+ * in a previous txg, we don't know how much
+ * space it will use on disk yet. We should
+ * really have the struct_rwlock to access
+ * db_blkptr, but since this is just a guess,
+ * it's OK if we get an odd answer.
+ */
+ dnode_willuse_space(dn,
+ -BP_GET_ASIZE(db->db_blkptr), tx);
+ }
+ dnode_willuse_space(dn, db->db.db_size, tx);
+ }
+
+ /*
+ * This buffer is now part of this txg
+ */
+ dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
+ db->db_dirtycnt += 1;
+ ASSERT3U(db->db_dirtycnt, <=, 3);
+
+ mutex_exit(&db->db_mtx);
+
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ dnode_setdirty(dn, tx);
+ return;
+ }
+
+ if (db->db_level == 0)
+ dnode_new_blkid(dn, db->db_blkid, tx);
+
+ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ drop_struct_lock = TRUE;
+ }
+
+ if (db->db_level < dn->dn_nlevels-1) {
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ dmu_buf_impl_t *parent;
+ parent = dbuf_hold_level(dn, db->db_level+1,
+ db->db_blkid >> epbs, FTAG);
+ if (drop_struct_lock)
+ rw_exit(&dn->dn_struct_rwlock);
+ dbuf_dirty(parent, tx);
+ dbuf_remove_ref(parent, FTAG);
+ } else {
+ if (drop_struct_lock)
+ rw_exit(&dn->dn_struct_rwlock);
+ }
+
+ dnode_setdirty(dn, tx);
+}
+
+/*
+ * Remove this dbuf's dirty record for the given open transaction, if it
+ * is safe to do so (i.e. no one else currently holds the dbuf).  Returns
+ * 1 if dropping the dirty hold evicted the dbuf, 0 otherwise.
+ */
+static int
+dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	dnode_t *dn = db->db_dnode;
+	int txgoff = tx->tx_txg & TXG_MASK;
+
+	ASSERT(tx->tx_txg != 0);
+
+	mutex_enter(&db->db_mtx);
+
+	/*
+	 * If this buffer is not dirty, we're done.
+	 */
+	if (!list_link_active(&db->db_dirty_node[txgoff])) {
+		mutex_exit(&db->db_mtx);
+		return (0);
+	}
+
+	/*
+	 * If this buffer is currently held, we cannot undirty
+	 * it, since one of the current holders may be in the
+	 * middle of an update.  Note that users of dbuf_undirty()
+	 * should not place a hold on the dbuf before the call.
+	 * XXX - this check assumes we are being called from
+	 * dbuf_free_range(), perhaps we should move it there?
+	 */
+	if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+		mutex_exit(&db->db_mtx);
+		/* Record the freed range so the free happens in syncing. */
+		mutex_enter(&dn->dn_mtx);
+		dnode_clear_range(dn, db->db_blkid, 1, tx);
+		mutex_exit(&dn->dn_mtx);
+		return (0);
+	}
+
+	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+
+	dbuf_unoverride(db, tx->tx_txg);
+
+	ASSERT(db->db.db_size != 0);
+	if (db->db_level == 0) {
+		ASSERT(db->db_buf != NULL);
+		ASSERT(db->db_d.db_data_old[txgoff] != NULL);
+		/* Free the private copy made when the dbuf was dirtied. */
+		if (db->db_d.db_data_old[txgoff] != db->db_buf)
+			arc_buf_free(db->db_d.db_data_old[txgoff], db);
+		db->db_d.db_data_old[txgoff] = NULL;
+	}
+
+	/* XXX would be nice to fix up dn_towrite_space[] */
+	/* XXX undo db_dirtied? but how? */
+	/* db->db_dirtied = tx->tx_txg; */
+
+	mutex_enter(&dn->dn_mtx);
+	list_remove(&dn->dn_dirty_dbufs[txgoff], db);
+	mutex_exit(&dn->dn_mtx);
+
+	ASSERT(db->db_dirtycnt > 0);
+	db->db_dirtycnt -= 1;
+
+	/* Drop the hold that dbuf_dirty() placed for this txg. */
+	if (refcount_remove(&db->db_holds,
+	    (void *)(uintptr_t)tx->tx_txg) == 0) {
+		/* make dbuf_verify() happy */
+		if (db->db.db_data)
+			bzero(db->db.db_data, db->db.db_size);
+
+		/* dbuf_evict() consumes (and exits) db_mtx for us. */
+		dbuf_evict(db);
+		return (1);
+	}
+
+	mutex_exit(&db->db_mtx);
+	return (0);
+}
+
+#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
+/*
+ * Announce an intent to modify this dbuf: read its current contents
+ * in (the read must succeed) and mark it dirty in the given tx.
+ */
+void
+dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	int rf = DB_RF_MUST_SUCCEED;
+
+	ASSERT(tx->tx_txg != 0);
+	ASSERT(!refcount_is_zero(&db->db_holds));
+
+	/* Tell the read path if we already hold dn_struct_rwlock as writer. */
+	if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
+		rf |= DB_RF_HAVESTRUCT;
+	(void) dbuf_read_generic(db, rf);
+	dbuf_dirty(db, tx);
+}
+
+#pragma weak dmu_buf_will_fill = dbuf_will_fill
+/*
+ * Announce an intent to completely overwrite this level-0 dbuf:
+ * skip reading the old contents (dbuf_noread) and mark it dirty.
+ * The caller must finish with dbuf_fill_done().
+ */
+void
+dbuf_will_fill(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	ASSERT(tx->tx_txg != 0);
+	ASSERT(db->db_level == 0);
+	ASSERT(!refcount_is_zero(&db->db_holds));
+
+	/* Private objects may only be modified from syncing context. */
+	ASSERT(!(db->db.db_object & DMU_PRIVATE_OBJECT) ||
+	    dmu_tx_private_ok(tx));
+
+	dbuf_noread(db);
+	dbuf_dirty(db, tx);
+}
+
+#pragma weak dmu_buf_fill_done = dbuf_fill_done
+/*
+ * Complete a fill started by dbuf_will_fill(): move the dbuf from
+ * DB_FILL to DB_CACHED and wake any waiters.  If the block was freed
+ * while we were filling it, zero the contents instead.
+ */
+/* ARGSUSED */
+void
+dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	mutex_enter(&db->db_mtx);
+	dbuf_verify(db);
+
+	if (db->db_state == DB_FILL) {
+		if (db->db_level == 0 && db->db_d.db_freed_in_flight) {
+			/* we were freed while filling */
+			/* XXX dbuf_undirty? */
+			bzero(db->db.db_data, db->db.db_size);
+			db->db_d.db_freed_in_flight = FALSE;
+		}
+		db->db_state = DB_CACHED;
+		cv_broadcast(&db->db_changed);
+	}
+	mutex_exit(&db->db_mtx);
+}
+
+
+/*
+ * Tear down an unreferenced dbuf: free its cached data, detach it from
+ * its parent, the dnode's dbuf list, and the hash table.  Caller holds
+ * dn_dbufs_mtx and db_mtx; db_mtx is dropped here.
+ */
+static void
+dbuf_clear(dmu_buf_impl_t *db)
+{
+	dnode_t *dn = db->db_dnode;
+
+	ASSERT(MUTEX_HELD(&dn->dn_dbufs_mtx));
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	ASSERT(refcount_is_zero(&db->db_holds));
+
+	if (db->db_state == DB_CACHED) {
+		ASSERT(db->db_buf != NULL);
+		arc_buf_free(db->db_buf, db);
+		db->db.db_data = NULL;
+		db->db_buf = NULL;
+		db->db_state = DB_UNCACHED;
+	}
+
+	ASSERT3U(db->db_state, ==, DB_UNCACHED);
+	ASSERT(db->db_buf == NULL);
+	ASSERT(db->db_data_pending == NULL);
+
+	mutex_exit(&db->db_mtx);
+
+	/*
+	 * If this dbuf is referenced from an indirect dbuf,
+	 * decrement the ref count on the indirect dbuf.
+	 */
+	if (db->db_parent && db->db_parent != dn->dn_dbuf)
+		dbuf_remove_ref(db->db_parent, db);
+
+	/* remove from dn_dbufs */
+	list_remove(&dn->dn_dbufs, db);
+
+	/* Drop the dnode hold taken in dbuf_create(). */
+	dnode_rele(dn, db);
+
+	dbuf_hash_remove(db);
+
+	db->db_dnode = NULL;
+	db->db_parent = NULL;
+	db->db_blkptr = NULL;
+}
+
+/*
+ * Locate the parent dbuf and block pointer for (level, blkid) of dnode dn.
+ * On success *parentp is the parent dbuf (held; NULL if the parent is the
+ * dnode itself or this is the bonus buffer) and *bpp points at the blkptr.
+ * Returns ENOENT if the block has no parent yet (a hole beyond maxblkid).
+ * Caller holds dn_struct_rwlock.
+ */
+static int
+dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
+    dmu_buf_impl_t **parentp, blkptr_t **bpp)
+{
+	int nlevels, epbs;
+
+	/* Treat an unallocated dnode as having a single level. */
+	if (dn->dn_phys->dn_nlevels == 0)
+		nlevels = 1;
+	else
+		nlevels = dn->dn_phys->dn_nlevels;
+
+	/* epbs: log2 of the number of blkptrs per indirect block. */
+	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+	ASSERT3U(level * epbs, <, 64);
+	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+	if (blkid == DB_BONUS_BLKID) {
+		/* this is the bonus buffer */
+		*parentp = NULL;
+		*bpp = NULL;
+		return (0);
+	} else if (level >= nlevels ||
+	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
+		/* the buffer has no parent yet */
+		*parentp = NULL;
+		*bpp = NULL;
+		return (ENOENT);
+	} else if (level < nlevels-1) {
+		/* this block is referenced from an indirect block */
+		int err = dbuf_hold_impl(dn, level+1,
+		    blkid >> epbs, fail_sparse, NULL, parentp);
+		if (err)
+			return (err);
+		dbuf_read_havestruct(*parentp);
+		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
+		    (blkid & ((1ULL << epbs) - 1));
+		return (0);
+	} else {
+		/* the block is referenced from the dnode */
+		ASSERT3U(level, ==, nlevels-1);
+		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
+		    blkid < dn->dn_phys->dn_nblkptr);
+		*parentp = dn->dn_dbuf;
+		*bpp = &dn->dn_phys->dn_blkptr[blkid];
+		return (0);
+	}
+}
+
+/*
+ * Allocate and initialize a new dbuf for (level, blkid) of dnode dn,
+ * insert it into the dbuf hash table and the dnode's dbuf list, and
+ * return it with db_mtx held (via dbuf_hash_insert).  If another thread
+ * raced us and inserted one first, free ours and return theirs.
+ */
+static dmu_buf_impl_t *
+dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
+    dmu_buf_impl_t *parent, blkptr_t *blkptr)
+{
+	objset_impl_t *os = dn->dn_objset;
+	dmu_buf_impl_t *db, *odb;
+
+	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+	ASSERT(dn->dn_type != DMU_OT_NONE);
+
+	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
+
+	db->db_objset = os;
+	db->db.db_object = dn->dn_object;
+	db->db_level = level;
+	db->db_blkid = blkid;
+	db->db_state = DB_UNCACHED;
+
+	if (db->db_blkid == DB_BONUS_BLKID) {
+		db->db.db_size = dn->dn_bonuslen;
+		db->db.db_offset = DB_BONUS_BLKID;
+	} else {
+		/* Indirects are a full indirect block; level 0 uses datablksz. */
+		int blocksize =
+		    db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
+		db->db.db_size = blocksize;
+		db->db.db_offset = db->db_blkid * blocksize;
+	}
+
+	db->db_dirtied = 0;
+	db->db_dirtycnt = 0;
+
+	bzero(&db->db_d, sizeof (db->db_d));
+
+	/*
+	 * Hold the dn_dbufs_mtx while we get the new dbuf
+	 * in the hash table *and* added to the dbufs list.
+	 * This prevents a possible deadlock with someone
+	 * trying to look up this dbuf before it's added to the
+	 * dn_dbufs list.
+	 */
+	mutex_enter(&dn->dn_dbufs_mtx);
+	if ((odb = dbuf_hash_insert(db)) != NULL) {
+		/* someone else inserted it first */
+		kmem_cache_free(dbuf_cache, db);
+		mutex_exit(&dn->dn_dbufs_mtx);
+		return (odb);
+	}
+	list_insert_head(&dn->dn_dbufs, db);
+	mutex_exit(&dn->dn_dbufs_mtx);
+
+	/* Hold the parent indirect (the dnode's own dbuf needs no hold). */
+	if (parent && parent != dn->dn_dbuf)
+		dbuf_add_ref(parent, db);
+
+	(void) refcount_add(&dn->dn_holds, db);
+
+	db->db_dnode = dn;
+	db->db_parent = parent;
+	db->db_blkptr = blkptr;
+
+	dprintf_dbuf(db, "db=%p\n", db);
+
+	return (db);
+}
+
+/*
+ * Decide whether this dbuf may be evicted: it must be UNCACHED or
+ * CACHED and have no holds.  If evictable, fire the user eviction
+ * callback as a side effect.  Caller holds db_mtx.
+ */
+static int
+dbuf_evictable(dmu_buf_impl_t *db)
+{
+	int i;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	dbuf_verify(db);
+
+	if (db->db_state != DB_UNCACHED && db->db_state != DB_CACHED)
+		return (FALSE);
+
+	if (!refcount_is_zero(&db->db_holds))
+		return (FALSE);
+
+#ifdef ZFS_DEBUG
+	/* An evictable dbuf must not be dirty in any txg. */
+	for (i = 0; i < TXG_SIZE; i++) {
+		ASSERT(!list_link_active(&db->db_dirty_node[i]));
+		ASSERT(db->db_level != 0 || db->db_d.db_data_old[i] == NULL);
+	}
+#endif
+
+	/*
+	 * Now we know we want to free it.
+	 * This call must be done last, since it has side effects -
+	 * calling the db_evict_func().
+	 */
+	dbuf_evict_user(db);
+	return (TRUE);
+}
+
+/*
+ * Return a fully torn-down dbuf (see dbuf_clear()) to the kmem cache.
+ * All links and data must already have been cleared.
+ */
+static void
+dbuf_destroy(dmu_buf_impl_t *db)
+{
+	ASSERT(refcount_is_zero(&db->db_holds));
+
+	ASSERT(db->db.db_data == NULL);
+	ASSERT(db->db_dnode == NULL);
+	ASSERT(db->db_parent == NULL);
+	ASSERT(db->db_hash_next == NULL);
+	ASSERT(db->db_blkptr == NULL);
+	ASSERT(db->db_data_pending == NULL);
+
+	kmem_cache_free(dbuf_cache, db);
+}
+
+/*
+ * Issue a speculative (async, may-fail) ARC read for level-0 block
+ * blkid of dnode dn, unless it is already cached or has been freed.
+ * Caller holds dn_struct_rwlock.
+ */
+void
+dbuf_prefetch(dnode_t *dn, uint64_t blkid)
+{
+	dmu_buf_impl_t *db, *parent = NULL;
+	blkptr_t *bp = NULL;
+
+	ASSERT(blkid != DB_BONUS_BLKID);
+	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+
+	/* No point prefetching a block that is pending free. */
+	if (dnode_block_freed(dn, blkid))
+		return;
+
+	/* dbuf_find() returns with db_mtx held */
+	if (db = dbuf_find(dn, 0, blkid)) {
+		/*
+		 * This dbuf is already in the cache.  We assume that
+		 * it is already CACHED, or else about to be either
+		 * read or filled.
+		 */
+		mutex_exit(&db->db_mtx);
+		return;
+	}
+
+	if (dbuf_findbp(dn, 0, blkid, TRUE, &parent, &bp) == 0) {
+		if (bp && !BP_IS_HOLE(bp)) {
+			/* Fire-and-forget read; no callback, no waiting. */
+			(void) arc_read(NULL, dn->dn_objset->os_spa, bp,
+			    dmu_ot[dn->dn_type].ot_byteswap,
+			    NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+			    (ARC_NOWAIT | ARC_PREFETCH));
+		}
+		/* Drop the parent hold taken by dbuf_findbp(). */
+		if (parent && parent != dn->dn_dbuf)
+			dbuf_rele(parent);
+	}
+}
+
+/*
+ * Look up (or create) and hold the dbuf for (level, blkid) of dnode dn,
+ * returning it in *dbp.  If fail_sparse, return ENOENT when the block
+ * is a hole.  Returns with db_holds incremented, and db_mtx not held.
+ * Note: dn_struct_rwlock must be held.
+ */
+int
+dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+    void *tag, dmu_buf_impl_t **dbp)
+{
+	dmu_buf_impl_t *db, *parent = NULL;
+
+	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+	ASSERT3U(dn->dn_nlevels, >, level);
+
+	*dbp = NULL;
+
+	/* dbuf_find() returns with db_mtx held */
+	db = dbuf_find(dn, level, blkid);
+
+	if (db == NULL) {
+		blkptr_t *bp = NULL;
+		int err;
+
+		/* Not cached: locate the parent/blkptr and create the dbuf. */
+		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
+		if (fail_sparse) {
+			if (err == 0 && bp && BP_IS_HOLE(bp))
+				err = ENOENT;
+			if (err) {
+				if (parent && parent != dn->dn_dbuf)
+					dbuf_rele(parent);
+				return (err);
+			}
+		}
+		db = dbuf_create(dn, level, blkid, parent, bp);
+	}
+
+	/*
+	 * If this buffer is currently syncing out, and we are
+	 * still referencing it from db_data, we need to make
+	 * a copy of it in case we decide we want to dirty it
+	 * again in this txg.
+	 */
+	if (db->db_level == 0 && db->db_state == DB_CACHED &&
+	    !(dn->dn_object & DMU_PRIVATE_OBJECT) &&
+	    db->db_data_pending == db->db_buf) {
+		int size = (db->db_blkid == DB_BONUS_BLKID) ?
+		    DN_MAX_BONUSLEN : db->db.db_size;
+
+		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+		    size, db));
+		bcopy(db->db_data_pending->b_data, db->db.db_data,
+		    db->db.db_size);
+	}
+
+	dbuf_add_ref(db, tag);
+	dbuf_update_data(db);
+	dbuf_verify(db);
+	mutex_exit(&db->db_mtx);
+
+	/* NOTE: we can't rele the parent until after we drop the db_mtx */
+	if (parent && parent != dn->dn_dbuf)
+		dbuf_rele(parent);
+
+	ASSERT3P(db->db_dnode, ==, dn);
+	ASSERT3U(db->db_blkid, ==, blkid);
+	ASSERT3U(db->db_level, ==, level);
+	*dbp = db;
+
+	return (0);
+}
+
+/*
+ * Convenience wrapper: hold the level-0 dbuf for blkid with no tag,
+ * not failing on sparse blocks.  Caller holds dn_struct_rwlock.
+ */
+dmu_buf_impl_t *
+dbuf_hold(dnode_t *dn, uint64_t blkid)
+{
+	dmu_buf_impl_t *db;
+	(void) dbuf_hold_impl(dn, 0, blkid, FALSE, NULL, &db);
+	return (db);
+}
+
+/*
+ * Convenience wrapper: hold the dbuf at the given indirection level,
+ * tagged with the caller's tag.  Caller holds dn_struct_rwlock.
+ */
+dmu_buf_impl_t *
+dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
+{
+	dmu_buf_impl_t *db;
+	(void) dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
+	return (db);
+}
+
+/*
+ * Hold the dnode's bonus buffer dbuf, acquiring dn_struct_rwlock
+ * as reader for the duration of the lookup.
+ */
+dmu_buf_impl_t *
+dbuf_hold_bonus(dnode_t *dn, void *tag)
+{
+	dmu_buf_impl_t *db;
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	(void) dbuf_hold_impl(dn, 0, DB_BONUS_BLKID, FALSE, tag, &db);
+	rw_exit(&dn->dn_struct_rwlock);
+	return (db);
+}
+
+/*
+ * Add a tagged hold on the dbuf.
+ */
+void
+dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
+{
+	(void) refcount_add(&db->db_holds, tag);
+	/* dprintf_dbuf(db, "adding ref %p; holds up to %lld\n", tag, holds); */
+}
+
+/*
+ * Drop a tagged hold on the dbuf.  When the last hold is dropped, the
+ * dbuf is evicted; when only dirty-holds remain and immediate-evict is
+ * set, the user eviction callback fires.  Takes dn_dbufs_mtx (with a
+ * temporary dnode hold) unless the caller already holds it.
+ */
+void
+dbuf_remove_ref(dmu_buf_impl_t *db, void *tag)
+{
+	int64_t holds;
+	dnode_t *dn = db->db_dnode;
+	int need_mutex;
+
+	ASSERT(dn != NULL);
+	need_mutex = !MUTEX_HELD(&dn->dn_dbufs_mtx);
+
+	if (need_mutex) {
+		/* Keep the dnode alive while we hold its dbufs mutex. */
+		dnode_add_ref(dn, FTAG);
+		mutex_enter(&dn->dn_dbufs_mtx);
+	}
+
+	mutex_enter(&db->db_mtx);
+	dbuf_verify(db);
+
+	holds = refcount_remove(&db->db_holds, tag);
+
+	if (holds == 0) {
+		ASSERT3U(db->db_state, !=, DB_FILL);
+		/* Null out the user's cached data pointer before evicting. */
+		if (db->db_level == 0 &&
+		    db->db_d.db_user_data_ptr_ptr != NULL)
+			*db->db_d.db_user_data_ptr_ptr = NULL;
+		dbuf_evict(db);
+	} else {
+		if (holds == db->db_dirtycnt &&
+		    db->db_level == 0 && db->db_d.db_immediate_evict)
+			dbuf_evict_user(db);
+		mutex_exit(&db->db_mtx);
+	}
+
+	if (need_mutex) {
+		mutex_exit(&dn->dn_dbufs_mtx);
+		dnode_rele(dn, FTAG);
+	}
+}
+
+/*
+ * Drop an untagged hold (the counterpart of dbuf_hold()).
+ */
+void
+dbuf_rele(dmu_buf_impl_t *db)
+{
+	dbuf_remove_ref(db, NULL);
+}
+
+#pragma weak dmu_buf_refcount = dbuf_refcount
+/*
+ * Return the current number of holds on the dbuf.
+ */
+uint64_t
+dbuf_refcount(dmu_buf_impl_t *db)
+{
+	return (refcount_count(&db->db_holds));
+}
+
+/*
+ * Attach user data and an eviction callback to the dbuf, but only if
+ * no user data is currently set (old_user_ptr == NULL).  Returns the
+ * previous user pointer (NULL on success).
+ */
+void *
+dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
+    dmu_buf_evict_func_t *evict_func)
+{
+	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
+	    user_data_ptr_ptr, evict_func));
+}
+
+/*
+ * Like dmu_buf_set_user(), but also request "immediate eviction":
+ * the eviction callback fires as soon as only dirty-holds remain.
+ */
+void *
+dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
+    dmu_buf_evict_func_t *evict_func)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+	db->db_d.db_immediate_evict = TRUE;
+	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
+	    user_data_ptr_ptr, evict_func));
+}
+
+/*
+ * Atomically replace the dbuf's user data, provided the current user
+ * pointer matches old_user_ptr (compare-and-swap semantics).  Returns
+ * old_user_ptr on success, or the actual current user pointer if the
+ * comparison failed.  Level-0 dbufs only.
+ */
+void *
+dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
+    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	ASSERT(db->db_level == 0);
+
+	/* A user pointer and eviction callback must be set together. */
+	ASSERT((user_ptr == NULL) == (evict_func == NULL));
+
+	mutex_enter(&db->db_mtx);
+
+	if (db->db_d.db_user_ptr == old_user_ptr) {
+		db->db_d.db_user_ptr = user_ptr;
+		db->db_d.db_user_data_ptr_ptr = user_data_ptr_ptr;
+		db->db_d.db_evict_func = evict_func;
+
+		dbuf_update_data(db);
+	} else {
+		old_user_ptr = db->db_d.db_user_ptr;
+	}
+
+	mutex_exit(&db->db_mtx);
+	return (old_user_ptr);
+}
+
+/*
+ * Return the user data pointer previously attached with
+ * dmu_buf_set_user() (NULL if none).  The dbuf must be held.
+ */
+void *
+dmu_buf_get_user(dmu_buf_t *db_fake)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	ASSERT(!refcount_is_zero(&db->db_holds));
+
+	return (db->db_d.db_user_ptr);
+}
+
+/*
+ * Write out this dbuf's dirty data for the syncing transaction group.
+ * Handles the special cases (freed-after-dirty, bonus buffers, unmodified
+ * indirects, overridden blocks) inline; otherwise hooks the dbuf up to
+ * its parent's block pointer and issues the ARC write, with
+ * dbuf_write_done() as the completion callback.
+ */
+void
+dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
+{
+	arc_buf_t **data;
+	uint64_t txg = tx->tx_txg;
+	dnode_t *dn = db->db_dnode;
+	objset_impl_t *os = dn->dn_objset;
+	int blksz;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
+
+	mutex_enter(&db->db_mtx);
+	/*
+	 * To be synced, we must be dirtied.  But we
+	 * might have been freed after the dirty.
+	 */
+	if (db->db_state == DB_UNCACHED) {
+		/* This buffer has been freed since it was dirtied */
+		ASSERT(db->db.db_data == NULL);
+	} else if (db->db_state == DB_FILL) {
+		/* This buffer was freed and is now being re-filled */
+		ASSERT(db->db.db_data != db->db_d.db_data_old[txg&TXG_MASK]);
+	} else {
+		ASSERT3U(db->db_state, ==, DB_CACHED);
+	}
+	dbuf_verify(db);
+
+	/*
+	 * Don't need a lock on db_dirty (dn_mtx), because it can't
+	 * be modified yet.
+	 */
+
+	if (db->db_level == 0) {
+		/* Level 0: write the snapshot taken at dirty time. */
+		data = &db->db_d.db_data_old[txg&TXG_MASK];
+		blksz = arc_buf_size(*data);
+		/*
+		 * If this buffer is currently "in use" (i.e., there are
+		 * active holds and db_data still references it), then make
+		 * a copy before we start the write so that any modifications
+		 * from the open txg will not leak into this write.
+		 *
+		 * NOTE: this copy does not need to be made for objects only
+		 * modified in the syncing context (e.g. DNONE_DNODE blocks)
+		 * or if there is no actual write involved (bonus blocks).
+		 */
+		if (!(dn->dn_object & DMU_PRIVATE_OBJECT) &&
+		    db->db_d.db_overridden_by[txg&TXG_MASK] == NULL &&
+		    db->db_blkid != DB_BONUS_BLKID) {
+			if (refcount_count(&db->db_holds) > 1 &&
+			    *data == db->db_buf) {
+				*data = arc_buf_alloc(
+				    db->db_dnode->dn_objset->os_spa, blksz, db);
+				bcopy(db->db.db_data, (*data)->b_data, blksz);
+			}
+			db->db_data_pending = *data;
+		} else if (dn->dn_object & DMU_PRIVATE_OBJECT) {
+			/*
+			 * Private object buffers are released here rather
+			 * than in dbuf_dirty() since they are only modified
+			 * in the syncing context and we don't want the
+			 * overhead of making multiple copies of the data.
+			 */
+			arc_release(db->db_buf, db);
+		}
+	} else {
+		/* Indirect: write the live buffer. */
+		data = &db->db_buf;
+		if (*data == NULL) {
+			/*
+			 * This can happen if we dirty and then free
+			 * the level-0 data blocks in the same txg.  So
+			 * this indirect remains unchanged.
+			 */
+			if (db->db_dirtied == txg)
+				db->db_dirtied = 0;
+			ASSERT(db->db_dirtycnt > 0);
+			db->db_dirtycnt -= 1;
+			mutex_exit(&db->db_mtx);
+			dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+			return;
+		}
+		blksz = db->db.db_size;
+		ASSERT3U(blksz, ==, 1<<dn->dn_phys->dn_indblkshift);
+	}
+
+	ASSERT(*data != NULL);
+
+	if (db->db_blkid == DB_BONUS_BLKID) {
+		/*
+		 * Simply copy the bonus data into the dnode.  It will
+		 * be written out when the dnode is synced (and it will
+		 * be synced, since it must have been dirty for dbuf_sync
+		 * to be called).  The bonus data will be byte swapped
+		 * in dnode_byteswap.
+		 */
+		/*
+		 * Use dn_phys->dn_bonuslen since db.db_size is the length
+		 * of the bonus buffer in the open transaction rather than
+		 * the syncing transaction.
+		 */
+		ASSERT3U(db->db_level, ==, 0);
+		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, blksz);
+		bcopy((*data)->b_data, DN_BONUS(dn->dn_phys),
+		    dn->dn_phys->dn_bonuslen);
+		if (*data != db->db_buf)
+			arc_buf_free(*data, db);
+		db->db_d.db_data_old[txg&TXG_MASK] = NULL;
+		db->db_data_pending = NULL;
+		if (db->db_dirtied == txg)
+			db->db_dirtied = 0;
+		ASSERT(db->db_dirtycnt > 0);
+		db->db_dirtycnt -= 1;
+		mutex_exit(&db->db_mtx);
+		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+		return;
+	} else if (db->db_level > 0 && !arc_released(db->db_buf)) {
+		/*
+		 * This indirect buffer was marked dirty, but
+		 * never modified (if it had been modified, then
+		 * we would have released the buffer).  There is
+		 * no reason to write anything.
+		 */
+		db->db_data_pending = NULL;
+		if (db->db_dirtied == txg)
+			db->db_dirtied = 0;
+		ASSERT(db->db_dirtycnt > 0);
+		db->db_dirtycnt -= 1;
+		mutex_exit(&db->db_mtx);
+		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+		return;
+	} else if (db->db_blkptr == NULL &&
+	    db->db_level == dn->dn_phys->dn_nlevels-1 &&
+	    db->db_blkid < dn->dn_phys->dn_nblkptr) {
+		/*
+		 * This buffer was allocated at a time when there was
+		 * no available blkptrs from the dnode, or it was
+		 * inappropriate to hook it in (i.e., nlevels mis-match).
+		 */
+		ASSERT(db->db_blkptr == NULL);
+		ASSERT(db->db_parent == NULL);
+		db->db_parent = dn->dn_dbuf;
+		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
+		dbuf_verify(db);
+		mutex_exit(&db->db_mtx);
+	} else if (db->db_blkptr == NULL) {
+		/* Hook up to (and if needed, look up) the parent indirect. */
+		dmu_buf_impl_t *parent = db->db_parent;
+		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+		mutex_exit(&db->db_mtx);
+		ASSERT(dn->dn_phys->dn_nlevels > 1);
+		if (parent == NULL) {
+			rw_enter(&dn->dn_struct_rwlock, RW_READER);
+			(void) dbuf_hold_impl(dn, db->db_level+1,
+			    db->db_blkid >> epbs, FALSE, NULL, &parent);
+			rw_exit(&dn->dn_struct_rwlock);
+			dbuf_add_ref(parent, db);
+			db->db_parent = parent;
+			dbuf_rele(parent);
+		}
+		dbuf_read(parent);
+	} else {
+		mutex_exit(&db->db_mtx);
+	}
+
+	ASSERT(IS_DNODE_DNODE(dn->dn_object) || db->db_parent != NULL);
+
+	if (db->db_parent != dn->dn_dbuf) {
+		dmu_buf_impl_t *parent = db->db_parent;
+		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+		mutex_enter(&db->db_mtx);
+		ASSERT(db->db_level == parent->db_level-1);
+		ASSERT(list_link_active(&parent->db_dirty_node[txg&TXG_MASK]));
+		/*
+		 * We may have read this block after we dirtied it,
+		 * so never released it from the cache.
+		 */
+		arc_release(parent->db_buf, parent);
+
+		/* Our blkptr lives inside the parent's data buffer. */
+		db->db_blkptr = (blkptr_t *)parent->db.db_data +
+		    (db->db_blkid & ((1ULL << epbs) - 1));
+		dbuf_verify(db);
+		mutex_exit(&db->db_mtx);
+	}
+	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
+
+#ifdef ZFS_DEBUG
+	if (db->db_parent == dn->dn_dbuf) {
+		/*
+		 * We don't need to dnode_setdirty(dn) because if we got
+		 * here then the parent is already dirty.
+		 */
+		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
+		ASSERT3P(db->db_blkptr, ==,
+		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
+	}
+#endif
+	if (db->db_level == 0 &&
+	    db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
+		/* Overridden block: install the override bp, no write needed. */
+		arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];
+		blkptr_t **bpp = &db->db_d.db_overridden_by[txg&TXG_MASK];
+		int old_size = BP_GET_ASIZE(db->db_blkptr);
+		int new_size = BP_GET_ASIZE(*bpp);
+
+		ASSERT(db->db_blkid != DB_BONUS_BLKID);
+
+		dnode_diduse_space(dn, new_size-old_size);
+		mutex_enter(&dn->dn_mtx);
+		if (db->db_blkid > dn->dn_phys->dn_maxblkid)
+			dn->dn_phys->dn_maxblkid = db->db_blkid;
+		mutex_exit(&dn->dn_mtx);
+
+		dsl_dataset_block_born(os->os_dsl_dataset, *bpp, tx);
+		if (!BP_IS_HOLE(db->db_blkptr))
+			dsl_dataset_block_kill(os->os_dsl_dataset,
+			    db->db_blkptr, os->os_synctx);
+
+		mutex_enter(&db->db_mtx);
+		*db->db_blkptr = **bpp;
+		kmem_free(*bpp, sizeof (blkptr_t));
+		*bpp = NULL;
+
+		if (*old != db->db_buf)
+			arc_buf_free(*old, db);
+		*old = NULL;
+		db->db_data_pending = NULL;
+
+		cv_broadcast(&db->db_changed);
+
+		ASSERT(db->db_dirtycnt > 0);
+		db->db_dirtycnt -= 1;
+		mutex_exit(&db->db_mtx);
+		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+	} else {
+		int checksum, compress;
+
+		if (db->db_level > 0) {
+			/*
+			 * XXX -- we should design a compression algorithm
+			 * that specializes in arrays of bps.
+			 */
+			checksum = ZIO_CHECKSUM_FLETCHER_4;
+			compress = ZIO_COMPRESS_LZJB;
+		} else {
+			/*
+			 * Allow dnode settings to override objset settings,
+			 * except for metadata checksums.
+			 */
+			if (dmu_ot[dn->dn_type].ot_metadata) {
+				checksum = os->os_md_checksum;
+				compress = zio_compress_select(dn->dn_compress,
+				    os->os_md_compress);
+			} else {
+				checksum = zio_checksum_select(dn->dn_checksum,
+				    os->os_checksum);
+				compress = zio_compress_select(dn->dn_compress,
+				    os->os_compress);
+			}
+		}
+#ifdef ZFS_DEBUG
+		if (db->db_parent) {
+			ASSERT(list_link_active(
+			    &db->db_parent->db_dirty_node[txg&TXG_MASK]));
+			ASSERT(db->db_parent == dn->dn_dbuf ||
+			    db->db_parent->db_level > 0);
+			if (dn->dn_object & DMU_PRIVATE_OBJECT ||
+			    db->db_level > 0)
+				ASSERT(*data == db->db_buf);
+		}
+#endif
+		ASSERT3U(db->db_blkptr->blk_birth, <=, tx->tx_txg);
+		(void) arc_write(zio, os->os_spa, checksum, compress, txg,
+		    db->db_blkptr, *data, dbuf_write_done, db,
+		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT);
+		/*
+		 * We can't access db after arc_write, since it could finish
+		 * and be freed, and we have no locks on it.
+		 */
+	}
+}
+
+/*
+ * Argument bundle for the deferred block-born/block-kill taskq callbacks
+ * dispatched from dbuf_write_done().
+ */
+struct dbuf_arg {
+	objset_impl_t *os;
+	blkptr_t bp;
+};
+
+/*
+ * Taskq callback: record a newly-born block in the dataset's
+ * accounting, then free the argument bundle.
+ */
+static void
+dbuf_do_born(void *arg)
+{
+	struct dbuf_arg *da = arg;
+	dsl_dataset_block_born(da->os->os_dsl_dataset,
+	    &da->bp, da->os->os_synctx);
+	kmem_free(da, sizeof (struct dbuf_arg));
+}
+
+/*
+ * Taskq callback: record a killed (replaced) block in the dataset's
+ * accounting, then free the argument bundle.
+ */
+static void
+dbuf_do_kill(void *arg)
+{
+	struct dbuf_arg *da = arg;
+	dsl_dataset_block_kill(da->os->os_dsl_dataset,
+	    &da->bp, da->os->os_synctx);
+	kmem_free(da, sizeof (struct dbuf_arg));
+}
+
+/*
+ * ARC write completion callback for dbuf_sync(): update space accounting
+ * and dn_maxblkid, compute and store the block pointer's fill count,
+ * type, and level, dispatch deferred born/kill accounting if the block
+ * moved, and drop the txg's dirty hold on the dbuf.
+ */
+/* ARGSUSED */
+static void
+dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+	dmu_buf_impl_t *db = vdb;
+	dnode_t *dn = db->db_dnode;
+	objset_impl_t *os = dn->dn_objset;
+	uint64_t txg = zio->io_txg;
+	uint64_t fill = 0;
+	int i;
+	int old_size, new_size;
+
+	ASSERT3U(zio->io_error, ==, 0);
+
+	dprintf_dbuf_bp(db, &zio->io_bp_orig, "bp_orig: %s", "");
+
+	old_size = BP_GET_ASIZE(&zio->io_bp_orig);
+	new_size = BP_GET_ASIZE(zio->io_bp);
+
+	dnode_diduse_space(dn, new_size-old_size);
+
+	mutex_enter(&db->db_mtx);
+
+	if (db->db_dirtied == txg)
+		db->db_dirtied = 0;
+
+	if (db->db_level == 0) {
+		arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];
+
+		ASSERT(db->db_blkid != DB_BONUS_BLKID);
+
+		/* Free the write-time copy if one was made in dbuf_sync(). */
+		if (*old != db->db_buf)
+			arc_buf_free(*old, db);
+		*old = NULL;
+		db->db_data_pending = NULL;
+
+		mutex_enter(&dn->dn_mtx);
+		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
+		    !BP_IS_HOLE(db->db_blkptr))
+			dn->dn_phys->dn_maxblkid = db->db_blkid;
+		mutex_exit(&dn->dn_mtx);
+
+		if (dn->dn_type == DMU_OT_DNODE) {
+			/* Dnode blocks: fill = number of allocated dnodes. */
+			dnode_phys_t *dnp = db->db.db_data;
+			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
+			    i--, dnp++) {
+				if (dnp->dn_type != DMU_OT_NONE)
+					fill++;
+			}
+		} else {
+			/* Data blocks: fill = 1 unless the block is a hole. */
+			if (!BP_IS_HOLE(db->db_blkptr))
+				fill = 1;
+		}
+	} else {
+		/* Indirect blocks: fill = sum of the children's fill counts. */
+		blkptr_t *bp = db->db.db_data;
+		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+		if (!BP_IS_HOLE(db->db_blkptr)) {
+			ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, db->db.db_size);
+			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
+			    db->db.db_size);
+		}
+		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
+			if (BP_IS_HOLE(bp))
+				continue;
+			ASSERT3U(BP_GET_LSIZE(bp), ==,
+			    db->db_level == 1 ? dn->dn_datablksz :
+			    (1<<dn->dn_phys->dn_indblkshift));
+			fill += bp->blk_fill;
+		}
+	}
+
+	if (!BP_IS_HOLE(db->db_blkptr)) {
+		db->db_blkptr->blk_fill = fill;
+		BP_SET_TYPE(db->db_blkptr, dn->dn_type);
+		BP_SET_LEVEL(db->db_blkptr, db->db_level);
+	} else {
+		ASSERT3U(fill, ==, 0);
+		ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
+	}
+
+	dprintf_dbuf_bp(db, db->db_blkptr,
+	    "wrote %llu bytes to blkptr:", zio->io_size);
+
+	ASSERT(db->db_parent == NULL ||
+	    list_link_active(&db->db_parent->db_dirty_node[txg&TXG_MASK]));
+	cv_broadcast(&db->db_changed);
+	ASSERT(db->db_dirtycnt > 0);
+	db->db_dirtycnt -= 1;
+	mutex_exit(&db->db_mtx);
+
+	/* We must do this after we've set the bp's type and level */
+	if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp),
+	    BP_IDENTITY(&zio->io_bp_orig))) {
+		/* Block moved: defer born/kill accounting to the taskq. */
+		struct dbuf_arg *da;
+		da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP);
+		da->os = os;
+		da->bp = *zio->io_bp;
+		(void) taskq_dispatch(dbuf_tq, dbuf_do_born, da, 0);
+		if (!BP_IS_HOLE(&zio->io_bp_orig)) {
+			da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP);
+			da->os = os;
+			da->bp = zio->io_bp_orig;
+			(void) taskq_dispatch(dbuf_tq, dbuf_do_kill, da, 0);
+		}
+	}
+
+	dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+}
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
new file mode 100644
index 0000000000..14fab6d420
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -0,0 +1,1761 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+
+/*
+ * Table of per-object-type information, indexed by dmu_object_type_t.
+ * Each entry supplies the byteswap routine used when reading the type on
+ * an opposite-endian machine, a boolean flag (TRUE for all internal DMU/
+ * DSL/ZAP types, FALSE for raw user data such as plain files and zvol
+ * blocks -- presumably "is metadata"; confirm against the
+ * dmu_object_type_info_t definition), and a human-readable name.
+ * The order here must match the DMU_OT_* enum exactly.
+ */
+const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
+	{	byteswap_uint8_array,	TRUE,	"unallocated"		},
+	{	zap_byteswap,		TRUE,	"object directory"	},
+	{	byteswap_uint64_array,	TRUE,	"object array"		},
+	{	byteswap_uint8_array,	TRUE,	"packed nvlist"		},
+	{	byteswap_uint64_array,	TRUE,	"packed nvlist size"	},
+	{	byteswap_uint64_array,	TRUE,	"bplist"		},
+	{	byteswap_uint64_array,	TRUE,	"bplist header"		},
+	{	byteswap_uint64_array,	TRUE,	"SPA space map header"	},
+	{	byteswap_uint64_array,	TRUE,	"SPA space map"		},
+	{	byteswap_uint64_array,	TRUE,	"ZIL intent log"	},
+	{	dnode_buf_byteswap,	TRUE,	"DMU dnode"		},
+	{	dmu_objset_byteswap,	TRUE,	"DMU objset"		},
+	{	byteswap_uint64_array,	TRUE,	"DSL directory"		},
+	{	zap_byteswap,		TRUE,	"DSL directory child map"},
+	{	zap_byteswap,		TRUE,	"DSL dataset snap map"	},
+	{	zap_byteswap,		TRUE,	"DSL props"		},
+	{	byteswap_uint64_array,	TRUE,	"DSL dataset"		},
+	{	zfs_znode_byteswap,	TRUE,	"ZFS znode"		},
+	{	zfs_acl_byteswap,	TRUE,	"ZFS ACL"		},
+	{	byteswap_uint8_array,	FALSE,	"ZFS plain file"	},
+	{	zap_byteswap,		TRUE,	"ZFS directory"		},
+	{	zap_byteswap,		TRUE,	"ZFS master node"	},
+	{	zap_byteswap,		TRUE,	"ZFS delete queue"	},
+	{	byteswap_uint8_array,	FALSE,	"zvol object"		},
+	{	zap_byteswap,		TRUE,	"zvol prop"		},
+	{	byteswap_uint8_array,	FALSE,	"other uint8[]"		},
+	{	byteswap_uint64_array,	FALSE,	"other uint64[]"	},
+	{	zap_byteswap,		TRUE,	"other ZAP"		},
+};
+
+/*
+ * Read a held array of dbufs: issue the uncached reads asynchronously
+ * under one root zio, wait for that zio, then wait for any reads or
+ * fills already in flight on the remaining buffers.  Returns 0 or, when
+ * DB_RF_CANFAIL is set, the error from zio_wait().
+ */
+static int
+dmu_buf_read_array_impl(dmu_buf_impl_t **dbp, int numbufs, uint32_t flags)
+{
+	dnode_t *dn;
+	zio_t *zio;
+	uint64_t total;
+	int i, err;
+
+	if (numbufs == 0)
+		return (0);
+
+	total = numbufs * dbp[0]->db.db_size;
+	ASSERT(total <= DMU_MAX_ACCESS);
+
+	dn = dbp[0]->db_dnode;
+	/* the root zio may fail only if the caller can tolerate it */
+	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL,
+	    (flags & DB_RF_CANFAIL) ? 1 : 0);
+
+	/* don't prefetch if the read is large */
+	if (total >= zfetch_array_rd_sz)
+		flags |= DB_RF_NOPREFETCH;
+
+	/* initiate async reads */
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	for (i = 0; i < numbufs; i++) {
+		if (dbp[i]->db_state == DB_UNCACHED)
+			dbuf_read_impl(dbp[i], zio, flags);
+	}
+	rw_exit(&dn->dn_struct_rwlock);
+
+	err = zio_wait(zio);
+	if (err)
+		return (err);
+
+	/* wait for other io to complete */
+	for (i = 0; i < numbufs; i++) {
+		mutex_enter(&dbp[i]->db_mtx);
+		while (dbp[i]->db_state == DB_READ ||
+		    dbp[i]->db_state == DB_FILL)
+			cv_wait(&dbp[i]->db_changed, &dbp[i]->db_mtx);
+		ASSERT(dbp[i]->db_state == DB_CACHED);
+		mutex_exit(&dbp[i]->db_mtx);
+	}
+
+	return (0);
+}
+
+/*
+ * Read a held array of dbufs; the read must succeed (errors assert).
+ */
+void
+dmu_buf_read_array(dmu_buf_t **dbp_fake, int numbufs)
+{
+	int err = dmu_buf_read_array_impl((dmu_buf_impl_t **)dbp_fake,
+	    numbufs, DB_RF_MUST_SUCCEED);
+
+	ASSERT3U(err, ==, 0);
+}
+
+/*
+ * As dmu_buf_read_array(), but propagates I/O errors to the caller.
+ */
+int
+dmu_buf_read_array_canfail(dmu_buf_t **dbp_fake, int numbufs)
+{
+	return (dmu_buf_read_array_impl((dmu_buf_impl_t **)dbp_fake,
+	    numbufs, DB_RF_CANFAIL));
+}
+
+/*
+ * Hold the dbuf containing byte `offset' of the given object and return
+ * its public dmu_buf_t.  The dnode hold is released before returning;
+ * the caller must release the dbuf when done.
+ * NOTE(review): the dnode_hold() result is not NULL-checked here,
+ * unlike dmu_bonus_hold_tag()/dmu_prefetch() -- confirm callers only
+ * pass objects known to exist.
+ */
+dmu_buf_t *
+dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset)
+{
+	dnode_t *dn;
+	uint64_t blkid;
+	dmu_buf_impl_t *db;
+
+	/* dataset_verify(dd); */
+
+	dn = dnode_hold(os->os, object, FTAG);
+	blkid = dbuf_whichblock(dn, offset);
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	db = dbuf_hold(dn, blkid);
+	rw_exit(&dn->dn_struct_rwlock);
+	dnode_rele(dn, FTAG);
+	return (&db->db);
+}
+
+/*
+ * Untagged convenience wrapper around dmu_bonus_hold_tag().
+ */
+dmu_buf_t *
+dmu_bonus_hold(objset_t *os, uint64_t object)
+{
+	return (dmu_bonus_hold_tag(os, object, NULL));
+}
+
+/*
+ * Report the largest bonus buffer the DMU supports.
+ */
+int
+dmu_bonus_max(void)
+{
+	return (DN_MAX_BONUSLEN);
+}
+
+/*
+ * Returns held bonus buffer if the object exists, NULL if it doesn't.
+ *
+ * For ZAP objects an extra hold is taken on block 0 and stashed in
+ * dn_db0; dmu_buf_rele()/dmu_buf_rele_tag() release that extra hold,
+ * so bonus buffers obtained here must be released through one of those.
+ */
+dmu_buf_t *
+dmu_bonus_hold_tag(objset_t *os, uint64_t object, void *tag)
+{
+	dnode_t *dn = dnode_hold(os->os, object, FTAG);
+	dmu_buf_impl_t *db;
+
+	if (dn == NULL)
+		return (NULL);
+
+	db = dbuf_hold_bonus(dn, tag);
+	/* XXX - hack: hold the first block if this is a ZAP object */
+	if (dmu_ot[dn->dn_type].ot_byteswap == zap_byteswap) {
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		dn->dn_db0 = dbuf_hold(dn, 0);
+		rw_exit(&dn->dn_struct_rwlock);
+	}
+	dnode_rele(dn, FTAG);
+	return (&db->db);
+}
+
+/*
+ * Hold every dbuf overlapping [offset, offset+length) of dn and return
+ * a kmem-allocated array of their public handles; *numbufsp is set to
+ * the count.  Release with dmu_buf_rele_array().  Returns NULL (count 0)
+ * for a zero-length request.
+ */
+static dmu_buf_t **
+dbuf_hold_array(dnode_t *dn,
+    uint64_t offset, uint64_t length, int *numbufsp)
+{
+	dmu_buf_t **dbp;
+	uint64_t blkid, nblks, i;
+
+	if (length == 0) {
+		if (numbufsp)
+			*numbufsp = 0;
+		return (NULL);
+	}
+
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	if (dn->dn_datablkshift) {
+		int blkshift = dn->dn_datablkshift;
+		/* count whole blocks covered by the aligned range */
+		nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
+		    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
+	} else {
+		/* non-power-of-two blocksize: object is a single block */
+		ASSERT3U(offset + length, <=, dn->dn_datablksz);
+		nblks = 1;
+	}
+	dbp = kmem_alloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
+
+	blkid = dbuf_whichblock(dn, offset);
+	for (i = 0; i < nblks; i++) {
+		dmu_buf_impl_t *dbuf;
+		dbuf = dbuf_hold(dn, blkid+i);
+		dbp[i] = &dbuf->db;
+	}
+	rw_exit(&dn->dn_struct_rwlock);
+
+	if (numbufsp)
+		*numbufsp = nblks;
+	return (dbp);
+}
+
+/*
+ * Hold all dbufs overlapping [offset, offset+length) of the given
+ * object.  Thin wrapper around dbuf_hold_array() that manages the
+ * dnode hold.  Release the result with dmu_buf_rele_array().
+ */
+dmu_buf_t **
+dmu_buf_hold_array(objset_t *os, uint64_t object,
+    uint64_t offset, uint64_t length, int *numbufsp)
+{
+	dnode_t *dn;
+	dmu_buf_t **dbp;
+
+	ASSERT(length <= DMU_MAX_ACCESS);
+
+	if (length == 0) {
+		if (numbufsp != NULL)
+			*numbufsp = 0;
+		return (NULL);
+	}
+
+	dn = dnode_hold(os->os, object, FTAG);
+	dbp = dbuf_hold_array(dn, offset, length, numbufsp);
+	dnode_rele(dn, FTAG);
+	return (dbp);
+}
+
+/*
+ * Add a tagged reference to a held buffer.
+ */
+void
+dmu_buf_add_ref(dmu_buf_t *dbuf, void *tag)
+{
+	dbuf_add_ref((dmu_buf_impl_t *)dbuf, tag);
+}
+
+/*
+ * Drop a tagged reference previously taken with dmu_buf_add_ref().
+ */
+void
+dmu_buf_remove_ref(dmu_buf_t *dbuf, void *tag)
+{
+	dbuf_remove_ref((dmu_buf_impl_t *)dbuf, tag);
+}
+
+/*
+ * Release a held buffer.  For a bonus buffer of a ZAP object this also
+ * drops the extra dn_db0 hold taken in dmu_bonus_hold_tag() -- the two
+ * functions must stay in sync.
+ */
+void
+dmu_buf_rele(dmu_buf_t *dbuf_fake)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf_fake;
+
+	/* XXX - hack: hold the first block if this is a ZAP object */
+	if (db->db_blkid == DB_BONUS_BLKID &&
+	    dmu_ot[db->db_dnode->dn_type].ot_byteswap == zap_byteswap)
+		dbuf_rele(db->db_dnode->dn_db0);
+	dbuf_rele(db);
+}
+
+/*
+ * Tagged variant of dmu_buf_rele(): drops the tagged reference instead
+ * of an anonymous one, and likewise undoes the ZAP dn_db0 hack from
+ * dmu_bonus_hold_tag().
+ */
+void
+dmu_buf_rele_tag(dmu_buf_t *dbuf_fake, void *tag)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf_fake;
+
+	/* XXX - hack: hold the first block if this is a ZAP object */
+	if (db->db_blkid == DB_BONUS_BLKID &&
+	    dmu_ot[db->db_dnode->dn_type].ot_byteswap == zap_byteswap)
+		dbuf_rele(db->db_dnode->dn_db0);
+	dbuf_remove_ref(db, tag);
+}
+
+/*
+ * Release every buffer held by dmu_buf_hold_array() and free the
+ * handle array itself.
+ */
+void
+dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs)
+{
+	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
+	int i;
+
+	if (numbufs == 0)
+		return;
+
+	ASSERT((numbufs * dbp[0]->db.db_size) <= DMU_MAX_ACCESS);
+
+	for (i = 0; i < numbufs; i++)
+		dbuf_rele(dbp[i]);
+
+	kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
+}
+
+/*
+ * Issue prefetch I/O for the given range of an object.  len == 0 means
+ * "prefetch the dnode block itself" (useful before touching the bonus
+ * buffer).  Best-effort: a nonexistent object is silently ignored.
+ *
+ * Fix: use 1ULL when shifting by dn_datablkshift, matching
+ * dbuf_hold_array(); the previous `1<<blkshift' performed the shift in
+ * signed int, which is inconsistent and would overflow for shifts >= 31.
+ */
+void
+dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
+{
+	dnode_t *dn;
+	uint64_t blkid;
+	int nblks, i;
+
+	if (len == 0) { /* they're interested in the bonus buffer */
+		dn = os->os->os_meta_dnode;
+
+		if (object == 0 || object >= DN_MAX_OBJECT)
+			return;
+
+		/* prefetch the meta-dnode block holding this dnode */
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
+		dbuf_prefetch(dn, blkid);
+		rw_exit(&dn->dn_struct_rwlock);
+		return;
+	}
+
+	/*
+	 * XXX - Note, if the dnode for the requested object is not
+	 * already cached, we will do a *synchronous* read in the
+	 * dnode_hold() call.  The same is true for any indirects.
+	 */
+	dn = dnode_hold(os->os, object, FTAG);
+	if (dn == NULL)
+		return;
+
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	if (dn->dn_datablkshift) {
+		int blkshift = dn->dn_datablkshift;
+		nblks = (P2ROUNDUP(offset+len, 1ULL<<blkshift) -
+		    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
+	} else {
+		/* single-block object: prefetch it iff offset is inside */
+		nblks = (offset < dn->dn_datablksz);
+	}
+
+	if (nblks != 0) {
+		blkid = dbuf_whichblock(dn, offset);
+		for (i = 0; i < nblks; i++)
+			dbuf_prefetch(dn, blkid+i);
+	}
+
+	rw_exit(&dn->dn_struct_rwlock);
+
+	dnode_rele(dn, FTAG);
+}
+
+/*
+ * Free the byte range [offset, offset+size) of the given object within
+ * transaction tx.  size == -1ULL means "to the end of the object".
+ * NOTE(review): dnode_hold() is not NULL-checked here, unlike
+ * dmu_prefetch() -- confirm callers only pass existing objects.
+ */
+void
+dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
+    uint64_t size, dmu_tx_t *tx)
+{
+	dnode_t *dn = dnode_hold(os->os, object, FTAG);
+	ASSERT(offset < UINT64_MAX);
+	ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
+	dnode_free_range(dn, offset, size, tx);
+	dnode_rele(dn, FTAG);
+}
+
+/*
+ * Copy `size' bytes starting at `offset' of the object into `buf'.
+ * The range is processed in chunks of at most DMU_MAX_ACCESS/2 so the
+ * per-chunk dbuf array stays bounded while still reading in parallel.
+ * With DB_RF_CANFAIL, I/O errors are returned; otherwise they assert.
+ */
+static int
+dmu_read_impl(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+    void *buf, uint32_t flags)
+{
+	dnode_t *dn;
+	dmu_buf_t **dbp;
+	int numbufs, i;
+
+	dn = dnode_hold(os->os, object, FTAG);
+
+	/*
+	 * Single-block object (no datablkshift): clamp the request to the
+	 * block and zero the part of buf beyond the object's end.
+	 */
+	if (dn->dn_datablkshift == 0) {
+		int newsz = offset > dn->dn_datablksz ? 0 :
+		    MIN(size, dn->dn_datablksz - offset);
+		bzero((char *)buf + newsz, size - newsz);
+		size = newsz;
+	}
+
+	dnode_rele(dn, FTAG);
+
+	if (size == 0)
+		return (0);
+
+	while (size > 0) {
+		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
+		int err;
+
+		/*
+		 * NB: we could do this block-at-a-time, but it's nice
+		 * to be reading in parallel.
+		 */
+		dbp = dmu_buf_hold_array(os, object, offset, mylen, &numbufs);
+		err = dmu_buf_read_array_impl((dmu_buf_impl_t **)dbp, numbufs,
+		    flags);
+		if (err) {
+			dmu_buf_rele_array(dbp, numbufs);
+			return (err);
+		}
+
+		for (i = 0; i < numbufs; i++) {
+			int tocpy;
+			int bufoff;
+			dmu_buf_t *db = dbp[i];
+
+			ASSERT(size > 0);
+
+			/* copy the part of this block inside the request */
+			bufoff = offset - db->db_offset;
+			tocpy = (int)MIN(db->db_size - bufoff, size);
+
+			bcopy((char *)db->db_data + bufoff, buf, tocpy);
+
+			offset += tocpy;
+			size -= tocpy;
+			buf = (char *)buf + tocpy;
+		}
+		dmu_buf_rele_array(dbp, numbufs);
+	}
+	return (0);
+}
+
+/*
+ * Read that must succeed; I/O failure asserts in debug builds.
+ */
+void
+dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+    void *buf)
+{
+	int err = dmu_read_impl(os, object, offset, size, buf,
+	    DB_RF_MUST_SUCCEED);
+
+	ASSERT3U(err, ==, 0);
+}
+
+/*
+ * Read that propagates I/O errors to the caller.
+ */
+int
+dmu_read_canfail(objset_t *os, uint64_t object, uint64_t offset,
+    uint64_t size, void *buf)
+{
+	return (dmu_read_impl(os, object, offset, size, buf, DB_RF_CANFAIL));
+}
+
+/*
+ * Write `size' bytes from `buf' at `offset' of the object within tx.
+ * Blocks that are completely overwritten use the will_fill/fill_done
+ * protocol (no need to read the old contents); partially overwritten
+ * blocks are dirtied, which may read them in first.
+ */
+void
+dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+    const void *buf, dmu_tx_t *tx)
+{
+	dmu_buf_t **dbp;
+	int numbufs, i;
+
+	dbp = dmu_buf_hold_array(os, object, offset, size, &numbufs);
+
+	for (i = 0; i < numbufs; i++) {
+		int tocpy;
+		int bufoff;
+		dmu_buf_t *db = dbp[i];
+
+		ASSERT(size > 0);
+
+		bufoff = offset - db->db_offset;
+		tocpy = (int)MIN(db->db_size - bufoff, size);
+
+		/* only the first and last blocks can be partial */
+		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+		if (tocpy == db->db_size)
+			dmu_buf_will_fill(db, tx);
+		else
+			dmu_buf_will_dirty(db, tx);
+
+		bcopy(buf, (char *)db->db_data + bufoff, tocpy);
+
+		if (tocpy == db->db_size)
+			dmu_buf_fill_done(db, tx);
+
+		offset += tocpy;
+		size -= tocpy;
+		buf = (char *)buf + tocpy;
+	}
+	dmu_buf_rele_array(dbp, numbufs);
+}
+
+#ifdef _KERNEL
+/*
+ * Like dmu_write(), but the data comes from a uio.  Uses the same
+ * will_fill/will_dirty split per block.  On a uiomove() error, the
+ * fill is still completed for the current block before bailing out,
+ * and the error is returned.
+ */
+int
+dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+    uio_t *uio, dmu_tx_t *tx)
+{
+	dmu_buf_t **dbp;
+	int numbufs, i;
+	int err = 0;
+
+	dbp = dmu_buf_hold_array(os, object, offset, size, &numbufs);
+
+	for (i = 0; i < numbufs; i++) {
+		int tocpy;
+		int bufoff;
+		dmu_buf_t *db = dbp[i];
+
+		ASSERT(size > 0);
+
+		bufoff = offset - db->db_offset;
+		tocpy = (int)MIN(db->db_size - bufoff, size);
+
+		/* only the first and last blocks can be partial */
+		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+		if (tocpy == db->db_size)
+			dmu_buf_will_fill(db, tx);
+		else
+			dmu_buf_will_dirty(db, tx);
+
+		/*
+		 * XXX uiomove could block forever (eg. nfs-backed
+		 * pages).  There needs to be a uiolockdown() function
+		 * to lock the pages in memory, so that uiomove won't
+		 * block.
+		 */
+		err = uiomove((char *)db->db_data + bufoff, tocpy,
+		    UIO_WRITE, uio);
+
+		if (tocpy == db->db_size)
+			dmu_buf_fill_done(db, tx);
+
+		if (err)
+			break;
+
+		offset += tocpy;
+		size -= tocpy;
+	}
+	dmu_buf_rele_array(dbp, numbufs);
+	return (err);
+}
+#endif
+
+/* State threaded through the backup (send) traversal callbacks. */
+struct backuparg {
+	dmu_replay_record_t *drr;	/* scratch record, reused per dump */
+	vnode_t *vp;			/* destination of the stream */
+	objset_t *os;			/* objset being sent */
+	int err;			/* first vn_rdwr() error, sticky */
+};
+
+/*
+ * Append `len' bytes (must be a multiple of 8) to the backup stream.
+ * Records the vn_rdwr() error in ba->err and returns it.
+ */
+static int
+dump_bytes(struct backuparg *ba, void *buf, int len)
+{
+	ssize_t resid; /* have to get resid to get detailed errno */
+	/* Need to compute checksum here */
+	ASSERT3U(len % 8, ==, 0);
+	ba->err = vn_rdwr(UIO_WRITE, ba->vp,
+	    (caddr_t)buf, len,
+	    0, UIO_SYSSPACE, FAPPEND, RLIM_INFINITY, CRED(), &resid);
+	return (ba->err);
+}
+
+/*
+ * Emit a DRR_FREE record describing a freed region of an object.
+ * Returns EINTR on stream-write failure (details in ba->err).
+ */
+static int
+dump_free(struct backuparg *ba, uint64_t object, uint64_t offset,
+    uint64_t length)
+{
+	dmu_replay_record_t *drr = ba->drr;
+
+	bzero(drr, sizeof (dmu_replay_record_t));
+	drr->drr_type = DRR_FREE;
+	drr->drr_u.drr_free.drr_object = object;
+	drr->drr_u.drr_free.drr_offset = offset;
+	drr->drr_u.drr_free.drr_length = length;
+
+	return (dump_bytes(ba, drr, sizeof (dmu_replay_record_t)) ?
+	    EINTR : 0);
+}
+
+/*
+ * Emit a DRR_WRITE record followed by the block's payload.
+ * Returns EINTR on stream-write failure (details in ba->err).
+ */
+static int
+dump_data(struct backuparg *ba, dmu_object_type_t type,
+    uint64_t object, uint64_t offset, int blksz, void *data)
+{
+	dmu_replay_record_t *drr = ba->drr;
+
+	bzero(drr, sizeof (dmu_replay_record_t));
+	drr->drr_type = DRR_WRITE;
+	drr->drr_u.drr_write.drr_object = object;
+	drr->drr_u.drr_write.drr_type = type;
+	drr->drr_u.drr_write.drr_offset = offset;
+	drr->drr_u.drr_write.drr_length = blksz;
+
+	if (dump_bytes(ba, drr, sizeof (dmu_replay_record_t)) != 0)
+		return (EINTR);
+	return (dump_bytes(ba, data, blksz) ? EINTR : 0);
+}
+
+/*
+ * Emit a DRR_FREEOBJECTS record covering `numobjs' objects starting at
+ * `firstobj'.  Returns EINTR on stream-write failure.
+ */
+static int
+dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs)
+{
+	dmu_replay_record_t *drr = ba->drr;
+
+	bzero(drr, sizeof (dmu_replay_record_t));
+	drr->drr_type = DRR_FREEOBJECTS;
+	drr->drr_u.drr_freeobjects.drr_firstobj = firstobj;
+	drr->drr_u.drr_freeobjects.drr_numobjs = numobjs;
+
+	return (dump_bytes(ba, drr, sizeof (dmu_replay_record_t)) ?
+	    EINTR : 0);
+}
+
+/*
+ * Emit the records describing one dnode: a DRR_OBJECT record plus its
+ * bonus data, then a DRR_FREE for everything past the last block.  An
+ * unallocated dnode is emitted as a one-object DRR_FREEOBJECTS instead.
+ * Returns EINTR on stream-write failure.
+ */
+static int
+dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
+{
+	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
+		return (dump_freeobjects(ba, object, 1));
+
+	/* write an OBJECT record */
+	bzero(ba->drr, sizeof (dmu_replay_record_t));
+	ba->drr->drr_type = DRR_OBJECT;
+	ba->drr->drr_u.drr_object.drr_object = object;
+	ba->drr->drr_u.drr_object.drr_type = dnp->dn_type;
+	ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype;
+	ba->drr->drr_u.drr_object.drr_blksz =
+	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+	ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen;
+	ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum;
+	ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress;
+
+	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
+		return (EINTR);
+
+	/* bonus payload is padded to 8 bytes, matching restore_read() */
+	if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)))
+		return (EINTR);
+
+	/* free anything past the end of the file */
+	if (dump_free(ba, object, (dnp->dn_maxblkid + 1) *
+	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
+		return (EINTR);
+	if (ba->err)
+		return (EINTR);
+	return (0);
+}
+
+/*
+ * Bytes of object data spanned by one block pointer at the given
+ * indirection level of dnode dnp (level 0 == one data block).
+ */
+#define	BP_SPAN(dnp, level) \
+	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
+	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
+
+/*
+ * Traversal callback for dmu_sendbackup(): translate each visited
+ * block into stream records.  Branches, in order:
+ *   - hole in the meta-dnode (object 0)  -> DRR_FREEOBJECTS
+ *   - hole in any other object           -> DRR_FREE
+ *   - level-0 dnode block                -> one dump_dnode() per dnode
+ *   - level-0 data block                 -> DRR_WRITE (reading via the
+ *     ARC if the traverser didn't supply the data)
+ * Indirect blocks and the objset block fall through with no record.
+ * Returns 0 or EINTR (signal, or write failure recorded in ba->err).
+ */
+static int
+backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
+{
+	struct backuparg *ba = arg;
+	uint64_t object = bc->bc_bookmark.zb_object;
+	int level = bc->bc_bookmark.zb_level;
+	uint64_t blkid = bc->bc_bookmark.zb_blkid;
+	blkptr_t *bp = bc->bc_blkptr.blk_birth ? &bc->bc_blkptr : NULL;
+	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
+	void *data = bc->bc_data;
+	int err = 0;
+
+	if (issig(JUSTLOOKING))
+		return (EINTR);
+
+	ASSERT(data || bp == NULL);
+
+	if (bp == NULL && object == 0) {
+		uint64_t span = BP_SPAN(bc->bc_dnode, level);
+		uint64_t dnobj = (blkid * span) >> DNODE_SHIFT;
+		err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
+	} else if (bp == NULL) {
+		uint64_t span = BP_SPAN(bc->bc_dnode, level);
+		err = dump_free(ba, object, blkid * span, span);
+	} else if (data && level == 0 && type == DMU_OT_DNODE) {
+		dnode_phys_t *blk = data;
+		int i;
+		int blksz = BP_GET_LSIZE(bp);
+
+		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
+			uint64_t dnobj =
+			    (blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
+			err = dump_dnode(ba, dnobj, blk+i);
+			if (err)
+				break;
+		}
+	} else if (level == 0 &&
+	    type != DMU_OT_DNODE && type != DMU_OT_OBJSET) {
+		int blksz = BP_GET_LSIZE(bp);
+		if (data == NULL) {
+			arc_buf_t *abuf;
+
+			/* synchronous ARC read of the block contents */
+			(void) arc_read(NULL, spa, bp,
+			    dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf,
+			    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED,
+			    ARC_WAIT);
+
+			if (abuf) {
+				err = dump_data(ba, type, object, blkid * blksz,
+				    blksz, abuf->b_data);
+				arc_buf_free(abuf, &abuf);
+			}
+		} else {
+			err = dump_data(ba, type, object, blkid * blksz,
+			    blksz, data);
+		}
+	}
+
+	ASSERT(err == 0 || err == EINTR);
+	return (err);
+}
+
+/*
+ * Generate a backup (send) stream for snapshot `tosnap' onto vnode vp,
+ * incrementally against `fromsnap' if non-NULL.  Validates that tosnap
+ * is a snapshot and that fromsnap is an earlier snapshot of the same
+ * filesystem, writes the BEGIN record, walks the dataset emitting
+ * records via backup_cb(), then writes the END record.
+ *
+ * Fix: the scratch dmu_replay_record_t was leaked on the traversal
+ * error path and on a failed END-record write; free it on all exits.
+ */
+int
+dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, vnode_t *vp)
+{
+	dsl_dataset_t *ds = tosnap->os->os_dsl_dataset;
+	dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL;
+	dmu_replay_record_t *drr;
+	struct backuparg ba;
+	int err;
+
+	/* tosnap must be a snapshot */
+	if (ds->ds_phys->ds_next_snap_obj == 0)
+		return (EINVAL);
+
+	/* fromsnap must be an earlier snapshot from the same fs as tosnap */
+	if (fromds && (ds->ds_dir != fromds->ds_dir ||
+	    fromds->ds_phys->ds_creation_txg >=
+	    ds->ds_phys->ds_creation_txg))
+		return (EXDEV);
+
+	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
+	drr->drr_type = DRR_BEGIN;
+	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
+	drr->drr_u.drr_begin.drr_version = DMU_BACKUP_VERSION;
+	drr->drr_u.drr_begin.drr_creation_time =
+	    ds->ds_phys->ds_creation_time;
+	drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type;
+	drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
+	if (fromds)
+		drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
+	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
+
+	ba.drr = drr;
+	ba.vp = vp;
+	ba.os = tosnap;
+
+	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
+		kmem_free(drr, sizeof (dmu_replay_record_t));
+		return (ba.err);
+	}
+
+	err = traverse_dsl_dataset(ds,
+	    fromds ? fromds->ds_phys->ds_creation_txg : 0,
+	    ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK,
+	    backup_cb, &ba);
+
+	if (err) {
+		if (err == EINTR && ba.err)
+			err = ba.err;
+		kmem_free(drr, sizeof (dmu_replay_record_t));
+		return (err);
+	}
+
+	bzero(drr, sizeof (dmu_replay_record_t));
+	drr->drr_type = DRR_END;
+
+	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
+		kmem_free(drr, sizeof (dmu_replay_record_t));
+		return (ba.err);
+	}
+
+	kmem_free(drr, sizeof (dmu_replay_record_t));
+
+	return (0);
+}
+
+/* State for restoring (receiving) a backup stream. */
+struct restorearg {
+	int err;		/* first read/parse error, sticky */
+	int byteswap;		/* stream is opposite-endian */
+	vnode_t *vp;		/* source of the stream */
+	char *buf;		/* staging buffer for restore_read() */
+	uint64_t voff;		/* current read offset into vp */
+	int buflen; /* number of valid bytes in buf */
+	int bufoff; /* next offset to read */
+	int bufsize; /* amount of memory allocated for buf */
+};
+
+/*
+ * Syncing-context task for the BEGIN record of an incremental stream.
+ * Validates that dd's head dataset has a most-recent snapshot matching
+ * drr_fromguid, has no changes since that snapshot, and does not
+ * already have the target snapshot name; then marks the dataset as
+ * restoring.  Returns 0 on success or an errno describing the check
+ * that failed.
+ */
+static int
+replay_incremental_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+	struct drr_begin *drrb = arg;
+	dsl_dataset_t *ds = NULL;
+	dsl_dataset_t *ds_prev = NULL;
+	const char *snapname;
+	int err = EINVAL;
+	uint64_t val;
+
+	/* this must be a filesytem */
+	if (dd->dd_phys->dd_head_dataset_obj == 0)
+		goto die;
+
+	ds = dsl_dataset_open_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj,
+	    NULL, DS_MODE_EXCLUSIVE, FTAG);
+
+	if (ds == NULL) {
+		err = EBUSY;
+		goto die;
+	}
+
+	/* must already be a snapshot of this fs */
+	if (ds->ds_phys->ds_prev_snap_obj == 0) {
+		err = ENODEV;
+		goto die;
+	}
+
+	/* most recent snapshot must match fromguid */
+	/*
+	 * NOTE(review): ds_prev is dereferenced without a NULL check,
+	 * unlike ds above -- confirm dsl_dataset_open_obj() cannot fail
+	 * for an existing snapshot object.
+	 */
+	ds_prev = dsl_dataset_open_obj(dd->dd_pool,
+	    ds->ds_phys->ds_prev_snap_obj, NULL,
+	    DS_MODE_STANDARD | DS_MODE_READONLY, FTAG);
+	if (ds_prev->ds_phys->ds_guid != drrb->drr_fromguid) {
+		err = ENODEV;
+		goto die;
+	}
+
+	/* must not have any changes since most recent snapshot */
+	if (ds->ds_phys->ds_bp.blk_birth >
+	    ds_prev->ds_phys->ds_creation_txg) {
+		err = ETXTBSY;
+		goto die;
+	}
+
+	/* new snapshot name must not exist */
+	snapname = strrchr(drrb->drr_toname, '@');
+	if (snapname == NULL) {
+		err = EEXIST;
+		goto die;
+	}
+	snapname++;
+	err = zap_lookup(dd->dd_pool->dp_meta_objset,
+	    ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &val);
+	if (err != ENOENT) {
+		if (err == 0)
+			err = EEXIST;
+		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+		dsl_dataset_close(ds_prev, DS_MODE_STANDARD, FTAG);
+		return (err);
+	}
+
+	dsl_dataset_close(ds_prev, DS_MODE_STANDARD, FTAG);
+
+	/* The point of no (unsuccessful) return. */
+
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	ds->ds_phys->ds_restoring = TRUE;
+
+	dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+	return (0);
+
+die:
+	if (ds_prev)
+		dsl_dataset_close(ds_prev, DS_MODE_STANDARD, FTAG);
+	if (ds)
+		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+	return (err);
+}
+
+/*
+ * Syncing-context task for the BEGIN record of a full stream: create
+ * the target filesystem named by drr_toname (minus the @snap suffix),
+ * create its objset, and mark it as restoring.  Returns 0 or the
+ * errno from dataset creation / name validation.
+ */
+static int
+replay_full_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+	struct drr_begin *drrb = arg;
+	int err;
+	char *fsfullname, *fslastname, *cp;
+	dsl_dataset_t *ds;
+
+	fsfullname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+	(void) strncpy(fsfullname, drrb->drr_toname, MAXNAMELEN);
+	/* strip the "@snapname" suffix to get the filesystem name */
+	cp = strchr(fsfullname, '@');
+	if (cp == NULL) {
+		kmem_free(fsfullname, MAXNAMELEN);
+		return (EINVAL);
+	}
+	*cp = '\0';
+	/* last path component is the new dataset's own name */
+	fslastname = strrchr(fsfullname, '/');
+	if (fslastname == NULL) {
+		kmem_free(fsfullname, MAXNAMELEN);
+		return (EINVAL);
+	}
+	fslastname++;
+
+	err = dsl_dataset_create_sync(dd, fsfullname, fslastname, NULL, tx);
+	if (err) {
+		kmem_free(fsfullname, MAXNAMELEN);
+		return (err);
+	}
+
+	/* the point of no (unsuccessful) return */
+
+	err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, fsfullname,
+	    DS_MODE_EXCLUSIVE, FTAG, &ds);
+	ASSERT3U(err, ==, 0);
+	kmem_free(fsfullname, MAXNAMELEN);
+
+	(void) dmu_objset_create_impl(dsl_dataset_get_spa(ds),
+	    ds, drrb->drr_type, tx);
+
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	ds->ds_phys->ds_restoring = TRUE;
+
+	dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+	return (0);
+}
+
+/*
+ * Syncing-context task for the END record: snapshot the restored
+ * dataset under the name from drr_toname, stamp the snapshot with the
+ * stream's creation time and guid, and clear the restoring flag on
+ * both the snapshot and the head dataset.
+ */
+static int
+replay_end_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+	struct drr_begin *drrb = arg;
+	int err;
+	char *snapname;
+	dsl_dataset_t *ds;
+
+	/* XXX verify that drr_toname is in dd */
+
+	snapname = strchr(drrb->drr_toname, '@');
+	if (snapname == NULL)
+		return (EINVAL);
+	snapname++;
+
+	/* create snapshot */
+	err = dsl_dataset_snapshot_sync(dd, snapname, tx);
+	if (err)
+		return (err);
+
+	/* set snapshot's creation time and guid */
+	err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, drrb->drr_toname,
+	    DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_RESTORE, FTAG, &ds);
+	ASSERT3U(err, ==, 0);
+
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	ds->ds_phys->ds_creation_time = drrb->drr_creation_time;
+	ds->ds_phys->ds_guid = drrb->drr_toguid;
+	ds->ds_phys->ds_restoring = FALSE;
+
+	dsl_dataset_close(ds, DS_MODE_PRIMARY, FTAG);
+
+	/* clear the restoring flag on the head dataset as well */
+	ds = dsl_dataset_open_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj,
+	    NULL, DS_MODE_STANDARD | DS_MODE_RESTORE, FTAG);
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	ds->ds_phys->ds_restoring = FALSE;
+	dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
+
+	return (0);
+}
+
+/*
+ * Return a pointer to the next `len' bytes (len must be a multiple of
+ * 8) of the stream, refilling the staging buffer from ra->vp as needed.
+ * The pointer is valid only until the next restore_read() call.  On
+ * error (including short read / EOF, reported as EINVAL) sets ra->err
+ * and returns NULL.
+ * NOTE(review): this is the only restore helper that isn't static --
+ * confirm whether external linkage is intended.
+ */
+void *
+restore_read(struct restorearg *ra, int len)
+{
+	void *rv;
+
+	/* some things will require 8-byte alignment, so everything must */
+	ASSERT3U(len % 8, ==, 0);
+
+	while (ra->buflen - ra->bufoff < len) {
+		ssize_t resid;
+		int leftover = ra->buflen - ra->bufoff;
+
+		/* slide the unconsumed tail to the front, then refill */
+		(void) memmove(ra->buf, ra->buf + ra->bufoff, leftover);
+		ra->err = vn_rdwr(UIO_READ, ra->vp,
+		    (caddr_t)ra->buf + leftover, ra->bufsize - leftover,
+		    ra->voff, UIO_SYSSPACE, FAPPEND,
+		    RLIM_INFINITY, CRED(), &resid);
+
+		/* Need to compute checksum */
+
+		ra->voff += ra->bufsize - leftover - resid;
+		ra->buflen = ra->bufsize - resid;
+		ra->bufoff = 0;
+		/* nothing read at all: the stream ended prematurely */
+		if (resid == ra->bufsize - leftover)
+			ra->err = EINVAL;
+		if (ra->err)
+			return (NULL);
+	}
+
+	ASSERT3U(ra->bufoff % 8, ==, 0);
+	ASSERT3U(ra->buflen - ra->bufoff, >=, len);
+	rv = ra->buf + ra->bufoff;
+	ra->bufoff += len;
+	return (rv);
+}
+
+/*
+ * Byteswap one replay record in place (stream written on an
+ * opposite-endian machine).  Only the fields of the variant selected
+ * by drr_type are swapped; any payload following the record is
+ * byteswapped separately by the restore_* handlers.
+ *
+ * Fix: add a default case so unknown record types are explicitly left
+ * untouched (they are rejected later by dmu_recvbackup's switch).
+ */
+static void
+backup_byteswap(dmu_replay_record_t *drr)
+{
+#define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
+#define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
+	drr->drr_type = BSWAP_32(drr->drr_type);
+	switch (drr->drr_type) {
+	case DRR_BEGIN:
+		DO64(drr_begin.drr_magic);
+		DO64(drr_begin.drr_version);
+		DO64(drr_begin.drr_creation_time);
+		DO32(drr_begin.drr_type);
+		DO64(drr_begin.drr_toguid);
+		DO64(drr_begin.drr_fromguid);
+		break;
+	case DRR_OBJECT:
+		DO64(drr_object.drr_object);
+		/* DO64(drr_object.drr_allocation_txg); */
+		DO32(drr_object.drr_type);
+		DO32(drr_object.drr_bonustype);
+		DO32(drr_object.drr_blksz);
+		DO32(drr_object.drr_bonuslen);
+		break;
+	case DRR_FREEOBJECTS:
+		DO64(drr_freeobjects.drr_firstobj);
+		DO64(drr_freeobjects.drr_numobjs);
+		break;
+	case DRR_WRITE:
+		DO64(drr_write.drr_object);
+		DO32(drr_write.drr_type);
+		DO64(drr_write.drr_offset);
+		DO64(drr_write.drr_length);
+		break;
+	case DRR_FREE:
+		DO64(drr_free.drr_object);
+		DO64(drr_free.drr_offset);
+		DO64(drr_free.drr_length);
+		break;
+	case DRR_END:
+		DO64(drr_end.drr_checksum);
+		break;
+	default:
+		break;
+	}
+#undef DO64
+#undef DO32
+}
+
+/*
+ * Apply a DRR_OBJECT record: validate the fields, then claim (if the
+ * object is currently free) or reclaim (if allocated) the object with
+ * the recorded type/blocksize/bonus settings, set its checksum and
+ * compression, and copy in the bonus payload from the stream.
+ * Returns 0, an errno from tx assignment / stream read, or EINVAL for
+ * malformed records.
+ */
+static int
+restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
+{
+	int err;
+	dmu_tx_t *tx;
+
+	err = dmu_object_info(os, drro->drr_object, NULL);
+
+	if (err != 0 && err != ENOENT)
+		return (EINVAL);
+
+	/* sanity-check every field before trusting the record */
+	if (drro->drr_type == DMU_OT_NONE ||
+	    drro->drr_type >= DMU_OT_NUMTYPES ||
+	    drro->drr_bonustype >= DMU_OT_NUMTYPES ||
+	    drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS ||
+	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
+	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
+	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
+	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
+	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
+		return (EINVAL);
+	}
+
+	tx = dmu_tx_create(os);
+
+	if (err == ENOENT) {
+		/* currently free, want to be allocated */
+		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1);
+		err = dmu_tx_assign(tx, TXG_WAIT);
+		if (err) {
+			dmu_tx_abort(tx);
+			return (err);
+		}
+		err = dmu_object_claim(os, drro->drr_object,
+		    drro->drr_type, drro->drr_blksz,
+		    drro->drr_bonustype, drro->drr_bonuslen, tx);
+	} else {
+		/* currently allocated, want to be allocated */
+		dmu_tx_hold_bonus(tx, drro->drr_object);
+		/*
+		 * We may change blocksize, so need to
+		 * hold_write
+		 */
+		dmu_tx_hold_write(tx, drro->drr_object, 0, 1);
+		err = dmu_tx_assign(tx, TXG_WAIT);
+		if (err) {
+			dmu_tx_abort(tx);
+			return (err);
+		}
+
+		err = dmu_object_reclaim(os, drro->drr_object,
+		    drro->drr_type, drro->drr_blksz,
+		    drro->drr_bonustype, drro->drr_bonuslen, tx);
+	}
+	if (err) {
+		dmu_tx_commit(tx);
+		return (EINVAL);
+	}
+
+	dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx);
+	dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
+
+	if (drro->drr_bonuslen) {
+		dmu_buf_t *db;
+		void *data;
+		db = dmu_bonus_hold(os, drro->drr_object);
+		dmu_buf_will_dirty(db, tx);
+
+		/* bonus payload was padded to 8 bytes by dump_dnode() */
+		ASSERT3U(db->db_size, ==, drro->drr_bonuslen);
+		data = restore_read(ra, P2ROUNDUP(db->db_size, 8));
+		if (data == NULL) {
+			dmu_tx_commit(tx);
+			return (ra->err);
+		}
+		bcopy(data, db->db_data, db->db_size);
+		if (ra->byteswap) {
+			dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
+			    drro->drr_bonuslen);
+		}
+		dmu_buf_rele(db);
+	}
+	dmu_tx_commit(tx);
+	return (0);
+}
+
+/*
+ * Apply a DRR_FREEOBJECTS record: free each existing object in
+ * [drr_firstobj, drr_firstobj + drr_numobjs), one tx per object.
+ * Rejects ranges that wrap around.  `ra' is unused (ARGSUSED).
+ */
+/* ARGSUSED */
+static int
+restore_freeobjects(struct restorearg *ra, objset_t *os,
+    struct drr_freeobjects *drrfo)
+{
+	uint64_t obj;
+
+	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
+		return (EINVAL);
+
+	for (obj = drrfo->drr_firstobj;
+	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs; obj++) {
+		dmu_tx_t *tx;
+		int err;
+
+		/* skip objects that don't exist */
+		if (dmu_object_info(os, obj, NULL) != 0)
+			continue;
+
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_bonus(tx, obj);
+		err = dmu_tx_assign(tx, TXG_WAIT);
+		if (err) {
+			dmu_tx_abort(tx);
+			return (err);
+		}
+		err = dmu_object_free(os, obj, tx);
+		dmu_tx_commit(tx);
+		if (err && err != ENOENT)
+			return (EINVAL);
+	}
+	return (0);
+}
+
+/*
+ * Apply a DRR_WRITE record: read the payload from the stream,
+ * byteswap it if needed, and write it to the recorded object range in
+ * its own transaction.  The target object must already exist (created
+ * by an earlier DRR_OBJECT record).
+ */
+static int
+restore_write(struct restorearg *ra, objset_t *os,
+    struct drr_write *drrw)
+{
+	dmu_tx_t *tx;
+	void *data;
+	int err;
+
+	/* reject wrapped ranges and unknown payload types */
+	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
+	    drrw->drr_type >= DMU_OT_NUMTYPES)
+		return (EINVAL);
+
+	data = restore_read(ra, drrw->drr_length);
+	if (data == NULL)
+		return (ra->err);
+
+	if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
+		return (EINVAL);
+
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_write(tx, drrw->drr_object,
+	    drrw->drr_offset, drrw->drr_length);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err) {
+		dmu_tx_abort(tx);
+		return (err);
+	}
+	if (ra->byteswap)
+		dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length);
+	dmu_write(os, drrw->drr_object,
+	    drrw->drr_offset, drrw->drr_length, data, tx);
+	dmu_tx_commit(tx);
+	return (0);
+}
+
+/*
+ * Apply a DRR_FREE record: free the recorded range of an existing
+ * object in its own transaction.  drr_length == -1ULL means "to the
+ * end of the object".  `ra' is unused (ARGSUSED).
+ */
+/* ARGSUSED */
+static int
+restore_free(struct restorearg *ra, objset_t *os,
+    struct drr_free *drrf)
+{
+	dmu_tx_t *tx;
+	int err;
+
+	/* reject wrapped ranges (except the -1ULL "to end" sentinel) */
+	if (drrf->drr_length != -1ULL &&
+	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
+		return (EINVAL);
+
+	if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
+		return (EINVAL);
+
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_free(tx, drrf->drr_object,
+	    drrf->drr_offset, drrf->drr_length);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err) {
+		dmu_tx_abort(tx);
+		return (err);
+	}
+	dmu_free_range(os, drrf->drr_object,
+	    drrf->drr_offset, drrf->drr_length, tx);
+	dmu_tx_commit(tx);
+	return (0);
+}
+
+/*
+ * Restore a backup stream ("zfs recv"): read dmu_replay_record_t's from
+ * vp starting at voffset and apply them to the dataset named by
+ * drrb->drr_toname.  The begin record (drrb) has already been read by
+ * the caller.  If we fail after the begin record was processed, the
+ * partially restored dataset is rolled back (incremental stream) or
+ * destroyed (full stream).  If sizep is non-NULL it is set to the
+ * stream offset consumed.  Returns 0 or an errno value.
+ */
+int
+dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
+ vnode_t *vp, uint64_t voffset)
+{
+ struct restorearg ra;
+ dmu_replay_record_t *drr;
+ char *cp, *tosnap;
+ dsl_dir_t *dd = NULL;
+ objset_t *os = NULL;
+
+ bzero(&ra, sizeof (ra));
+ ra.vp = vp;
+ ra.voff = voffset;
+ ra.bufsize = 1<<20;
+ ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
+
+ /* Determine the stream's byte order from the magic number. */
+ if (drrb->drr_magic == DMU_BACKUP_MAGIC) {
+ ra.byteswap = FALSE;
+ } else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
+ ra.byteswap = TRUE;
+ } else {
+ ra.err = EINVAL;
+ goto out;
+ }
+
+ /* Byteswap the begin record in place so we can use it natively. */
+ if (ra.byteswap) {
+ drrb->drr_magic = BSWAP_64(drrb->drr_magic);
+ drrb->drr_version = BSWAP_64(drrb->drr_version);
+ drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
+ drrb->drr_type = BSWAP_32(drrb->drr_type);
+ drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
+ drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
+ }
+
+ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+
+ /* The target name must be a snapshot ("fs@snap"). */
+ tosnap = drrb->drr_toname;
+ if (drrb->drr_version != DMU_BACKUP_VERSION ||
+ drrb->drr_type >= DMU_OST_NUMTYPES ||
+ strchr(drrb->drr_toname, '@') == NULL) {
+ ra.err = EINVAL;
+ goto out;
+ }
+
+ /*
+ * Process the begin in syncing context.
+ */
+ if (drrb->drr_fromguid) {
+ /* incremental backup */
+
+ /* Temporarily terminate the name at the '@' to open the fs. */
+ cp = strchr(tosnap, '@');
+ *cp = '\0';
+ dd = dsl_dir_open(tosnap, FTAG, NULL);
+ *cp = '@';
+ if (dd == NULL) {
+ ra.err = ENOENT;
+ goto out;
+ }
+
+ ra.err = dsl_dir_sync_task(dd, replay_incremental_sync,
+ drrb, 1<<20);
+ } else {
+ /* full backup */
+ const char *tail;
+
+ cp = strchr(tosnap, '@');
+ *cp = '\0';
+ dd = dsl_dir_open(tosnap, FTAG, &tail);
+ *cp = '@';
+ if (dd == NULL) {
+ ra.err = ENOENT;
+ goto out;
+ }
+ /* tail == NULL means the fs itself already exists. */
+ if (tail == NULL) {
+ ra.err = EEXIST;
+ goto out;
+ }
+
+ ra.err = dsl_dir_sync_task(dd, replay_full_sync,
+ drrb, 1<<20);
+ }
+ if (ra.err)
+ goto out;
+
+ /*
+ * Open the objset we are modifying.
+ */
+
+ cp = strchr(tosnap, '@');
+ *cp = '\0';
+ ra.err = dmu_objset_open(tosnap, DMU_OST_ANY,
+ DS_MODE_PRIMARY | DS_MODE_RESTORE, &os);
+ *cp = '@';
+ /* Open should not fail: the sync task above just set the fs up. */
+ ASSERT3U(ra.err, ==, 0);
+
+ /*
+ * Read records and process them.
+ */
+ while (ra.err == 0 &&
+ NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
+ if (issig(JUSTLOOKING)) {
+ ra.err = EINTR;
+ goto out;
+ }
+
+ if (ra.byteswap)
+ backup_byteswap(drr);
+
+ switch (drr->drr_type) {
+ case DRR_OBJECT:
+ {
+ /*
+ * We need to make a copy of the record header,
+ * because restore_{object,write} may need to
+ * restore_read(), which will invalidate drr.
+ */
+ struct drr_object drro = drr->drr_u.drr_object;
+ ra.err = restore_object(&ra, os, &drro);
+ break;
+ }
+ case DRR_FREEOBJECTS:
+ {
+ struct drr_freeobjects drrfo =
+ drr->drr_u.drr_freeobjects;
+ ra.err = restore_freeobjects(&ra, os, &drrfo);
+ break;
+ }
+ case DRR_WRITE:
+ {
+ struct drr_write drrw = drr->drr_u.drr_write;
+ ra.err = restore_write(&ra, os, &drrw);
+ break;
+ }
+ case DRR_FREE:
+ {
+ struct drr_free drrf = drr->drr_u.drr_free;
+ ra.err = restore_free(&ra, os, &drrf);
+ break;
+ }
+ case DRR_END:
+ /* Need to verify checksum. */
+ /*
+ * dd may be the parent of the dd we are
+ * restoring into (eg. if it's a full backup).
+ */
+ ra.err = dsl_dir_sync_task(dmu_objset_ds(os)->
+ ds_dir, replay_end_sync, drrb, 1<<20);
+ goto out;
+ default:
+ ra.err = EINVAL;
+ goto out;
+ }
+ }
+
+out:
+ if (os)
+ dmu_objset_close(os);
+
+ /*
+ * Make sure we don't rollback/destroy unless we actually
+ * processed the begin properly. 'os' will only be set if this
+ * is the case.
+ */
+ if (ra.err && os && dd && tosnap && strchr(tosnap, '@')) {
+ /*
+ * rollback or destroy what we created, so we don't
+ * leave it in the restoring state.
+ */
+ txg_wait_synced(dd->dd_pool, 0);
+ if (drrb->drr_fromguid) {
+ /* incremental: rollback to most recent snapshot */
+ (void) dsl_dir_sync_task(dd,
+ dsl_dataset_rollback_sync, NULL, 0);
+ } else {
+ /* full: destroy whole fs */
+ cp = strchr(tosnap, '@');
+ *cp = '\0';
+ cp = strchr(tosnap, '/');
+ if (cp) {
+ (void) dsl_dir_sync_task(dd,
+ dsl_dir_destroy_sync, cp+1, 0);
+ }
+ /* Restore the '@' we NUL'd out above. */
+ cp = strchr(tosnap, '\0');
+ *cp = '@';
+ }
+
+ }
+
+ if (dd)
+ dsl_dir_close(dd, FTAG);
+ kmem_free(ra.buf, ra.bufsize);
+ if (sizep)
+ *sizep = ra.voff;
+ return (ra.err);
+}
+
+/*
+ * Intent log support: sync the block at <os, object, offset> to disk.
+ * N.B. and XXX: the caller is responsible for serializing dmu_sync()s
+ * of the same block, and for making sure that the data isn't changing
+ * while dmu_sync() is writing it.
+ *
+ * Return values:
+ *
+ * EALREADY: this txg has already been synced, so there's nothing to do.
+ * The caller should not log the write.
+ *
+ * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
+ * The caller should not log the write.
+ *
+ * EINPROGRESS: the block is in the process of being synced by the
+ * usual mechanism (spa_sync()), so we can't sync it here.
+ * The caller should txg_wait_synced() and not log the write.
+ *
+ * EBUSY: another thread is trying to dmu_sync() the same dbuf.
+ * (This case cannot arise under the current locking rules.)
+ * The caller should txg_wait_synced() and not log the write.
+ *
+ * ESTALE: the block was dirtied or freed while we were writing it,
+ * so the data is no longer valid.
+ * The caller should txg_wait_synced() and not log the write.
+ *
+ * 0: success. Sets *bp to the blkptr just written, and sets
+ * *blkoff to the data's offset within that block.
+ * The caller should log this blkptr/blkoff in its lr_write_t.
+ */
+int
+dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
+ blkptr_t *bp, uint64_t txg)
+{
+ dsl_pool_t *dp = os->os->os_dsl_dataset->ds_dir->dd_pool;
+ tx_state_t *tx = &dp->dp_tx;
+ dmu_buf_impl_t *db;
+ blkptr_t *blk;
+ int err;
+
+ ASSERT(RW_LOCK_HELD(&tx->tx_suspend));
+ ASSERT(BP_IS_HOLE(bp));
+ ASSERT(txg != 0);
+
+ dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n",
+ txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);
+
+ /*
+ * If this txg already synced, there's nothing to do.
+ */
+ if (txg <= tx->tx_synced_txg) {
+ /*
+ * If we're running ziltest, we need the blkptr regardless.
+ */
+ if (txg > spa_freeze_txg(dp->dp_spa)) {
+ db = (dmu_buf_impl_t *)dmu_buf_hold(os, object, offset);
+ /* if db_blkptr == NULL, this was an empty write */
+ if (db->db_blkptr)
+ *bp = *db->db_blkptr; /* structure assignment */
+ else
+ bzero(bp, sizeof (blkptr_t));
+ *blkoff = offset - db->db.db_offset;
+ ASSERT3U(*blkoff, <, db->db.db_size);
+ dmu_buf_rele((dmu_buf_t *)db);
+ return (0);
+ }
+ return (EALREADY);
+ }
+
+ /*
+ * If this txg is in the middle of syncing, just wait for it.
+ */
+ if (txg == tx->tx_syncing_txg) {
+ ASSERT(txg != tx->tx_open_txg);
+ return (EINPROGRESS);
+ }
+
+ db = (dmu_buf_impl_t *)dmu_buf_hold(os, object, offset);
+
+ mutex_enter(&db->db_mtx);
+
+ /*
+ * If this dbuf isn't dirty, must have been free_range'd.
+ * There's no need to log writes to freed blocks, so we're done.
+ */
+ if (!list_link_active(&db->db_dirty_node[txg&TXG_MASK])) {
+ mutex_exit(&db->db_mtx);
+ dmu_buf_rele((dmu_buf_t *)db);
+ return (ENOENT);
+ }
+
+ blk = db->db_d.db_overridden_by[txg&TXG_MASK];
+
+ /*
+ * If we already did a dmu_sync() of this dbuf in this txg,
+ * free the old block before writing the new one.
+ */
+ if (blk != NULL) {
+ /*
+ * NOTE(review): the ASSERT below makes the EBUSY branch
+ * unreachable in debug builds; it appears to be a
+ * non-debug safety net for the "cannot arise" case
+ * documented above -- confirm that intent.
+ */
+ ASSERT(blk != IN_DMU_SYNC);
+ if (blk == IN_DMU_SYNC) {
+ mutex_exit(&db->db_mtx);
+ dmu_buf_rele((dmu_buf_t *)db);
+ return (EBUSY);
+ }
+ arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
+ if (!BP_IS_HOLE(blk)) {
+ (void) arc_free(NULL, os->os->os_spa, txg, blk,
+ NULL, NULL, ARC_WAIT);
+ }
+ kmem_free(blk, sizeof (blkptr_t));
+ }
+
+ /* Claim this dbuf for dmu_sync() before dropping the mutex. */
+ db->db_d.db_overridden_by[txg&TXG_MASK] = IN_DMU_SYNC;
+ mutex_exit(&db->db_mtx);
+
+ blk = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
+ blk->blk_birth = 0; /* mark as invalid */
+
+ /* Write the dirty data synchronously via the ARC. */
+ err = arc_write(NULL, os->os->os_spa,
+ zio_checksum_select(db->db_dnode->dn_checksum, os->os->os_checksum),
+ zio_compress_select(db->db_dnode->dn_compress, os->os->os_compress),
+ txg, blk, db->db_d.db_data_old[txg&TXG_MASK], NULL, NULL,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT);
+ ASSERT(err == 0);
+
+ if (!BP_IS_HOLE(blk)) {
+ blk->blk_fill = 1;
+ BP_SET_TYPE(blk, db->db_dnode->dn_type);
+ BP_SET_LEVEL(blk, 0);
+ }
+
+ /* copy the block pointer back to caller */
+ *bp = *blk; /* structure assignment */
+ *blkoff = offset - db->db.db_offset;
+ ASSERT3U(*blkoff, <, db->db.db_size);
+
+ mutex_enter(&db->db_mtx);
+ if (db->db_d.db_overridden_by[txg&TXG_MASK] != IN_DMU_SYNC) {
+ /* we were dirtied/freed during the sync */
+ ASSERT3P(db->db_d.db_overridden_by[txg&TXG_MASK], ==, NULL);
+ arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
+ mutex_exit(&db->db_mtx);
+ dmu_buf_rele((dmu_buf_t *)db);
+ /* Note that this block does not free on disk until txg syncs */
+
+ /*
+ * XXX can we use ARC_NOWAIT here?
+ * XXX should we be ignoring the return code?
+ */
+ if (!BP_IS_HOLE(blk)) {
+ (void) arc_free(NULL, os->os->os_spa, txg, blk,
+ NULL, NULL, ARC_WAIT);
+ }
+ kmem_free(blk, sizeof (blkptr_t));
+ return (ESTALE);
+ }
+
+ /* Record the override so spa_sync() knows this block is covered. */
+ db->db_d.db_overridden_by[txg&TXG_MASK] = blk;
+ mutex_exit(&db->db_mtx);
+ dmu_buf_rele((dmu_buf_t *)db);
+ ASSERT3U(txg, >, tx->tx_syncing_txg);
+ return (0);
+}
+
+/*
+ * Return the object's highest non-zero byte offset, as reported by
+ * dnode_max_nonzero_offset().
+ * NOTE(review): unlike dmu_object_info(), the dnode_hold() result is
+ * not checked for NULL here -- confirm callers only pass allocated
+ * objects.
+ */
+uint64_t
+dmu_object_max_nonzero_offset(objset_t *os, uint64_t object)
+{
+ dnode_t *dn = dnode_hold(os->os, object, FTAG);
+ uint64_t rv = dnode_max_nonzero_offset(dn);
+ dnode_rele(dn, FTAG);
+ return (rv);
+}
+
+/*
+ * Set the object's data block size and indirect block shift.
+ * Returns the result of dnode_set_blksz().
+ */
+int
+dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn = dnode_hold(os->os, object, FTAG);
+ int err = dnode_set_blksz(dn, size, ibs, tx);
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+/*
+ * Set the object's checksum function and dirty the dnode in tx.
+ */
+void
+dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn = dnode_hold(os->os, object, FTAG);
+ ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
+ dn->dn_checksum = checksum;
+ dnode_setdirty(dn, tx);
+ dnode_rele(dn, FTAG);
+}
+
+/*
+ * Set the object's compression function and dirty the dnode in tx.
+ */
+void
+dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn = dnode_hold(os->os, object, FTAG);
+ ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
+ dn->dn_compress = compress;
+ dnode_setdirty(dn, tx);
+ dnode_rele(dn, FTAG);
+}
+
+/*
+ * Find the next hole (hole == B_TRUE) or data region starting at *off
+ * in the given object, via dnode_next_offset().  If the dnode has any
+ * dirty state, we first wait for the pool to sync so the on-disk block
+ * pointers we are about to walk are current.
+ */
+int
+dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
+{
+ dnode_t *dn;
+ int i, err;
+
+ dn = dnode_hold(os->os, object, FTAG);
+ /*
+ * Sync any current changes before
+ * we go trundling through the block pointers.
+ */
+ for (i = 0; i < TXG_SIZE; i++) {
+ if (dn->dn_dirtyblksz[i])
+ break;
+ }
+ if (i != TXG_SIZE) {
+ /* Drop the hold across txg_wait_synced(), then re-acquire. */
+ dnode_rele(dn, FTAG);
+ txg_wait_synced(dmu_objset_pool(os), 0);
+ dn = dnode_hold(os->os, object, FTAG);
+ }
+
+ err = dnode_next_offset(dn, hole, off, 1, 1);
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
+
+/*
+ * Fill in *doi with the dnode's vital statistics.  Both
+ * dn_struct_rwlock (reader) and dn_mtx are taken so we get a
+ * consistent snapshot.
+ */
+void
+dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
+{
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ mutex_enter(&dn->dn_mtx);
+
+ doi->doi_data_block_size = dn->dn_datablksz;
+ /* 0 indblkshift means no indirect blocks yet. */
+ doi->doi_metadata_block_size = dn->dn_indblkshift ?
+ 1ULL << dn->dn_indblkshift : 0;
+ doi->doi_indirection = dn->dn_nlevels;
+ doi->doi_checksum = dn->dn_checksum;
+ doi->doi_compress = dn->dn_compress;
+ doi->doi_physical_blks = dn->dn_phys->dn_secphys;
+ doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid;
+ doi->doi_type = dn->dn_type;
+ doi->doi_bonus_size = dn->dn_bonuslen;
+ doi->doi_bonus_type = dn->dn_bonustype;
+
+ mutex_exit(&dn->dn_mtx);
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+/*
+ * Get information on a DMU object.
+ * If doi is NULL, just indicates whether the object exists.
+ * Returns ENOENT if the object's dnode cannot be held.
+ */
+int
+dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
+{
+ dnode_t *dn = dnode_hold(os->os, object, FTAG);
+
+ if (dn == NULL)
+ return (ENOENT);
+
+ if (doi != NULL)
+ dmu_object_info_from_dnode(dn, doi);
+
+ dnode_rele(dn, FTAG);
+ return (0);
+}
+
+/*
+ * As above, but faster; can be used when you have a held dbuf in hand.
+ */
+void
+dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi)
+{
+ dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi);
+}
+
+/*
+ * Faster still when you only care about the size.
+ * This is specifically optimized for zfs_getattr().
+ */
+void
+dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512)
+{
+ dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+
+ *blksize = dn->dn_datablksz;
+ *nblk512 = dn->dn_phys->dn_secphys + 1; /* add 1 for dnode space */
+}
+
+/*
+ * In-place byteswap routines for arrays of fixed-width integers.
+ * 'size' is in bytes and must be a multiple of the element width
+ * (asserted below).
+ *
+ * Fix: the loop indices were 'int', which is compared against the
+ * size_t element count (signed/unsigned comparison) and would wrap
+ * for buffers of 2^31 or more elements; use size_t throughout.
+ */
+void
+byteswap_uint64_array(void *vbuf, size_t size)
+{
+ uint64_t *buf = vbuf;
+ size_t count = size >> 3;
+ size_t i;
+
+ ASSERT((size & 7) == 0);
+
+ for (i = 0; i < count; i++)
+ buf[i] = BSWAP_64(buf[i]);
+}
+
+void
+byteswap_uint32_array(void *vbuf, size_t size)
+{
+ uint32_t *buf = vbuf;
+ size_t count = size >> 2;
+ size_t i;
+
+ ASSERT((size & 3) == 0);
+
+ for (i = 0; i < count; i++)
+ buf[i] = BSWAP_32(buf[i]);
+}
+
+void
+byteswap_uint16_array(void *vbuf, size_t size)
+{
+ uint16_t *buf = vbuf;
+ size_t count = size >> 1;
+ size_t i;
+
+ ASSERT((size & 1) == 0);
+
+ for (i = 0; i < count; i++)
+ buf[i] = BSWAP_16(buf[i]);
+}
+
+/* ARGSUSED */
+void
+byteswap_uint8_array(void *vbuf, size_t size)
+{
+ /* Single bytes have no byte order; nothing to do. */
+}
+
+/*
+ * Module initialization: bring up the DMU subsystems.
+ * Note the ordering: dbuf, dnode, then arc.
+ */
+void
+dmu_init(void)
+{
+ dbuf_init();
+ dnode_init();
+ arc_init();
+}
+
+/*
+ * Module teardown: exact reverse order of dmu_init().
+ */
+void
+dmu_fini(void)
+{
+ arc_fini();
+ dnode_fini();
+ dbuf_fini();
+}
diff --git a/usr/src/uts/common/fs/zfs/dmu_object.c b/usr/src/uts/common/fs/zfs/dmu_object.c
new file mode 100644
index 0000000000..d150d6c400
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dmu_object.c
@@ -0,0 +1,149 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dnode.h>
+
+/*
+ * Allocate a new object number in os and initialize its dnode with the
+ * given type, block size, bonus type and bonus length.  The search for
+ * a free object number starts at osi->os_obj_next, serialized by
+ * osi->os_obj_lock.  Returns the newly allocated object number.
+ */
+uint64_t
+dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ objset_impl_t *osi = os->os;
+ uint64_t object;
+ /* Number of dnodes covered by one L2 indirect block pointer. */
+ uint64_t L2_dnode_count = DNODES_PER_BLOCK <<
+ (osi->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT);
+ dnode_t *dn;
+ int restarted = B_FALSE;
+
+ mutex_enter(&osi->os_obj_lock);
+ for (;;) {
+ object = osi->os_obj_next;
+ /*
+ * Each time we polish off an L2 bp worth of dnodes
+ * (2^13 objects), move to another L2 bp that's still
+ * reasonably sparse (at most 1/4 full). Look from the
+ * beginning once, but after that keep looking from here.
+ * If we can't find one, just keep going from here.
+ */
+ if (P2PHASE(object, L2_dnode_count) == 0) {
+ uint64_t offset = restarted ? object << DNODE_SHIFT : 0;
+ int error = dnode_next_offset(osi->os_meta_dnode,
+ B_TRUE, &offset, 2, DNODES_PER_BLOCK >> 2);
+ restarted = B_TRUE;
+ if (error == 0)
+ object = offset >> DNODE_SHIFT;
+ }
+ osi->os_obj_next = ++object;
+
+ /* A non-NULL hold means the object number is free; claim it. */
+ dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG);
+ if (dn)
+ break;
+
+ /* Object was in use; skip ahead to the next hole. */
+ if (dmu_object_next(os, &object, B_TRUE) == 0)
+ osi->os_obj_next = object - 1;
+ }
+
+ dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
+ dnode_rele(dn, FTAG);
+
+ mutex_exit(&osi->os_obj_lock);
+
+ dmu_tx_add_new_object(tx, os, object);
+ return (object);
+}
+
+/*
+ * Allocate a specific (caller-chosen) object number.  Returns EBADF if
+ * the private-object bit is set and the tx isn't allowed to use it,
+ * EEXIST if the object number is already in use.
+ */
+int
+dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ if ((object & DMU_PRIVATE_OBJECT) && !dmu_tx_private_ok(tx))
+ return (EBADF);
+
+ dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG);
+ if (dn == NULL)
+ return (EEXIST);
+ dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
+ dnode_rele(dn, FTAG);
+
+ dmu_tx_add_new_object(tx, os, object);
+ return (0);
+}
+
+/*
+ * Reallocate an existing object with a new type/blocksize/bonus.
+ * Returns EBADF if the object is not allocated (or is a private object
+ * the tx may not touch).
+ */
+int
+dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ if ((object & DMU_PRIVATE_OBJECT) && !dmu_tx_private_ok(tx))
+ return (EBADF);
+
+ dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, FTAG);
+ if (dn == NULL)
+ return (EBADF);
+ dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx);
+ dnode_rele(dn, FTAG);
+
+ return (0);
+}
+
+/*
+ * Free an allocated object.  Returns ENOENT if the object is not
+ * currently allocated.
+ */
+int
+dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ ASSERT(!(object & DMU_PRIVATE_OBJECT) || dmu_tx_private_ok(tx));
+
+ dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, FTAG);
+ if (dn == NULL)
+ return (ENOENT);
+
+ ASSERT(dn->dn_type != DMU_OT_NONE);
+ dnode_free(dn, tx);
+ dnode_rele(dn, FTAG);
+
+ return (0);
+}
+
+/*
+ * Advance *objectp to the next allocated (hole == B_FALSE) or free
+ * (hole == B_TRUE) object after it, by scanning the meta-dnode.
+ * Returns the dnode_next_offset() error, if any.
+ */
+int
+dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole)
+{
+ uint64_t offset = (*objectp + 1) << DNODE_SHIFT;
+ int error;
+
+ error = dnode_next_offset(os->os->os_meta_dnode,
+ hole, &offset, 0, DNODES_PER_BLOCK);
+
+ *objectp = offset >> DNODE_SHIFT;
+
+ return (error);
+}
diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c
new file mode 100644
index 0000000000..9bb621b9a1
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c
@@ -0,0 +1,727 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_pool.h>
+#include <sys/dnode.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_tx.h>
+#include <sys/zio_checksum.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/dmu_impl.h>
+
+
+/*
+ * Trivial accessors into the objset implementation (os->os).
+ */
+spa_t *
+dmu_objset_spa(objset_t *os)
+{
+ return (os->os->os_spa);
+}
+
+zilog_t *
+dmu_objset_zil(objset_t *os)
+{
+ return (os->os->os_zil);
+}
+
+/*
+ * Return the pool via the dataset's dir when available, otherwise
+ * (e.g. the meta-objset) fall back to the spa's DSL pool.
+ */
+dsl_pool_t *
+dmu_objset_pool(objset_t *os)
+{
+ dsl_dataset_t *ds;
+
+ if ((ds = os->os->os_dsl_dataset) != NULL && ds->ds_dir)
+ return (ds->ds_dir->dd_pool);
+ else
+ return (spa_get_dsl(os->os->os_spa));
+}
+
+dsl_dataset_t *
+dmu_objset_ds(objset_t *os)
+{
+ return (os->os->os_dsl_dataset);
+}
+
+dmu_objset_type_t
+dmu_objset_type(objset_t *os)
+{
+ return (os->os->os_phys->os_type);
+}
+
+void
+dmu_objset_name(objset_t *os, char *buf)
+{
+ dsl_dataset_name(os->os->os_dsl_dataset, buf);
+}
+
+/*
+ * Returns the dataset object number, or 0 if this objset has no
+ * dataset (e.g. the meta-objset).
+ */
+uint64_t
+dmu_objset_id(objset_t *os)
+{
+ dsl_dataset_t *ds = os->os->os_dsl_dataset;
+
+ return (ds ? ds->ds_object : 0);
+}
+
+/*
+ * dsl_prop callback: the "checksum" property changed; update the
+ * objset's effective checksum function.
+ */
+static void
+checksum_changed_cb(void *arg, uint64_t newval)
+{
+ objset_impl_t *osi = arg;
+
+ /*
+ * Inheritance should have been done by now.
+ */
+ ASSERT(newval != ZIO_CHECKSUM_INHERIT);
+
+ osi->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
+}
+
+/*
+ * dsl_prop callback: the "compression" property changed; update the
+ * objset's effective compression function.
+ */
+static void
+compression_changed_cb(void *arg, uint64_t newval)
+{
+ objset_impl_t *osi = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval != ZIO_COMPRESS_INHERIT);
+
+ osi->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
+}
+
+/*
+ * Byteswap an objset_phys_t in place: the embedded meta-dnode, the
+ * ZIL header, and the type field.
+ */
+void
+dmu_objset_byteswap(void *buf, size_t size)
+{
+ objset_phys_t *osp = buf;
+
+ ASSERT(size == sizeof (objset_phys_t));
+ dnode_byteswap(&osp->os_meta_dnode);
+ byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
+ osp->os_type = BSWAP_64(osp->os_type);
+}
+
+/*
+ * Construct the in-core objset for dataset ds (ds == NULL means the
+ * meta-objset).  Reads the objset_phys from bp if it's not a hole,
+ * registers the checksum/compression property callbacks, and opens the
+ * meta-dnode.  If another thread raced us and installed its own
+ * objset_impl on the dataset first, ours is discarded and the winner
+ * is returned instead.
+ */
+objset_impl_t *
+dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp)
+{
+ objset_impl_t *winner, *osi;
+ int i, err, checksum;
+
+ osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP);
+ osi->os.os = osi;
+ osi->os_dsl_dataset = ds;
+ osi->os_spa = spa;
+ if (bp)
+ osi->os_rootbp = *bp;
+ osi->os_phys = zio_buf_alloc(sizeof (objset_phys_t));
+ if (!BP_IS_HOLE(&osi->os_rootbp)) {
+ dprintf_bp(&osi->os_rootbp, "reading %s", "");
+ (void) arc_read(NULL, spa, &osi->os_rootbp,
+ dmu_ot[DMU_OT_OBJSET].ot_byteswap,
+ arc_bcopy_func, osi->os_phys,
+ ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT);
+ } else {
+ /* Brand-new objset: start from a zeroed phys. */
+ bzero(osi->os_phys, sizeof (objset_phys_t));
+ }
+ osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header);
+
+ /*
+ * Note: the changed_cb will be called once before the register
+ * func returns, thus changing the checksum/compression from the
+ * default (fletcher2/off).
+ */
+ if (ds) {
+ err = dsl_prop_register(ds, "checksum",
+ checksum_changed_cb, osi);
+ ASSERT(err == 0);
+
+ err = dsl_prop_register(ds, "compression",
+ compression_changed_cb, osi);
+ ASSERT(err == 0);
+ } else {
+ /* It's the meta-objset. */
+ osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
+ osi->os_compress = ZIO_COMPRESS_LZJB;
+ }
+
+ /*
+ * Metadata always gets compressed and checksummed.
+ * If the data checksum is multi-bit correctable, and it's not
+ * a ZBT-style checksum, then it's suitable for metadata as well.
+ * Otherwise, the metadata checksum defaults to fletcher4.
+ */
+ checksum = osi->os_checksum;
+
+ if (zio_checksum_table[checksum].ci_correctable &&
+ !zio_checksum_table[checksum].ci_zbt)
+ osi->os_md_checksum = checksum;
+ else
+ osi->os_md_checksum = ZIO_CHECKSUM_FLETCHER_4;
+
+ osi->os_md_compress = ZIO_COMPRESS_LZJB;
+
+ /* Per-txg dirty and free dnode lists. */
+ for (i = 0; i < TXG_SIZE; i++) {
+ list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t),
+ offsetof(dnode_t, dn_dirty_link[i]));
+ list_create(&osi->os_free_dnodes[i], sizeof (dnode_t),
+ offsetof(dnode_t, dn_dirty_link[i]));
+ }
+ list_create(&osi->os_dnodes, sizeof (dnode_t),
+ offsetof(dnode_t, dn_link));
+ list_create(&osi->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_link));
+
+ osi->os_meta_dnode = dnode_special_open(osi,
+ &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
+
+ /* Install ourselves on the dataset; defer to any racing winner. */
+ if (ds != NULL) {
+ winner = dsl_dataset_set_user_ptr(ds, osi, dmu_objset_evict);
+ if (winner) {
+ dmu_objset_evict(ds, osi);
+ osi = winner;
+ }
+ }
+
+ return (osi);
+}
+
+/* called from zpl */
+/*
+ * Open the objset named 'name' with the given dataset mode.  If 'type'
+ * is not DMU_OST_ANY the objset must be of that type or EINVAL is
+ * returned.  On success *osp holds the new objset handle, which must
+ * be released with dmu_objset_close().
+ */
+int
+dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
+ objset_t **osp)
+{
+ dsl_dataset_t *ds;
+ int err;
+ objset_t *os;
+ objset_impl_t *osi;
+
+ os = kmem_alloc(sizeof (objset_t), KM_SLEEP);
+ err = dsl_dataset_open(name, mode, os, &ds);
+ if (err) {
+ kmem_free(os, sizeof (objset_t));
+ return (err);
+ }
+
+ /* Reuse the cached objset_impl if one is already on the dataset. */
+ osi = dsl_dataset_get_user_ptr(ds);
+ if (osi == NULL) {
+ blkptr_t bp;
+
+ dsl_dataset_get_blkptr(ds, &bp);
+ osi = dmu_objset_open_impl(dsl_dataset_get_spa(ds), ds, &bp);
+ }
+
+ os->os = osi;
+ os->os_mode = mode;
+
+ if (type != DMU_OST_ANY && type != os->os->os_phys->os_type) {
+ dmu_objset_close(os);
+ return (EINVAL);
+ }
+ *osp = os;
+ return (0);
+}
+
+/*
+ * Release an objset handle opened with dmu_objset_open().
+ */
+void
+dmu_objset_close(objset_t *os)
+{
+ dsl_dataset_close(os->os->os_dsl_dataset, os->os_mode, os);
+ kmem_free(os, sizeof (objset_t));
+}
+
+/*
+ * Tear down an objset_impl: unregister the property callbacks, close
+ * the meta-dnode, free the ZIL and phys buffer.  Installed as the
+ * dataset user-ptr evict callback; all dirty/free dnode lists must be
+ * empty by now.
+ */
+void
+dmu_objset_evict(dsl_dataset_t *ds, void *arg)
+{
+ objset_impl_t *osi = arg;
+ int err, i;
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT(list_head(&osi->os_dirty_dnodes[i]) == NULL);
+ ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL);
+ }
+
+ if (ds) {
+ err = dsl_prop_unregister(ds, "checksum",
+ checksum_changed_cb, osi);
+ ASSERT(err == 0);
+
+ err = dsl_prop_unregister(ds, "compression",
+ compression_changed_cb, osi);
+ ASSERT(err == 0);
+ }
+
+ /* Only the meta-dnode should remain, and it should be bufless. */
+ ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode);
+ ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode);
+ ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL);
+
+ dnode_special_close(osi->os_meta_dnode);
+ zil_free(osi->os_zil);
+
+ zio_buf_free(osi->os_phys, sizeof (objset_phys_t));
+ kmem_free(osi, sizeof (objset_impl_t));
+}
+
+/* called from dsl for meta-objset */
+/*
+ * Create a brand-new objset in syncing context: open an empty
+ * objset_impl, allocate its meta-dnode, and stamp the objset type.
+ */
+objset_impl_t *
+dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, dmu_objset_type_t type,
+ dmu_tx_t *tx)
+{
+ objset_impl_t *osi;
+ dnode_t *mdn;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ osi = dmu_objset_open_impl(spa, ds, NULL);
+ mdn = osi->os_meta_dnode;
+
+ dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
+ DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
+
+ /*
+ * We don't want to have to increase the meta-dnode's nlevels
+ * later, because then we could do it in quiescing context while
+ * we are also accessing it in open context.
+ *
+ * This precaution is not necessary for the MOS (ds == NULL),
+ * because the MOS is only updated in syncing context.
+ * This is most fortunate: the MOS is the only objset that
+ * needs to be synced multiple times as spa_sync() iterates
+ * to convergence, so minimizing its dn_nlevels matters.
+ */
+ if (ds != NULL)
+ mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
+ mdn->dn_nlevels = DN_META_DNODE_LEVELS;
+
+ ASSERT(type != DMU_OST_NONE);
+ ASSERT(type != DMU_OST_ANY);
+ ASSERT(type < DMU_OST_NUMTYPES);
+ osi->os_phys->os_type = type;
+
+ /*
+ * NOTE(review): the comment above allows ds == NULL (the MOS),
+ * yet dsl_dataset_dirty(ds, tx) is called unconditionally --
+ * confirm dsl_dataset_dirty() tolerates a NULL dataset.
+ */
+ dsl_dataset_dirty(ds, tx);
+
+ return (osi);
+}
+
+/* Arguments passed through dsl_dir_sync_task to dmu_objset_create_sync(). */
+struct oscarg {
+ void (*userfunc)(objset_t *os, void *arg, dmu_tx_t *tx);
+ void *userarg;
+ dsl_dataset_t *clone_parent;
+ const char *fullname;
+ const char *lastname;
+ dmu_objset_type_t type;
+};
+
+/*
+ * Syncing-context half of dmu_objset_create(): create the dataset,
+ * then, if it's empty (not a clone), create the objset inside it and
+ * run the caller's init function.
+ */
+static int
+dmu_objset_create_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+ struct oscarg *oa = arg;
+ dsl_dataset_t *ds;
+ int err;
+ blkptr_t bp;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ err = dsl_dataset_create_sync(dd, oa->fullname, oa->lastname,
+ oa->clone_parent, tx);
+ dprintf_dd(dd, "fn=%s ln=%s err=%d\n",
+ oa->fullname, oa->lastname, err);
+ if (err)
+ return (err);
+
+ err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, oa->fullname,
+ DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds);
+ ASSERT3U(err, ==, 0);
+ dsl_dataset_get_blkptr(ds, &bp);
+ if (BP_IS_HOLE(&bp)) {
+ objset_impl_t *osi;
+
+ /* This is an empty dmu_objset; not a clone. */
+ osi = dmu_objset_create_impl(dsl_dataset_get_spa(ds),
+ ds, oa->type, tx);
+
+ if (oa->userfunc)
+ oa->userfunc(&osi->os, oa->userarg, tx);
+ }
+ dsl_dataset_close(ds, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG);
+
+ return (0);
+}
+
+/*
+ * Create a new objset (filesystem/volume) or snapshot named 'name'.
+ * A trailing "@snap" component selects the snapshot path; otherwise a
+ * new dataset of the given type is created (optionally cloned from
+ * clone_parent), and 'func' is invoked in syncing context to
+ * initialize its contents.  Returns 0 or an errno value.
+ */
+int
+dmu_objset_create(const char *name, dmu_objset_type_t type,
+ objset_t *clone_parent,
+ void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg)
+{
+ dsl_dir_t *pds;
+ const char *tail;
+ int err = 0;
+
+ pds = dsl_dir_open(name, FTAG, &tail);
+ if (pds == NULL)
+ return (ENOENT);
+ /* tail == NULL means 'name' already exists as a dir. */
+ if (tail == NULL) {
+ dsl_dir_close(pds, FTAG);
+ return (EEXIST);
+ }
+
+ dprintf("name=%s\n", name);
+
+ if (tail[0] == '@') {
+ /*
+ * If we're creating a snapshot, make sure everything
+ * they might want is on disk. XXX Sketchy to know
+ * about snapshots here, better to put in DSL.
+ */
+ objset_t *os;
+ size_t plen = strchr(name, '@') - name + 1;
+ char *pbuf = kmem_alloc(plen, KM_SLEEP);
+ bcopy(name, pbuf, plen - 1);
+ pbuf[plen - 1] = '\0';
+
+ err = dmu_objset_open(pbuf, DMU_OST_ANY, DS_MODE_STANDARD, &os);
+ if (err == 0) {
+ /* Quiesce the ZIL across the snapshot sync task. */
+ err = zil_suspend(dmu_objset_zil(os));
+ if (err == 0) {
+ err = dsl_dir_sync_task(pds,
+ dsl_dataset_snapshot_sync,
+ (void*)(tail+1), 16*1024);
+ zil_resume(dmu_objset_zil(os));
+ }
+ dmu_objset_close(os);
+ }
+ kmem_free(pbuf, plen);
+ } else {
+ struct oscarg oa = { 0 };
+ oa.userfunc = func;
+ oa.userarg = arg;
+ oa.fullname = name;
+ oa.lastname = tail;
+ oa.type = type;
+ if (clone_parent != NULL) {
+ /*
+ * You can't clone to a different type.
+ */
+ if (clone_parent->os->os_phys->os_type != type) {
+ dsl_dir_close(pds, FTAG);
+ return (EINVAL);
+ }
+ oa.clone_parent = clone_parent->os->os_dsl_dataset;
+ }
+ err = dsl_dir_sync_task(pds, dmu_objset_create_sync, &oa,
+ 256*1024);
+ }
+ dsl_dir_close(pds, FTAG);
+ return (err);
+}
+
+/*
+ * Destroy the objset named 'name'.  Best-effort destroys any unplayed
+ * intent log first (see comment below), then defers to
+ * dsl_dataset_destroy().
+ */
+int
+dmu_objset_destroy(const char *name)
+{
+ objset_t *os;
+ int error;
+
+ /*
+ * If it looks like we'll be able to destroy it, and there's
+ * an unplayed replay log sitting around, destroy the log.
+ * It would be nicer to do this in dsl_dataset_destroy_sync(),
+ * but the replay log objset is modified in open context.
+ */
+ error = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os);
+ if (error == 0) {
+ zil_destroy(dmu_objset_zil(os));
+ dmu_objset_close(os);
+ }
+
+ /* XXX uncache everything? */
+ return (dsl_dataset_destroy(name));
+}
+
+/*
+ * Roll the objset named 'name' back to its most recent snapshot.  The
+ * ZIL is suspended/resumed first to quiesce any unplayed log records.
+ */
+int
+dmu_objset_rollback(const char *name)
+{
+ int err;
+ objset_t *os;
+
+ err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os);
+ if (err == 0) {
+ err = zil_suspend(dmu_objset_zil(os));
+ if (err == 0)
+ zil_resume(dmu_objset_zil(os));
+ dmu_objset_close(os);
+ if (err == 0) {
+ /* XXX uncache everything? */
+ err = dsl_dataset_rollback(name);
+ }
+ }
+ return (err);
+}
+
+/*
+ * Sync every dnode on 'list', one indirection level at a time.
+ * dnode_sync() returning 0 means the dnode still needs syncing at a
+ * higher level, so it is re-queued (in place) for the next pass; the
+ * list drains completely once all dnodes are fully synced.
+ * NOTE(review): the initializer of 'dn' is redundant -- the for loop
+ * below reassigns it before first use.
+ */
+static void
+dmu_objset_sync_dnodes(objset_impl_t *os, list_t *list, dmu_tx_t *tx)
+{
+ dnode_t *dn = list_head(list);
+ int level, err;
+
+ for (level = 0; dn = list_head(list); level++) {
+ zio_t *zio;
+ zio = zio_root(os->os_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+
+ ASSERT3U(level, <=, DN_MAX_LEVELS);
+
+ while (dn) {
+ dnode_t *next = list_next(list, dn);
+
+ list_remove(list, dn);
+ if (dnode_sync(dn, level, zio, tx) == 0) {
+ /*
+ * This dnode requires syncing at higher
+ * levels; put it back onto the list.
+ */
+ if (next)
+ list_insert_before(list, next, dn);
+ else
+ list_insert_tail(list, dn);
+ }
+ dn = next;
+ }
+ /* Wait for all of this level's writes to complete. */
+ err = zio_wait(zio);
+ ASSERT(err == 0);
+ }
+}
+
+/* ARGSUSED */
+/*
+ * arc_write() done callback for the objset root block: update the
+ * rootbp's fill count from the meta-dnode, stamp the bp's type/level,
+ * and, if the block moved, account the old block as killed and the new
+ * one as born against the dataset.
+ */
+static void
+killer(zio_t *zio, arc_buf_t *abuf, void *arg)
+{
+ objset_impl_t *os = arg;
+ objset_phys_t *osphys = zio->io_data;
+ dnode_phys_t *dnp = &osphys->os_meta_dnode;
+ int i;
+
+ ASSERT3U(zio->io_error, ==, 0);
+
+ /*
+ * Update rootbp fill count.
+ */
+ os->os_rootbp.blk_fill = 1; /* count the meta-dnode */
+ for (i = 0; i < dnp->dn_nblkptr; i++)
+ os->os_rootbp.blk_fill += dnp->dn_blkptr[i].blk_fill;
+
+ BP_SET_TYPE(zio->io_bp, DMU_OT_OBJSET);
+ BP_SET_LEVEL(zio->io_bp, 0);
+
+ /* Only account if the block actually moved. */
+ if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp),
+ BP_IDENTITY(&zio->io_bp_orig))) {
+ dsl_dataset_block_kill(os->os_dsl_dataset, &zio->io_bp_orig,
+ os->os_synctx);
+ dsl_dataset_block_born(os->os_dsl_dataset, zio->io_bp,
+ os->os_synctx);
+ }
+}
+
+
+/* called from dsl */
+/*
+ * Sync this objset for one txg: sync the freed and dirty dnodes, the
+ * ZIL, the meta-dnode, and finally write the root block (with killer()
+ * as the completion callback) and record its blkptr on the dataset.
+ */
+void
+dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx)
+{
+ extern taskq_t *dbuf_tq;
+ int txgoff;
+ list_t *dirty_list;
+ int err;
+ arc_buf_t *abuf =
+ arc_buf_alloc(os->os_spa, sizeof (objset_phys_t), FTAG);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(os->os_synctx == NULL);
+ /* XXX the write_done callback should really give us the tx... */
+ os->os_synctx = tx;
+
+ dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
+
+ txgoff = tx->tx_txg & TXG_MASK;
+
+ dmu_objset_sync_dnodes(os, &os->os_free_dnodes[txgoff], tx);
+ dmu_objset_sync_dnodes(os, &os->os_dirty_dnodes[txgoff], tx);
+
+ /*
+ * Free intent log blocks up to this tx.
+ */
+ zil_sync(os->os_zil, tx);
+
+ /*
+ * Sync meta-dnode
+ */
+ dirty_list = &os->os_dirty_dnodes[txgoff];
+ ASSERT(list_head(dirty_list) == NULL);
+ list_insert_tail(dirty_list, os->os_meta_dnode);
+ dmu_objset_sync_dnodes(os, dirty_list, tx);
+
+ /*
+ * Sync the root block.
+ */
+ bcopy(os->os_phys, abuf->b_data, sizeof (objset_phys_t));
+ err = arc_write(NULL, os->os_spa, os->os_md_checksum,
+ os->os_md_compress, tx->tx_txg, &os->os_rootbp, abuf, killer, os,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT);
+ ASSERT(err == 0);
+ arc_buf_free(abuf, FTAG);
+
+ dsl_dataset_set_blkptr(os->os_dsl_dataset, &os->os_rootbp, tx);
+
+ ASSERT3P(os->os_synctx, ==, tx);
+ /* Drain any outstanding dbuf-teardown work before clearing synctx. */
+ taskq_wait(dbuf_tq);
+ os->os_synctx = NULL;
+}
+
+/*
+ * Fill in *dds from the dataset's stats; for the meta-objset (no
+ * dataset), return zeroed stats with only the type filled in.
+ */
+void
+dmu_objset_stats(objset_t *os, dmu_objset_stats_t *dds)
+{
+ if (os->os->os_dsl_dataset != NULL) {
+ dsl_dataset_stats(os->os->os_dsl_dataset, dds);
+ } else {
+ ASSERT(os->os->os_phys->os_type == DMU_OST_META);
+ bzero(dds, sizeof (*dds));
+ }
+ dds->dds_type = os->os->os_phys->os_type;
+}
+
+/*
+ * Returns nonzero iff this objset's dataset is a snapshot.
+ */
+int
+dmu_objset_is_snapshot(objset_t *os)
+{
+ if (os->os->os_dsl_dataset != NULL)
+ return (dsl_dataset_is_snapshot(os->os->os_dsl_dataset));
+ else
+ return (B_FALSE);
+}
+
+/*
+ * Iterate over this dataset's snapshot names.  *offp is an opaque
+ * resume cookie (a serialized zap cursor): pass 0 to start, and pass
+ * back the value returned here to continue.  On success, copies the
+ * next snapshot name into name[namelen] and its object id into *id.
+ * Returns ENOENT when exhausted, ENAMETOOLONG if the name won't fit.
+ */
+int
+dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
+ uint64_t *id, uint64_t *offp)
+{
+ dsl_dataset_t *ds = os->os->os_dsl_dataset;
+ zap_cursor_t cursor;
+ zap_attribute_t attr;
+
+ if (ds->ds_phys->ds_snapnames_zapobj == 0)
+ return (ENOENT);
+
+ zap_cursor_init_serialized(&cursor,
+ ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_phys->ds_snapnames_zapobj, *offp);
+
+ if (zap_cursor_retrieve(&cursor, &attr) != 0)
+ return (ENOENT);
+
+ if (strlen(attr.za_name) + 1 > namelen)
+ return (ENAMETOOLONG);
+
+ (void) strcpy(name, attr.za_name);
+ *id = attr.za_first_integer;
+ /* Hand back a cookie positioned just past this entry. */
+ zap_cursor_advance(&cursor);
+ *offp = zap_cursor_serialize(&cursor);
+
+ return (0);
+}
+
+/*
+ * Find all objsets under name, and for each, call 'func(child_name, arg)'.
+ * Recurses into child dirs; with DS_FIND_SNAPSHOTS in flags, also
+ * visits each snapshot ("name@snap").  The dataset itself is visited
+ * last (children first, depth-first).
+ */
+void
+dmu_objset_find(char *name, void func(char *, void *), void *arg, int flags)
+{
+ dsl_dir_t *dd;
+ objset_t *os;
+ uint64_t snapobj;
+ zap_cursor_t zc;
+ zap_attribute_t attr;
+ char *child;
+ int do_self;
+
+ dd = dsl_dir_open(name, FTAG, NULL);
+ if (dd == NULL)
+ return;
+
+ /* Only dirs with a head dataset are themselves visited. */
+ do_self = (dd->dd_phys->dd_head_dataset_obj != 0);
+
+ /*
+ * Iterate over all children.
+ */
+ if (dd->dd_phys->dd_child_dir_zapobj != 0) {
+ for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset,
+ dd->dd_phys->dd_child_dir_zapobj);
+ zap_cursor_retrieve(&zc, &attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT(attr.za_integer_length == sizeof (uint64_t));
+ ASSERT(attr.za_num_integers == 1);
+
+ /*
+ * No separating '/' because parent's name ends in /.
+ */
+ child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ /* XXX could probably just use name here */
+ dsl_dir_name(dd, child);
+ (void) strcat(child, "/");
+ (void) strcat(child, attr.za_name);
+ dmu_objset_find(child, func, arg, flags);
+ kmem_free(child, MAXPATHLEN);
+ }
+ }
+
+ /*
+ * Iterate over all snapshots.
+ */
+ if ((flags & DS_FIND_SNAPSHOTS) &&
+ dmu_objset_open(name, DMU_OST_ANY,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &os) == 0) {
+
+ snapobj = os->os->os_dsl_dataset->ds_phys->ds_snapnames_zapobj;
+ dmu_objset_close(os);
+
+ for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset, snapobj);
+ zap_cursor_retrieve(&zc, &attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT(attr.za_integer_length == sizeof (uint64_t));
+ ASSERT(attr.za_num_integers == 1);
+
+ child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ /* XXX could probably just use name here */
+ dsl_dir_name(dd, child);
+ (void) strcat(child, "@");
+ (void) strcat(child, attr.za_name);
+ func(child, arg);
+ kmem_free(child, MAXPATHLEN);
+ }
+ }
+
+ dsl_dir_close(dd, FTAG);
+
+ /*
+ * Apply to self if appropriate.
+ */
+ if (do_self)
+ func(name, arg);
+}
diff --git a/usr/src/uts/common/fs/zfs/dmu_traverse.c b/usr/src/uts/common/fs/zfs/dmu_traverse.c
new file mode 100644
index 0000000000..036e3965cf
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c
@@ -0,0 +1,792 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dnode.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_impl.h>
+
+#define BP_SPAN_SHIFT(level, width) ((level) * (width))
+
+#define BP_EQUAL(b1, b2) \
+ (DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) && \
+ (b1)->blk_birth == (b2)->blk_birth)
+
+/*
+ * Compare two bookmarks.
+ *
+ * For ADVANCE_PRE, the visitation order is:
+ *
+ * objset 0, 1, 2, ..., ZB_MAXOBJSET.
+ * object 0, 1, 2, ..., ZB_MAXOBJECT.
+ * blkoff 0, 1, 2, ...
+ * level ZB_MAXLEVEL, ..., 2, 1, 0.
+ *
+ * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid
+ * ordering vector is:
+ *
+ * < objset, object, blkoff, -level >
+ *
+ * For ADVANCE_POST, the starting offsets aren't sequential but ending
+ * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are.
+ * The visitation order is:
+ *
+ * objset 1, 2, ..., ZB_MAXOBJSET, 0.
+ * object 1, 2, ..., ZB_MAXOBJECT, 0.
+ * blkoff 1, 2, ...
+ * level 0, 1, 2, ..., ZB_MAXLEVEL.
+ *
+ * and thus a valid ordering vector is:
+ *
+ * < objset - 1, object - 1, blkoff, level >
+ *
+ * Both orderings can be expressed as:
+ *
+ * < objset + bias, object + bias, blkoff, level ^ bias >
+ *
+ * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST)
+ * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift).
+ *
+ * Special case: an objset's osphys is represented as level -1 of object 0.
+ * It is always either the very first or very last block we visit in an objset.
+ * Therefore, if either bookmark's level is -1, level alone determines order.
+ */
+static int
+compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp,
+ int advance)
+{
+ int bias = (advance & ADVANCE_PRE) ? 0 : -1;
+ uint64_t sblkoff, eblkoff;
+ int slevel, elevel, wshift;
+
+ if (szb->zb_objset + bias < ezb->zb_objset + bias)
+ return (-1);
+
+ if (szb->zb_objset + bias > ezb->zb_objset + bias)
+ return (1);
+
+ slevel = szb->zb_level;
+ elevel = ezb->zb_level;
+
+ if ((slevel | elevel) < 0)
+ return ((slevel ^ bias) - (elevel ^ bias));
+
+ if (szb->zb_object + bias < ezb->zb_object + bias)
+ return (-1);
+
+ if (szb->zb_object + bias > ezb->zb_object + bias)
+ return (1);
+
+ if (dnp == NULL)
+ return (0);
+
+ wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift);
+ eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift);
+
+ if (sblkoff < eblkoff)
+ return (-1);
+
+ if (sblkoff > eblkoff)
+ return (1);
+
+ return ((elevel ^ bias) - (slevel ^ bias));
+}
+
+#define SET_BOOKMARK(zb, objset, object, level, blkid) \
+{ \
+ (zb)->zb_objset = objset; \
+ (zb)->zb_object = object; \
+ (zb)->zb_level = level; \
+ (zb)->zb_blkid = blkid; \
+}
+
+#define SET_BOOKMARK_LB(zb, level, blkid) \
+{ \
+ (zb)->zb_level = level; \
+ (zb)->zb_blkid = blkid; \
+}
+
+static int
+advance_objset(zseg_t *zseg, uint64_t objset, int advance)
+{
+ zbookmark_t *zb = &zseg->seg_start;
+
+ if (advance & ADVANCE_PRE) {
+ if (objset >= ZB_MAXOBJSET)
+ return (ERANGE);
+ SET_BOOKMARK(zb, objset, 0, -1, 0);
+ } else {
+ if (objset >= ZB_MAXOBJSET)
+ objset = 0;
+ SET_BOOKMARK(zb, objset, 1, 0, 0);
+ }
+
+ if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
+ return (ERANGE);
+
+ return (EAGAIN);
+}
+
+static int
+advance_object(zseg_t *zseg, uint64_t object, int advance)
+{
+ zbookmark_t *zb = &zseg->seg_start;
+
+ if (advance & ADVANCE_PRE) {
+ if (object >= ZB_MAXOBJECT) {
+ SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0);
+ } else {
+ SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0);
+ }
+ } else {
+ if (zb->zb_object == 0) {
+ SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0);
+ } else {
+ if (object >= ZB_MAXOBJECT)
+ object = 0;
+ SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0);
+ }
+ }
+
+ if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
+ return (ERANGE);
+
+ return (EAGAIN);
+}
+
+static int
+advance_from_osphys(zseg_t *zseg, int advance)
+{
+ zbookmark_t *zb = &zseg->seg_start;
+
+ ASSERT(zb->zb_object == 0);
+ ASSERT(zb->zb_level == -1);
+ ASSERT(zb->zb_blkid == 0);
+
+ if (advance & ADVANCE_PRE) {
+ SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0);
+ } else {
+ if (zb->zb_objset == 0)
+ return (ERANGE);
+ SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0);
+ }
+
+ if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
+ return (ERANGE);
+
+ return (EAGAIN);
+}
+
+static int
+advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance)
+{
+ zbookmark_t *zb = &zseg->seg_start;
+ int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int maxlevel = dnp->dn_nlevels - 1;
+ int level = zb->zb_level;
+ uint64_t blkid = zb->zb_blkid;
+
+ if (advance & ADVANCE_PRE) {
+ if (level > 0 && rc == 0) {
+ level--;
+ blkid <<= wshift;
+ } else {
+ blkid++;
+
+ if ((blkid << BP_SPAN_SHIFT(level, wshift)) >
+ dnp->dn_maxblkid)
+ return (ERANGE);
+
+ while (level < maxlevel) {
+ if (P2PHASE(blkid, 1ULL << wshift))
+ break;
+ blkid >>= wshift;
+ level++;
+ }
+ }
+ } else {
+ if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) {
+ blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift);
+ level = 0;
+ } else {
+ blkid >>= wshift;
+ level++;
+ }
+
+ while ((blkid << BP_SPAN_SHIFT(level, wshift)) >
+ dnp->dn_maxblkid) {
+ if (level == maxlevel)
+ return (ERANGE);
+ blkid >>= wshift;
+ level++;
+ }
+ }
+ SET_BOOKMARK_LB(zb, level, blkid);
+
+ if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0)
+ return (ERANGE);
+
+ return (EAGAIN);
+}
+
+static int
+traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc)
+{
+ /*
+ * Before we issue the callback, prune against maxtxg.
+ *
+ * We prune against mintxg before we get here because it's a big win.
+ * If a given block was born in txg 37, then we know that the entire
+ * subtree below that block must have been born in txg 37 or earlier.
+ * We can therefore lop off huge branches of the tree as we go.
+ *
+ * There's no corresponding optimization for maxtxg because knowing
+ * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's
+ * children. In fact, the copy-on-write design of ZFS ensures that
+ * top-level blocks will pretty much always be new.
+ *
+ * Therefore, in the name of simplicity we don't prune against
+ * maxtxg until the last possible moment -- that being right now.
+ */
+ if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg)
+ return (0);
+
+ if (bc->bc_errno == 0) {
+ zbookmark_t *zb = &bc->bc_bookmark;
+ zbookmark_t *szb = &zseg->seg_start;
+ zbookmark_t *ezb = &zseg->seg_end;
+ zbookmark_t *lzb = &th->th_lastcb;
+ dnode_phys_t *dnp = bc->bc_dnode;
+
+ /*
+ * Debugging: verify that the order we visit things
+ * agrees with the order defined by compare_bookmark().
+ */
+ ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0);
+ ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0);
+ ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 ||
+ lzb->zb_level == ZB_NO_LEVEL);
+ *lzb = *zb;
+ }
+
+ th->th_callbacks++;
+ return (th->th_func(bc, th->th_spa, th->th_arg));
+}
+
+static int
+traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp,
+ dnode_phys_t *dnp)
+{
+ zbookmark_t *zb = &bc->bc_bookmark;
+ int error;
+
+ th->th_hits++;
+
+ bc->bc_dnode = dnp;
+ bc->bc_errno = 0;
+
+ if (BP_EQUAL(&bc->bc_blkptr, bp))
+ return (0);
+
+ bc->bc_blkptr = *bp;
+
+ if (bc->bc_data == NULL)
+ return (0);
+
+ if (BP_IS_HOLE(bp)) {
+ ASSERT(th->th_advance & ADVANCE_HOLES);
+ return (0);
+ }
+
+ if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) {
+ error = EIO;
+ } else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) {
+ error = 0;
+ th->th_arc_hits++;
+ } else {
+ error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data,
+ BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ,
+ th->th_zio_flags | ZIO_FLAG_DONT_CACHE));
+
+ if (BP_SHOULD_BYTESWAP(bp) && error == 0)
+ (zb->zb_level > 0 ? byteswap_uint64_array :
+ dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data,
+ BP_GET_LSIZE(bp));
+ th->th_reads++;
+ }
+
+ if (error) {
+ bc->bc_errno = error;
+ error = traverse_callback(th, NULL, bc);
+ ASSERT(error == EAGAIN || error == EINTR || error == ERESTART);
+ bc->bc_blkptr.blk_birth = -1ULL;
+ }
+
+ dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n",
+ bc - &th->th_cache[0][0], error,
+ zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
+
+ return (error);
+}
+
+static int
+find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth)
+{
+ zbookmark_t *zb = &zseg->seg_start;
+ traverse_blk_cache_t *bc;
+ blkptr_t *bp = dnp->dn_blkptr;
+ int i, first, level;
+ int nbp = dnp->dn_nblkptr;
+ int minlevel = zb->zb_level;
+ int maxlevel = dnp->dn_nlevels - 1;
+ int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift);
+ uint64_t blkid = zb->zb_blkid >> bp_shift;
+ int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE;
+ int rc;
+
+ if (minlevel > maxlevel || blkid >= nbp)
+ return (ERANGE);
+
+ for (level = maxlevel; level >= minlevel; level--) {
+ first = P2PHASE(blkid, 1ULL << wshift);
+
+ for (i = first; i < nbp; i++)
+ if (bp[i].blk_birth > zseg->seg_mintxg ||
+ BP_IS_HOLE(&bp[i]) && do_holes)
+ break;
+
+ if (i != first) {
+ i--;
+ SET_BOOKMARK_LB(zb, level, blkid + (i - first));
+ return (ENOTBLK);
+ }
+
+ bc = &th->th_cache[depth][level];
+
+ SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object,
+ level, blkid);
+
+ if (rc = traverse_read(th, bc, bp + i, dnp)) {
+ if (rc != EAGAIN) {
+ SET_BOOKMARK_LB(zb, level, blkid);
+ }
+ return (rc);
+ }
+
+ if (BP_IS_HOLE(&bp[i])) {
+ SET_BOOKMARK_LB(zb, level, blkid);
+ th->th_lastcb.zb_level = ZB_NO_LEVEL;
+ return (0);
+ }
+
+ nbp = 1 << wshift;
+ bp = bc->bc_data;
+ bp_shift -= wshift;
+ blkid = zb->zb_blkid >> bp_shift;
+ }
+
+ return (0);
+}
+
+static int
+get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn,
+ uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth)
+{
+ zseg_t zseg;
+ zbookmark_t *zb = &zseg.seg_start;
+ uint64_t object = *objectp;
+ int i, rc;
+
+ SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK);
+ SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID);
+
+ zseg.seg_mintxg = txg;
+ zseg.seg_maxtxg = -1ULL;
+
+ for (;;) {
+ rc = find_block(th, &zseg, mdn, depth);
+
+ if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
+ break;
+
+ if (rc == 0 && zb->zb_level == 0) {
+ dnode_phys_t *dnp = th->th_cache[depth][0].bc_data;
+ for (i = 0; i < DNODES_PER_BLOCK; i++) {
+ object = (zb->zb_blkid * DNODES_PER_BLOCK) + i;
+ if (object >= *objectp &&
+ dnp[i].dn_type != DMU_OT_NONE &&
+ (type == -1 || dnp[i].dn_type == type)) {
+ *objectp = object;
+ *dnpp = &dnp[i];
+ return (0);
+ }
+ }
+ }
+
+ rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE);
+
+ if (rc == ERANGE)
+ break;
+ }
+
+ if (rc == ERANGE)
+ *objectp = ZB_MAXOBJECT;
+
+ return (rc);
+}
+
+static int
+traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
+{
+ zbookmark_t *zb = &zseg->seg_start;
+ traverse_blk_cache_t *bc;
+ dnode_phys_t *dn, *dn_tmp;
+ int worklimit = 1000;
+ int rc;
+
+ dprintf("<%llu, %llu, %d, %llx>\n",
+ zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
+
+ bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1];
+ dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
+
+ SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0);
+
+ rc = traverse_read(th, bc, mosbp, dn);
+
+ if (rc) /* If we get ERESTART, we've got nowhere left to go */
+ return (rc == ERESTART ? EINTR : rc);
+
+ ASSERT(dn->dn_nlevels < ZB_MAXLEVEL);
+
+ if (zb->zb_objset != 0) {
+ uint64_t objset = zb->zb_objset;
+ dsl_dataset_phys_t *dsp;
+
+ rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0,
+ DMU_OT_DSL_OBJSET, ZB_MOS_CACHE);
+
+ if (objset != zb->zb_objset)
+ rc = advance_objset(zseg, objset, th->th_advance);
+
+ if (rc != 0)
+ return (rc);
+
+ dsp = DN_BONUS(dn_tmp);
+
+ bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1];
+ dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
+
+ SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0);
+
+ rc = traverse_read(th, bc, &dsp->ds_bp, dn);
+
+ if (rc != 0) {
+ if (rc == ERESTART)
+ rc = advance_objset(zseg, zb->zb_objset + 1,
+ th->th_advance);
+ return (rc);
+ }
+
+ if (th->th_advance & ADVANCE_PRUNE)
+ zseg->seg_mintxg =
+ MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg);
+ }
+
+ if (zb->zb_level == -1) {
+ ASSERT(zb->zb_object == 0);
+
+ if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) {
+ rc = traverse_callback(th, zseg, bc);
+ if (rc) {
+ ASSERT(rc == EINTR);
+ return (rc);
+ }
+ }
+
+ return (advance_from_osphys(zseg, th->th_advance));
+ }
+
+ if (zb->zb_object != 0) {
+ uint64_t object = zb->zb_object;
+
+ rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp,
+ zseg->seg_mintxg, -1, ZB_MDN_CACHE);
+
+ if (object != zb->zb_object)
+ rc = advance_object(zseg, object, th->th_advance);
+
+ if (rc != 0)
+ return (rc);
+
+ dn = dn_tmp;
+ }
+
+ if (zb->zb_level == ZB_MAXLEVEL)
+ zb->zb_level = dn->dn_nlevels - 1;
+
+ for (;;) {
+ rc = find_block(th, zseg, dn, ZB_DN_CACHE);
+
+ if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
+ break;
+
+ if (rc == 0) {
+ bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level];
+ ASSERT(bc->bc_dnode == dn);
+ ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth);
+ rc = traverse_callback(th, zseg, bc);
+ if (rc) {
+ ASSERT(rc == EINTR);
+ return (rc);
+ }
+ if (BP_IS_HOLE(&bc->bc_blkptr)) {
+ ASSERT(th->th_advance & ADVANCE_HOLES);
+ rc = ENOTBLK;
+ }
+ }
+
+ rc = advance_block(zseg, dn, rc, th->th_advance);
+
+ if (rc == ERANGE)
+ break;
+
+ /*
+ * Give spa_sync() a chance to run.
+ */
+ if (spa_traverse_wanted(th->th_spa)) {
+ th->th_syncs++;
+ return (EAGAIN);
+ }
+
+ if (--worklimit == 0)
+ return (EAGAIN);
+ }
+
+ if (rc == ERANGE)
+ rc = advance_object(zseg, zb->zb_object + 1, th->th_advance);
+
+ return (rc);
+}
+
+/*
+ * It is the caller's responsibility to ensure that the dsl_dataset_t
+ * doesn't go away during traversal.
+ */
+int
+traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance,
+ blkptr_cb_t func, void *arg)
+{
+ spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+ traverse_handle_t *th;
+ int err;
+
+ th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED);
+
+ traverse_add_objset(th, txg_start, -1ULL, ds->ds_object);
+
+ while ((err = traverse_more(th)) == EAGAIN)
+ continue;
+
+ traverse_fini(th);
+ return (err);
+}
+
+int
+traverse_more(traverse_handle_t *th)
+{
+ zseg_t *zseg = list_head(&th->th_seglist);
+ uint64_t save_txg; /* XXX won't be necessary with real itinerary */
+ krwlock_t *rw = spa_traverse_rwlock(th->th_spa);
+ blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa);
+ int rc;
+
+ if (zseg == NULL)
+ return (0);
+
+ th->th_restarts++;
+
+ save_txg = zseg->seg_mintxg;
+
+ if (!(th->th_advance & ADVANCE_NOLOCK))
+ rw_enter(rw, RW_READER);
+
+ rc = traverse_segment(th, zseg, mosbp);
+ ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR);
+
+ if (!(th->th_advance & ADVANCE_NOLOCK))
+ rw_exit(rw);
+
+ zseg->seg_mintxg = save_txg;
+
+ if (rc == ERANGE) {
+ list_remove(&th->th_seglist, zseg);
+ kmem_free(zseg, sizeof (*zseg));
+ return (EAGAIN);
+ }
+
+ return (rc);
+}
+
+/*
+ * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves
+ * are not included. The blocks covered by this segment will all have
+ * mintxg < birth < maxtxg.
+ */
+static void
+traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid,
+ uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid)
+{
+ zseg_t *zseg;
+
+ zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP);
+
+ zseg->seg_mintxg = mintxg;
+ zseg->seg_maxtxg = maxtxg;
+
+ zseg->seg_start.zb_objset = sobjset;
+ zseg->seg_start.zb_object = sobject;
+ zseg->seg_start.zb_level = slevel;
+ zseg->seg_start.zb_blkid = sblkid;
+
+ zseg->seg_end.zb_objset = eobjset;
+ zseg->seg_end.zb_object = eobject;
+ zseg->seg_end.zb_level = elevel;
+ zseg->seg_end.zb_blkid = eblkid;
+
+ list_insert_tail(&th->th_seglist, zseg);
+}
+
+void
+traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t objset, uint64_t object)
+{
+ if (th->th_advance & ADVANCE_PRE)
+ traverse_add_segment(th, mintxg, maxtxg,
+ objset, object, ZB_MAXLEVEL, 0,
+ objset, object, 0, ZB_MAXBLKID);
+ else
+ traverse_add_segment(th, mintxg, maxtxg,
+ objset, object, 0, 0,
+ objset, object, 0, ZB_MAXBLKID);
+}
+
+void
+traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t objset)
+{
+ if (th->th_advance & ADVANCE_PRE)
+ traverse_add_segment(th, mintxg, maxtxg,
+ objset, 0, -1, 0,
+ objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
+ else
+ traverse_add_segment(th, mintxg, maxtxg,
+ objset, 1, 0, 0,
+ objset, 0, -1, 0);
+}
+
+void
+traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg)
+{
+ if (th->th_advance & ADVANCE_PRE)
+ traverse_add_segment(th, mintxg, maxtxg,
+ 0, 0, -1, 0,
+ ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
+ else
+ traverse_add_segment(th, mintxg, maxtxg,
+ 1, 1, 0, 0,
+ 0, 0, -1, 0);
+}
+
+traverse_handle_t *
+traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance,
+ int zio_flags)
+{
+ traverse_handle_t *th;
+ int d, l;
+
+ th = kmem_zalloc(sizeof (*th), KM_SLEEP);
+
+ th->th_spa = spa;
+ th->th_func = func;
+ th->th_arg = arg;
+ th->th_advance = advance;
+ th->th_lastcb.zb_level = ZB_NO_LEVEL;
+ th->th_noread.zb_level = ZB_NO_LEVEL;
+ th->th_zio_flags = zio_flags;
+
+ list_create(&th->th_seglist, sizeof (zseg_t),
+ offsetof(zseg_t, seg_node));
+
+ for (d = 0; d < ZB_DEPTH; d++) {
+ for (l = 0; l < ZB_MAXLEVEL; l++) {
+ if ((advance & ADVANCE_DATA) ||
+ l != 0 || d != ZB_DN_CACHE)
+ th->th_cache[d][l].bc_data =
+ zio_buf_alloc(SPA_MAXBLOCKSIZE);
+ }
+ }
+
+ return (th);
+}
+
+void
+traverse_fini(traverse_handle_t *th)
+{
+ int d, l;
+ zseg_t *zseg;
+
+ for (d = 0; d < ZB_DEPTH; d++)
+ for (l = 0; l < ZB_MAXLEVEL; l++)
+ if (th->th_cache[d][l].bc_data != NULL)
+ zio_buf_free(th->th_cache[d][l].bc_data,
+ SPA_MAXBLOCKSIZE);
+
+ while ((zseg = list_head(&th->th_seglist)) != NULL) {
+ list_remove(&th->th_seglist, zseg);
+ kmem_free(zseg, sizeof (*zseg));
+ }
+
+ list_destroy(&th->th_seglist);
+
+ dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n",
+ th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks,
+ th->th_syncs, th->th_restarts);
+
+ kmem_free(th, sizeof (*th));
+}
diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c
new file mode 100644
index 0000000000..5dd827e946
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c
@@ -0,0 +1,801 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
+#include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
+#include <sys/dsl_pool.h>
+#include <sys/zap_impl.h> /* for ZAP_BLOCK_SHIFT */
+#include <sys/spa.h>
+#include <sys/zfs_context.h>
+
+#ifdef ZFS_DEBUG
+int dmu_use_tx_debug_bufs = 1;
+#endif
+
+dmu_tx_t *
+dmu_tx_create_ds(dsl_dir_t *dd)
+{
+ dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
+ tx->tx_dir = dd;
+ if (dd)
+ tx->tx_pool = dd->dd_pool;
+ list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
+ offsetof(dmu_tx_hold_t, dth_node));
+ refcount_create(&tx->tx_space_written);
+ refcount_create(&tx->tx_space_freed);
+ return (tx);
+}
+
+dmu_tx_t *
+dmu_tx_create(objset_t *os)
+{
+ dmu_tx_t *tx = dmu_tx_create_ds(os->os->os_dsl_dataset->ds_dir);
+ tx->tx_objset = os;
+ return (tx);
+}
+
+dmu_tx_t *
+dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
+{
+ dmu_tx_t *tx = dmu_tx_create_ds(NULL);
+
+ ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
+ tx->tx_pool = dp;
+ tx->tx_txg = txg;
+ tx->tx_anyobj = TRUE;
+
+ return (tx);
+}
+
+int
+dmu_tx_is_syncing(dmu_tx_t *tx)
+{
+ return (tx->tx_anyobj);
+}
+
+int
+dmu_tx_private_ok(dmu_tx_t *tx)
+{
+ return (tx->tx_anyobj || tx->tx_privateobj);
+}
+
+static void
+dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
+ enum dmu_tx_hold_type type, dmu_tx_hold_func_t func,
+ uint64_t arg1, uint64_t arg2)
+{
+ dmu_tx_hold_t *dth;
+ dnode_t *dn = NULL;
+
+ if (object != DMU_NEW_OBJECT) {
+ dn = dnode_hold(os->os, object, tx);
+
+ if (tx->tx_txg != 0) {
+ mutex_enter(&dn->dn_mtx);
+ /*
+ * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
+ * problem, but there's no way for it to happen (for
+ * now, at least).
+ */
+ ASSERT(dn->dn_assigned_txg == 0);
+ ASSERT(dn->dn_assigned_tx == NULL);
+ dn->dn_assigned_txg = tx->tx_txg;
+ dn->dn_assigned_tx = tx;
+ (void) refcount_add(&dn->dn_tx_holds, tx);
+ mutex_exit(&dn->dn_mtx);
+ }
+ }
+
+ dth = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
+ dth->dth_dnode = dn;
+ dth->dth_type = type;
+ dth->dth_func = func;
+ dth->dth_arg1 = arg1;
+ dth->dth_arg2 = arg2;
+ /*
+ * XXX Investigate using a different data structure to keep
+ * track of dnodes in a tx. Maybe array, since there will
+ * generally not be many entries?
+ */
+ list_insert_tail(&tx->tx_holds, dth);
+}
+
+void
+dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
+{
+ /*
+ * If we're syncing, they can manipulate any object anyhow, and
+ * the hold on the dnode_t can cause problems.
+ */
+ if (!dmu_tx_is_syncing(tx)) {
+ dmu_tx_hold_object_impl(tx, os, object, THT_NEWOBJECT,
+ NULL, 0, 0);
+ }
+}
+
+/* ARGSUSED */
+static void
+dmu_tx_count_write(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
+{
+ uint64_t start, end, space;
+ int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
+
+ if (len == 0)
+ return;
+
+ min_bs = SPA_MINBLOCKSHIFT;
+ max_bs = SPA_MAXBLOCKSHIFT;
+ min_ibs = DN_MIN_INDBLKSHIFT;
+ max_ibs = DN_MAX_INDBLKSHIFT;
+
+ /*
+ * If there's more than one block, the blocksize can't change,
+ * so we can make a more precise estimate. Alternatively,
+ * if the dnode's ibs is larger than max_ibs, always use that.
+ * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
+ * the code will still work correctly on existing pools.
+ */
+ if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) {
+ min_ibs = max_ibs = dn->dn_indblkshift;
+ if (dn->dn_datablkshift != 0)
+ min_bs = max_bs = dn->dn_datablkshift;
+ }
+
+ /*
+ * 'end' is the last thing we will access, not one past.
+ * This way we won't overflow when accessing the last byte.
+ */
+ start = P2ALIGN(off, 1ULL << max_bs);
+ end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
+ space = end - start + 1;
+
+ start >>= min_bs;
+ end >>= min_bs;
+
+ epbs = min_ibs - SPA_BLKPTRSHIFT;
+
+ /*
+ * The object contains at most 2^(64 - min_bs) blocks,
+ * and each indirect level maps 2^epbs.
+ */
+ for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
+ start >>= epbs;
+ end >>= epbs;
+ /*
+ * If we increase the number of levels of indirection,
+ * we'll need new blkid=0 indirect blocks. If start == 0,
+	 * we're already accounting for that block; and if end == 0,
+ * we can't increase the number of levels beyond that.
+ */
+ if (start != 0 && end != 0)
+ space += 1ULL << max_ibs;
+ space += (end - start + 1) << max_ibs;
+ }
+
+ ASSERT(space < 2 * DMU_MAX_ACCESS);
+
+ tx->tx_space_towrite += space;
+}
+
+static void
+dmu_tx_count_dnode(dmu_tx_t *tx, dnode_t *dn)
+{
+ dnode_t *mdn = tx->tx_objset->os->os_meta_dnode;
+ uint64_t object = dn ? dn->dn_object : DN_MAX_OBJECT - 1;
+ uint64_t pre_write_space;
+
+ ASSERT(object < DN_MAX_OBJECT);
+ pre_write_space = tx->tx_space_towrite;
+ dmu_tx_count_write(tx, mdn, object << DNODE_SHIFT, 1 << DNODE_SHIFT);
+ if (dn && dn->dn_dbuf->db_blkptr &&
+ dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
+ dn->dn_dbuf->db_blkptr->blk_birth, tx)) {
+ tx->tx_space_tooverwrite +=
+ tx->tx_space_towrite - pre_write_space;
+ tx->tx_space_towrite = pre_write_space;
+ }
+}
+
+/* ARGSUSED */
+static void
+dmu_tx_hold_write_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
+{
+ dmu_tx_count_write(tx, dn, off, len);
+ dmu_tx_count_dnode(tx, dn);
+}
+
+void
+dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
+{
+ ASSERT(tx->tx_txg == 0);
+ ASSERT(len > 0 && len < DMU_MAX_ACCESS);
+ ASSERT(UINT64_MAX - off >= len - 1);
+
+ dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_WRITE,
+ dmu_tx_hold_write_impl, off, len);
+}
+
+static void
+dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
+{
+ uint64_t blkid, nblks;
+ uint64_t space = 0;
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+
+ ASSERT(dn->dn_assigned_tx == tx || dn->dn_assigned_tx == NULL);
+
+ if (dn->dn_datablkshift == 0)
+ return;
+ /*
+	 * Not that the dnode can change (it isn't dirty), but
+	 * dbuf_hold_impl() wants us to have the struct_rwlock.
+	 * We also need it to protect dn_maxblkid.
+ */
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ blkid = off >> dn->dn_datablkshift;
+ nblks = (off + len) >> dn->dn_datablkshift;
+
+ if (blkid >= dn->dn_maxblkid)
+ goto out;
+ if (blkid + nblks > dn->dn_maxblkid)
+ nblks = dn->dn_maxblkid - blkid;
+
+	/* don't bother past the first 128K (128*1024) blocks */
+ nblks = MIN(nblks, 128*1024);
+
+ if (dn->dn_phys->dn_nlevels == 1) {
+ int i;
+ for (i = 0; i < nblks; i++) {
+ blkptr_t *bp = dn->dn_phys->dn_blkptr;
+ ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr);
+ bp += blkid + i;
+ if (dsl_dataset_block_freeable(ds, bp->blk_birth, tx)) {
+ dprintf_bp(bp, "can free old%s", "");
+ space += BP_GET_ASIZE(bp);
+ }
+ }
+ goto out;
+ }
+
+ while (nblks) {
+ dmu_buf_impl_t *dbuf;
+ int err, epbs, blkoff, tochk;
+
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ blkoff = P2PHASE(blkid, 1<<epbs);
+ tochk = MIN((1<<epbs) - blkoff, nblks);
+
+ err = dbuf_hold_impl(dn, 1, blkid >> epbs, TRUE, FTAG, &dbuf);
+ if (err == 0) {
+ int i;
+ blkptr_t *bp;
+
+ dbuf_read_havestruct(dbuf);
+
+ bp = dbuf->db.db_data;
+ bp += blkoff;
+
+ for (i = 0; i < tochk; i++) {
+ if (dsl_dataset_block_freeable(ds,
+ bp[i].blk_birth, tx)) {
+ dprintf_bp(&bp[i],
+ "can free old%s", "");
+ space += BP_GET_ASIZE(&bp[i]);
+ }
+ }
+ dbuf_remove_ref(dbuf, FTAG);
+ } else {
+ /* the indirect block is sparse */
+ ASSERT(err == ENOENT);
+ }
+
+ blkid += tochk;
+ nblks -= tochk;
+ }
+out:
+ rw_exit(&dn->dn_struct_rwlock);
+
+ tx->tx_space_tofree += space;
+}
+
+static void
+dmu_tx_hold_free_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
+{
+ int dirty;
+
+ /* first block */
+ if (off != 0 /* || dn->dn_maxblkid == 0 */)
+ dmu_tx_count_write(tx, dn, off, 1);
+ /* last block */
+ if (len != DMU_OBJECT_END)
+ dmu_tx_count_write(tx, dn, off+len, 1);
+
+ dmu_tx_count_dnode(tx, dn);
+
+ if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
+ return;
+ if (len == DMU_OBJECT_END)
+ len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
+
+ /* XXX locking */
+ dirty = dn->dn_dirtyblksz[0] | dn->dn_dirtyblksz[1] |
+ dn->dn_dirtyblksz[2] | dn->dn_dirtyblksz[3];
+ if (dn->dn_assigned_tx != NULL && !dirty)
+ dmu_tx_count_free(tx, dn, off, len);
+}
+
+void
+dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
+{
+ ASSERT(tx->tx_txg == 0);
+
+ dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_FREE,
+ dmu_tx_hold_free_impl, off, len);
+}
+
+/* ARGSUSED */
+static void
+dmu_tx_hold_zap_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t nops, uint64_t cops)
+{
+ uint64_t nblocks;
+ int epbs;
+
+ dmu_tx_count_dnode(tx, dn);
+
+ if (dn == NULL) {
+ /*
+ * Assuming that nops+cops is not super huge, we will be
+ * able to fit a new object's entries into one leaf
+ * block. So there will be at most 2 blocks total,
+ * including the header block.
+ */
+ dmu_tx_count_write(tx, dn, 0, 2 << ZAP_BLOCK_SHIFT);
+ return;
+ }
+
+ ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap);
+
+ if (dn->dn_maxblkid == 0 && nops == 0) {
+ /*
+ * If there is only one block (i.e. this is a micro-zap)
+ * and we are only doing updates, the accounting is simple.
+ */
+ if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
+ dn->dn_phys->dn_blkptr[0].blk_birth, tx))
+ tx->tx_space_tooverwrite += dn->dn_datablksz;
+ else
+ tx->tx_space_towrite += dn->dn_datablksz;
+ return;
+ }
+
+ /*
+ * 3 blocks overwritten per op: target leaf, ptrtbl block, header block
+ * 3 new blocks written per op: new split leaf, 2 grown ptrtbl blocks
+ */
+ dmu_tx_count_write(tx, dn, dn->dn_maxblkid * dn->dn_datablksz,
+ (nops * 6ULL + cops * 3ULL) << ZAP_BLOCK_SHIFT);
+
+ /*
+ * If the modified blocks are scattered to the four winds,
+ * we'll have to modify an indirect twig for each.
+ */
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
+ tx->tx_space_towrite +=
+ ((nops + cops) * 3ULL) << dn->dn_indblkshift;
+}
+
+void
+dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int ops)
+{
+ ASSERT(tx->tx_txg == 0);
+
+ dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_ZAP,
+ dmu_tx_hold_zap_impl, (ops > 0?ops:0), (ops < 0?-ops:0));
+}
+
+void
+dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
+{
+ ASSERT(tx->tx_txg == 0);
+
+ dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_BONUS,
+ dmu_tx_hold_write_impl, 0, 0);
+}
+
+
+/* ARGSUSED */
+static void
+dmu_tx_hold_space_impl(dmu_tx_t *tx, dnode_t *dn,
+ uint64_t space, uint64_t unused)
+{
+ tx->tx_space_towrite += space;
+}
+
+void
+dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
+{
+ ASSERT(tx->tx_txg == 0);
+
+ dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, THT_SPACE,
+ dmu_tx_hold_space_impl, space, 0);
+}
+
+/*
+ * Count how many holds this (already-assigned) tx has on the given
+ * object, by walking the tx's hold list.
+ */
+int
+dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
+{
+ dmu_tx_hold_t *dth;
+ int holds = 0;
+
+ /*
+ * By asserting that the tx is assigned, we're counting the
+ * number of dn_tx_holds, which is the same as the number of
+ * dn_holds. Otherwise, we'd be counting dn_holds, but
+ * dn_tx_holds could be 0.
+ */
+ ASSERT(tx->tx_txg != 0);
+
+ /* if (tx->tx_anyobj == TRUE) */
+ /* return (0); */
+
+ for (dth = list_head(&tx->tx_holds); dth;
+ dth = list_next(&tx->tx_holds, dth)) {
+ if (dth->dth_dnode && dth->dth_dnode->dn_object == object)
+ holds++;
+ }
+
+ return (holds);
+}
+
+/*
+ * Debug-only sanity check (compiled under ZFS_DEBUG): verify that the
+ * dbuf being dirtied is covered by some hold on this tx -- matching
+ * both the object and, per hold type, the block/offset range.  Panics
+ * if no hold covers the dbuf.  No-op in non-debug builds.
+ */
+void
+dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
+{
+#ifdef ZFS_DEBUG
+ dmu_tx_hold_t *dth;
+ int match_object = FALSE, match_offset = FALSE;
+ dnode_t *dn = db->db_dnode;
+
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os);
+ ASSERT3U(dn->dn_object, ==, db->db.db_object);
+
+ if (tx->tx_anyobj)
+ return;
+
+ /* XXX No checking on the meta dnode for now */
+ if (db->db.db_object & DMU_PRIVATE_OBJECT)
+ return;
+
+ for (dth = list_head(&tx->tx_holds); dth;
+ dth = list_next(&tx->tx_holds, dth)) {
+ ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
+ if (dth->dth_dnode == dn && dth->dth_type != THT_NEWOBJECT)
+ match_object = TRUE;
+ if (dth->dth_dnode == NULL || dth->dth_dnode == dn) {
+ int datablkshift = dn->dn_datablkshift ?
+ dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int shift = datablkshift + epbs * db->db_level;
+ uint64_t beginblk = shift >= 64 ? 0 :
+ (dth->dth_arg1 >> shift);
+ uint64_t endblk = shift >= 64 ? 0 :
+ ((dth->dth_arg1 + dth->dth_arg2 - 1) >> shift);
+ uint64_t blkid = db->db_blkid;
+
+ /* XXX dth_arg2 better not be zero... */
+
+ dprintf("found dth type %x beginblk=%llx endblk=%llx\n",
+ dth->dth_type, beginblk, endblk);
+
+ switch (dth->dth_type) {
+ case THT_WRITE:
+ if (blkid >= beginblk && blkid <= endblk)
+ match_offset = TRUE;
+ /*
+ * We will let this hold work for the bonus
+ * buffer so that we don't need to hold it
+ * when creating a new object.
+ */
+ if (blkid == DB_BONUS_BLKID)
+ match_offset = TRUE;
+ /*
+ * They might have to increase nlevels,
+ * thus dirtying the new TLIBs. Or they
+ * might have to change the block size,
+ * thus dirtying the new lvl=0 blk=0.
+ */
+ if (blkid == 0)
+ match_offset = TRUE;
+ break;
+ case THT_FREE:
+ if (blkid == beginblk &&
+ (dth->dth_arg1 != 0 ||
+ dn->dn_maxblkid == 0))
+ match_offset = TRUE;
+ if (blkid == endblk &&
+ dth->dth_arg2 != DMU_OBJECT_END)
+ match_offset = TRUE;
+ break;
+ case THT_BONUS:
+ if (blkid == DB_BONUS_BLKID)
+ match_offset = TRUE;
+ break;
+ case THT_ZAP:
+ match_offset = TRUE;
+ break;
+ case THT_NEWOBJECT:
+ match_object = TRUE;
+ break;
+ default:
+ ASSERT(!"bad dth_type");
+ }
+ }
+ if (match_object && match_offset)
+ return;
+ }
+ panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
+ (u_longlong_t)db->db.db_object, db->db_level,
+ (u_longlong_t)db->db_blkid);
+#endif
+}
+
+/*
+ * Attempt to assign the tx to the currently open txg: take a tx hold
+ * on each dnode in the hold list, run each hold's accounting callback,
+ * then temp-reserve worst-case allocated space against tx_dir.
+ * Returns ERESTART (or the tempreserve error) on failure; *last_dth
+ * records the last hold processed so dmu_tx_unassign() can unwind.
+ */
+static int
+dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how, dmu_tx_hold_t **last_dth)
+{
+ dmu_tx_hold_t *dth;
+ uint64_t lsize, asize, fsize;
+
+ *last_dth = NULL;
+
+ tx->tx_space_towrite = 0;
+ tx->tx_space_tofree = 0;
+ tx->tx_space_tooverwrite = 0;
+ tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
+
+ /* caller asked for a specific txg; fail if the open txg differs */
+ if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg)
+ return (ERESTART);
+
+ for (dth = list_head(&tx->tx_holds); dth;
+ *last_dth = dth, dth = list_next(&tx->tx_holds, dth)) {
+ dnode_t *dn = dth->dth_dnode;
+ if (dn != NULL) {
+ mutex_enter(&dn->dn_mtx);
+ /*
+ * If the dnode is still assigned to the previous
+ * txg, wait for its holders to finish -- or fail
+ * immediately if we're not allowed to block.
+ */
+ while (dn->dn_assigned_txg == tx->tx_txg - 1) {
+ if (txg_how != TXG_WAIT) {
+ mutex_exit(&dn->dn_mtx);
+ return (ERESTART);
+ }
+ cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
+ }
+ if (dn->dn_assigned_txg == 0) {
+ ASSERT(dn->dn_assigned_tx == NULL);
+ dn->dn_assigned_txg = tx->tx_txg;
+ dn->dn_assigned_tx = tx;
+ } else {
+ ASSERT(dn->dn_assigned_txg == tx->tx_txg);
+ if (dn->dn_assigned_tx != tx)
+ dn->dn_assigned_tx = NULL;
+ }
+ (void) refcount_add(&dn->dn_tx_holds, tx);
+ mutex_exit(&dn->dn_mtx);
+ }
+ if (dth->dth_func)
+ dth->dth_func(tx, dn, dth->dth_arg1, dth->dth_arg2);
+ }
+
+ /*
+ * Convert logical size to worst-case allocated size.
+ */
+ fsize = spa_get_asize(tx->tx_pool->dp_spa, tx->tx_space_tooverwrite) +
+ tx->tx_space_tofree;
+ lsize = tx->tx_space_towrite + tx->tx_space_tooverwrite;
+ asize = spa_get_asize(tx->tx_pool->dp_spa, lsize);
+ tx->tx_space_towrite = asize;
+
+ if (tx->tx_dir && asize != 0) {
+ int err = dsl_dir_tempreserve_space(tx->tx_dir,
+ lsize, asize, fsize, &tx->tx_tempreserve_cookie, tx);
+ if (err)
+ return (err);
+ }
+
+ return (0);
+}
+
+/*
+ * Undo a (possibly partial) dmu_tx_try_assign(): walk the hold list
+ * backwards from last_dth, dropping each dnode's tx hold and waking
+ * any waiters when the last hold goes away.  Returns the txg that was
+ * being assigned, and leaves tx_txg cleared.
+ */
+static uint64_t
+dmu_tx_unassign(dmu_tx_t *tx, dmu_tx_hold_t *last_dth)
+{
+ uint64_t txg = tx->tx_txg;
+ dmu_tx_hold_t *dth;
+
+ ASSERT(txg != 0);
+
+ txg_rele_to_quiesce(&tx->tx_txgh);
+
+ for (dth = last_dth; dth; dth = list_prev(&tx->tx_holds, dth)) {
+ dnode_t *dn = dth->dth_dnode;
+
+ if (dn == NULL)
+ continue;
+ mutex_enter(&dn->dn_mtx);
+ ASSERT3U(dn->dn_assigned_txg, ==, txg);
+
+ if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
+ dn->dn_assigned_txg = 0;
+ dn->dn_assigned_tx = NULL;
+ cv_broadcast(&dn->dn_notxholds);
+ }
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ txg_rele_to_sync(&tx->tx_txgh);
+
+ tx->tx_txg = 0;
+ return (txg);
+}
+
+/*
+ * Assign tx to a transaction group. txg_how can be one of:
+ *
+ * (1) TXG_WAIT. If the current open txg is full, waits until there's
+ * a new one. This should be used when you're not holding locks.
+ * It will only fail if we're truly out of space (or over quota).
+ *
+ * (2) TXG_NOWAIT. If we can't assign into the current open txg without
+ * blocking, returns immediately with ERESTART. This should be used
+ * whenever you're holding locks. On an ERESTART error, the caller
+ * should drop locks, do a txg_wait_open(dp, 0), and try again.
+ *
+ * (3) A specific txg. Use this if you need to ensure that multiple
+ * transactions all sync in the same txg. Like TXG_NOWAIT, it
+ * returns ERESTART if it can't assign you into the requested txg.
+ */
+int
+dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
+{
+ dmu_tx_hold_t *last_dth;
+ int err;
+
+ ASSERT(tx->tx_txg == 0);
+ ASSERT(txg_how != 0);
+ ASSERT(!dsl_pool_sync_context(tx->tx_pool));
+ ASSERT3U(tx->tx_space_towrite, ==, 0);
+ ASSERT3U(tx->tx_space_tofree, ==, 0);
+
+ while ((err = dmu_tx_try_assign(tx, txg_how, &last_dth)) != 0) {
+ uint64_t txg = dmu_tx_unassign(tx, last_dth);
+
+ if (err != ERESTART || txg_how != TXG_WAIT)
+ return (err);
+
+ /* TXG_WAIT: wait for the next txg to open, then retry */
+ txg_wait_open(tx->tx_pool, txg + 1);
+ }
+
+ /* keep only the sync hold; dmu_tx_commit() will release it */
+ txg_rele_to_quiesce(&tx->tx_txgh);
+
+ return (0);
+}
+
+/*
+ * Record space actually consumed (delta > 0) or freed (delta < 0) by
+ * this tx, for cross-checking against the reserved estimates at commit
+ * time.  No-op for txs without a dsl_dir or a zero delta.
+ */
+void
+dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
+{
+ if (tx->tx_dir == NULL || delta == 0)
+ return;
+
+ if (delta > 0) {
+ /* actual usage must never exceed what was reserved */
+ ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
+ tx->tx_space_towrite);
+ (void) refcount_add_many(&tx->tx_space_written, delta, NULL);
+ } else {
+ (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
+ }
+}
+
+/*
+ * Commit an assigned tx: release every dnode hold (waking waiters on
+ * the last one), clear the temporary space reservation, drop the txg
+ * sync hold (unless tx_anyobj), and free the tx and its accounting
+ * refcounts.
+ */
+void
+dmu_tx_commit(dmu_tx_t *tx)
+{
+ dmu_tx_hold_t *dth;
+
+ ASSERT(tx->tx_txg != 0);
+
+ while (dth = list_head(&tx->tx_holds)) {
+ dnode_t *dn = dth->dth_dnode;
+
+ list_remove(&tx->tx_holds, dth);
+ kmem_free(dth, sizeof (dmu_tx_hold_t));
+ if (dn == NULL)
+ continue;
+ mutex_enter(&dn->dn_mtx);
+ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+
+ if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
+ dn->dn_assigned_txg = 0;
+ dn->dn_assigned_tx = NULL;
+ cv_broadcast(&dn->dn_notxholds);
+ }
+ mutex_exit(&dn->dn_mtx);
+ dnode_rele(dn, tx);
+ }
+
+ if (tx->tx_dir && tx->tx_space_towrite > 0) {
+ dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
+ }
+
+ if (tx->tx_anyobj == FALSE)
+ txg_rele_to_sync(&tx->tx_txgh);
+ dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
+ tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
+ tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
+ refcount_destroy_many(&tx->tx_space_written,
+ refcount_count(&tx->tx_space_written));
+ refcount_destroy_many(&tx->tx_space_freed,
+ refcount_count(&tx->tx_space_freed));
+#ifdef ZFS_DEBUG
+ if (tx->tx_debug_buf)
+ kmem_free(tx->tx_debug_buf, 4096);
+#endif
+ kmem_free(tx, sizeof (dmu_tx_t));
+}
+
+/*
+ * Abort an unassigned tx (tx_txg must be 0): release dnode holds and
+ * free the tx without touching any txg state.
+ */
+void
+dmu_tx_abort(dmu_tx_t *tx)
+{
+ dmu_tx_hold_t *dth;
+
+ ASSERT(tx->tx_txg == 0);
+
+ while (dth = list_head(&tx->tx_holds)) {
+ dnode_t *dn = dth->dth_dnode;
+
+ list_remove(&tx->tx_holds, dth);
+ kmem_free(dth, sizeof (dmu_tx_hold_t));
+ if (dn != NULL)
+ dnode_rele(dn, tx);
+ }
+ refcount_destroy_many(&tx->tx_space_written,
+ refcount_count(&tx->tx_space_written));
+ refcount_destroy_many(&tx->tx_space_freed,
+ refcount_count(&tx->tx_space_freed));
+#ifdef ZFS_DEBUG
+ if (tx->tx_debug_buf)
+ kmem_free(tx->tx_debug_buf, 4096);
+#endif
+ kmem_free(tx, sizeof (dmu_tx_t));
+}
+
+/* Return the txg this tx has been assigned to; the tx must be assigned. */
+uint64_t
+dmu_tx_get_txg(dmu_tx_t *tx)
+{
+ ASSERT(tx->tx_txg != 0);
+ return (tx->tx_txg);
+}
diff --git a/usr/src/uts/common/fs/zfs/dmu_zfetch.c b/usr/src/uts/common/fs/zfs/dmu_zfetch.c
new file mode 100644
index 0000000000..cfaeaf0674
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dmu_zfetch.c
@@ -0,0 +1,603 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dnode.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/dmu.h>
+#include <sys/dbuf.h>
+
+/*
+ * I'm against tune-ables, but these should probably exist as tweakable globals
+ * until we can get this working the way we want it to.
+ */
+
+/* max # of streams per zfetch */
+uint32_t zfetch_max_streams = 8;
+/* min time before stream reclaim */
+uint32_t zfetch_min_sec_reap = 2;
+/* max number of blocks to fetch at a time */
+uint32_t zfetch_block_cap = 32;
+/* number of bytes in an array_read at which we stop prefetching (1MB) */
+uint64_t zfetch_array_rd_sz = 1024 * 1024;
+
+/* forward decls for static routines */
+static int dmu_zfetch_colinear(zfetch_t *, zstream_t *);
+static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
+static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
+static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
+static int dmu_zfetch_find(zfetch_t *, zstream_t *);
+static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
+static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *);
+static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
+static void dmu_zfetch_stream_update(zfetch_t *, zstream_t *);
+static int dmu_zfetch_streams_equal(zstream_t *, zstream_t *);
+
+
+/*
+ * Given a zfetch structure and a zstream structure, determine whether the
+ * blocks to be read are part of a co-linear to a pair of existing prefetch
+ * streams. If a set is found, coalesce the streams, removing one, and
+ * configure the prefetch so it looks for a strided access pattern.
+ *
+ * Returns 1 (and kicks off a prefetch) if a co-linear pair was found
+ * and coalesced; returns 0 otherwise.
+ */
+static int
+dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
+{
+ zstream_t *z_walk;
+ zstream_t *z_comp;
+
+ rw_enter(&zf->zf_rwlock, RW_WRITER);
+
+ if (zh == NULL) {
+ rw_exit(&zf->zf_rwlock);
+ return (0);
+ }
+
+ /* check every ordered pair of streams (both stride directions) */
+ for (z_walk = list_head(&zf->zf_stream); z_walk;
+ z_walk = list_next(&zf->zf_stream, z_walk)) {
+ for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp;
+ z_comp = list_next(&zf->zf_stream, z_comp)) {
+ int64_t diff;
+
+ /* only consider streams not already strided */
+ if (z_walk->zst_len != z_walk->zst_stride ||
+ z_comp->zst_len != z_comp->zst_stride) {
+ continue;
+ }
+
+ /* does zh extend the walk->comp progression? */
+ diff = z_comp->zst_offset - z_walk->zst_offset;
+ if (z_comp->zst_offset + diff == zh->zst_offset) {
+ z_walk->zst_offset = zh->zst_offset;
+ z_walk->zst_direction = diff < 0 ? -1 : 1;
+ z_walk->zst_stride =
+ diff * z_walk->zst_direction;
+ z_walk->zst_ph_offset =
+ zh->zst_offset + z_walk->zst_stride;
+ dmu_zfetch_stream_remove(zf, z_comp);
+ mutex_destroy(&z_comp->zst_lock);
+ kmem_free(z_comp, sizeof (zstream_t));
+
+ dmu_zfetch_dofetch(zf, z_walk);
+
+ rw_exit(&zf->zf_rwlock);
+ return (1);
+ }
+
+ /* ... or the comp->walk progression? */
+ diff = z_walk->zst_offset - z_comp->zst_offset;
+ if (z_walk->zst_offset + diff == zh->zst_offset) {
+ z_walk->zst_offset = zh->zst_offset;
+ z_walk->zst_direction = diff < 0 ? -1 : 1;
+ z_walk->zst_stride =
+ diff * z_walk->zst_direction;
+ z_walk->zst_ph_offset =
+ zh->zst_offset + z_walk->zst_stride;
+ dmu_zfetch_stream_remove(zf, z_comp);
+ mutex_destroy(&z_comp->zst_lock);
+ kmem_free(z_comp, sizeof (zstream_t));
+
+ dmu_zfetch_dofetch(zf, z_walk);
+
+ rw_exit(&zf->zf_rwlock);
+ return (1);
+ }
+ }
+ }
+
+ rw_exit(&zf->zf_rwlock);
+ return (0);
+}
+
+/*
+ * Given a zstream_t, determine the bounds of the prefetch. Then call the
+ * routine that actually prefetches the individual blocks.  Each call also
+ * doubles the stream's block cap (up to zfetch_block_cap), so streams
+ * ramp up as they keep getting hits.
+ */
+static void
+dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
+{
+ uint64_t prefetch_tail;
+ uint64_t prefetch_limit;
+ uint64_t prefetch_ofst;
+ uint64_t prefetch_len;
+ uint64_t blocks_fetched;
+
+ zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len);
+ zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap);
+
+ /* resume at the previous prefetch point, or just past the access */
+ prefetch_tail = MAX((int64_t)zs->zst_ph_offset,
+ (int64_t)(zs->zst_offset + zs->zst_stride));
+ /*
+ * XXX: use a faster division method?
+ */
+ prefetch_limit = zs->zst_offset + zs->zst_len +
+ (zs->zst_cap * zs->zst_stride) / zs->zst_len;
+
+ while (prefetch_tail < prefetch_limit) {
+ prefetch_ofst = zs->zst_offset + zs->zst_direction *
+ (prefetch_tail - zs->zst_offset);
+
+ prefetch_len = zs->zst_len;
+
+ /*
+ * Don't prefetch beyond the end of the file, if working
+ * backwards.
+ */
+ if ((zs->zst_direction == ZFETCH_BACKWARD) &&
+ (prefetch_ofst > prefetch_tail)) {
+ prefetch_len += prefetch_ofst;
+ prefetch_ofst = 0;
+ }
+
+ /* don't prefetch more than we're supposed to */
+ if (prefetch_len > zs->zst_len)
+ break;
+
+ blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode,
+ prefetch_ofst, zs->zst_len);
+
+ prefetch_tail += zs->zst_stride;
+ /* stop if we've run out of stuff to prefetch */
+ if (blocks_fetched < zs->zst_len)
+ break;
+ }
+ /* remember where we stopped, and when, for reclaim/resume */
+ zs->zst_ph_offset = prefetch_tail;
+ zs->zst_last = lbolt;
+}
+
+/*
+ * This takes a pointer to a zfetch structure and a dnode. It performs the
+ * necessary setup for the zfetch structure, grokking data from the
+ * associated dnode: zeroed counters, an empty stream list, and a fresh
+ * rwlock.  Safe to call with zf == NULL (no-op).
+ */
+void
+dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
+{
+ if (zf == NULL) {
+ return;
+ }
+
+ zf->zf_dnode = dno;
+ zf->zf_stream_cnt = 0;
+ zf->zf_alloc_fail = 0;
+
+ list_create(&zf->zf_stream, sizeof (zstream_t),
+ offsetof(zstream_t, zst_node));
+
+ rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
+}
+
+/*
+ * This function computes the actual size, in blocks, that can be prefetched,
+ * and fetches it (one dbuf_prefetch() call per block).  Returns the number
+ * of blocks actually issued.
+ */
+static uint64_t
+dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
+{
+ uint64_t fetchsz;
+ uint64_t i;
+
+ fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);
+
+ for (i = 0; i < fetchsz; i++) {
+ dbuf_prefetch(dn, blkid + i);
+ }
+
+ return (fetchsz);
+}
+
+/*
+ * this function returns the number of blocks that would be prefetched, based
+ * upon the supplied dnode, blockid, and nblks. This is used so that we can
+ * update streams in place, and then prefetch with their old value after the
+ * fact. This way, we can delay the prefetch, but subsequent accesses to the
+ * stream won't result in the same data being prefetched multiple times.
+ * Requests starting beyond dn_maxblkid yield zero; requests running past
+ * it are clamped to the remaining blocks.
+ */
+static uint64_t
+dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
+{
+ uint64_t fetchsz;
+
+ if (blkid > dn->dn_maxblkid) {
+ return (0);
+ }
+
+ /* compute fetch size */
+ if (blkid + nblks > dn->dn_maxblkid) {
+ fetchsz = dn->dn_maxblkid - blkid;
+ ASSERT(blkid + fetchsz <= dn->dn_maxblkid);
+ } else {
+ fetchsz = nblks;
+ }
+
+
+ return (fetchsz);
+}
+
+/*
+ * given a zfetch and a zsearch structure, see if there is an associated zstream
+ * for this block read. If so, it starts a prefetch for the stream it
+ * located and returns true, otherwise it returns false.
+ *
+ * Locking: each match path takes the stream's zst_lock, then re-checks
+ * the match condition (it may have changed while acquiring the lock)
+ * and restarts the scan from "top" if it no longer holds.  When the
+ * loop breaks with zs != NULL, zst_lock is still held and is released
+ * after dmu_zfetch_dofetch() below.
+ */
+static int
+dmu_zfetch_find(zfetch_t *zf, zstream_t *zh)
+{
+ zstream_t *zs;
+ int64_t diff;
+ int rc = 0;
+
+ if (zh == NULL)
+ return (0);
+
+ /*
+ * XXX: This locking strategy is a bit coarse; however, it's impact has
+ * yet to be tested. If this turns out to be an issue, it can be
+ * modified in a number of different ways.
+ */
+
+ rw_enter(&zf->zf_rwlock, RW_READER);
+top:
+
+ for (zs = list_head(&zf->zf_stream); zs;
+ zs = list_next(&zf->zf_stream, zs)) {
+
+
+ if (zs->zst_len == 0) {
+ /* bogus stream */
+ continue;
+ }
+
+ /*
+ * NOTE(review): this relies on unsigned arithmetic --
+ * offsets below zst_offset wrap to a huge value and
+ * correctly fail the "< zst_len" test.
+ */
+ if (zh->zst_offset - zs->zst_offset < zs->zst_len) {
+ /* already fetched */
+ rw_exit(&zf->zf_rwlock);
+ return (1);
+ }
+
+ if (zh->zst_offset == zs->zst_offset + zs->zst_len) {
+ /* forward sequential access */
+
+ mutex_enter(&zs->zst_lock);
+
+ if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
+ mutex_exit(&zs->zst_lock);
+ goto top;
+ }
+
+ zs->zst_len += zh->zst_len;
+ diff = zs->zst_len - zfetch_block_cap;
+ if (diff > 0) {
+ /* cap the stream length: slide the window forward */
+ zs->zst_offset += diff;
+ zs->zst_len = zs->zst_len > diff ?
+ zs->zst_len - diff : 0;
+ }
+ zs->zst_direction = ZFETCH_FORWARD;
+
+ break;
+
+ } else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
+ /* backwards sequential access */
+
+ mutex_enter(&zs->zst_lock);
+
+ if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
+ mutex_exit(&zs->zst_lock);
+ goto top;
+ }
+
+ zs->zst_offset = zs->zst_offset > zh->zst_len ?
+ zs->zst_offset - zh->zst_len : 0;
+ zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
+ zs->zst_ph_offset - zh->zst_len : 0;
+ zs->zst_len += zh->zst_len;
+
+ diff = zs->zst_len - zfetch_block_cap;
+ if (diff > 0) {
+ zs->zst_ph_offset = zs->zst_ph_offset > diff ?
+ zs->zst_ph_offset - diff : 0;
+ zs->zst_len = zs->zst_len > diff ?
+ zs->zst_len - diff : zs->zst_len;
+ }
+ zs->zst_direction = ZFETCH_BACKWARD;
+
+ break;
+
+ } else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
+ zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
+ /* strided forward access */
+
+ mutex_enter(&zs->zst_lock);
+
+ if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
+ zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
+ mutex_exit(&zs->zst_lock);
+ goto top;
+ }
+
+ zs->zst_offset += zs->zst_stride;
+ zs->zst_direction = ZFETCH_FORWARD;
+
+ break;
+
+ } else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
+ zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
+ /* strided reverse access */
+
+ mutex_enter(&zs->zst_lock);
+
+ if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
+ zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
+ mutex_exit(&zs->zst_lock);
+ goto top;
+ }
+
+ zs->zst_offset = zs->zst_offset > zs->zst_stride ?
+ zs->zst_offset - zs->zst_stride : 0;
+ zs->zst_ph_offset = (zs->zst_ph_offset >
+ (2 * zs->zst_stride)) ?
+ (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
+ zs->zst_direction = ZFETCH_BACKWARD;
+
+ break;
+ }
+ }
+
+ /* matched a stream: zst_lock is still held from the loop above */
+ if (zs) {
+ rc = 1;
+ dmu_zfetch_dofetch(zf, zs);
+ mutex_exit(&zs->zst_lock);
+ }
+
+ rw_exit(&zf->zf_rwlock);
+ return (rc);
+}
+
+/*
+ * Clean-up state associated with a zfetch structure. This frees allocated
+ * structure members, empties the zf_stream tree, and generally makes things
+ * nice. This doesn't free the zfetch_t itself, that's left to the caller.
+ * The rwlock must not be held -- it is destroyed here.
+ */
+void
+dmu_zfetch_rele(zfetch_t *zf)
+{
+ zstream_t *zs;
+ zstream_t *zs_next;
+
+ ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
+
+ for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) {
+ zs_next = list_next(&zf->zf_stream, zs);
+
+ list_remove(&zf->zf_stream, zs);
+ mutex_destroy(&zs->zst_lock);
+ kmem_free(zs, sizeof (zstream_t));
+ }
+ list_destroy(&zf->zf_stream);
+ rw_destroy(&zf->zf_rwlock);
+
+ zf->zf_dnode = NULL;
+}
+
+/*
+ * Given a zfetch and zstream structure, insert the zstream structure into the
+ * stream list contained within the zfetch structure. Perform the appropriate
+ * book-keeping. It is possible that another thread has inserted a stream which
+ * matches one that we are about to insert, so we must be sure to check for this
+ * case. If one is found, return failure, and let the caller cleanup the
+ * duplicates.
+ */
+static int
+dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
+{
+ zstream_t *zs_walk;
+ zstream_t *zs_next;
+
+ ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
+
+ /* refuse to insert a duplicate of an existing stream */
+ for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) {
+ zs_next = list_next(&zf->zf_stream, zs_walk);
+
+ if (dmu_zfetch_streams_equal(zs_walk, zs)) {
+ return (0);
+ }
+ }
+
+ list_insert_head(&zf->zf_stream, zs);
+ zf->zf_stream_cnt++;
+
+ return (1);
+}
+
+
+/*
+ * Walk the list of zstreams in the given zfetch, find an old one (by time), and
+ * reclaim it for use by the caller.  The reclaimed stream is returned
+ * zeroed with its zst_lock destroyed; the caller (see dmu_zfetch())
+ * re-initializes the lock before reuse.  Returns NULL (and bumps
+ * zf_alloc_fail) if no stream is old enough.
+ */
+static zstream_t *
+dmu_zfetch_stream_reclaim(zfetch_t *zf)
+{
+ zstream_t *zs;
+
+ rw_enter(&zf->zf_rwlock, RW_WRITER);
+
+ for (zs = list_head(&zf->zf_stream); zs;
+ zs = list_next(&zf->zf_stream, zs)) {
+
+ if (((lbolt - zs->zst_last) / hz) > zfetch_min_sec_reap)
+ break;
+ }
+
+ if (zs) {
+ dmu_zfetch_stream_remove(zf, zs);
+ mutex_destroy(&zs->zst_lock);
+ bzero(zs, sizeof (zstream_t));
+ } else {
+ zf->zf_alloc_fail++;
+ }
+ rw_exit(&zf->zf_rwlock);
+
+ return (zs);
+}
+
+/*
+ * Given a zfetch and zstream structure, remove the zstream structure from its
+ * container in the zfetch structure. Perform the appropriate book-keeping.
+ */
+static void
+dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
+{
+ ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
+
+ list_remove(&zf->zf_stream, zs);
+ zf->zf_stream_cnt--;
+}
+
+/*
+ * Field-by-field equality test for two zstreams; used by
+ * dmu_zfetch_stream_insert() to reject duplicate streams.
+ * Returns 1 if equal, 0 otherwise.
+ */
+static int
+dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
+{
+ if (zs1->zst_offset != zs2->zst_offset)
+ return (0);
+
+ if (zs1->zst_len != zs2->zst_len)
+ return (0);
+
+ if (zs1->zst_stride != zs2->zst_stride)
+ return (0);
+
+ if (zs1->zst_ph_offset != zs2->zst_ph_offset)
+ return (0);
+
+ if (zs1->zst_cap != zs2->zst_cap)
+ return (0);
+
+ if (zs1->zst_direction != zs2->zst_direction)
+ return (0);
+
+ return (1);
+}
+
+/*
+ * This is the prefetch entry point. It calls all of the other dmu_zfetch
+ * routines to create, delete, find, or operate upon prefetch streams.
+ * "offset" and "size" are in bytes; they are converted to block numbers
+ * using the dnode's data block shift.
+ */
+void
+dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size)
+{
+ zstream_t zst;
+ zstream_t *newstream;
+ int fetched;
+ int inserted;
+ unsigned int blkshft;
+ uint64_t blksz;
+
+ /* files that aren't ln2 blocksz are only one block -- nothing to do */
+ if (!zf->zf_dnode->dn_datablkshift) {
+ return;
+ }
+
+ /* convert offset and size, into blockid and nblocks */
+ blkshft = zf->zf_dnode->dn_datablkshift;
+ blksz = (1 << blkshft);
+
+ bzero(&zst, sizeof (zstream_t));
+ zst.zst_offset = offset >> blkshft;
+ zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
+ P2ALIGN(offset, blksz)) >> blkshft;
+
+ /* try an existing stream first, then a co-linear pair */
+ fetched = dmu_zfetch_find(zf, &zst);
+ if (!fetched) {
+ fetched = dmu_zfetch_colinear(zf, &zst);
+ }
+
+ if (!fetched) {
+ newstream = dmu_zfetch_stream_reclaim(zf);
+
+ /*
+ * No stream could be reclaimed (the reclaim path has
+ * already dropped its lock); allocate one if we're
+ * under the per-file stream limit. Otherwise, give up
+ * and go home.
+ */
+ if (newstream == NULL) {
+ uint64_t maxblocks;
+ uint32_t max_streams;
+ uint32_t cur_streams;
+
+ cur_streams = zf->zf_stream_cnt;
+ maxblocks = zf->zf_dnode->dn_maxblkid;
+
+ max_streams = MIN(zfetch_max_streams,
+ (maxblocks / zfetch_block_cap));
+ if (max_streams == 0) {
+ max_streams++;
+ }
+
+ if (cur_streams >= max_streams) {
+ return;
+ }
+
+ newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
+ }
+
+ /* initialize the new (or reclaimed-and-zeroed) stream */
+ newstream->zst_offset = zst.zst_offset;
+ newstream->zst_len = zst.zst_len;
+ newstream->zst_stride = zst.zst_len;
+ newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
+ newstream->zst_cap = zst.zst_len;
+ newstream->zst_direction = ZFETCH_FORWARD;
+ newstream->zst_last = lbolt;
+
+ mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ rw_enter(&zf->zf_rwlock, RW_WRITER);
+ inserted = dmu_zfetch_stream_insert(zf, newstream);
+ rw_exit(&zf->zf_rwlock);
+
+ /* a racing thread inserted an identical stream; discard ours */
+ if (!inserted) {
+ mutex_destroy(&newstream->zst_lock);
+ kmem_free(newstream, sizeof (zstream_t));
+ }
+ }
+}
diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c
new file mode 100644
index 0000000000..6b25b35ab1
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dnode.c
@@ -0,0 +1,1304 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_zfetch.h>
+
+static int free_range_compar(const void *node1, const void *node2);
+
+static kmem_cache_t *dnode_cache;
+
+static dnode_phys_t dnode_phys_zero;
+
+int zfs_default_bs = SPA_MINBLOCKSHIFT;
+int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
+
+/*
+ * kmem cache constructor for dnode_t: zero the object and initialize
+ * its locks, refcounts, per-txg free-range AVL trees and dirty-dbuf
+ * lists, and the dbuf list.
+ */
+/* ARGSUSED */
+static int
+dnode_cons(void *arg, void *unused, int kmflag)
+{
+ int i;
+ dnode_t *dn = arg;
+ bzero(dn, sizeof (dnode_t));
+
+ rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ refcount_create(&dn->dn_holds);
+ refcount_create(&dn->dn_tx_holds);
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ avl_create(&dn->dn_ranges[i], free_range_compar,
+ sizeof (free_range_t),
+ offsetof(struct free_range, fr_node));
+ list_create(&dn->dn_dirty_dbufs[i],
+ sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_dirty_node[i]));
+ }
+
+ list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_link));
+
+ return (0);
+}
+
+/*
+ * kmem cache destructor for dnode_t: tear down everything dnode_cons
+ * created, in matching order.
+ */
+/* ARGSUSED */
+static void
+dnode_dest(void *arg, void *unused)
+{
+ int i;
+ dnode_t *dn = arg;
+
+ rw_destroy(&dn->dn_struct_rwlock);
+ mutex_destroy(&dn->dn_mtx);
+ mutex_destroy(&dn->dn_dbufs_mtx);
+ refcount_destroy(&dn->dn_holds);
+ refcount_destroy(&dn->dn_tx_holds);
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ avl_destroy(&dn->dn_ranges[i]);
+ list_destroy(&dn->dn_dirty_dbufs[i]);
+ }
+
+ list_destroy(&dn->dn_dbufs);
+}
+
+/* Create the global dnode_t kmem cache (module initialization). */
+void
+dnode_init(void)
+{
+ dnode_cache = kmem_cache_create("dnode_t",
+ sizeof (dnode_t),
+ 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
+}
+
+/* Destroy the global dnode_t kmem cache (module teardown). */
+void
+dnode_fini(void)
+{
+ kmem_cache_destroy(dnode_cache);
+}
+
+
+/*
+ * Debug-only consistency checks on an in-core dnode against its phys
+ * counterpart.  Most checks are gated by the ZFS_DEBUG_DNODE_VERIFY
+ * flag; takes dn_struct_rwlock as reader if not already write-held.
+ * No-op in non-debug builds.
+ */
+void
+dnode_verify(dnode_t *dn)
+{
+#ifdef ZFS_DEBUG
+ int drop_struct_lock = FALSE;
+
+ ASSERT(dn->dn_phys);
+ ASSERT(dn->dn_objset);
+
+ ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
+
+ if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
+ return;
+
+ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ drop_struct_lock = TRUE;
+ }
+ if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
+ int i;
+ ASSERT3U(dn->dn_indblkshift, >=, 0);
+ ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
+ if (dn->dn_datablkshift) {
+ ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
+ ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
+ ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
+ }
+ ASSERT3U(dn->dn_nlevels, <=, 30);
+ ASSERT3U(dn->dn_type, <=, DMU_OT_NUMTYPES);
+ ASSERT3U(dn->dn_nblkptr, >=, 1);
+ ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
+ ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ ASSERT3U(dn->dn_datablksz, ==,
+ dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
+ ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
+ dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
+ }
+ }
+ if (dn->dn_phys->dn_type != DMU_OT_NONE)
+ ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
+ ASSERT(IS_DNODE_DNODE(dn->dn_object) || dn->dn_dbuf);
+ if (dn->dn_dbuf != NULL) {
+ /* dn_phys must point into its containing dbuf's data */
+ ASSERT3P(dn->dn_phys, ==,
+ (dnode_phys_t *)dn->dn_dbuf->db.db_data +
+ (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
+ }
+ if (drop_struct_lock)
+ rw_exit(&dn->dn_struct_rwlock);
+#endif
+}
+
+/*
+ * Byteswap a single on-disk dnode in place: the fixed header fields,
+ * then the block pointers, then (if present) the typed bonus buffer.
+ * A DMU_OT_NONE dnode is simply zeroed.
+ */
+void
+dnode_byteswap(dnode_phys_t *dnp)
+{
+ uint64_t *buf64 = (void*)&dnp->dn_blkptr;
+ int i;
+
+ if (dnp->dn_type == DMU_OT_NONE) {
+ bzero(dnp, sizeof (dnode_phys_t));
+ return;
+ }
+
+ dnp->dn_type = BSWAP_8(dnp->dn_type);
+ dnp->dn_indblkshift = BSWAP_8(dnp->dn_indblkshift);
+ dnp->dn_nlevels = BSWAP_8(dnp->dn_nlevels);
+ dnp->dn_nblkptr = BSWAP_8(dnp->dn_nblkptr);
+ dnp->dn_bonustype = BSWAP_8(dnp->dn_bonustype);
+ dnp->dn_checksum = BSWAP_8(dnp->dn_checksum);
+ dnp->dn_compress = BSWAP_8(dnp->dn_compress);
+ dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
+ dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
+ dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
+ dnp->dn_secphys = BSWAP_64(dnp->dn_secphys);
+
+ /*
+ * dn_nblkptr is only one byte, so it's OK to read it in either
+ * byte order. We can't read dn_bonuslen.
+ */
+ ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
+ ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
+ for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
+ buf64[i] = BSWAP_64(buf64[i]);
+
+ /*
+ * OK to check dn_bonuslen for zero, because it won't matter if
+ * we have the wrong byte order. This is necessary because the
+ * dnode dnode is smaller than a regular dnode.
+ */
+ if (dnp->dn_bonuslen != 0) {
+ /*
+ * Note that the bonus length calculated here may be
+ * longer than the actual bonus buffer. This is because
+ * we always put the bonus buffer after the last block
+ * pointer (instead of packing it against the end of the
+ * dnode buffer).
+ */
+ int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
+ size_t len = DN_MAX_BONUSLEN - off;
+ dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len);
+ }
+}
+
+/*
+ * Byteswap a buffer full of on-disk dnodes ("size" bytes, which must be
+ * a whole number of dnode_phys_t entries), one dnode at a time.
+ */
+void
+dnode_buf_byteswap(void *vbuf, size_t size)
+{
+ dnode_phys_t *buf = vbuf;
+ int i;
+
+ ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
+ ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
+
+ size >>= DNODE_SHIFT;
+ for (i = 0; i < size; i++) {
+ dnode_byteswap(buf);
+ buf++;
+ }
+}
+
+/*
+ * AVL comparator for the per-txg free-range trees: orders ranges by
+ * starting block id.
+ */
+static int
+free_range_compar(const void *node1, const void *node2)
+{
+ const free_range_t *rp1 = node1;
+ const free_range_t *rp2 = node2;
+
+ if (rp1->fr_blkid < rp2->fr_blkid)
+ return (-1);
+ else if (rp1->fr_blkid > rp2->fr_blkid)
+ return (1);
+ else return (0);
+}
+
+/*
+ * Set the dnode's data block size and derived fields.  dn_datablkshift
+ * is only meaningful for power-of-2 sizes and is left 0 otherwise.
+ */
+static void
+dnode_setdblksz(dnode_t *dn, int size)
+{
+ ASSERT3U(P2PHASE(size, SPA_MINBLOCKSIZE), ==, 0);
+ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
+ ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
+ 1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
+ dn->dn_datablksz = size;
+ dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
+ dn->dn_datablkshift = ISP2(size) ? highbit(size - 1) : 0;
+}
+
+/*
+ * Construct an in-core dnode_t for the on-disk dnode 'dnp'.  'db' is
+ * the meta-dnode dbuf containing dnp (NULL for "special" dnodes such
+ * as the meta-dnode itself, see dnode_special_open()).  The new dnode
+ * is linked onto the objset's os_dnodes list; holds are the caller's
+ * responsibility.
+ */
+static dnode_t *
+dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
+    uint64_t object)
+{
+	dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
+	(void) dnode_cons(dn, NULL, 0); /* XXX manually re-run constructor */
+
+	dn->dn_objset = os;
+	dn->dn_object = object;
+	dn->dn_dbuf = db;
+	dn->dn_phys = dnp;
+
+	/* cache the on-disk fields that are read frequently */
+	if (dnp->dn_datablkszsec)
+		dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+	dn->dn_indblkshift = dnp->dn_indblkshift;
+	dn->dn_nlevels = dnp->dn_nlevels;
+	dn->dn_type = dnp->dn_type;
+	dn->dn_nblkptr = dnp->dn_nblkptr;
+	dn->dn_checksum = dnp->dn_checksum;
+	dn->dn_compress = dnp->dn_compress;
+	dn->dn_bonustype = dnp->dn_bonustype;
+	dn->dn_bonuslen = dnp->dn_bonuslen;
+	dn->dn_maxblkid = dnp->dn_maxblkid;
+
+	dmu_zfetch_init(&dn->dn_zfetch, dn);
+
+	ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
+	mutex_enter(&os->os_lock);
+	list_insert_head(&os->os_dnodes, dn);
+	mutex_exit(&os->os_lock);
+
+	return (dn);
+}
+
+/*
+ * Tear down an in-core dnode: unlink it from the objset's dnode list,
+ * release auxiliary state, and return it to the kmem cache.
+ * NOTE(review): no check of dn_holds is done here -- callers are
+ * assumed to guarantee the dnode is no longer referenced.
+ */
+static void
+dnode_destroy(dnode_t *dn)
+{
+	objset_impl_t *os = dn->dn_objset;
+
+	mutex_enter(&os->os_lock);
+	list_remove(&os->os_dnodes, dn);
+	mutex_exit(&os->os_lock);
+
+	/* dn_dirtyctx_firstset is a 1-byte debug tag; see dnode_allocate() */
+	if (dn->dn_dirtyctx_firstset) {
+		kmem_free(dn->dn_dirtyctx_firstset, 1);
+		dn->dn_dirtyctx_firstset = NULL;
+	}
+	dmu_zfetch_rele(&dn->dn_zfetch);
+	kmem_cache_free(dnode_cache, dn);
+}
+
+/*
+ * Initialize a freshly allocated (previously DMU_OT_NONE) dnode with
+ * type 'ot', data block size 'blocksize' (0 selects the default,
+ * 1 << zfs_default_bs), indirect block shift 'ibs' (0 selects
+ * zfs_default_ibs), and an optional bonus buffer of 'bonuslen' bytes
+ * of type 'bonustype'.  Marks the dnode dirty in 'tx'.
+ */
+void
+dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	int i;
+
+	if (blocksize == 0)
+		blocksize = 1 << zfs_default_bs;
+
+	/* clamp both tunables into their supported ranges */
+	blocksize = MIN(MAX(blocksize, SPA_MINBLOCKSIZE), SPA_MAXBLOCKSIZE);
+
+	if (ibs == 0)
+		ibs = zfs_default_ibs;
+
+	ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
+
+	dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset,
+	    dn->dn_object, tx->tx_txg, blocksize, ibs);
+
+	/* the dnode must be completely virgin: unallocated and clean */
+	ASSERT(dn->dn_type == DMU_OT_NONE);
+	ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
+	ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
+	ASSERT(ot != DMU_OT_NONE);
+	ASSERT3U(ot, <, DMU_OT_NUMTYPES);
+	/* bonus type and length must be both set or both clear */
+	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+	    (bonustype != DMU_OT_NONE && bonuslen != 0));
+	ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
+	ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+	ASSERT(dn->dn_type == DMU_OT_NONE); /* (duplicate of the check above) */
+	ASSERT3U(dn->dn_maxblkid, ==, 0);
+	ASSERT3U(dn->dn_allocated_txg, ==, 0);
+	ASSERT3U(dn->dn_assigned_txg, ==, 0);
+	ASSERT(refcount_is_zero(&dn->dn_tx_holds));
+	ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
+	ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+
+	for (i = 0; i < TXG_SIZE; i++) {
+		ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
+		ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
+		ASSERT3U(dn->dn_dirtyblksz[i], ==, 0);
+		ASSERT3P(list_head(&dn->dn_dirty_dbufs[i]), ==, NULL);
+		ASSERT3U(avl_numnodes(&dn->dn_ranges[i]), ==, 0);
+	}
+
+	dn->dn_type = ot;
+	dnode_setdblksz(dn, blocksize);
+	dn->dn_indblkshift = ibs;
+	dn->dn_nlevels = 1;
+	/* block pointers fill the space the bonus buffer doesn't use */
+	dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+	dn->dn_bonustype = bonustype;
+	dn->dn_bonuslen = bonuslen;
+	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
+	dn->dn_compress = ZIO_COMPRESS_INHERIT;
+	dn->dn_dirtyctx = 0;
+
+	dn->dn_free_txg = 0;
+	if (dn->dn_dirtyctx_firstset) {
+		kmem_free(dn->dn_dirtyctx_firstset, 1);
+		dn->dn_dirtyctx_firstset = NULL;
+	}
+
+	dn->dn_allocated_txg = tx->tx_txg;
+	dnode_setdirty(dn, tx);
+}
+
+/*
+ * Re-initialize an existing (allocated, clean) dnode with a new type,
+ * block size, and bonus configuration, freeing its old contents if the
+ * shape changes.  Unlike dnode_allocate(), the dnode keeps its object
+ * number and holds.
+ */
+void
+dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = NULL;
+
+	ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
+	ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
+	ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0);
+	ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+	ASSERT(!(dn->dn_object & DMU_PRIVATE_OBJECT) || dmu_tx_private_ok(tx));
+	ASSERT(tx->tx_txg != 0);
+	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+	    (bonustype != DMU_OT_NONE && bonuslen != 0));
+	ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
+	ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+	/* the dnode must not be dirty in any txg */
+	ASSERT(dn->dn_dirtyblksz[0] == 0);
+	ASSERT(dn->dn_dirtyblksz[1] == 0);
+	ASSERT(dn->dn_dirtyblksz[2] == 0);
+	ASSERT(dn->dn_dirtyblksz[3] == 0);
+
+	/*
+	 * XXX I should really have a generation number to tell if we
+	 * need to do this...
+	 */
+	if (blocksize != dn->dn_datablksz ||
+	    dn->dn_bonustype != bonustype || dn->dn_bonuslen != bonuslen) {
+		/* free all old data */
+		dnode_free_range(dn, 0, -1ULL, tx);
+	}
+
+	/* change blocksize */
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	dnode_setdblksz(dn, blocksize);
+	dnode_setdirty(dn, tx);
+	/* don't need dd_dirty_mtx, dnode is already dirty */
+	ASSERT(dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] != 0);
+	dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] = blocksize;
+	rw_exit(&dn->dn_struct_rwlock);
+
+	/* change type */
+	dn->dn_type = ot;
+
+	if (dn->dn_bonuslen != bonuslen) {
+		/* change bonus size: resize the bonus dbuf to match */
+		if (bonuslen == 0)
+			bonuslen = 1; /* XXX can't have a zero-length bonus */
+		db = dbuf_hold_bonus(dn, FTAG);
+		dbuf_read(db);
+		mutex_enter(&db->db_mtx);
+		ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
+		ASSERT(db->db.db_data != NULL);
+		db->db.db_size = bonuslen;
+		mutex_exit(&db->db_mtx);
+		dbuf_dirty(db, tx);
+	}
+
+	/* change bonus size and type */
+	mutex_enter(&dn->dn_mtx);
+	dn->dn_bonustype = bonustype;
+	dn->dn_bonuslen = bonuslen;
+	dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
+	dn->dn_compress = ZIO_COMPRESS_INHERIT;
+	ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
+
+	dn->dn_allocated_txg = tx->tx_txg;
+	mutex_exit(&dn->dn_mtx);
+
+	/* drop the bonus-dbuf hold taken above, if any */
+	if (db)
+		dbuf_remove_ref(db, FTAG);
+}
+
+/*
+ * Close a "special" dnode opened with dnode_special_open().
+ * NOTE(review): assumes no outstanding holds remain; dnode_destroy()
+ * does not check.
+ */
+void
+dnode_special_close(dnode_t *dn)
+{
+	dnode_destroy(dn);
+}
+
+/*
+ * Open a "special" dnode -- one whose dnode_phys_t is not stored in
+ * the meta-dnode (so it has no containing dbuf), e.g. the meta-dnode
+ * itself.  Must be paired with dnode_special_close().
+ */
+dnode_t *
+dnode_special_open(objset_impl_t *os, dnode_phys_t *dnp, uint64_t object)
+{
+	dnode_t *dn = dnode_create(os, dnp, NULL, object);
+	dnode_verify(dn);
+	return (dn);
+}
+
+/*
+ * dbuf evict callback for a block of the meta-dnode: destroy the
+ * in-core child dnodes instantiated from this block (see
+ * dnode_hold_impl()) and free the children array itself.
+ */
+static void
+dnode_buf_pageout(dmu_buf_t *db, void *arg)
+{
+	dnode_t **children_dnodes = arg;
+	int i;
+	int epb = db->db_size >> DNODE_SHIFT;
+
+	for (i = 0; i < epb; i++) {
+		dnode_t *dn = children_dnodes[i];
+
+		if (dn == NULL)
+			continue;
+#ifdef ZFS_DEBUG
+		{
+			/*
+			 * 'n' is only needed for the debug checks; keep it
+			 * inside the #ifdef so non-debug builds don't get
+			 * an unused-variable warning.
+			 */
+			int n;
+
+			/*
+			 * If there are holds on this dnode, then there should
+			 * be holds on the dnode's containing dbuf as well; thus
+			 * it wouldn't be eligible for eviction and this
+			 * function would not have been called.
+			 */
+			ASSERT(refcount_is_zero(&dn->dn_holds));
+			ASSERT(list_head(&dn->dn_dbufs) == NULL);
+			ASSERT(refcount_is_zero(&dn->dn_tx_holds));
+
+			for (n = 0; n < TXG_SIZE; n++)
+				ASSERT(dn->dn_dirtyblksz[n] == 0);
+		}
+#endif
+		children_dnodes[i] = NULL;
+		dnode_destroy(dn);
+	}
+	kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+}
+
+/*
+ * Returns held dnode if the object number is valid, NULL if not.
+ * Note that this will succeed even for free dnodes, unless 'flag'
+ * contains DNODE_MUST_BE_ALLOCATED / DNODE_MUST_BE_FREE.  Pairs with
+ * dnode_rele(ref).
+ */
+dnode_t *
+dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, void *ref)
+{
+	int epb, idx;
+	int drop_struct_lock = FALSE;
+	uint64_t blk;
+	dnode_t *mdn, *dn;
+	dmu_buf_impl_t *db;
+	dnode_t **children_dnodes;
+
+	if (object == 0 || object >= DN_MAX_OBJECT)
+		return (NULL);
+
+	mdn = os->os_meta_dnode;
+
+	dnode_verify(mdn);
+
+	/* dbuf_hold() needs the meta-dnode's struct lock held */
+	if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
+		rw_enter(&mdn->dn_struct_rwlock, RW_READER);
+		drop_struct_lock = TRUE;
+	}
+
+	/* find and read the meta-dnode block containing this object */
+	blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
+
+	db = dbuf_hold(mdn, blk);
+	if (drop_struct_lock)
+		rw_exit(&mdn->dn_struct_rwlock);
+	dbuf_read(db);
+
+	ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
+	epb = db->db.db_size >> DNODE_SHIFT;
+
+	idx = object & (epb-1);
+
+	/*
+	 * The dbuf's user data is an array of in-core dnodes, one slot
+	 * per dnode in the block.  Create it on first use; if another
+	 * thread raced us and installed one first, use theirs.
+	 */
+	children_dnodes = dmu_buf_get_user(&db->db);
+	if (children_dnodes == NULL) {
+		dnode_t **winner;
+		children_dnodes = kmem_zalloc(epb * sizeof (dnode_t *),
+		    KM_SLEEP);
+		if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
+		    dnode_buf_pageout)) {
+			kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+			children_dnodes = winner;
+		}
+	}
+
+	/* same race-to-install pattern for the individual dnode slot */
+	if ((dn = children_dnodes[idx]) == NULL) {
+		dnode_t *winner;
+		dn = dnode_create(os, (dnode_phys_t *)db->db.db_data+idx,
+		    db, object);
+		winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn);
+		if (winner != NULL) {
+			dnode_destroy(dn);
+			dn = winner;
+		}
+	}
+
+	/* reject dnodes that don't match the caller's requirements */
+	mutex_enter(&dn->dn_mtx);
+	if (dn->dn_free_txg ||
+	    ((flag & DNODE_MUST_BE_ALLOCATED) && dn->dn_type == DMU_OT_NONE) ||
+	    ((flag & DNODE_MUST_BE_FREE) && dn->dn_type != DMU_OT_NONE)) {
+		mutex_exit(&dn->dn_mtx);
+		dbuf_rele(db);
+		return (NULL);
+	}
+	mutex_exit(&dn->dn_mtx);
+
+	/* the first hold on the dnode also holds its containing dbuf */
+	if (refcount_add(&dn->dn_holds, ref) == 1)
+		dbuf_add_ref(db, dn);
+
+	dnode_verify(dn);
+	ASSERT3P(dn->dn_dbuf, ==, db);
+	ASSERT3U(dn->dn_object, ==, object);
+	dbuf_rele(db);
+
+	return (dn);
+}
+
+/*
+ * Return held dnode if the object is allocated, NULL if not.
+ * Convenience wrapper around dnode_hold_impl() with
+ * DNODE_MUST_BE_ALLOCATED; release with dnode_rele(ref).
+ */
+dnode_t *
+dnode_hold(objset_impl_t *os, uint64_t object, void *ref)
+{
+	return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, ref));
+}
+
+/*
+ * Add an additional hold ('ref' is an opaque tag) to an already-held
+ * dnode.  The dnode must have at least one hold; use dnode_hold() to
+ * acquire the first one.
+ */
+void
+dnode_add_ref(dnode_t *dn, void *ref)
+{
+	ASSERT(refcount_count(&dn->dn_holds) > 0);
+	(void) refcount_add(&dn->dn_holds, ref);
+}
+
+/*
+ * Drop the hold tagged 'ref'.  When the last hold goes away, release
+ * the dnode's hold on its containing meta-dnode dbuf (taken in
+ * dnode_hold_impl()).
+ */
+void
+dnode_rele(dnode_t *dn, void *ref)
+{
+	uint64_t refs;
+
+	refs = refcount_remove(&dn->dn_holds, ref);
+	/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
+	if (refs == 0 && dn->dn_dbuf)
+		dbuf_remove_ref(dn->dn_dbuf, dn);
+}
+
+/*
+ * Mark the dnode dirty in this transaction group: record its block
+ * size in dn_dirtyblksz[], put it on the objset's per-txg dirty (or
+ * free) dnode list, take a "dirty hold", and dirty the containing
+ * meta-dnode dbuf and the dataset.  No-op for the special dnodes and
+ * when already dirty in this txg.
+ */
+void
+dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
+{
+	objset_impl_t *os = dn->dn_objset;
+	uint64_t txg = tx->tx_txg;
+
+	if (IS_DNODE_DNODE(dn->dn_object))
+		return;
+
+	dnode_verify(dn);
+
+#ifdef ZFS_DEBUG
+	mutex_enter(&dn->dn_mtx);
+	ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
+	/* ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); */
+	mutex_exit(&dn->dn_mtx);
+#endif
+
+	mutex_enter(&os->os_lock);
+
+	/*
+	 * If we are already marked dirty, we're done.
+	 */
+	if (dn->dn_dirtyblksz[txg&TXG_MASK] > 0) {
+		mutex_exit(&os->os_lock);
+		return;
+	}
+
+	ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
+	ASSERT(dn->dn_datablksz != 0);
+	dn->dn_dirtyblksz[txg&TXG_MASK] = dn->dn_datablksz;
+
+	dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
+	    dn->dn_object, txg);
+
+	/* a dnode being freed in this txg syncs from the free list instead */
+	if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) {
+		list_insert_tail(&os->os_free_dnodes[txg&TXG_MASK], dn);
+	} else {
+		list_insert_tail(&os->os_dirty_dnodes[txg&TXG_MASK], dn);
+	}
+
+	mutex_exit(&os->os_lock);
+
+	/*
+	 * The dnode maintains a hold on its containing dbuf as
+	 * long as there are holds on it.  Each instantiated child
+	 * dbuf maintains a hold on the dnode.  When the last child
+	 * drops its hold, the dnode will drop its hold on the
+	 * containing dbuf.  We add a "dirty hold" here so that the
+	 * dnode will hang around after we finish processing its
+	 * children.
+	 */
+	(void) refcount_add(&dn->dn_holds, (void *)(uintptr_t)tx->tx_txg);
+
+	dbuf_dirty(dn->dn_dbuf, tx);
+
+	dsl_dataset_dirty(os->os_dsl_dataset, tx);
+}
+
+/*
+ * Schedule this dnode to be freed in 'tx': record dn_free_txg and make
+ * sure the dnode ends up on the objset's per-txg free list (moving it
+ * off the dirty list if it was already dirty).  The actual freeing
+ * happens in syncing context.  No-op if already free or being freed.
+ */
+void
+dnode_free(dnode_t *dn, dmu_tx_t *tx)
+{
+	dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg);
+
+	/* we should be the only holder... hopefully */
+	/* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */
+
+	mutex_enter(&dn->dn_mtx);
+	if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
+		mutex_exit(&dn->dn_mtx);
+		return;
+	}
+	dn->dn_free_txg = tx->tx_txg;
+	mutex_exit(&dn->dn_mtx);
+
+	/*
+	 * If the dnode is already dirty, it needs to be moved from
+	 * the dirty list to the free list.
+	 */
+	mutex_enter(&dn->dn_objset->os_lock);
+	if (dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] > 0) {
+		list_remove(
+		    &dn->dn_objset->os_dirty_dnodes[tx->tx_txg&TXG_MASK], dn);
+		list_insert_tail(
+		    &dn->dn_objset->os_free_dnodes[tx->tx_txg&TXG_MASK], dn);
+		mutex_exit(&dn->dn_objset->os_lock);
+	} else {
+		/* not yet dirty: dnode_setdirty() puts it on the free list */
+		mutex_exit(&dn->dn_objset->os_lock);
+		dnode_setdirty(dn, tx);
+	}
+}
+
+/*
+ * Try to change the block size for the indicated dnode.  This can only
+ * succeed if there are no blocks allocated or dirty beyond the first
+ * block.  'size' is rounded/clamped to a legal SPA block size (0 means
+ * SPA_MINBLOCKSIZE) and 'ibs' of 0 keeps the current indirect shift.
+ * Returns 0 on success, ENOTSUP if the change cannot be made.
+ */
+int
+dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db, *db_next;
+	int have_db0 = FALSE;
+	int err = ENOTSUP;
+
+	if (size == 0)
+		size = SPA_MINBLOCKSIZE;
+	if (size > SPA_MAXBLOCKSIZE)
+		size = SPA_MAXBLOCKSIZE;
+	else
+		size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
+
+	if (ibs == 0)
+		ibs = dn->dn_indblkshift;
+
+	/* nothing to do if both values are already in effect */
+	if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec &&
+	    ibs == dn->dn_indblkshift)
+		return (0);
+
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+
+	/* Check for any allocated blocks beyond the first */
+	if (dn->dn_phys->dn_maxblkid != 0)
+		goto end;
+
+	/*
+	 * Any buffers allocated for blocks beyond the first
+	 * must be evictable/evicted, because they're the wrong size.
+	 */
+	mutex_enter(&dn->dn_dbufs_mtx);
+	/*
+	 * Since we have the dn_dbufs_mtx, nothing can be
+	 * removed from dn_dbufs.  Since we have dn_struct_rwlock/w,
+	 * nothing can be added to dn_dbufs.
+	 */
+	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
+		db_next = list_next(&dn->dn_dbufs, db);
+
+		if (db->db_blkid == 0) {
+			have_db0 = TRUE;
+		} else if (db->db_blkid != DB_BONUS_BLKID) {
+			/* a dbuf beyond block 0: can't resize */
+			mutex_exit(&dn->dn_dbufs_mtx);
+			goto end;
+		}
+	}
+	mutex_exit(&dn->dn_dbufs_mtx);
+
+	/* Fast-track if there is no data in the file */
+	if (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) && !have_db0) {
+		dnode_setdblksz(dn, size);
+		dn->dn_indblkshift = ibs;
+		dnode_setdirty(dn, tx);
+		/* don't need dd_dirty_mtx, dnode is already dirty */
+		dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] = size;
+		dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
+		rw_exit(&dn->dn_struct_rwlock);
+		return (0);
+	}
+
+	/* obtain the old block */
+	db = dbuf_hold(dn, 0);
+
+	/* Not allowed to decrease the size if there is data present */
+	if (size < db->db.db_size) {
+		dbuf_rele(db);
+		goto end;
+	}
+
+	dbuf_new_size(db, size, tx);
+
+	dnode_setdblksz(dn, size);
+	dn->dn_indblkshift = ibs;
+	/* don't need dd_dirty_mtx, dnode is already dirty */
+	dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] = size;
+	dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
+	dbuf_rele(db);
+
+	err = 0;
+end:
+	rw_exit(&dn->dn_struct_rwlock);
+	return (err);
+}
+
+/*
+ * Return an upper bound on the offset of the last nonzero byte of this
+ * object: the end of the last allocated block, or 0 when the object
+ * has no data at all (maxblkid 0 and a hole in blkptr[0]).
+ */
+uint64_t
+dnode_max_nonzero_offset(dnode_t *dn)
+{
+	dnode_phys_t *dnp = dn->dn_phys;
+
+	if (dnp->dn_maxblkid == 0 && BP_IS_HOLE(&dnp->dn_blkptr[0]))
+		return (0);
+
+	return ((dnp->dn_maxblkid + 1) * dn->dn_datablksz);
+}
+
+/*
+ * Called when block 'blkid' is first used: grow dn_maxblkid and, if
+ * the new block id no longer fits under the current indirection tree,
+ * raise dn_nlevels (recording it in dn_next_nlevels[] for syncing
+ * context) and dirty the leftmost indirect at the old top level so
+ * the existing block pointers get migrated under the new root.
+ */
+void
+dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
+{
+	uint64_t txgoff = tx->tx_txg & TXG_MASK;
+	int drop_struct_lock = FALSE;
+	int epbs, old_nlevels, new_nlevels;
+	uint64_t sz;
+
+	/* the bonus buffer is not part of the block tree */
+	if (blkid == DB_BONUS_BLKID)
+		return;
+
+	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+		drop_struct_lock = TRUE;
+	}
+
+	if (blkid > dn->dn_maxblkid)
+		dn->dn_maxblkid = blkid;
+
+	/*
+	 * Compute the number of levels necessary to support the
+	 * new blkid.  The "sz >= dn->dn_nblkptr" clause terminates
+	 * the loop if the shift ever overflows sz to a smaller value.
+	 */
+	new_nlevels = 1;
+	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+	for (sz = dn->dn_nblkptr; sz <= blkid && sz >= dn->dn_nblkptr;
+	    sz <<= epbs)
+		new_nlevels++;
+	old_nlevels = dn->dn_nlevels;
+
+	if (new_nlevels > dn->dn_next_nlevels[txgoff])
+		dn->dn_next_nlevels[txgoff] = new_nlevels;
+
+	if (new_nlevels > old_nlevels) {
+		dprintf("dn %p increasing nlevels from %u to %u\n",
+		    dn, dn->dn_nlevels, new_nlevels);
+		dn->dn_nlevels = new_nlevels;
+	}
+
+	/*
+	 * Dirty the left indirects.
+	 * Note: the caller should have just dnode_use_space()'d one
+	 * data block's worth, so we could subtract that out of
+	 * dn_inflight_data to determine if there is any dirty data
+	 * besides this block.
+	 * We don't strictly need to dirty them unless there's
+	 * *something* in the object (eg. on disk or dirty)...
+	 */
+	if (new_nlevels > old_nlevels) {
+		dmu_buf_impl_t *db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
+		dprintf("dn %p dirtying left indirects\n", dn);
+		dbuf_dirty(db, tx);
+		dbuf_remove_ref(db, FTAG);
+	}
+#ifdef ZFS_DEBUG
+	else if (old_nlevels > 1 && new_nlevels > old_nlevels) {
+		/*
+		 * NOTE(review): this branch is unreachable -- its
+		 * condition contradicts the "if" above.  Presumably it
+		 * was meant to verify the nlevels-unchanged case;
+		 * confirm the intent before changing the condition.
+		 */
+		dmu_buf_impl_t *db;
+		int i;
+
+		for (i = 0; i < dn->dn_nblkptr; i++) {
+			db = dbuf_hold_level(dn, old_nlevels-1, i, FTAG);
+			ASSERT(!
+			    list_link_active(&db->db_dirty_node[txgoff]));
+			dbuf_remove_ref(db, FTAG);
+		}
+	}
+#endif
+
+	dprintf("dn %p done\n", dn);
+
+	/* (removed unused "out:" label; nothing in this function jumps to it) */
+	if (drop_struct_lock)
+		rw_exit(&dn->dn_struct_rwlock);
+}
+
+/*
+ * Remove blocks [blkid, blkid+nblks) from any free ranges already
+ * recorded for this txg in dn_ranges, trimming, shrinking, or
+ * splitting each overlapping entry as needed.  Caller must hold
+ * dn_mtx.
+ */
+void
+dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+{
+	avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
+	avl_index_t where;
+	free_range_t *rp;
+	free_range_t rp_tofind;
+	uint64_t endblk = blkid + nblks;
+
+	ASSERT(MUTEX_HELD(&dn->dn_mtx));
+	ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */
+
+	dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
+	    blkid, nblks, tx->tx_txg);
+	/* start from the nearest range at or before blkid, if any */
+	rp_tofind.fr_blkid = blkid;
+	rp = avl_find(tree, &rp_tofind, &where);
+	if (rp == NULL)
+		rp = avl_nearest(tree, where, AVL_BEFORE);
+	if (rp == NULL)
+		rp = avl_nearest(tree, where, AVL_AFTER);
+
+	/* walk forward over every range that can overlap [blkid, endblk) */
+	while (rp && (rp->fr_blkid <= blkid + nblks)) {
+		uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks;
+		free_range_t *nrp = AVL_NEXT(tree, rp);
+
+		if (blkid <= rp->fr_blkid && endblk >= fr_endblk) {
+			/* clear this entire range */
+			avl_remove(tree, rp);
+			kmem_free(rp, sizeof (free_range_t));
+		} else if (blkid <= rp->fr_blkid &&
+		    endblk > rp->fr_blkid && endblk < fr_endblk) {
+			/* clear the beginning of this range */
+			rp->fr_blkid = endblk;
+			rp->fr_nblks = fr_endblk - endblk;
+		} else if (blkid > rp->fr_blkid && blkid < fr_endblk &&
+		    endblk >= fr_endblk) {
+			/* clear the end of this range */
+			rp->fr_nblks = blkid - rp->fr_blkid;
+		} else if (blkid > rp->fr_blkid && endblk < fr_endblk) {
+			/* clear a chunk out of this range */
+			free_range_t *new_rp =
+			    kmem_alloc(sizeof (free_range_t), KM_SLEEP);
+
+			new_rp->fr_blkid = endblk;
+			new_rp->fr_nblks = fr_endblk - endblk;
+			avl_insert_here(tree, new_rp, rp, AVL_AFTER);
+			rp->fr_nblks = blkid - rp->fr_blkid;
+		}
+		/* there may be no overlap */
+		rp = nrp;
+	}
+}
+
+/*
+ * Free the byte range [off, off+len) of this object; len == -1ULL
+ * means "through the end of the object" (truncate).  Partial blocks at
+ * the edges of the range are zeroed in place; whole blocks are added
+ * to the per-txg dn_ranges tree and actually freed in syncing context
+ * (see dbuf_free_range()).
+ */
+void
+dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db;
+	uint64_t start, objsize, blkid, nblks;
+	int blkshift, blksz, tail, head, epbs;
+	int trunc = FALSE;
+
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	blksz = dn->dn_datablksz;
+	blkshift = dn->dn_datablkshift;
+	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+	/* If the range is past the end of the file, this is a no-op */
+	objsize = blksz * (dn->dn_maxblkid+1);
+	if (off >= objsize)
+		goto out;
+	if (len == -1ULL) {
+		len = UINT64_MAX - off;
+		trunc = TRUE;
+	}
+
+	/*
+	 * First, block align the region to free:
+	 */
+	if (dn->dn_maxblkid == 0) {
+		/* single (possibly non-power-of-2) block object */
+		if (off == 0) {
+			head = 0;
+		} else {
+			head = blksz - off;
+			ASSERT3U(head, >, 0);
+		}
+		start = off;
+	} else {
+		ASSERT(ISP2(blksz));
+		head = P2NPHASE(off, blksz);
+		start = P2PHASE(off, blksz);
+	}
+	/* zero out any partial block data at the start of the range */
+	if (head) {
+		ASSERT3U(start + head, ==, blksz);
+		if (len < head)
+			head = len;
+		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
+		    FTAG, &db) == 0) {
+			caddr_t data;
+
+			/* don't dirty if it isn't on disk and isn't dirty */
+			if (db->db_dirtied ||
+			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
+				/* drop struct lock while dirtying the dbuf */
+				rw_exit(&dn->dn_struct_rwlock);
+				dbuf_will_dirty(db, tx);
+				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+				data = db->db.db_data;
+				bzero(data + start, head);
+			}
+			dbuf_remove_ref(db, FTAG);
+		}
+		off += head;
+		len -= head;
+	}
+	/* If the range was less than one block, we are done */
+	if (len == 0)
+		goto out;
+
+	/* If the remaining range is past the end of the file, we are done */
+	if (off > dn->dn_maxblkid << blkshift)
+		goto out;
+
+	/* avoid overflow when computing the tail of a truncate-to-end */
+	if (off + len == UINT64_MAX)
+		tail = 0;
+	else
+		tail = P2PHASE(len, blksz);
+
+	ASSERT3U(P2PHASE(off, blksz), ==, 0);
+	/* zero out any partial block data at the end of the range */
+	if (tail) {
+		if (len < tail)
+			tail = len;
+		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
+		    TRUE, FTAG, &db) == 0) {
+			/* don't dirty if it isn't on disk and isn't dirty */
+			if (db->db_dirtied ||
+			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
+				rw_exit(&dn->dn_struct_rwlock);
+				dbuf_will_dirty(db, tx);
+				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+				bzero(db->db.db_data, tail);
+			}
+			dbuf_remove_ref(db, FTAG);
+		}
+		len -= tail;
+	}
+	/* If the range did not include a full block, we are done */
+	if (len == 0)
+		goto out;
+
+	/* dirty the left indirects */
+	if (dn->dn_nlevels > 1 && off != 0) {
+		db = dbuf_hold_level(dn, 1,
+		    (off - head) >> (blkshift + epbs), FTAG);
+		dbuf_will_dirty(db, tx);
+		dbuf_remove_ref(db, FTAG);
+	}
+
+	/* dirty the right indirects */
+	if (dn->dn_nlevels > 1 && !trunc) {
+		db = dbuf_hold_level(dn, 1,
+		    (off + len + tail - 1) >> (blkshift + epbs), FTAG);
+		dbuf_will_dirty(db, tx);
+		dbuf_remove_ref(db, FTAG);
+	}
+
+	/*
+	 * Finally, add this range to the dnode range list, we
+	 * will finish up this free operation in the syncing phase.
+	 */
+	ASSERT(IS_P2ALIGNED(off, 1<<blkshift));
+	ASSERT(off + len == UINT64_MAX || IS_P2ALIGNED(len, 1<<blkshift));
+	blkid = off >> blkshift;
+	nblks = len >> blkshift;
+
+	if (trunc)
+		dn->dn_maxblkid = (blkid ? blkid - 1 : 0);
+
+	mutex_enter(&dn->dn_mtx);
+	/* a re-free of these blocks supersedes any earlier record */
+	dnode_clear_range(dn, blkid, nblks, tx);
+	{
+		free_range_t *rp, *found;
+		avl_index_t where;
+		avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
+
+		/* Add new range to dn_ranges */
+		rp = kmem_alloc(sizeof (free_range_t), KM_SLEEP);
+		rp->fr_blkid = blkid;
+		rp->fr_nblks = nblks;
+		found = avl_find(tree, rp, &where);
+		ASSERT(found == NULL);
+		avl_insert(tree, rp, where);
+		dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
+		    blkid, nblks, tx->tx_txg);
+	}
+	mutex_exit(&dn->dn_mtx);
+
+	dbuf_free_range(dn, blkid, nblks, tx);
+	dnode_setdirty(dn, tx);
+out:
+	rw_exit(&dn->dn_struct_rwlock);
+}
+
+/*
+ * Return TRUE if this blkid was freed in a recent txg (i.e. appears in
+ * one of the per-txg dn_ranges trees, or the whole dnode is being
+ * freed), FALSE if it wasn't.
+ */
+uint64_t
+dnode_block_freed(dnode_t *dn, uint64_t blkid)
+{
+	free_range_t range_tofind;
+	void *dp = spa_get_dsl(dn->dn_objset->os_spa);
+	int i;
+
+	/* the bonus buffer is never covered by a free range */
+	if (blkid == DB_BONUS_BLKID)
+		return (FALSE);
+
+	/*
+	 * If we're in the process of opening the pool, dp will not be
+	 * set yet, but there shouldn't be anything dirty.
+	 */
+	if (dp == NULL)
+		return (FALSE);
+
+	if (dn->dn_free_txg)
+		return (TRUE);
+
+	/*
+	 * If dn_datablkshift is not set, then there's only a single
+	 * block, in which case there will never be a free range so it
+	 * won't matter.
+	 */
+	range_tofind.fr_blkid = blkid;
+	mutex_enter(&dn->dn_mtx);
+	for (i = 0; i < TXG_SIZE; i++) {
+		free_range_t *range_found;
+		avl_index_t idx;
+
+		/* exact-start match, or a preceding range that covers blkid */
+		range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx);
+		if (range_found) {
+			ASSERT(range_found->fr_nblks > 0);
+			break;
+		}
+		range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE);
+		if (range_found &&
+		    range_found->fr_blkid + range_found->fr_nblks > blkid)
+			break;
+	}
+	mutex_exit(&dn->dn_mtx);
+	/* i < TXG_SIZE iff one of the loops above broke out on a match */
+	return (i < TXG_SIZE);
+}
+
+/*
+ * Call from syncing context when we actually write/free space for this
+ * dnode: adjust the on-disk dn_secphys accounting by 'space' bytes
+ * (positive = used, negative = freed).  'space' must be a whole number
+ * of DEV_BSIZE sectors.
+ */
+void
+dnode_diduse_space(dnode_t *dn, int64_t space)
+{
+	uint64_t sectors;
+
+	dprintf_dnode(dn, "dn=%p dnp=%p secphys=%llu space=%lld\n",
+	    dn, dn->dn_phys,
+	    (u_longlong_t)dn->dn_phys->dn_secphys,
+	    (longlong_t)space);
+
+	ASSERT(P2PHASE(space, 1<<DEV_BSHIFT) == 0);
+
+	mutex_enter(&dn->dn_mtx);
+	if (space > 0) {
+		sectors = space >> DEV_BSHIFT;
+		/* overflow check: the new total must not wrap */
+		ASSERT3U(dn->dn_phys->dn_secphys + sectors, >=,
+		    dn->dn_phys->dn_secphys);
+		dn->dn_phys->dn_secphys += sectors;
+	} else {
+		sectors = -space >> DEV_BSHIFT;
+		/* underflow check: can't free more than is accounted */
+		ASSERT3U(dn->dn_phys->dn_secphys, >=, sectors);
+		dn->dn_phys->dn_secphys -= sectors;
+	}
+	mutex_exit(&dn->dn_mtx);
+}
+
+/*
+ * Call when we think we're going to write/free space in open context.
+ * Be conservative (ie. OK to write less than this or free more than
+ * this, but don't write more or free less).  Positive 'space' is
+ * inflated by spa_get_asize() to account for raidz/copies overhead
+ * before being charged to the dsl dir and the tx.
+ */
+void
+dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
+{
+	objset_impl_t *os = dn->dn_objset;
+	dsl_dataset_t *ds = os->os_dsl_dataset;
+
+	if (space > 0)
+		space = spa_get_asize(os->os_spa, space);
+
+	/* ds may be NULL, e.g. for objsets not owned by a dataset */
+	if (ds)
+		dsl_dir_willuse_space(ds->ds_dir, space, tx);
+
+	dmu_tx_willuse_space(tx, space);
+}
+
+/*
+ * Scan one level of the block tree for the next hole (hole == TRUE) or
+ * data (hole == FALSE) at or after *offset, advancing *offset as we
+ * go.  Returns 0 on a match at this level, ESRCH when the level is
+ * exhausted, or another errno from dbuf_hold_impl().  Helper for
+ * dnode_next_offset().
+ */
+static int
+dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
+    int lvl, uint64_t blkfill)
+{
+	dmu_buf_impl_t *db = NULL;
+	void *data = NULL;
+	uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	uint64_t epb = 1ULL << epbs;
+	uint64_t minfill, maxfill;
+	int i, error, span;
+
+	dprintf("probing object %llu offset %llx level %d of %u\n",
+	    dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
+
+	if (lvl == dn->dn_phys->dn_nlevels) {
+		/* top level: the block pointers embedded in the dnode */
+		error = 0;
+		epb = dn->dn_phys->dn_nblkptr;
+		data = dn->dn_phys->dn_blkptr;
+	} else {
+		uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
+		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
+		if (error) {
+			/* a missing block is all holes */
+			if (error == ENOENT)
+				return (hole ? 0 : ESRCH);
+			return (error);
+		}
+		dbuf_read_havestruct(db);
+		data = db->db.db_data;
+	}
+
+	if (lvl == 0) {
+		/* L0 of the meta-dnode: scan individual dnode slots */
+		dnode_phys_t *dnp = data;
+		span = DNODE_SHIFT;
+		ASSERT(dn->dn_type == DMU_OT_DNODE);
+
+		for (i = (*offset >> span) & (blkfill - 1); i < blkfill; i++) {
+			/*
+			 * (!dn_type) is 1 for a free slot, 0 for an
+			 * allocated one, so it equals 'hole' exactly
+			 * when this slot is the kind we're seeking.
+			 */
+			if (!dnp[i].dn_type == hole)
+				break;
+			*offset += 1ULL << span;
+		}
+		if (i == blkfill)
+			error = ESRCH;
+	} else {
+		/* indirect level: use blk_fill to find a matching subtree */
+		blkptr_t *bp = data;
+		span = (lvl - 1) * epbs + dn->dn_datablkshift;
+		minfill = 0;
+		maxfill = blkfill << ((lvl - 1) * epbs);
+
+		/* a hole exists below unless the subtree is full, and v.v. */
+		if (hole)
+			maxfill--;
+		else
+			minfill++;
+
+		for (i = (*offset >> span) & ((1ULL << epbs) - 1);
+		    i < epb; i++) {
+			if (bp[i].blk_fill >= minfill &&
+			    bp[i].blk_fill <= maxfill)
+				break;
+			*offset += 1ULL << span;
+		}
+		if (i >= epb)
+			error = ESRCH;
+	}
+
+	if (db)
+		dbuf_remove_ref(db, FTAG);
+
+	return (error);
+}
+
+/*
+ * Find the next hole, data, or sparse region at or after *offset.
+ * The value 'blkfill' tells us how many items we expect to find
+ * in an L0 data block; this value is 1 for normal objects,
+ * DNODES_PER_BLOCK for the meta dnode, and some fraction of
+ * DNODES_PER_BLOCK when searching for sparse regions thereof.
+ * Examples:
+ *
+ * dnode_next_offset(dn, hole, offset, 1, 1);
+ *	Finds the next hole/data in a file.
+ *	Used in dmu_offset_next().
+ *
+ * dnode_next_offset(mdn, hole, offset, 0, DNODES_PER_BLOCK);
+ *	Finds the next free/allocated dnode in an objset's meta-dnode.
+ *	Used in dmu_object_next().
+ *
+ * dnode_next_offset(mdn, TRUE, offset, 2, DNODES_PER_BLOCK >> 2);
+ *	Finds the next L2 meta-dnode bp that's at most 1/4 full.
+ *	Used in dmu_object_alloc().
+ *
+ * Returns 0 and updates *offset on success, ESRCH if there is no such
+ * region, or another errno from the level scan.
+ */
+int
+dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *offset,
+    int minlvl, uint64_t blkfill)
+{
+	int lvl, maxlvl;
+	int error = 0;
+	uint64_t initial_offset = *offset;
+
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+	/* no block tree at all: nothing to find */
+	if (dn->dn_phys->dn_nlevels == 0) {
+		rw_exit(&dn->dn_struct_rwlock);
+		return (ESRCH);
+	}
+
+	/*
+	 * A zero datablkshift means a single (possibly odd-sized)
+	 * block: the whole object is data, and everything past it is
+	 * a hole.
+	 */
+	if (dn->dn_datablkshift == 0) {
+		if (*offset < dn->dn_datablksz) {
+			if (hole)
+				*offset = dn->dn_datablksz;
+		} else {
+			error = ESRCH;
+		}
+		rw_exit(&dn->dn_struct_rwlock);
+		return (error);
+	}
+
+	maxlvl = dn->dn_phys->dn_nlevels;
+
+	/* ascend until some level has a match at or after *offset... */
+	for (lvl = minlvl; lvl <= maxlvl; lvl++) {
+		error = dnode_next_offset_level(dn, hole, offset, lvl, blkfill);
+		if (error == 0)
+			break;
+	}
+
+	/* ...then descend back down to pinpoint the offset at minlvl */
+	while (--lvl >= minlvl && error == 0)
+		error = dnode_next_offset_level(dn, hole, offset, lvl, blkfill);
+
+	rw_exit(&dn->dn_struct_rwlock);
+
+	/* *offset wrapping backwards means the search ran off the end */
+	if (initial_offset > *offset)
+		return (ESRCH);
+
+	return (error);
+}
diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c
new file mode 100644
index 0000000000..56fc3e19ae
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c
@@ -0,0 +1,560 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+
+
+/*
+ * Add one level of indirection to the dnode: copy the dnode's current
+ * top-level block pointers (dn_blkptr[]) into a newly created indirect
+ * block, re-parent any cached child dbufs onto that new indirect, bump
+ * dn_phys->dn_nlevels, and zero out dn_blkptr[].
+ *
+ * Caller must hold dn_struct_rwlock as writer (asserted below), and the
+ * dnode must be dirty in this txg, which keeps it from being paged out.
+ */
+static void
+dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db;
+	int i;
+	uint64_t txg = tx->tx_txg;
+
+	ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
+	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+	/* this dnode can't be paged out because it's dirty */
+
+	/* Create (or find) the new top-level indirect block. */
+	db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
+	/* Does the dnode currently have any allocated block pointers? */
+	for (i = 0; i < dn->dn_phys->dn_nblkptr; i++)
+		if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i]))
+			break;
+	if (i != dn->dn_phys->dn_nblkptr) {
+		ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK]));
+
+		dbuf_read_havestruct(db);
+		arc_release(db->db_buf, db);
+		/* copy dnode's block pointers to new indirect block */
+		ASSERT3U(sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr, <=,
+		    db->db.db_size);
+		bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
+		    sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr);
+	}
+
+	dn->dn_phys->dn_nlevels += 1;
+	dprintf("os=%p obj=%llu, increase to %d\n",
+	    dn->dn_objset, dn->dn_object,
+	    dn->dn_phys->dn_nlevels);
+
+	/* set dbuf's parent pointers to new indirect buf */
+	for (i = 0; i < dn->dn_phys->dn_nblkptr; i++) {
+		/* NOTE: dbuf_find() returns with the child's db_mtx held */
+		dmu_buf_impl_t *child =
+		    dbuf_find(dn, dn->dn_phys->dn_nlevels-2, i);
+		if (child == NULL)
+			continue;
+		if (child->db_dnode == NULL) {
+			mutex_exit(&child->db_mtx);
+			continue;
+		}
+
+		if (child->db_parent == NULL ||
+		    child->db_parent == dn->dn_dbuf) {
+			dprintf_dbuf_bp(child, child->db_blkptr,
+			    "changing db_blkptr to new indirect %s", "");
+			child->db_parent = db;
+			dbuf_add_ref(db, child);
+			if (db->db.db_data) {
+				child->db_blkptr =
+				    (blkptr_t *)db->db.db_data + i;
+			} else {
+				child->db_blkptr = NULL;
+			}
+			dprintf_dbuf_bp(child, child->db_blkptr,
+			    "changed db_blkptr to new indirect %s", "");
+		}
+		ASSERT3P(child->db_parent, ==, db);
+
+		mutex_exit(&child->db_mtx);
+	}
+
+	/* The old top-level bps now live in the indirect; clear them here. */
+	bzero(dn->dn_phys->dn_blkptr,
+	    sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr);
+
+	dbuf_remove_ref(db, FTAG);
+}
+
<parameter name="new_str">+/*
+ * Free an array of 'num' block pointers belonging to 'dn': each non-hole
+ * bp is killed via dsl_dataset_block_kill(), and the total allocated
+ * space released is credited back to the dnode's space accounting.
+ */
+static void
+free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
+{
+	objset_impl_t *os = dn->dn_objset;
+	uint64_t bytesfreed = 0;
+	int i;
+
+	dprintf("os=%p obj=%llx num=%d\n", os, dn->dn_object, num);
+
+	for (i = 0; i < num; i++, bp++) {
+		if (BP_IS_HOLE(bp))
+			continue;
+
+		bytesfreed += BP_GET_ASIZE(bp);
+		/* can't free more than the dnode has accounted for */
+		ASSERT3U(bytesfreed >> DEV_BSHIFT, <=, dn->dn_phys->dn_secphys);
+		dsl_dataset_block_kill(os->os_dsl_dataset, bp, tx);
+	}
+	dnode_diduse_space(dn, -bytesfreed);
+}
</parameter>
+
+/*
+ * Debug-only (ZFS_DEBUG) sanity check used when freeing a range:
+ * for each L0 child of level-1 indirect 'db' whose blkid falls in
+ * [start, end], verify that both the pending syncing copy
+ * (db_data_old[txg]) and the in-memory data (unless it is being
+ * filled or is dirty in a future txg) are entirely zero.
+ * Panics if any freed data is found to be nonzero.
+ */
+static void
+free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
+{
+#ifdef ZFS_DEBUG
+	int off, num;
+	int i, err, epbs;
+	uint64_t txg = tx->tx_txg;
+
+	/* epbs = log2(block pointers per indirect block) */
+	epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	off = start - (db->db_blkid * 1<<epbs);
+	num = end - start + 1;
+
+	ASSERT3U(off, >=, 0);
+	ASSERT3U(num, >=, 0);
+	ASSERT3U(db->db_level, >, 0);
+	ASSERT3U(db->db.db_size, ==, 1<<db->db_dnode->dn_phys->dn_indblkshift);
+	ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
+	ASSERT(db->db_blkptr != NULL);
+
+	for (i = off; i < off+num; i++) {
+		uint64_t *buf;
+		int j;
+		dmu_buf_impl_t *child;
+
+		ASSERT(db->db_level == 1);
+
+		rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+		err = dbuf_hold_impl(db->db_dnode, db->db_level-1,
+		    (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
+		rw_exit(&db->db_dnode->dn_struct_rwlock);
+		if (err == ENOENT)
+			continue;
+		ASSERT(err == 0);
+		ASSERT(child->db_level == 0);
+		ASSERT(!list_link_active(&child->db_dirty_node[txg&TXG_MASK]));
+
+		/* db_data_old better be zeroed */
+		if (child->db_d.db_data_old[txg & TXG_MASK]) {
+			buf = (child->db_d.db_data_old[txg & TXG_MASK])->b_data;
+			/* scan 8 bytes at a time */
+			for (j = 0; j < child->db.db_size >> 3; j++) {
+				if (buf[j] != 0) {
+					panic("freed data not zero: "
+					    "child=%p i=%d off=%d num=%d\n",
+					    child, i, off, num);
+				}
+			}
+		}
+
+		/*
+		 * db_data better be zeroed unless it's dirty in a
+		 * future txg.
+		 */
+		mutex_enter(&child->db_mtx);
+		buf = child->db.db_data;
+		if (buf != NULL && child->db_state != DB_FILL &&
+		    !list_link_active(&child->db_dirty_node
+			[(txg+1) & TXG_MASK]) &&
+		    !list_link_active(&child->db_dirty_node
+			[(txg+2) & TXG_MASK])) {
+			for (j = 0; j < child->db.db_size >> 3; j++) {
+				if (buf[j] != 0) {
+					panic("freed data not zero: "
+					    "child=%p i=%d off=%d num=%d\n",
+					    child, i, off, num);
+				}
+			}
+		}
+		mutex_exit(&child->db_mtx);
+
+		dbuf_remove_ref(child, FTAG);
+	}
+#endif
+}
+
+/*
+ * Recursively free the blocks below indirect dbuf 'db' that fall within
+ * the object range [blkid, blkid + nblks).  'trunc' says the range runs
+ * to the end of the object.  Returns TRUE when every block pointer in
+ * 'db' was freed, meaning the caller may free 'db's own bp as well.
+ */
+static int
+free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
+    dmu_tx_t *tx)
+{
+	dnode_t *dn = db->db_dnode;
+	blkptr_t *bp;
+	dmu_buf_impl_t *subdb;
+	uint64_t start, end, dbstart, dbend, i;
+	int epbs, shift, err;
+	int txg_index = tx->tx_txg&TXG_MASK;
+	int all = TRUE;
+
+	dbuf_read(db);
+	arc_release(db->db_buf, db);
+	bp = (blkptr_t *)db->db.db_data;
+
+	/*
+	 * Clamp [start, end] (in units of this level's children) to the
+	 * portion of the requested range that this indirect block covers.
+	 * 'all' stays TRUE only if the entire block is being freed.
+	 */
+	epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	shift = (db->db_level - 1) * epbs;
+	dbstart = db->db_blkid << epbs;
+	start = blkid >> shift;
+	if (dbstart < start) {
+		bp += start - dbstart;
+		all = FALSE;
+	} else {
+		start = dbstart;
+	}
+	dbend = ((db->db_blkid + 1) << epbs) - 1;
+	end = (blkid + nblks - 1) >> shift;
+	if (dbend <= end)
+		end = dbend;
+	else if (all)
+		all = trunc;
+	ASSERT3U(start, <=, end);
+
+	/* Level 1: children are data blocks; free them directly. */
+	if (db->db_level == 1) {
+		free_verify(db, start, end, tx);
+		free_blocks(dn, bp, end-start+1, tx);
+		ASSERT(all || list_link_active(&db->db_dirty_node[txg_index]));
+		return (all);
+	}
+
+	/* Higher levels: recurse into each child indirect block. */
+	for (i = start; i <= end; i++, bp++) {
+		if (BP_IS_HOLE(bp))
+			continue;
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb);
+		ASSERT3U(err, ==, 0);
+		rw_exit(&dn->dn_struct_rwlock);
+
+		if (free_children(subdb, blkid, nblks, trunc, tx)) {
+			ASSERT3P(subdb->db_blkptr, ==, bp);
+			free_blocks(dn, bp, 1, tx);
+		}
+		dbuf_remove_ref(subdb, FTAG);
+	}
+#ifdef ZFS_DEBUG
+	/* verify that all fully-covered bps were zeroed by the recursion */
+	bp -= (end-start)+1;
+	for (i = start; i <= end; i++, bp++) {
+		if (i == start && blkid != 0)
+			continue;
+		else if (i == end && !trunc)
+			continue;
+		ASSERT3U(bp->blk_birth, ==, 0);
+	}
+#endif
+	ASSERT(all || list_link_active(&db->db_dirty_node[txg_index]));
+	return (all);
+}
+
+/*
+ * free_range: Traverse the indicated range of the provided file
+ * and "free" all the blocks contained there.  'blkid'/'nblks' are in
+ * units of the object's data blocks; a range that extends past
+ * dn_maxblkid is treated as a truncation, which also pulls
+ * dn_maxblkid back.
+ */
+static void
+dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+{
+	blkptr_t *bp = dn->dn_phys->dn_blkptr;
+	dmu_buf_impl_t *db;
+	int trunc, start, end, shift, i, err;
+	int dnlevel = dn->dn_phys->dn_nlevels;
+
+	/* range starts entirely beyond the object's last block: no-op */
+	if (blkid > dn->dn_phys->dn_maxblkid)
+		return;
+
+	ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
+	trunc = blkid + nblks > dn->dn_phys->dn_maxblkid;
+	if (trunc)
+		nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
+
+	/* There are no indirect blocks in the object */
+	if (dnlevel == 1) {
+		if (blkid >= dn->dn_phys->dn_nblkptr) {
+			/* this range was never made persistent */
+			return;
+		}
+		ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
+		free_blocks(dn, bp + blkid, nblks, tx);
+		if (trunc) {
+			/* shrink maxblkid; verify nothing remains past EOF */
+			uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
+			    (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+			dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
+			ASSERT(off < dn->dn_phys->dn_maxblkid ||
+			    dn->dn_phys->dn_maxblkid == 0 ||
+			    dnode_next_offset(dn, FALSE, &off, 1, 1) == ESRCH);
+		}
+		return;
+	}
+
+	/* recurse through the top-level indirects that cover the range */
+	shift = (dnlevel - 1) * (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
+	start = blkid >> shift;
+	ASSERT(start < dn->dn_phys->dn_nblkptr);
+	end = (blkid + nblks - 1) >> shift;
+	bp += start;
+	for (i = start; i <= end; i++, bp++) {
+		if (BP_IS_HOLE(bp))
+			continue;
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db);
+		ASSERT3U(err, ==, 0);
+		rw_exit(&dn->dn_struct_rwlock);
+
+		if (free_children(db, blkid, nblks, trunc, tx)) {
+			ASSERT3P(db->db_blkptr, ==, bp);
+			free_blocks(dn, bp, 1, tx);
+		}
+		dbuf_remove_ref(db, FTAG);
+	}
+	if (trunc) {
+		/* shrink maxblkid; verify nothing remains past EOF */
+		uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
+		    (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+		dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
+		ASSERT(off < dn->dn_phys->dn_maxblkid ||
+		    dn->dn_phys->dn_maxblkid == 0 ||
+		    dnode_next_offset(dn, FALSE, &off, 1, 1) == ESRCH);
+	}
+}
+
+/*
+ * Sync-context teardown of a dnode that is being freed in this txg:
+ * undirty any remaining dirty dbufs, free every block in the object,
+ * zero the on-disk dnode_phys, reset the in-core state to DMU_OT_NONE,
+ * and release the dirty hold.  Returns 1 (level done); after the final
+ * dnode_rele() the dnode may be evicted, so it must not be touched.
+ */
+static int
+dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db;
+	int txgoff = tx->tx_txg & TXG_MASK;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	/* Undirty all buffers */
+	while (db = list_head(&dn->dn_dirty_dbufs[txgoff])) {
+		mutex_enter(&db->db_mtx);
+		/* XXX - use dbuf_undirty()? */
+		list_remove(&dn->dn_dirty_dbufs[txgoff], db);
+		if (db->db_level == 0) {
+			ASSERT3P(db->db_d.db_data_old[txgoff], ==, db->db_buf);
+			if (db->db_d.db_overridden_by[txgoff])
+				dbuf_unoverride(db, tx->tx_txg);
+			db->db_d.db_data_old[txgoff] = NULL;
+		}
+		db->db_dirtycnt -= 1;
+		mutex_exit(&db->db_mtx);
+		/* drop the hold that dirtying placed for this txg */
+		dbuf_remove_ref(db, (void *)(uintptr_t)tx->tx_txg);
+	}
+
+	/* only the dirty hold itself should remain */
+	ASSERT3U(refcount_count(&dn->dn_holds), ==, 1);
+
+	/* Undirty next bits */
+	dn->dn_next_nlevels[txgoff] = 0;
+	dn->dn_next_indblkshift[txgoff] = 0;
+
+	/* free up all the blocks in the file. */
+	dbuf_free_range(dn, 0, -1, tx);
+	dnode_sync_free_range(dn, 0, dn->dn_phys->dn_maxblkid+1, tx);
+	ASSERT3U(dn->dn_phys->dn_secphys, ==, 0);
+
+	/*
+	 * All dbufs should be gone, since all holds are gone...
+	 */
+	ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+
+	/* ASSERT(blkptrs are zero); */
+	ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
+	ASSERT(dn->dn_type != DMU_OT_NONE);
+
+	ASSERT(dn->dn_free_txg > 0);
+	if (dn->dn_allocated_txg != dn->dn_free_txg)
+		dbuf_will_dirty(dn->dn_dbuf, tx);
+	bzero(dn->dn_phys, sizeof (dnode_phys_t));
+
+	mutex_enter(&dn->dn_mtx);
+	dn->dn_type = DMU_OT_NONE;
+	dn->dn_dirtyblksz[txgoff] = 0;
+	dn->dn_maxblkid = 0;
+	dn->dn_allocated_txg = 0;
+	mutex_exit(&dn->dn_mtx);
+
+	ASSERT(!IS_DNODE_DNODE(dn->dn_object));
+
+	dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
+	/*
+	 * Now that we've released our hold, the dnode may
+	 * be evicted, so we mustn't access it.
+	 */
+	return (1);
+}
+
+/*
+ * Write out the dnode's dirty buffers at the specified level.
+ * This may create more dirty buffers at the next level up.
+ *
+ * Returns 1 when the dnode is fully synced for this txg (or freed),
+ * 0 when more levels remain to be synced.
+ *
+ * NOTE: The dnode is kept in memory by being dirty.  Once the
+ * dirty bit is cleared, it may be evicted.  Beware of this!
+ */
+int
+dnode_sync(dnode_t *dn, int level, zio_t *zio, dmu_tx_t *tx)
+{
+	free_range_t *rp;
+	int txgoff = tx->tx_txg & TXG_MASK;
+	dnode_phys_t *dnp = dn->dn_phys;
+
+	/* ASSERT(dn->dn_objset->dd_snapshot == NULL); */
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(IS_DNODE_DNODE(dn->dn_object) ||
+	    dn->dn_dirtyblksz[txgoff] > 0);
+
+	ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
+	dnode_verify(dn);
+	/*
+	 * Make sure the dbuf for the dn_phys is released before we modify it.
+	 */
+	if (dn->dn_dbuf)
+		arc_release(dn->dn_dbuf->db_buf, dn->dn_dbuf);
+
+	mutex_enter(&dn->dn_mtx);
+	if (dn->dn_allocated_txg == tx->tx_txg) {
+		/* The dnode is newly allocated or reallocated */
+		if (dnp->dn_type == DMU_OT_NONE) {
+			/* this is a first alloc, not a realloc */
+			/* XXX shouldn't the phys already be zeroed? */
+			bzero(dnp, DNODE_CORE_SIZE);
+			dnp->dn_datablkszsec = dn->dn_datablkszsec;
+			dnp->dn_indblkshift = dn->dn_indblkshift;
+			dnp->dn_nlevels = 1;
+		}
+
+		if (dn->dn_nblkptr > dnp->dn_nblkptr) {
+			/* zero the new blkptrs we are gaining */
+			bzero(dnp->dn_blkptr + dnp->dn_nblkptr,
+			    sizeof (blkptr_t) *
+			    (dn->dn_nblkptr - dnp->dn_nblkptr));
+		}
+		/* push the open-context identity down to the phys */
+		dnp->dn_type = dn->dn_type;
+		dnp->dn_bonustype = dn->dn_bonustype;
+		dnp->dn_bonuslen = dn->dn_bonuslen;
+		dnp->dn_nblkptr = dn->dn_nblkptr;
+	}
+
+	/* record a pending block-size change (stored in sectors on disk) */
+	if (dn->dn_dirtyblksz[txgoff]) {
+		ASSERT(P2PHASE(dn->dn_dirtyblksz[txgoff],
+		    SPA_MINBLOCKSIZE) == 0);
+		dnp->dn_datablkszsec =
+		    dn->dn_dirtyblksz[txgoff] >> SPA_MINBLOCKSHIFT;
+	}
+
+	/* record a pending indirect-block-size change */
+	if (dn->dn_next_indblkshift[txgoff]) {
+		ASSERT(dnp->dn_nlevels == 1);
+		dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
+		dn->dn_next_indblkshift[txgoff] = 0;
+	}
+
+	/*
+	 * Just take the live (open-context) values for checksum and compress.
+	 * Strictly speaking it's a future leak, but nothing bad happens if we
+	 * start using the new checksum or compress algorithm a little early.
+	 */
+	dnp->dn_checksum = dn->dn_checksum;
+	dnp->dn_compress = dn->dn_compress;
+
+	mutex_exit(&dn->dn_mtx);
+
+	/* process all the "freed" ranges in the file */
+	if (dn->dn_free_txg == 0 || dn->dn_free_txg > tx->tx_txg) {
+		for (rp = avl_first(&dn->dn_ranges[txgoff]); rp != NULL;
+		    rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp))
+			dnode_sync_free_range(dn,
+			    rp->fr_blkid, rp->fr_nblks, tx);
+	}
+	/* then discard the (now-processed) range records */
+	mutex_enter(&dn->dn_mtx);
+	for (rp = avl_first(&dn->dn_ranges[txgoff]); rp; ) {
+		free_range_t *last = rp;
+		rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp);
+		avl_remove(&dn->dn_ranges[txgoff], last);
+		kmem_free(last, sizeof (free_range_t));
+	}
+	mutex_exit(&dn->dn_mtx);
+
+	/* the whole object is being freed in this txg */
+	if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) {
+		ASSERT3U(level, ==, 0);
+		return (dnode_sync_free(dn, tx));
+	}
+
+	/* apply a pending increase in indirection depth */
+	if (dn->dn_next_nlevels[txgoff]) {
+		int new_lvl = dn->dn_next_nlevels[txgoff];
+
+		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+		while (new_lvl > dnp->dn_nlevels)
+			dnode_increase_indirection(dn, tx);
+		rw_exit(&dn->dn_struct_rwlock);
+		dn->dn_next_nlevels[txgoff] = 0;
+	}
+
+	if (level == dnp->dn_nlevels) {
+		uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
+		    (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+
+		/* we've already synced out all data and indirect blocks */
+		/* there are no more dirty dbufs under this dnode */
+		ASSERT3P(list_head(&dn->dn_dirty_dbufs[txgoff]), ==, NULL);
+		ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= tx->tx_txg);
+
+		/* XXX this is expensive. remove once 6343073 is closed. */
+		/* NB: the "off < maxblkid" is to catch overflow */
+		/*
+		 * NB: if blocksize is changing, we could get confused,
+		 * so only bother if there are multiple blocks and thus
+		 * it can't be changing.
+		 */
+		ASSERT(off < dn->dn_phys->dn_maxblkid ||
+		    dn->dn_phys->dn_maxblkid == 0 ||
+		    dnode_next_offset(dn, FALSE, &off, 1, 1) == ESRCH);
+
+		dn->dn_dirtyblksz[txgoff] = 0;
+
+
+		if (!IS_DNODE_DNODE(dn->dn_object)) {
+			dbuf_will_dirty(dn->dn_dbuf, tx);
+			dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
+		}
+
+		/*
+		 * Now that we've dropped the reference, the dnode may
+		 * be evicted, so we mustn't access it.
+		 */
+		return (1);
+	} else {
+		dmu_buf_impl_t *db, *db_next;
+		list_t *list = &dn->dn_dirty_dbufs[txgoff];
+		/*
+		 * Iterate over the list, removing and sync'ing dbufs
+		 * which are on the level we want, and leaving others.
+		 */
+		for (db = list_head(list); db; db = db_next) {
+			db_next = list_next(list, db);
+			if (db->db_level == level) {
+				list_remove(list, db);
+				dbuf_sync(db, zio, tx);
+			}
+		}
+		return (0);
+	}
+}
diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c
new file mode 100644
index 0000000000..ab8dcfc3e3
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c
@@ -0,0 +1,1463 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/zio.h>
+#include <sys/zap.h>
+#include <sys/unique.h>
+#include <sys/zfs_context.h>
+
+#define DOS_REF_MAX (1ULL << 62)
+
+#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE
+
+#define BP_GET_UCSIZE(bp) \
+ ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \
+ BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp));
+
+/*
+ * We use weighted reference counts to express the various forms of exclusion
+ * between different open modes. A STANDARD open is 1 point, an EXCLUSIVE open
+ * is DOS_REF_MAX, and a PRIMARY open is little more than half of an EXCLUSIVE.
+ * This makes the exclusion logic simple: the total refcnt for all opens cannot
+ * exceed DOS_REF_MAX. For example, EXCLUSIVE opens are exclusive because their
+ * weight (DOS_REF_MAX) consumes the entire refcnt space. PRIMARY opens consume
+ * just over half of the refcnt space, so there can't be more than one, but it
+ * can peacefully coexist with any number of STANDARD opens.
+ */
+static uint64_t ds_refcnt_weight[DS_MODE_LEVELS] = {
+ 0, /* DOS_MODE_NONE - invalid */
+ 1, /* DOS_MODE_STANDARD - unlimited number */
+ (DOS_REF_MAX >> 1) + 1, /* DOS_MODE_PRIMARY - only one of these */
+ DOS_REF_MAX /* DOS_MODE_EXCLUSIVE - no other opens */
+};
+
+
+/*
+ * Account for a newly born block 'bp' in dataset 'ds': bump the
+ * dataset's used/compressed/uncompressed/unique byte counts and charge
+ * the space to its dsl_dir.  A NULL 'ds' means the block belongs to
+ * the meta-objset, whose space is tracked in the pool's mos dir.
+ * Must be called from syncing context.
+ */
+void
+dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+{
+	int used = BP_GET_ASIZE(bp);
+	int compressed = BP_GET_PSIZE(bp);
+	int uncompressed = BP_GET_UCSIZE(bp);
+
+	dprintf_bp(bp, "born, ds=%p\n", ds);
+
+	ASSERT(dmu_tx_is_syncing(tx));
+	/* It could have been compressed away to nothing */
+	if (BP_IS_HOLE(bp))
+		return;
+	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
+	ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
+	if (ds == NULL) {
+		/*
+		 * Account for the meta-objset space in its placeholder
+		 * dsl_dir.
+		 */
+		ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
+		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
+		    used, compressed, uncompressed, tx);
+		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
+		return;
+	}
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	mutex_enter(&ds->ds_lock);
+	ds->ds_phys->ds_used_bytes += used;
+	ds->ds_phys->ds_compressed_bytes += compressed;
+	ds->ds_phys->ds_uncompressed_bytes += uncompressed;
+	/* a newly written block is unique to this dataset by definition */
+	ds->ds_phys->ds_unique_bytes += used;
+	mutex_exit(&ds->ds_lock);
+	dsl_dir_diduse_space(ds->ds_dir,
+	    used, compressed, uncompressed, tx);
+}
+
+/*
+ * Account for (and dispose of) a freed block 'bp' in dataset 'ds'.
+ * If the block was born after the most recent snapshot it can be freed
+ * outright via arc_free(); otherwise a snapshot still references it,
+ * so it is placed on the dataset's deadlist instead.  In either case
+ * the dataset and dsl_dir space accounting is updated and 'bp' is
+ * zeroed.  A NULL 'ds' means the block belongs to the meta-objset.
+ * Must be called from syncing context.
+ */
+void
+dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+{
+	int used = BP_GET_ASIZE(bp);
+	int compressed = BP_GET_PSIZE(bp);
+	int uncompressed = BP_GET_UCSIZE(bp);
+
+	ASSERT(dmu_tx_is_syncing(tx));
+	if (BP_IS_HOLE(bp))
+		return;
+
+	ASSERT(used > 0);
+	if (ds == NULL) {
+		/*
+		 * Account for the meta-objset space in its placeholder
+		 * dataset.
+		 */
+		/* XXX this can fail, what do we do when it does? */
+		(void) arc_free(NULL, tx->tx_pool->dp_spa,
+		    tx->tx_txg, bp, NULL, NULL, ARC_WAIT);
+		bzero(bp, sizeof (blkptr_t));
+
+		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
+		    -used, -compressed, -uncompressed, tx);
+		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
+		return;
+	}
+	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
+
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+
+	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
+		/* born after the last snapshot: safe to free right now */
+		dprintf_bp(bp, "freeing: %s", "");
+		/* XXX check return code? */
+		(void) arc_free(NULL, tx->tx_pool->dp_spa,
+		    tx->tx_txg, bp, NULL, NULL, ARC_WAIT);
+
+		mutex_enter(&ds->ds_lock);
+		/* XXX unique_bytes is not accurate for head datasets */
+		/* ASSERT3U(ds->ds_phys->ds_unique_bytes, >=, used); */
+		ds->ds_phys->ds_unique_bytes -= used;
+		mutex_exit(&ds->ds_lock);
+		dsl_dir_diduse_space(ds->ds_dir,
+		    -used, -compressed, -uncompressed, tx);
+	} else {
+		dprintf_bp(bp, "putting on dead list: %s", "");
+		bplist_enqueue(&ds->ds_deadlist, bp, tx);
+		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
+		if (ds->ds_phys->ds_prev_snap_obj != 0) {
+			ASSERT3U(ds->ds_prev->ds_object, ==,
+			    ds->ds_phys->ds_prev_snap_obj);
+			ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
+			if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
+			    ds->ds_object &&
+			    bp->blk_birth >
+			    ds->ds_prev->ds_phys->ds_prev_snap_txg) {
+				dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+				mutex_enter(&ds->ds_prev->ds_lock);
+				ds->ds_prev->ds_phys->ds_unique_bytes +=
+				    used;
+				mutex_exit(&ds->ds_prev->ds_lock);
+			}
+		}
+	}
+	bzero(bp, sizeof (blkptr_t));
+	mutex_enter(&ds->ds_lock);
+	ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
+	ds->ds_phys->ds_used_bytes -= used;
+	ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
+	ds->ds_phys->ds_compressed_bytes -= compressed;
+	ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
+	ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
+	mutex_exit(&ds->ds_lock);
+}
+
+/*
+ * Best-effort guess at whether a block born at 'blk_birth' could be
+ * freed right away (TRUE) or would have to go on a deadlist because a
+ * snapshot still references it (FALSE).  See caveats below.
+ */
+int
+dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth, dmu_tx_t *tx)
+{
+	uint64_t prev_snap_txg;
+	dsl_dir_t *dd;
+	/* ASSERT that it is not a snapshot */
+	if (ds == NULL)
+		return (TRUE);
+	/*
+	 * The snapshot creation could fail, but that would cause an
+	 * incorrect FALSE return, which would only result in an
+	 * overestimation of the amount of space that an operation would
+	 * consume, which is OK.
+	 *
+	 * There's also a small window where we could miss a pending
+	 * snapshot, because we could set the sync task in the quiescing
+	 * phase.  So this should only be used as a guess.
+	 */
+	dd = ds->ds_dir;
+	mutex_enter(&dd->dd_lock);
+	/* a snapshot scheduled for an earlier txg counts as "taken" */
+	if (dd->dd_sync_func == dsl_dataset_snapshot_sync &&
+	    dd->dd_sync_txg < tx->tx_txg)
+		prev_snap_txg = dd->dd_sync_txg;
+	else
+		prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
+	mutex_exit(&dd->dd_lock);
+	return (blk_birth > prev_snap_txg);
+}
+
+/*
+ * dmu_buf user-eviction callback for a dsl_dataset_t: tear down all
+ * in-core state hanging off 'ds' (unique fsid entry, user data, prev
+ * snapshot hold, deadlist, dsl_dir hold) and free the structure.
+ * Runs when the last hold on the dataset's bonus dbuf is released.
+ */
+/* ARGSUSED */
+static void
+dsl_dataset_evict(dmu_buf_t *db, void *dsv)
+{
+	dsl_dataset_t *ds = dsv;
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+	/* open_refcount == DOS_REF_MAX when deleting */
+	ASSERT(ds->ds_open_refcount == 0 ||
+	    ds->ds_open_refcount == DOS_REF_MAX);
+
+	dprintf_ds(ds, "evicting %s\n", "");
+
+	unique_remove(ds->ds_phys->ds_fsid_guid);
+
+	/* notify the dataset's user (if any) before tearing down */
+	if (ds->ds_user_ptr != NULL)
+		ds->ds_user_evict_func(ds, ds->ds_user_ptr);
+
+	if (ds->ds_prev) {
+		dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
+		ds->ds_prev = NULL;
+	}
+
+	bplist_close(&ds->ds_deadlist);
+	dsl_dir_close(ds->ds_dir, ds);
+
+	if (list_link_active(&ds->ds_synced_link))
+		list_remove(&dp->dp_synced_objsets, ds);
+
+	kmem_free(ds, sizeof (dsl_dataset_t));
+}
+
+/*
+ * Fill in ds->ds_snapname by reverse-searching the head dataset's
+ * snapnames zap for this dataset's object number.  No-op if the name
+ * is already cached or if 'ds' is not a snapshot (no next snapshot).
+ */
+static void
+dsl_dataset_get_snapname(dsl_dataset_t *ds)
+{
+	dsl_dataset_phys_t *headphys;
+	int err;
+	dmu_buf_t *headdbuf;
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	objset_t *mos = dp->dp_meta_objset;
+
+	if (ds->ds_snapname[0])
+		return;
+	if (ds->ds_phys->ds_next_snap_obj == 0)
+		return;
+
+	headdbuf = dmu_bonus_hold_tag(mos,
+	    ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG);
+	dmu_buf_read(headdbuf);
+	headphys = headdbuf->db_data;
+	/* find the zap entry whose value is our object number */
+	err = zap_value_search(dp->dp_meta_objset,
+	    headphys->ds_snapnames_zapobj, ds->ds_object, ds->ds_snapname);
+	ASSERT(err == 0);
+	dmu_buf_rele_tag(headdbuf, FTAG);
+}
+
+/*
+ * Open the dataset with object number 'dsobj', constructing the in-core
+ * dsl_dataset_t (attached as the bonus dbuf's user data) on first open.
+ * 'snapname' (optional) seeds ds_snapname for snapshots.  'mode' selects
+ * a DS_MODE_* level; its weight is added to ds_open_refcount so that
+ * incompatible opens exclude each other (see ds_refcnt_weight above).
+ * Returns NULL if the requested mode cannot be granted (already open in
+ * a conflicting mode, or restoring and not opened for restore).
+ * Caller must hold dp_config_rwlock or be in syncing context.
+ */
+dsl_dataset_t *
+dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
+    int mode, void *tag)
+{
+	uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
+	objset_t *mos = dp->dp_meta_objset;
+	dmu_buf_t *dbuf;
+	dsl_dataset_t *ds;
+
+	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
+	    dsl_pool_sync_context(dp));
+
+	dbuf = dmu_bonus_hold_tag(mos, dsobj, tag);
+	dmu_buf_read(dbuf);
+	ds = dmu_buf_get_user(dbuf);
+	if (ds == NULL) {
+		/* first open: build the in-core dataset */
+		dsl_dataset_t *winner;
+
+		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
+		ds->ds_dbuf = dbuf;
+		ds->ds_object = dsobj;
+		ds->ds_phys = dbuf->db_data;
+		ds->ds_dir = dsl_dir_open_obj(dp,
+		    ds->ds_phys->ds_dir_obj, NULL, ds);
+
+		bplist_open(&ds->ds_deadlist,
+		    mos, ds->ds_phys->ds_deadlist_obj);
+
+		if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) {
+			/* head dataset: no snapname, hold the prev snap */
+			ds->ds_snapname[0] = '\0';
+			if (ds->ds_phys->ds_prev_snap_obj) {
+				ds->ds_prev =
+				    dsl_dataset_open_obj(dp,
+				    ds->ds_phys->ds_prev_snap_obj, NULL,
+				    DS_MODE_NONE, ds);
+			}
+		} else {
+			if (snapname) {
+#ifdef ZFS_DEBUG
+				/* verify snapname really maps to dsobj */
+				dsl_dataset_phys_t *headphys;
+				int err;
+				dmu_buf_t *headdbuf = dmu_bonus_hold_tag(mos,
+				    ds->ds_dir->dd_phys->
+				    dd_head_dataset_obj, FTAG);
+				dmu_buf_read(headdbuf);
+				headphys = headdbuf->db_data;
+				uint64_t foundobj;
+				err = zap_lookup(dp->dp_meta_objset,
+				    headphys->ds_snapnames_zapobj,
+				    snapname, sizeof (foundobj), 1, &foundobj);
+				ASSERT3U(err, ==, 0);
+				ASSERT3U(foundobj, ==, dsobj);
+				dmu_buf_rele_tag(headdbuf, FTAG);
+#endif
+				(void) strcat(ds->ds_snapname, snapname);
+			} else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
+				dsl_dataset_get_snapname(ds);
+			}
+		}
+
+		/* attach; lose gracefully if another thread raced us here */
+		winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
+		    dsl_dataset_evict);
+		if (winner) {
+			bplist_close(&ds->ds_deadlist);
+			if (ds->ds_prev) {
+				dsl_dataset_close(ds->ds_prev,
+				    DS_MODE_NONE, ds);
+			}
+			dsl_dir_close(ds->ds_dir, ds);
+			kmem_free(ds, sizeof (dsl_dataset_t));
+			ds = winner;
+		} else {
+			uint64_t new =
+			    unique_insert(ds->ds_phys->ds_fsid_guid);
+			if (new != ds->ds_phys->ds_fsid_guid) {
+				/* XXX it won't necessarily be synced... */
+				ds->ds_phys->ds_fsid_guid = new;
+			}
+		}
+	}
+	ASSERT3P(ds->ds_dbuf, ==, dbuf);
+	ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
+
+	/* grant (or refuse) the requested open mode */
+	mutex_enter(&ds->ds_lock);
+	if ((DS_MODE_LEVEL(mode) == DS_MODE_PRIMARY &&
+	    ds->ds_phys->ds_restoring && !DS_MODE_IS_RESTORE(mode)) ||
+	    (ds->ds_open_refcount + weight > DOS_REF_MAX)) {
+		mutex_exit(&ds->ds_lock);
+		dsl_dataset_close(ds, DS_MODE_NONE, tag);
+		return (NULL);
+	}
+	ds->ds_open_refcount += weight;
+	mutex_exit(&ds->ds_lock);
+
+	return (ds);
+}
+
+/*
+ * Open a dataset by name within pool 'spa' (NULL means look the pool up
+ * from the name).  Names of the form "fs@snap" open the snapshot, which
+ * requires a read-only mode.  On success *dsp holds the opened dataset;
+ * returns ENOENT (no such dir/dataset/snapshot), EROFS (writable open
+ * of a snapshot), or EBUSY (conflicting open mode).
+ */
+int
+dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
+    void *tag, dsl_dataset_t **dsp)
+{
+	dsl_dir_t *dd;
+	dsl_pool_t *dp;
+	const char *tail;
+	uint64_t obj;
+	dsl_dataset_t *ds = NULL;
+	int err = 0;
+
+	dd = dsl_dir_open_spa(spa, name, FTAG, &tail);
+	if (dd == NULL)
+		return (ENOENT);
+
+	dp = dd->dd_pool;
+	obj = dd->dd_phys->dd_head_dataset_obj;
+	rw_enter(&dp->dp_config_rwlock, RW_READER);
+	if (obj == 0) {
+		/* A dataset with no associated objset */
+		err = ENOENT;
+		goto out;
+	}
+
+	if (tail != NULL) {
+		/* the leftover name component must be "@<snapname>" */
+		objset_t *mos = dp->dp_meta_objset;
+
+		/* briefly open the head just to find its snapnames zap */
+		ds = dsl_dataset_open_obj(dp, obj, NULL, DS_MODE_NONE, tag);
+		obj = ds->ds_phys->ds_snapnames_zapobj;
+		dsl_dataset_close(ds, DS_MODE_NONE, tag);
+		ds = NULL;
+
+		if (tail[0] != '@') {
+			err = ENOENT;
+			goto out;
+		}
+		tail++;
+
+		/* Look for a snapshot */
+		if (!DS_MODE_IS_READONLY(mode)) {
+			err = EROFS;
+			goto out;
+		}
+		dprintf("looking for snapshot '%s'\n", tail);
+		err = zap_lookup(mos, obj, tail, 8, 1, &obj);
+		if (err)
+			goto out;
+	}
+	ds = dsl_dataset_open_obj(dp, obj, tail, mode, tag);
+	if (ds == NULL)
+		err = EBUSY;
+
+out:
+	rw_exit(&dp->dp_config_rwlock);
+	dsl_dir_close(dd, FTAG);
+
+	ASSERT3U((err == 0), ==, (ds != NULL));
+	/* ASSERT(ds == NULL || strcmp(name, ds->ds_name) == 0); */
+
+	*dsp = ds;
+	return (err);
+}
+
+/*
+ * Convenience wrapper: open a dataset by name, resolving the pool from
+ * the name itself (spa == NULL).  See dsl_dataset_open_spa().
+ */
+int
+dsl_dataset_open(const char *name, int mode, void *tag, dsl_dataset_t **dsp)
+{
+	return (dsl_dataset_open_spa(NULL, name, mode, tag, dsp));
+}
+
+/*
+ * Write the dataset's full name ("dir/path[@snap]") into 'name'; a
+ * NULL dataset is rendered as "mos".  Caller supplies a buffer large
+ * enough for the name (the strcat()s are unbounded).
+ */
+void
+dsl_dataset_name(dsl_dataset_t *ds, char *name)
+{
+	if (ds == NULL) {
+		(void) strcpy(name, "mos");
+	} else {
+		dsl_dir_name(ds->ds_dir, name);
+		dsl_dataset_get_snapname(ds);
+		if (ds->ds_snapname[0]) {
+			(void) strcat(name, "@");
+			if (!MUTEX_HELD(&ds->ds_lock)) {
+				/*
+				 * We use a "recursive" mutex so that we
+				 * can call dprintf_ds() with ds_lock held.
+				 */
+				mutex_enter(&ds->ds_lock);
+				(void) strcat(name, ds->ds_snapname);
+				mutex_exit(&ds->ds_lock);
+			} else {
+				(void) strcat(name, ds->ds_snapname);
+			}
+		}
+	}
+}
+
+/*
+ * Close a dataset opened with 'mode': subtract that mode's weight from
+ * ds_open_refcount and release the caller's hold on the bonus dbuf.
+ * Dropping the final hold triggers dsl_dataset_evict().
+ */
+void
+dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag)
+{
+	uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
+	mutex_enter(&ds->ds_lock);
+	ASSERT3U(ds->ds_open_refcount, >=, weight);
+	ds->ds_open_refcount -= weight;
+	dprintf_ds(ds, "closing mode %u refcount now 0x%llx\n",
+	    mode, ds->ds_open_refcount);
+	mutex_exit(&ds->ds_lock);
+
+	dmu_buf_rele_tag(ds->ds_dbuf, tag);
+}
+
+/*
+ * Create the pool's root dsl_dir and its head dataset (including the
+ * dataset's snapnames zap and deadlist), then create the root objset
+ * (DMU_OST_ZFS) inside it.  *ddobjp receives the root dir's object
+ * number.  Called at pool-creation time from syncing context.
+ */
+void
+dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx)
+{
+	objset_t *mos = dp->dp_meta_objset;
+	dmu_buf_t *dbuf;
+	dsl_dataset_phys_t *dsphys;
+	dsl_dataset_t *ds;
+	uint64_t dsobj;
+	dsl_dir_t *dd;
+
+	dsl_dir_create_root(mos, ddobjp, tx);
+	dd = dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG);
+	ASSERT(dd != NULL);
+
+	/* allocate the dataset object and fill in its bonus buffer */
+	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_OBJSET, 0,
+	    DMU_OT_DSL_OBJSET, sizeof (dsl_dataset_phys_t), tx);
+	dbuf = dmu_bonus_hold(mos, dsobj);
+	dmu_buf_will_dirty(dbuf, tx);
+	dsphys = dbuf->db_data;
+	dsphys->ds_dir_obj = dd->dd_object;
+	dsphys->ds_fsid_guid = unique_create();
+	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
+	    sizeof (dsphys->ds_guid));
+	dsphys->ds_snapnames_zapobj =
+	    zap_create(mos, DMU_OT_DSL_OBJSET_SNAP_MAP, DMU_OT_NONE, 0, tx);
+	dsphys->ds_creation_time = gethrestime_sec();
+	dsphys->ds_creation_txg = tx->tx_txg;
+	dsphys->ds_deadlist_obj =
+	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
+	dmu_buf_rele(dbuf);
+
+	/* link the new dataset in as the dir's head */
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+	dd->dd_phys->dd_head_dataset_obj = dsobj;
+	dsl_dir_close(dd, FTAG);
+
+	ds = dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG);
+	(void) dmu_objset_create_impl(dp->dp_spa, ds, DMU_OST_ZFS, tx);
+	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+}
+
+/*
+ * Sync-context creation of a new dataset named 'lastname' under parent
+ * dir 'pds' ('fullname' is the complete path).  If 'clone_parent' is
+ * non-NULL the new dataset is a clone of that snapshot: it inherits the
+ * snapshot's bp and space totals and bumps its child count.  Returns 0,
+ * EXDEV (clone origin in another pool), EINVAL (clone origin is not a
+ * snapshot), or an error from dsl_dir_create_sync().
+ */
+int
+dsl_dataset_create_sync(dsl_dir_t *pds, const char *fullname,
+    const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx)
+{
+	int err;
+	dsl_pool_t *dp = pds->dd_pool;
+	dmu_buf_t *dbuf;
+	dsl_dataset_phys_t *dsphys;
+	uint64_t dsobj;
+	objset_t *mos = dp->dp_meta_objset;
+	dsl_dir_t *dd;
+
+	if (clone_parent != NULL) {
+		/*
+		 * You can't clone across pools.
+		 */
+		if (clone_parent->ds_dir->dd_pool != dp)
+			return (EXDEV);
+
+		/*
+		 * You can only clone snapshots, not the head datasets.
+		 */
+		if (clone_parent->ds_phys->ds_num_children == 0)
+			return (EINVAL);
+	}
+
+	ASSERT(lastname[0] != '@');
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	err = dsl_dir_create_sync(pds, lastname, tx);
+	if (err)
+		return (err);
+	dd = dsl_dir_open_spa(dp->dp_spa, fullname, FTAG, NULL);
+	ASSERT(dd != NULL);
+
+	/* This is the point of no (unsuccessful) return */
+
+	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_OBJSET, 0,
+	    DMU_OT_DSL_OBJSET, sizeof (dsl_dataset_phys_t), tx);
+	dbuf = dmu_bonus_hold(mos, dsobj);
+	dmu_buf_will_dirty(dbuf, tx);
+	dsphys = dbuf->db_data;
+	dsphys->ds_dir_obj = dd->dd_object;
+	dsphys->ds_fsid_guid = unique_create();
+	unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
+	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
+	    sizeof (dsphys->ds_guid));
+	dsphys->ds_snapnames_zapobj =
+	    zap_create(mos, DMU_OT_DSL_OBJSET_SNAP_MAP, DMU_OT_NONE, 0, tx);
+	dsphys->ds_creation_time = gethrestime_sec();
+	dsphys->ds_creation_txg = tx->tx_txg;
+	dsphys->ds_deadlist_obj =
+	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
+	if (clone_parent) {
+		/* clone: start from the origin snapshot's contents */
+		dsphys->ds_prev_snap_obj = clone_parent->ds_object;
+		dsphys->ds_prev_snap_txg =
+		    clone_parent->ds_phys->ds_creation_txg;
+		dsphys->ds_used_bytes =
+		    clone_parent->ds_phys->ds_used_bytes;
+		dsphys->ds_compressed_bytes =
+		    clone_parent->ds_phys->ds_compressed_bytes;
+		dsphys->ds_uncompressed_bytes =
+		    clone_parent->ds_phys->ds_uncompressed_bytes;
+		dsphys->ds_bp = clone_parent->ds_phys->ds_bp;
+
+		dmu_buf_will_dirty(clone_parent->ds_dbuf, tx);
+		clone_parent->ds_phys->ds_num_children++;
+
+		dmu_buf_will_dirty(dd->dd_dbuf, tx);
+		dd->dd_phys->dd_clone_parent_obj = clone_parent->ds_object;
+	}
+	dmu_buf_rele(dbuf);
+
+	/* link the new dataset in as the new dir's head */
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+	dd->dd_phys->dd_head_dataset_obj = dsobj;
+	dsl_dir_close(dd, FTAG);
+
+	return (0);
+}
+
+
/*
 * Destroy the dataset or snapshot named by "name" ("pool/fs" or
 * "pool/fs@snap").  Waits for pending txgs to sync and retries the
 * sync task while it reports EAGAIN.  Returns 0 or an errno.
 */
int
dsl_dataset_destroy(const char *name)
{
	int err;
	dsl_pool_t *dp;
	dsl_dir_t *dd;
	const char *tail;

	dd = dsl_dir_open(name, FTAG, &tail);
	if (dd == NULL)
		return (ENOENT);

	dp = dd->dd_pool;
	if (tail != NULL) {
		/* A leftover component that isn't a snapshot is bogus. */
		if (tail[0] != '@') {
			dsl_dir_close(dd, FTAG);
			return (ENOENT);
		}
		tail++;
		/* Just blow away the snapshot */
		do {
			txg_wait_synced(dp, 0);
			err = dsl_dir_sync_task(dd,
			    dsl_dataset_destroy_sync, (void*)tail, 0);
		} while (err == EAGAIN);
		dsl_dir_close(dd, FTAG);
	} else {
		char buf[MAXNAMELEN];
		char *cp;

		dsl_dir_t *pds;
		/* The pool's root dir cannot be destroyed this way. */
		if (dd->dd_phys->dd_parent_obj == 0) {
			dsl_dir_close(dd, FTAG);
			return (EINVAL);
		}
		/*
		 * Make sure it's not dirty before we destroy it.
		 */
		txg_wait_synced(dd->dd_pool, 0);
		/*
		 * Blow away the dsl_dir + head dataset.
		 * dsl_dir_destroy_sync() will call
		 * dsl_dataset_destroy_sync() to destroy the head dataset.
		 */
		rw_enter(&dp->dp_config_rwlock, RW_READER);
		pds = dsl_dir_open_obj(dd->dd_pool,
		    dd->dd_phys->dd_parent_obj, NULL, FTAG);
		dsl_dir_close(dd, FTAG);
		rw_exit(&dp->dp_config_rwlock);

		(void) strcpy(buf, name);
		/*
		 * dd has a parent (checked above), so "name" should
		 * contain a '/'.  NOTE(review): strrchr() returning
		 * NULL here would be fatal -- confirm callers always
		 * pass a multi-component name.
		 */
		cp = strrchr(buf, '/') + 1;
		ASSERT(cp[0] != '\0');
		do {
			txg_wait_synced(dp, 0);
			err = dsl_dir_sync_task(pds,
			    dsl_dir_destroy_sync, cp, 0);
		} while (err == EAGAIN);
		dsl_dir_close(pds, FTAG);
	}

	return (err);
}
+
+int
+dsl_dataset_rollback(const char *name)
+{
+ int err;
+ dsl_dir_t *dd;
+ const char *tail;
+
+ dd = dsl_dir_open(name, FTAG, &tail);
+ if (dd == NULL)
+ return (ENOENT);
+
+ if (tail != NULL) {
+ dsl_dir_close(dd, FTAG);
+ return (EINVAL);
+ }
+ do {
+ txg_wait_synced(dd->dd_pool, 0);
+ err = dsl_dir_sync_task(dd,
+ dsl_dataset_rollback_sync, NULL, 0);
+ } while (err == EAGAIN);
+ dsl_dir_close(dd, FTAG);
+
+ return (err);
+}
+
+void *
+dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
+ void *p, dsl_dataset_evict_func_t func)
+{
+ void *old;
+
+ mutex_enter(&ds->ds_lock);
+ old = ds->ds_user_ptr;
+ if (old == NULL) {
+ ds->ds_user_ptr = p;
+ ds->ds_user_evict_func = func;
+ }
+ mutex_exit(&ds->ds_lock);
+ return (old);
+}
+
+void *
+dsl_dataset_get_user_ptr(dsl_dataset_t *ds)
+{
+ return (ds->ds_user_ptr);
+}
+
+
+void
+dsl_dataset_get_blkptr(dsl_dataset_t *ds, blkptr_t *bp)
+{
+ *bp = ds->ds_phys->ds_bp;
+}
+
+void
+dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+ /* If it's the meta-objset, set dp_meta_rootbp */
+ if (ds == NULL) {
+ tx->tx_pool->dp_meta_rootbp = *bp;
+ } else {
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_phys->ds_bp = *bp;
+ }
+}
+
+spa_t *
+dsl_dataset_get_spa(dsl_dataset_t *ds)
+{
+ return (ds->ds_dir->dd_pool->dp_spa);
+}
+
+void
+dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp;
+
+ if (ds == NULL) /* this is the meta-objset */
+ return;
+
+ ASSERT(ds->ds_user_ptr != NULL);
+ ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
+
+ dp = ds->ds_dir->dd_pool;
+
+ if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
+ /* up the hold count until we can be written out */
+ dmu_buf_add_ref(ds->ds_dbuf, ds);
+ }
+}
+
/*
 * Context handed through traverse_dsl_dataset() to kill_blkptr():
 * accumulators for the space being freed, plus the zio and tx used
 * to issue the frees.
 */
struct killarg {
	uint64_t *usedp;		/* accumulates BP_GET_ASIZE */
	uint64_t *compressedp;		/* accumulates BP_GET_PSIZE */
	uint64_t *uncompressedp;	/* accumulates BP_GET_UCSIZE */
	zio_t *zio;			/* parent zio for arc_free() calls */
	dmu_tx_t *tx;			/* txg in which blocks are freed */
};
+
+static int
+kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
+{
+ struct killarg *ka = arg;
+ blkptr_t *bp = &bc->bc_blkptr;
+
+ ASSERT3U(bc->bc_errno, ==, 0);
+
+ /*
+ * Since this callback is not called concurrently, no lock is
+ * needed on the accounting values.
+ */
+ *ka->usedp += BP_GET_ASIZE(bp);
+ *ka->compressedp += BP_GET_PSIZE(bp);
+ *ka->uncompressedp += BP_GET_UCSIZE(bp);
+ /* XXX check for EIO? */
+ (void) arc_free(ka->zio, spa, ka->tx->tx_txg, bp, NULL, NULL,
+ ARC_NOWAIT);
+ return (0);
+}
+
/*
 * Sync task: roll dd's head dataset back to its most recent snapshot.
 * Frees all blocks born since that snapshot and resets the dataset's
 * contents and stats to the snapshot's.  Returns 0, EINVAL (no head
 * dataset or no snapshot to roll back to), EBUSY (dataset is open),
 * or EAGAIN (dataset changed this txg; caller should retry).
 */
/* ARGSUSED */
int
dsl_dataset_rollback_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
{
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	dsl_dataset_t *ds;

	if (dd->dd_phys->dd_head_dataset_obj == 0)
		return (EINVAL);
	ds = dsl_dataset_open_obj(dd->dd_pool,
	    dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG);

	if (ds->ds_phys->ds_prev_snap_txg == 0) {
		/*
		 * There's no previous snapshot.  I suppose we could
		 * roll it back to being empty (and re-initialize the
		 * upper (ZPL) layer).  But for now there's no way to do
		 * this via the user interface.
		 */
		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
		return (EINVAL);
	}

	mutex_enter(&ds->ds_lock);
	if (ds->ds_open_refcount > 0) {
		mutex_exit(&ds->ds_lock);
		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
		return (EBUSY);
	}

	/*
	 * If we made changes this txg, traverse_dsl_dataset won't find
	 * them.  Try again.
	 */
	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) {
		mutex_exit(&ds->ds_lock);
		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
		return (EAGAIN);
	}

	/* THE POINT OF NO (unsuccessful) RETURN */
	/* Pin the refcount at max so nobody else can open the dataset. */
	ds->ds_open_refcount = DOS_REF_MAX;
	mutex_exit(&ds->ds_lock);

	dmu_buf_will_dirty(ds->ds_dbuf, tx);

	/* Zero out the deadlist. */
	dprintf("old deadlist obj = %llx\n", ds->ds_phys->ds_deadlist_obj);
	bplist_close(&ds->ds_deadlist);
	bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
	ds->ds_phys->ds_deadlist_obj =
	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
	bplist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
	dprintf("new deadlist obj = %llx\n", ds->ds_phys->ds_deadlist_obj);

	{
		/* Free blkptrs that we gave birth to */
		zio_t *zio;
		uint64_t used = 0, compressed = 0, uncompressed = 0;
		struct killarg ka;

		zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
		    ZIO_FLAG_MUSTSUCCEED);
		ka.usedp = &used;
		ka.compressedp = &compressed;
		ka.uncompressedp = &uncompressed;
		ka.zio = zio;
		ka.tx = tx;
		(void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
		    ADVANCE_POST, kill_blkptr, &ka);
		(void) zio_wait(zio);

		/* Credit the freed space back to the dsl_dir. */
		dsl_dir_diduse_space(dd,
		    -used, -compressed, -uncompressed, tx);
	}

	/* Change our contents to that of the prev snapshot (finally!) */
	ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj);
	ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
	ds->ds_phys->ds_used_bytes = ds->ds_prev->ds_phys->ds_used_bytes;
	ds->ds_phys->ds_compressed_bytes =
	    ds->ds_prev->ds_phys->ds_compressed_bytes;
	ds->ds_phys->ds_uncompressed_bytes =
	    ds->ds_prev->ds_phys->ds_uncompressed_bytes;
	ds->ds_phys->ds_restoring = ds->ds_prev->ds_phys->ds_restoring;
	ds->ds_phys->ds_unique_bytes = 0;

	/* The head and the snapshot now share everything. */
	dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
	ds->ds_prev->ds_phys->ds_unique_bytes = 0;

	dprintf("new deadlist obj = %llx\n", ds->ds_phys->ds_deadlist_obj);
	/* Drop the pin; the dataset may be opened again. */
	ds->ds_open_refcount = 0;
	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);

	return (0);
}
+
+int
+dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+ const char *snapname = arg;
+ uint64_t used = 0, compressed = 0, uncompressed = 0;
+ blkptr_t bp;
+ zio_t *zio;
+ int err;
+ int after_branch_point = FALSE;
+ int drop_lock = FALSE;
+ dsl_pool_t *dp = dd->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ dsl_dataset_t *ds, *ds_prev = NULL;
+ uint64_t obj;
+
+ if (dd->dd_phys->dd_head_dataset_obj == 0)
+ return (EINVAL);
+
+ if (!RW_WRITE_HELD(&dp->dp_config_rwlock)) {
+ rw_enter(&dp->dp_config_rwlock, RW_WRITER);
+ drop_lock = TRUE;
+ }
+
+ ds = dsl_dataset_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj, NULL,
+ snapname ? DS_MODE_NONE : DS_MODE_EXCLUSIVE, FTAG);
+
+ if (snapname) {
+ err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj,
+ snapname, 8, 1, &obj);
+ dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+ if (err) {
+ if (drop_lock)
+ rw_exit(&dp->dp_config_rwlock);
+ return (err);
+ }
+
+ ds = dsl_dataset_open_obj(dd->dd_pool, obj, NULL,
+ DS_MODE_EXCLUSIVE, FTAG);
+ }
+ if (ds == NULL) {
+ if (drop_lock)
+ rw_exit(&dp->dp_config_rwlock);
+ return (EBUSY);
+ }
+
+ obj = ds->ds_object;
+
+ /* Can't delete a branch point. */
+ if (ds->ds_phys->ds_num_children > 1) {
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ if (drop_lock)
+ rw_exit(&dp->dp_config_rwlock);
+ return (EINVAL);
+ }
+
+ /*
+ * Can't delete a head dataset if there are snapshots of it.
+ * (Except if the only snapshots are from the branch we cloned
+ * from.)
+ */
+ if (ds->ds_prev != NULL &&
+ ds->ds_prev->ds_phys->ds_next_snap_obj == obj) {
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ if (drop_lock)
+ rw_exit(&dp->dp_config_rwlock);
+ return (EINVAL);
+ }
+
+ /*
+ * If we made changes this txg, traverse_dsl_dataset won't find
+ * them. Try again.
+ */
+ if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) {
+ mutex_exit(&ds->ds_lock);
+ dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+ return (EAGAIN);
+ }
+
+ /* THE POINT OF NO (unsuccessful) RETURN */
+
+ if (ds->ds_phys->ds_prev_snap_obj != 0) {
+ if (ds->ds_prev) {
+ ds_prev = ds->ds_prev;
+ } else {
+ ds_prev = dsl_dataset_open_obj(dd->dd_pool,
+ ds->ds_phys->ds_prev_snap_obj, NULL,
+ DS_MODE_NONE, FTAG);
+ }
+ after_branch_point =
+ (ds_prev->ds_phys->ds_next_snap_obj != obj);
+
+ dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
+ if (after_branch_point &&
+ ds->ds_phys->ds_next_snap_obj == 0) {
+ /* This clone is toast. */
+ ASSERT(ds_prev->ds_phys->ds_num_children > 1);
+ ds_prev->ds_phys->ds_num_children--;
+ } else if (!after_branch_point) {
+ ds_prev->ds_phys->ds_next_snap_obj =
+ ds->ds_phys->ds_next_snap_obj;
+ }
+ }
+
+ ASSERT3P(tx->tx_pool, ==, dd->dd_pool);
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+
+ if (ds->ds_phys->ds_next_snap_obj != 0) {
+ dsl_dataset_t *ds_next;
+ uint64_t itor = 0;
+
+ spa_scrub_restart(dp->dp_spa, tx->tx_txg);
+
+ ds_next = dsl_dataset_open_obj(dd->dd_pool,
+ ds->ds_phys->ds_next_snap_obj, NULL, DS_MODE_NONE, FTAG);
+ ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
+
+ dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
+ ds_next->ds_phys->ds_prev_snap_obj =
+ ds->ds_phys->ds_prev_snap_obj;
+ ds_next->ds_phys->ds_prev_snap_txg =
+ ds->ds_phys->ds_prev_snap_txg;
+ ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
+ ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
+
+ /*
+ * Transfer to our deadlist (which will become next's
+ * new deadlist) any entries from next's current
+ * deadlist which were born before prev, and free the
+ * other entries.
+ *
+ * XXX we're doing this long task with the config lock held
+ */
+ while (bplist_iterate(&ds_next->ds_deadlist, &itor,
+ &bp) == 0) {
+ if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
+ bplist_enqueue(&ds->ds_deadlist, &bp, tx);
+ if (ds_prev && !after_branch_point &&
+ bp.blk_birth >
+ ds_prev->ds_phys->ds_prev_snap_txg) {
+ ds_prev->ds_phys->ds_unique_bytes +=
+ BP_GET_ASIZE(&bp);
+ }
+ } else {
+ used += BP_GET_ASIZE(&bp);
+ compressed += BP_GET_PSIZE(&bp);
+ uncompressed += BP_GET_UCSIZE(&bp);
+ /* XXX check return value? */
+ (void) arc_free(zio, dp->dp_spa, tx->tx_txg,
+ &bp, NULL, NULL, ARC_NOWAIT);
+ }
+ }
+
+ /* free next's deadlist */
+ bplist_close(&ds_next->ds_deadlist);
+ bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);
+
+ /* set next's deadlist to our deadlist */
+ ds_next->ds_phys->ds_deadlist_obj =
+ ds->ds_phys->ds_deadlist_obj;
+ bplist_open(&ds_next->ds_deadlist, mos,
+ ds_next->ds_phys->ds_deadlist_obj);
+ ds->ds_phys->ds_deadlist_obj = 0;
+
+ if (ds_next->ds_phys->ds_next_snap_obj != 0) {
+ /*
+ * Update next's unique to include blocks which
+ * were previously shared by only this snapshot
+ * and it. Those blocks will be born after the
+ * prev snap and before this snap, and will have
+ * died after the next snap and before the one
+ * after that (ie. be on the snap after next's
+ * deadlist).
+ *
+ * XXX we're doing this long task with the
+ * config lock held
+ */
+ dsl_dataset_t *ds_after_next;
+
+ ds_after_next = dsl_dataset_open_obj(dd->dd_pool,
+ ds_next->ds_phys->ds_next_snap_obj, NULL,
+ DS_MODE_NONE, FTAG);
+ itor = 0;
+ while (bplist_iterate(&ds_after_next->ds_deadlist,
+ &itor, &bp) == 0) {
+ if (bp.blk_birth >
+ ds->ds_phys->ds_prev_snap_txg &&
+ bp.blk_birth <=
+ ds->ds_phys->ds_creation_txg) {
+ ds_next->ds_phys->ds_unique_bytes +=
+ BP_GET_ASIZE(&bp);
+ }
+ }
+
+ dsl_dataset_close(ds_after_next, DS_MODE_NONE, FTAG);
+ ASSERT3P(ds_next->ds_prev, ==, NULL);
+ } else {
+ /*
+ * It would be nice to update the head dataset's
+ * unique. To do so we would have to traverse
+ * it for blocks born after ds_prev, which is
+ * pretty expensive just to maintain something
+ * for debugging purposes.
+ */
+ ASSERT3P(ds_next->ds_prev, ==, ds);
+ dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE,
+ ds_next);
+ if (ds_prev) {
+ ds_next->ds_prev = dsl_dataset_open_obj(
+ dd->dd_pool, ds->ds_phys->ds_prev_snap_obj,
+ NULL, DS_MODE_NONE, ds_next);
+ } else {
+ ds_next->ds_prev = NULL;
+ }
+ }
+ dsl_dataset_close(ds_next, DS_MODE_NONE, FTAG);
+
+ /*
+ * NB: unique_bytes is not accurate for head objsets
+ * because we don't update it when we delete the most
+ * recent snapshot -- see above comment.
+ */
+ ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
+ } else {
+ /*
+ * There's no next snapshot, so this is a head dataset.
+ * Destroy the deadlist. Unless it's a clone, the
+ * deadlist should be empty. (If it's a clone, it's
+ * safe to ignore the deadlist contents.)
+ */
+ struct killarg ka;
+
+ ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist));
+ bplist_close(&ds->ds_deadlist);
+ bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
+ ds->ds_phys->ds_deadlist_obj = 0;
+
+ /*
+ * Free everything that we point to (that's born after
+ * the previous snapshot, if we are a clone)
+ *
+ * XXX we're doing this long task with the config lock held
+ */
+ ka.usedp = &used;
+ ka.compressedp = &compressed;
+ ka.uncompressedp = &uncompressed;
+ ka.zio = zio;
+ ka.tx = tx;
+ err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
+ ADVANCE_POST, kill_blkptr, &ka);
+ ASSERT3U(err, ==, 0);
+ }
+
+ err = zio_wait(zio);
+ ASSERT3U(err, ==, 0);
+
+ dsl_dir_diduse_space(dd, -used, -compressed, -uncompressed, tx);
+
+ if (ds->ds_phys->ds_snapnames_zapobj) {
+ err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
+ ASSERT(err == 0);
+ }
+
+ if (dd->dd_phys->dd_head_dataset_obj == ds->ds_object) {
+ /* Erase the link in the dataset */
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dd->dd_phys->dd_head_dataset_obj = 0;
+ /*
+ * dsl_dir_sync_destroy() called us, they'll destroy
+ * the dataset.
+ */
+ } else {
+ /* remove from snapshot namespace */
+ dsl_dataset_t *ds_head;
+ ds_head = dsl_dataset_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG);
+#ifdef ZFS_DEBUG
+ {
+ uint64_t val;
+ err = zap_lookup(mos,
+ ds_head->ds_phys->ds_snapnames_zapobj,
+ snapname, 8, 1, &val);
+ ASSERT3U(err, ==, 0);
+ ASSERT3U(val, ==, obj);
+ }
+#endif
+ err = zap_remove(mos, ds_head->ds_phys->ds_snapnames_zapobj,
+ snapname, tx);
+ ASSERT(err == 0);
+ dsl_dataset_close(ds_head, DS_MODE_NONE, FTAG);
+ }
+
+ if (ds_prev && ds->ds_prev != ds_prev)
+ dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG);
+
+ err = dmu_object_free(mos, obj, tx);
+ ASSERT(err == 0);
+
+ /*
+ * Close the objset with mode NONE, thus leaving it with
+ * DOS_REF_MAX set, so that noone can access it.
+ */
+ dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+
+ if (drop_lock)
+ rw_exit(&dp->dp_config_rwlock);
+ return (0);
+}
+
/*
 * Sync task: take a snapshot named "arg" of dd's head dataset.
 * Allocates a new dataset object capturing the head's current bp and
 * stats, links it into the snapshot chain, and gives the head a fresh
 * (empty) deadlist.  Returns 0, EINVAL (no head dataset), or EEXIST
 * (snapshot name already in use).
 */
int
dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
{
	const char *snapname = arg;
	dsl_pool_t *dp = dd->dd_pool;
	dmu_buf_t *dbuf;
	dsl_dataset_phys_t *dsphys;
	uint64_t dsobj, value;
	objset_t *mos = dp->dp_meta_objset;
	dsl_dataset_t *ds;
	int err;

	ASSERT(dmu_tx_is_syncing(tx));

	if (dd->dd_phys->dd_head_dataset_obj == 0)
		return (EINVAL);
	ds = dsl_dataset_open_obj(dp, dd->dd_phys->dd_head_dataset_obj, NULL,
	    DS_MODE_NONE, FTAG);

	/* Make sure the requested snapshot name is not already taken. */
	err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj,
	    snapname, 8, 1, &value);
	if (err == 0) {
		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
		return (EEXIST);
	}
	ASSERT(err == ENOENT);

	/* The point of no (unsuccessful) return */

	dprintf_dd(dd, "taking snapshot %s in txg %llu\n",
	    snapname, tx->tx_txg);

	spa_scrub_restart(dp->dp_spa, tx->tx_txg);

	rw_enter(&dp->dp_config_rwlock, RW_WRITER);

	/* The snapshot object is a copy of the head's current state. */
	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_OBJSET, 0,
	    DMU_OT_DSL_OBJSET, sizeof (dsl_dataset_phys_t), tx);
	dbuf = dmu_bonus_hold(mos, dsobj);
	dmu_buf_will_dirty(dbuf, tx);
	dsphys = dbuf->db_data;
	dsphys->ds_dir_obj = dd->dd_object;
	dsphys->ds_fsid_guid = unique_create();
	unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
	    sizeof (dsphys->ds_guid));
	dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
	dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
	dsphys->ds_next_snap_obj = ds->ds_object;
	dsphys->ds_num_children = 1;
	dsphys->ds_creation_time = gethrestime_sec();
	dsphys->ds_creation_txg = tx->tx_txg;
	/* The snapshot inherits the head's deadlist; head gets a new one. */
	dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
	dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
	dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
	dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
	dsphys->ds_restoring = ds->ds_phys->ds_restoring;
	dsphys->ds_bp = ds->ds_phys->ds_bp;
	dmu_buf_rele(dbuf);

	if (ds->ds_phys->ds_prev_snap_obj != 0) {
		/* Splice the new snapshot in after the previous one. */
		dsl_dataset_t *ds_prev;

		ds_prev = dsl_dataset_open_obj(dp,
		    ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_NONE, FTAG);
		ASSERT(ds_prev->ds_phys->ds_next_snap_obj ==
		    ds->ds_object ||
		    ds_prev->ds_phys->ds_num_children > 1);
		if (ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
			dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
			ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
			    ds_prev->ds_phys->ds_creation_txg);
			ds_prev->ds_phys->ds_next_snap_obj = dsobj;
		}
		dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG);
	} else {
		ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, 0);
	}

	bplist_close(&ds->ds_deadlist);
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, dsphys->ds_creation_txg);
	ds->ds_phys->ds_prev_snap_obj = dsobj;
	ds->ds_phys->ds_prev_snap_txg = dsphys->ds_creation_txg;
	ds->ds_phys->ds_unique_bytes = 0;
	ds->ds_phys->ds_deadlist_obj =
	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
	bplist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);

	dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
	err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
	    snapname, 8, 1, &dsobj, tx);
	ASSERT(err == 0);

	/* Re-point ds_prev at the newly created snapshot. */
	if (ds->ds_prev)
		dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
	ds->ds_prev = dsl_dataset_open_obj(dp,
	    ds->ds_phys->ds_prev_snap_obj, snapname, DS_MODE_NONE, ds);

	rw_exit(&dp->dp_config_rwlock);
	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);

	return (0);
}
+
/*
 * Write out this (head) dataset's dirty state for the txg.  Called
 * from syncing context for each dataset on the pool's dirty list.
 */
void
dsl_dataset_sync(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(ds->ds_user_ptr != NULL);
	/* Only heads (no next snapshot) are ever dirtied. */
	ASSERT(ds->ds_phys->ds_next_snap_obj == 0);

	dmu_objset_sync(ds->ds_user_ptr, tx);
	dsl_dir_dirty(ds->ds_dir, tx);
	bplist_close(&ds->ds_deadlist);

	/* Release the hold taken by dsl_dataset_dirty(). */
	dmu_buf_remove_ref(ds->ds_dbuf, ds);
}
+
+void
+dsl_dataset_stats(dsl_dataset_t *ds, dmu_objset_stats_t *dds)
+{
+ /* fill in properties crap */
+ dsl_dir_stats(ds->ds_dir, dds);
+
+ if (ds->ds_phys->ds_num_children != 0) {
+ dds->dds_is_snapshot = TRUE;
+ dds->dds_num_clones = ds->ds_phys->ds_num_children - 1;
+ }
+
+ dds->dds_last_txg = ds->ds_phys->ds_bp.blk_birth;
+
+ dds->dds_objects_used = ds->ds_phys->ds_bp.blk_fill;
+ dds->dds_objects_avail = DN_MAX_OBJECT - dds->dds_objects_used;
+
+ /* We override the dataset's creation time... they should be the same */
+ dds->dds_creation_time = ds->ds_phys->ds_creation_time;
+ dds->dds_creation_txg = ds->ds_phys->ds_creation_txg;
+ dds->dds_space_refd = ds->ds_phys->ds_used_bytes;
+ dds->dds_fsid_guid = ds->ds_phys->ds_fsid_guid;
+ dds->dds_guid = ds->ds_phys->ds_guid;
+
+ if (ds->ds_phys->ds_next_snap_obj) {
+ /*
+ * This is a snapshot; override the dd's space used with
+ * our unique space
+ */
+ dds->dds_space_used = ds->ds_phys->ds_unique_bytes;
+ dds->dds_compressed_bytes =
+ ds->ds_phys->ds_compressed_bytes;
+ dds->dds_uncompressed_bytes =
+ ds->ds_phys->ds_uncompressed_bytes;
+ }
+
+ dds->dds_objset_obj = ds->ds_object;
+}
+
+dsl_pool_t *
+dsl_dataset_pool(dsl_dataset_t *ds)
+{
+ return (ds->ds_dir->dd_pool);
+}
+
/*
 * Argument bundle for dsl_dataset_snapshot_rename_sync(): the full
 * old and new names of the snapshot being renamed.
 */
struct osrenamearg {
	const char *oldname;	/* e.g. "pool/fs@old" */
	const char *newname;	/* e.g. "pool/fs@new" */
};
+
+static int
+dsl_dataset_snapshot_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+ struct osrenamearg *ora = arg;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ dsl_dir_t *nds;
+ const char *tail;
+ int err;
+ dsl_dataset_t *snds, *fsds;
+ uint64_t val;
+
+ err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, ora->oldname,
+ DS_MODE_READONLY | DS_MODE_STANDARD, FTAG, &snds);
+ if (err)
+ return (err);
+
+ if (snds->ds_dir != dd) {
+ dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
+ return (EINVAL);
+ }
+
+ /* better be changing a snapshot */
+ if (snds->ds_phys->ds_next_snap_obj == 0) {
+ dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
+ return (EINVAL);
+ }
+
+ /* new fs better exist */
+ nds = dsl_dir_open_spa(dd->dd_pool->dp_spa, ora->newname, FTAG, &tail);
+ if (nds == NULL) {
+ dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
+ return (ENOENT);
+ }
+
+ dsl_dir_close(nds, FTAG);
+
+ /* new name better be in same fs */
+ if (nds != dd) {
+ dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
+ return (EINVAL);
+ }
+
+ /* new name better be a snapshot */
+ if (tail == NULL || tail[0] != '@') {
+ dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
+ return (EINVAL);
+ }
+
+ tail++;
+
+ fsds = dsl_dataset_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG);
+
+ /* new name better not be in use */
+ err = zap_lookup(mos, fsds->ds_phys->ds_snapnames_zapobj,
+ tail, 8, 1, &val);
+ if (err != ENOENT) {
+ if (err == 0)
+ err = EEXIST;
+ dsl_dataset_close(fsds, DS_MODE_NONE, FTAG);
+ dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
+ return (EEXIST);
+ }
+
+ /* The point of no (unsuccessful) return */
+
+ rw_enter(&dd->dd_pool->dp_config_rwlock, RW_WRITER);
+ dsl_dataset_get_snapname(snds);
+ err = zap_remove(mos, fsds->ds_phys->ds_snapnames_zapobj,
+ snds->ds_snapname, tx);
+ ASSERT3U(err, ==, 0);
+ mutex_enter(&snds->ds_lock);
+ (void) strcpy(snds->ds_snapname, tail);
+ mutex_exit(&snds->ds_lock);
+ err = zap_add(mos, fsds->ds_phys->ds_snapnames_zapobj,
+ snds->ds_snapname, 8, 1, &snds->ds_object, tx);
+ ASSERT3U(err, ==, 0);
+ rw_exit(&dd->dd_pool->dp_config_rwlock);
+
+ dsl_dataset_close(fsds, DS_MODE_NONE, FTAG);
+ dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
+ return (0);
+}
+
+#pragma weak dmu_objset_rename = dsl_dataset_rename
+int
+dsl_dataset_rename(const char *osname, const char *newname)
+{
+ dsl_dir_t *dd;
+ const char *tail;
+ struct osrenamearg ora;
+ int err;
+
+ dd = dsl_dir_open(osname, FTAG, &tail);
+ if (dd == NULL)
+ return (ENOENT);
+ if (tail == NULL) {
+ err = dsl_dir_sync_task(dd,
+ dsl_dir_rename_sync, (void*)newname, 1<<12);
+ dsl_dir_close(dd, FTAG);
+ return (err);
+ }
+ if (tail[0] != '@') {
+ /* the name ended in a nonexistant component */
+ dsl_dir_close(dd, FTAG);
+ return (ENOENT);
+ }
+
+ ora.oldname = osname;
+ ora.newname = newname;
+
+ err = dsl_dir_sync_task(dd,
+ dsl_dataset_snapshot_rename_sync, &ora, 1<<12);
+ dsl_dir_close(dd, FTAG);
+ return (err);
+}
diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c
new file mode 100644
index 0000000000..3b0d32de70
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c
@@ -0,0 +1,1217 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/arc.h>
+#include "zfs_namecheck.h"
+
+static uint64_t dsl_dir_space_accounted(dsl_dir_t *dd);
+static uint64_t dsl_dir_estimated_space(dsl_dir_t *dd);
+static int dsl_dir_set_reservation_sync(dsl_dir_t *dd,
+ void *arg, dmu_tx_t *tx);
+static uint64_t dsl_dir_space_available(dsl_dir_t *dd,
+ dsl_dir_t *ancestor, int64_t delta, int ondiskonly);
+
+
+/* ARGSUSED */
+static void
+dsl_dir_evict(dmu_buf_t *db, void *arg)
+{
+ dsl_dir_t *dd = arg;
+ dsl_pool_t *dp = dd->dd_pool;
+ int t;
+
+ for (t = 0; t < TXG_SIZE; t++) {
+ ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
+ ASSERT(dd->dd_tempreserved[t] == 0);
+ ASSERT(dd->dd_space_towrite[t] == 0);
+ }
+
+ ASSERT3U(dd->dd_used_bytes, ==, dd->dd_phys->dd_used_bytes);
+
+ ASSERT(dd->dd_sync_txg == 0);
+
+ if (dd->dd_parent)
+ dsl_dir_close(dd->dd_parent, dd);
+
+ spa_close(dd->dd_pool->dp_spa, dd);
+
+ /*
+ * The props callback list should be empty since they hold the
+ * dir open.
+ */
+ list_destroy(&dd->dd_prop_cbs);
+ kmem_free(dd, sizeof (dsl_dir_t));
+}
+
/*
 * Find or instantiate the dsl_dir_t for object "ddobj", adding a hold
 * for "tag" (release with dsl_dir_close()).  "tail", if non-NULL, is
 * the dir's name in its parent's child zap, saving a reverse lookup.
 * Caller must hold dp_config_rwlock or be in syncing context.
 */
dsl_dir_t *
dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
    const char *tail, void *tag)
{
	dmu_buf_t *dbuf;
	dsl_dir_t *dd;

	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
	    dsl_pool_sync_context(dp));

	dbuf = dmu_bonus_hold_tag(dp->dp_meta_objset, ddobj, tag);
	dmu_buf_read(dbuf);
	dd = dmu_buf_get_user(dbuf);
#ifdef ZFS_DEBUG
	{
		dmu_object_info_t doi;
		dmu_object_info_from_db(dbuf, &doi);
		/*
		 * NOTE(review): dir objects apparently share the
		 * DMU_OT_DSL_DATASET type -- confirm against the
		 * allocation in dsl_dir_create_sync().
		 */
		ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DATASET);
	}
#endif
	/* XXX assert bonus buffer size is correct */
	if (dd == NULL) {
		/* First opener: build the in-core structure. */
		dsl_dir_t *winner;
		int err;

		dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
		dd->dd_object = ddobj;
		dd->dd_dbuf = dbuf;
		dd->dd_pool = dp;
		dd->dd_phys = dbuf->db_data;
		dd->dd_used_bytes = dd->dd_phys->dd_used_bytes;

		list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
		    offsetof(dsl_prop_cb_record_t, cbr_node));

		if (dd->dd_phys->dd_parent_obj) {
			dd->dd_parent = dsl_dir_open_obj(dp,
			    dd->dd_phys->dd_parent_obj, NULL, dd);
			if (tail) {
#ifdef ZFS_DEBUG
				uint64_t foundobj;

				err = zap_lookup(dp->dp_meta_objset,
				    dd->dd_parent->dd_phys->
				    dd_child_dir_zapobj,
				    tail, sizeof (foundobj), 1, &foundobj);
				ASSERT3U(err, ==, 0);
				ASSERT3U(foundobj, ==, ddobj);
#endif
				(void) strcpy(dd->dd_myname, tail);
			} else {
				/* Reverse-lookup our name in the parent. */
				err = zap_value_search(dp->dp_meta_objset,
				    dd->dd_parent->dd_phys->
				    dd_child_dir_zapobj,
				    ddobj, dd->dd_myname);
				/*
				 * The caller should be protecting this ddobj
				 * from being deleted concurrently
				 */
				ASSERT(err == 0);
			}
		} else {
			(void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
		}

		/* Another thread may have raced us; defer to the winner. */
		winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
		    dsl_dir_evict);
		if (winner) {
			if (dd->dd_parent)
				dsl_dir_close(dd->dd_parent, dd);
			kmem_free(dd, sizeof (dsl_dir_t));
			dd = winner;
		} else {
			spa_open_ref(dp->dp_spa, dd);
		}
	}

	/*
	 * The dsl_dir_t has both open-to-close and instantiate-to-evict
	 * holds on the spa.  We need the open-to-close holds because
	 * otherwise the spa_refcnt wouldn't change when we open a
	 * dir which the spa also has open, so we could incorrectly
	 * think it was OK to unload/export/destroy the pool.  We need
	 * the instantiate-to-evict hold because the dsl_dir_t has a
	 * pointer to the dd_pool, which has a pointer to the spa_t.
	 */
	spa_open_ref(dp->dp_spa, tag);
	ASSERT3P(dd->dd_pool, ==, dp);
	ASSERT3U(dd->dd_object, ==, ddobj);
	ASSERT3P(dd->dd_dbuf, ==, dbuf);
	return (dd);
}
+
+void
+dsl_dir_close(dsl_dir_t *dd, void *tag)
+{
+ dprintf_dd(dd, "%s\n", "");
+ spa_close(dd->dd_pool->dp_spa, tag);
+ dmu_buf_rele_tag(dd->dd_dbuf, tag);
+}
+
+/* buf must be long enough (MAXNAMELEN should do) */
+void
+dsl_dir_name(dsl_dir_t *dd, char *buf)
+{
+ if (dd->dd_parent) {
+ dsl_dir_name(dd->dd_parent, buf);
+ (void) strcat(buf, "/");
+ } else {
+ buf[0] = '\0';
+ }
+ if (!MUTEX_HELD(&dd->dd_lock)) {
+ /*
+ * recursive mutex so that we can use
+ * dprintf_dd() with dd_lock held
+ */
+ mutex_enter(&dd->dd_lock);
+ (void) strcat(buf, dd->dd_myname);
+ mutex_exit(&dd->dd_lock);
+ } else {
+ (void) strcat(buf, dd->dd_myname);
+ }
+}
+
+int
+dsl_dir_is_private(dsl_dir_t *dd)
+{
+ int rv = FALSE;
+
+ if (dd->dd_parent && dsl_dir_is_private(dd->dd_parent))
+ rv = TRUE;
+ if (dataset_name_hidden(dd->dd_myname))
+ rv = TRUE;
+ return (rv);
+}
+
+
+static int
+getcomponent(const char *path, char *component, const char **nextp)
+{
+ char *p;
+ if (path == NULL)
+ return (NULL);
+ /* This would be a good place to reserve some namespace... */
+ p = strpbrk(path, "/@");
+ if (p && (p[1] == '/' || p[1] == '@')) {
+ /* two separators in a row */
+ return (EINVAL);
+ }
+ if (p == NULL || p == path) {
+ /*
+ * if the first thing is an @ or /, it had better be an
+ * @ and it had better not have any more ats or slashes,
+ * and it had better have something after the @.
+ */
+ if (p != NULL &&
+ (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
+ return (EINVAL);
+ if (strlen(path) >= MAXNAMELEN)
+ return (ENAMETOOLONG);
+ (void) strcpy(component, path);
+ p = NULL;
+ } else if (p[0] == '/') {
+ if (p-path >= MAXNAMELEN)
+ return (ENAMETOOLONG);
+ (void) strncpy(component, path, p - path);
+ component[p-path] = '\0';
+ p++;
+ } else if (p[0] == '@') {
+ /*
+ * if the next separator is an @, there better not be
+ * any more slashes.
+ */
+ if (strchr(path, '/'))
+ return (EINVAL);
+ if (p-path >= MAXNAMELEN)
+ return (ENAMETOOLONG);
+ (void) strncpy(component, path, p - path);
+ component[p-path] = '\0';
+ } else {
+ ASSERT(!"invalid p");
+ }
+ *nextp = p;
+ return (0);
+}
+
/*
 * Same as dsl_dir_open(), but ignore the pool component of "name" and
 * use the given spa instead (opening the pool named by the first
 * component if spa == NULL).
 */
dsl_dir_t *
dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp)
{
	char buf[MAXNAMELEN];
	const char *next, *nextnext = NULL;
	int err;
	dsl_dir_t *dd;
	dsl_pool_t *dp;
	uint64_t ddobj;
	int openedspa = FALSE;

	dprintf("%s\n", name);

	if (name == NULL)
		return (NULL);
	/* Strip off the pool name; "next" points past it. */
	err = getcomponent(name, buf, &next);
	if (err)
		return (NULL);
	if (spa == NULL) {
		err = spa_open(buf, &spa, FTAG);
		if (err) {
			dprintf("spa_open(%s) failed\n", buf);
			return (NULL);
		}
		openedspa = TRUE;

		/* XXX this assertion belongs in spa_open */
		ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa)));
	}

	dp = spa_get_dsl(spa);

	/* Walk down from the root dir one component at a time. */
	rw_enter(&dp->dp_config_rwlock, RW_READER);
	dd = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag);
	while (next != NULL) {
		dsl_dir_t *child_ds;
		err = getcomponent(next, buf, &nextnext);
		if (err) {
			dsl_dir_close(dd, tag);
			if (openedspa)
				spa_close(spa, FTAG);
			return (NULL);
		}
		ASSERT(next[0] != '\0');
		/* Stop at a snapshot component or a childless dir. */
		if (next[0] == '@')
			break;
		if (dd->dd_phys->dd_child_dir_zapobj == 0)
			break;
		dprintf("looking up %s in obj%lld\n",
		    buf, dd->dd_phys->dd_child_dir_zapobj);

		err = zap_lookup(dp->dp_meta_objset,
		    dd->dd_phys->dd_child_dir_zapobj,
		    buf, sizeof (ddobj), 1, &ddobj);
		if (err == ENOENT) {
			break;
		}
		ASSERT(err == 0);

		child_ds = dsl_dir_open_obj(dp, ddobj, buf, tag);
		dsl_dir_close(dd, tag);
		dd = child_ds;
		next = nextnext;
	}
	rw_exit(&dp->dp_config_rwlock);

	/*
	 * It's an error if there's more than one component left, or
	 * tailp==NULL and there's any component left.
	 */
	if (next != NULL &&
	    (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
		/* bad path name */
		dsl_dir_close(dd, tag);
		dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
		next = NULL;
		dd = NULL;
	}
	if (tailp)
		*tailp = next;
	if (openedspa)
		spa_close(spa, FTAG);
	return (dd);
}
+
+/*
+ * Return the dsl_dir_t, and possibly the last component which couldn't
+ * be found in *tail. Return NULL if the path is bogus, or if
+ * tail==NULL and we couldn't parse the whole name. (*tail)[0] == '@'
+ * means that the last component is a snapshot.
+ */
+dsl_dir_t *
+dsl_dir_open(const char *name, void *tag, const char **tailp)
+{
+	/* spa==NULL: the pool is named by the first component of 'name' */
+	return (dsl_dir_open_spa(NULL, name, tag, tailp));
+}
+
+/*
+ * Create a child dsl_dir named 'name' under 'pds', in syncing context.
+ * Allocates the directory object, links it into the parent's child-dir
+ * zap, and initializes its phys block (creation time, parent, props and
+ * child zaps).  Returns 0, EEXIST if the name already exists, or the
+ * zap_lookup error.
+ */
+int
+dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx)
+{
+	objset_t *mos = pds->dd_pool->dp_meta_objset;
+	uint64_t ddobj;
+	dsl_dir_phys_t *dsphys;
+	dmu_buf_t *dbuf;
+	int err;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	/* lazily create the parent's child-dir zap on first child */
+	if (pds->dd_phys->dd_child_dir_zapobj == 0) {
+		dmu_buf_will_dirty(pds->dd_dbuf, tx);
+		pds->dd_phys->dd_child_dir_zapobj = zap_create(mos,
+		    DMU_OT_DSL_DATASET_CHILD_MAP, DMU_OT_NONE, 0, tx);
+	}
+
+	rw_enter(&pds->dd_pool->dp_config_rwlock, RW_WRITER);
+	err = zap_lookup(mos, pds->dd_phys->dd_child_dir_zapobj,
+	    name, sizeof (uint64_t), 1, &ddobj);
+	if (err != ENOENT) {
+		rw_exit(&pds->dd_pool->dp_config_rwlock);
+		return (err ? err : EEXIST);
+	}
+
+	ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
+	    DMU_OT_DSL_DATASET, sizeof (dsl_dir_phys_t), tx);
+	err = zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
+	    name, sizeof (uint64_t), 1, &ddobj, tx);
+	ASSERT3U(err, ==, 0);
+	dprintf("dataset_create: zap_add %s->%lld to %lld returned %d\n",
+	    name, ddobj, pds->dd_phys->dd_child_dir_zapobj, err);
+
+	dbuf = dmu_bonus_hold(mos, ddobj);
+	dmu_buf_will_dirty(dbuf, tx);
+	dsphys = dbuf->db_data;
+
+	dsphys->dd_creation_time = gethrestime_sec();
+	dsphys->dd_parent_obj = pds->dd_object;
+	dsphys->dd_props_zapobj = zap_create(mos,
+	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
+	dsphys->dd_child_dir_zapobj = zap_create(mos,
+	    DMU_OT_DSL_DATASET_CHILD_MAP, DMU_OT_NONE, 0, tx);
+	dmu_buf_rele(dbuf);
+
+	rw_exit(&pds->dd_pool->dp_config_rwlock);
+
+	return (0);
+}
+
+/*
+ * Destroy the child dir named by 'arg' under 'pds', in syncing context.
+ * Fails with EBUSY if the dir is held elsewhere and EEXIST if it still
+ * has children.  Destroys the head dataset (if any), zeroes the
+ * reservation so the parent's accounting is updated, then frees the
+ * dir object and its zaps and unlinks it from the parent.
+ */
+int
+dsl_dir_destroy_sync(dsl_dir_t *pds, void *arg, dmu_tx_t *tx)
+{
+	const char *name = arg;
+	dsl_dir_t *dd = NULL;
+	dsl_pool_t *dp = pds->dd_pool;
+	objset_t *mos = dp->dp_meta_objset;
+	uint64_t val, obj, child_zapobj, props_zapobj;
+	int t, err;
+
+	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
+
+	err = zap_lookup(mos, pds->dd_phys->dd_child_dir_zapobj, name,
+	    8, 1, &obj);
+	if (err)
+		goto out;
+
+	dd = dsl_dir_open_obj(dp, obj, name, FTAG);
+	ASSERT3U(dd->dd_phys->dd_parent_obj, ==, pds->dd_object);
+
+	if (dmu_buf_refcount(dd->dd_dbuf) > 1) {
+		err = EBUSY;
+		goto out;
+	}
+
+	for (t = 0; t < TXG_SIZE; t++) {
+		/*
+		 * if they were dirty, they'd also be open.
+		 * dp_config_rwlock ensures that it stays that way.
+		 */
+		ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
+	}
+
+	child_zapobj = dd->dd_phys->dd_child_dir_zapobj;
+	props_zapobj = dd->dd_phys->dd_props_zapobj;
+
+	if (child_zapobj != 0) {
+		uint64_t count;
+		err = EEXIST;
+		(void) zap_count(mos, child_zapobj, &count);
+		if (count != 0)
+			goto out;
+	}
+
+	if (dd->dd_phys->dd_head_dataset_obj != 0) {
+		err = dsl_dataset_destroy_sync(dd, NULL, tx);
+		if (err)
+			goto out;
+	}
+	ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
+
+	/* The point of no (unsuccessful) return */
+
+	/* Make sure parent's used gets updated */
+	val = 0;
+	err = dsl_dir_set_reservation_sync(dd, &val, tx);
+	ASSERT(err == 0);
+	ASSERT3U(dd->dd_used_bytes, ==, 0);
+	ASSERT3U(dd->dd_phys->dd_reserved, ==, 0);
+	dsl_dir_close(dd, FTAG);
+	dd = NULL;
+
+	err = dmu_object_free(mos, obj, tx);
+	ASSERT(err == 0);
+
+	/*
+	 * NOTE(review): the ASSERTs below are not inside the if bodies;
+	 * when a zapobj is 0 they re-check the previous err (still 0 here).
+	 */
+	if (child_zapobj)
+		err = zap_destroy(mos, child_zapobj, tx);
+	ASSERT(err == 0);
+
+	if (props_zapobj)
+		err = zap_destroy(mos, props_zapobj, tx);
+	ASSERT(err == 0);
+
+	err = zap_remove(mos, pds->dd_phys->dd_child_dir_zapobj, name, tx);
+	ASSERT(err == 0);
+
+out:
+	rw_exit(&dp->dp_config_rwlock);
+	if (dd)
+		dsl_dir_close(dd, FTAG);
+
+	return (err);
+}
+
+/*
+ * Create the root dsl_dir object for a new pool and record it under
+ * DMU_POOL_ROOT_DATASET in the pool directory.  *ddobjp receives the
+ * new object number.
+ */
+void
+dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx)
+{
+	dsl_dir_phys_t *dsp;
+	dmu_buf_t *dbuf;
+	int error;
+
+	*ddobjp = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
+	    DMU_OT_DSL_DATASET, sizeof (dsl_dir_phys_t), tx);
+
+	error = zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET,
+	    sizeof (uint64_t), 1, ddobjp, tx);
+	ASSERT3U(error, ==, 0);
+
+	dbuf = dmu_bonus_hold(mos, *ddobjp);
+	dmu_buf_will_dirty(dbuf, tx);
+	dsp = dbuf->db_data;
+
+	/* the root has no parent obj; everything else starts zeroed */
+	dsp->dd_creation_time = gethrestime_sec();
+	dsp->dd_props_zapobj = zap_create(mos,
+	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
+	dsp->dd_child_dir_zapobj = zap_create(mos,
+	    DMU_OT_DSL_DATASET_CHILD_MAP, DMU_OT_NONE, 0, tx);
+
+	dmu_buf_rele(dbuf);
+}
+
+/*
+ * Fill in *dds with this dir's statistics.  Space/quota fields are read
+ * under dd_lock; the clone-parent name is resolved under the pool
+ * config lock (reader).  dd_creation_time is read without dd_lock --
+ * presumably immutable after creation; confirm before relying on it.
+ */
+void
+dsl_dir_stats(dsl_dir_t *dd, dmu_objset_stats_t *dds)
+{
+	bzero(dds, sizeof (dmu_objset_stats_t));
+
+	dds->dds_dir_obj = dd->dd_object;
+	dds->dds_available = dsl_dir_space_available(dd, NULL, 0, TRUE);
+
+	mutex_enter(&dd->dd_lock);
+	dds->dds_space_used = dd->dd_used_bytes;
+	dds->dds_compressed_bytes = dd->dd_phys->dd_compressed_bytes;
+	dds->dds_uncompressed_bytes = dd->dd_phys->dd_uncompressed_bytes;
+	dds->dds_quota = dd->dd_phys->dd_quota;
+	dds->dds_reserved = dd->dd_phys->dd_reserved;
+	mutex_exit(&dd->dd_lock);
+
+	dds->dds_creation_time = dd->dd_phys->dd_creation_time;
+
+	/* a dir with no head dataset only exists to hold children */
+	dds->dds_is_placeholder = (dd->dd_phys->dd_head_dataset_obj == 0);
+
+	if (dd->dd_phys->dd_clone_parent_obj) {
+		dsl_dataset_t *ds;
+
+		rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+		ds = dsl_dataset_open_obj(dd->dd_pool,
+		    dd->dd_phys->dd_clone_parent_obj, NULL, DS_MODE_NONE, FTAG);
+		dsl_dataset_name(ds, dds->dds_clone_of);
+		dds->dds_clone_of_obj = dd->dd_phys->dd_clone_parent_obj;
+		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+		rw_exit(&dd->dd_pool->dp_config_rwlock);
+	}
+
+	VERIFY(dsl_prop_get_ds_integer(dd, "checksum",
+	    &dds->dds_checksum, dds->dds_checksum_setpoint) == 0);
+
+	VERIFY(dsl_prop_get_ds_integer(dd, "compression",
+	    &dds->dds_compression, dds->dds_compression_setpoint) == 0);
+
+	VERIFY(dsl_prop_get_ds_integer(dd, "zoned",
+	    &dds->dds_zoned, dds->dds_zoned_setpoint) == 0);
+
+	spa_altroot(dd->dd_pool->dp_spa, dds->dds_altroot,
+	    sizeof (dds->dds_altroot));
+}
+
+/*
+ * Schedule func(dd, arg, tx) to run in syncing context, holding a
+ * 'space'-byte transaction reservation.  On ENOSPC/EDQUOT the
+ * reservation is retried against the root dir; if another sync task is
+ * already pending on dd, wait for it to sync and start over.  Blocks
+ * until the task's txg has synced and returns the task's error.
+ */
+int
+dsl_dir_sync_task(dsl_dir_t *dd,
+    int (*func)(dsl_dir_t *, void*, dmu_tx_t *), void *arg, uint64_t space)
+{
+	dmu_tx_t *tx;
+	dsl_pool_t *dp = dd->dd_pool;
+	int err = 0;
+	uint64_t txg;
+
+	dprintf_dd(dd, "func=%p space=%llu\n", func, space);
+
+again:
+	tx = dmu_tx_create_ds(dd);
+	dmu_tx_hold_space(tx, space);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err == ENOSPC || err == EDQUOT) {
+		dsl_dir_t *rds;
+		/*
+		 * They can get their space from either this dd, or the
+		 * root dd.
+		 */
+		for (rds = dd; rds->dd_parent; rds = rds->dd_parent)
+			continue;
+		dmu_tx_abort(tx);
+		tx = dmu_tx_create_ds(rds);
+		dmu_tx_hold_space(tx, space);
+		err = dmu_tx_assign(tx, TXG_WAIT);
+	}
+	if (err) {
+		dmu_tx_abort(tx);
+		return (err);
+	}
+
+	txg = dmu_tx_get_txg(tx);
+	mutex_enter(&dd->dd_lock);
+	if (dd->dd_sync_txg != 0) {
+		/* another sync task is pending; wait and retry from scratch */
+		mutex_exit(&dd->dd_lock);
+		dmu_tx_commit(tx);
+		txg_wait_synced(dp, 0);
+		goto again;
+	}
+
+	/* We're good to go */
+
+	dd->dd_sync_txg = txg;
+	dd->dd_sync_func = func;
+	dd->dd_sync_arg = arg;
+
+	mutex_exit(&dd->dd_lock);
+
+	/* make sure dsl_dir_sync() runs for this dd in 'txg' */
+	dsl_dir_dirty(dd, tx);
+	dmu_tx_commit(tx);
+
+	txg_wait_synced(dp, txg);
+
+	mutex_enter(&dd->dd_lock);
+	ASSERT(dd->dd_sync_txg == txg);
+	ASSERT(dd->dd_sync_func == NULL);
+	err = dd->dd_sync_err;
+	dd->dd_sync_txg = 0;
+	mutex_exit(&dd->dd_lock);
+
+	return (err);
+}
+
+/*
+ * Add dd to this txg's dirty-dir list.  The first add in a txg takes an
+ * extra hold on dd_dbuf, which dsl_dir_sync() releases.
+ */
+void
+dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = dd->dd_pool;
+
+	ASSERT(dd->dd_phys);
+
+	if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) {
+		/* up the hold count until we can be written out */
+		dmu_buf_add_ref(dd->dd_dbuf, dd);
+	}
+}
+
+/*
+ * Given a change of 'delta' bytes in this dir's usage (currently
+ * 'used'), return how much of that change the parent actually sees:
+ * usage below dd_reserved is already charged to the parent.
+ */
+static int64_t
+parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
+{
+	uint64_t reserved = dd->dd_phys->dd_reserved;
+	uint64_t before = MAX(used, reserved);
+	uint64_t after = MAX(used + delta, reserved);
+
+	return (after - before);
+}
+
+/*
+ * Syncing context: run any pending sync task for this txg, push the
+ * in-core used-bytes count into the phys block, clear this txg's write
+ * estimate, and drop the hold taken by dsl_dir_dirty().
+ */
+void
+dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+	if (dd->dd_sync_txg == tx->tx_txg && dd->dd_sync_func) {
+		dd->dd_sync_err = dd->dd_sync_func(dd, dd->dd_sync_arg, tx);
+		/* NULL func signals dsl_dir_sync_task() that we ran */
+		dd->dd_sync_func = NULL;
+	}
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+	mutex_enter(&dd->dd_lock);
+	ASSERT3U(dd->dd_tempreserved[tx->tx_txg&TXG_MASK], ==, 0);
+	dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
+	    dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
+	dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
+	dd->dd_phys->dd_used_bytes = dd->dd_used_bytes;
+	mutex_exit(&dd->dd_lock);
+
+	/* release the hold from dsl_dir_dirty */
+	dmu_buf_remove_ref(dd->dd_dbuf, dd);
+}
+
+/*
+ * Estimate this dir's in-flight usage: on-disk used bytes plus every
+ * txg's pending write estimate.  Caller must hold dd_lock.
+ */
+static uint64_t
+dsl_dir_estimated_space(dsl_dir_t *dd)
+{
+	int64_t space;
+	int i;
+
+	ASSERT(MUTEX_HELD(&dd->dd_lock));
+
+	space = dd->dd_used_bytes;
+	ASSERT(space >= 0);
+	/* i already ranges over [0, TXG_SIZE), so the mask is a no-op */
+	for (i = 0; i < TXG_SIZE; i++) {
+		space += dd->dd_space_towrite[i&TXG_MASK];
+		ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0);
+	}
+	return (space);
+}
+
+/*
+ * How much space would dd have available if ancestor had delta applied
+ * to it? If ondiskonly is set, we're only interested in what's
+ * on-disk, not estimated pending changes.  Recurses up the parent
+ * chain; the result is the minimum of each level's remaining quota and
+ * the parent's availability (plus any unused reservation).
+ */
+static uint64_t
+dsl_dir_space_available(dsl_dir_t *dd,
+    dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
+{
+	uint64_t parentspace, myspace, quota, used;
+
+	/*
+	 * If there are no restrictions otherwise, assume we have
+	 * unlimited space available.
+	 */
+	quota = UINT64_MAX;
+	parentspace = UINT64_MAX;
+
+	if (dd->dd_parent != NULL) {
+		parentspace = dsl_dir_space_available(dd->dd_parent,
+		    ancestor, delta, ondiskonly);
+	}
+
+	mutex_enter(&dd->dd_lock);
+	if (dd->dd_phys->dd_quota != 0)
+		quota = dd->dd_phys->dd_quota;
+	if (ondiskonly) {
+		used = dd->dd_used_bytes;
+	} else {
+		used = dsl_dir_estimated_space(dd);
+	}
+	if (dd == ancestor)
+		used += delta;
+
+	if (dd->dd_parent == NULL) {
+		/* at the root, the quota is capped by the pool's size */
+		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE);
+		quota = MIN(quota, poolsize);
+	}
+
+	if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) {
+		/*
+		 * We have some space reserved, in addition to what our
+		 * parent gave us.
+		 */
+		parentspace += dd->dd_phys->dd_reserved - used;
+	}
+
+	if (used > quota) {
+		/* over quota */
+		myspace = 0;
+#ifdef ZFS_DEBUG
+		{
+			/*
+			 * While it's OK to be a little over quota, if
+			 * we think we are using more space than there
+			 * is in the pool (which is already 6% more than
+			 * dsl_pool_adjustedsize()), something is very
+			 * wrong.
+			 */
+			uint64_t space = spa_get_space(dd->dd_pool->dp_spa);
+			ASSERT3U(used, <=, space);
+		}
+#endif
+	} else {
+		/*
+		 * the lesser of parent's space and the space
+		 * left in our quota
+		 */
+		myspace = MIN(parentspace, quota - used);
+	}
+
+	mutex_exit(&dd->dd_lock);
+
+	return (myspace);
+}
+
+/* one node of a temporary-reservation list (see dsl_dir_tempreserve_space) */
+struct tempreserve {
+	list_node_t tr_node;
+	dsl_dir_t *tr_ds;	/* dir charged; NULL means the ARC was charged */
+	uint64_t tr_size;	/* bytes reserved */
+};
+
+/*
+ * Check dd's quota and charge 'asize' bytes against its per-txg
+ * temporary reservation, then recurse upward so each ancestor is
+ * charged the delta it observes.  Every charge taken is appended to
+ * tr_list so the caller can undo the whole chain on failure.
+ */
+static int
+dsl_dir_tempreserve_impl(dsl_dir_t *dd,
+    uint64_t asize, boolean_t netfree, list_t *tr_list, dmu_tx_t *tx)
+{
+	uint64_t txg = tx->tx_txg;
+	uint64_t est_used, quota, parent_rsrv;
+	int edquot = EDQUOT;
+	int txgidx = txg & TXG_MASK;
+	int i;
+	struct tempreserve *tr;
+
+	ASSERT3U(txg, !=, 0);
+
+	mutex_enter(&dd->dd_lock);
+	/*
+	 * Check against the dsl_dir's quota. We don't add in the delta
+	 * when checking for over-quota because they get one free hit.
+	 */
+	est_used = dsl_dir_estimated_space(dd);
+	for (i = 0; i < TXG_SIZE; i++)
+		est_used += dd->dd_tempreserved[i];
+
+	quota = UINT64_MAX;
+
+	if (dd->dd_phys->dd_quota)
+		quota = dd->dd_phys->dd_quota;
+
+	/*
+	 * If this transaction will result in a net free of space, we want
+	 * to let it through, but we have to be careful: the space that it
+	 * frees won't become available until *after* this txg syncs.
+	 * Therefore, to ensure that it's possible to remove files from
+	 * a full pool without inducing transient overcommits, we throttle
+	 * netfree transactions against a quota that is slightly larger,
+	 * but still within the pool's allocation slop. In cases where
+	 * we're very close to full, this will allow a steady trickle of
+	 * removes to get through.
+	 */
+	if (dd->dd_parent == NULL) {
+		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
+		if (poolsize < quota) {
+			quota = poolsize;
+			/* pool, not user quota, is the limiting factor */
+			edquot = ENOSPC;
+		}
+	} else if (netfree) {
+		quota = UINT64_MAX;
+	}
+
+	/*
+	 * If they are requesting more space, and our current estimate
+	 * is over quota. They get to try again unless the actual
+	 * on-disk is over quota.
+	 */
+	if (asize > 0 && est_used > quota) {
+		if (dd->dd_used_bytes < quota)
+			edquot = ERESTART;	/* only the estimate is over */
+		dprintf_dd(dd, "failing: used=%lluK est_used = %lluK "
+		    "quota=%lluK tr=%lluK err=%d\n",
+		    dd->dd_used_bytes>>10, est_used>>10,
+		    quota>>10, asize>>10, edquot);
+		mutex_exit(&dd->dd_lock);
+		return (edquot);
+	}
+
+	/* We need to up our estimated delta before dropping dd_lock */
+	dd->dd_tempreserved[txgidx] += asize;
+
+	parent_rsrv = parent_delta(dd, est_used, asize);
+	mutex_exit(&dd->dd_lock);
+
+	/* record the charge so the caller can undo it on failure */
+	tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP);
+	tr->tr_ds = dd;
+	tr->tr_size = asize;
+	list_insert_tail(tr_list, tr);
+
+	/* see if it's OK with our parent */
+	if (dd->dd_parent && parent_rsrv) {
+		return (dsl_dir_tempreserve_impl(dd->dd_parent,
+		    parent_rsrv, netfree, tr_list, tx));
+	} else {
+		return (0);
+	}
+}
+
+/*
+ * Reserve space in this dsl_dir, to be used in this tx's txg.
+ * After the space has been dirtied (and thus
+ * dsl_dir_willuse_space() has been called), the reservation should
+ * be canceled, using dsl_dir_tempreserve_clear().
+ *
+ * On success *tr_cookiep receives the reservation list; it (and its
+ * memory) is released by dsl_dir_tempreserve_clear().  On failure any
+ * partial reservations are rolled back here.
+ */
+int
+dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize,
+    uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx)
+{
+	int err = 0;
+	list_t *tr_list;
+
+	tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
+	list_create(tr_list, sizeof (struct tempreserve),
+	    offsetof(struct tempreserve, tr_node));
+
+	/* fsize >= asize means the tx frees at least as much as it writes */
+	err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
+	    tr_list, tx);
+
+	if (err == 0) {
+		struct tempreserve *tr;
+
+		/* also reserve the logical size from the ARC */
+		err = arc_tempreserve_space(lsize);
+		if (err == 0) {
+			tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP);
+			tr->tr_ds = NULL;
+			tr->tr_size = lsize;
+			list_insert_tail(tr_list, tr);
+		}
+	}
+
+	if (err)
+		dsl_dir_tempreserve_clear(tr_list, tx);
+	else
+		*tr_cookiep = tr_list;
+	return (err);
+}
+
+/*
+ * Clear a temporary reservation that we previously made with
+ * dsl_dir_tempreserve_space().  Releases each entry back to its dir
+ * (or to the ARC, for the tr_ds == NULL entry) and frees the list.
+ */
+void
+dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
+{
+	int txgidx = tx->tx_txg & TXG_MASK;
+	list_t *tr_list = tr_cookie;
+	struct tempreserve *tr;
+
+	ASSERT3U(tx->tx_txg, !=, 0);
+
+	while (tr = list_head(tr_list)) {
+		if (tr->tr_ds == NULL) {
+			arc_tempreserve_clear(tr->tr_size);
+		} else {
+			mutex_enter(&tr->tr_ds->dd_lock);
+			ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
+			    tr->tr_size);
+			tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
+			mutex_exit(&tr->tr_ds->dd_lock);
+		}
+		list_remove(tr_list, tr);
+		kmem_free(tr, sizeof (struct tempreserve));
+	}
+
+	kmem_free(tr_list, sizeof (list_t));
+}
+
+/*
+ * Call in open context when we think we're going to write/free space,
+ * eg. when dirtying data. Be conservative (ie. OK to write less than
+ * this or free more than this, but don't write more or free less).
+ * 'space' may be negative for frees; only positive amounts are added
+ * to the per-txg write estimate.
+ */
+void
+dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
+{
+	int64_t parent_space;
+	uint64_t est_used;
+
+	mutex_enter(&dd->dd_lock);
+	if (space > 0)
+		dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
+
+	est_used = dsl_dir_estimated_space(dd);
+	parent_space = parent_delta(dd, est_used, space);
+	mutex_exit(&dd->dd_lock);
+
+	/* Make sure that we clean up dd_space_to* */
+	dsl_dir_dirty(dd, tx);
+
+	/* XXX this is potentially expensive and unnecessary... */
+	if (parent_space && dd->dd_parent)
+		dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
+}
+
+/*
+ * Call from syncing context when we actually write/free space for this
+ * dd.  Updates the in-core and on-disk usage counters and rolls the
+ * accounted delta (see parent_delta()) up into the parent chain.
+ */
+void
+dsl_dir_diduse_space(dsl_dir_t *dd,
+    int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
+{
+	int64_t accounted_delta;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	dsl_dir_dirty(dd, tx);
+
+	mutex_enter(&dd->dd_lock);
+	accounted_delta = parent_delta(dd, dd->dd_used_bytes, used);
+	ASSERT(used >= 0 || dd->dd_used_bytes >= -used);
+	ASSERT(compressed >= 0 ||
+	    dd->dd_phys->dd_compressed_bytes >= -compressed);
+	ASSERT(uncompressed >= 0 ||
+	    dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
+	dd->dd_used_bytes += used;
+	/* the write we estimated in willuse_space has now happened */
+	if (used > 0)
+		dd->dd_space_towrite[tx->tx_txg & TXG_MASK] -= used;
+	dd->dd_phys->dd_uncompressed_bytes += uncompressed;
+	dd->dd_phys->dd_compressed_bytes += compressed;
+	mutex_exit(&dd->dd_lock);
+
+	if (dd->dd_parent != NULL) {
+		dsl_dir_diduse_space(dd->dd_parent,
+		    accounted_delta, compressed, uncompressed, tx);
+	}
+}
+
+/*
+ * Sync task: set dd_quota (0 = no quota).  Rejects with ENOSPC any
+ * nonzero value below the reservation or the current estimated usage.
+ */
+static int
+dsl_dir_set_quota_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+	uint64_t *quotap = arg;
+	uint64_t new_quota = *quotap;
+	int err = 0;
+
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+	mutex_enter(&dd->dd_lock);
+	if (new_quota != 0 && (new_quota < dd->dd_phys->dd_reserved ||
+	    new_quota < dsl_dir_estimated_space(dd))) {
+		err = ENOSPC;
+	} else {
+		dd->dd_phys->dd_quota = new_quota;
+	}
+	mutex_exit(&dd->dd_lock);
+	return (err);
+}
+
+/*
+ * Open the dir named 'ddname' and apply dsl_dir_set_quota_sync() as a
+ * sync task.  Returns ENOENT if the dir doesn't exist.
+ */
+int
+dsl_dir_set_quota(const char *ddname, uint64_t quota)
+{
+	dsl_dir_t *dd;
+	int err;
+
+	dd = dsl_dir_open(ddname, FTAG, NULL);
+	if (dd == NULL)
+		return (ENOENT);
+	/*
+	 * If someone removes a file, then tries to set the quota, we
+	 * want to make sure the file freeing takes effect.
+	 */
+	txg_wait_open(dd->dd_pool, 0);
+
+	err = dsl_dir_sync_task(dd, dsl_dir_set_quota_sync, &quota, 0);
+	dsl_dir_close(dd, FTAG);
+	return (err);
+}
+
+/*
+ * Sync task: set dd_reserved.  Fails with EOVERFLOW for values over
+ * INT64_MAX, and with ENOSPC if the parent can't cover the increase or
+ * the new reservation exceeds our quota.  The accounting delta is
+ * rolled up into the ancestors via dsl_dir_diduse_space().
+ */
+static int
+dsl_dir_set_reservation_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+	uint64_t *reservationp = arg;
+	uint64_t new_reservation = *reservationp;
+	uint64_t used, avail;
+	int64_t delta;
+
+	if (new_reservation > INT64_MAX)
+		return (EOVERFLOW);
+
+	mutex_enter(&dd->dd_lock);
+	/* delta is the change in space charged to our parent */
+	used = dd->dd_used_bytes;
+	delta = MAX(used, new_reservation) -
+	    MAX(used, dd->dd_phys->dd_reserved);
+	mutex_exit(&dd->dd_lock);
+
+	if (dd->dd_parent) {
+		avail = dsl_dir_space_available(dd->dd_parent,
+		    NULL, 0, FALSE);
+	} else {
+		avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
+	}
+
+	if (delta > 0 && delta > avail)
+		return (ENOSPC);
+	if (delta > 0 && dd->dd_phys->dd_quota > 0 &&
+	    new_reservation > dd->dd_phys->dd_quota)
+		return (ENOSPC);
+
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+	dd->dd_phys->dd_reserved = new_reservation;
+
+	if (dd->dd_parent != NULL) {
+		/* Roll up this additional usage into our ancestors */
+		dsl_dir_diduse_space(dd->dd_parent, delta, 0, 0, tx);
+	}
+	return (0);
+}
+
+/*
+ * Open the dir named 'ddname' and apply dsl_dir_set_reservation_sync()
+ * as a sync task.  Returns ENOENT if the dir doesn't exist.
+ */
+int
+dsl_dir_set_reservation(const char *ddname, uint64_t reservation)
+{
+	dsl_dir_t *dd;
+	int err;
+
+	dd = dsl_dir_open(ddname, FTAG, NULL);
+	if (dd == NULL)
+		return (ENOENT);
+	err = dsl_dir_sync_task(dd,
+	    dsl_dir_set_reservation_sync, &reservation, 0);
+	dsl_dir_close(dd, FTAG);
+	return (err);
+}
+
+/*
+ * Return the nearest dir that is an ancestor of (or equal to) both
+ * ds1 and ds2, or NULL if the two share no ancestor.
+ */
+static dsl_dir_t *
+closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
+{
+	dsl_dir_t *a1, *a2;
+
+	/* test each ancestor of ds1, nearest first, against ds2's chain */
+	for (a1 = ds1; a1 != NULL; a1 = a1->dd_parent)
+		for (a2 = ds2; a2 != NULL; a2 = a2->dd_parent)
+			if (a1 == a2)
+				return (a2);
+
+	return (NULL);
+}
+
+/*
+ * If delta is applied to dd, how much of that delta would be applied to
+ * ancestor? Syncing context only.
+ *
+ * NOTE(review): assumes 'ancestor' is on dd's parent chain so the
+ * recursion terminates at dd == ancestor -- confirm callers guarantee
+ * this, otherwise dd->dd_parent would eventually go NULL.
+ */
+static int64_t
+would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
+{
+	if (dd == ancestor)
+		return (delta);
+
+	mutex_enter(&dd->dd_lock);
+	delta = parent_delta(dd, dd->dd_used_bytes, delta);
+	mutex_exit(&dd->dd_lock);
+	return (would_change(dd->dd_parent, delta, ancestor));
+}
+
+/*
+ * Sync task: rename/move dd to 'newname'.  The new parent must exist,
+ * the new leaf name must not, the move must stay within this pool and
+ * must not place dd under its own descendent, and the new parent chain
+ * must have room for dd's space.  Transfers the space accounting from
+ * the old parent chain to the new one.
+ */
+int
+dsl_dir_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+	const char *newname = arg;
+	dsl_pool_t *dp = dd->dd_pool;
+	objset_t *mos = dp->dp_meta_objset;
+	dsl_dir_t *newpds;
+	const char *tail;
+	int err, len;
+
+	/*
+	 * can't rename to different pool
+	 * (fix: the ')' was misplaced -- "strncmp(..., len != 0)" compared
+	 * only the first byte of the pool name)
+	 */
+	len = strlen(dp->dp_root_dir->dd_myname);
+	if (strncmp(dp->dp_root_dir->dd_myname, newname, len) != 0 ||
+	    newname[len] != '/') {
+		return (ENXIO);
+	}
+
+	newpds = dsl_dir_open_spa(dp->dp_spa, newname, FTAG, &tail);
+
+	/* new parent should exist */
+	if (newpds == NULL)
+		return (ENOENT);
+
+	/* new name should not already exist */
+	if (tail == NULL) {
+		dsl_dir_close(newpds, FTAG);
+		return (EEXIST);
+	}
+
+	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
+
+	/* There should be 2 references: the open and the dirty */
+	if (dmu_buf_refcount(dd->dd_dbuf) > 2) {
+		rw_exit(&dp->dp_config_rwlock);
+		dsl_dir_close(newpds, FTAG);
+		return (EBUSY);
+	}
+
+	if (newpds != dd->dd_parent) {
+		dsl_dir_t *ancestor;
+		int64_t adelta;
+		uint64_t myspace, avail;
+
+		ancestor = closest_common_ancestor(dd, newpds);
+
+		/* no rename into our descendent */
+		if (ancestor == dd) {
+			dsl_dir_close(newpds, FTAG);
+			rw_exit(&dp->dp_config_rwlock);
+			return (EINVAL);
+		}
+
+		/* can the new parent chain absorb dd's space? */
+		myspace = MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved);
+		adelta = would_change(dd->dd_parent, -myspace, ancestor);
+		avail = dsl_dir_space_available(newpds,
+		    ancestor, adelta, FALSE);
+		if (avail < myspace) {
+			dsl_dir_close(newpds, FTAG);
+			rw_exit(&dp->dp_config_rwlock);
+			return (ENOSPC);
+		}
+
+		/* The point of no (unsuccessful) return */
+
+		dsl_dir_diduse_space(dd->dd_parent, -myspace,
+		    -dd->dd_phys->dd_compressed_bytes,
+		    -dd->dd_phys->dd_uncompressed_bytes, tx);
+		dsl_dir_diduse_space(newpds, myspace,
+		    dd->dd_phys->dd_compressed_bytes,
+		    dd->dd_phys->dd_uncompressed_bytes, tx);
+	}
+
+	/* The point of no (unsuccessful) return */
+
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+	/* remove from old parent zapobj */
+	err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj,
+	    dd->dd_myname, tx);
+	ASSERT3U(err, ==, 0);
+
+	(void) strcpy(dd->dd_myname, tail);
+	dsl_dir_close(dd->dd_parent, dd);
+	dd->dd_phys->dd_parent_obj = newpds->dd_object;
+	dd->dd_parent = dsl_dir_open_obj(dd->dd_pool,
+	    newpds->dd_object, NULL, dd);
+
+	/* add to new parent zapobj */
+	err = zap_add(mos, newpds->dd_phys->dd_child_dir_zapobj,
+	    dd->dd_myname, 8, 1, &dd->dd_object, tx);
+	ASSERT3U(err, ==, 0);
+
+	dsl_dir_close(newpds, FTAG);
+	rw_exit(&dp->dp_config_rwlock);
+	return (0);
+}
diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c
new file mode 100644
index 0000000000..5b71ccfaa9
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c
@@ -0,0 +1,233 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+
+/* internal reserved dir name */
+#define MOS_DIR_NAME "$MOS"
+
+/* Look up and open the internal $MOS dir under the pool's root dir. */
+static dsl_dir_t *
+dsl_pool_open_mos_dir(dsl_pool_t *dp)
+{
+	uint64_t obj;
+	int err;
+
+	err = zap_lookup(dp->dp_meta_objset,
+	    dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
+	    MOS_DIR_NAME, sizeof (obj), 1, &obj);
+	ASSERT3U(err, ==, 0);
+
+	return (dsl_dir_open_obj(dp, obj, MOS_DIR_NAME, dp));
+}
+
+/*
+ * Allocate and minimally initialize a dsl_pool_t (root bp, txg
+ * machinery, and dirty/synced lists); common to open and create.
+ */
+static dsl_pool_t *
+dsl_pool_open_impl(spa_t *spa, uint64_t txg)
+{
+	dsl_pool_t *dp;
+	blkptr_t *bp = spa_get_rootblkptr(spa);
+
+	dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
+	dp->dp_spa = spa;
+	dp->dp_meta_rootbp = *bp;
+	txg_init(dp, txg);
+
+	txg_list_create(&dp->dp_dirty_datasets,
+	    offsetof(dsl_dataset_t, ds_dirty_link));
+	txg_list_create(&dp->dp_dirty_dirs,
+	    offsetof(dsl_dir_t, dd_dirty_link));
+	list_create(&dp->dp_synced_objsets, sizeof (dsl_dataset_t),
+	    offsetof(dsl_dataset_t, ds_synced_link));
+
+	return (dp);
+}
+
+/*
+ * Open an existing pool's DSL state: open the meta-objset from the
+ * root bp, then look up and open the root dir and $MOS dir.
+ */
+dsl_pool_t *
+dsl_pool_open(spa_t *spa, uint64_t txg)
+{
+	int err;
+	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
+
+	dp->dp_meta_objset =
+	    &dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp)->os;
+
+	rw_enter(&dp->dp_config_rwlock, RW_READER);
+	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
+	    &dp->dp_root_dir_obj);
+	ASSERT3U(err, ==, 0);
+
+	dp->dp_root_dir = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
+	    NULL, dp);
+	dp->dp_mos_dir = dsl_pool_open_mos_dir(dp);
+	rw_exit(&dp->dp_config_rwlock);
+
+	return (dp);
+}
+
+/*
+ * Tear down a dsl_pool_t opened by dsl_pool_open()/dsl_pool_create():
+ * drop dir holds, evict the meta-objset, destroy the lists, flush the
+ * ARC, and free the structure.
+ */
+void
+dsl_pool_close(dsl_pool_t *dp)
+{
+	/* drop our reference from dsl_pool_open() */
+	dsl_dir_close(dp->dp_mos_dir, dp);
+	dsl_dir_close(dp->dp_root_dir, dp);
+
+	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
+	dmu_objset_evict(NULL, dp->dp_meta_objset->os);
+
+	txg_list_destroy(&dp->dp_dirty_datasets);
+	txg_list_destroy(&dp->dp_dirty_dirs);
+	list_destroy(&dp->dp_synced_objsets);
+
+	arc_flush();
+	txg_fini(dp);
+	kmem_free(dp, sizeof (dsl_pool_t));
+}
+
+/*
+ * Create a brand-new pool's DSL state in txg: create the meta-objset,
+ * the pool directory, the root dir, and the $MOS dir, all within one
+ * assigned tx.
+ */
+dsl_pool_t *
+dsl_pool_create(spa_t *spa, uint64_t txg)
+{
+	int err;
+	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
+	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
+	dp->dp_meta_objset = &dmu_objset_create_impl(spa,
+	    NULL, DMU_OST_META, tx)->os;
+
+	/* create the pool directory */
+	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
+	ASSERT3U(err, ==, 0);
+
+	/* create and open the root dir */
+	dsl_dataset_create_root(dp, &dp->dp_root_dir_obj, tx);
+	dp->dp_root_dir = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
+	    NULL, dp);
+
+	/* create and open the meta-objset dir */
+	err = dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME,
+	    tx);
+	ASSERT3U(err, ==, 0);
+	dp->dp_mos_dir = dsl_pool_open_mos_dir(dp);
+
+	dmu_tx_commit(tx);
+
+	return (dp);
+}
+
+/*
+ * Sync all dirty datasets and dirs for 'txg', looping until no new
+ * dirty datasets appear; then, if the MOS itself is dirty, sync it and
+ * publish the new meta-objset root bp to the spa.
+ */
+void
+dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
+{
+	dmu_tx_t *tx;
+	objset_impl_t *mosi = dp->dp_meta_objset->os;
+
+	tx = dmu_tx_create_assigned(dp, txg);
+
+	do {
+		dsl_dir_t *dd;
+		dsl_dataset_t *ds;
+
+		while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
+			/* remember it for dsl_pool_zil_clean() */
+			if (!list_link_active(&ds->ds_synced_link))
+				list_insert_tail(&dp->dp_synced_objsets, ds);
+			dsl_dataset_sync(ds, tx);
+		}
+		while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
+			dsl_dir_sync(dd, tx);
+		/*
+		 * We need to loop since dsl_dir_sync() could create a
+		 * new (dirty) objset.
+		 * XXX - isn't this taken care of by the spa's sync to
+		 * convergence loop?
+		 */
+	} while (!txg_list_empty(&dp->dp_dirty_datasets, txg));
+
+	if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
+	    list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) {
+		dmu_objset_sync(mosi, tx);
+		dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
+		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+	}
+
+	dmu_tx_commit(tx);
+}
+
+/*
+ * Clean the ZIL of every objset synced this txg (the list is populated
+ * by dsl_pool_sync()); asserts each entry has its objset_impl attached.
+ */
+void
+dsl_pool_zil_clean(dsl_pool_t *dp)
+{
+	dsl_dataset_t *ds;
+
+	while (ds = list_head(&dp->dp_synced_objsets)) {
+		list_remove(&dp->dp_synced_objsets, ds);
+		ASSERT(ds->ds_user_ptr != NULL);
+		zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil);
+	}
+}
+
+/* Return nonzero if the current thread is operating in syncing context. */
+int
+dsl_pool_sync_context(dsl_pool_t *dp)
+{
+	/*
+	 * Yeah, this is cheesy. But the SPA needs some way to let
+	 * the sync threads invoke spa_open() and spa_close() while
+	 * it holds the namespace lock. I'm certainly open to better
+	 * ideas for how to determine whether the current thread is
+	 * operating on behalf of spa_sync(). This works for now.
+	 */
+	return (curthread == dp->dp_tx.tx_sync_thread ||
+	    BP_IS_HOLE(&dp->dp_meta_rootbp));
+}
+
+/* Pool size usable for allocation: total space minus a slop reserve. */
+uint64_t
+dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
+{
+	uint64_t total, slop;
+
+	/*
+	 * Reserve about 1% (1/128), or at least 16MB, for allocation
+	 * efficiency.
+	 * XXX The intent log is not accounted for, so it must fit
+	 * within this slop.
+	 *
+	 * If we're trying to assess whether it's OK to do a free,
+	 * cut the reservation in half to allow forward progress
+	 * (e.g. make it possible to rm(1) files from a full pool).
+	 */
+	total = spa_get_space(dp->dp_spa);
+	slop = MAX(total >> 7, SPA_MINDEVSIZE >> 2);
+	if (netfree)
+		slop >>= 1;
+
+	return (total - slop);
+}
diff --git a/usr/src/uts/common/fs/zfs/dsl_prop.c b/usr/src/uts/common/fs/zfs/dsl_prop.c
new file mode 100644
index 0000000000..bd54263507
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dsl_prop.c
@@ -0,0 +1,367 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/spa.h>
+#include <sys/zio_checksum.h> /* for the default checksum value */
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+
+#include "zfs_prop.h"
+
+static int
+dodefault(const char *propname, int intsz, int numint, void *buf)
+{
+ zfs_prop_t prop;
+
+ if ((prop = zfs_name_to_prop(propname)) == ZFS_PROP_INVAL ||
+ zfs_prop_readonly(prop))
+ return (ENOENT);
+
+ if (zfs_prop_get_type(prop) == prop_type_string) {
+ if (intsz != 1)
+ return (EOVERFLOW);
+ zfs_prop_default_string(prop, buf, numint);
+ } else {
+ if (intsz != 8 || numint < 1)
+ return (EOVERFLOW);
+
+ *(uint64_t *)buf = zfs_prop_default_numeric(prop);
+ }
+
+ return (0);
+}
+
/*
 * Look up the named property for the dsl_dir identified by ddobj,
 * walking up the parent chain toward the pool root until an explicit
 * value is found; fall back to the property's default otherwise.
 * If setpoint is non-NULL it receives the name of the dir where the
 * value was found ('\0' if the default was used).
 * Caller must hold dp_config_rwlock (reader or writer).
 */
static int
dsl_prop_get_impl(dsl_pool_t *dp, uint64_t ddobj, const char *propname,
    int intsz, int numint, void *buf, char *setpoint)
{
	int err = 0;
	objset_t *mos = dp->dp_meta_objset;

	if (setpoint)
		setpoint[0] = '\0';

	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock));

	/* Walk from ddobj up to the root dir (parent obj 0). */
	while (ddobj != 0) {
		dsl_dir_t *dd = dsl_dir_open_obj(dp, ddobj, NULL, FTAG);
		err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj,
		    propname, intsz, numint, buf);
		if (err != ENOENT) {
			/* Found here (or hit a real error): stop the walk. */
			if (setpoint)
				dsl_dir_name(dd, setpoint);
			dsl_dir_close(dd, FTAG);
			break;
		}
		ASSERT3U(err, ==, ENOENT);
		/* Read the parent obj before dropping our hold on dd. */
		ddobj = dd->dd_phys->dd_parent_obj;
		dsl_dir_close(dd, FTAG);
	}
	/* Nothing set anywhere in the chain: use the default value. */
	if (err == ENOENT)
		err = dodefault(propname, intsz, numint, buf);

	return (err);
}
+
/*
 * Register interest in the named property.  We'll call the callback
 * once to notify it of the current property value, and again each time
 * the property changes, until this callback is unregistered.
 *
 * Return 0 on success, errno if the prop is not an integer value.
 */
int
dsl_prop_register(dsl_dataset_t *ds, const char *propname,
    dsl_prop_changed_cb_t *callback, void *cbarg)
{
	dsl_dir_t *dd;
	uint64_t value;
	dsl_prop_cb_record_t *cbr;
	int err;

	dd = ds->ds_dir;

	/* The config lock keeps the value stable across lookup + notify. */
	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);

	err = dsl_prop_get_impl(dd->dd_pool, dd->dd_object, propname,
	    8, 1, &value, NULL);
	if (err == ENOENT) {
		/* Not set and no default: report "undefined", not an error. */
		err = 0;
		value = DSL_PROP_VALUE_UNDEFINED;
	}
	if (err != 0) {
		rw_exit(&dd->dd_pool->dp_config_rwlock);
		return (err);
	}

	/*
	 * Record the callback on the dir's list; dsl_prop_changed_notify()
	 * walks this list under dd_lock when the property changes.
	 */
	cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP);
	cbr->cbr_propname = kmem_alloc(strlen(propname)+1, KM_SLEEP);
	(void) strcpy((char *)cbr->cbr_propname, propname);
	cbr->cbr_func = callback;
	cbr->cbr_arg = cbarg;
	mutex_enter(&dd->dd_lock);
	list_insert_head(&dd->dd_prop_cbs, cbr);
	mutex_exit(&dd->dd_lock);

	/* Initial notification with the current (or undefined) value. */
	cbr->cbr_func(cbr->cbr_arg, value);

	/*
	 * Take an extra hold on the dir, tagged with cbr; it is dropped
	 * by dsl_prop_unregister() with the matching tag.
	 */
	(void) dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, cbr);
	rw_exit(&dd->dd_pool->dp_config_rwlock);
	/* Leave dataset open until this callback is unregistered */
	return (0);
}
+
+int
+dsl_prop_get_ds(dsl_dir_t *dd, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint)
+{
+ int err;
+
+ rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+ err = dsl_prop_get_impl(dd->dd_pool, dd->dd_object,
+ propname, intsz, numints, buf, setpoint);
+ rw_exit(&dd->dd_pool->dp_config_rwlock);
+
+ return (err);
+}
+
+int
+dsl_prop_get(const char *ddname, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint)
+{
+ dsl_dir_t *dd;
+ const char *tail;
+ int err;
+
+ dd = dsl_dir_open(ddname, FTAG, &tail);
+ if (dd == NULL)
+ return (ENOENT);
+ if (tail && tail[0] != '@') {
+ dsl_dir_close(dd, FTAG);
+ return (ENOENT);
+ }
+
+ err = dsl_prop_get_ds(dd, propname, intsz, numints, buf, setpoint);
+
+ dsl_dir_close(dd, FTAG);
+ return (err);
+}
+
/*
 * Return 0 on success, ENOENT if ddname is invalid, EOVERFLOW if
 * valuelen not big enough.
 */
int
dsl_prop_get_string(const char *ddname, const char *propname,
    char *value, int valuelen, char *setpoint)
{
	int err;

	/* String props are byte arrays: intsz 1, numints = buffer length. */
	err = dsl_prop_get(ddname, propname, 1, valuelen, value, setpoint);
	return (err);
}
+
+/*
+ * Get the current property value. It may have changed by the time this
+ * function returns, so it is NOT safe to follow up with
+ * dsl_prop_register() and assume that the value has not changed in
+ * between.
+ *
+ * Return 0 on success, ENOENT if ddname is invalid.
+ */
+int
+dsl_prop_get_integer(const char *ddname, const char *propname,
+ uint64_t *valuep, char *setpoint)
+{
+ return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint));
+}
+
+int
+dsl_prop_get_ds_integer(dsl_dir_t *dd, const char *propname,
+ uint64_t *valuep, char *setpoint)
+{
+ return (dsl_prop_get_ds(dd, propname, 8, 1, valuep, setpoint));
+}
+
+/*
+ * Unregister this callback. Return 0 on success, ENOENT if ddname is
+ * invalid, ENOMSG if no matching callback registered.
+ */
+int
+dsl_prop_unregister(dsl_dataset_t *ds, const char *propname,
+ dsl_prop_changed_cb_t *callback, void *cbarg)
+{
+ dsl_dir_t *dd;
+ dsl_prop_cb_record_t *cbr;
+
+ dd = ds->ds_dir;
+
+ mutex_enter(&dd->dd_lock);
+ for (cbr = list_head(&dd->dd_prop_cbs);
+ cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
+ if (strcmp(cbr->cbr_propname, propname) == 0 &&
+ cbr->cbr_func == callback &&
+ cbr->cbr_arg == cbarg)
+ break;
+ }
+
+ if (cbr == NULL) {
+ mutex_exit(&dd->dd_lock);
+ return (ENOMSG);
+ }
+
+ list_remove(&dd->dd_prop_cbs, cbr);
+ mutex_exit(&dd->dd_lock);
+ kmem_free((void*)cbr->cbr_propname, strlen(cbr->cbr_propname)+1);
+ kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
+
+ /* Clean up from dsl_prop_register */
+ dsl_dir_close(dd, cbr);
+ return (0);
+}
+
/*
 * Notify all callbacks registered at or below ddobj that the named
 * property now has the given (inherited) value.  Recurses into child
 * dirs; the recursion stops at any dir that sets the property locally,
 * since its subtree does not inherit the new value.
 * Caller must hold dp_config_rwlock as writer.
 */
static void
dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
    const char *propname, uint64_t value, int first)
{
	dsl_dir_t *dd;
	dsl_prop_cb_record_t *cbr;
	objset_t *mos = dp->dp_meta_objset;
	int err;

	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
	dd = dsl_dir_open_obj(dp, ddobj, NULL, FTAG);

	if (!first) {
		/*
		 * If the prop is set here, then this change is not
		 * being inherited here or below; stop the recursion.
		 */
		err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname,
		    8, 1, &value);
		if (err == 0) {
			dsl_dir_close(dd, FTAG);
			return;
		}
		ASSERT3U(err, ==, ENOENT);
	}

	/* Invoke every matching callback; note: called with dd_lock held. */
	mutex_enter(&dd->dd_lock);
	for (cbr = list_head(&dd->dd_prop_cbs);
	    cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
		if (strcmp(cbr->cbr_propname, propname) == 0) {
			cbr->cbr_func(cbr->cbr_arg, value);
		}
	}
	mutex_exit(&dd->dd_lock);

	/* Recurse into each child dir listed in the child ZAP. */
	if (dd->dd_phys->dd_child_dir_zapobj) {
		zap_cursor_t zc;
		zap_attribute_t za;

		for (zap_cursor_init(&zc, mos,
		    dd->dd_phys->dd_child_dir_zapobj);
		    zap_cursor_retrieve(&zc, &za) == 0;
		    zap_cursor_advance(&zc)) {
			/* XXX recursion could blow stack; esp. za! */
			dsl_prop_changed_notify(dp, za.za_first_integer,
			    propname, value, FALSE);
		}
	}
	dsl_dir_close(dd, FTAG);
}
+
/*
 * Argument bundle passed from dsl_prop_set() to dsl_prop_set_sync()
 * through dsl_dir_sync_task().
 */
struct prop_set_arg {
	const char *name;	/* property name */
	int intsz;		/* size of each value element, in bytes */
	int numints;		/* element count; 0 means remove the prop */
	const void *buf;	/* new value (caller-owned, read-only) */
};
+
/*
 * Sync-task half of dsl_prop_set(): update (or, when numints == 0,
 * remove) the property in the dir's props ZAP, then notify registered
 * callbacks of the new effective value for integer properties.
 * Runs in syncing context via dsl_dir_sync_task().
 */
static int
dsl_prop_set_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
{
	struct prop_set_arg *psa = arg;
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	uint64_t zapobj = dd->dd_phys->dd_props_zapobj;
	uint64_t intval;
	int err, isint;

	/* Writer lock: we may walk ancestors and notify the subtree. */
	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_WRITER);

	/* A prop with a numeric default is an integer property. */
	isint = (dodefault(psa->name, 8, 1, &intval) == 0);

	if (psa->numints == 0) {
		/* Removal: fall back to the inherited/default value. */
		err = zap_remove(mos, zapobj, psa->name, tx);
		if (err == ENOENT) /* that's fine. */
			err = 0;
		if (err == 0 && isint) {
			/* Recompute the now-effective value from ancestors. */
			err = dsl_prop_get_impl(dd->dd_pool,
			    dd->dd_phys->dd_parent_obj, psa->name,
			    8, 1, &intval, NULL);
		}
	} else {
		err = zap_update(mos, zapobj, psa->name,
		    psa->intsz, psa->numints, psa->buf, tx);
		if (isint)
			intval = *(uint64_t *)psa->buf;
	}

	/* Tell every callback at or below this dir about the new value. */
	if (err == 0 && isint) {
		dsl_prop_changed_notify(dd->dd_pool,
		    dd->dd_object, psa->name, intval, TRUE);
	}
	rw_exit(&dd->dd_pool->dp_config_rwlock);

	return (err);
}
+
+int
+dsl_prop_set(const char *ddname, const char *propname,
+ int intsz, int numints, const void *buf)
+{
+ dsl_dir_t *dd;
+ int err;
+ struct prop_set_arg psa;
+
+ dd = dsl_dir_open(ddname, FTAG, NULL);
+ if (dd == NULL)
+ return (ENOENT);
+
+ psa.name = propname;
+ psa.intsz = intsz;
+ psa.numints = numints;
+ psa.buf = buf;
+ err = dsl_dir_sync_task(dd, dsl_prop_set_sync, &psa, 0);
+
+ dsl_dir_close(dd, FTAG);
+
+ return (err);
+}
diff --git a/usr/src/uts/common/fs/zfs/fletcher.c b/usr/src/uts/common/fs/zfs/fletcher.c
new file mode 100644
index 0000000000..03186d1387
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/fletcher.c
@@ -0,0 +1,100 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/byteorder.h>
+#include <sys/spa.h>
+
+void
+fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = ip + (size / sizeof (uint64_t));
+ uint64_t a0, b0, a1, b1;
+
+ for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
+ a0 += ip[0];
+ a1 += ip[1];
+ b0 += a0;
+ b1 += a1;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+}
+
+void
+fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = ip + (size / sizeof (uint64_t));
+ uint64_t a0, b0, a1, b1;
+
+ for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
+ a0 += BSWAP_64(ip[0]);
+ a1 += BSWAP_64(ip[1]);
+ b0 += a0;
+ b1 += a1;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+}
+
+void
+fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+
+ for (a = b = c = d = 0; ip < ipend; ip++) {
+ a += ip[0];
+ b += a;
+ c += b;
+ d += c;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+}
+
+void
+fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+
+ for (a = b = c = d = 0; ip < ipend; ip++) {
+ a += BSWAP_32(ip[0]);
+ b += a;
+ c += b;
+ d += c;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+}
diff --git a/usr/src/uts/common/fs/zfs/lzjb.c b/usr/src/uts/common/fs/zfs/lzjb.c
new file mode 100644
index 0000000000..5979a55ef7
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/lzjb.c
@@ -0,0 +1,125 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * This is stolen from common/os/compress.c and will be removed once
+ * our changes have made it into the on10 source base.
+ *
+ * In particular, we are adding the "feature" that compress() can
+ * take a destination buffer size and return -1 if the data will not
+ * compress to d_len or less.
+ */
+
+#include <sys/types.h>
+
+#define MATCH_BITS 6
+#define MATCH_MIN 3
+#define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1))
+#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1)
+#define LEMPEL_SIZE 256
+
/*
 * LZJB-compress s_len bytes from s_start into d_start.  Returns the
 * compressed length, or s_len if the data would not fit in d_len bytes
 * (in which case the destination may be a verbatim copy when
 * d_len == s_len, or is abandoned otherwise -- caller must check the
 * return value against s_len).
 */
size_t
lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len)
{
	uchar_t *src = s_start;
	uchar_t *dst = d_start;
	uchar_t *cpy, *copymap;
	int copymask = 1 << (NBBY - 1);	/* forces a new copymap byte first */
	int mlen, offset;
	uint16_t *hp;
	uint16_t lempel[LEMPEL_SIZE];	/* uninitialized; see above */

	while (src < (uchar_t *)s_start + s_len) {
		/* Every NBBY items, emit a fresh copy-map byte. */
		if ((copymask <<= 1) == (1 << NBBY)) {
			/*
			 * Worst case for the next NBBY items is 2 bytes
			 * each plus the map byte; bail if they may not fit.
			 */
			if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) {
				if (d_len != s_len)
					return (s_len);
				/* Same-size buffers: store verbatim. */
				mlen = s_len;
				for (src = s_start, dst = d_start; mlen; mlen--)
					*dst++ = *src++;
				return (s_len);
			}
			copymask = 1;
			copymap = dst;
			*dst++ = 0;
		}
		/* Too close to the end to form a match: copy literally. */
		if (src > (uchar_t *)s_start + s_len - MATCH_MAX) {
			*dst++ = *src++;
			continue;
		}
		/* Hash the next 3 bytes into the Lempel history table. */
		hp = &lempel[((src[0] + 13) ^ (src[1] - 13) ^ src[2]) &
		    (LEMPEL_SIZE - 1)];
		offset = (intptr_t)(src - *hp) & OFFSET_MASK;
		*hp = (uint16_t)(uintptr_t)src;
		cpy = src - offset;
		/* Valid candidate match must really share the 3 bytes. */
		if (cpy >= (uchar_t *)s_start && cpy != src &&
		    src[0] == cpy[0] && src[1] == cpy[1] && src[2] == cpy[2]) {
			*copymap |= copymask;
			/* Extend the match up to MATCH_MAX bytes. */
			for (mlen = MATCH_MIN; mlen < MATCH_MAX; mlen++)
				if (src[mlen] != cpy[mlen])
					break;
			/* Encode (length, offset) in two bytes. */
			*dst++ = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) |
			    (offset >> NBBY);
			*dst++ = (uchar_t)offset;
			src += mlen;
		} else {
			*dst++ = *src++;
		}
	}
	return (dst - (uchar_t *)d_start);
}
+
/*
 * Decompress an LZJB stream from s_start into d_start, stopping when
 * d_len output bytes have been produced.  Returns 0 on success, -1 if
 * a back-reference points before the start of the output buffer
 * (corrupt input).  s_len is unused: the output length bounds the work.
 */
/*ARGSUSED*/
int
lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len)
{
	uchar_t *src = s_start;
	uchar_t *dst = d_start;
	uchar_t *d_end = (uchar_t *)d_start + d_len;
	uchar_t *cpy, copymap;
	int copymask = 1 << (NBBY - 1);	/* forces a copymap read first */

	while (dst < d_end) {
		/* Every NBBY items, read the next copy-map byte. */
		if ((copymask <<= 1) == (1 << NBBY)) {
			copymask = 1;
			copymap = *src++;
		}
		if (copymap & copymask) {
			/* Two-byte (length, offset) back-reference. */
			int mlen = (src[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN;
			int offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK;
			src += 2;
			if ((cpy = dst - offset) < (uchar_t *)d_start)
				return (-1);
			/* Byte-at-a-time copy: source may overlap dest. */
			while (--mlen >= 0 && dst < d_end)
				*dst++ = *cpy++;
		} else {
			/* Literal byte. */
			*dst++ = *src++;
		}
	}
	return (0);
}
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
new file mode 100644
index 0000000000..9d682e4990
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -0,0 +1,796 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/space_map.h>
+#include <sys/metaslab_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+
+/*
+ * ==========================================================================
+ * Metaslab classes
+ * ==========================================================================
+ */
+metaslab_class_t *
+metaslab_class_create(void)
+{
+ metaslab_class_t *mc;
+
+ mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
+
+ mc->mc_rotor = NULL;
+
+ return (mc);
+}
+
+void
+metaslab_class_destroy(metaslab_class_t *mc)
+{
+ metaslab_group_t *mg;
+
+ while ((mg = mc->mc_rotor) != NULL) {
+ metaslab_class_remove(mc, mg);
+ metaslab_group_destroy(mg);
+ }
+
+ kmem_free(mc, sizeof (metaslab_class_t));
+}
+
+void
+metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg)
+{
+ metaslab_group_t *mgprev, *mgnext;
+
+ ASSERT(mg->mg_class == NULL);
+
+ if ((mgprev = mc->mc_rotor) == NULL) {
+ mg->mg_prev = mg;
+ mg->mg_next = mg;
+ } else {
+ mgnext = mgprev->mg_next;
+ mg->mg_prev = mgprev;
+ mg->mg_next = mgnext;
+ mgprev->mg_next = mg;
+ mgnext->mg_prev = mg;
+ }
+ mc->mc_rotor = mg;
+ mg->mg_class = mc;
+}
+
+void
+metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
+{
+ metaslab_group_t *mgprev, *mgnext;
+
+ ASSERT(mg->mg_class == mc);
+
+ mgprev = mg->mg_prev;
+ mgnext = mg->mg_next;
+
+ if (mg == mgnext) {
+ mc->mc_rotor = NULL;
+ } else {
+ mc->mc_rotor = mgnext;
+ mgprev->mg_next = mgnext;
+ mgnext->mg_prev = mgprev;
+ }
+
+ mg->mg_prev = NULL;
+ mg->mg_next = NULL;
+ mg->mg_class = NULL;
+}
+
+/*
+ * ==========================================================================
+ * Metaslab groups
+ * ==========================================================================
+ */
+static int
+metaslab_compare(const void *x1, const void *x2)
+{
+ const metaslab_t *m1 = x1;
+ const metaslab_t *m2 = x2;
+
+ if (m1->ms_weight < m2->ms_weight)
+ return (1);
+ if (m1->ms_weight > m2->ms_weight)
+ return (-1);
+
+ /*
+ * If the weights are identical, use the offset to force uniqueness.
+ */
+ if (m1->ms_map.sm_start < m2->ms_map.sm_start)
+ return (-1);
+ if (m1->ms_map.sm_start > m2->ms_map.sm_start)
+ return (1);
+
+ ASSERT3P(m1, ==, m2);
+
+ return (0);
+}
+
+metaslab_group_t *
+metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
+{
+ metaslab_group_t *mg;
+
+ mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
+ mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&mg->mg_metaslab_tree, metaslab_compare,
+ sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
+ mg->mg_aliquot = 2ULL << 20; /* XXX -- tweak me */
+ mg->mg_vd = vd;
+ metaslab_class_add(mc, mg);
+
+ return (mg);
+}
+
+void
+metaslab_group_destroy(metaslab_group_t *mg)
+{
+ avl_destroy(&mg->mg_metaslab_tree);
+ mutex_destroy(&mg->mg_lock);
+ kmem_free(mg, sizeof (metaslab_group_t));
+}
+
/*
 * Insert a metaslab into the group's weight-sorted tree with the given
 * initial weight.  The metaslab must not already belong to a group.
 */
void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == NULL);
	msp->ms_group = mg;
	msp->ms_weight = weight;	/* avl_add() sorts on this key */
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}
+
/*
 * Remove a metaslab from its group's tree and clear its back-pointer.
 */
void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}
+
/*
 * Re-key a metaslab's sort position: since ms_weight is the AVL sort
 * key, it must be removed, updated, and re-inserted under mg_lock.
 */
void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}
+
+/*
+ * ==========================================================================
+ * Metaslabs
+ * ==========================================================================
+ */
/*
 * Create a metaslab covering [start, start + size) on mg's vdev, backed
 * by the on-disk space map object smo, and return it through mspp.
 * txg == 0 means the pool is being opened; txg == TXG_INITIAL means it
 * is being created; any later txg means space is being added to a live
 * pool and must not be used until that txg syncs.
 */
void
metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, metaslab_t **mspp,
    uint64_t start, uint64_t size, uint64_t txg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_t *msp;
	int fm;

	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);

	msp->ms_smo = smo;

	/* In-core map of free space, plus per-txg alloc/free deltas. */
	space_map_create(&msp->ms_map, start, size, vd->vdev_ashift,
	    &msp->ms_lock);

	for (fm = 0; fm < TXG_SIZE; fm++) {
		space_map_create(&msp->ms_allocmap[fm], start, size,
		    vd->vdev_ashift, &msp->ms_lock);
		space_map_create(&msp->ms_freemap[fm], start, size,
		    vd->vdev_ashift, &msp->ms_lock);
	}

	/*
	 * If we're opening an existing pool (txg == 0) or creating
	 * a new one (txg == TXG_INITIAL), all space is available now.
	 * If we're adding space to an existing pool, the new space
	 * does not become available until after this txg has synced.
	 * We enforce this by assigning an initial weight of 0 to new space.
	 *
	 * (Transactional allocations for this txg would actually be OK;
	 * it's intent log allocations that cause trouble.  If we wrote
	 * a log block in this txg and lost power, the log replay would be
	 * based on the DVA translations that had been synced in txg - 1.
	 * Those translations would not include this metaslab's vdev.)
	 */
	metaslab_group_add(mg, msp, txg > TXG_INITIAL ? 0 : size);

	if (txg == 0) {
		/*
		 * We're opening the pool.  Make the metaslab's
		 * free space available immediately.
		 */
		vdev_space_update(vd, size, smo->smo_alloc);
		metaslab_sync_done(msp, 0);
	} else {
		/*
		 * We're adding a new metaslab to an already-open pool.
		 * Declare all of the metaslab's space to be free.
		 *
		 * Note that older transaction groups cannot allocate
		 * from this metaslab until its existence is committed,
		 * because we set ms_last_alloc to the current txg.
		 */
		smo->smo_alloc = 0;
		msp->ms_usable_space = size;
		mutex_enter(&msp->ms_lock);
		space_map_add(&msp->ms_map, start, size);
		msp->ms_map_incore = 1;
		mutex_exit(&msp->ms_lock);

		/* XXX -- we'll need a call to picker_init here */
		msp->ms_dirty[txg & TXG_MASK] |= MSD_ADD;
		msp->ms_last_alloc = txg;
		vdev_dirty(vd, VDD_ADD, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, txg);
	}

	*mspp = msp;
}
+
/*
 * Tear down a metaslab: give back its accounted space, detach it from
 * its group, and destroy all of its space maps.
 */
void
metaslab_fini(metaslab_t *msp)
{
	int fm;
	metaslab_group_t *mg = msp->ms_group;

	/* Subtract this metaslab's space from the vdev's totals. */
	vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
	    -msp->ms_smo->smo_alloc);

	metaslab_group_remove(mg, msp);

	/* XXX -- we'll need a call to picker_fini here */

	mutex_enter(&msp->ms_lock);

	/* Empty the in-core map before destroying it. */
	space_map_vacate(&msp->ms_map, NULL, NULL);
	msp->ms_map_incore = 0;
	space_map_destroy(&msp->ms_map);

	for (fm = 0; fm < TXG_SIZE; fm++) {
		space_map_destroy(&msp->ms_allocmap[fm]);
		space_map_destroy(&msp->ms_freemap[fm]);
	}

	mutex_exit(&msp->ms_lock);

	kmem_free(msp, sizeof (metaslab_t));
}
+
/*
 * Write a metaslab to disk in the context of the specified transaction group.
 * Pushes this txg's allocation and free records into the on-disk space
 * map (condensing to a full rewrite when profitable), updates the space
 * accounting, and re-queues the metaslab for metaslab_sync_done().
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *os = spa->spa_meta_objset;
	/* This txg's allocation and free deltas. */
	space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
	space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
	/* Frees become reusable only after this txg commits. */
	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_obj_t *smo = msp->ms_smo;
	uint8_t *dirty = &msp->ms_dirty[txg & TXG_MASK];
	uint64_t alloc_delta;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	dprintf("%s offset %llx\n", vdev_description(vd), msp->ms_map.sm_start);

	mutex_enter(&msp->ms_lock);

	if (*dirty & MSD_ADD)
		vdev_space_update(vd, msp->ms_map.sm_size, 0);

	if (*dirty & (MSD_ALLOC | MSD_FREE)) {
		/* Already in syncing context: use an assigned tx. */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

		/* First sync ever: create the on-disk space map object. */
		if (smo->smo_object == 0) {
			ASSERT(smo->smo_objsize == 0);
			ASSERT(smo->smo_alloc == 0);
			smo->smo_object = dmu_object_alloc(os,
			    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
			    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
			ASSERT(smo->smo_object != 0);
			dmu_write(os, vd->vdev_ms_array, sizeof (uint64_t) *
			    (msp->ms_map.sm_start >> vd->vdev_ms_shift),
			    sizeof (uint64_t), &smo->smo_object, tx);
		}

		alloc_delta = allocmap->sm_space - freemap->sm_space;
		vdev_space_update(vd, 0, alloc_delta);
		smo->smo_alloc += alloc_delta;

		/*
		 * Condense: if the in-core map is authoritative (fully
		 * allocated this txg from an in-core map) rewrite the
		 * space map from scratch instead of appending deltas.
		 */
		if (msp->ms_last_alloc == txg && msp->ms_map.sm_space == 0 &&
		    (*dirty & MSD_CONDENSE) == 0) {
			space_map_t *sm = &msp->ms_map;
			space_map_t *tsm;
			int i;

			ASSERT(msp->ms_map_incore);

			space_map_merge(freemap, freed_map);
			space_map_vacate(allocmap, NULL, NULL);

			/*
			 * Write out the current state of the allocation
			 * world.  The current metaslab is full, minus
			 * stuff that's been freed this txg (freed_map),
			 * minus allocations from txgs in the future.
			 */
			space_map_add(sm, sm->sm_start, sm->sm_size);
			for (i = 1; i < TXG_CONCURRENT_STATES; i++) {
				tsm = &msp->ms_allocmap[(txg + i) & TXG_MASK];
				space_map_iterate(tsm, space_map_remove, sm);
			}
			space_map_iterate(freed_map, space_map_remove, sm);

			space_map_write(sm, smo, os, tx);

			ASSERT(sm->sm_space == 0);
			ASSERT(freemap->sm_space == 0);
			ASSERT(allocmap->sm_space == 0);

			*dirty |= MSD_CONDENSE;
		} else {
			/* Normal path: append this txg's deltas. */
			space_map_sync(allocmap, NULL, smo, SM_ALLOC, os, tx);
			space_map_sync(freemap, freed_map, smo, SM_FREE,
			    os, tx);
		}

		/* Persist the updated space_map_obj header in the bonus. */
		db = dmu_bonus_hold(os, smo->smo_object);
		dmu_buf_will_dirty(db, tx);
		ASSERT3U(db->db_size, ==, sizeof (*smo));
		bcopy(smo, db->db_data, db->db_size);
		dmu_buf_rele(db);

		dmu_tx_commit(tx);
	}

	*dirty &= ~(MSD_ALLOC | MSD_FREE | MSD_ADD);

	mutex_exit(&msp->ms_lock);

	/* Revisit in metaslab_sync_done() once this txg commits. */
	(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
}
+
/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.  Recomputes usable
 * space, folds this txg's freed blocks back into the in-core map,
 * and re-sorts the metaslab by its new weight.
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
	uint64_t weight;
	uint8_t *dirty = &msp->ms_dirty[txg & TXG_MASK];
	space_map_obj_t *smo = msp->ms_smo;

	dprintf("%s offset %llx txg %llu\n",
	    vdev_description(msp->ms_group->mg_vd), msp->ms_map.sm_start, txg);

	mutex_enter(&msp->ms_lock);

	/* metaslab_sync() must have cleared these for this txg. */
	ASSERT3U((*dirty & (MSD_ALLOC | MSD_FREE | MSD_ADD)), ==, 0);

	msp->ms_usable_space = msp->ms_map.sm_size - smo->smo_alloc;
	msp->ms_usable_end = smo->smo_objsize;

	weight = msp->ms_usable_space;

	if (txg != 0) {
		space_map_t *freed_map =
		    &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];

		/* XXX -- we'll need a call to picker_fini here */

		/* If we're empty, don't bother sticking around */
		if (msp->ms_usable_space == 0) {
			space_map_vacate(&msp->ms_map, NULL, NULL);
			msp->ms_map_incore = 0;
			ASSERT3U(freed_map->sm_space, ==, 0);
			weight = 0;
		} else {
			/* Add the freed blocks to the available space map */
			if (msp->ms_map_incore)
				space_map_merge(freed_map, &msp->ms_map);
			else
				space_map_vacate(freed_map, NULL, NULL);
			/* Boost weight: in-core maps are cheaper to use. */
			weight += msp->ms_map.sm_size;
		}

		if (msp->ms_last_alloc == txg)
			/* Safe to use for allocation now */
			msp->ms_last_alloc = 0;

		*dirty = 0;
	}

	mutex_exit(&msp->ms_lock);

	metaslab_group_sort(msp->ms_group, msp, weight);
}
+
+/*
+ * The first-fit block picker. No picker_init or picker_fini,
+ * this is just an experiment to see how it feels to separate out
+ * the block selection policy from the map updates.
+ * Note: the 'cursor' argument is a form of PPD.
+ */
+static uint64_t
+metaslab_pick_block(space_map_t *sm, uint64_t size, uint64_t *cursor)
+{
+ avl_tree_t *t = &sm->sm_root;
+ uint64_t align = size & -size;
+ space_seg_t *ss, ssearch;
+ avl_index_t where;
+ int tried_once = 0;
+
+again:
+ ssearch.ss_start = *cursor;
+ ssearch.ss_end = *cursor + size;
+
+ ss = avl_find(t, &ssearch, &where);
+ if (ss == NULL)
+ ss = avl_nearest(t, where, AVL_AFTER);
+
+ while (ss != NULL) {
+ uint64_t offset = P2ROUNDUP(ss->ss_start, align);
+
+ if (offset + size <= ss->ss_end) {
+ *cursor = offset + size;
+ return (offset);
+ }
+ ss = AVL_NEXT(t, ss);
+ }
+
+ /* If we couldn't find a block after cursor, search again */
+ if (tried_once == 0) {
+ tried_once = 1;
+ *cursor = 0;
+ goto again;
+ }
+
+ return (-1ULL);
+}
+
/*
 * Allocate one block of the given size from this metaslab's in-core
 * free map, recording the allocation in this txg's allocmap.
 * Returns the chosen offset, or -1ULL if nothing fits.
 * Caller must hold ms_lock with the map loaded in core.
 */
static uint64_t
metaslab_getblock(metaslab_t *msp, uint64_t size, uint64_t txg)
{
	space_map_t *sm = &msp->ms_map;
	vdev_t *vd = msp->ms_group->mg_vd;
	uint64_t offset;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(msp->ms_map_incore);
	ASSERT(sm->sm_space != 0);
	ASSERT(P2PHASE(size, 1ULL << vd->vdev_ashift) == 0);

	/* One first-fit cursor per power-of-two size class. */
	offset = metaslab_pick_block(sm, size,
	    &msp->ms_map_cursor[highbit(size & -size) - vd->vdev_ashift - 1]);
	if (offset != -1ULL) {
		/* Move the range from the free map to this txg's allocmap. */
		space_map_remove(sm, offset, size);
		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
	}
	return (offset);
}
+
/*
 * Intent log support: upon opening the pool after a crash, notify the SPA
 * of blocks that the intent log has allocated for immediate write, but
 * which are still considered free by the SPA because the last transaction
 * group didn't commit yet.
 */
int
metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	objset_t *os = spa->spa_meta_objset;
	vdev_t *vd;
	metaslab_t *msp;
	space_map_t *sm;
	space_map_obj_t *smo;
	int error;

	/* Validate the DVA before dereferencing anything. */
	if ((vd = vdev_lookup_top(spa, vdev)) == NULL)
		return (ENXIO);

	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
		return (ENXIO);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
	sm = &msp->ms_map;
	smo = msp->ms_smo;

	/* Gang headers occupy a fixed allocated size on the vdev. */
	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	/* Fault the free map into core before claiming from it. */
	if (msp->ms_map_incore == 0) {
		error = space_map_load(sm, smo, SM_FREE, os,
		    msp->ms_usable_end, sm->sm_size - msp->ms_usable_space);
		ASSERT(error == 0);
		if (error) {
			mutex_exit(&msp->ms_lock);
			return (error);
		}
		msp->ms_map_incore = 1;
		/* XXX -- we'll need a call to picker_init here */
		bzero(msp->ms_map_cursor, sizeof (msp->ms_map_cursor));
	}

	/* Mark the claimed range allocated in this txg. */
	space_map_remove(sm, offset, size);
	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);

	/* First allocation this txg: dirty the metaslab for sync. */
	if ((msp->ms_dirty[txg & TXG_MASK] & MSD_ALLOC) == 0) {
		msp->ms_dirty[txg & TXG_MASK] |= MSD_ALLOC;
		msp->ms_last_alloc = txg;
		vdev_dirty(vd, VDD_ALLOC, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, txg);
	}

	mutex_exit(&msp->ms_lock);

	return (0);
}
+
+static int
+metaslab_usable(metaslab_t *msp, uint64_t size, uint64_t txg)
+{
+ /*
+ * Enforce segregation across transaction groups.
+ */
+ /* XXX -- We should probably not assume we know what ms_weight means */
+ if (msp->ms_last_alloc == txg)
+ return (msp->ms_map.sm_space >= size && msp->ms_weight >= size);
+
+ if (msp->ms_last_alloc != 0)
+ return (0);
+
+ if (msp->ms_map.sm_space >= size && msp->ms_weight >= size)
+ return (1);
+
+ /* XXX -- the weight test should be in terms of MINFREE */
+ return (msp->ms_usable_space >= size && msp->ms_weight >= size);
+}
+
+static metaslab_t *
+metaslab_pick(metaslab_group_t *mg, uint64_t size, uint64_t txg)
+{
+ metaslab_t *msp;
+ avl_tree_t *t = &mg->mg_metaslab_tree;
+
+ mutex_enter(&mg->mg_lock);
+ for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp))
+ if (metaslab_usable(msp, size, txg))
+ break;
+ mutex_exit(&mg->mg_lock);
+
+ return (msp);
+}
+
+static metaslab_t *
+metaslab_group_alloc(spa_t *spa, metaslab_group_t *mg, uint64_t size,
+ uint64_t *offp, uint64_t txg)
+{
+ metaslab_t *msp;
+ int error;
+
+ while ((msp = metaslab_pick(mg, size, txg)) != NULL) {
+ space_map_obj_t *smo = msp->ms_smo;
+ mutex_enter(&msp->ms_lock);
+ if (!metaslab_usable(msp, size, txg)) {
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+ if (msp->ms_map_incore == 0) {
+ error = space_map_load(&msp->ms_map, smo, SM_FREE,
+ spa->spa_meta_objset, msp->ms_usable_end,
+ msp->ms_map.sm_size - msp->ms_usable_space);
+ ASSERT(error == 0);
+ if (error) {
+ mutex_exit(&msp->ms_lock);
+ metaslab_group_sort(mg, msp, 0);
+ continue;
+ }
+ msp->ms_map_incore = 1;
+ /* XXX -- we'll need a call to picker_init here */
+ bzero(msp->ms_map_cursor, sizeof (msp->ms_map_cursor));
+ }
+ *offp = metaslab_getblock(msp, size, txg);
+ if (*offp != -1ULL) {
+ if ((msp->ms_dirty[txg & TXG_MASK] & MSD_ALLOC) == 0) {
+ vdev_t *vd = mg->mg_vd;
+ msp->ms_dirty[txg & TXG_MASK] |= MSD_ALLOC;
+ msp->ms_last_alloc = txg;
+ vdev_dirty(vd, VDD_ALLOC, txg);
+ (void) txg_list_add(&vd->vdev_ms_list,
+ msp, txg);
+ }
+ mutex_exit(&msp->ms_lock);
+ return (msp);
+ }
+ mutex_exit(&msp->ms_lock);
+ metaslab_group_sort(msp->ms_group, msp, size - 1);
+ }
+
+ return (NULL);
+}
+
+/*
+ * Allocate a block for the specified i/o.
+ */
+int
+metaslab_alloc(spa_t *spa, uint64_t psize, dva_t *dva, uint64_t txg)
+{
+ metaslab_t *msp;
+ metaslab_group_t *mg, *rotor;
+ metaslab_class_t *mc;
+ vdev_t *vd;
+ uint64_t offset = -1ULL;
+ uint64_t asize;
+
+ mc = spa_metaslab_class_select(spa);
+
+ /*
+ * Start at the rotor and loop through all mgs until we find something.
+ * Note that there's no locking on mc_rotor or mc_allocated because
+ * nothing actually breaks if we miss a few updates -- we just won't
+ * allocate quite as evenly. It all balances out over time.
+ */
+ mg = rotor = mc->mc_rotor;
+ do {
+ vd = mg->mg_vd;
+ asize = vdev_psize_to_asize(vd, psize);
+ ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
+
+ msp = metaslab_group_alloc(spa, mg, asize, &offset, txg);
+ if (msp != NULL) {
+ ASSERT(offset != -1ULL);
+
+ /*
+ * If we've just selected this metaslab group,
+ * figure out whether the corresponding vdev is
+ * over- or under-used relative to the pool,
+ * and set an allocation bias to even it out.
+ */
+ if (mc->mc_allocated == 0) {
+ vdev_stat_t *vs = &vd->vdev_stat;
+ uint64_t alloc, space;
+ int64_t vu, su;
+
+ alloc = spa_get_alloc(spa);
+ space = spa_get_space(spa);
+
+ /*
+ * Determine percent used in units of 0..1024.
+ * (This is just to avoid floating point.)
+ */
+ vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
+ su = (alloc << 10) / (space + 1);
+
+ /*
+ * Bias by at most +/- 25% of the aliquot.
+ */
+ mg->mg_bias = ((su - vu) *
+ (int64_t)mg->mg_aliquot) / (1024 * 4);
+
+ dprintf("bias = %lld\n", mg->mg_bias);
+ }
+
+ if (atomic_add_64_nv(&mc->mc_allocated, asize) >=
+ mg->mg_aliquot + mg->mg_bias) {
+ mc->mc_rotor = mg->mg_next;
+ mc->mc_allocated = 0;
+ }
+
+ DVA_SET_VDEV(dva, vd->vdev_id);
+ DVA_SET_OFFSET(dva, offset);
+ DVA_SET_GANG(dva, 0);
+ DVA_SET_ASIZE(dva, asize);
+
+ return (0);
+ }
+ mc->mc_rotor = mg->mg_next;
+ mc->mc_allocated = 0;
+ } while ((mg = mg->mg_next) != rotor);
+
+ dprintf("spa=%p, psize=%llu, txg=%llu: no\n", spa, psize, txg);
+
+ DVA_SET_VDEV(dva, 0);
+ DVA_SET_OFFSET(dva, 0);
+ DVA_SET_GANG(dva, 0);
+
+ return (ENOSPC);
+}
+
+/*
+ * Free the block represented by DVA in the context of the specified
+ * transaction group.
+ */
+void
+metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg)
+{
+ uint64_t vdev = DVA_GET_VDEV(dva);
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t size = DVA_GET_ASIZE(dva);
+ vdev_t *vd;
+ metaslab_t *msp;
+
+ if (txg > spa_freeze_txg(spa))
+ return;
+
+ if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
+ cmn_err(CE_WARN, "metaslab_free(): bad vdev %llu",
+ (u_longlong_t)vdev);
+ ASSERT(0);
+ return;
+ }
+
+ if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
+ cmn_err(CE_WARN, "metaslab_free(): bad offset %llu",
+ (u_longlong_t)offset);
+ ASSERT(0);
+ return;
+ }
+
+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+ if (DVA_GET_GANG(dva))
+ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+
+ mutex_enter(&msp->ms_lock);
+
+ if ((msp->ms_dirty[txg & TXG_MASK] & MSD_FREE) == 0) {
+ msp->ms_dirty[txg & TXG_MASK] |= MSD_FREE;
+ vdev_dirty(vd, VDD_FREE, txg);
+ (void) txg_list_add(&vd->vdev_ms_list, msp, txg);
+ }
+
+ space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
+
+ mutex_exit(&msp->ms_lock);
+}
diff --git a/usr/src/uts/common/fs/zfs/refcount.c b/usr/src/uts/common/fs/zfs/refcount.c
new file mode 100644
index 0000000000..411ed46e13
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/refcount.c
@@ -0,0 +1,194 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+
+#if defined(DEBUG) || !defined(_KERNEL)
+
+#ifdef _KERNEL
+int reference_tracking_enable = FALSE; /* runs out of memory too easily */
+#else
+int reference_tracking_enable = TRUE;
+#endif
+int reference_history = 4; /* tunable */
+
+static kmem_cache_t *reference_cache;
+static kmem_cache_t *reference_history_cache;
+
+void
+refcount_init(void)
+{
+ reference_cache = kmem_cache_create("reference_cache",
+ sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ reference_history_cache = kmem_cache_create("reference_history_cache",
+ sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+refcount_fini(void)
+{
+ kmem_cache_destroy(reference_cache);
+ kmem_cache_destroy(reference_history_cache);
+}
+
+void
+refcount_create(refcount_t *rc)
+{
+ list_create(&rc->rc_list, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+ list_create(&rc->rc_removed, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+ mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+refcount_destroy_many(refcount_t *rc, uint64_t number)
+{
+ reference_t *ref;
+
+ ASSERT(rc->rc_count == number);
+ while (ref = list_head(&rc->rc_list)) {
+ list_remove(&rc->rc_list, ref);
+ kmem_cache_free(reference_cache, ref);
+ }
+ list_destroy(&rc->rc_list);
+
+ while (ref = list_head(&rc->rc_removed)) {
+ list_remove(&rc->rc_removed, ref);
+ kmem_cache_free(reference_history_cache, ref->ref_removed);
+ kmem_cache_free(reference_cache, ref);
+ }
+ list_destroy(&rc->rc_removed);
+ mutex_destroy(&rc->rc_mtx);
+}
+
+void
+refcount_destroy(refcount_t *rc)
+{
+ refcount_destroy_many(rc, 0);
+}
+
+int
+refcount_is_zero(refcount_t *rc)
+{
+ ASSERT(rc->rc_count >= 0);
+ return (rc->rc_count == 0);
+}
+
+int64_t
+refcount_count(refcount_t *rc)
+{
+ ASSERT(rc->rc_count >= 0);
+ return (rc->rc_count);
+}
+
+int64_t
+refcount_add_many(refcount_t *rc, uint64_t number, void *holder)
+{
+ reference_t *ref;
+ int64_t count;
+
+ if (reference_tracking_enable) {
+ ref = kmem_cache_alloc(reference_cache, KM_SLEEP);
+ ref->ref_holder = holder;
+ ref->ref_number = number;
+ }
+ mutex_enter(&rc->rc_mtx);
+ ASSERT(rc->rc_count >= 0);
+ if (reference_tracking_enable)
+ list_insert_head(&rc->rc_list, ref);
+ rc->rc_count += number;
+ count = rc->rc_count;
+ mutex_exit(&rc->rc_mtx);
+
+ return (count);
+}
+
+int64_t
+refcount_add(refcount_t *rc, void *holder)
+{
+ return (refcount_add_many(rc, 1, holder));
+}
+
+int64_t
+refcount_remove_many(refcount_t *rc, uint64_t number, void *holder)
+{
+ reference_t *ref;
+ int64_t count;
+
+ mutex_enter(&rc->rc_mtx);
+ ASSERT(rc->rc_count >= number);
+
+ if (!reference_tracking_enable) {
+ rc->rc_count -= number;
+ count = rc->rc_count;
+ mutex_exit(&rc->rc_mtx);
+ return (count);
+ }
+
+ for (ref = list_head(&rc->rc_list); ref;
+ ref = list_next(&rc->rc_list, ref)) {
+ if (ref->ref_holder == holder && ref->ref_number == number) {
+ list_remove(&rc->rc_list, ref);
+ if (reference_history > 0) {
+ ref->ref_removed =
+ kmem_cache_alloc(reference_history_cache,
+ KM_SLEEP);
+ list_insert_head(&rc->rc_removed, ref);
+ rc->rc_removed_count++;
+ if (rc->rc_removed_count >= reference_history) {
+ ref = list_tail(&rc->rc_removed);
+ list_remove(&rc->rc_removed, ref);
+ kmem_cache_free(reference_history_cache,
+ ref->ref_removed);
+ kmem_cache_free(reference_cache, ref);
+ rc->rc_removed_count--;
+ }
+ } else {
+ kmem_cache_free(reference_cache, ref);
+ }
+ rc->rc_count -= number;
+ count = rc->rc_count;
+ mutex_exit(&rc->rc_mtx);
+ return (count);
+ }
+ }
+ panic("No such hold %p on refcount %llx", holder,
+ (u_longlong_t)(uintptr_t)rc);
+ return (-1);
+}
+
+int64_t
+refcount_remove(refcount_t *rc, void *holder)
+{
+ return (refcount_remove_many(rc, 1, holder));
+}
+
+#endif
diff --git a/usr/src/uts/common/fs/zfs/sha256.c b/usr/src/uts/common/fs/zfs/sha256.c
new file mode 100644
index 0000000000..ce5c26131a
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sha256.c
@@ -0,0 +1,131 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+
+/*
+ * SHA-256 checksum, as specified in FIPS 180-2, available at:
+ * http://csrc.nist.gov/cryptval
+ *
+ * This is a very compact implementation of SHA-256.
+ * It is designed to be simple and portable, not to be fast.
+ */
+
+/*
+ * The literal definitions according to FIPS180-2 would be:
+ *
+ * Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z)))
+ * Maj(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
+ *
+ * We use logical equivalents which require one less op.
+ */
+#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
+#define Maj(x, y, z) (((x) & (y)) ^ ((z) & ((x) ^ (y))))
+#define Rot32(x, s) (((x) >> s) | ((x) << (32 - s)))
+#define SIGMA0(x) (Rot32(x, 2) ^ Rot32(x, 13) ^ Rot32(x, 22))
+#define SIGMA1(x) (Rot32(x, 6) ^ Rot32(x, 11) ^ Rot32(x, 25))
+#define sigma0(x) (Rot32(x, 7) ^ Rot32(x, 18) ^ ((x) >> 3))
+#define sigma1(x) (Rot32(x, 17) ^ Rot32(x, 19) ^ ((x) >> 10))
+
+static const uint32_t SHA256_K[64] = {
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+ 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+ 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+ 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+ 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+ 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+static void
+SHA256Transform(uint32_t *H, const uint8_t *cp)
+{
+ uint32_t a, b, c, d, e, f, g, h, t, T1, T2, W[64];
+
+ for (t = 0; t < 16; t++, cp += 4)
+ W[t] = (cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | cp[3];
+
+ for (t = 16; t < 64; t++)
+ W[t] = sigma1(W[t - 2]) + W[t - 7] +
+ sigma0(W[t - 15]) + W[t - 16];
+
+ a = H[0]; b = H[1]; c = H[2]; d = H[3];
+ e = H[4]; f = H[5]; g = H[6]; h = H[7];
+
+ for (t = 0; t < 64; t++) {
+ T1 = h + SIGMA1(e) + Ch(e, f, g) + SHA256_K[t] + W[t];
+ T2 = SIGMA0(a) + Maj(a, b, c);
+ h = g; g = f; f = e; e = d + T1;
+ d = c; c = b; b = a; a = T1 + T2;
+ }
+
+ H[0] += a; H[1] += b; H[2] += c; H[3] += d;
+ H[4] += e; H[5] += f; H[6] += g; H[7] += h;
+}
+
+void
+zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
+ 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
+ uint8_t pad[128];
+ int padsize = size & 63;
+ int i;
+
+ for (i = 0; i < size - padsize; i += 64)
+ SHA256Transform(H, (uint8_t *)buf + i);
+
+ for (i = 0; i < padsize; i++)
+ pad[i] = ((uint8_t *)buf)[i];
+
+ for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++)
+ pad[padsize] = 0;
+
+ for (i = 0; i < 8; i++)
+ pad[padsize++] = (size << 3) >> (56 - 8 * i);
+
+ for (i = 0; i < padsize; i += 64)
+ SHA256Transform(H, pad + i);
+
+ ZIO_SET_CHECKSUM(zcp,
+ (uint64_t)H[0] << 32 | H[1],
+ (uint64_t)H[2] << 32 | H[3],
+ (uint64_t)H[4] << 32 | H[5],
+ (uint64_t)H[6] << 32 | H[7]);
+}
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
new file mode 100644
index 0000000000..43112d9319
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -0,0 +1,1784 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * This file contains all the routines used when modifying on-disk SPA state.
+ * This includes opening, importing, destroying, exporting a pool, and syncing a
+ * pool.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/dmu_traverse.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/fs/zfs.h>
+#include <sys/callb.h>
+
+static uint32_t spa_active_count;
+
+/*
+ * ==========================================================================
+ * SPA state manipulation (open/create/destroy/import/export)
+ * ==========================================================================
+ */
+
+/*
+ * Activate an uninitialized pool.
+ */
+static void
+spa_activate(spa_t *spa)
+{
+ int t;
+
+ ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+
+ spa->spa_state = POOL_STATE_ACTIVE;
+
+ spa->spa_normal_class = metaslab_class_create();
+
+ spa->spa_vdev_retry_taskq = taskq_create("spa_vdev_retry",
+ 4, maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
+
+ for (t = 0; t < ZIO_TYPES; t++) {
+ spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
+ 8, maxclsyspri, 50, INT_MAX,
+ TASKQ_PREPOPULATE);
+ spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
+ 8, maxclsyspri, 50, INT_MAX,
+ TASKQ_PREPOPULATE);
+ }
+
+ rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);
+
+ list_create(&spa->spa_dirty_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_dirty_node));
+
+ txg_list_create(&spa->spa_vdev_txg_list,
+ offsetof(struct vdev, vdev_txg_node));
+}
+
+/*
+ * Opposite of spa_activate().
+ */
+static void
+spa_deactivate(spa_t *spa)
+{
+ int t;
+
+ ASSERT(spa->spa_sync_on == B_FALSE);
+ ASSERT(spa->spa_dsl_pool == NULL);
+ ASSERT(spa->spa_root_vdev == NULL);
+
+ ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
+
+ txg_list_destroy(&spa->spa_vdev_txg_list);
+
+ list_destroy(&spa->spa_dirty_list);
+
+ rw_destroy(&spa->spa_traverse_lock);
+
+ for (t = 0; t < ZIO_TYPES; t++) {
+ taskq_destroy(spa->spa_zio_issue_taskq[t]);
+ taskq_destroy(spa->spa_zio_intr_taskq[t]);
+ spa->spa_zio_issue_taskq[t] = NULL;
+ spa->spa_zio_intr_taskq[t] = NULL;
+ }
+
+ taskq_destroy(spa->spa_vdev_retry_taskq);
+ spa->spa_vdev_retry_taskq = NULL;
+
+ metaslab_class_destroy(spa->spa_normal_class);
+ spa->spa_normal_class = NULL;
+
+ spa->spa_state = POOL_STATE_UNINITIALIZED;
+}
+
+/*
+ * Verify a pool configuration, and construct the vdev tree appropriately. This
+ * will create all the necessary vdevs in the appropriate layout, with each vdev
+ * in the CLOSED state. This will prep the pool before open/creation/import.
+ * All vdev validation is done by the vdev_alloc() routine.
+ */
+static vdev_t *
+spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype)
+{
+ nvlist_t **child;
+ uint_t c, children;
+ vdev_t *vd;
+
+ if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL)
+ return (NULL);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ return (vd);
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0) {
+ vdev_free(vd);
+ return (NULL);
+ }
+
+ for (c = 0; c < children; c++) {
+ if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) {
+ vdev_free(vd);
+ return (NULL);
+ }
+ }
+
+ return (vd);
+}
+
+/*
+ * Opposite of spa_load().
+ */
+static void
+spa_unload(spa_t *spa)
+{
+ /*
+ * Stop syncing.
+ */
+ if (spa->spa_sync_on) {
+ txg_sync_stop(spa->spa_dsl_pool);
+ spa->spa_sync_on = B_FALSE;
+ }
+
+ /*
+ * Wait for any outstanding prefetch I/O to complete.
+ */
+ spa_config_enter(spa, RW_WRITER);
+ spa_config_exit(spa);
+
+ /*
+ * Close the dsl pool.
+ */
+ if (spa->spa_dsl_pool) {
+ dsl_pool_close(spa->spa_dsl_pool);
+ spa->spa_dsl_pool = NULL;
+ }
+
+ /*
+ * Close all vdevs.
+ */
+ if (spa->spa_root_vdev) {
+ vdev_free(spa->spa_root_vdev);
+ spa->spa_root_vdev = NULL;
+ }
+}
+
+/*
+ * Load an existing storage pool, using the pool's builtin spa_config as a
+ * source of configuration information. The 'readonly' flag will prevent us
+ * from writing any updated state to disk, and can be use when testing a pool
+ * for import.
+ */
+static int
+spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
+{
+ int error = 0;
+ nvlist_t *nvroot = NULL;
+ vdev_t *rvd;
+ uberblock_t *ub = &spa->spa_uberblock;
+ uint64_t pool_guid;
+ zio_t *zio;
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
+ nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
+ return (EINVAL);
+
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ &spa->spa_config_txg);
+
+ if (import && spa_guid_exists(pool_guid, 0))
+ return (EEXIST);
+
+ /*
+ * Parse the configuration into a vdev tree.
+ */
+ spa_config_enter(spa, RW_WRITER);
+ rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
+ spa_config_exit(spa);
+
+ if (rvd == NULL)
+ return (EINVAL);
+
+ spa->spa_root_vdev = rvd;
+ ASSERT(spa_guid(spa) == pool_guid);
+
+ /*
+ * Try to open all vdevs, loading each label in the process.
+ */
+ if (vdev_open(rvd) != 0)
+ return (ENXIO);
+
+ /*
+ * Find the best uberblock.
+ */
+ bzero(ub, sizeof (uberblock_t));
+
+ zio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+ vdev_uberblock_load(zio, rvd, ub);
+ error = zio_wait(zio);
+
+ /*
+ * If we weren't able to find a single valid uberblock, return failure.
+ */
+ if (ub->ub_txg == 0) {
+ dprintf("ub_txg is zero\n");
+ return (ENXIO);
+ }
+
+ /*
+ * If the vdev guid sum doesn't match the uberblock, we have an
+ * incomplete configuration.
+ */
+ if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
+ rvd->vdev_state = VDEV_STATE_CANT_OPEN;
+ rvd->vdev_stat.vs_aux = VDEV_AUX_BAD_GUID_SUM;
+ dprintf("vdev_guid_sum %llx != ub_guid_sum %llx\n",
+ rvd->vdev_guid_sum, ub->ub_guid_sum);
+ return (ENXIO);
+ }
+
+ /*
+ * Initialize internal SPA structures.
+ */
+ spa->spa_state = POOL_STATE_ACTIVE;
+ spa->spa_ubsync = spa->spa_uberblock;
+ spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
+ spa->spa_dsl_pool = dsl_pool_open(spa, spa->spa_first_txg);
+ spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
+
+ VERIFY(zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
+ sizeof (uint64_t), 1, &spa->spa_config_object) == 0);
+
+ if (!mosconfig) {
+ dmu_buf_t *db;
+ char *packed = NULL;
+ size_t nvsize = 0;
+ nvlist_t *newconfig = NULL;
+
+ db = dmu_bonus_hold(spa->spa_meta_objset,
+ spa->spa_config_object);
+ dmu_buf_read(db);
+ nvsize = *(uint64_t *)db->db_data;
+ dmu_buf_rele(db);
+
+ packed = kmem_alloc(nvsize, KM_SLEEP);
+ error = dmu_read_canfail(spa->spa_meta_objset,
+ spa->spa_config_object, 0, nvsize, packed);
+ if (error == 0)
+ error = nvlist_unpack(packed, nvsize, &newconfig, 0);
+ kmem_free(packed, nvsize);
+
+ if (error)
+ return (ENXIO);
+
+ spa_config_set(spa, newconfig);
+
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_activate(spa);
+
+ return (spa_load(spa, newconfig, readonly, import, B_TRUE));
+ }
+
+ VERIFY(zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
+ sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) == 0);
+
+ /*
+ * Load the vdev state for all top level vdevs.
+ */
+ if ((error = vdev_load(rvd, import)) != 0)
+ return (error);
+
+ /*
+ * Propagate the leaf DTLs we just loaded all the way up the tree.
+ */
+ spa_config_enter(spa, RW_WRITER);
+ vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
+ spa_config_exit(spa);
+
+ /*
+ * Check the state of the root vdev. If it can't be opened, it
+ * indicates one or more toplevel vdevs are faulted.
+ */
+ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
+ return (ENXIO);
+
+ /*
+ * Claim log blocks that haven't been committed yet, and update all
+ * top-level vdevs to sync any config changes found in vdev_load().
+ * This must all happen in a single txg.
+ */
+ if ((spa_mode & FWRITE) && !readonly) {
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa),
+ spa_first_txg(spa));
+ dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
+ vdev_config_dirty(rvd);
+ dmu_tx_commit(tx);
+
+ spa->spa_sync_on = B_TRUE;
+ txg_sync_start(spa->spa_dsl_pool);
+
+ /*
+ * Wait for all claims to sync.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ }
+
+ return (0);
+}
+
+/*
+ * Pool Open/Import
+ *
+ * The import case is identical to an open except that the configuration is sent
+ * down from userland, instead of grabbed from the configuration cache.  For the
+ * case of an open, the pool configuration will exist in the
+ * POOL_STATE_UNINITIALIZED state.
+ *
+ * The stats information (gen/count/ustats) is used to gather vdev statistics at
+ * the same time we open the pool, without having to keep around the spa_t in
+ * some ambiguous state.
+ */
+static int
+spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
+{
+ spa_t *spa;
+ int error;
+ int loaded = B_FALSE;
+ int locked = B_FALSE;
+
+ *spapp = NULL;
+
+ /*
+ * As disgusting as this is, we need to support recursive calls to this
+ * function because dsl_dir_open() is called during spa_load(), and ends
+ * up calling spa_open() again. The real fix is to figure out how to
+ * avoid dsl_dir_open() calling this in the first place.
+ */
+ if (mutex_owner(&spa_namespace_lock) != curthread) {
+ mutex_enter(&spa_namespace_lock);
+ locked = B_TRUE;
+ }
+
+ if ((spa = spa_lookup(pool)) == NULL) {
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ return (ENOENT);
+ }
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
+
+ spa_activate(spa);
+
+ error = spa_load(spa, spa->spa_config,
+ B_FALSE, B_FALSE, B_FALSE);
+
+ if (error == EBADF) {
+ /*
+ * If vdev_load() returns EBADF, it indicates that one
+ * of the vdevs indicates that the pool has been
+ * exported or destroyed. If this is the case, the
+ * config cache is out of sync and we should remove the
+ * pool from the namespace.
+ */
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ spa_config_sync();
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ return (ENOENT);
+ } if (error) {
+ /*
+ * We can't open the pool, but we still have useful
+ * information: the state of each vdev after the
+ * attempted vdev_open(). Return this to the user.
+ */
+ if (config != NULL && spa->spa_root_vdev != NULL)
+ *config = spa_config_generate(spa, NULL, -1ULL,
+ B_TRUE);
+ spa_unload(spa);
+ spa_deactivate(spa);
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ *spapp = NULL;
+ return (error);
+ }
+
+ loaded = B_TRUE;
+ }
+
+ spa_open_ref(spa, tag);
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+
+ *spapp = spa;
+
+ if (config != NULL) {
+ spa_config_enter(spa, RW_READER);
+ *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+ spa_config_exit(spa);
+ }
+
+ /*
+ * If we just loaded the pool, resilver anything that's out of date.
+ */
+ if (loaded && (spa_mode & FWRITE))
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+ return (0);
+}
+
+int
+spa_open(const char *name, spa_t **spapp, void *tag)
+{
+ return (spa_open_common(name, spapp, tag, NULL));
+}
+
+int
+spa_get_stats(const char *name, nvlist_t **config)
+{
+ int error;
+ spa_t *spa;
+
+ *config = NULL;
+ error = spa_open_common(name, &spa, FTAG, config);
+
+ if (spa != NULL)
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+/*
+ * Pool Creation
+ */
+int
+spa_create(const char *pool, nvlist_t *nvroot, char *altroot)
+{
+ spa_t *spa;
+ dsl_pool_t *dp;
+ dmu_tx_t *tx;
+ int error;
+ uint64_t txg = TXG_INITIAL;
+
+ /*
+ * If this pool already exists, return failure.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if (spa_lookup(pool) != NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (EEXIST);
+ }
+ spa = spa_add(pool);
+
+ /*
+ * Allocate a new spa_t structure.
+ */
+ spa_activate(spa);
+
+ spa->spa_uberblock.ub_txg = txg - 1;
+ spa->spa_ubsync = spa->spa_uberblock;
+
+ error = spa_vdev_add(spa, nvroot);
+
+ if (error) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (error);
+ }
+
+ if (altroot != NULL) {
+ spa->spa_root = spa_strdup(altroot);
+ atomic_add_32(&spa_active_count, 1);
+ }
+
+ spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
+ spa->spa_meta_objset = dp->dp_meta_objset;
+
+ tx = dmu_tx_create_assigned(dp, txg);
+
+ /*
+ * Create the pool config object.
+ */
+ spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
+ DMU_OT_PACKED_NVLIST, 1 << 14,
+ DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
+
+ VERIFY(zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
+ sizeof (uint64_t), 1, &spa->spa_config_object, tx) == 0);
+
+ /*
+ * Create the deferred-free bplist object. Turn off compression
+ * because sync-to-convergence takes longer if the blocksize
+ * keeps changing.
+ */
+ spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
+ 1 << 14, tx);
+ dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
+ ZIO_COMPRESS_OFF, tx);
+
+ VERIFY(zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
+ sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) == 0);
+
+ dmu_tx_commit(tx);
+
+ spa->spa_sync_on = B_TRUE;
+ txg_sync_start(spa->spa_dsl_pool);
+
+ /*
+ * We explicitly wait for the first transaction to complete so that our
+ * bean counters are appropriately updated.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, txg);
+
+ spa_config_sync();
+
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+/*
+ * Import the given pool into the system. We set up the necessary spa_t and
+ * then call spa_load() to do the dirty work.
+ */
+int
+spa_import(const char *pool, nvlist_t *config, char *altroot)
+{
+ spa_t *spa;
+ int error;
+
+ if (!(spa_mode & FWRITE))
+ return (EROFS);
+
+ /*
+ * If a pool with this name exists, return failure.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if (spa_lookup(pool) != NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (EEXIST);
+ }
+
+ /*
+ * Create an initialize the spa structure
+ */
+ spa = spa_add(pool);
+ spa_activate(spa);
+
+ /*
+ * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig
+ * so that we don't try to open the pool if the config is damaged.
+ */
+ error = spa_load(spa, config, B_FALSE, B_TRUE, B_TRUE);
+
+ if (error) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (error);
+ }
+
+ /*
+ * Set the alternate root, if there is one.
+ */
+ if (altroot != NULL) {
+ atomic_add_32(&spa_active_count, 1);
+ spa->spa_root = spa_strdup(altroot);
+ }
+
+ /*
+ * Initialize the config based on the in-core state.
+ */
+ config = spa_config_generate(spa, NULL, spa_last_synced_txg(spa), 0);
+
+ spa_config_set(spa, config);
+
+ /*
+ * Sync the configuration cache.
+ */
+ spa_config_sync();
+
+ mutex_exit(&spa_namespace_lock);
+
+ /*
+ * Resilver anything that's out of date.
+ */
+ if (spa_mode & FWRITE)
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+ return (0);
+}
+
+/*
+ * This (illegal) pool name is used when temporarily importing a spa_t in order
+ * to get the vdev stats associated with the imported devices.
+ */
+#define TRYIMPORT_NAME "$import"
+
+/*
+ * Temporarily import the pool described by 'tryconfig' under the
+ * reserved name TRYIMPORT_NAME, just long enough to generate an
+ * up-to-date config nvlist, then tear the spa_t back down again.
+ * Returns NULL if 'tryconfig' lacks a pool name or state, or if it
+ * could not be parsed into a vdev tree; otherwise returns the newly
+ * generated config (ownership presumably passes to the caller --
+ * confirm against spa_config_generate()).
+ */
+nvlist_t *
+spa_tryimport(nvlist_t *tryconfig)
+{
+ nvlist_t *config = NULL;
+ char *poolname;
+ spa_t *spa;
+ uint64_t state;
+
+ if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
+ return (NULL);
+
+ if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
+ return (NULL);
+
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_add(TRYIMPORT_NAME);
+
+ ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+
+ /*
+ * Initialize the spa_t structure.
+ */
+ spa_activate(spa);
+
+ /*
+ * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig
+ * so we don't try to open the pool if the config is damaged.
+ */
+ (void) spa_load(spa, tryconfig, B_TRUE, B_TRUE, B_TRUE);
+
+ /*
+ * If 'tryconfig' was at least parsable, return the current config.
+ */
+ if (spa->spa_root_vdev != NULL) {
+ config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
+ poolname) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ state) == 0);
+ }
+
+ /*
+ * Discard the temporary spa_t regardless of the outcome.
+ */
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+
+ return (config);
+}
+
+/*
+ * Pool export/destroy
+ *
+ * The act of destroying or exporting a pool is very simple. We make sure there
+ * is no more pending I/O and any references to the pool are gone. Then, we
+ * update the pool state and sync all the labels to disk, removing the
+ * configuration from the cache afterwards.
+ *
+ * Returns 0 on success, EROFS if the module is loaded read-only,
+ * ENOENT if 'pool' is not in the namespace, or EBUSY if the pool
+ * still has active references after a forced sync.
+ */
+static int
+spa_export_common(char *pool, int new_state)
+{
+ spa_t *spa;
+
+ if (!(spa_mode & FWRITE))
+ return (EROFS);
+
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(pool)) == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (ENOENT);
+ }
+
+ /*
+ * The pool will be in core if it's openable,
+ * in which case we can modify its state.
+ */
+ if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
+ /*
+ * Objsets may be open only because they're dirty, so we
+ * have to force it to sync before checking spa_refcnt.
+ */
+ spa_scrub_suspend(spa);
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+
+ if (!spa_refcount_zero(spa)) {
+ spa_scrub_resume(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (EBUSY);
+ }
+
+ /*
+ * Update the pool state.
+ */
+ spa->spa_state = new_state;
+
+ spa_scrub_resume(spa);
+ VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
+
+ if (spa->spa_root != NULL)
+ atomic_add_32(&spa_active_count, -1);
+
+ /*
+ * We want this to be reflected on every label,
+ * so mark them all dirty. spa_unload() will do the
+ * final sync that pushes these changes out.
+ */
+ vdev_config_dirty(spa->spa_root_vdev);
+ }
+
+ if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ }
+
+ /*
+ * Drop the pool from the namespace and rewrite the config cache
+ * so it no longer appears there.
+ */
+ spa_remove(spa);
+ spa_config_sync();
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+/*
+ * Destroy a storage pool: mark it POOL_STATE_DESTROYED and remove it
+ * from the namespace and the config cache.  Returns 0 or an errno
+ * from spa_export_common().
+ */
+int
+spa_destroy(char *pool)
+{
+ return (spa_export_common(pool, POOL_STATE_DESTROYED));
+}
+
+/*
+ * Export a storage pool: mark it POOL_STATE_EXPORTED (so it can be
+ * imported elsewhere) and remove it from the namespace and the config
+ * cache.  Returns 0 or an errno from spa_export_common().
+ */
+int
+spa_export(char *pool)
+{
+ return (spa_export_common(pool, POOL_STATE_EXPORTED));
+}
+
+/*
+ * ==========================================================================
+ * Device manipulation
+ * ==========================================================================
+ */
+
+/*
+ * Add capacity to a storage pool.  'nvroot' describes one or more new
+ * top-level vdevs; each is transferred under the pool's root vdev and
+ * has its metaslabs initialized.  Also serves spa_create(), which
+ * enters with spa_root_vdev still NULL.  Returns 0, or EINVAL (bad
+ * nvroot) or another errno, via spa_vdev_exit().
+ */
+int
+spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
+{
+ uint64_t txg;
+ int c, error;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd;
+
+ txg = spa_vdev_enter(spa);
+
+ vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);
+
+ if (vd == NULL)
+ return (spa_vdev_exit(spa, vd, txg, EINVAL));
+
+ if (rvd == NULL) /* spa_create() */
+ spa->spa_root_vdev = rvd = vd;
+
+ if ((error = vdev_create(vd, txg)) != 0)
+ return (spa_vdev_exit(spa, vd, txg, error));
+
+ /*
+ * Transfer each top-level vdev from the temporary root
+ * to the spa's root and initialize its metaslabs.
+ */
+ for (c = 0; c < vd->vdev_children; c++) {
+ vdev_t *tvd = vd->vdev_child[c];
+ if (vd != rvd) {
+ vdev_remove_child(vd, tvd);
+ tvd->vdev_id = rvd->vdev_children;
+ vdev_add_child(rvd, tvd);
+ }
+ vdev_init(tvd, txg);
+ vdev_config_dirty(tvd);
+ }
+
+ /*
+ * Update the config based on the new in-core state.
+ */
+ spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));
+
+ return (spa_vdev_exit(spa, vd, txg, 0));
+}
+
+/*
+ * Attach a device to a mirror. The arguments are the path to any device
+ * in the mirror, and the nvroot for the new device. If the path specifies
+ * a device that is not mirrored, we automatically insert the mirror vdev.
+ *
+ * If 'replacing' is specified, the new device is intended to replace the
+ * existing device; in this case the two devices are made into their own
+ * mirror using the 'replacing' vdev, which is functionally identical to
+ * the mirror vdev (it actually reuses all the same ops) but has a few
+ * extra rules: you can't attach to it after it's been created, and upon
+ * completion of resilvering, the first disk (the one being replaced)
+ * is automatically detached.
+ *
+ * Returns 0 on success.  Fails with ENODEV if 'path' is not found,
+ * ENOTSUP if the parent vdev cannot accept the attach, EINVAL if the
+ * new device is not a single leaf, EOVERFLOW if it is smaller than
+ * the old device, or EDOM if its sector size (ashift) differs.
+ */
+int
+spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing)
+{
+ uint64_t txg, open_txg;
+ int error;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
+ vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops;
+
+ txg = spa_vdev_enter(spa);
+
+ oldvd = vdev_lookup_by_path(rvd, path);
+
+ if (oldvd == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ pvd = oldvd->vdev_parent;
+
+ /*
+ * The parent must be a mirror or the root, unless we're replacing;
+ * in that case, the parent can be anything but another replacing vdev.
+ */
+ if (pvd->vdev_ops != &vdev_mirror_ops &&
+ pvd->vdev_ops != &vdev_root_ops &&
+ (!replacing || pvd->vdev_ops == &vdev_replacing_ops))
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);
+
+ if (newrootvd == NULL || newrootvd->vdev_children != 1)
+ return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
+
+ newvd = newrootvd->vdev_child[0];
+
+ if (!newvd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
+
+ if ((error = vdev_create(newrootvd, txg)) != 0)
+ return (spa_vdev_exit(spa, newrootvd, txg, error));
+
+ if (newvd->vdev_psize < oldvd->vdev_psize)
+ return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
+
+ if (newvd->vdev_ashift != oldvd->vdev_ashift && oldvd->vdev_ashift != 0)
+ return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
+
+ /*
+ * If this is an in-place replacement, update oldvd's path and devid
+ * to make it distinguishable from newvd, and unopenable from now on.
+ */
+ if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
+ spa_strfree(oldvd->vdev_path);
+ /* "/old" suffix plus NUL accounts for the extra 5 bytes */
+ oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
+ KM_SLEEP);
+ (void) sprintf(oldvd->vdev_path, "%s/%s",
+ newvd->vdev_path, "old");
+ if (oldvd->vdev_devid != NULL) {
+ spa_strfree(oldvd->vdev_devid);
+ oldvd->vdev_devid = NULL;
+ }
+ }
+
+ /*
+ * If the parent is not a mirror, or if we're replacing,
+ * insert the new mirror/replacing vdev above oldvd.
+ */
+ if (pvd->vdev_ops != pvops)
+ pvd = vdev_add_parent(oldvd, pvops);
+
+ ASSERT(pvd->vdev_top->vdev_parent == rvd);
+ ASSERT(pvd->vdev_ops == pvops);
+ ASSERT(oldvd->vdev_parent == pvd);
+
+ /*
+ * Extract the new device from its root and add it to pvd.
+ */
+ vdev_remove_child(newrootvd, newvd);
+ newvd->vdev_id = pvd->vdev_children;
+ vdev_add_child(pvd, newvd);
+
+ tvd = newvd->vdev_top;
+ ASSERT(pvd->vdev_top == tvd);
+ ASSERT(tvd->vdev_parent == rvd);
+
+ /*
+ * Update the config based on the new in-core state.
+ */
+ spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));
+
+ vdev_config_dirty(tvd);
+
+ /*
+ * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate
+ * upward when spa_vdev_exit() calls vdev_dtl_reassess().
+ */
+ open_txg = txg + TXG_CONCURRENT_STATES - 1;
+
+ mutex_enter(&newvd->vdev_dtl_lock);
+ space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
+ open_txg - TXG_INITIAL + 1);
+ mutex_exit(&newvd->vdev_dtl_lock);
+
+ /*
+ * Mark newvd's DTL dirty in this txg.
+ */
+ vdev_dirty(tvd, VDD_DTL, txg);
+ (void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg);
+
+ dprintf("attached %s, replacing=%d\n", path, replacing);
+
+ (void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
+
+ /*
+ * Kick off a resilver to update newvd.
+ */
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+ return (0);
+}
+
+/*
+ * Detach a device from a mirror or replacing vdev.
+ * If 'replace_done' is specified, only detach if the parent
+ * is a replacing vdev.
+ *
+ * Returns 0 on success; ENODEV if 'path' (or the optional 'guid')
+ * cannot be resolved, ENOTSUP if the parent vdev type does not
+ * support detach, or EBUSY if this device may hold the only valid
+ * copy of some data.
+ */
+int
+spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done)
+{
+ uint64_t txg;
+ int c, t, error;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd, *pvd, *cvd, *tvd;
+
+ txg = spa_vdev_enter(spa);
+
+ vd = vdev_lookup_by_path(rvd, path);
+
+ if (vd == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ if (guid != 0 && vd->vdev_guid != guid)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ pvd = vd->vdev_parent;
+
+ /*
+ * If replace_done is specified, only remove this device if it's
+ * the first child of a replacing vdev.
+ */
+ if (replace_done &&
+ (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops))
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ /*
+ * Only mirror and replacing vdevs support detach.
+ */
+ if (pvd->vdev_ops != &vdev_replacing_ops &&
+ pvd->vdev_ops != &vdev_mirror_ops)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ /*
+ * If there's only one replica, you can't detach it.
+ */
+ if (pvd->vdev_children <= 1)
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+ /*
+ * If all siblings have non-empty DTLs, this device may have the only
+ * valid copy of the data, which means we cannot safely detach it.
+ *
+ * XXX -- as in the vdev_offline() case, we really want a more
+ * precise DTL check.
+ */
+ for (c = 0; c < pvd->vdev_children; c++) {
+ uint64_t dirty;
+
+ cvd = pvd->vdev_child[c];
+ if (cvd == vd)
+ continue;
+ if (vdev_is_dead(cvd))
+ continue;
+ mutex_enter(&cvd->vdev_dtl_lock);
+ dirty = cvd->vdev_dtl_map.sm_space |
+ cvd->vdev_dtl_scrub.sm_space;
+ mutex_exit(&cvd->vdev_dtl_lock);
+ if (!dirty)
+ break;
+ }
+ /*
+ * Loop ran to completion: no live sibling had an empty DTL,
+ * so vd may be the sole good copy -- refuse the detach.
+ */
+ if (c == pvd->vdev_children)
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+ /*
+ * Erase the disk labels so the disk can be used for other things.
+ * This must be done after all other error cases are handled,
+ * but before we disembowel vd (so we can still do I/O to it).
+ * But if we can't do it, don't treat the error as fatal --
+ * it may be that the unwritability of the disk is the reason
+ * it's being detached!
+ */
+ error = vdev_label_init(vd, 0);
+ if (error)
+ dprintf("unable to erase labels on %s\n", vdev_description(vd));
+
+ /*
+ * Remove vd from its parent and compact the parent's children.
+ */
+ vdev_remove_child(pvd, vd);
+ vdev_compact_children(pvd);
+
+ /*
+ * Remember one of the remaining children so we can get tvd below.
+ */
+ cvd = pvd->vdev_child[0];
+
+ /*
+ * If the parent mirror/replacing vdev only has one child,
+ * the parent is no longer needed. Remove it from the tree.
+ */
+ if (pvd->vdev_children == 1)
+ vdev_remove_parent(cvd);
+
+ /*
+ * We don't set tvd until now because the parent we just removed
+ * may have been the previous top-level vdev.
+ */
+ tvd = cvd->vdev_top;
+ ASSERT(tvd->vdev_parent == rvd);
+
+ /*
+ * Reopen this top-level vdev to reassess health after detach.
+ */
+ vdev_reopen(tvd, NULL);
+
+ /*
+ * If the device we just detached was smaller than the others,
+ * it may be possible to add metaslabs (i.e. grow the pool).
+ */
+ vdev_metaslab_init(tvd, txg);
+
+ /*
+ * Update the config based on the new in-core state.
+ */
+ spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));
+
+ vdev_config_dirty(tvd);
+
+ /*
+ * Mark vd's DTL as dirty in this txg.
+ * vdev_dtl_sync() will see that vd->vdev_detached is set
+ * and free vd's DTL object in syncing context.
+ * But first make sure we're not on any *other* txg's DTL list,
+ * to prevent vd from being accessed after it's freed.
+ */
+ vdev_dirty(tvd, VDD_DTL, txg);
+ vd->vdev_detached = B_TRUE;
+ for (t = 0; t < TXG_SIZE; t++)
+ (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
+ (void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
+
+ dprintf("detached %s\n", path);
+
+ return (spa_vdev_exit(spa, vd, txg, 0));
+}
+
+/*
+ * If there are any replacing vdevs that have finished replacing, detach them.
+ * We can't hold the config lock across detaches, so we lock the config,
+ * build a list of candidates, unlock the config, and try each candidate.
+ */
+typedef struct vdev_detach_link {
+ char *vdl_path; /* copy of the candidate vdev's path */
+ uint64_t vdl_guid; /* its guid, to guard against path reuse */
+ list_node_t vdl_node; /* linkage on the candidate list */
+} vdev_detach_link_t;
+
+/*
+ * Recursively walk the vdev tree under 'vd' and append to list 'l' an
+ * entry (path + guid of child 0, the device being replaced) for every
+ * replacing vdev whose second child has an empty DTL -- i.e. whose
+ * resilver has completed.
+ */
+static void
+spa_vdev_replace_done_make_list(list_t *l, vdev_t *vd)
+{
+ int c;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ spa_vdev_replace_done_make_list(l, vd->vdev_child[c]);
+
+ if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
+ vdev_t *cvd0 = vd->vdev_child[0];
+ vdev_t *cvd1 = vd->vdev_child[1];
+ vdev_detach_link_t *vdl;
+ int dirty1;
+
+ mutex_enter(&cvd1->vdev_dtl_lock);
+ dirty1 = cvd1->vdev_dtl_map.sm_space |
+ cvd1->vdev_dtl_scrub.sm_space;
+ mutex_exit(&cvd1->vdev_dtl_lock);
+
+ if (!dirty1) {
+ vdl = kmem_zalloc(sizeof (*vdl), KM_SLEEP);
+ vdl->vdl_path = spa_strdup(cvd0->vdev_path);
+ vdl->vdl_guid = cvd0->vdev_guid;
+ list_insert_tail(l, vdl);
+ }
+ }
+}
+
+/*
+ * Detach every fully-resilvered replacing vdev in the pool.  See the
+ * comment above vdev_detach_link_t for the locking strategy: build
+ * the candidate list under the config lock, then detach lock-free.
+ */
+void
+spa_vdev_replace_done(spa_t *spa)
+{
+ vdev_detach_link_t *vdl;
+ list_t vdlist;
+
+ list_create(&vdlist, sizeof (vdev_detach_link_t),
+ offsetof(vdev_detach_link_t, vdl_node));
+
+ spa_config_enter(spa, RW_READER);
+ spa_vdev_replace_done_make_list(&vdlist, spa->spa_root_vdev);
+ spa_config_exit(spa);
+
+ while ((vdl = list_head(&vdlist)) != NULL) {
+ list_remove(&vdlist, vdl);
+ /* Best effort: a failed detach is simply skipped. */
+ (void) spa_vdev_detach(spa, vdl->vdl_path, vdl->vdl_guid,
+ B_TRUE);
+ spa_strfree(vdl->vdl_path);
+ kmem_free(vdl, sizeof (*vdl));
+ }
+
+ list_destroy(&vdlist);
+}
+
+/*
+ * ==========================================================================
+ * SPA Scrubbing
+ * ==========================================================================
+ */
+
+static int spa_scrub_locked(spa_t *, pool_scrub_type_t, boolean_t);
+
+/*
+ * zio completion callback for scrub/resilver reads: free the data
+ * buffer, tally any error against both the pool and the target vdev,
+ * and wake spa_scrub_thread() when the last in-flight I/O drains.
+ */
+static void
+spa_scrub_io_done(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+
+ zio_buf_free(zio->io_data, zio->io_size);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ if (zio->io_error)
+ spa->spa_scrub_errors++;
+ if (--spa->spa_scrub_inflight == 0)
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&spa->spa_scrub_lock);
+
+ if (zio->io_error) {
+ vdev_t *vd = zio->io_vd;
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_scrub_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+}
+
+/*
+ * Issue an asynchronous read of 'bp' for scrub/resilver purposes and
+ * bump spa_scrub_inflight; spa_scrub_io_done() frees the buffer and
+ * drops the count.  NOTE(review): any repair of bad copies presumably
+ * happens inside the zio pipeline via the RESILVER/SCRUB flags --
+ * confirm in zio.c.
+ */
+static void
+spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags)
+{
+ size_t size = BP_GET_LSIZE(bp);
+ void *data = zio_buf_alloc(size);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_inflight++;
+ mutex_exit(&spa->spa_scrub_lock);
+
+ zio_nowait(zio_read(NULL, spa, bp, data, size,
+ spa_scrub_io_done, NULL, priority, flags));
+}
+
+/*
+ * Traverse callback invoked for each block pointer visited.  Records
+ * unreadable blocks as errors and continues; otherwise accounts the
+ * examined bytes and issues a scrub read, or -- for resilvers -- a
+ * resilver read only if the block's birth txg falls in the relevant
+ * DTL.  Returns ERESTART for blocks it cannot handle here (exact
+ * semantics defined by the traverse code -- confirm in dmu_traverse.c)
+ * and 0 otherwise.
+ */
+/* ARGSUSED */
+static int
+spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
+{
+ blkptr_t *bp = &bc->bc_blkptr;
+ vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0]));
+
+ if (bc->bc_errno || vd == NULL) {
+ /*
+ * We can't scrub this block, but we can continue to scrub
+ * the rest of the pool. Note the error and move along.
+ */
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_errors++;
+ mutex_exit(&spa->spa_scrub_lock);
+
+ if (vd != NULL) {
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_scrub_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+
+ return (ERESTART);
+ }
+
+ ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);
+
+ /*
+ * Keep track of how much data we've examined so that
+ * zpool(1M) status can make useful progress reports.
+ */
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp);
+ mutex_exit(&vd->vdev_stat_lock);
+
+ if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
+ if (DVA_GET_GANG(&bp->blk_dva[0])) {
+ /*
+ * Gang members may be spread across multiple vdevs,
+ * so the best we can do is look at the pool-wide DTL.
+ * XXX -- it would be better to change our allocation
+ * policy to ensure that this can't happen.
+ */
+ vd = spa->spa_root_vdev;
+ }
+ if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) {
+ spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY |
+ ZIO_FLAG_RESILVER);
+ }
+ } else {
+ spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SCRUB);
+ }
+
+ return (0);
+}
+
+/*
+ * Body of the scrub/resilver thread created by spa_scrub_locked().
+ * Repeatedly calls traverse_more() (which feeds spa_scrub_cb()),
+ * honoring suspend/stop/restart requests between chunks, then waits
+ * for outstanding scrub I/Os, updates DTLs and scrub stats, detaches
+ * completed replacements, and finally either exits or restarts a new
+ * scrub if a restart txg was posted while it ran.
+ */
+static void
+spa_scrub_thread(spa_t *spa)
+{
+ callb_cpr_t cprinfo;
+ traverse_handle_t *th = spa->spa_scrub_th;
+ vdev_t *rvd = spa->spa_root_vdev;
+ pool_scrub_type_t scrub_type = spa->spa_scrub_type;
+ int error = 0;
+ boolean_t complete;
+
+ CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);
+
+ spa_config_enter(spa, RW_WRITER);
+ vdev_reopen(rvd, NULL); /* purge all vdev caches */
+ vdev_config_dirty(rvd); /* rewrite all disk labels */
+ vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
+ spa_config_exit(spa);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_errors = 0;
+ spa->spa_scrub_active = 1;
+
+ while (!spa->spa_scrub_stop) {
+ /* CPR-safe wait: park here while a suspend is in effect. */
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ while (spa->spa_scrub_suspend) {
+ spa->spa_scrub_active = 0;
+ cv_broadcast(&spa->spa_scrub_cv);
+ cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
+ spa->spa_scrub_active = 1;
+ }
+ CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);
+
+ if (spa->spa_scrub_restart_txg != 0)
+ break;
+
+ mutex_exit(&spa->spa_scrub_lock);
+ error = traverse_more(th);
+ mutex_enter(&spa->spa_scrub_lock);
+ /* EAGAIN means "more to traverse"; anything else ends it. */
+ if (error != EAGAIN)
+ break;
+ }
+
+ while (spa->spa_scrub_inflight)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+
+ if (spa->spa_scrub_restart_txg != 0)
+ error = ERESTART;
+
+ spa->spa_scrub_active = 0;
+ cv_broadcast(&spa->spa_scrub_cv);
+
+ /*
+ * If the traverse completed, and there were no errors,
+ * then the scrub was completely successful.
+ */
+ complete = (error == 0 && spa->spa_scrub_errors == 0);
+
+ dprintf("scrub to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
+ spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
+ error, spa->spa_scrub_errors, spa->spa_scrub_stop);
+
+ mutex_exit(&spa->spa_scrub_lock);
+
+ /*
+ * If the scrub/resilver completed, update all DTLs to reflect this.
+ * Whether it succeeded or not, vacate all temporary scrub DTLs.
+ */
+ spa_config_enter(spa, RW_WRITER);
+ vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
+ complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
+ spa_config_exit(spa);
+
+ spa_vdev_replace_done(spa);
+
+ spa_config_enter(spa, RW_READER);
+ vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
+ spa_config_exit(spa);
+
+ mutex_enter(&spa->spa_scrub_lock);
+
+ spa->spa_scrub_type = POOL_SCRUB_NONE;
+ spa->spa_scrub_active = 0;
+ spa->spa_scrub_thread = NULL;
+
+ cv_broadcast(&spa->spa_scrub_cv);
+
+ /*
+ * If we were told to restart, our final act is to start a new scrub.
+ */
+ if (error == ERESTART)
+ VERIFY(spa_scrub_locked(spa, scrub_type, B_TRUE) == 0);
+
+ CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */
+ thread_exit();
+}
+
+/*
+ * Suspend the scrub thread: bump the suspend count, then wait until
+ * the thread has gone inactive and all its I/Os have drained.
+ * Suspends nest; each call must be paired with spa_scrub_resume().
+ */
+void
+spa_scrub_suspend(spa_t *spa)
+{
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_suspend++;
+ while (spa->spa_scrub_active) {
+ cv_broadcast(&spa->spa_scrub_cv);
+ cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
+ }
+ while (spa->spa_scrub_inflight)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+/*
+ * Drop one suspend reference; wake the scrub thread once the count
+ * reaches zero.  Must balance a prior spa_scrub_suspend().
+ */
+void
+spa_scrub_resume(spa_t *spa)
+{
+ mutex_enter(&spa->spa_scrub_lock);
+ ASSERT(spa->spa_scrub_suspend != 0);
+ if (--spa->spa_scrub_suspend == 0)
+ cv_broadcast(&spa->spa_scrub_cv);
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+/*
+ * Request that any in-progress scrub restart from txg 'txg'; the scrub
+ * thread notices the nonzero restart txg between traverse chunks.
+ */
+void
+spa_scrub_restart(spa_t *spa, uint64_t txg)
+{
+ /*
+ * Something happened (e.g. snapshot create/delete) that means
+ * we must restart any in-progress scrubs. The itinerary will
+ * fix this properly.
+ */
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_restart_txg = txg;
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+/*
+ * Start a scrub of the given type (or just stop the current one, for
+ * POOL_SCRUB_NONE).  Caller must hold spa_scrub_lock.  Any scrub in
+ * progress is stopped first -- except an unforced resilver, which is
+ * refused with EBUSY -- then the txg range to examine is computed and
+ * spa_scrub_thread() is kicked off.  Returns ENOTSUP for an unknown
+ * type, EBUSY as above, 0 otherwise.
+ */
+static int
+spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force)
+{
+ space_seg_t *ss;
+ uint64_t mintxg, maxtxg;
+ vdev_t *rvd = spa->spa_root_vdev;
+ int advance = 0;
+
+ if ((uint_t)type >= POOL_SCRUB_TYPES)
+ return (ENOTSUP);
+
+ /*
+ * If there's a scrub or resilver already in progress, stop it.
+ */
+ while (spa->spa_scrub_thread != NULL) {
+ /*
+ * Don't stop a resilver unless forced.
+ */
+ if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force)
+ return (EBUSY);
+
+ spa->spa_scrub_stop = 1;
+ cv_broadcast(&spa->spa_scrub_cv);
+ cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
+ }
+
+ /*
+ * Terminate the previous traverse.
+ */
+ if (spa->spa_scrub_th != NULL) {
+ traverse_fini(spa->spa_scrub_th);
+ spa->spa_scrub_th = NULL;
+ }
+
+ spa->spa_scrub_stop = 0;
+ spa->spa_scrub_type = type;
+ spa->spa_scrub_restart_txg = 0;
+
+ mintxg = TXG_INITIAL - 1;
+ maxtxg = spa_last_synced_txg(spa) + 1;
+
+ switch (type) {
+
+ case POOL_SCRUB_NONE:
+ break;
+
+ case POOL_SCRUB_RESILVER:
+ /*
+ * Determine the resilvering boundaries.
+ *
+ * Note: (mintxg, maxtxg) is an open interval,
+ * i.e. mintxg and maxtxg themselves are not included.
+ *
+ * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
+ * so we don't claim to resilver a txg that's still changing.
+ */
+ mutex_enter(&rvd->vdev_dtl_lock);
+ ss = avl_first(&rvd->vdev_dtl_map.sm_root);
+ mintxg = ss ? ss->ss_start - 1 : 0;
+ ss = avl_last(&rvd->vdev_dtl_map.sm_root);
+ maxtxg = ss ? ss->ss_end : 0;
+ maxtxg = MIN(maxtxg, spa_last_synced_txg(spa) + 1);
+ mutex_exit(&rvd->vdev_dtl_lock);
+
+ advance = ADVANCE_PRE | ADVANCE_PRUNE;
+ break;
+
+ case POOL_SCRUB_EVERYTHING:
+ /*
+ * A scrub is like a resilver, but not pruned by DTL.
+ */
+ advance = ADVANCE_PRE;
+ break;
+ }
+
+ /*
+ * Skip thread creation entirely when there is nothing to do
+ * (type NONE, or a resilver whose DTL turned out to be empty).
+ */
+ if (mintxg != 0 && maxtxg != 0 && type != POOL_SCRUB_NONE) {
+ spa->spa_scrub_maxtxg = maxtxg;
+ spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
+ advance, ZIO_FLAG_CANFAIL);
+ traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
+ spa->spa_scrub_thread = thread_create(NULL, 0,
+ spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
+ }
+
+ return (0);
+}
+
+/*
+ * Public entry point: take spa_scrub_lock and delegate to
+ * spa_scrub_locked().  If no scrub thread was started (e.g. nothing
+ * to resilver) and a scrub was requested, reap any already-completed
+ * replacing vdevs directly.
+ */
+int
+spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
+{
+ int error;
+ traverse_handle_t *th;
+
+ mutex_enter(&spa->spa_scrub_lock);
+ error = spa_scrub_locked(spa, type, force);
+ th = spa->spa_scrub_th;
+ mutex_exit(&spa->spa_scrub_lock);
+
+ if (th == NULL && type != POOL_SCRUB_NONE)
+ spa_vdev_replace_done(spa);
+
+ return (error);
+}
+
+/*
+ * ==========================================================================
+ * SPA syncing routines
+ * ==========================================================================
+ */
+
+/*
+ * Free every block recorded on the pool's deferred-free bplist, wait
+ * for the frees to complete, then vacate the list in txg 'txg'.
+ */
+static void
+spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
+{
+ bplist_t *bpl = &spa->spa_sync_bplist;
+ dmu_tx_t *tx;
+ blkptr_t blk;
+ uint64_t itor = 0;
+ zio_t *zio;
+ int error;
+ uint8_t c = 1;
+
+ zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);
+
+ while (bplist_iterate(bpl, &itor, &blk) == 0)
+ zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));
+
+ error = zio_wait(zio);
+ ASSERT3U(error, ==, 0);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ bplist_vacate(bpl, tx);
+
+ /*
+ * Pre-dirty the first block so we sync to convergence faster.
+ * (Usually only the first block is needed.)
+ */
+ dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
+ dmu_tx_commit(tx);
+}
+
+/*
+ * If the vdev configuration is dirty, regenerate it, pack it as an
+ * XDR-encoded nvlist, and write it into the pool's config object;
+ * the object's bonus buffer records the packed size.
+ */
+static void
+spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
+{
+ nvlist_t *config;
+ char *packed = NULL;
+ size_t nvsize = 0;
+ dmu_buf_t *db;
+
+ if (list_is_empty(&spa->spa_dirty_list))
+ return;
+
+ config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);
+
+ spa_config_set(spa, config);
+
+ VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0);
+
+ packed = kmem_alloc(nvsize, KM_SLEEP);
+
+ VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, 0) == 0);
+
+ dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize,
+ packed, tx);
+
+ kmem_free(packed, nvsize);
+
+ db = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object);
+ dmu_buf_will_dirty(db, tx);
+ *(uint64_t *)db->db_data = nvsize;
+ dmu_buf_rele(db);
+}
+
+/*
+ * Sync the specified transaction group. New blocks may be dirtied as
+ * part of the process, so we iterate until it converges.
+ *
+ * Takes the config lock as reader for the duration.  The final label
+ * rewrite (spa_sync_labels) is retried until it succeeds, reopening
+ * the vdev tree between attempts.
+ */
+void
+spa_sync(spa_t *spa, uint64_t txg)
+{
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ objset_t *mos = spa->spa_meta_objset;
+ bplist_t *bpl = &spa->spa_sync_bplist;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd;
+ dmu_tx_t *tx;
+ int dirty_vdevs;
+
+ /*
+ * Lock out configuration changes.
+ */
+ spa_config_enter(spa, RW_READER);
+
+ spa->spa_syncing_txg = txg;
+ spa->spa_sync_pass = 0;
+
+ bplist_open(bpl, mos, spa->spa_sync_bplist_obj);
+
+ /*
+ * If anything has changed in this txg, push the deferred frees
+ * from the previous txg. If not, leave them alone so that we
+ * don't generate work on an otherwise idle system.
+ */
+ if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
+ !txg_list_empty(&dp->dp_dirty_dirs, txg))
+ spa_sync_deferred_frees(spa, txg);
+
+ /*
+ * Iterate to convergence.
+ */
+ do {
+ spa->spa_sync_pass++;
+
+ tx = dmu_tx_create_assigned(dp, txg);
+ spa_sync_config_object(spa, tx);
+ dmu_tx_commit(tx);
+
+ dsl_pool_sync(dp, txg);
+
+ dirty_vdevs = 0;
+ /* NOTE: assignment in the loop condition is intentional. */
+ while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
+ vdev_sync(vd, txg);
+ dirty_vdevs++;
+ }
+
+ tx = dmu_tx_create_assigned(dp, txg);
+ bplist_sync(bpl, tx);
+ dmu_tx_commit(tx);
+
+ } while (dirty_vdevs);
+
+ bplist_close(bpl);
+
+ dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
+
+ /*
+ * Rewrite the vdev configuration (which includes the uberblock)
+ * to commit the transaction group.
+ */
+ while (spa_sync_labels(spa, txg)) {
+ dprintf("waiting for devices to heal\n");
+ delay(hz);
+ vdev_reopen(rvd, NULL);
+ }
+
+ /*
+ * Make a stable copy of the fully synced uberblock.
+ * We use this as the root for pool traversals.
+ */
+ spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */
+
+ spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */
+
+ rw_enter(&spa->spa_traverse_lock, RW_WRITER);
+ spa->spa_traverse_wanted = 0;
+ spa->spa_ubsync = spa->spa_uberblock;
+ rw_exit(&spa->spa_traverse_lock);
+
+ spa_scrub_resume(spa); /* resume scrub with new ubsync */
+
+ /*
+ * Clean up the ZIL records for the synced txg.
+ */
+ dsl_pool_zil_clean(dp);
+
+ /*
+ * Update usable space statistics.
+ */
+ while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
+ vdev_sync_done(vd, txg);
+
+ /*
+ * It had better be the case that we didn't dirty anything
+ * since spa_sync_labels().
+ */
+ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
+ ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
+ ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
+ ASSERT(bpl->bpl_queue == NULL);
+
+ spa_config_exit(spa);
+}
+
+/*
+ * Sync all pools. We don't want to hold the namespace lock across these
+ * operations, so we take a reference on the spa_t and drop the lock during the
+ * sync.
+ */
+void
+spa_sync_allpools(void)
+{
+ spa_t *spa = NULL;
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL) {
+ if (spa_state(spa) != POOL_STATE_ACTIVE)
+ continue;
+ /*
+ * Hold the spa so it can't disappear while we drop the
+ * namespace lock for the (potentially long) sync wait.
+ */
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+ }
+ mutex_exit(&spa_namespace_lock);
+}
+
+/*
+ * ==========================================================================
+ * Miscellaneous routines
+ * ==========================================================================
+ */
+
+/*
+ * Returns nonzero while spa_active_count is nonzero; the count is
+ * adjusted where spa_root (the alternate root) is set and cleared --
+ * see the import path above and spa_export_common().
+ */
+int
+spa_busy(void)
+{
+ return (spa_active_count != 0);
+}
+
+/*
+ * Remove all pools in the system: stop their scrub activity, unload
+ * them, and delete every spa_t from the namespace.  All pools should
+ * already be closed, so each spa_t should be unreferenced.
+ */
+void
+spa_evict_all(void)
+{
+ spa_t *spa;
+
+ /*
+ * Remove all cached state. All pools should be closed now,
+ * so every spa in the AVL tree should be unreferenced.
+ */
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(NULL)) != NULL) {
+ /*
+ * Stop all scrub and resilver activity. spa_scrub() needs to
+ * wait for the scrub thread, which may do a detach and sync the
+ * configs, which needs spa_namespace_lock. Drop the lock while
+ * maintaining a hold on the spa_t.
+ */
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+
+ if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ }
+ spa_remove(spa);
+ }
+ mutex_exit(&spa_namespace_lock);
+}
diff --git a/usr/src/uts/common/fs/zfs/spa_config.c b/usr/src/uts/common/fs/zfs/spa_config.c
new file mode 100644
index 0000000000..abcd67ddb9
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/spa_config.c
@@ -0,0 +1,308 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/nvpair.h>
+#include <sys/uio.h>
+#include <sys/fs/zfs.h>
+#include <sys/vdev_impl.h>
+#include <sys/zfs_ioctl.h>
+
+/*
+ * Pool configuration repository.
+ *
+ * The configuration for all pools, in addition to being stored on disk, is
+ * stored in /kernel/drv/zpool.cache as a packed nvlist. The kernel maintains
+ * this list as pools are created, destroyed, or modified.
+ *
+ * We have a single nvlist which holds all the configuration information. When
+ * the module loads, we read this information from the cache and populate the
+ * SPA namespace. This namespace is maintained independently in spa.c.
+ * Whenever the namespace is modified, or the configuration of a pool is
+ * changed, we call spa_config_sync(), which walks through all the active pools
+ * and writes the configuration to disk.
+ */
+
+static uint64_t spa_config_generation = 1;
+
+/*
+ * This can be overridden in userland to preserve an alternate namespace for
+ * userland pools when doing testing.
+ */
+const char *spa_config_dir = ZPOOL_CACHE_DIR;
+
+/*
+ * Called when the module is first loaded, this routine loads the configuration
+ * file into the SPA namespace. It does not actually open or load the pools; it
+ * only populates the namespace.
+ */
+void
+spa_config_load(void)
+{
+ vnode_t *vp;
+ void *buf = NULL;
+ vattr_t vattr;
+ ssize_t resid;
+ nvlist_t *nvlist, *child;
+ nvpair_t *nvpair;
+ spa_t *spa;
+ char pathname[128];
+
+ /*
+ * Open the configuration file. A missing or unreadable cache file
+ * is not an error -- we simply start with an empty namespace.
+ */
+ (void) snprintf(pathname, sizeof (pathname), "./%s/%s", spa_config_dir,
+ ZPOOL_CACHE_FILE);
+ if (vn_openat(pathname, UIO_SYSSPACE, FREAD | FOFFMAX, 0, &vp, 0, 0,
+ rootdir) != 0)
+ return;
+
+ /*
+ * Read the nvlist from the file.
+ * NOTE(review): vattr.va_mask is not set before VOP_GETATTR;
+ * presumably the filesystem fills in va_size regardless -- confirm.
+ */
+ if (VOP_GETATTR(vp, &vattr, 0, kcred) != 0)
+ goto out;
+
+ buf = kmem_alloc(vattr.va_size, KM_SLEEP);
+
+ if (vn_rdwr(UIO_READ, vp, buf, vattr.va_size, 0, UIO_SYSSPACE,
+ 0, RLIM64_INFINITY, kcred, &resid) != 0)
+ goto out;
+
+ /* A short read means the file changed size underneath us; bail. */
+ if (resid != 0)
+ goto out;
+
+ /*
+ * Unpack the nvlist.
+ */
+ if (nvlist_unpack(buf, vattr.va_size, &nvlist, KM_SLEEP) != 0)
+ goto out;
+
+ /*
+ * Iterate over all elements in the nvlist, creating a new spa_t for
+ * each one with the specified configuration.
+ */
+ mutex_enter(&spa_namespace_lock);
+ nvpair = NULL;
+ while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
+
+ if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
+ continue;
+
+ VERIFY(nvpair_value_nvlist(nvpair, &child) == 0);
+
+ /* Skip pools that already exist (e.g. created before load). */
+ if (spa_lookup(nvpair_name(nvpair)) != NULL)
+ continue;
+ spa = spa_add(nvpair_name(nvpair));
+
+ /*
+ * We blindly duplicate the configuration here. If it's
+ * invalid, we will catch it when the pool is first opened.
+ */
+ VERIFY(nvlist_dup(child, &spa->spa_config, 0) == 0);
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ nvlist_free(nvlist);
+
+out:
+ /* Common exit for success and failure: release buffer and vnode. */
+ if (buf != NULL)
+ kmem_free(buf, vattr.va_size);
+
+ (void) VOP_CLOSE(vp, FREAD | FOFFMAX, 1, 0, kcred);
+ VN_RELE(vp);
+}
+
+/*
+ * Synchronize all pools to disk. This must be called with the namespace lock
+ * held.
+ */
+void
+spa_config_sync(void)
+{
+ spa_t *spa = NULL;
+ nvlist_t *config;
+ size_t buflen;
+ char *buf;
+ vnode_t *vp;
+ int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX;
+ char pathname[128];
+ char pathname2[128];
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
+
+ /*
+ * Add all known pools to the configuration list, ignoring those with
+ * alternate root paths.
+ */
+ spa = NULL;
+ while ((spa = spa_next(spa)) != NULL) {
+ mutex_enter(&spa->spa_config_cache_lock);
+ if (spa->spa_config && spa->spa_name && spa->spa_root == NULL)
+ VERIFY(nvlist_add_nvlist(config, spa->spa_name,
+ spa->spa_config) == 0);
+ mutex_exit(&spa->spa_config_cache_lock);
+ }
+
+ /*
+ * Pack the configuration into a buffer.
+ */
+ VERIFY(nvlist_size(config, &buflen, NV_ENCODE_XDR) == 0);
+
+ buf = kmem_alloc(buflen, KM_SLEEP);
+
+ VERIFY(nvlist_pack(config, &buf, &buflen, NV_ENCODE_XDR, 0) == 0);
+
+ /*
+ * Write the configuration to disk. We need to do the traditional
+ * 'write to temporary file, sync, move over original' to make sure we
+ * always have a consistent view of the data.
+ */
+ (void) snprintf(pathname, sizeof (pathname), "%s/%s", spa_config_dir,
+ ZPOOL_CACHE_TMP);
+
+ if (vn_open(pathname, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0) != 0)
+ goto out;
+
+ /* Only rename into place if both the write and the fsync succeed. */
+ if (vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE,
+ 0, RLIM64_INFINITY, kcred, NULL) == 0 &&
+ VOP_FSYNC(vp, FSYNC, kcred) == 0) {
+ (void) snprintf(pathname2, sizeof (pathname2), "%s/%s",
+ spa_config_dir, ZPOOL_CACHE_FILE);
+ (void) vn_rename(pathname, pathname2, UIO_SYSSPACE);
+ }
+
+ (void) VOP_CLOSE(vp, oflags, 1, 0, kcred);
+ VN_RELE(vp);
+
+out:
+ /*
+ * Remove the temp file (a no-op after a successful rename).
+ * NOTE(review): the generation is bumped even when vn_open fails,
+ * so spa_all_configs() callers still see a change -- looks
+ * intentional, but confirm.
+ */
+ (void) vn_remove(pathname, UIO_SYSSPACE, RMFILE);
+ spa_config_generation++;
+
+ kmem_free(buf, buflen);
+ nvlist_free(config);
+}
+
+/*
+ * Sigh. Inside a local zone, we don't have access to /kernel/drv/zpool.cache,
+ * and we don't want to allow the local zone to see all the pools anyway.
+ * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration
+ * information for all pools visible within the zone.
+ */
+nvlist_t *
+spa_all_configs(uint64_t *generation)
+{
+ nvlist_t *pools;
+ spa_t *spa;
+
+ /*
+ * If the caller's generation is current, there is nothing new to
+ * report. (Read without the namespace lock; a stale read only
+ * costs an extra round trip.)
+ */
+ if (*generation == spa_config_generation)
+ return (NULL);
+
+ VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, 0) == 0);
+
+ spa = NULL;
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL) {
+ /* Non-global zones only see pools with visible datasets. */
+ if (INGLOBALZONE(curproc) ||
+ zone_dataset_visible(spa_name(spa), NULL)) {
+ mutex_enter(&spa->spa_config_cache_lock);
+ VERIFY(nvlist_add_nvlist(pools, spa_name(spa),
+ spa->spa_config) == 0);
+ mutex_exit(&spa->spa_config_cache_lock);
+ }
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ *generation = spa_config_generation;
+
+ return (pools);
+}
+
+/*
+ * Replace the cached config nvlist for 'spa', freeing any previous one.
+ * Takes ownership of 'config' (which may be NULL, e.g. from spa_remove()).
+ */
+void
+spa_config_set(spa_t *spa, nvlist_t *config)
+{
+ mutex_enter(&spa->spa_config_cache_lock);
+ if (spa->spa_config != NULL)
+ nvlist_free(spa->spa_config);
+ spa->spa_config = config;
+ mutex_exit(&spa->spa_config_cache_lock);
+}
+
+/*
+ * Generate the pool's configuration based on the current in-core state.
+ * We infer whether to generate a complete config or just one top-level config
+ * based on whether vd is the root vdev.
+ */
+nvlist_t *
+spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
+{
+ nvlist_t *config, *nvroot;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ /* NULL means "generate the complete config from the root vdev". */
+ if (vd == NULL)
+ vd = rvd;
+
+ /*
+ * If txg is -1, report the current value of spa->spa_config_txg.
+ * If txg is any other non-zero value, update spa->spa_config_txg.
+ */
+ if (txg == -1ULL)
+ txg = spa->spa_config_txg;
+ else if (txg != 0 && vd == rvd)
+ spa->spa_config_txg = txg;
+
+ VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
+
+ /* Pool-wide identity and state. */
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
+ UBERBLOCK_VERSION) == 0);
+ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
+ spa_name(spa)) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ spa_state(spa)) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ txg) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ spa_guid(spa)) == 0);
+
+ /* For a non-root vdev, record which top-level vdev this label is for. */
+ if (vd != rvd) {
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID,
+ vd->vdev_top->vdev_guid) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_GUID,
+ vd->vdev_guid) == 0);
+ vd = vd->vdev_top; /* label contains top config */
+ }
+
+ nvroot = vdev_config_generate(vd, getstats);
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
+ nvlist_free(nvroot);
+
+ return (config);
+}
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
new file mode 100644
index 0000000000..c1b6017509
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -0,0 +1,848 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * SPA locking
+ *
+ * There are four basic locks for managing spa_t structures:
+ *
+ * spa_namespace_lock (global mutex)
+ *
+ * This lock must be acquired to do any of the following:
+ *
+ * - Lookup a spa_t by name
+ * - Add or remove a spa_t from the namespace
+ * - Increase spa_refcount from non-zero
+ * - Check if spa_refcount is zero
+ * - Rename a spa_t
+ * - Held for the duration of create/destroy/import/export
+ *
+ * It does not need to handle recursion. A create or destroy may
+ * reference objects (files or zvols) in other pools, but by
+ * definition they must have an existing reference, and will never need
+ * to lookup a spa_t by name.
+ *
+ * spa_refcount (per-spa refcount_t protected by mutex)
+ *
+ * This reference count keeps track of any active users of the spa_t. The
+ * spa_t cannot be destroyed or freed while this is non-zero. Internally,
+ * the refcount is never really 'zero' - opening a pool implicitly keeps
+ * some references in the DMU. Internally we check against SPA_MINREF, but
+ * present the image of a zero/non-zero value to consumers.
+ *
+ * spa_config_lock (per-spa crazy rwlock)
+ *
+ * This SPA special is a recursive rwlock, capable of being acquired from
+ * asynchronous threads. It protects the spa_t from config changes,
+ * and must be held in the following circumstances:
+ *
+ * - RW_READER to perform I/O to the spa
+ * - RW_WRITER to change the vdev config
+ *
+ * spa_config_cache_lock (per-spa mutex)
+ *
+ * This mutex prevents the spa_config nvlist from being updated. No
+ * other locks are required to obtain this lock, although implicitly you
+ * must have the namespace lock or non-zero refcount to have any kind
+ * of spa_t pointer at all.
+ *
+ * spa_vdev_lock (global mutex)
+ *
+ * This special lock is a global mutex used to serialize attempts to
+ * access devices through ZFS. It makes sure that we do not try to add
+ * a single vdev to multiple pools at the same time. It must be held
+ * when adding or removing a device from the pool.
+ *
+ *
+ * The locking order is fairly straightforward:
+ *
+ * spa_namespace_lock -> spa_refcount
+ *
+ * The namespace lock must be acquired to increase the refcount from 0
+ * or to check if it is zero.
+ *
+ * spa_refcount -> spa_config_lock
+ *
+ * There must be at least one valid reference on the spa_t to acquire
+ * the config lock.
+ *
+ * spa_vdev_lock -> spa_config_lock
+ *
+ * There are no locks required for spa_vdev_lock, but it must be
+ * acquired before spa_config_lock.
+ *
+ *
+ * The spa_namespace_lock and spa_config_cache_lock can be acquired directly and
+ * are globally visible.
+ *
+ * The namespace is manipulated using the following functions, all which require
+ * the spa_namespace_lock to be held.
+ *
+ * spa_lookup() Lookup a spa_t by name.
+ *
+ * spa_add() Create a new spa_t in the namespace.
+ *
+ * spa_remove() Remove a spa_t from the namespace. This also
+ * frees up any memory associated with the spa_t.
+ *
+ * spa_next() Returns the next spa_t in the system, or the
+ * first if NULL is passed.
+ *
+ * spa_evict_all() Shutdown and remove all spa_t structures in
+ * the system.
+ *
+ *
+ * The spa_refcount is manipulated using the following functions:
+ *
+ * spa_open_ref() Adds a reference to the given spa_t. Must be
+ * called with spa_namespace_lock held if the
+ * refcount is currently zero.
+ *
+ * spa_close() Remove a reference from the spa_t. This will
+ * not free the spa_t or remove it from the
+ * namespace. No locking is required.
+ *
+ * spa_refcount_zero() Returns true if the refcount is currently
+ * zero. Must be called with spa_namespace_lock
+ * held.
+ *
+ * The spa_config_lock is manipulated using the following functions:
+ *
+ * spa_config_enter() Acquire the config lock as RW_READER or
+ * RW_WRITER. At least one reference on the spa_t
+ * must exist.
+ *
+ * spa_config_exit() Release the config lock.
+ *
+ * spa_config_held() Returns true if the config lock is currently
+ * held in the given state.
+ *
+ * The spa_vdev_lock, while acquired directly, is hidden by the following
+ * functions, which imply additional semantics that must be followed:
+ *
+ * spa_vdev_enter() Acquire the vdev lock and the config lock for
+ * writing.
+ *
+ * spa_vdev_exit() Release the config lock, wait for all I/O
+ * to complete, release the vdev lock, and sync
+ * the updated configs to the cache.
+ *
+ * The spa_name() function also requires either the spa_namespace_lock
+ * or the spa_config_lock, as both are needed to do a rename. spa_rename() is
+ * also implemented within this file since it requires manipulation of the
+ * namespace.
+ */
+
+static avl_tree_t spa_namespace_avl;
+kmutex_t spa_namespace_lock;
+static kcondvar_t spa_namespace_cv;
+
+kmem_cache_t *spa_buffer_pool;
+int spa_mode;
+
+#ifdef ZFS_DEBUG
+int zfs_flags = ~0;
+#else
+int zfs_flags = 0;
+#endif
+
+static kmutex_t spa_vdev_lock;
+
+#define SPA_MINREF 5 /* spa_refcnt for an open-but-idle pool */
+
+/*
+ * ==========================================================================
+ * SPA namespace functions
+ * ==========================================================================
+ */
+
+/*
+ * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held.
+ * Returns NULL if no matching spa_t is found.
+ */
+spa_t *
+spa_lookup(const char *name)
+{
+ spa_t search, *spa;
+ avl_index_t where;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ /* Build a stack search key; avl_compare only looks at spa_name. */
+ search.spa_name = (char *)name;
+ spa = avl_find(&spa_namespace_avl, &search, &where);
+
+ return (spa);
+}
+
+/*
+ * Create an uninitialized spa_t with the given name. Requires
+ * spa_namespace_lock. The caller must ensure that the spa_t doesn't already
+ * exist by calling spa_lookup() first.
+ */
+spa_t *
+spa_add(const char *name)
+{
+ spa_t *spa;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
+
+ spa->spa_name = spa_strdup(name);
+ spa->spa_state = POOL_STATE_UNINITIALIZED;
+ /* UINT64_MAX means "not frozen"; see spa_freeze(). */
+ spa->spa_freeze_txg = UINT64_MAX;
+
+ refcount_create(&spa->spa_refcount);
+
+ avl_add(&spa_namespace_avl, spa);
+
+ return (spa);
+}
+
+/*
+ * Removes a spa_t from the namespace, freeing up any memory used. Requires
+ * spa_namespace_lock. This is called only after the spa_t has been closed and
+ * deactivated.
+ */
+void
+spa_remove(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+ ASSERT(spa->spa_scrub_thread == NULL);
+
+ avl_remove(&spa_namespace_avl, spa);
+ /* Wake anyone waiting on spa_namespace_cv for this name to go away. */
+ cv_broadcast(&spa_namespace_cv);
+
+ if (spa->spa_root)
+ spa_strfree(spa->spa_root);
+
+ if (spa->spa_name)
+ spa_strfree(spa->spa_name);
+
+ /* Frees the cached config nvlist, if any. */
+ spa_config_set(spa, NULL);
+
+ refcount_destroy(&spa->spa_refcount);
+
+ kmem_free(spa, sizeof (spa_t));
+}
+
+/*
+ * Given a pool, return the next pool in the namespace, or NULL if there is
+ * none. If 'prev' is NULL, return the first pool.
+ */
+spa_t *
+spa_next(spa_t *prev)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ /* NULL 'prev' restarts the walk from the first pool. */
+ if (prev)
+ return (AVL_NEXT(&spa_namespace_avl, prev));
+ else
+ return (avl_first(&spa_namespace_avl));
+}
+
+/*
+ * ==========================================================================
+ * SPA refcount functions
+ * ==========================================================================
+ */
+
+/*
+ * Add a reference to the given spa_t. Must have at least one reference, or
+ * have the namespace lock held.
+ */
+void
+spa_open_ref(spa_t *spa, void *tag)
+{
+ /* Going from "zero" (== SPA_MINREF) requires the namespace lock. */
+ ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF ||
+ MUTEX_HELD(&spa_namespace_lock));
+
+ (void) refcount_add(&spa->spa_refcount, tag);
+}
+
+/*
+ * Remove a reference to the given spa_t. Must have at least one reference, or
+ * have the namespace lock held.
+ */
+void
+spa_close(spa_t *spa, void *tag)
+{
+ /* Dropping to "zero" (== SPA_MINREF) requires the namespace lock. */
+ ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF ||
+ MUTEX_HELD(&spa_namespace_lock));
+
+ (void) refcount_remove(&spa->spa_refcount, tag);
+}
+
+/*
+ * Check to see if the spa refcount is zero. Must be called with
+ * spa_namespace_lock held. We really compare against SPA_MINREF, which is the
+ * number of references acquired when opening a pool
+ */
+boolean_t
+spa_refcount_zero(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ /* SPA_MINREF internal references are the logical "zero". */
+ return (refcount_count(&spa->spa_refcount) == SPA_MINREF);
+}
+
+/*
+ * ==========================================================================
+ * SPA config locking
+ * ==========================================================================
+ */
+
+/*
+ * Acquire the config lock. The config lock is a special rwlock that allows for
+ * recursive enters. Because these enters come from the same thread as well as
+ * asynchronous threads working on behalf of the owner, we must unilaterally
+ * allow read access as long as at least one reader is held (even if a write
+ * is requested). This has the side effect of write starvation, but write locks
+ * are extremely rare, and a solution to this problem would be significantly
+ * more complex (if even possible).
+ *
+ * We would like to assert that the namespace lock isn't held, but this is a
+ * valid use during create.
+ */
+void
+spa_config_enter(spa_t *spa, krw_t rw)
+{
+ spa_config_lock_t *scl = &spa->spa_config_lock;
+
+ mutex_enter(&scl->scl_lock);
+
+ /* The current writer may recursively re-enter without blocking. */
+ if (scl->scl_writer != curthread) {
+ if (rw == RW_READER) {
+ /* Readers wait only for an active writer. */
+ while (scl->scl_writer != NULL)
+ cv_wait(&scl->scl_cv, &scl->scl_lock);
+ } else {
+ /* Writers wait for everyone, then claim ownership. */
+ while (scl->scl_writer != NULL || scl->scl_count > 0)
+ cv_wait(&scl->scl_cv, &scl->scl_lock);
+ scl->scl_writer = curthread;
+ }
+ }
+
+ /* scl_count counts all holds -- readers and writer recursion alike. */
+ scl->scl_count++;
+
+ mutex_exit(&scl->scl_lock);
+}
+
+/*
+ * Release the spa config lock, notifying any waiters in the process.
+ */
+void
+spa_config_exit(spa_t *spa)
+{
+ spa_config_lock_t *scl = &spa->spa_config_lock;
+
+ mutex_enter(&scl->scl_lock);
+
+ ASSERT(scl->scl_count > 0);
+ /* Last hold out: clear writer ownership and wake all waiters. */
+ if (--scl->scl_count == 0) {
+ cv_broadcast(&scl->scl_cv);
+ scl->scl_writer = NULL; /* OK in either case */
+ }
+
+ mutex_exit(&scl->scl_lock);
+}
+
+/*
+ * Returns true if the config lock is held in the given manner.
+ */
+boolean_t
+spa_config_held(spa_t *spa, krw_t rw)
+{
+ spa_config_lock_t *scl = &spa->spa_config_lock;
+ boolean_t held;
+
+ mutex_enter(&scl->scl_lock);
+ if (rw == RW_WRITER)
+ held = (scl->scl_writer == curthread);
+ else
+ /* Reader check is "held by anyone", not "held by curthread". */
+ held = (scl->scl_count != 0);
+ mutex_exit(&scl->scl_lock);
+
+ return (held);
+}
+
+/*
+ * ==========================================================================
+ * SPA vdev locking
+ * ==========================================================================
+ */
+
+/*
+ * Lock the given spa_t for the purpose of adding or removing a vdev. This
+ * grabs the global spa_vdev_lock as well as the spa config lock for writing.
+ * It returns the next transaction group for the spa_t.
+ */
+uint64_t
+spa_vdev_enter(spa_t *spa)
+{
+ /* Lock order: spa_vdev_lock before spa_config_lock (see above). */
+ mutex_enter(&spa_vdev_lock);
+
+ spa_config_enter(spa, RW_WRITER);
+
+ /* The config change will commit in the next txg. */
+ return (spa_last_synced_txg(spa) + 1);
+}
+
+/*
+ * Unlock the spa_t after adding or removing a vdev. Besides undoing the
+ * locking of spa_vdev_enter(), we also want to make sure the transactions have
+ * synced to disk, and then update the global configuration cache with the new
+ * information.
+ */
+int
+spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
+{
+ /* Recompute dirty-time-log state before publishing the new config. */
+ vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
+
+ spa_config_exit(spa);
+
+ if (vd == spa->spa_root_vdev) { /* spa_create() */
+ mutex_exit(&spa_vdev_lock);
+ return (error);
+ }
+
+ /*
+ * Note: this txg_wait_synced() is important because it ensures
+ * that there won't be more than one config change per txg.
+ * This allows us to use the txg as the generation number.
+ */
+ if (error == 0)
+ txg_wait_synced(spa->spa_dsl_pool, txg);
+
+ mutex_exit(&spa_vdev_lock);
+
+ /* 'vd' is a vdev the caller wants torn down (e.g. a failed add). */
+ if (vd != NULL) {
+ ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0);
+ vdev_free(vd);
+ }
+
+ /*
+ * If we're in the middle of export or destroy, don't sync the
+ * config -- it will do that anyway, and we deadlock if we try.
+ */
+ if (error == 0 && spa->spa_state == POOL_STATE_ACTIVE) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_sync();
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ return (error);
+}
+
+/*
+ * ==========================================================================
+ * Miscellaneous functions
+ * ==========================================================================
+ */
+
+/*
+ * Rename a spa_t.
+ */
+int
+spa_rename(const char *name, const char *newname)
+{
+ spa_t *spa;
+ int err;
+
+ /*
+ * Lookup the spa_t and grab the config lock for writing. We need to
+ * actually open the pool so that we can sync out the necessary labels.
+ * It's OK to call spa_open() with the namespace lock held because we
+ * allow recursive calls for other reasons.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if ((err = spa_open(name, &spa, FTAG)) != 0) {
+ mutex_exit(&spa_namespace_lock);
+ return (err);
+ }
+
+ spa_config_enter(spa, RW_WRITER);
+
+ /* Re-key the AVL entry: the tree is sorted by spa_name. */
+ avl_remove(&spa_namespace_avl, spa);
+ spa_strfree(spa->spa_name);
+ spa->spa_name = spa_strdup(newname);
+ avl_add(&spa_namespace_avl, spa);
+
+ /*
+ * Sync all labels to disk with the new names by marking the root vdev
+ * dirty and waiting for it to sync. It will pick up the new pool name
+ * during the sync.
+ */
+ vdev_config_dirty(spa->spa_root_vdev);
+
+ spa_config_exit(spa);
+
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+
+ /*
+ * Sync the updated config cache.
+ */
+ spa_config_set(spa,
+ spa_config_generate(spa, NULL, spa_last_synced_txg(spa), 0));
+ spa_config_sync();
+
+ spa_close(spa, FTAG);
+
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+
+/*
+ * Determine whether a pool with given pool_guid exists. If device_guid is
+ * non-zero, determine whether the pool exists *and* contains a device with the
+ * specified device_guid.
+ */
+boolean_t
+spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
+{
+ spa_t *spa;
+ avl_tree_t *t = &spa_namespace_avl;
+ boolean_t locked = B_FALSE;
+
+ /* Take the namespace lock only if the caller doesn't already hold it. */
+ if (mutex_owner(&spa_namespace_lock) != curthread) {
+ mutex_enter(&spa_namespace_lock);
+ locked = B_TRUE;
+ }
+
+ for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED)
+ continue;
+ /* spa_guid() dereferences spa_root_vdev, so skip if unset. */
+ if (spa->spa_root_vdev == NULL)
+ continue;
+ if (spa_guid(spa) == pool_guid && (device_guid == 0 ||
+ vdev_lookup_by_guid(spa->spa_root_vdev, device_guid)))
+ break;
+ }
+
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+
+ /* Loop breaks with spa != NULL exactly when a match was found. */
+ return (spa != NULL);
+}
+
+/*
+ * Duplicate 's' into kmem. Pair with spa_strfree(), which recomputes the
+ * length -- hence the guaranteed NUL terminator here.
+ */
+char *
+spa_strdup(const char *s)
+{
+ size_t len;
+ char *new;
+
+ len = strlen(s);
+ new = kmem_alloc(len + 1, KM_SLEEP);
+ bcopy(s, new, len);
+ new[len] = '\0';
+
+ return (new);
+}
+
+/* Free a string allocated by spa_strdup(). */
+void
+spa_strfree(char *s)
+{
+ kmem_free(s, strlen(s) + 1);
+}
+
+/*
+ * Return a pseudo-random value in [0, range). Modulo bias is negligible
+ * for the small ranges this is used with.
+ */
+uint64_t
+spa_get_random(uint64_t range)
+{
+ uint64_t r;
+
+ ASSERT(range != 0);
+
+ (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t));
+
+ return (r % range);
+}
+
+/*
+ * Format a block pointer into 'buf' for debugging output. 'buf' is written
+ * unbounded via sprintf(), so callers must supply a generously sized buffer.
+ */
+void
+sprintf_blkptr(char *buf, blkptr_t *bp)
+{
+ dva_t *dva;
+
+ /*
+ * Check bp before touching it: the original computed BP_IDENTITY(bp)
+ * first, evaluating a member address through a null pointer when
+ * bp == NULL (undefined behavior).
+ */
+ if (bp == NULL) {
+ (void) sprintf(buf, "<NULL>");
+ return;
+ }
+
+ if (BP_IS_HOLE(bp)) {
+ (void) sprintf(buf, "<hole>");
+ return;
+ }
+
+ /* XXBP - Need to see if we want all DVAs or not */
+ dva = BP_IDENTITY(bp);
+
+ (void) sprintf(buf, "[L%llu %s] vdev=%llu offset=%llx "
+ "size=%llxL/%llxP/%llxA %s %s %s %s",
+ (u_longlong_t)BP_GET_LEVEL(bp),
+ dmu_ot[BP_GET_TYPE(bp)].ot_name,
+ (u_longlong_t)DVA_GET_VDEV(dva),
+ (u_longlong_t)DVA_GET_OFFSET(dva),
+ (u_longlong_t)BP_GET_LSIZE(bp),
+ (u_longlong_t)BP_GET_PSIZE(bp),
+ (u_longlong_t)DVA_GET_ASIZE(dva),
+ zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name,
+ zio_compress_table[BP_GET_COMPRESS(bp)].ci_name,
+ BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",
+ DVA_GET_GANG(dva) == 0 ? "contiguous" : "gang");
+
+ (void) sprintf(buf + strlen(buf), " birth=%llu fill=%llu"
+ " cksum=%llx:%llx:%llx:%llx",
+ (u_longlong_t)bp->blk_birth,
+ (u_longlong_t)bp->blk_fill,
+ (u_longlong_t)bp->blk_cksum.zc_word[0],
+ (u_longlong_t)bp->blk_cksum.zc_word[1],
+ (u_longlong_t)bp->blk_cksum.zc_word[2],
+ (u_longlong_t)bp->blk_cksum.zc_word[3]);
+}
+
+/*
+ * Freeze the pool: pick a freeze txg a full TXG_SIZE ahead of the last
+ * synced txg and wait until it syncs. Idempotent -- a second call sees
+ * spa_freeze_txg already set and does nothing.
+ */
+void
+spa_freeze(spa_t *spa)
+{
+ uint64_t freeze_txg = 0;
+
+ spa_config_enter(spa, RW_WRITER);
+ if (spa->spa_freeze_txg == UINT64_MAX) {
+ freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
+ spa->spa_freeze_txg = freeze_txg;
+ }
+ spa_config_exit(spa);
+ /* Wait outside the config lock; txg sync needs reader access. */
+ if (freeze_txg != 0)
+ txg_wait_synced(spa_get_dsl(spa), freeze_txg);
+}
+
+/*
+ * ==========================================================================
+ * Accessor functions
+ * ==========================================================================
+ */
+
+/* Return the pool's traverse lock (taken by callers, not here). */
+krwlock_t *
+spa_traverse_rwlock(spa_t *spa)
+{
+ return (&spa->spa_traverse_lock);
+}
+
+/* Nonzero when a traverse is waiting for the lock. */
+int
+spa_traverse_wanted(spa_t *spa)
+{
+ return (spa->spa_traverse_wanted);
+}
+
+/* Return the pool's DSL pool. */
+dsl_pool_t *
+spa_get_dsl(spa_t *spa)
+{
+ return (spa->spa_dsl_pool);
+}
+
+/* Root blkptr of the last-synced uberblock (spa_ubsync). */
+blkptr_t *
+spa_get_rootblkptr(spa_t *spa)
+{
+ return (&spa->spa_ubsync.ub_rootbp);
+}
+
+/*
+ * Set the root blkptr of the in-flight uberblock (spa_uberblock) --
+ * note the asymmetry with spa_get_rootblkptr(), which reads spa_ubsync.
+ */
+void
+spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
+{
+ spa->spa_uberblock.ub_rootbp = *bp;
+}
+
+/*
+ * Copy the pool's alternate root path into 'buf' ("" if none).
+ * NOTE(review): strncpy leaves buf unterminated when spa_root is
+ * buflen bytes or longer -- presumably callers size buf at MAXPATHLEN;
+ * confirm.
+ */
+void
+spa_altroot(spa_t *spa, char *buf, size_t buflen)
+{
+ if (spa->spa_root == NULL)
+ buf[0] = '\0';
+ else
+ (void) strncpy(buf, spa->spa_root, buflen);
+}
+
+/* Current sync pass within the active txg. */
+int
+spa_sync_pass(spa_t *spa)
+{
+ return (spa->spa_sync_pass);
+}
+
+char *
+spa_name(spa_t *spa)
+{
+ /*
+ * Accessing the name requires holding either the namespace lock or the
+ * config lock, both of which are required to do a rename.
+ */
+ ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+ spa_config_held(spa, RW_READER) || spa_config_held(spa, RW_WRITER));
+
+ return (spa->spa_name);
+}
+
+/* The pool GUID is the root vdev's GUID. */
+uint64_t
+spa_guid(spa_t *spa)
+{
+ return (spa->spa_root_vdev->vdev_guid);
+}
+
+/* Txg of the last-synced uberblock. */
+uint64_t
+spa_last_synced_txg(spa_t *spa)
+{
+ return (spa->spa_ubsync.ub_txg);
+}
+
+/* First txg of this pool's current activation. */
+uint64_t
+spa_first_txg(spa_t *spa)
+{
+ return (spa->spa_first_txg);
+}
+
+/* Current POOL_STATE_* value. */
+int
+spa_state(spa_t *spa)
+{
+ return (spa->spa_state);
+}
+
+/* Freeze txg, or UINT64_MAX if the pool is not frozen. */
+uint64_t
+spa_freeze_txg(spa_t *spa)
+{
+ return (spa->spa_freeze_txg);
+}
+
+/*
+ * In the future, this may select among different metaslab classes
+ * depending on the zdp. For now, there's no such distinction.
+ */
+metaslab_class_t *
+spa_metaslab_class_select(spa_t *spa)
+{
+ return (spa->spa_normal_class);
+}
+
+/*
+ * Return pool-wide allocated space.
+ */
+uint64_t
+spa_get_alloc(spa_t *spa)
+{
+ return (spa->spa_root_vdev->vdev_stat.vs_alloc);
+}
+
+/*
+ * Return pool-wide total space (comment was a copy-paste of the
+ * "allocated" one above; vs_space is the total).
+ */
+uint64_t
+spa_get_space(spa_t *spa)
+{
+ return (spa->spa_root_vdev->vdev_stat.vs_space);
+}
+
+/* ARGSUSED */
+uint64_t
+spa_get_asize(spa_t *spa, uint64_t lsize)
+{
+ /*
+ * For now, the worst case is 512-byte RAID-Z blocks, in which
+ * case the space requirement is exactly 2x; so just assume that.
+ */
+ return (lsize << 1);
+}
+
+/*
+ * ==========================================================================
+ * Initialization and Termination
+ * ==========================================================================
+ */
+
+/*
+ * AVL comparator for the pool namespace: ordered by pool name, with the
+ * strcmp() result normalized to {-1, 0, 1} as avl_create() requires.
+ */
+static int
+spa_name_compare(const void *a1, const void *a2)
+{
+ const spa_t *s1 = a1;
+ const spa_t *s2 = a2;
+ int s;
+
+ s = strcmp(s1->spa_name, s2->spa_name);
+ if (s > 0)
+ return (1);
+ if (s < 0)
+ return (-1);
+ return (0);
+}
+
+/*
+ * Module initialization: set up the namespace, bring up the lower layers
+ * in dependency order, then populate the namespace from the config cache.
+ * 'mode' records whether devices are opened read-only or read-write.
+ */
+void
+spa_init(int mode)
+{
+ mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
+
+ avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
+ offsetof(spa_t, spa_avl));
+
+ spa_mode = mode;
+
+ refcount_init();
+ unique_init();
+ zio_init();
+ dmu_init();
+ zil_init();
+ /* Last: reads the on-disk cache file and fills the namespace. */
+ spa_config_load();
+}
+
+/* Module teardown: evict all pools, then unwind spa_init() in reverse. */
+void
+spa_fini(void)
+{
+ spa_evict_all();
+
+ zil_fini();
+ dmu_fini();
+ zio_fini();
+ refcount_fini();
+
+ avl_destroy(&spa_namespace_avl);
+
+ cv_destroy(&spa_namespace_cv);
+ mutex_destroy(&spa_namespace_lock);
+}
diff --git a/usr/src/uts/common/fs/zfs/space_map.c b/usr/src/uts/common/fs/zfs/space_map.c
new file mode 100644
index 0000000000..25f66bf94b
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/space_map.c
@@ -0,0 +1,406 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/space_map.h>
+
+/*
+ * Space map routines.
+ * NOTE: caller is responsible for all locking.
+ */
+/*
+ * AVL comparator for space segments, ordered by starting offset.
+ * Any two segments that overlap compare equal, so an avl_find() with
+ * a search segment locates a resident segment that intersects it;
+ * space_map_add()/remove()/contains() all rely on this to detect
+ * overlap in a single lookup.
+ */
+static int
+space_map_seg_compare(const void *x1, const void *x2)
+{
+ const space_seg_t *s1 = x1;
+ const space_seg_t *s2 = x2;
+
+ if (s1->ss_start < s2->ss_start) {
+ if (s1->ss_end > s2->ss_start)
+ return (0);
+ return (-1);
+ }
+ if (s1->ss_start > s2->ss_start) {
+ if (s1->ss_start < s2->ss_end)
+ return (0);
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * Initialize an in-core space map covering [start, start + size).
+ * 'shift' is the log2 of the allocation quantum: all offsets and
+ * sizes handed to this map must be multiples of (1ULL << shift).
+ * 'lp' is the caller-supplied lock that protects the map; the
+ * mutators below assert it is held.
+ */
+void
+space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint64_t shift,
+ kmutex_t *lp)
+{
+ avl_create(&sm->sm_root, space_map_seg_compare,
+ sizeof (space_seg_t), offsetof(struct space_seg, ss_node));
+ sm->sm_start = start;
+ sm->sm_end = start + size;
+ sm->sm_size = size;
+ sm->sm_shift = shift;
+ sm->sm_space = 0;
+ sm->sm_lock = lp;
+}
+
+/*
+ * Destroy a space map. The map must already be empty (sm_space == 0);
+ * callers drain it first (e.g. via space_map_vacate()).
+ */
+void
+space_map_destroy(space_map_t *sm)
+{
+ VERIFY3U(sm->sm_space, ==, 0);
+ avl_destroy(&sm->sm_root);
+}
+
+/*
+ * Add the segment [start, start + size) to the map. The range must
+ * lie within the map, be quantum-aligned, and must not overlap any
+ * existing segment (VERIFY'd via the overlap-compares-equal trick in
+ * space_map_seg_compare()). Adjacent segments are coalesced.
+ */
+void
+space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ avl_index_t where;
+ space_seg_t ssearch, *ss_before, *ss_after, *ss;
+ uint64_t end = start + size;
+ int merge_before, merge_after;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+ VERIFY(size != 0);
+ VERIFY3U(start, >=, sm->sm_start);
+ VERIFY3U(end, <=, sm->sm_end);
+ VERIFY(sm->sm_space + size <= sm->sm_size);
+ VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
+ VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
+
+ ssearch.ss_start = start;
+ ssearch.ss_end = end;
+ ss = avl_find(&sm->sm_root, &ssearch, &where);
+
+ /* Make sure we don't overlap with either of our neighbors */
+ VERIFY(ss == NULL);
+
+ ss_before = avl_nearest(&sm->sm_root, where, AVL_BEFORE);
+ ss_after = avl_nearest(&sm->sm_root, where, AVL_AFTER);
+
+ merge_before = (ss_before != NULL && ss_before->ss_end == start);
+ merge_after = (ss_after != NULL && ss_after->ss_start == end);
+
+ if (merge_before && merge_after) {
+ /* New range exactly bridges two segments: fuse them into one. */
+ avl_remove(&sm->sm_root, ss_before);
+ ss_after->ss_start = ss_before->ss_start;
+ kmem_free(ss_before, sizeof (*ss_before));
+ } else if (merge_before) {
+ /* Extend the preceding segment forward. */
+ ss_before->ss_end = end;
+ } else if (merge_after) {
+ /* Extend the following segment backward. */
+ ss_after->ss_start = start;
+ } else {
+ /* No neighbors to coalesce with: insert a new segment. */
+ ss = kmem_alloc(sizeof (*ss), KM_SLEEP);
+ ss->ss_start = start;
+ ss->ss_end = end;
+ avl_insert(&sm->sm_root, ss, where);
+ }
+
+ sm->sm_space += size;
+}
+
+/*
+ * Remove the segment [start, start + size) from the map. The range
+ * must be quantum-aligned and must lie entirely within one existing
+ * segment (VERIFY'd below); removing from the middle splits that
+ * segment in two.
+ */
+void
+space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ avl_index_t where;
+ space_seg_t ssearch, *ss, *newseg;
+ uint64_t end = start + size;
+ int left_over, right_over;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+ VERIFY(size != 0);
+ VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
+ VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
+
+ ssearch.ss_start = start;
+ ssearch.ss_end = end;
+ ss = avl_find(&sm->sm_root, &ssearch, &where);
+
+ /* Make sure we completely overlap with someone */
+ VERIFY(ss != NULL);
+ VERIFY3U(ss->ss_start, <=, start);
+ VERIFY3U(ss->ss_end, >=, end);
+ VERIFY(sm->sm_space - size <= sm->sm_size);
+
+ left_over = (ss->ss_start != start);
+ right_over = (ss->ss_end != end);
+
+ if (left_over && right_over) {
+ /* Carving a hole in the middle: split into two segments. */
+ newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP);
+ newseg->ss_start = end;
+ newseg->ss_end = ss->ss_end;
+ ss->ss_end = start;
+ avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER);
+ } else if (left_over) {
+ /* Trimming the tail of the segment. */
+ ss->ss_end = start;
+ } else if (right_over) {
+ /* Trimming the head of the segment. */
+ ss->ss_start = end;
+ } else {
+ /* Exact match: drop the whole segment. */
+ avl_remove(&sm->sm_root, ss);
+ kmem_free(ss, sizeof (*ss));
+ }
+
+ sm->sm_space -= size;
+}
+
+/*
+ * Return nonzero iff [start, start + size) is entirely contained
+ * within a single segment of the map.
+ */
+int
+space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ avl_index_t where;
+ space_seg_t ssearch, *ss;
+ uint64_t end = start + size;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+ VERIFY(size != 0);
+ VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
+ VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
+
+ ssearch.ss_start = start;
+ ssearch.ss_end = end;
+ ss = avl_find(&sm->sm_root, &ssearch, &where);
+
+ return (ss != NULL && ss->ss_start <= start && ss->ss_end >= end);
+}
+
+/*
+ * Empty the map, freeing all segments. If 'func' is non-NULL, apply
+ * it to each (start, size) pair with 'mdest' as its map argument
+ * before the segment is freed (e.g. space_map_merge() passes
+ * space_map_add to transfer the contents into another map).
+ */
+void
+space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
+{
+ space_seg_t *ss;
+ void *cookie = NULL;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+
+ while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
+ if (func != NULL)
+ func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
+ kmem_free(ss, sizeof (*ss));
+ }
+ sm->sm_space = 0;
+}
+
+/*
+ * Apply 'func' to each (start, size) segment, in offset order,
+ * without modifying the map. Unlike the mutators above this does not
+ * assert sm_lock, but the caller is still responsible for locking
+ * (see the NOTE at the top of this file).
+ */
+void
+space_map_iterate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
+{
+ space_seg_t *ss;
+
+ for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
+ func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
+}
+
+/*
+ * Move the entire contents of 'src' into 'dest', leaving 'src' empty.
+ * Any overlap with segments already in 'dest' trips the VERIFY in
+ * space_map_add(); use space_map_union() when overlap is possible.
+ */
+void
+space_map_merge(space_map_t *src, space_map_t *dest)
+{
+ space_map_vacate(src, space_map_add, dest);
+}
+
+/*
+ * Remove any portions of existing segments that intersect
+ * [start, start + size). Segments straddling either edge are trimmed
+ * rather than removed whole; segments entirely outside the range are
+ * untouched.
+ */
+void
+space_map_excise(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ avl_tree_t *t = &sm->sm_root;
+ avl_index_t where;
+ space_seg_t *ss, search;
+ uint64_t end = start + size;
+ uint64_t rm_start, rm_end;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+
+ search.ss_start = start;
+ search.ss_end = start;
+
+ for (;;) {
+ /*
+ * Find a segment overlapping 'start'; failing that, the
+ * first segment after it. Stop once we're past 'end'.
+ */
+ ss = avl_find(t, &search, &where);
+
+ if (ss == NULL)
+ ss = avl_nearest(t, where, AVL_AFTER);
+
+ if (ss == NULL || ss->ss_start >= end)
+ break;
+
+ /* Clip the removal to the intersection with [start, end). */
+ rm_start = MAX(ss->ss_start, start);
+ rm_end = MIN(ss->ss_end, end);
+
+ space_map_remove(sm, rm_start, rm_end - rm_start);
+ }
+}
+
+/*
+ * Replace smd with the union of smd and sms.
+ * Excising each source range from the destination first guarantees
+ * the subsequent space_map_add() never sees an overlap.
+ */
+void
+space_map_union(space_map_t *smd, space_map_t *sms)
+{
+ avl_tree_t *t = &sms->sm_root;
+ space_seg_t *ss;
+
+ ASSERT(MUTEX_HELD(smd->sm_lock));
+
+ /*
+ * For each source segment, remove any intersections with the
+ * destination, then add the source segment to the destination.
+ */
+ for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) {
+ space_map_excise(smd, ss->ss_start, ss->ss_end - ss->ss_start);
+ space_map_add(smd, ss->ss_start, ss->ss_end - ss->ss_start);
+ }
+}
+
+/*
+ * Replay an on-disk space map object into the (empty) in-core map.
+ * 'maptype' selects the view: entries of that type are added, entries
+ * of the other type are removed. For the SM_FREE view we seed the map
+ * with the entire range and let the replay punch out and restore
+ * segments; in that case the caller passes allocated space in 'space'
+ * and the expected free space is sm_size - space. 'end' is the size
+ * of the object in bytes; reads are chunked to SPACE_MAP_CHUNKSIZE.
+ * Always returns 0 (dmu_read() here exposes no error to propagate).
+ */
+int
+space_map_load(space_map_t *sm, space_map_obj_t *smo, uint8_t maptype,
+ objset_t *os, uint64_t end, uint64_t space)
+{
+ uint64_t *entry, *entry_map, *entry_map_end;
+ uint64_t bufsize, size, offset;
+ uint64_t mapstart = sm->sm_start;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+ VERIFY3U(sm->sm_space, ==, 0);
+
+ bufsize = MIN(end, SPACE_MAP_CHUNKSIZE);
+ entry_map = kmem_alloc(bufsize, KM_SLEEP);
+
+ if (maptype == SM_FREE) {
+ space_map_add(sm, sm->sm_start, sm->sm_size);
+ space = sm->sm_size - space;
+ }
+
+ for (offset = 0; offset < end; offset += bufsize) {
+ size = MIN(end - offset, bufsize);
+ VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
+ VERIFY(size != 0);
+
+ dprintf("object=%llu offset=%llx size=%llx\n",
+ smo->smo_object, offset, size);
+ dmu_read(os, smo->smo_object, offset, size, entry_map);
+
+ entry_map_end = entry_map + (size / sizeof (uint64_t));
+ for (entry = entry_map; entry < entry_map_end; entry++) {
+ uint64_t e = *entry;
+
+ if (SM_DEBUG_DECODE(e)) /* Skip debug entries */
+ continue;
+
+ /*
+ * Decode the <offset, run> pair (both stored in
+ * sm_shift units, offset relative to sm_start) and
+ * apply it: matching type adds, opposite removes.
+ */
+ (SM_TYPE_DECODE(e) == maptype ?
+ space_map_add : space_map_remove)(sm,
+ (SM_OFFSET_DECODE(e) << sm->sm_shift) + mapstart,
+ SM_RUN_DECODE(e) << sm->sm_shift);
+ }
+ }
+ /* The replayed map must account for exactly the expected space. */
+ VERIFY3U(sm->sm_space, ==, space);
+
+ kmem_free(entry_map, bufsize);
+
+ return (0);
+}
+
+/*
+ * Drain the in-core map to disk: convert every segment into
+ * <offset, run, type> entries (run lengths capped at SM_RUN_MAX) and
+ * append them to the space map object, preceded by a debug entry
+ * recording the action, sync pass, and txg. Entries are staged in a
+ * buffer that is flushed with dmu_write() whenever it fills. If
+ * 'dest' is non-NULL, each segment is also added to that map as it
+ * is consumed. The in-core map is empty on return.
+ */
+void
+space_map_sync(space_map_t *sm, space_map_t *dest, space_map_obj_t *smo,
+ uint8_t maptype, objset_t *os, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_objset_spa(os);
+ void *cookie = NULL;
+ space_seg_t *ss;
+ uint64_t bufsize, start, size, run_len;
+ uint64_t *entry, *entry_map, *entry_map_end;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+
+ if (sm->sm_space == 0)
+ return;
+
+ dprintf("object %4llu, txg %llu, pass %d, %c, count %lu, space %llx\n",
+ smo->smo_object, dmu_tx_get_txg(tx), spa_sync_pass(spa),
+ maptype == SM_ALLOC ? 'A' : 'F', avl_numnodes(&sm->sm_root),
+ sm->sm_space);
+
+ /*
+ * Size the staging buffer for the common case (one entry per
+ * segment plus slack), bounded by SPACE_MAP_CHUNKSIZE.
+ */
+ bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t);
+ bufsize = MIN(bufsize, SPACE_MAP_CHUNKSIZE);
+ entry_map = kmem_alloc(bufsize, KM_SLEEP);
+ entry_map_end = entry_map + (bufsize / sizeof (uint64_t));
+ entry = entry_map;
+
+ *entry++ = SM_DEBUG_ENCODE(1) |
+ SM_DEBUG_ACTION_ENCODE(maptype) |
+ SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) |
+ SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
+
+ while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
+ size = ss->ss_end - ss->ss_start;
+ /* On-disk offsets are relative to sm_start, in shift units. */
+ start = (ss->ss_start - sm->sm_start) >> sm->sm_shift;
+
+ if (dest)
+ space_map_add(dest, ss->ss_start, size);
+
+ sm->sm_space -= size;
+ size >>= sm->sm_shift;
+
+ while (size) {
+ run_len = MIN(size, SM_RUN_MAX);
+
+ /* Flush the staging buffer when it fills up. */
+ if (entry == entry_map_end) {
+ dmu_write(os, smo->smo_object, smo->smo_objsize,
+ bufsize, entry_map, tx);
+ smo->smo_objsize += bufsize;
+ entry = entry_map;
+ }
+
+ *entry++ = SM_OFFSET_ENCODE(start) |
+ SM_TYPE_ENCODE(maptype) |
+ SM_RUN_ENCODE(run_len);
+
+ start += run_len;
+ size -= run_len;
+ }
+ kmem_free(ss, sizeof (*ss));
+ }
+
+ /* Write out whatever remains in the staging buffer. */
+ if (entry != entry_map) {
+ size = (entry - entry_map) * sizeof (uint64_t);
+ dmu_write(os, smo->smo_object, smo->smo_objsize,
+ size, entry_map, tx);
+ smo->smo_objsize += size;
+ }
+
+ kmem_free(entry_map, bufsize);
+
+ VERIFY3U(sm->sm_space, ==, 0);
+}
+
+/*
+ * Rewrite the space map object from scratch (condensing the log):
+ * free the old on-disk contents, then sync the in-core map as pure
+ * ALLOC entries. The in-core map must account for exactly the
+ * object's recorded allocated space.
+ */
+void
+space_map_write(space_map_t *sm, space_map_obj_t *smo, objset_t *os,
+ dmu_tx_t *tx)
+{
+ uint64_t oldsize = smo->smo_objsize;
+
+ dmu_free_range(os, smo->smo_object, 0, smo->smo_objsize, tx);
+
+ smo->smo_objsize = 0;
+
+ VERIFY3U(sm->sm_space, ==, smo->smo_alloc);
+ space_map_sync(sm, NULL, smo, SM_ALLOC, os, tx);
+
+ dprintf("write sm object %llu from %llu to %llu bytes in txg %llu\n",
+ smo->smo_object, oldsize, smo->smo_objsize, dmu_tx_get_txg(tx));
+}
diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h
new file mode 100644
index 0000000000..b11cd42b6d
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h
@@ -0,0 +1,90 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ARC_H
+#define _SYS_ARC_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zio.h>
+
+typedef struct arc_buf_hdr arc_buf_hdr_t;
+typedef struct arc_buf arc_buf_t;
+typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private);
+typedef void arc_byteswap_func_t(void *buf, size_t size);
+
+/* generic arc_done_func_t's which you can use */
+arc_done_func_t arc_bcopy_func;
+arc_done_func_t arc_getbuf_func;
+
+/*
+ * An ARC buffer handle as seen by consumers; the header it points to
+ * is opaque here (arc_buf_hdr_t is declared but defined in arc.c).
+ */
+struct arc_buf {
+ arc_buf_hdr_t *b_hdr; /* backing header (internal to the arc) */
+ arc_buf_t *b_next; /* NOTE(review): presumably links bufs
+ sharing one hdr -- confirm in arc.c */
+ void *b_data; /* the buffer's data */
+};
+
+/*
+ * These are the flags we pass into calls to the arc
+ */
+#define ARC_WAIT (1 << 1) /* perform I/O synchronously */
+#define ARC_NOWAIT (1 << 2) /* perform I/O asynchronously */
+#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */
+
+arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag);
+void arc_buf_free(arc_buf_t *buf, void *tag);
+int arc_buf_size(arc_buf_t *buf);
+void arc_release(arc_buf_t *buf, void *tag);
+int arc_released(arc_buf_t *buf);
+
+int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
+ arc_done_func_t *done, void *private, int priority, int flags,
+ uint32_t arc_flags);
+int arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+ uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
+ arc_done_func_t *done, void *private, int priority, int flags,
+ uint32_t arc_flags);
+int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ zio_done_func_t *done, void *private, uint32_t arc_flags);
+int arc_tryread(spa_t *spa, blkptr_t *bp, void *data);
+
+void arc_flush(void);
+void arc_tempreserve_clear(uint64_t tempreserve);
+int arc_tempreserve_space(uint64_t tempreserve);
+
+void arc_init(void);
+void arc_fini(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ARC_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/bplist.h b/usr/src/uts/common/fs/zfs/sys/bplist.h
new file mode 100644
index 0000000000..0933cb977b
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/bplist.h
@@ -0,0 +1,83 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_BPLIST_H
+#define _SYS_BPLIST_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct bplist_phys {
+ /*
+ * This is the bonus buffer for the dead lists. The object's
+ * contents is an array of bpl_entries blkptr_t's, representing
+ * a total of bpl_bytes physical space.
+ */
+ uint64_t bpl_entries;
+ uint64_t bpl_bytes;
+} bplist_phys_t;
+
+typedef struct bplist_q {
+ blkptr_t bpq_blk;
+ void *bpq_next;
+} bplist_q_t;
+
+/*
+ * In-core state for an on-disk block pointer list (see bplist_phys_t
+ * above for the bonus-buffer layout).
+ */
+typedef struct bplist {
+ kmutex_t bpl_lock; /* protects the list state */
+ objset_t *bpl_mos; /* objset containing the object */
+ uint64_t bpl_object; /* the bplist's object number */
+ int bpl_blockshift; /* presumably log2 of block size -- confirm */
+ int bpl_bpshift; /* presumably log2 of bps per block -- confirm */
+ bplist_q_t *bpl_queue; /* deferred entries (bplist_enqueue_deferred) */
+ bplist_phys_t *bpl_phys; /* -> bonus buffer contents */
+ dmu_buf_t *bpl_dbuf; /* hold on the bonus buffer */
+ dmu_buf_t *bpl_cached_dbuf; /* NOTE(review): looks like an MRU data
+ block cache -- confirm in bplist.c */
+} bplist_t;
+
+extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx);
+extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx);
+extern void bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object);
+extern void bplist_close(bplist_t *bpl);
+extern boolean_t bplist_empty(bplist_t *bpl);
+extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp);
+extern void bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx);
+extern void bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp);
+extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx);
+extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BPLIST_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h
new file mode 100644
index 0000000000..3cf45f5985
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h
@@ -0,0 +1,302 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DBUF_H
+#define _SYS_DBUF_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/arc.h>
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define DB_BONUS_BLKID (-1ULL)
+#define IN_DMU_SYNC ((blkptr_t *)-1)
+
+/*
+ * define flags for dbuf_read and friends
+ */
+
+#define DB_RF_MUST_SUCCEED 0
+#define DB_RF_CANFAIL (1 << 1)
+#define DB_RF_HAVESTRUCT (1 << 2)
+#define DB_RF_NOPREFETCH (1 << 3)
+
+/*
+ * The state transition diagram for dbufs looks like:
+ *
+ * +----> READ ----+
+ * | |
+ * | V
+ * (alloc)-->UNCACHED CACHED-->(free)
+ * | ^
+ * | |
+ * +----> FILL ----+
+ */
+typedef enum dbuf_states {
+ DB_UNCACHED, /* no valid data (initial/terminal state) */
+ DB_FILL, /* being filled by the user -- see diagram above */
+ DB_READ, /* read in progress -- see diagram above */
+ DB_CACHED /* valid data present */
+} dbuf_states_t;
+
+struct objset_impl;
+struct dnode;
+struct dmu_tx;
+
+/*
+ * level = 0 means the user data
+ * level = 1 means the single indirect block
+ * etc.
+ */
+
+#define LIST_LINK_INACTIVE(link) \
+ ((link)->list_next == NULL && (link)->list_prev == NULL)
+
+typedef struct dmu_buf_impl {
+ /*
+ * The following members are immutable, with the exception of
+ * db.db_data, which is protected by db_mtx.
+ */
+
+ /* the publicly visible structure */
+ dmu_buf_t db;
+
+ /* the objset we belong to */
+ struct objset_impl *db_objset;
+
+ /*
+ * the dnode we belong to (NULL when evicted)
+ */
+ struct dnode *db_dnode;
+
+ /*
+ * our parent buffer; if the dnode points to us directly,
+ * db_parent == db_dnode->dn_dbuf
+ * only accessed by sync thread ???
+ * (NULL when evicted)
+ */
+ struct dmu_buf_impl *db_parent;
+
+ /*
+ * link for hash table of all dmu_buf_impl_t's
+ */
+ struct dmu_buf_impl *db_hash_next;
+
+ /* our block number */
+ uint64_t db_blkid;
+
+ /*
+ * Pointer to the blkptr_t which points to us. May be NULL if we
+ * don't have one yet. (NULL when evicted)
+ */
+ blkptr_t *db_blkptr;
+
+ /*
+ * Our indirection level. Data buffers have db_level==0.
+ * Indirect buffers which point to data buffers have
+ * db_level==1. etc. Buffers which contain dnodes have
+ * db_level==0, since the dnodes are stored in a file.
+ */
+ uint8_t db_level;
+
+ /* db_mtx protects the members below */
+ kmutex_t db_mtx;
+
+ /*
+ * Current state of the buffer
+ */
+ dbuf_states_t db_state;
+
+ /*
+ * Refcount accessed by dmu_buf_{hold,rele}.
+ * If nonzero, the buffer can't be destroyed.
+ * Protected by db_mtx.
+ */
+ refcount_t db_holds;
+
+ /* buffer holding our data */
+ arc_buf_t *db_buf;
+
+ /*
+ * NOTE(review): db_changed is presumably signaled on db_state
+ * transitions and db_data_pending holds data being synced --
+ * confirm against dbuf.c.
+ */
+ kcondvar_t db_changed;
+ arc_buf_t *db_data_pending;
+
+ /*
+ * Last time (transaction group) this buffer was dirtied.
+ */
+ uint64_t db_dirtied;
+
+ /*
+ * If dd_dnode != NULL, our link on the owner dnodes's dn_dbufs list.
+ * Protected by its dn_mtx.
+ */
+ list_node_t db_link;
+
+ /* Our link on dn_dirty_dbufs[txg] */
+ list_node_t db_dirty_node[TXG_SIZE];
+ /* count of txgs in which we are dirty -- presumably; confirm */
+ uint8_t db_dirtycnt;
+
+ /*
+ * Data which is unique to data (leaf) blocks:
+ */
+ struct {
+ /* stuff we store for the user (see dmu_buf_set_user) */
+ void *db_user_ptr;
+ void **db_user_data_ptr_ptr;
+ dmu_buf_evict_func_t *db_evict_func;
+ uint8_t db_immediate_evict;
+ uint8_t db_freed_in_flight;
+
+ /*
+ * db_data_old[txg&TXG_MASK] is set when we
+ * dirty the buffer, so that we can retain the
+ * pointer even if it gets COW'd in a subsequent
+ * transaction group.
+ *
+ * If the buffer is dirty in any txg, it can't
+ * be destroyed.
+ */
+ /*
+ * XXX Protected by db_mtx and dn_dirty_mtx.
+ * db_mtx must be held to read db_dirty[], and
+ * both db_mtx and dn_dirty_mtx must be held to
+ * modify (dirty or clean). db_mtx must be held
+ * before dn_dirty_mtx.
+ */
+ arc_buf_t *db_data_old[TXG_SIZE];
+ blkptr_t *db_overridden_by[TXG_SIZE];
+ } db_d;
+} dmu_buf_impl_t;
+
+/* Note: the dbuf hash table is exposed only for the mdb module */
+#define DBUF_MUTEXES 256
+#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
+typedef struct dbuf_hash_table {
+ uint64_t hash_table_mask;
+ dmu_buf_impl_t **hash_table;
+ kmutex_t hash_mutexes[DBUF_MUTEXES];
+} dbuf_hash_table_t;
+
+
+uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
+
+dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
+
+dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid);
+dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
+ void *tag);
+dmu_buf_impl_t *dbuf_hold_bonus(struct dnode *dn, void *tag);
+int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create,
+ void *tag, dmu_buf_impl_t **dbp);
+
+void dbuf_prefetch(struct dnode *dn, uint64_t blkid);
+
+void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
+void dbuf_remove_ref(dmu_buf_impl_t *db, void *tag);
+uint64_t dbuf_refcount(dmu_buf_impl_t *db);
+
+void dbuf_rele(dmu_buf_impl_t *db);
+
+dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid);
+
+void dbuf_read(dmu_buf_impl_t *db);
+int dbuf_read_canfail(dmu_buf_impl_t *db);
+void dbuf_read_havestruct(dmu_buf_impl_t *db);
+void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
+void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dbuf_will_fill(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
+void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
+void dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+
+void dbuf_evict(dmu_buf_impl_t *db);
+
+void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx);
+void dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg);
+
+void dbuf_free_range(struct dnode *dn, uint64_t blkid, uint64_t nblks,
+ struct dmu_tx *);
+
+void dbuf_downgrade(dmu_buf_impl_t *db, int evicting);
+void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
+
+void dbuf_init(void);
+void dbuf_fini(void);
+
+#ifdef ZFS_DEBUG
+
+/*
+ * There should be a ## between the string literal and fmt, to make it
+ * clear that we're joining two strings together, but that piece of shit
+ * gcc doesn't support that preprocessor token.
+ */
+#define dprintf_dbuf(dbuf, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char __db_buf[32]; \
+ uint64_t __db_obj = (dbuf)->db.db_object; \
+ if (__db_obj == DMU_META_DNODE_OBJECT) \
+ (void) strcpy(__db_buf, "mdn"); \
+ else \
+ (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
+ (u_longlong_t)__db_obj); \
+ dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \
+ "obj=%s lvl=%u blkid=%lld " fmt, \
+ __db_buf, (dbuf)->db_level, \
+ (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+
+#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char __blkbuf[200]; \
+ sprintf_blkptr(__blkbuf, bp); \
+ dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+
+#else
+
+#define dprintf_dbuf(db, fmt, ...)
+#define dprintf_dbuf_bp(db, bp, fmt, ...)
+
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DBUF_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
new file mode 100644
index 0000000000..f51ab89a90
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -0,0 +1,635 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DMU_H
+#define _SYS_DMU_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * This file describes the interface that the DMU provides for its
+ * consumers.
+ *
+ * The DMU also interacts with the SPA. That interface is described in
+ * dmu_spa.h.
+ */
+
+#include <sys/inttypes.h>
+#include <sys/types.h>
+#include <sys/param.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct uio;
+struct vnode;
+struct spa;
+struct zilog;
+struct zio;
+struct blkptr;
+struct zap_cursor;
+struct dsl_dataset;
+struct dsl_pool;
+struct dnode;
+struct drr_begin;
+struct drr_end;
+
+typedef struct objset objset_t;
+typedef struct dmu_tx dmu_tx_t;
+typedef struct dsl_dir dsl_dir_t;
+
+/*
+ * Every DMU object has a type, which determines how its contents are
+ * byteswapped and interpreted (see dmu_ot[]).  The comment after each
+ * entry notes the storage format of that object's data.
+ */
+typedef enum dmu_object_type {
+ DMU_OT_NONE,
+ /* general: */
+ DMU_OT_OBJECT_DIRECTORY, /* ZAP */
+ DMU_OT_OBJECT_ARRAY, /* UINT64 */
+ DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */
+ DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */
+ DMU_OT_BPLIST, /* UINT64 */
+ DMU_OT_BPLIST_HDR, /* UINT64 */
+ /* spa: */
+ DMU_OT_SPACE_MAP_HEADER, /* UINT64 */
+ DMU_OT_SPACE_MAP, /* UINT64 */
+ /* zil: */
+ DMU_OT_INTENT_LOG, /* UINT64 */
+ /* dmu: */
+ DMU_OT_DNODE, /* DNODE */
+ DMU_OT_OBJSET, /* OBJSET */
+ /* dsl: */
+ DMU_OT_DSL_DATASET, /* UINT64 */
+ DMU_OT_DSL_DATASET_CHILD_MAP, /* ZAP */
+ DMU_OT_DSL_OBJSET_SNAP_MAP, /* ZAP */
+ DMU_OT_DSL_PROPS, /* ZAP */
+ DMU_OT_DSL_OBJSET, /* UINT64 */
+ /* zpl: */
+ DMU_OT_ZNODE, /* ZNODE */
+ DMU_OT_ACL, /* ACL */
+ DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */
+ DMU_OT_DIRECTORY_CONTENTS, /* ZAP */
+ DMU_OT_MASTER_NODE, /* ZAP */
+ DMU_OT_DELETE_QUEUE, /* ZAP */
+ /* zvol: */
+ DMU_OT_ZVOL, /* UINT8 */
+ DMU_OT_ZVOL_PROP, /* ZAP */
+ /* other; for testing only! */
+ DMU_OT_PLAIN_OTHER, /* UINT8 */
+ DMU_OT_UINT64_OTHER, /* UINT64 */
+ DMU_OT_ZAP_OTHER, /* ZAP */
+
+ DMU_OT_NUMTYPES
+} dmu_object_type_t;
+
+/* The kind of consumer data an objset holds (see dmu_objset_open()). */
+typedef enum dmu_objset_type {
+ DMU_OST_NONE,
+ DMU_OST_META,
+ DMU_OST_ZFS,
+ DMU_OST_ZVOL,
+ DMU_OST_OTHER, /* For testing only! */
+ DMU_OST_ANY, /* Be careful! */
+ DMU_OST_NUMTYPES
+} dmu_objset_type_t;
+
+void byteswap_uint64_array(void *buf, size_t size);
+void byteswap_uint32_array(void *buf, size_t size);
+void byteswap_uint16_array(void *buf, size_t size);
+void byteswap_uint8_array(void *buf, size_t size);
+void zap_byteswap(void *buf, size_t size);
+void zfs_acl_byteswap(void *buf, size_t size);
+void zfs_znode_byteswap(void *buf, size_t size);
+
+#define DS_MODE_NONE 0 /* invalid, to aid debugging */
+#define DS_MODE_STANDARD 1 /* normal access, no special needs */
+#define DS_MODE_PRIMARY 2 /* the "main" access, e.g. a mount */
+#define DS_MODE_EXCLUSIVE 3 /* exclusive access, e.g. to destroy */
+#define DS_MODE_LEVELS 4
+#define DS_MODE_LEVEL(x) ((x) & (DS_MODE_LEVELS - 1))
+#define DS_MODE_READONLY 0x8
+#define DS_MODE_IS_READONLY(x) ((x) & DS_MODE_READONLY)
+#define DS_MODE_RESTORE 0x10
+#define DS_MODE_IS_RESTORE(x) ((x) & DS_MODE_RESTORE)
+
+#define DS_FIND_SNAPSHOTS 0x01
+
+/*
+ * The maximum number of bytes that can be accessed as part of one
+ * operation, including metadata.
+ */
+#define DMU_MAX_ACCESS (10<<20) /* 10MB */
+
+/*
+ * Public routines to create, destroy, open, and close objsets.
+ */
+int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
+ objset_t **osp);
+void dmu_objset_close(objset_t *os);
+int dmu_objset_create(const char *name, dmu_objset_type_t type,
+ objset_t *clone_parent,
+ void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg);
+int dmu_objset_destroy(const char *name);
+int dmu_objset_rollback(const char *name);
+int dmu_objset_rename(const char *name, const char *newname);
+void dmu_objset_set_quota(objset_t *os, uint64_t quota);
+uint64_t dmu_objset_get_quota(objset_t *os);
+int dmu_objset_request_reservation(objset_t *os, uint64_t reservation);
+void dmu_objset_find(char *name, void func(char *, void *), void *arg,
+ int flags);
+void dmu_objset_byteswap(void *buf, size_t size);
+
+/*
+ * Public view of a held DMU buffer.  Consumers access the contents via
+ * db_data, after calling dmu_buf_read() (to read) or dmu_buf_will_dirty()
+ * (to modify) -- see the comments above dmu_buf_hold() below.
+ */
+typedef struct dmu_buf {
+ uint64_t db_object; /* object that this buffer is part of */
+ uint64_t db_offset; /* byte offset in this object */
+ uint64_t db_size; /* size of buffer in bytes */
+ void *db_data; /* data in buffer */
+} dmu_buf_t;
+
+typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
+
+/*
+ * Callback function to perform byte swapping on a block.
+ */
+typedef void dmu_byteswap_func_t(void *buf, size_t size);
+
+#define DMU_POOL_DIRECTORY_OBJECT 1
+#define DMU_POOL_CONFIG "config"
+#define DMU_POOL_ROOT_DATASET "root_dataset"
+#define DMU_POOL_SYNC_BPLIST "sync_bplist"
+
+/*
+ * Allocate an object from this objset. The range of object numbers
+ * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode.
+ *
+ * The transaction must be assigned to a txg. The newly allocated
+ * object will be "held" in the transaction (ie. you can modify the
+ * newly allocated object in this transaction).
+ *
+ * dmu_object_alloc() chooses an object and returns it in *objectp.
+ *
+ * dmu_object_claim() allocates a specific object number. If that
+ * number is already allocated, it fails and returns EEXIST.
+ *
+ * Return 0 on success, or ENOSPC or EEXIST as specified above.
+ */
+uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
+int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
+int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+
+/*
+ * Free an object from this objset.
+ *
+ * The object's data will be freed as well (ie. you don't need to call
+ * dmu_free(object, 0, -1, tx)).
+ *
+ * The object need not be held in the transaction.
+ *
+ * If there are any holds on this object's buffers (via dmu_buf_hold()),
+ * or tx holds on the object (via dmu_tx_hold_object()), you can not
+ * free it; it fails and returns EBUSY.
+ *
+ * If the object is not allocated, it fails and returns ENOENT.
+ *
+ * Return 0 on success, or EBUSY or ENOENT as specified above.
+ */
+int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx);
+
+/*
+ * Find the next allocated or free object.
+ *
+ * The objectp parameter is in-out. It will be updated to be the next
+ * object which is allocated.
+ *
+ * XXX Can only be called on an objset with no dirty data.
+ *
+ * Returns 0 on success, or ENOENT if there are no more objects.
+ */
+int dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole);
+
+/*
+ * Set the data blocksize for an object.
+ *
+ * The object cannot have any blocks allocated beyond the first. If
+ * the first block is allocated already, the new size must be greater
+ * than the current block size. If these conditions are not met,
+ * ENOTSUP will be returned.
+ *
+ * Returns 0 on success, or EBUSY if there are any holds on the object
+ * contents, or ENOTSUP as described above.
+ */
+int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
+ int ibs, dmu_tx_t *tx);
+
+/*
+ * Set the checksum property on a dnode. The new checksum algorithm will
+ * apply to all newly written blocks; existing blocks will not be affected.
+ */
+void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
+ dmu_tx_t *tx);
+
+/*
+ * Set the compress property on a dnode. The new compression algorithm will
+ * apply to all newly written blocks; existing blocks will not be affected.
+ */
+void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
+ dmu_tx_t *tx);
+
+/*
+ * The bonus data is accessed more or less like a regular buffer.
+ * You must dmu_bonus_hold() to get the buffer, which will give you a
+ * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
+ * data. As with any normal buffer, you must call dmu_buf_read() to
+ * read db_data, dmu_buf_will_dirty() before modifying it, and the
+ * object must be held in an assigned transaction before calling
+ * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus
+ * buffer as well. You must release your hold with dmu_buf_rele().
+ */
+dmu_buf_t *dmu_bonus_hold(objset_t *os, uint64_t object);
+dmu_buf_t *dmu_bonus_hold_tag(objset_t *os, uint64_t object, void *tag);
+int dmu_bonus_max(void);
+
+/*
+ * Obtain the DMU buffer from the specified object which contains the
+ * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so
+ * that it will remain in memory. You must release the hold with
+ * dmu_buf_rele(). You mustn't access the dmu_buf_t after releasing your
+ * hold. You must have a hold on any dmu_buf_t* you pass to the DMU.
+ *
+ * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill
+ * on the returned buffer before reading or writing the buffer's
+ * db_data. The comments for those routines describe what particular
+ * operations are valid after calling them.
+ *
+ * The object number must be a valid, allocated object number.
+ */
+dmu_buf_t *dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset);
+void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
+void dmu_buf_remove_ref(dmu_buf_t *db, void* tag);
+void dmu_buf_rele(dmu_buf_t *db);
+void dmu_buf_rele_tag(dmu_buf_t *db, void *tag);
+uint64_t dmu_buf_refcount(dmu_buf_t *db);
+
+/*
+ * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a
+ * range of an object. A pointer to an array of dmu_buf_t*'s is
+ * returned (in *dbpp).
+ *
+ * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and
+ * frees the array. The hold on the array of buffers MUST be released
+ * with dmu_buf_rele_array. You can NOT release the hold on each buffer
+ * individually with dmu_buf_rele.
+ */
+dmu_buf_t **dmu_buf_hold_array(objset_t *os, uint64_t object,
+ uint64_t offset, uint64_t length, int *numbufs);
+void dmu_buf_rele_array(dmu_buf_t **, int numbufs);
+
+/*
+ * Returns NULL on success, or the existing user ptr if it's already
+ * been set.
+ *
+ * user_ptr is for use by the user and can be obtained via dmu_buf_get_user().
+ *
+ * user_data_ptr_ptr should be NULL, or a pointer to a pointer which
+ * will be set to db->db_data when you are allowed to access it. Note
+ * that db->db_data (the pointer) can change when you do dmu_buf_read(),
+ * dmu_buf_tryupgrade(), dmu_buf_will_dirty(), or dmu_buf_will_fill().
+ * *user_data_ptr_ptr will be set to the new value when it changes.
+ *
+ * If non-NULL, pageout func will be called when this buffer is being
+ * excised from the cache, so that you can clean up the data structure
+ * pointed to by user_ptr.
+ *
+ * dmu_evict_user() will call the pageout func for all buffers in an
+ * objset with a given pageout func.
+ */
+void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr,
+ dmu_buf_evict_func_t *pageout_func);
+/*
+ * set_user_ie is the same as set_user, but request immediate eviction
+ * when hold count goes to zero.
+ */
+void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr,
+ void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func);
+void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr,
+ void *user_ptr, void *user_data_ptr_ptr,
+ dmu_buf_evict_func_t *pageout_func);
+void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func);
+
+void dmu_buf_hold_data(dmu_buf_t *db);
+void dmu_buf_rele_data(dmu_buf_t *db);
+
+/*
+ * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set.
+ */
+void *dmu_buf_get_user(dmu_buf_t *db);
+
+/*
+ * Indicate that you are going to read the buffer's data (db_data).
+ *
+ * This routine will read the data from disk if necessary.
+ *
+ * These routines will return 0 on success, or an errno if there is a
+ * nonrecoverable I/O error.
+ */
+void dmu_buf_read(dmu_buf_t *db);
+int dmu_buf_read_canfail(dmu_buf_t *db);
+void dmu_buf_read_array(dmu_buf_t **dbp, int numbufs);
+int dmu_buf_read_array_canfail(dmu_buf_t **dbp, int numbufs);
+
+/*
+ * Indicate that you are going to modify the buffer's data (db_data).
+ *
+ * The transaction (tx) must be assigned to a txg (ie. you've called
+ * dmu_tx_assign()). The buffer's object must be held in the tx
+ * (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
+ */
+void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
+
+/*
+ * Indicate that you are going to modify the entire contents of the
+ * buffer's data ("fill" it).
+ *
+ * This routine is the same as dmu_buf_will_dirty, except that it won't
+ * read the contents off the disk, so the contents may be uninitialized
+ * and you must overwrite it.
+ *
+ * The transaction (tx) must be assigned to a txg (ie. you've called
+ * dmu_tx_assign()). The buffer's object must be held in the tx (ie.
+ * you've called dmu_tx_hold_object(tx, db->db_object)).
+ */
+/* void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); */
+
+/*
+ * You must create a transaction, then hold the objects which you will
+ * (or might) modify as part of this transaction. Then you must assign
+ * the transaction to a transaction group. Once the transaction has
+ * been assigned, you can modify buffers which belong to held objects as
+ * part of this transaction. You can't modify buffers before the
+ * transaction has been assigned; you can't modify buffers which don't
+ * belong to objects which this transaction holds; you can't hold
+ * objects once the transaction has been assigned. You may hold an
+ * object which you are going to free (with dmu_object_free()), but you
+ * don't have to.
+ *
+ * You can abort the transaction before it has been assigned.
+ *
+ * Note that you may hold buffers (with dmu_buf_hold) at any time,
+ * regardless of transaction state.
+ */
+
+#define DMU_NEW_OBJECT (-1ULL)
+#define DMU_OBJECT_END (-1ULL)
+
+dmu_tx_t *dmu_tx_create(objset_t *os);
+void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
+void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
+ uint64_t len);
+void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int ops);
+void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_abort(dmu_tx_t *tx);
+int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
+void dmu_tx_commit(dmu_tx_t *tx);
+
+/*
+ * Free up the data blocks for a defined range of a file. If size is
+ * zero, the range from offset to end-of-file is freed.
+ */
+void dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t size, dmu_tx_t *tx);
+
+/*
+ * Convenience functions.
+ *
+ * Canfail routines will return 0 on success, or an errno if there is a
+ * nonrecoverable I/O error.
+ */
+void dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ void *buf);
+int dmu_read_canfail(objset_t *dd, uint64_t object, uint64_t offset,
+ uint64_t size, void *buf);
+void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ const void *buf, dmu_tx_t *tx);
+int dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ struct uio *uio, dmu_tx_t *tx);
+
+/*
+ * Asynchronously try to read in the data.
+ */
+void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t len);
+
+/* Snapshot of an object's metadata, filled in by dmu_object_info(). */
+typedef struct dmu_object_info {
+ /* All sizes are in bytes. */
+ uint32_t doi_data_block_size;
+ uint32_t doi_metadata_block_size;
+ uint64_t doi_bonus_size;
+ dmu_object_type_t doi_type;
+ dmu_object_type_t doi_bonus_type;
+ uint8_t doi_indirection; /* 2 = dnode->indirect->data */
+ uint8_t doi_checksum;
+ uint8_t doi_compress;
+ uint8_t doi_pad[5];
+ /* Values below are number of 512-byte blocks. */
+ uint64_t doi_physical_blks; /* data + metadata */
+ uint64_t doi_max_block_offset;
+} dmu_object_info_t;
+
+/*
+ * Per-object-type attributes, indexed by dmu_object_type_t in the
+ * global dmu_ot[] table below.
+ */
+typedef struct dmu_object_type_info {
+ dmu_byteswap_func_t *ot_byteswap; /* byteswap routine for this type */
+ boolean_t ot_metadata; /* B_TRUE if the type holds metadata */
+ char *ot_name; /* human-readable type name */
+} dmu_object_type_info_t;
+
+extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
+
+/*
+ * Get information on a DMU object.
+ *
+ * Return 0 on success or ENOENT if object is not allocated.
+ *
+ * If doi is NULL, just indicates whether the object exists.
+ */
+int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
+void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
+void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
+void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
+ u_longlong_t *nblk512);
+
+/*
+ * Get the maximum nonzero offset in the object (ie. this offset and all
+ * offsets following are zero).
+ *
+ * XXX Perhaps integrate this with dmu_object_info(), although that
+ * would then have to bring in the indirect blocks.
+ */
+uint64_t dmu_object_max_nonzero_offset(objset_t *os, uint64_t object);
+
+/*
+ * Statistics and properties describing a dataset, as reported by
+ * dmu_objset_stats().
+ */
+typedef struct dmu_objset_stats {
+ dmu_objset_type_t dds_type;
+ uint8_t dds_is_snapshot;
+ uint8_t dds_is_placeholder;
+ uint8_t dds_pad[2];
+
+ uint64_t dds_creation_time;
+ uint64_t dds_creation_txg;
+
+ char dds_clone_of[MAXNAMELEN];
+
+ /* How much data is there in this objset? */
+
+ /*
+ * Space referenced, taking into account pending writes and
+ * frees. Only relevant to filesystems and snapshots (not
+ * collections).
+ */
+ uint64_t dds_space_refd;
+
+ /*
+ * Space "used", taking into account pending writes and frees, and
+ * children's reservations (in bytes). This is the amount of
+ * space that will be freed if this and all dependent items are
+ * destroyed (eg. child datasets, objsets, and snapshots). So
+ * for snapshots, this is the amount of space unique to this
+ * snapshot.
+ */
+ uint64_t dds_space_used;
+
+ /*
+ * Compressed and uncompressed bytes consumed. Does not take
+ * into account reservations. Used for computing compression
+ * ratio.
+ */
+ uint64_t dds_compressed_bytes;
+ uint64_t dds_uncompressed_bytes;
+
+ /*
+ * The dds_fsid_guid is a 56-bit ID that can change to avoid
+ * collisions. The dds_guid is a 64-bit ID that will never
+ * change, so there is a small probability that it will collide.
+ */
+ uint64_t dds_fsid_guid;
+ uint64_t dds_guid;
+
+ uint64_t dds_objects_used; /* number of objects used */
+ uint64_t dds_objects_avail; /* number of objects available */
+
+ uint64_t dds_num_clones; /* number of clones of this */
+
+ /* The dataset's administratively-set quota, in bytes. */
+ uint64_t dds_quota;
+
+ /* The dataset's administratively-set reservation, in bytes */
+ uint64_t dds_reserved;
+
+ /*
+ * The amount of additional space that this dataset can consume.
+ * Takes into account quotas & reservations.
+ * (Assuming that no other datasets consume it first.)
+ */
+ uint64_t dds_available;
+
+ /*
+ * Various properties.
+ */
+ uint64_t dds_compression;
+ uint64_t dds_checksum;
+ uint64_t dds_zoned;
+ char dds_compression_setpoint[MAXNAMELEN];
+ char dds_checksum_setpoint[MAXNAMELEN];
+ char dds_zoned_setpoint[MAXNAMELEN];
+ char dds_altroot[MAXPATHLEN];
+
+ /* The following are for debugging purposes only */
+ uint64_t dds_last_txg;
+ uint64_t dds_dir_obj;
+ uint64_t dds_objset_obj;
+ uint64_t dds_clone_of_obj;
+} dmu_objset_stats_t;
+
+/*
+ * Get stats on a dataset.
+ */
+void dmu_objset_stats(objset_t *os, dmu_objset_stats_t *dds);
+
+int dmu_objset_is_snapshot(objset_t *os);
+
+extern struct spa *dmu_objset_spa(objset_t *os);
+extern struct zilog *dmu_objset_zil(objset_t *os);
+extern struct dsl_pool *dmu_objset_pool(objset_t *os);
+extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
+extern void dmu_objset_name(objset_t *os, char *buf);
+extern dmu_objset_type_t dmu_objset_type(objset_t *os);
+extern uint64_t dmu_objset_id(objset_t *os);
+extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
+ uint64_t *id, uint64_t *offp);
+
+/*
+ * Return the txg number for the given assigned transaction.
+ */
+uint64_t dmu_tx_get_txg(dmu_tx_t *tx); /* XXX */
+
+/*
+ * Synchronous write.
+ * On success returns 0 and fills in the blk pointed at by bp.
+ * Note that while the data covered by this function will be on stable
+ * storage when the function returns this new data does not become a
+ * permanent part of the file until the associated transaction commits.
+ */
+int dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
+ struct blkptr *bp, uint64_t txg);
+
+/*
+ * Find the next hole or data block in file starting at *off
+ * Return found offset in *off. Return ESRCH for end of file.
+ */
+int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
+ uint64_t *off);
+
+/*
+ * Initial setup and final teardown.
+ */
+extern void dmu_init(void);
+extern void dmu_fini(void);
+
+typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
+ uint64_t object, uint64_t offset, int len);
+void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
+ dmu_traverse_cb_t cb, void *arg);
+
+int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp);
+int dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
+ struct vnode *vp, uint64_t voffset);
+
+/* CRC64 table */
+#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */
+extern uint64_t zfs_crc64_table[256];
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h
new file mode 100644
index 0000000000..b6e8b62ec2
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h
@@ -0,0 +1,230 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DMU_IMPL_H
+#define _SYS_DMU_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/txg_impl.h>
+#include <sys/zio.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This is the locking strategy for the DMU. Numbers in parenthesis are
+ * cases that use that lock order, referenced below:
+ *
+ * ARC is self-contained
+ * bplist is self-contained
+ * refcount is self-contained
+ * txg is self-contained (hopefully!)
+ * zst_lock
+ * zf_rwlock
+ *
+ * XXX try to improve evicting path?
+ *
+ * dp_config_rwlock > os_obj_lock > dn_struct_rwlock >
+ * dn_dbufs_mtx > hash_mutexes > db_mtx > leafs
+ *
+ * dp_config_rwlock
+ * must be held before: everything
+ * protects dd namespace changes
+ * protects property changes globally
+ * held from:
+ * dsl_dir_open/r:
+ * dsl_dir_create_sync/w:
+ * dsl_dir_sync_destroy/w:
+ * dsl_dir_rename_sync/w:
+ * dsl_prop_changed_notify/r:
+ *
+ * os_obj_lock
+ * must be held before:
+ * everything except dp_config_rwlock
+ * protects os_obj_next
+ * held from:
+ * dmu_object_alloc: dn_dbufs_mtx, db_mtx, hash_mutexes, dn_struct_rwlock
+ *
+ * dn_struct_rwlock
+ * must be held before:
+ * everything except dp_config_rwlock and os_obj_lock
+ * protects structure of dnode (eg. nlevels)
+ * db_blkptr can change when syncing out change to nlevels
+ * dn_maxblkid
+ * dn_nlevels
+ * dn_*blksz*
+ * phys nlevels, maxblkid, physical blkptr_t's (?)
+ * held from:
+ * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch
+ * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz)
+ * dmu_tx_count_free:
+ * dbuf_read_impl: db_mtx, dmu_zfetch()
+ * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch()
+ * dbuf_new_size: db_mtx
+ * dbuf_dirty: db_mtx
+ * dbuf_findbp: (callers, phys? - the real need)
+ * dbuf_create: dn_dbufs_mtx, hash_mutexes, db_mtx (phys?)
+ * dbuf_prefetch: dn_dirty_mtx, hash_mutexes, db_mtx, dn_dbufs_mtx
+ * dbuf_hold_impl: hash_mutexes, db_mtx, dn_dbufs_mtx, dbuf_findbp()
+ * dnode_sync/w (increase_indirection): db_mtx (phys)
+ * dnode_set_blksz/w: dn_dbufs_mtx (dn_*blksz*)
+ * dnode_new_blkid/w: (dn_maxblkid)
+ * dnode_free_range/w: dn_dirty_mtx (dn_maxblkid)
+ * dnode_next_offset: (phys)
+ *
+ * dn_dbufs_mtx
+ * must be held before:
+ * db_mtx, hash_mutexes
+ * protects:
+ * dn_dbufs
+ * dn_evicted
+ * held from:
+ * dmu_evict_user: db_mtx (dn_dbufs)
+ * dbuf_free_range: db_mtx (dn_dbufs)
+ * dbuf_remove_ref: db_mtx, callees:
+ * dbuf_hash_remove: hash_mutexes, db_mtx
+ * dbuf_create: hash_mutexes, db_mtx (dn_dbufs)
+ * dnode_set_blksz: (dn_dbufs)
+ *
+ * hash_mutexes (global)
+ * must be held before:
+ * db_mtx
+ * protects dbuf_hash_table (global) and db_hash_next
+ * held from:
+ * dbuf_find: db_mtx
+ * dbuf_hash_insert: db_mtx
+ * dbuf_hash_remove: db_mtx
+ *
+ * db_mtx (meta-leaf)
+ * must be held before:
+ * dn_mtx, dn_dirty_mtx, dd_lock (leaf mutexes)
+ * protects:
+ * db_state
+ * db_holds
+ * db_buf
+ * db_changed
+ * db_data_pending
+ * db_dirtied
+ * db_link
+ * db_dirty_node (??)
+ * db_dirtycnt
+ * db_d.*
+ * db.*
+ * held from:
+ * dbuf_dirty: dn_mtx, dn_dirty_mtx
+ * dbuf_dirty->dsl_dir_willuse_space: dd_lock
+ * dbuf_dirty->dbuf_new_block->dsl_dataset_block_freeable: dd_lock
+ * dbuf_undirty: dn_dirty_mtx (db_d)
+ * dbuf_write_done: dn_dirty_mtx (db_state)
+ * dbuf_*
+ * dmu_buf_update_user: none (db_d)
+ * dmu_evict_user: none (db_d) (maybe can eliminate)
+ * dbuf_find: none (db_holds)
+ * dbuf_hash_insert: none (db_holds)
+ * dmu_buf_read_array_impl: none (db_state, db_changed)
+ * dmu_sync: none (db_dirty_node, db_d)
+ * dnode_reallocate: none (db)
+ *
+ * dn_mtx (leaf)
+ * protects:
+ * dn_dirty_dbufs
+ * dn_ranges
+ * phys accounting
+ * dn_allocated_txg
+ * dn_free_txg
+ * dn_assigned_txg
+ * dd_assigned_tx
+ * dn_notxholds
+ * dn_dirtyctx
+ * dn_dirtyctx_firstset
+ * (dn_phys copy fields?)
+ * (dn_phys contents?)
+ * held from:
+ * dnode_*
+ * dbuf_dirty: none
+ * dbuf_sync: none (phys accounting)
+ * dbuf_undirty: none (dn_ranges, dn_dirty_dbufs)
+ * dbuf_write_done: none (phys accounting)
+ * dmu_object_info_from_dnode: none (accounting)
+ * dmu_tx_commit: none
+ * dmu_tx_hold_object_impl: none
+ * dmu_tx_try_assign: dn_notxholds(cv)
+ * dmu_tx_unassign: none
+ *
+ * dd_lock (leaf)
+ * protects:
+ * dd_prop_cbs
+ * dd_sync_*
+ * dd_used_bytes
+ * dd_tempreserved
+ * dd_space_towrite
+ * dd_myname
+ * dd_phys accounting?
+ * held from:
+ * dsl_dir_*
+ * dsl_prop_changed_notify: none (dd_prop_cbs)
+ * dsl_prop_register: none (dd_prop_cbs)
+ * dsl_prop_unregister: none (dd_prop_cbs)
+ * dsl_dataset_block_freeable: none (dd_sync_*)
+ *
+ * os_lock (leaf)
+ * protects:
+ * os_dirty_dnodes
+ * os_free_dnodes
+ * os_dnodes
+ * os_downgraded_dbufs
+ * dn_dirtyblksz
+ * dn_dirty_link
+ * held from:
+ * dnode_create: none (os_dnodes)
+ * dnode_destroy: none (os_dnodes)
+ * dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes)
+ * dnode_free: none (dn_dirtyblksz, os_*_dnodes)
+ *
+ * ds_lock (leaf)
+ * protects:
+ * ds_user_ptr
+ * ds_user_evict_func
+ * ds_open_refcount
+ * ds_snapname
+ * ds_phys accounting
+ * held from:
+ * dsl_dataset_*
+ *
+ */
+
+struct objset;
+struct dmu_pool;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
new file mode 100644
index 0000000000..d0a77fcfb9
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
@@ -0,0 +1,122 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DMU_OBJSET_H
+#define _SYS_DMU_OBJSET_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zfs_context.h>
+#include <sys/dnode.h>
+#include <sys/zio.h>
+#include <sys/zil.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+struct dmu_tx;
+struct objset_impl;
+
+/*
+ * On-disk objset header: the meta-dnode, the ZIL header, and the objset
+ * type, padded out to 1024 bytes.
+ */
+typedef struct objset_phys {
+ dnode_phys_t os_meta_dnode;
+ zil_header_t os_zil_header;
+ uint64_t os_type;
+ char os_pad[1024 - sizeof (dnode_phys_t) - sizeof (zil_header_t) -
+ sizeof (uint64_t)];
+} objset_phys_t;
+
+/* Consumer-visible handle for an open objset; os points at the impl. */
+struct objset {
+ struct objset_impl *os;
+ int os_mode;
+};
+
+/*
+ * In-core state for an open objset.  The roles of os_obj_lock and
+ * os_lock are documented in the lock-order comment in dmu_impl.h.
+ */
+typedef struct objset_impl {
+ /* Immutable: */
+ struct dsl_dataset *os_dsl_dataset;
+ spa_t *os_spa;
+ objset_phys_t *os_phys;
+ dnode_t *os_meta_dnode;
+ zilog_t *os_zil;
+ objset_t os;
+ uint8_t os_checksum; /* can change, under dsl_dir's locks */
+ uint8_t os_compress; /* can change, under dsl_dir's locks */
+ uint8_t os_md_checksum;
+ uint8_t os_md_compress;
+
+ /* no lock needed: */
+ struct dmu_tx *os_synctx; /* XXX sketchy */
+ blkptr_t os_rootbp;
+
+ /* Protected by os_obj_lock */
+ kmutex_t os_obj_lock;
+ uint64_t os_obj_next;
+
+ /* Protected by os_lock */
+ kmutex_t os_lock;
+ list_t os_dirty_dnodes[TXG_SIZE];
+ list_t os_free_dnodes[TXG_SIZE];
+ list_t os_dnodes;
+ list_t os_downgraded_dbufs;
+} objset_impl_t;
+
+#define DMU_PRIVATE_OBJECT (1ULL << 63)
+
+#define DMU_META_DNODE_OBJECT (1ULL << 63)
+
+/* XXX rename this to DMU_IS_DNODE_OBJECT? */
+#define IS_DNODE_DNODE(object) ((object) == DMU_META_DNODE_OBJECT)
+
+/* called from zpl */
+int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
+ objset_t **osp);
+void dmu_objset_close(objset_t *os);
+int dmu_objset_create(const char *name, dmu_objset_type_t type,
+ objset_t *clone_parent,
+ void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg);
+int dmu_objset_destroy(const char *name);
+int dmu_objset_rollback(const char *name);
+void dmu_objset_stats(objset_t *os, dmu_objset_stats_t *dds);
+void dmu_objset_find(char *name, void func(char *, void *), void *arg,
+ int flags);
+void dmu_objset_byteswap(void *buf, size_t size);
+
+/* called from dsl */
+void dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx);
+objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
+ dmu_objset_type_t type, dmu_tx_t *tx);
+objset_impl_t *dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds,
+ blkptr_t *bp);
+void dmu_objset_evict(struct dsl_dataset *ds, void *arg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_OBJSET_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h
new file mode 100644
index 0000000000..7087912e00
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h
@@ -0,0 +1,125 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DMU_TRAVERSE_H
+#define _SYS_DMU_TRAVERSE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+#include <sys/dnode.h>
+#include <sys/arc.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ADVANCE_POST 0 /* post-order traversal */
+#define ADVANCE_PRE 0x01 /* pre-order traversal */
+#define ADVANCE_PRUNE 0x02 /* prune by prev snapshot birth time */
+#define ADVANCE_DATA 0x04 /* read user data blocks */
+#define ADVANCE_HOLES 0x08 /* visit holes */
+#define ADVANCE_NOLOCK 0x10 /* Don't grab SPA sync lock */
+
+#define ZB_NO_LEVEL -2
+#define ZB_MAXLEVEL 32 /* Next power of 2 >= DN_MAX_LEVELS */
+#define ZB_MAXBLKID (1ULL << 62)
+#define ZB_MAXOBJSET (1ULL << 62)
+#define ZB_MAXOBJECT (1ULL << 62)
+
+#define ZB_MOS_CACHE 0
+#define ZB_MDN_CACHE 1
+#define ZB_DN_CACHE 2
+#define ZB_DEPTH 3
+
+typedef struct zbookmark {
+ uint64_t zb_objset;
+ uint64_t zb_object;
+ int zb_level;
+ uint64_t zb_blkid;
+} zbookmark_t;
+
+typedef struct zseg {
+ uint64_t seg_mintxg;
+ uint64_t seg_maxtxg;
+ zbookmark_t seg_start;
+ zbookmark_t seg_end;
+ list_node_t seg_node;
+} zseg_t;
+
+typedef struct traverse_blk_cache {
+ zbookmark_t bc_bookmark;
+ blkptr_t bc_blkptr;
+ void *bc_data;
+ dnode_phys_t *bc_dnode;
+ int bc_errno;
+ int bc_pad1;
+ uint64_t bc_pad2;
+} traverse_blk_cache_t;
+
+typedef int (blkptr_cb_t)(traverse_blk_cache_t *bc, spa_t *spa, void *arg);
+
+struct traverse_handle {
+ spa_t *th_spa;
+ blkptr_cb_t *th_func;
+ void *th_arg;
+ int th_advance;
+ int th_zio_flags;
+ list_t th_seglist;
+ traverse_blk_cache_t th_cache[ZB_DEPTH][ZB_MAXLEVEL];
+ uint64_t th_hits;
+ uint64_t th_arc_hits;
+ uint64_t th_reads;
+ uint64_t th_callbacks;
+ uint64_t th_syncs;
+ uint64_t th_restarts;
+ zbookmark_t th_noread;
+ zbookmark_t th_lastcb;
+};
+
+int traverse_dsl_dataset(struct dsl_dataset *ds, uint64_t txg_start,
+ int advance, blkptr_cb_t func, void *arg);
+
+traverse_handle_t *traverse_init(spa_t *spa, blkptr_cb_t *func, void *arg,
+ int advance, int zio_flags);
+void traverse_fini(traverse_handle_t *th);
+
+void traverse_add_dnode(traverse_handle_t *th,
+ uint64_t mintxg, uint64_t maxtxg, uint64_t objset, uint64_t object);
+void traverse_add_objset(traverse_handle_t *th,
+ uint64_t mintxg, uint64_t maxtxg, uint64_t objset);
+void traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg);
+
+int traverse_more(traverse_handle_t *th);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_TRAVERSE_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_tx.h b/usr/src/uts/common/fs/zfs/sys/dmu_tx.h
new file mode 100644
index 0000000000..5d2f1127ce
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_tx.h
@@ -0,0 +1,154 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DMU_TX_H
+#define _SYS_DMU_TX_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/inttypes.h>
+#include <sys/dmu.h>
+#include <sys/txg.h>
+#include <sys/refcount.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dmu_buf_impl;
+struct dnode_link;
+struct dsl_pool;
+struct dnode;
+struct dsl_dir;
+
+struct dmu_tx {
+ /*
+ * No synchronization is needed because a tx can only be handled
+ * by one thread.
+ */
+ list_t tx_holds; /* list of dmu_tx_hold_t */
+ objset_t *tx_objset;
+ struct dsl_dir *tx_dir;
+ struct dsl_pool *tx_pool;
+ uint64_t tx_txg;
+ txg_handle_t tx_txgh;
+ uint64_t tx_space_towrite;
+ refcount_t tx_space_written;
+ uint64_t tx_space_tofree;
+ refcount_t tx_space_freed;
+ uint64_t tx_space_tooverwrite;
+ void *tx_tempreserve_cookie;
+ uint8_t tx_anyobj;
+ uint8_t tx_privateobj;
+#ifdef ZFS_DEBUG
+ char *tx_debug_buf;
+ int tx_debug_len;
+#endif
+};
+
+enum dmu_tx_hold_type {
+ THT_NEWOBJECT,
+ THT_WRITE,
+ THT_BONUS,
+ THT_FREE,
+ THT_ZAP,
+ THT_SPACE,
+ THT_NUMTYPES
+};
+
+typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
+ uint64_t arg1, uint64_t arg2);
+
+
+typedef struct dmu_tx_hold {
+ list_node_t dth_node;
+ struct dnode *dth_dnode;
+ enum dmu_tx_hold_type dth_type;
+ dmu_tx_hold_func_t dth_func;
+ uint64_t dth_arg1;
+ uint64_t dth_arg2;
+ /* XXX track what the actual estimates were for this hold */
+} dmu_tx_hold_t;
+
+
+/*
+ * These routines are defined in dmu.h, and are called by the user.
+ */
+dmu_tx_t *dmu_tx_create(objset_t *dd);
+int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
+void dmu_tx_commit(dmu_tx_t *tx);
+void dmu_tx_abort(dmu_tx_t *tx);
+uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
+
+/*
+ * These routines are defined in dmu_spa.h, and are called by the SPA.
+ */
+extern dmu_tx_t *dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg);
+
+/*
+ * These routines are only called by the DMU.
+ */
+dmu_tx_t *dmu_tx_create_ds(dsl_dir_t *dd);
+int dmu_tx_is_syncing(dmu_tx_t *tx);
+int dmu_tx_private_ok(dmu_tx_t *tx);
+void dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object);
+void dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta);
+void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db);
+int dmu_tx_holds(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space);
+
+#ifdef ZFS_DEBUG
+
+extern int dmu_use_tx_debug_bufs;
+
+#define dprintf_tx(tx, fmt, ...) \
+ if (dmu_use_tx_debug_bufs) \
+ do { \
+ char *__bufp; \
+ int __len; \
+ if (tx->tx_debug_buf == NULL) { \
+ __bufp = kmem_zalloc(4096, KM_SLEEP); \
+ tx->tx_debug_buf = __bufp; \
+ tx->tx_debug_len = __len = 4096; \
+ } else { \
+ __len = tx->tx_debug_len; \
+ __bufp = &tx->tx_debug_buf[4096-__len]; \
+ } \
+ tx->tx_debug_len -= snprintf(__bufp, __len, fmt, __VA_ARGS__); \
+_NOTE(CONSTCOND) } while (0); \
+ else dprintf(fmt, __VA_ARGS__)
+
+#else
+
+#define dprintf_tx(tx, fmt, ...)
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_TX_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h b/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h
new file mode 100644
index 0000000000..35466d6874
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h
@@ -0,0 +1,76 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _DFETCH_H
+#define _DFETCH_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern uint64_t zfetch_array_rd_sz;
+
+struct dnode; /* so we can reference dnode */
+
+typedef enum zfetch_dirn {
+ ZFETCH_FORWARD = 1, /* prefetch increasing block numbers */
+ ZFETCH_BACKWARD = -1 /* prefetch decreasing block numbers */
+} zfetch_dirn_t;
+
+typedef struct zstream {
+ uint64_t zst_offset; /* offset of starting block in range */
+ uint64_t zst_len; /* length of range, in blocks */
+ zfetch_dirn_t zst_direction; /* direction of prefetch */
+ uint64_t zst_stride; /* length of stride, in blocks */
+ uint64_t zst_ph_offset; /* prefetch offset, in blocks */
+ uint64_t zst_cap; /* prefetch limit (cap), in blocks */
+ kmutex_t zst_lock; /* protects stream */
+ clock_t zst_last; /* lbolt of last prefetch */
+ avl_node_t zst_node; /* embed avl node here */
+} zstream_t;
+
+typedef struct zfetch {
+ krwlock_t zf_rwlock; /* protects zfetch structure */
+	list_t		zf_stream;	/* list of zstream_t's */
+ struct dnode *zf_dnode; /* dnode that owns this zfetch */
+ uint32_t zf_stream_cnt; /* # of active streams */
+ uint64_t zf_alloc_fail; /* # of failed attempts to alloc strm */
+} zfetch_t;
+
+void dmu_zfetch_init(zfetch_t *, struct dnode *);
+void dmu_zfetch_rele(zfetch_t *);
+void dmu_zfetch(zfetch_t *, uint64_t, uint64_t);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _DFETCH_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dnode.h b/usr/src/uts/common/fs/zfs/sys/dnode.h
new file mode 100644
index 0000000000..2a5ef92b52
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dnode.h
@@ -0,0 +1,301 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DNODE_H
+#define _SYS_DNODE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/refcount.h>
+#include <sys/dmu_zfetch.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Flags.
+ */
+#define DNODE_MUST_BE_ALLOCATED 1
+#define DNODE_MUST_BE_FREE 2
+
+/*
+ * Fixed constants.
+ */
+#define DNODE_SHIFT 9 /* 512 bytes */
+#define DN_MIN_INDBLKSHIFT 10 /* 1k */
+#define DN_MAX_INDBLKSHIFT 14 /* 16k */
+#define DNODE_BLOCK_SHIFT 14 /* 16k */
+#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */
+#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */
+#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */
+
+/*
+ * Derived constants.
+ */
+#define DNODE_SIZE (1 << DNODE_SHIFT)
+#define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
+#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
+
+#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
+#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
+#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
+
+#define DN_META_DNODE_LEVELS \
+ (1 + (DN_MAX_OBJECT_SHIFT - DNODE_SHIFT + SPA_BLKPTRSHIFT - \
+ DNODES_PER_BLOCK_SHIFT) / DNODES_PER_LEVEL_SHIFT)
+
+/* The +2 here is a cheesy way to round up */
+#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
+ (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT)))
+
+#define DN_MAX_OBJECT \
+ ((uint64_t)DN_MAX_NBLKPTR << (DNODES_PER_BLOCK_SHIFT + \
+ (DN_META_DNODE_LEVELS - 1) * DNODES_PER_LEVEL_SHIFT))
+
+#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \
+ (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t))))
+
+#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift))
+
+struct dmu_buf_impl;
+struct objset_impl;
+struct zio;
+
+enum dnode_dirtycontext {
+ DN_UNDIRTIED,
+ DN_DIRTY_OPEN,
+ DN_DIRTY_SYNC
+};
+
+typedef struct dnode_phys {
+ uint8_t dn_type; /* dmu_object_type_t */
+ uint8_t dn_indblkshift; /* ln2(indirect block size) */
+ uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */
+ uint8_t dn_nblkptr; /* length of dn_blkptr */
+ uint8_t dn_bonustype; /* type of data in bonus buffer */
+ uint8_t dn_checksum; /* ZIO_CHECKSUM type */
+ uint8_t dn_compress; /* ZIO_COMPRESS type */
+ uint8_t dn_pad1[1];
+ uint16_t dn_datablkszsec; /* data block size in 512b sectors */
+ uint16_t dn_bonuslen; /* length of dn_bonus */
+ uint8_t dn_pad2[4];
+
+ /* accounting is protected by dn_dirty_mtx */
+ uint64_t dn_maxblkid; /* largest allocated block ID */
+ uint64_t dn_secphys; /* 512b sectors of disk space used */
+
+ uint64_t dn_pad3[4];
+
+ blkptr_t dn_blkptr[1];
+ uint8_t dn_bonus[DN_MAX_BONUSLEN];
+} dnode_phys_t;
+
+typedef struct dnode {
+ /*
+ * lock ordering:
+ *
+ * db_mtx > dn_dirty_mtx
+ * dbuf_syncdone
+ *
+ * dn_struct_rwlock/r > dn_dirty_mtx
+ * dmu_object_info
+ *
+ * dn_struct_rwlock/r > db_mtx > dn_dirty_mtx
+ * dbuf_dirty
+ * dbuf_setdirty
+ *
+ * dn_struct_rwlock/w > db_mtx > dn_mtx
+ * dnode_increase_indirection -> dbuf_find
+ * dbuf_hold_impl
+ * dnode_set_bonus
+ *
+ * dn_struct_rwlock/w > dn_mtx
+ * dnode_increase_indirection
+ *
+ * dn_dirty_mtx > dn_mtx
+ * dnode_buf_pageout
+ *
+ * db_mtx > dn_mtx
+ * dbuf_create
+ */
+
+ /*
+ * dn_struct_rwlock protects the structure of the dnode.
+ * In particular, it protects the number of levels of indirection.
+ */
+ krwlock_t dn_struct_rwlock;
+
+ /*
+ * Our link on dataset's dd_dnodes list.
+ * Protected by dd_accounting_mtx.
+ */
+ list_node_t dn_link;
+
+ /* immutable: */
+ struct objset_impl *dn_objset;
+ uint64_t dn_object;
+ struct dmu_buf_impl *dn_dbuf;
+ dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
+
+ /*
+ * Copies of stuff in dn_phys. They're valid here even before
+ * the dnode is first synced.
+ */
+ dmu_object_type_t dn_type; /* object type (immutable) */
+ uint8_t dn_bonustype; /* bonus type (immutable) */
+ uint16_t dn_bonuslen; /* bonus length (immutable) */
+ uint8_t dn_nblkptr; /* number of blkptrs (immutable) */
+ uint8_t dn_datablkshift; /* zero if blksz not power of 2! */
+ uint32_t dn_datablksz; /* in bytes */
+ uint16_t dn_datablkszsec; /* in 512b sectors */
+
+ uint8_t dn_checksum; /* ZIO_CHECKSUM type */
+ uint8_t dn_compress; /* ZIO_COMPRESS type */
+
+ /*
+ * The following are kept up-to-date in the *open* context, the syncing
+ * context should only pay attention to the dn_next_* values.
+ */
+ uint8_t dn_nlevels;
+ uint8_t dn_indblkshift;
+
+ uint8_t dn_next_nlevels[TXG_SIZE];
+ uint8_t dn_next_indblkshift[TXG_SIZE];
+
+ /* protected by os_lock: */
+ uint32_t dn_dirtyblksz[TXG_SIZE]; /* dirty block size in bytes */
+ list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */
+
+ /* protected by dn_mtx: */
+ kmutex_t dn_mtx;
+ list_t dn_dirty_dbufs[TXG_SIZE];
+ uint64_t dn_maxblkid;
+ avl_tree_t dn_ranges[TXG_SIZE];
+ uint64_t dn_allocated_txg;
+ uint64_t dn_free_txg;
+ uint64_t dn_assigned_txg;
+ struct dmu_tx *dn_assigned_tx; /* if only one tx cares */
+ kcondvar_t dn_notxholds;
+ enum dnode_dirtycontext dn_dirtyctx;
+ uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */
+
+ /* protected by own devices */
+ refcount_t dn_tx_holds;
+ refcount_t dn_holds;
+
+ kmutex_t dn_dbufs_mtx;
+	list_t		dn_dbufs;	/* linked list of descendant dbuf_t's */
+ kcondvar_t dn_evicted; /* a child dbuf has been evicted */
+
+ /*
+ * Performance hack: whenever we have a hold on the bonus buffer of a
+ * ZAP object, we will also have a hold on db0. This will keep the
+ * meta-data for a micro-zap object cached as long as the znode for the
+ * object is in the znode cache.
+ */
+ struct dmu_buf_impl *dn_db0;
+
+ /* holds prefetch structure */
+ struct zfetch dn_zfetch;
+} dnode_t;
+
+typedef struct free_range {
+ avl_node_t fr_node;
+ uint64_t fr_blkid;
+ uint64_t fr_nblks;
+} free_range_t;
+
+dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp,
+ uint64_t object);
+void dnode_special_close(dnode_t *dn);
+
+dnode_t *dnode_hold(struct objset_impl *dd, uint64_t object, void *ref);
+dnode_t *dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag,
+ void *ref);
+void dnode_add_ref(dnode_t *dn, void *ref);
+void dnode_rele(dnode_t *dn, void *ref);
+void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
+int dnode_sync(dnode_t *dn, int level, struct zio *zio, dmu_tx_t *tx);
+void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+void dnode_free(dnode_t *dn, dmu_tx_t *tx);
+void dnode_byteswap(dnode_phys_t *dnp);
+void dnode_buf_byteswap(void *buf, size_t size);
+void dnode_verify(dnode_t *dn);
+int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx);
+uint64_t dnode_current_max_length(dnode_t *dn);
+uint64_t dnode_max_nonzero_offset(dnode_t *dn);
+void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx);
+void dnode_clear_range(dnode_t *dn, uint64_t blkid,
+ uint64_t nblks, dmu_tx_t *tx);
+void dnode_diduse_space(dnode_t *dn, int64_t space);
+void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx);
+void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx);
+uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid);
+void dnode_init(void);
+void dnode_fini(void);
+int dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *off, int minlvl,
+ uint64_t blkfill);
+
+#ifdef ZFS_DEBUG
+
+/*
+ * There should be a ## between the string literal and fmt, to make it
+ * clear that we're joining two strings together, but that piece of shit
+ * gcc doesn't support that preprocessor token.
+ */
+#define dprintf_dnode(dn, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char __db_buf[32]; \
+ uint64_t __db_obj = (dn)->dn_object; \
+ if (__db_obj == DMU_META_DNODE_OBJECT) \
+ (void) strcpy(__db_buf, "mdn"); \
+ else \
+ (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
+ (u_longlong_t)__db_obj);\
+ dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \
+ __db_buf, __VA_ARGS__); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+
+#else
+
+#define dprintf_dnode(db, fmt, ...)
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DNODE_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
new file mode 100644
index 0000000000..e56c8a67d9
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -0,0 +1,164 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DSL_DATASET_H
+#define _SYS_DSL_DATASET_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/bplist.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+struct dsl_dir;
+struct dsl_pool;
+
+typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *);
+
+typedef struct dsl_dataset_phys {
+ uint64_t ds_dir_obj;
+ uint64_t ds_prev_snap_obj;
+ uint64_t ds_prev_snap_txg;
+ uint64_t ds_next_snap_obj;
+ uint64_t ds_snapnames_zapobj; /* zap obj of snaps; ==0 for snaps */
+ uint64_t ds_num_children; /* clone/snap children; ==0 for head */
+ uint64_t ds_creation_time; /* seconds since 1970 */
+ uint64_t ds_creation_txg;
+ uint64_t ds_deadlist_obj;
+ uint64_t ds_used_bytes;
+ uint64_t ds_compressed_bytes;
+ uint64_t ds_uncompressed_bytes;
+	uint64_t ds_unique_bytes;	/* only relevant to snapshots */
+ /*
+ * The ds_fsid_guid is a 56-bit ID that can change to avoid
+ * collisions. The ds_guid is a 64-bit ID that will never
+ * change, so there is a small probability that it will collide.
+ */
+ uint64_t ds_fsid_guid;
+ uint64_t ds_guid;
+ uint64_t ds_restoring; /* boolean */
+ blkptr_t ds_bp;
+ uint64_t ds_pad[8]; /* pad out to 256 bytes for good measure */
+} dsl_dataset_phys_t;
+
+typedef struct dsl_dataset {
+ /* Immutable: */
+ struct dsl_dir *ds_dir;
+ dsl_dataset_phys_t *ds_phys;
+ dmu_buf_t *ds_dbuf;
+ uint64_t ds_object;
+
+ /* only used in syncing context: */
+ struct dsl_dataset *ds_prev; /* only valid for non-snapshots */
+
+ /* has internal locking: */
+ bplist_t ds_deadlist;
+
+ /* protected by lock on pool's dp_dirty_datasets list */
+ txg_node_t ds_dirty_link;
+ list_node_t ds_synced_link;
+
+ /*
+ * ds_phys->ds_<accounting> is also protected by ds_lock.
+ * Protected by ds_lock:
+ */
+ kmutex_t ds_lock;
+ void *ds_user_ptr;
+ dsl_dataset_evict_func_t *ds_user_evict_func;
+ uint64_t ds_open_refcount;
+
+ /* Protected by ds_lock; keep at end of struct for better locality */
+ char ds_snapname[MAXNAMELEN];
+} dsl_dataset_t;
+
+#define dsl_dataset_is_snapshot(ds) \
+ ((ds)->ds_phys->ds_num_children != 0)
+
+int dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
+ void *tag, dsl_dataset_t **dsp);
+int dsl_dataset_open(const char *name, int mode, void *tag,
+ dsl_dataset_t **dsp);
+dsl_dataset_t *dsl_dataset_open_obj(struct dsl_pool *dp, uint64_t dsobj,
+ const char *tail, int mode, void *tag);
+void dsl_dataset_name(dsl_dataset_t *ds, char *name);
+void dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag);
+int dsl_dataset_create_sync(dsl_dir_t *pds, const char *fullname,
+ const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx);
+int dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx);
+int dsl_dataset_destroy(const char *name);
+int dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx);
+int dsl_dataset_rollback(const char *name);
+int dsl_dataset_rollback_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx);
+int dsl_dataset_rename(const char *name, const char *newname);
+
+void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
+ void *p, dsl_dataset_evict_func_t func);
+void *dsl_dataset_get_user_ptr(dsl_dataset_t *ds);
+
+void dsl_dataset_get_blkptr(dsl_dataset_t *ds, blkptr_t *bp);
+void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
+
+spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds);
+
+void dsl_dataset_sync(dsl_dataset_t *os, dmu_tx_t *tx);
+
+void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
+void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
+int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth,
+ dmu_tx_t *tx);
+
+void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx);
+void dsl_dataset_stats(dsl_dataset_t *os, dmu_objset_stats_t *dds);
+struct dsl_pool *dsl_dataset_pool(dsl_dataset_t *ds);
+
+void dsl_dataset_create_root(struct dsl_pool *dp, uint64_t *ddobjp,
+ dmu_tx_t *tx);
+
+#ifdef ZFS_DEBUG
+#define dprintf_ds(ds, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char *__ds_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); \
+ dsl_dataset_name(ds, __ds_name); \
+ dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \
+ kmem_free(__ds_name, MAXNAMELEN); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+#else
+#define dprintf_ds(dd, fmt, ...)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DATASET_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h
new file mode 100644
index 0000000000..0499d731e6
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h
@@ -0,0 +1,143 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DSL_DIR_H
+#define _SYS_DSL_DIR_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/refcount.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+
+typedef struct dsl_dir_phys {
+ uint64_t dd_creation_time;
+ uint64_t dd_head_dataset_obj;
+ uint64_t dd_parent_obj;
+ uint64_t dd_clone_parent_obj;
+ uint64_t dd_child_dir_zapobj;
+ /*
+ * how much space our children are accounting for; for leaf
+ * datasets, == physical space used by fs + snaps
+ */
+ uint64_t dd_used_bytes;
+ uint64_t dd_compressed_bytes;
+ uint64_t dd_uncompressed_bytes;
+ /* Administrative quota setting */
+ uint64_t dd_quota;
+ /* Administrative reservation setting */
+ uint64_t dd_reserved;
+ uint64_t dd_props_zapobj;
+ uint64_t dd_pad[21]; /* pad out to 256 bytes for good measure */
+} dsl_dir_phys_t;
+
+struct dsl_dir {
+ /* These are immutable; no lock needed: */
+ uint64_t dd_object;
+ dsl_dir_phys_t *dd_phys;
+ dmu_buf_t *dd_dbuf;
+ dsl_pool_t *dd_pool;
+
+ /* protected by lock on pool's dp_dirty_dirs list */
+ txg_node_t dd_dirty_link;
+
+ /* protected by dp_config_rwlock */
+ dsl_dir_t *dd_parent;
+
+ /* Protected by dd_lock */
+ kmutex_t dd_lock;
+ list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */
+ /* Thing to do when we sync */
+ uint64_t dd_sync_txg;
+ int (*dd_sync_func)(dsl_dir_t *dd, void *arg, dmu_tx_t *tx);
+ void *dd_sync_arg;
+ int dd_sync_err;
+
+ /* Accounting */
+ /* reflects any changes to dd_phys->dd_used_bytes made this syncing */
+ int64_t dd_used_bytes;
+ /* int64_t dd_compressed_bytes; */
+ /* int64_t dd_uncompressed_bytes; */
+ /* gross estimate of space used by in-flight tx's */
+ uint64_t dd_tempreserved[TXG_SIZE];
+ /* amount of space we expect to write; == amount of dirty data */
+ int64_t dd_space_towrite[TXG_SIZE];
+
+ /* protected by dd_lock; keep at end of struct for better locality */
+ char dd_myname[MAXNAMELEN];
+};
+
+void dsl_dir_close(dsl_dir_t *dd, void *tag);
+dsl_dir_t *dsl_dir_open(const char *name, void *tag, const char **tail);
+dsl_dir_t *dsl_dir_open_spa(spa_t *spa, const char *name, void *tag,
+ const char **tailp);
+dsl_dir_t *dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
+ const char *tail, void *tag);
+void dsl_dir_name(dsl_dir_t *dd, char *buf);
+int dsl_dir_is_private(dsl_dir_t *dd);
+int dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx);
+void dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx);
+int dsl_dir_destroy_sync(dsl_dir_t *pds, void *arg, dmu_tx_t *tx);
+void dsl_dir_stats(dsl_dir_t *dd, dmu_objset_stats_t *dds);
+void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx);
+void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx);
+int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem,
+ uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx);
+void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx);
+void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx);
+void dsl_dir_diduse_space(dsl_dir_t *dd,
+ int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx);
+int dsl_dir_sync_task(dsl_dir_t *dd,
+ int (*func)(dsl_dir_t *, void*, dmu_tx_t *), void *arg, uint64_t space);
+int dsl_dir_set_quota(const char *ddname, uint64_t quota);
+int dsl_dir_set_reservation(const char *ddname, uint64_t reservation);
+int dsl_dir_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx);
+
+#ifdef ZFS_DEBUG
+#define dprintf_dd(dd, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char *__ds_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); \
+ dsl_dir_name(dd, __ds_name); \
+ dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \
+ kmem_free(__ds_name, MAXNAMELEN); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+#else
+#define dprintf_dd(dd, fmt, ...)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DIR_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
new file mode 100644
index 0000000000..4fca4548ad
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
@@ -0,0 +1,82 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DSL_POOL_H
+#define _SYS_DSL_POOL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/txg_impl.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct objset;
+struct dsl_dir;
+
+typedef struct dsl_pool {
+ /* Immutable */
+ spa_t *dp_spa;
+ struct objset *dp_meta_objset;
+ struct dsl_dir *dp_root_dir;
+ struct dsl_dir *dp_mos_dir;
+ uint64_t dp_root_dir_obj;
+
+ /* No lock needed - sync context only */
+ blkptr_t dp_meta_rootbp;
+ list_t dp_synced_objsets;
+
+ /* Has its own locking */
+ tx_state_t dp_tx;
+ txg_list_t dp_dirty_datasets;
+ txg_list_t dp_dirty_dirs;
+
+ /*
+ * Protects administrative changes (properties, namespace)
+ * It is only held for write in syncing context. Therefore
+ * syncing context does not need to ever have it for read, since
+ * nobody else could possibly have it for write.
+ */
+ krwlock_t dp_config_rwlock;
+} dsl_pool_t;
+
+dsl_pool_t *dsl_pool_open(spa_t *spa, uint64_t txg);
+void dsl_pool_close(dsl_pool_t *dp);
+dsl_pool_t *dsl_pool_create(spa_t *spa, uint64_t txg);
+void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
+void dsl_pool_zil_clean(dsl_pool_t *dp);
+int dsl_pool_sync_context(dsl_pool_t *dp);
+uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_POOL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_prop.h b/usr/src/uts/common/fs/zfs/sys/dsl_prop.h
new file mode 100644
index 0000000000..ea810b03ab
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_prop.h
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DSL_PROP_H
+#define _SYS_DSL_PROP_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+
+/* The callback func may not call into the DMU or DSL! */
+typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval);
+
+#define DSL_PROP_VALUE_UNDEFINED (-1ULL)
+
+typedef struct dsl_prop_cb_record {
+ list_node_t cbr_node; /* link on dd_prop_cbs */
+ const char *cbr_propname;
+ dsl_prop_changed_cb_t *cbr_func;
+ void *cbr_arg;
+} dsl_prop_cb_record_t;
+
+int dsl_prop_register(struct dsl_dataset *ds, const char *propname,
+ dsl_prop_changed_cb_t *callback, void *cbarg);
+int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname,
+ dsl_prop_changed_cb_t *callback, void *cbarg);
+
+int dsl_prop_get(const char *ddname, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint);
+int dsl_prop_get_string(const char *ddname, const char *propname,
+ char *value, int valuelen, char *setpoint);
+int dsl_prop_get_integer(const char *ddname, const char *propname,
+ uint64_t *valuep, char *setpoint);
+int dsl_prop_get_ds_integer(dsl_dir_t *dd, const char *propname,
+ uint64_t *valuep, char *setpoint);
+
+int dsl_prop_set(const char *ddname, const char *propname,
+ int intsz, int numints, const void *buf);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_PROP_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h
new file mode 100644
index 0000000000..e592b388fd
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h
@@ -0,0 +1,73 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_METASLAB_H
+#define _SYS_METASLAB_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/space_map.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/avl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct metaslab_class metaslab_class_t;
+typedef struct metaslab_group metaslab_group_t;
+
+extern void metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
+ metaslab_t **mspp, uint64_t offset, uint64_t size, uint64_t txg);
+extern void metaslab_fini(metaslab_t *msp);
+extern void metaslab_sync(metaslab_t *msp, uint64_t txg);
+extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
+
+extern int metaslab_alloc(spa_t *spa, uint64_t size, dva_t *dva, uint64_t txg);
+extern void metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg);
+extern int metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg);
+
+extern metaslab_class_t *metaslab_class_create(void);
+extern void metaslab_class_destroy(metaslab_class_t *mc);
+extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg);
+extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg);
+
+extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,
+ vdev_t *vd);
+extern void metaslab_group_destroy(metaslab_group_t *mg);
+extern void metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp,
+ uint64_t weight);
+extern void metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp);
+extern void metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp,
+ uint64_t weight);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_METASLAB_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
new file mode 100644
index 0000000000..5b1e388727
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -0,0 +1,125 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_METASLAB_IMPL_H
+#define _SYS_METASLAB_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/metaslab.h>
+#include <sys/space_map.h>
+#include <sys/vdev.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct metaslab_class {
+ metaslab_group_t *mc_rotor;
+ uint64_t mc_allocated;
+};
+
+struct metaslab_group {
+ kmutex_t mg_lock;
+ avl_tree_t mg_metaslab_tree;
+ uint64_t mg_aliquot;
+ int64_t mg_bias;
+ metaslab_class_t *mg_class;
+ vdev_t *mg_vd;
+ metaslab_group_t *mg_prev;
+ metaslab_group_t *mg_next;
+};
+
+/*
+ * Each metaslab's free block list is kept in its own DMU object in the
+ * metaslab freelist dataset. To minimize space consumption, the list
+ * is circular.
+ *
+ * Allocations and frees can happen in multiple transaction groups at
+ * the same time, which makes it a bit challenging to keep the metaslab
+ * consistent. For example, we cannot allow frees from different
+ * transaction groups to be interleaved in the metaslab's free block list.
+ *
+ * We address this in several ways:
+ *
+ * We don't allow allocations from the same metaslab in concurrent
+ * transaction groups. metaslab_alloc() enforces this by checking
+ * the ms_last_alloc field, which specifies the last txg in which
+ * the metaslab was used for allocations.
+ *
+ * We can't segregate frees this way because we can't choose which
+ * DVAs someone wants to free. So we keep separate in-core freelists
+ * for each active transaction group. This in-core data is only
+ * written to the metaslab's on-disk freelist in metaslab_sync(),
+ * which solves the interleave problem: we only append frees from
+ * the syncing txg to the on-disk freelist, so the appends all occur
+ * in txg order.
+ *
+ * We cannot allow a block which was freed in a given txg to be
+ * allocated again until that txg has closed; otherwise, if we
+ * failed to sync that txg and had to roll back to txg - 1,
+ * changes in txg + 1 could have overwritten the data. Therefore,
+ * we partition the free blocks into "available" and "limbo" states.
+ * A block is available if the txg in which it was freed has closed;
+ * until then, the block is in limbo. Each time metaslab_sync() runs,
+ * it first adds any limbo blocks to the avail list, clears the limbo
+ * list, and starts writing the new limbo blocks (i.e. the ones that
+ * were freed in the syncing txg).
+ */
+
+struct metaslab {
+ kmutex_t ms_lock; /* metaslab lock */
+ space_map_obj_t *ms_smo; /* space map object */
+ uint64_t ms_last_alloc; /* txg of last alloc */
+ uint64_t ms_usable_end; /* end of free_obj at last sync */
+ uint64_t ms_usable_space; /* usable space at last sync */
+ metaslab_group_t *ms_group; /* metaslab group */
+ avl_node_t ms_group_node; /* node in metaslab group tree */
+ uint64_t ms_weight; /* weight vs. others in group */
+ uint8_t ms_dirty[TXG_SIZE]; /* per-txg dirty flags */
+ space_map_t ms_allocmap[TXG_SIZE]; /* allocated this txg */
+ space_map_t ms_freemap[TXG_SIZE]; /* freed this txg */
+ txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
+ space_map_t ms_map; /* in-core free space map */
+ uint8_t ms_map_incore; /* space map contents are valid */
+ uint64_t ms_map_cursor[SPA_ASIZEBITS]; /* XXX -- PPD */
+};
+
+/*
+ * ms_dirty[] flags
+ */
+#define MSD_ALLOC 0x01 /* allocated from in this txg */
+#define MSD_FREE 0x02 /* freed to in this txg */
+#define MSD_ADD 0x04 /* added to the pool in this txg */
+#define MSD_CONDENSE 0x08 /* condensed in this txg */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_METASLAB_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/refcount.h b/usr/src/uts/common/fs/zfs/sys/refcount.h
new file mode 100644
index 0000000000..f9fffd2443
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/refcount.h
@@ -0,0 +1,105 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_REFCOUNT_H
+#define _SYS_REFCOUNT_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/inttypes.h>
+#include <sys/list.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * If the reference is held only by the calling function and not any
+ * particular object, use FTAG (which is a string) for the holder_tag.
+ * Otherwise, use the object that holds the reference.
+ */
+#define FTAG ((void*)__func__)
+
+#if defined(DEBUG) || !defined(_KERNEL)
+typedef struct reference {
+ list_node_t ref_link;
+ void *ref_holder;
+ uint64_t ref_number;
+ uint8_t *ref_removed;
+} reference_t;
+
+typedef struct refcount {
+ kmutex_t rc_mtx;
+ list_t rc_list;
+ list_t rc_removed;
+ int64_t rc_count;
+ int64_t rc_removed_count;
+} refcount_t;
+
+/* Note: refcount_t should be initialized to zero before use. */
+
+void refcount_create(refcount_t *rc);
+void refcount_destroy(refcount_t *rc);
+void refcount_destroy_many(refcount_t *rc, uint64_t number);
+int refcount_is_zero(refcount_t *rc);
+int64_t refcount_count(refcount_t *rc);
+int64_t refcount_add(refcount_t *rc, void *holder_tag);
+int64_t refcount_remove(refcount_t *rc, void *holder_tag);
+int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag);
+int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);
+
+void refcount_init(void);
+void refcount_fini(void);
+
+#else /* DEBUG */
+
+typedef struct refcount {
+ uint64_t rc_count;
+} refcount_t;
+
+#define refcount_create(rc) ((rc)->rc_count = 0)
+#define refcount_destroy(rc) ((rc)->rc_count = 0)
+#define refcount_destroy_many(rc, number) ((rc)->rc_count = 0)
+#define refcount_is_zero(rc) ((rc)->rc_count == 0)
+#define refcount_count(rc) ((rc)->rc_count)
+#define refcount_add(rc, holder) atomic_add_64_nv(&(rc)->rc_count, 1)
+#define refcount_remove(rc, holder) atomic_add_64_nv(&(rc)->rc_count, -1)
+#define refcount_add_many(rc, number, holder) \
+ atomic_add_64_nv(&(rc)->rc_count, number)
+#define refcount_remove_many(rc, number, holder) \
+ atomic_add_64_nv(&(rc)->rc_count, -number)
+
+#define refcount_init()
+#define refcount_fini()
+
+#endif /* DEBUG */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_REFCOUNT_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
new file mode 100644
index 0000000000..9bf0f89d49
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -0,0 +1,406 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SPA_H
+#define _SYS_SPA_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/avl.h>
+#include <sys/zfs_context.h>
+#include <sys/nvpair.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/fs/zfs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Forward references that lots of things need.
+ */
+typedef struct spa spa_t;
+typedef struct vdev vdev_t;
+typedef struct metaslab metaslab_t;
+typedef struct zilog zilog_t;
+typedef struct traverse_handle traverse_handle_t;
+struct dsl_pool;
+
+/*
+ * General-purpose 32-bit and 64-bit bitfield encodings.
+ */
+#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len))
+#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len))
+#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low))
+#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low))
+
+#define BF32_GET(x, low, len) BF32_DECODE(x, low, len)
+#define BF64_GET(x, low, len) BF64_DECODE(x, low, len)
+
+#define BF32_SET(x, low, len, val) \
+ ((x) ^= BF32_ENCODE((x >> low) ^ val, low, len))
+#define BF64_SET(x, low, len, val) \
+ ((x) ^= BF64_ENCODE((x >> low) ^ val, low, len))
+
+#define BF32_GET_SB(x, low, len, shift, bias) \
+ ((BF32_GET(x, low, len) + (bias)) << (shift))
+#define BF64_GET_SB(x, low, len, shift, bias) \
+ ((BF64_GET(x, low, len) + (bias)) << (shift))
+
+#define BF32_SET_SB(x, low, len, shift, bias, val) \
+ BF32_SET(x, low, len, ((val) >> (shift)) - (bias))
+#define BF64_SET_SB(x, low, len, shift, bias, val) \
+ BF64_SET(x, low, len, ((val) >> (shift)) - (bias))
+
+/*
+ * We currently support nine block sizes, from 512 bytes to 128K.
+ * We could go higher, but the benefits are near-zero and the cost
+ * of COWing a giant block to modify one byte would become excessive.
+ */
+#define SPA_MINBLOCKSHIFT 9
+#define SPA_MAXBLOCKSHIFT 17
+#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT)
+#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT)
+
+#define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
+
+/*
+ * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
+ * The ASIZE encoding should be at least 64 times larger (6 more bits)
+ * to support up to 4-way RAID-Z mirror mode with worst-case gang block
+ * overhead, three DVAs per bp, plus one more bit in case we do anything
+ * else that expands the ASIZE.
+ */
+#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */
+#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */
+#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */
+
+/*
+ * All SPA data is represented by 128-bit data virtual addresses (DVAs).
+ * The members of the dva_t should be considered opaque outside the SPA.
+ */
+typedef struct dva {
+ uint64_t dva_word[2];
+} dva_t;
+
+/*
+ * Each block has a 256-bit checksum -- strong enough for cryptographic hashes.
+ */
+typedef struct zio_cksum {
+ uint64_t zc_word[4];
+} zio_cksum_t;
+
+/*
+ * Each block is described by its DVAs, time of birth, checksum, etc.
+ * The word-by-word, bit-by-bit layout of the blkptr is as follows:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0 | vdev1 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 1 |G| offset1 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 2 | vdev2 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 3 |G| offset2 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 4 | vdev3 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 5 |G| offset3 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 6 |E| lvl | type | cksum | comp | PSIZE | LSIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 7 | padding |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 8 | padding |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 9 | padding |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * a | birth txg |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * b | fill count |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * c | checksum[0] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * d | checksum[1] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * e | checksum[2] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * f | checksum[3] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Legend:
+ *
+ * vdev virtual device ID
+ * offset offset into virtual device
+ * LSIZE logical size
+ * PSIZE physical size (after compression)
+ * ASIZE allocated size (including RAID-Z parity and gang block headers)
+ * GRID RAID-Z layout information (reserved for future use)
+ * cksum checksum function
+ * comp compression function
+ * G gang block indicator
+ * E endianness
+ * type DMU object type
+ * lvl level of indirection
+ * birth txg transaction group in which the block was born
+ * fill count number of non-zero blocks under this bp
+ * checksum[4] 256-bit checksum of the data this bp describes
+ */
+typedef struct blkptr {
+ dva_t blk_dva[3]; /* 128-bit Data Virtual Address */
+ uint64_t blk_prop; /* size, compression, type, etc */
+ uint64_t blk_pad[3]; /* Extra space for the future */
+ uint64_t blk_birth; /* transaction group at birth */
+ uint64_t blk_fill; /* fill count */
+ zio_cksum_t blk_cksum; /* 256-bit checksum */
+} blkptr_t;
+
+#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
+#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
+
+/*
+ * Macros to get and set fields in a bp or DVA.
+ */
+#define DVA_GET_ASIZE(dva) \
+ BF64_GET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0)
+#define DVA_SET_ASIZE(dva, x) \
+ BF64_SET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0, x)
+
+#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8)
+#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x)
+
+#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32)
+#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x)
+
+#define DVA_GET_OFFSET(dva) \
+ BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
+#define DVA_SET_OFFSET(dva, x) \
+ BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x)
+
+#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1)
+#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x)
+
+#define BP_GET_LSIZE(bp) \
+ (BP_IS_HOLE(bp) ? 0 : \
+ BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1))
+#define BP_SET_LSIZE(bp, x) \
+ BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define BP_GET_PSIZE(bp) \
+ BF64_GET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1)
+#define BP_SET_PSIZE(bp, x) \
+ BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8)
+#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x)
+
+#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8)
+#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x)
+
+#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
+#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)
+
+#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
+#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
+
+#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1))
+#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
+
+#define BP_GET_ASIZE(bp) \
+ (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+ DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+
+#define DVA_EQUAL(dva1, dva2) \
+ ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
+ (dva1)->dva_word[0] == (dva2)->dva_word[0])
+
+#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0)
+
+#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \
+{ \
+ (zcp)->zc_word[0] = w0; \
+ (zcp)->zc_word[1] = w1; \
+ (zcp)->zc_word[2] = w2; \
+ (zcp)->zc_word[3] = w3; \
+}
+
+#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0)
+
+#define BP_IDENTITY(bp) (&(bp)->blk_dva[0])
+
+#define BP_ZERO(bp) \
+{ \
+ (bp)->blk_dva[0].dva_word[0] = 0; \
+ (bp)->blk_dva[0].dva_word[1] = 0; \
+ (bp)->blk_dva[1].dva_word[0] = 0; \
+ (bp)->blk_dva[1].dva_word[1] = 0; \
+ (bp)->blk_dva[2].dva_word[0] = 0; \
+ (bp)->blk_dva[2].dva_word[1] = 0; \
+ (bp)->blk_prop = 0; \
+ (bp)->blk_pad[0] = 0; \
+ (bp)->blk_pad[1] = 0; \
+ (bp)->blk_pad[2] = 0; \
+ (bp)->blk_birth = 0; \
+ (bp)->blk_fill = 0; \
+ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
+}
+
+/*
+ * Note: the byteorder is either 0 or -1, both of which are palindromes.
+ * This simplifies the endianness handling a bit.
+ */
+#ifdef _BIG_ENDIAN
+#define ZFS_HOST_BYTEORDER (0ULL)
+#else
+#define ZFS_HOST_BYTEORDER (-1ULL)
+#endif
+
+#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)
+
+#include <sys/dmu.h>
+
+/*
+ * Routines found in spa.c
+ */
+
+/* state manipulation functions */
+extern int spa_open(const char *pool, spa_t **, void *tag);
+extern int spa_get_stats(const char *pool, nvlist_t **config);
+extern int spa_create(const char *pool, nvlist_t *config, char *altroot);
+extern int spa_import(const char *pool, nvlist_t *config, char *altroot);
+extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
+extern int spa_destroy(char *pool);
+extern int spa_export(char *pool);
+
+/* device manipulation */
+extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
+extern int spa_vdev_add_unlocked(spa_t *spa, nvlist_t *nvroot);
+extern int spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot,
+ int replacing);
+extern int spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid,
+ int replace_done);
+extern void spa_vdev_replace_done(spa_t *spa);
+
+/* scrubbing */
+extern int spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force);
+extern void spa_scrub_suspend(spa_t *spa);
+extern void spa_scrub_resume(spa_t *spa);
+extern void spa_scrub_restart(spa_t *spa, uint64_t txg);
+
+/* spa syncing */
+extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
+extern void spa_sync_allpools(void);
+
+/*
+ * SPA configuration functions in spa_config.c
+ */
+extern void spa_config_sync(void);
+extern void spa_config_load(void);
+extern nvlist_t *spa_all_configs(uint64_t *);
+extern void spa_config_set(spa_t *spa, nvlist_t *config);
+extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
+ int getstats);
+
+/*
+ * Miscellaneous SPA routines in spa_misc.c
+ */
+
+/* Namespace manipulation */
+extern spa_t *spa_lookup(const char *name);
+extern spa_t *spa_add(const char *name);
+extern void spa_remove(spa_t *spa);
+extern spa_t *spa_next(spa_t *prev);
+
+/* Refcount functions */
+extern void spa_open_ref(spa_t *spa, void *tag);
+extern void spa_close(spa_t *spa, void *tag);
+extern boolean_t spa_refcount_zero(spa_t *spa);
+
+/* Pool configuration lock */
+extern void spa_config_enter(spa_t *spa, krw_t rw);
+extern void spa_config_exit(spa_t *spa);
+extern boolean_t spa_config_held(spa_t *spa, krw_t rw);
+
+/* Pool vdev add/remove lock */
+extern uint64_t spa_vdev_enter(spa_t *spa);
+extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
+
+/* Accessor functions */
+extern krwlock_t *spa_traverse_rwlock(spa_t *spa);
+extern int spa_traverse_wanted(spa_t *spa);
+extern struct dsl_pool *spa_get_dsl(spa_t *spa);
+extern blkptr_t *spa_get_rootblkptr(spa_t *spa);
+extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
+extern void spa_altroot(spa_t *, char *, size_t);
+extern int spa_sync_pass(spa_t *spa);
+extern char *spa_name(spa_t *spa);
+extern uint64_t spa_guid(spa_t *spa);
+extern uint64_t spa_last_synced_txg(spa_t *spa);
+extern uint64_t spa_first_txg(spa_t *spa);
+extern int spa_state(spa_t *spa);
+extern uint64_t spa_freeze_txg(spa_t *spa);
+struct metaslab_class;
+extern struct metaslab_class *spa_metaslab_class_select(spa_t *spa);
+extern uint64_t spa_get_alloc(spa_t *spa);
+extern uint64_t spa_get_space(spa_t *spa);
+extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
+extern int spa_busy(void);
+
+/* Miscellaneous support routines */
+extern int spa_rename(const char *oldname, const char *newname);
+extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
+extern char *spa_strdup(const char *);
+extern void spa_strfree(char *);
+extern uint64_t spa_get_random(uint64_t range);
+extern void sprintf_blkptr(char *buf, blkptr_t *bp);
+extern void spa_freeze(spa_t *spa);
+extern void spa_evict_all(void);
+
+/* Initialization and termination */
+extern void spa_init(int flags);
+extern void spa_fini(void);
+
+#ifdef ZFS_DEBUG
+#define dprintf_bp(bp, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char __blkbuf[200]; \
+ sprintf_blkptr(__blkbuf, (bp)); \
+ dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+#else
+#define dprintf_bp(bp, fmt, ...)
+#endif
+
+extern int spa_mode; /* mode, e.g. FREAD | FWRITE */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SPA_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
new file mode 100644
index 0000000000..0fcef6c48b
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
@@ -0,0 +1,118 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SPA_IMPL_H
+#define _SYS_SPA_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/vdev.h>
+#include <sys/metaslab.h>
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/uberblock_impl.h>
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <sys/refcount.h>
+#include <sys/bplist.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct spa_config_lock {
+ kmutex_t scl_lock;
+ uint64_t scl_count;
+ kthread_t *scl_writer;
+ kcondvar_t scl_cv;
+} spa_config_lock_t;
+
+struct spa {
+ /*
+ * Fields protected by spa_namespace_lock.
+ */
+ char *spa_name;
+ avl_node_t spa_avl;
+ int spa_anon;
+ nvlist_t *spa_config;
+ uint64_t spa_config_txg; /* txg of last config change */
+ spa_config_lock_t spa_config_lock; /* configuration changes */
+ kmutex_t spa_config_cache_lock; /* for spa_config RW_READER */
+ int spa_sync_pass; /* iterate-to-convergence */
+ int spa_state; /* pool state */
+ uint8_t spa_minref; /* min refcnt of open pool */
+ uint8_t spa_traverse_wanted; /* traverse lock wanted */
+ taskq_t *spa_vdev_retry_taskq;
+ taskq_t *spa_zio_issue_taskq[ZIO_TYPES];
+ taskq_t *spa_zio_intr_taskq[ZIO_TYPES];
+ dsl_pool_t *spa_dsl_pool;
+ metaslab_class_t *spa_normal_class; /* normal data class */
+ uint64_t spa_first_txg; /* first txg after spa_open() */
+ uint64_t spa_freeze_txg; /* freeze pool at this txg */
+ objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */
+ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */
+ vdev_t *spa_root_vdev; /* top-level vdev container */
+ list_t spa_dirty_list; /* vdevs with dirty labels */
+ uint64_t spa_config_object; /* MOS object for pool config */
+ uint64_t spa_syncing_txg; /* txg currently syncing */
+ uint64_t spa_sync_bplist_obj; /* object for deferred frees */
+ bplist_t spa_sync_bplist; /* deferred-free bplist */
+ krwlock_t spa_traverse_lock; /* traverse vs. spa_sync() */
+ uberblock_t spa_ubsync; /* last synced uberblock */
+ uberblock_t spa_uberblock; /* current uberblock */
+ kmutex_t spa_scrub_lock; /* resilver/scrub lock */
+ kthread_t *spa_scrub_thread; /* scrub/resilver thread */
+ traverse_handle_t *spa_scrub_th; /* scrub traverse handle */
+ uint64_t spa_scrub_restart_txg; /* need to restart */
+ uint64_t spa_scrub_maxtxg; /* max txg we'll scrub */
+ uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */
+ uint64_t spa_scrub_errors; /* scrub I/O error count */
+ kcondvar_t spa_scrub_cv; /* scrub thread state change */
+ kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */
+ uint8_t spa_scrub_stop; /* tell scrubber to stop */
+ uint8_t spa_scrub_suspend; /* tell scrubber to suspend */
+ uint8_t spa_scrub_active; /* active or suspended? */
+ uint8_t spa_scrub_type; /* type of scrub we're doing */
+ int spa_sync_on; /* sync threads are running */
+ char *spa_root; /* alternate root directory */
+ kmutex_t spa_uberblock_lock; /* vdev_uberblock_load_done() */
+ /*
+ * spa_refcnt must be the last element because it changes size based on
+ * compilation options. In order for the MDB module to function
+ * correctly, the other fields must remain in the same location.
+ */
+ refcount_t spa_refcount; /* number of opens */
+};
+
+extern const char *spa_config_dir;
+extern kmutex_t spa_namespace_lock;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SPA_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/space_map.h b/usr/src/uts/common/fs/zfs/sys/space_map.h
new file mode 100644
index 0000000000..9f0cf83c9a
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/space_map.h
@@ -0,0 +1,144 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SPACE_MAP_H
+#define _SYS_SPACE_MAP_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/avl.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct space_map {
+ avl_tree_t sm_root; /* Root of the AVL tree */
+ uint64_t sm_start; /* Start of map (inclusive) */
+ uint64_t sm_end; /* End of map (exclusive) */
+ uint64_t sm_size; /* Size of map (end - start) */
+ uint64_t sm_shift; /* Unit shift */
+ uint64_t sm_space; /* Sum of all segments in the map */
+ kmutex_t *sm_lock; /* pointer to lock that protects map */
+} space_map_t;
+
+typedef struct space_seg {
+ avl_node_t ss_node; /* AVL node */
+ uint64_t ss_start; /* starting offset of this segment */
+ uint64_t ss_end; /* ending offset (non-inclusive) */
+} space_seg_t;
+
+typedef struct space_map_obj {
+ uint64_t smo_object; /* on-disk space map object */
+ uint64_t smo_objsize; /* size of the object */
+ uint64_t smo_alloc; /* space allocated from the map */
+} space_map_obj_t;
+
+/*
+ * debug entry
+ *
+ * 1 3 10 50
+ * ,---+--------+------------+---------------------------------.
+ * | 1 | action | syncpass | txg (lower bits) |
+ * `---+--------+------------+---------------------------------'
+ * 63 62 60 59 50 49 0
+ *
+ *
+ *
+ * non-debug entry
+ *
+ * 1 47 1 15
+ * ,-----------------------------------------------------------.
+ * | 0 | offset (sm_shift units) | type | run |
+ * `-----------------------------------------------------------'
+ * 63  62                        16   15   14               0
+ */
+
+/* All this stuff takes and returns bytes */
+#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, 15) + 1)
+#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, 15)
+#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1)
+#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1)
+#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, 47)
+#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, 47)
+#define SM_DEBUG_DECODE(x) BF64_DECODE(x, 63, 1)
+#define SM_DEBUG_ENCODE(x) BF64_ENCODE(x, 63, 1)
+
+#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 3)
+#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 3)
+
+#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10)
+#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10)
+
+#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50)
+#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50)
+
+#define SM_RUN_MAX SM_RUN_DECODE(~0ULL)
+
+#define SM_ALLOC 0x0
+#define SM_FREE 0x1
+
+/*
+ * The data for a given space map can be kept on blocks of any size.
+ * Larger blocks entail fewer i/o operations, but they also cause the
+ * DMU to keep more data in-core, and also to waste more i/o bandwidth
+ * when only a few blocks have changed since the last transaction group.
+ * This could use a lot more research, but for now, set the freelist
+ * block size to 4k (2^12).
+ */
+#define SPACE_MAP_BLOCKSHIFT 12
+
+#define SPACE_MAP_CHUNKSIZE (1<<20)
+
+typedef void space_map_func_t(space_map_t *sm, uint64_t start, uint64_t size);
+
+extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size,
+ uint64_t shift, kmutex_t *lp);
+extern void space_map_destroy(space_map_t *sm);
+extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size);
+extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size);
+extern int space_map_contains(space_map_t *sm, uint64_t start, uint64_t size);
+extern void space_map_vacate(space_map_t *sm,
+ space_map_func_t *func, space_map_t *mdest);
+extern void space_map_iterate(space_map_t *sm,
+ space_map_func_t *func, space_map_t *mdest);
+extern void space_map_merge(space_map_t *dest, space_map_t *src);
+extern void space_map_excise(space_map_t *sm, uint64_t start, uint64_t size);
+extern void space_map_union(space_map_t *smd, space_map_t *sms);
+
+extern int space_map_load(space_map_t *sm, space_map_obj_t *smo,
+ uint8_t maptype, objset_t *os, uint64_t end, uint64_t space);
+extern void space_map_sync(space_map_t *sm, space_map_t *dest,
+ space_map_obj_t *smo, uint8_t maptype, objset_t *os, dmu_tx_t *tx);
+extern void space_map_write(space_map_t *sm, space_map_obj_t *smo,
+ objset_t *os, dmu_tx_t *tx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SPACE_MAP_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/txg.h b/usr/src/uts/common/fs/zfs/sys/txg.h
new file mode 100644
index 0000000000..dae129c2e5
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/txg.h
@@ -0,0 +1,120 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_TXG_H
+#define _SYS_TXG_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define TXG_CONCURRENT_STATES 3 /* open, quiescing, syncing */
+#define TXG_SIZE 4 /* next power of 2 */
+#define TXG_MASK (TXG_SIZE - 1) /* mask for size */
+#define TXG_INITIAL TXG_SIZE /* initial txg */
+#define TXG_IDX (txg & TXG_MASK)
+
+#define TXG_WAIT 1ULL
+#define TXG_NOWAIT 2ULL
+
+typedef struct tx_cpu tx_cpu_t;
+
+typedef struct txg_handle {
+ tx_cpu_t *th_cpu;
+ uint64_t th_txg;
+} txg_handle_t;
+
+typedef struct txg_node {
+ struct txg_node *tn_next[TXG_SIZE];
+ uint8_t tn_member[TXG_SIZE];
+} txg_node_t;
+
+typedef struct txg_list {
+ kmutex_t tl_lock;
+ size_t tl_offset;
+ txg_node_t *tl_head[TXG_SIZE];
+} txg_list_t;
+
+struct dsl_pool;
+
+extern void txg_init(struct dsl_pool *dp, uint64_t txg);
+extern void txg_fini(struct dsl_pool *dp);
+extern void txg_sync_start(struct dsl_pool *dp);
+extern void txg_sync_stop(struct dsl_pool *dp);
+extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp);
+extern void txg_rele_to_quiesce(txg_handle_t *txghp);
+extern void txg_rele_to_sync(txg_handle_t *txghp);
+extern void txg_suspend(struct dsl_pool *dp);
+extern void txg_resume(struct dsl_pool *dp);
+
+/*
+ * Wait until the given transaction group has finished syncing.
+ * Try to make this happen as soon as possible (eg. kick off any
+ * necessary syncs immediately). If txg==0, wait for the currently open
+ * txg to finish syncing.
+ */
+extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg);
+
+/*
+ * Wait until the given transaction group, or one after it, is
+ * the open transaction group. Try to make this happen as soon
+ * as possible (eg. kick off any necessary syncs immediately).
+ * If txg == 0, wait for the next open txg.
+ */
+extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg);
+
+/*
+ * Returns TRUE if we are "backed up" waiting for the syncing
+ * transaction to complete; otherwise returns FALSE.
+ */
+extern int txg_stalled(struct dsl_pool *dp);
+
+/*
+ * Per-txg object lists.
+ */
+
+#define TXG_CLEAN(txg) ((txg) - 1)
+
+extern void txg_list_create(txg_list_t *tl, size_t offset);
+extern void txg_list_destroy(txg_list_t *tl);
+extern int txg_list_empty(txg_list_t *tl, uint64_t txg);
+extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg);
+extern void *txg_list_remove(txg_list_t *tl, uint64_t txg);
+extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg);
+extern int txg_list_member(txg_list_t *tl, void *p, uint64_t txg);
+extern void *txg_list_head(txg_list_t *tl, uint64_t txg);
+extern void *txg_list_next(txg_list_t *tl, void *p, uint64_t txg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_TXG_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/txg_impl.h b/usr/src/uts/common/fs/zfs/sys/txg_impl.h
new file mode 100644
index 0000000000..45a138afaa
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/txg_impl.h
@@ -0,0 +1,77 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_TXG_IMPL_H
+#define _SYS_TXG_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/txg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct tx_cpu {
+ kmutex_t tc_lock;
+ kcondvar_t tc_cv[TXG_SIZE];
+ uint64_t tc_count[TXG_SIZE];
+ char tc_pad[16];
+};
+
+typedef struct tx_state {
+ tx_cpu_t *tx_cpu; /* protects right to enter txg */
+ kmutex_t tx_sync_lock; /* protects tx_state_t */
+ krwlock_t tx_suspend;
+ uint64_t tx_open_txg; /* currently open txg id */
+ uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */
+ uint64_t tx_syncing_txg; /* currently syncing txg id */
+ uint64_t tx_synced_txg; /* last synced txg id */
+
+ uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */
+ uint64_t tx_quiesce_txg_waiting; /* txg we're waiting to open */
+
+ kcondvar_t tx_sync_more_cv;
+ kcondvar_t tx_sync_done_cv;
+ kcondvar_t tx_quiesce_more_cv;
+ kcondvar_t tx_quiesce_done_cv;
+ kcondvar_t tx_timeout_exit_cv;
+ kcondvar_t tx_exit_cv; /* wait for all threads to exit */
+
+ uint8_t tx_threads; /* number of threads */
+ uint8_t tx_exiting; /* set when we're exiting */
+
+ kthread_t *tx_sync_thread;
+ kthread_t *tx_quiesce_thread;
+ kthread_t *tx_timelimit_thread;
+} tx_state_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_TXG_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/uberblock.h b/usr/src/uts/common/fs/zfs/sys/uberblock.h
new file mode 100644
index 0000000000..93d936ae4b
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/uberblock.h
@@ -0,0 +1,50 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_UBERBLOCK_H
+#define _SYS_UBERBLOCK_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/vdev.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct uberblock uberblock_t;
+
+extern int uberblock_verify(uberblock_t *ub);
+extern int uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_UBERBLOCK_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h b/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h
new file mode 100644
index 0000000000..5bfcea097d
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h
@@ -0,0 +1,76 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_UBERBLOCK_IMPL_H
+#define _SYS_UBERBLOCK_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/uberblock.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * The uberblock version is incremented whenever an incompatible on-disk
+ * format change is made to the SPA, DMU, or ZAP.
+ *
+ * Note: the first two fields should never be moved. When a storage pool
+ * is opened, the uberblock must be read off the disk before the version
+ * can be checked. If the ub_version field is moved, we may not detect
+ * version mismatch. If the ub_magic field is moved, applications that
+ * expect the magic number in the first word won't work.
+ */
+
+#define UBERBLOCK_SHIFT (10)
+#define UBERBLOCK_SIZE (1ULL << UBERBLOCK_SHIFT)
+
+#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */
+
+#define UBERBLOCK_VERSION 1ULL
+
+struct uberblock {
+ uint64_t ub_magic; /* UBERBLOCK_MAGIC */
+ uint64_t ub_version; /* UBERBLOCK_VERSION */
+ uint64_t ub_txg; /* txg of last sync */
+ uint64_t ub_guid_sum; /* sum of all vdev guids */
+ uint64_t ub_timestamp; /* UTC time of last sync */
+ blkptr_t ub_rootbp; /* MOS objset_phys_t */
+};
+
+typedef struct uberblock_phys {
+ uberblock_t ubp_uberblock;
+ char ubp_pad[UBERBLOCK_SIZE - sizeof (uberblock_t) -
+ sizeof (zio_block_tail_t)];
+ zio_block_tail_t ubp_zbt;
+} uberblock_phys_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_UBERBLOCK_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/unique.h b/usr/src/uts/common/fs/zfs/sys/unique.h
new file mode 100644
index 0000000000..c8c177e3ca
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/unique.h
@@ -0,0 +1,56 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_UNIQUE_H
+#define _SYS_UNIQUE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* The number of significant bits in each unique value. */
+#define UNIQUE_BITS 56
+
+void unique_init(void);
+
+/* Return a new unique value. */
+uint64_t unique_create(void);
+
+/* Return a unique value, which equals the one passed in if possible. */
+uint64_t unique_insert(uint64_t value);
+
+/* Indicate that this value no longer needs to be uniquified against. */
+void unique_remove(uint64_t value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_UNIQUE_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h
new file mode 100644
index 0000000000..4113ff2ca6
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h
@@ -0,0 +1,135 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VDEV_H
+#define _SYS_VDEV_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+#include <sys/space_map.h>
+#include <sys/fs/zfs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Vdev knobs.
+ */
+typedef struct vdev_knob {
+ char *vk_name; /* knob name */
+ char *vk_desc; /* knob description */
+ uint64_t vk_min; /* minimum legal value */
+ uint64_t vk_max; /* maximum legal value */
+ uint64_t vk_default; /* default value */
+ size_t vk_offset; /* offset into vdev_t */
+} vdev_knob_t;
+
+/*
+ * Fault injection modes.
+ */
+#define VDEV_FAULT_NONE 0
+#define VDEV_FAULT_RANDOM 1
+#define VDEV_FAULT_COUNT 2
+
+extern int vdev_open(vdev_t *);
+extern void vdev_close(vdev_t *);
+extern int vdev_create(vdev_t *, uint64_t txg);
+extern void vdev_init(vdev_t *, uint64_t txg);
+extern void vdev_reopen(vdev_t *, zio_t **zq);
+
+extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
+extern vdev_t *vdev_lookup_by_path(vdev_t *vd, const char *path);
+extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
+extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size);
+extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size);
+extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
+ int scrub_done);
+
+extern const char *vdev_description(vdev_t *vd);
+
+extern void vdev_metaslab_init(vdev_t *vd, uint64_t txg);
+extern void vdev_metaslab_fini(vdev_t *vd);
+
+extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
+extern void vdev_stat_update(zio_t *zio);
+extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type,
+ boolean_t complete);
+extern void vdev_checksum_error(zio_t *zio, vdev_t *vd);
+extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec);
+extern void vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux);
+
+extern void vdev_space_update(vdev_t *vd, uint64_t space_delta,
+ uint64_t alloc_delta);
+
+extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
+
+extern void vdev_io_start(zio_t *zio);
+extern void vdev_io_done(zio_t *zio);
+
+extern int vdev_online(spa_t *spa, const char *path);
+extern int vdev_offline(spa_t *spa, const char *path);
+
+extern int vdev_error_setup(spa_t *spa, const char *path, int mode, int mask,
+ uint64_t arg);
+extern int vdev_error_inject(vdev_t *vd, zio_t *zio);
+extern int vdev_is_dead(vdev_t *vd);
+
+extern void vdev_cache_init(vdev_t *vd);
+extern void vdev_cache_fini(vdev_t *vd);
+extern int vdev_cache_read(zio_t *zio);
+extern void vdev_cache_write(zio_t *zio);
+
+extern void vdev_queue_init(vdev_t *vd);
+extern void vdev_queue_fini(vdev_t *vd);
+extern zio_t *vdev_queue_io(zio_t *zio);
+extern void vdev_queue_io_done(zio_t *zio);
+
+extern vdev_knob_t *vdev_knob_next(vdev_knob_t *vk);
+
+extern void vdev_config_dirty(vdev_t *vd);
+extern void vdev_config_clean(vdev_t *vd);
+
+extern nvlist_t *vdev_config_generate(vdev_t *vd, int getstats);
+
+/*
+ * Label routines
+ */
+struct uberblock;
+extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
+extern nvlist_t *vdev_label_read_config(vdev_t *vd);
+extern void vdev_uberblock_load(zio_t *zio, vdev_t *vd, struct uberblock *ub);
+int vdev_label_init(vdev_t *vd, uint64_t create_txg);
+extern int spa_sync_labels(spa_t *spa, uint64_t txg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_disk.h b/usr/src/uts/common/fs/zfs/sys/vdev_disk.h
new file mode 100644
index 0000000000..95536a77db
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_disk.h
@@ -0,0 +1,52 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VDEV_DISK_H
+#define _SYS_VDEV_DISK_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/vdev.h>
+#ifdef _KERNEL
+#include <sys/sunldi.h>
+#include <sys/sunddi.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct vdev_disk {
+ ddi_devid_t vd_devid;
+ char *vd_minor;
+ ldi_handle_t vd_lh;
+} vdev_disk_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_DISK_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_file.h b/usr/src/uts/common/fs/zfs/sys/vdev_file.h
new file mode 100644
index 0000000000..cd49673577
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_file.h
@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VDEV_FILE_H
+#define _SYS_VDEV_FILE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/vdev.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct vdev_file {
+ vnode_t *vf_vnode;
+} vdev_file_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_FILE_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
new file mode 100644
index 0000000000..4ae3467619
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -0,0 +1,287 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VDEV_IMPL_H
+#define _SYS_VDEV_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/avl.h>
+#include <sys/dmu.h>
+#include <sys/metaslab.h>
+#include <sys/nvpair.h>
+#include <sys/space_map.h>
+#include <sys/vdev.h>
+#include <sys/dkio.h>
+#include <sys/uberblock_impl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Virtual device descriptors.
+ *
+ * All storage pool operations go through the virtual device framework,
+ * which provides data replication and I/O scheduling.
+ */
+
+/*
+ * Forward declarations that lots of things need.
+ */
+typedef struct vdev_queue vdev_queue_t;
+typedef struct vdev_cache vdev_cache_t;
+typedef struct vdev_cache_entry vdev_cache_entry_t;
+
+/*
+ * Virtual device operations
+ */
+typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *ashift);
+typedef void vdev_close_func_t(vdev_t *vd);
+typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
+typedef void vdev_io_start_func_t(zio_t *zio);
+typedef void vdev_io_done_func_t(zio_t *zio);
+typedef void vdev_state_change_func_t(vdev_t *vd, int, int);
+
+typedef struct vdev_ops {
+ vdev_open_func_t *vdev_op_open;
+ vdev_close_func_t *vdev_op_close;
+ vdev_asize_func_t *vdev_op_asize;
+ vdev_io_start_func_t *vdev_op_io_start;
+ vdev_io_done_func_t *vdev_op_io_done;
+ vdev_state_change_func_t *vdev_op_state_change;
+ char vdev_op_type[16];
+ boolean_t vdev_op_leaf;
+} vdev_ops_t;
+
+/*
+ * Virtual device properties
+ */
+struct vdev_cache_entry {
+ char *ve_data;
+ uint64_t ve_offset;
+ uint64_t ve_lastused;
+ avl_node_t ve_offset_node;
+ avl_node_t ve_lastused_node;
+ uint32_t ve_hits;
+ uint16_t ve_missed_update;
+ zio_t *ve_fill_io;
+};
+
+struct vdev_cache {
+ uint64_t vc_size;
+ uint64_t vc_bshift;
+ uint64_t vc_blocksize;
+ uint64_t vc_max;
+ avl_tree_t vc_offset_tree;
+ avl_tree_t vc_lastused_tree;
+ kmutex_t vc_lock;
+};
+
+struct vdev_queue {
+ uint64_t vq_min_pending;
+ uint64_t vq_max_pending;
+ uint64_t vq_agg_limit;
+ uint64_t vq_time_shift;
+ uint64_t vq_ramp_rate;
+ avl_tree_t vq_deadline_tree;
+ avl_tree_t vq_read_tree;
+ avl_tree_t vq_write_tree;
+ avl_tree_t vq_pending_tree;
+ kmutex_t vq_lock;
+};
+
+/*
+ * Virtual device descriptor
+ */
+struct vdev {
+ /*
+ * Common to all vdev types.
+ */
+ uint64_t vdev_id; /* child number in vdev parent */
+ uint64_t vdev_guid; /* unique ID for this vdev */
+ uint64_t vdev_guid_sum; /* self guid + all child guids */
+ uint64_t vdev_asize; /* allocatable device capacity */
+ uint64_t vdev_ashift; /* block alignment shift */
+ uint64_t vdev_state; /* see VDEV_STATE_* #defines */
+ vdev_ops_t *vdev_ops; /* vdev operations */
+ spa_t *vdev_spa; /* spa for this vdev */
+ void *vdev_tsd; /* type-specific data */
+ vdev_t *vdev_top; /* top-level vdev */
+ vdev_t *vdev_parent; /* parent vdev */
+ vdev_t **vdev_child; /* array of children */
+ uint64_t vdev_children; /* number of children */
+ space_map_t vdev_dtl_map; /* dirty time log in-core state */
+ space_map_t vdev_dtl_scrub; /* DTL for scrub repair writes */
+ vdev_stat_t vdev_stat; /* virtual device statistics */
+
+ /*
+ * Top-level vdev state.
+ */
+ uint64_t vdev_ms_array; /* metaslab array object */
+ uint64_t vdev_ms_shift; /* metaslab size shift */
+ uint64_t vdev_ms_count; /* number of metaslabs */
+ metaslab_group_t *vdev_mg; /* metaslab group */
+ metaslab_t **vdev_ms; /* metaslab array */
+ space_map_obj_t *vdev_smo; /* metaslab space map array */
+ txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */
+ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
+ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
+ uint8_t vdev_dirty[TXG_SIZE]; /* per-txg dirty flags */
+ int vdev_is_dirty; /* on config dirty list? */
+ list_node_t vdev_dirty_node; /* config dirty list */
+ zio_t *vdev_io_retry; /* I/O retry list */
+ list_t vdev_io_pending; /* I/O pending list */
+
+ /*
+ * Leaf vdev state.
+ */
+ uint64_t vdev_psize; /* physical device capacity */
+ space_map_obj_t vdev_dtl; /* dirty time log on-disk state */
+ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */
+ char *vdev_path; /* vdev path (if any) */
+ char *vdev_devid; /* vdev devid (if any) */
+ uint64_t vdev_fault_arg; /* fault injection parameter */
+ int vdev_fault_mask; /* zio types to fault */
+ uint8_t vdev_fault_mode; /* fault injection mode */
+ uint8_t vdev_cache_active; /* vdev_cache and vdev_queue */
+ uint8_t vdev_offline; /* device taken offline? */
+ uint8_t vdev_detached; /* device detached? */
+ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
+ vdev_cache_t vdev_cache; /* physical block cache */
+
+ /*
+ * For DTrace to work in userland (libzpool) context, these fields must
+ * remain at the end of the structure. DTrace will use the kernel's
+ * CTF definition for 'struct vdev', and since the size of a kmutex_t is
+ * larger in userland, the offsets for the rest of the fields would be
+ * incorrect.
+ */
+ kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */
+ kmutex_t vdev_dirty_lock; /* vdev_dirty[] */
+ kmutex_t vdev_io_lock; /* vdev_io_pending list */
+ kcondvar_t vdev_io_cv; /* vdev_io_pending list empty? */
+ kmutex_t vdev_stat_lock; /* vdev_stat */
+};
+
+#define VDEV_SKIP_SIZE (8 << 10)
+#define VDEV_BOOT_HEADER_SIZE (8 << 10)
+#define VDEV_PHYS_SIZE (112 << 10)
+#define VDEV_UBERBLOCKS ((128 << 10) >> UBERBLOCK_SHIFT)
+
+#define VDEV_BOOT_MAGIC 0x2f5b007b10c /* ZFS boot block */
+#define VDEV_BOOT_VERSION 1 /* version number */
+
+typedef struct vdev_boot_header {
+ uint64_t vb_magic; /* VDEV_BOOT_MAGIC */
+ uint64_t vb_version; /* VDEV_BOOT_VERSION */
+ uint64_t vb_offset; /* start offset (bytes) */
+ uint64_t vb_size; /* size (bytes) */
+ char vb_pad[VDEV_BOOT_HEADER_SIZE - 4 * sizeof (uint64_t)];
+} vdev_boot_header_t;
+
+typedef struct vdev_phys {
+ char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)];
+ zio_block_tail_t vp_zbt;
+} vdev_phys_t;
+
+typedef struct vdev_label {
+ char vl_pad[VDEV_SKIP_SIZE]; /* 8K */
+ vdev_boot_header_t vl_boot_header; /* 8K */
+ vdev_phys_t vl_vdev_phys; /* 120K */
+ uberblock_phys_t vl_uberblock[VDEV_UBERBLOCKS]; /* 128K */
+} vdev_label_t; /* 256K total */
+
+/*
+ * Size and offset of embedded boot loader region on each label.
+ * The total size of the first two labels plus the boot area is 4MB.
+ */
+#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t))
+#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */
+
+/*
+ * vdev_dirty[] flags
+ */
+#define VDD_ALLOC 0x01 /* allocated from in this txg */
+#define VDD_FREE 0x02 /* freed to in this txg */
+#define VDD_ADD 0x04 /* added to the pool in this txg */
+#define VDD_DTL 0x08 /* dirty time log entry in this txg */
+
+/*
+ * Size of label regions at the start and end of each leaf device.
+ */
+#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
+#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t))
+#define VDEV_LABELS 4
+
+#define VDEV_ALLOC_LOAD 0
+#define VDEV_ALLOC_ADD 1
+
+/*
+ * Allocate or free a vdev
+ */
+extern vdev_t *vdev_alloc(spa_t *spa, nvlist_t *config, vdev_t *parent,
+ uint_t id, int alloctype);
+extern void vdev_free(vdev_t *vd);
+
+/*
+ * Add or remove children and parents
+ */
+extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd);
+extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd);
+extern void vdev_compact_children(vdev_t *pvd);
+extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops);
+extern void vdev_remove_parent(vdev_t *cvd);
+
+/*
+ * vdev sync load and sync
+ */
+extern int vdev_load(vdev_t *vd, int import);
+extern void vdev_sync(vdev_t *vd, uint64_t txg);
+extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
+extern void vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg);
+
+/*
+ * Available vdev types.
+ */
+extern vdev_ops_t vdev_root_ops;
+extern vdev_ops_t vdev_mirror_ops;
+extern vdev_ops_t vdev_replacing_ops;
+extern vdev_ops_t vdev_raidz_ops;
+extern vdev_ops_t vdev_disk_ops;
+extern vdev_ops_t vdev_file_ops;
+extern vdev_ops_t vdev_missing_ops;
+
+/*
+ * Common asize function
+ */
+extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zap.h b/usr/src/uts/common/fs/zfs/sys/zap.h
new file mode 100644
index 0000000000..94ad0ffebe
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zap.h
@@ -0,0 +1,353 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZAP_H
+#define _SYS_ZAP_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * ZAP - ZFS Attribute Processor
+ *
+ * The ZAP is a module which sits on top of the DMU (Data Management
+ * Unit) and implements a higher-level storage primitive using DMU
+ * objects. Its primary consumer is the ZPL (ZFS Posix Layer).
+ *
+ * A "zapobj" is a DMU object which the ZAP uses to store attributes.
+ * Users should use only zap routines to access a zapobj - they should
+ * not access the DMU object directly using DMU routines.
+ *
+ * The attributes stored in a zapobj are name-value pairs. The name is
+ * a zero-terminated string of up to 256 bytes (including terminating
+ * NULL). The value is an array of integers (whose length is limited
+ * only by the size of the zapobj). The integers may be 1, 2, 4, or 8
+ * bytes long. Note that an 8-byte integer value can be used to store
+ * the location (object number) of another dmu object (which may be
+ * itself a zapobj). Note that you can use a zero-length attribute to
+ * store a single bit of information - the attribute is present or not.
+ *
+ * The ZAP routines are thread-safe. However, you must observe the
+ * DMU's restriction that a transaction may not be operated on
+ * concurrently.
+ *
+ * Any of the routines that return an int may return an I/O error (EIO
+ * or ECHECKSUM).
+ *
+ *
+ * Implementation / Performance Notes:
+ *
+ * The ZAP is intended to operate most efficiently on attributes with
+ * short (23 bytes or less) names and short (23 bytes or less) values.
+ * The ZAP should be efficient enough so that the user does not need to
+ * cache these attributes.
+ *
+ * Using extremely long (~256 bytes or more) attribute names or
+ * values will result in poor performance, due to the memcpy from the
+ * user's buffer into the ZAP object. This penalty can be avoided by
+ * creating an integer-type attribute to store an object number, and
+ * accessing that object using the DMU directly.
+ *
+ * The ZAP's locking scheme makes its routines thread-safe. Operations
+ * on different zapobjs will be processed concurrently. Operations on
+ * the same zapobj which only read data will be processed concurrently.
+ * Operations on the same zapobj which modify data will be processed
+ * concurrently when there are many attributes in the zapobj (because
+ * the ZAP uses per-block locking - more than 32 * (number of cpus)
+ * small attributes will suffice).
+ */
+
+/*
+ * We're using zero-terminated byte strings (ie. ASCII or UTF-8 C
+ * strings) for the names of attributes, rather than a byte string
+ * bounded by an explicit length. If some day we want to support names
+ * in character sets which have embedded zeros (eg. UTF-16, UTF-32),
+ * we'll have to add routines for using length-bounded strings.
+ */
+
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Create a new zapobj with no attributes and return its object number.
+ */
+uint64_t zap_create(objset_t *ds, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+
+/*
+ * Create a new zapobj with no attributes from the given (unallocated)
+ * object number.
+ */
+int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+
+/*
+ * The zapobj passed in must be a valid ZAP object for all of the
+ * following routines.
+ */
+
+/*
+ * Destroy this zapobj and all its attributes.
+ *
+ * Frees the object number using dmu_object_free.
+ */
+int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx);
+
+/*
+ * Manipulate attributes.
+ *
+ * 'integer_size' is in bytes, and must be 1, 2, 4, or 8.
+ */
+
+/*
+ * Retrieve the contents of the attribute with the given name.
+ *
+ * If the requested attribute does not exist, the call will fail and
+ * return ENOENT.
+ *
+ * If 'integer_size' is smaller than the attribute's integer size, the
+ * call will fail and return EINVAL.
+ *
+ * If 'integer_size' is equal to or larger than the attribute's integer
+ * size, the call will succeed and return 0. When converting to a
+ * larger integer size, the integers will be treated as unsigned (ie. no
+ * sign-extension will be performed).
+ *
+ * 'num_integers' is the length (in integers) of 'buf'.
+ *
+ * If the attribute is longer than the buffer, as many integers as will
+ * fit will be transferred to 'buf'. If the entire attribute was not
+ * transferred, the call will return EOVERFLOW.
+ */
+int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf);
+
+/*
+ * Create an attribute with the given name and value.
+ *
+ * If an attribute with the given name already exists, the call will
+ * fail and return EEXIST.
+ */
+int zap_add(objset_t *ds, uint64_t zapobj, const char *name,
+ int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx);
+
+/*
+ * Set the attribute with the given name to the given value. If an
+ * attribute with the given name does not exist, it will be created. If
+ * an attribute with the given name already exists, the previous value
+ * will be overwritten. The integer_size may be different from the
+ * existing attribute's integer size, in which case the attribute's
+ * integer size will be updated to the new value.
+ */
+int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
+
+/*
+ * Get the length (in integers) and the integer size of the specified
+ * attribute.
+ *
+ * If the requested attribute does not exist, the call will fail and
+ * return ENOENT.
+ */
+int zap_length(objset_t *ds, uint64_t zapobj, const char *name,
+ uint64_t *integer_size, uint64_t *num_integers);
+
+/*
+ * Remove the specified attribute.
+ *
+ * If the specified attribute does not exist, the call will fail and
+ * return ENOENT.
+ */
+int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx);
+
+/*
+ * Returns (in *count) the number of attributes in the specified zap
+ * object.
+ */
+int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count);
+
+
+/*
+ * Returns (in name) the name of the entry whose value
+ * (za_first_integer) is value, or ENOENT if not found. The string
+ * pointed to by name must be at least 256 bytes long.
+ */
+int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name);
+
+typedef struct zap_cursor {
+ /* This structure is opaque! */
+ objset_t *zc_objset;
+ uint64_t zc_zapobj;
+ uint64_t zc_hash;
+ uint32_t zc_cd;
+} zap_cursor_t;
+
+typedef struct {
+ int za_integer_length;
+ uint64_t za_num_integers;
+ uint64_t za_first_integer; /* no sign extension for <8byte ints */
+ char za_name[MAXNAMELEN];
+} zap_attribute_t;
+
+/*
+ * The interface for listing all the attributes of a zapobj can be
+ * thought of as cursor moving down a list of the attributes one by
+ * one. The cookie returned by the zap_cursor_serialize routine is
+ * persistent across system calls (and across reboot, even).
+ */
+
+/*
+ * Initialize a zap cursor, pointing to the "first" attribute of the
+ * zapobj.
+ */
+void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj);
+
+/*
+ * Get the attribute currently pointed to by the cursor. Returns
+ * ENOENT if at the end of the attributes.
+ */
+int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za);
+
+/*
+ * Advance the cursor to the next attribute.
+ */
+void zap_cursor_advance(zap_cursor_t *zc);
+
+/*
+ * Get a persistent cookie pointing to the current position of the zap
+ * cursor. The low 4 bits in the cookie are always zero, and thus can
+ * be used to differentiate a serialized cookie from a different type
+ * of value. The cookie will be less than 2^32 as long as there are
+ * fewer than 2^22 (4.2 million) entries in the zap object.
+ */
+uint64_t zap_cursor_serialize(zap_cursor_t *zc);
+
+/*
+ * Initialize a zap cursor pointing to the position recorded by
+ * zap_cursor_serialize (in the "serialized" argument). You can also
+ * use a "serialized" argument of 0 to start at the beginning of the
+ * zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to
+ * zap_cursor_init(...).)
+ */
+void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds,
+ uint64_t zapobj, uint64_t serialized);
+
+
+#define ZAP_HISTOGRAM_SIZE 10
+
+typedef struct zap_stats {
+ /*
+ * Size of the pointer table (in number of entries).
+ * This is always a power of 2, or zero if it's a microzap.
+ * In general, it should be considerably greater than zs_num_leafs.
+ */
+ uint64_t zs_ptrtbl_len;
+
+ uint64_t zs_blocksize; /* size of zap blocks */
+
+ uint64_t zs_num_leafs; /* The number of leaf blocks */
+
+ uint64_t zs_num_entries; /* The number of zap entries */
+
+ /*
+ * The number of blocks used. Note that some blocks may be
+ * wasted because old ptrtbl's and large name/value blocks are
+ * not reused. (Although their space is reclaimed, we don't
+ * reuse those offsets in the object.)
+ */
+ uint64_t zs_num_blocks;
+
+ /* The number of blocks used for large names or values */
+ uint64_t zs_num_blocks_large;
+
+ /*
+ * Histograms. For all histograms, the last index
+ * (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater
+ * than what can be represented. For example
+ * zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE-1] is the number
+ * of leafs with more than 45 entries.
+ */
+
+ /*
+ * zs_leafs_with_2n_pointers[n] is the number of leafs with
+ * 2^n pointers to it.
+ */
+ uint64_t zs_leafs_with_2n_pointers[ZAP_HISTOGRAM_SIZE];
+
+ /*
+ * zs_leafs_with_n_chained[n] is the number of leafs with n
+ * chained blocks. zs_leafs_with_n_chained[0] (leafs with no
+ * chained blocks) should be very close to zs_num_leafs.
+ */
+ uint64_t zs_leafs_with_n_chained[ZAP_HISTOGRAM_SIZE];
+
+ /*
+ * zs_blocks_with_n5_entries[n] is the number of leafs with
+ * [n*5, (n+1)*5) entries. In the current implementation, there
+ * can be at most 55 entries in any block, but there may be
+ * fewer if the name or value is large, or the block is not
+ * completely full.
+ */
+ uint64_t zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE];
+
+ /*
+ * zs_blocks_n_tenths_full[n] is the number of leafs whose
+ * fullness is in the range [n/10, (n+1)/10).
+ */
+ uint64_t zs_blocks_n_tenths_full[ZAP_HISTOGRAM_SIZE];
+
+ /*
+ * zs_entries_using_n_chunks[n] is the number of entries which
+ * consume n 24-byte chunks. (Note, large names/values only use
+ * one chunk, but contribute to zs_num_blocks_large.)
+ */
+ uint64_t zs_entries_using_n_chunks[ZAP_HISTOGRAM_SIZE];
+
+ /*
+ * zs_buckets_with_n_entries[n] is the number of buckets (each
+ * leaf has 64 buckets) with n entries.
+ * zs_buckets_with_n_entries[1] should be very close to
+ * zs_num_entries.
+ */
+ uint64_t zs_buckets_with_n_entries[ZAP_HISTOGRAM_SIZE];
+} zap_stats_t;
+
+/*
+ * Get statistics about a ZAP object. Note: you need to be aware of the
+ * internal implementation of the ZAP to correctly interpret some of the
+ * statistics. This interface shouldn't be relied on unless you really
+ * know what you're doing.
+ */
+int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZAP_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zap_impl.h b/usr/src/uts/common/fs/zfs/sys/zap_impl.h
new file mode 100644
index 0000000000..6593e20a14
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zap_impl.h
@@ -0,0 +1,190 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZAP_IMPL_H
+#define _SYS_ZAP_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zap.h>
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZAP_MAGIC 0x2F52AB2AB
+
+#define ZAP_BLOCK_SHIFT 17
+
+#define ZAP_MAXCD (uint32_t)(-1)
+#define ZAP_HASHBITS 28
+#define MZAP_ENT_LEN 64
+#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2)
+#define MZAP_MAX_BLKSHIFT ZAP_BLOCK_SHIFT
+#define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT)
+
+typedef struct mzap_ent_phys {
+ uint64_t mze_value;
+ uint32_t mze_cd;
+ uint16_t mze_pad; /* in case we want to chain them someday */
+ char mze_name[MZAP_NAME_LEN];
+} mzap_ent_phys_t;
+
+typedef struct mzap_phys {
+ uint64_t mz_block_type; /* ZBT_MICRO */
+ uint64_t mz_salt;
+ uint64_t mz_pad[6];
+ mzap_ent_phys_t mz_chunk[1];
+ /* actually variable size depending on block size */
+} mzap_phys_t;
+
+typedef struct mzap_ent {
+ avl_node_t mze_node;
+ int mze_chunkid;
+ uint64_t mze_hash;
+ mzap_ent_phys_t mze_phys;
+} mzap_ent_t;
+
+
+/*
+ * The (fat) zap is stored in one object. It is an array of
+ * 1<<ZAP_BLOCK_SHIFT byte blocks. The layout looks like one of:
+ *
+ * ptrtbl fits in first block:
+ * [zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ...
+ *
+ * ptrtbl too big for first block:
+ * [zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ...
+ *
+ */
+
+struct dmu_buf;
+struct zap_leaf;
+
+#define ZBT_LEAF ((1ULL << 63) + 0)
+#define ZBT_HEADER ((1ULL << 63) + 1)
+#define ZBT_MICRO ((1ULL << 63) + 3)
+/* any other values are ptrtbl blocks */
+
+/* 1/2 the block size */
+#define ZAP_PTRTBL_MIN_SHIFT (ZAP_BLOCK_SHIFT - 3 - 1)
+
+/*
+ * TAKE NOTE:
+ * If zap_phys_t is modified, zap_byteswap() must be modified.
+ */
+typedef struct zap_phys {
+ uint64_t zap_block_type; /* ZBT_HEADER */
+ uint64_t zap_magic; /* ZAP_MAGIC */
+
+ struct zap_table_phys {
+ uint64_t zt_blk; /* starting block number */
+ uint64_t zt_numblks; /* number of blocks */
+ uint64_t zt_shift; /* bits to index it */
+ uint64_t zt_nextblk; /* next (larger) copy start block */
+ uint64_t zt_blks_copied; /* number source blocks copied */
+ } zap_ptrtbl;
+
+ uint64_t zap_freeblk; /* the next free block */
+ uint64_t zap_num_leafs; /* number of leafs */
+ uint64_t zap_num_entries; /* number of entries */
+ uint64_t zap_salt; /* salt to stir into hash function */
+ uint64_t zap_pad[8181];
+ uint64_t zap_leafs[1 << ZAP_PTRTBL_MIN_SHIFT];
+} zap_phys_t;
+
+typedef struct zap_table_phys zap_table_phys_t;
+
+typedef struct zap {
+ objset_t *zap_objset;
+ uint64_t zap_object;
+ struct dmu_buf *zap_dbuf;
+ krwlock_t zap_rwlock;
+ int zap_ismicro;
+ uint64_t zap_salt;
+ union {
+ struct {
+ zap_phys_t *zap_phys;
+
+ /*
+ * zap_num_entries_mtx protects
+ * zap_num_entries
+ */
+ kmutex_t zap_num_entries_mtx;
+ } zap_fat;
+ struct {
+ mzap_phys_t *zap_phys;
+ int16_t zap_num_entries;
+ int16_t zap_num_chunks;
+ int16_t zap_alloc_next;
+ avl_tree_t zap_avl;
+ } zap_micro;
+ } zap_u;
+} zap_t;
+
+#define zap_f zap_u.zap_fat
+#define zap_m zap_u.zap_micro
+
+uint64_t zap_hash(zap_t *zap, const char *name);
+int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
+ krw_t lti, int fatreader, zap_t **zapp);
+void zap_unlockdir(zap_t *zap);
+void zap_pageout(dmu_buf_t *db, void *vmzap);
+
+void zap_print(zap_t *);
+struct zap_leaf *zap_create_leaf(zap_t *zd, dmu_tx_t *tx);
+void zap_destroy_leaf(zap_t *zap, struct zap_leaf *l, dmu_tx_t *tx);
+uint64_t zap_allocate_blocks(zap_t *zap, int nblocks, dmu_tx_t *tx);
+
+#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
+
+void fzap_byteswap(void *buf, size_t size);
+int fzap_count(zap_t *zap, uint64_t *count);
+int fzap_lookup(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf);
+int fzap_add(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx);
+int fzap_update(zap_t *zap, const char *name,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
+int fzap_length(zap_t *zap, const char *name,
+ uint64_t *integer_size, uint64_t *num_integers);
+int fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx);
+int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za);
+void fzap_get_stats(zap_t *zap, zap_stats_t *zs);
+
+int fzap_add_cd(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers,
+ const void *val, uint32_t cd, dmu_tx_t *tx, struct zap_leaf **lp);
+void fzap_upgrade(zap_t *zap, dmu_tx_t *tx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZAP_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zap_leaf.h b/usr/src/uts/common/fs/zfs/sys/zap_leaf.h
new file mode 100644
index 0000000000..aee70ae633
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zap_leaf.h
@@ -0,0 +1,204 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZAP_LEAF_H
+#define _SYS_ZAP_LEAF_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct zap;
+
+#define ZAP_LEAF_MAGIC 0x2AB1EAF
+
+/* chunk size = 24 bytes */
+
+#define ZAP_LEAF_NUMCHUNKS 5118
+#define ZAP_LEAF_ARRAY_BYTES 21
+#define ZAP_LEAF_HASH_SHIFT 12
+#define ZAP_LEAF_HASH_NUMENTRIES (1 << ZAP_LEAF_HASH_SHIFT)
+#define ZAP_LLA_DATA_BYTES ((1 << ZAP_BLOCK_SHIFT) - 16)
+
+typedef enum zap_entry_type {
+ ZAP_LEAF_FREE = 253,
+ ZAP_LEAF_ENTRY = 252,
+ ZAP_LEAF_ARRAY = 251,
+ ZAP_LEAF_TYPE_MAX = 250
+} zap_entry_type_t;
+
+/*
+ * TAKE NOTE:
+ * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified.
+ */
+typedef struct zap_leaf_phys {
+ struct zap_leaf_header {
+ uint64_t lhr_block_type; /* ZBT_LEAF */
+ uint64_t lhr_next; /* next block in leaf chain */
+ uint64_t lhr_prefix;
+ uint32_t lhr_magic; /* ZAP_LEAF_MAGIC */
+ uint16_t lhr_nfree; /* number free chunks */
+ uint16_t lhr_nentries; /* number of entries */
+ uint16_t lhr_prefix_len;
+
+#define lh_block_type l_phys->l_hdr.lhr_block_type
+#define lh_magic l_phys->l_hdr.lhr_magic
+#define lh_next l_phys->l_hdr.lhr_next
+#define lh_prefix l_phys->l_hdr.lhr_prefix
+#define lh_nfree l_phys->l_hdr.lhr_nfree
+#define lh_prefix_len l_phys->l_hdr.lhr_prefix_len
+#define lh_nentries l_phys->l_hdr.lhr_nentries
+
+/* above is accessible to zap, below is zap_leaf private */
+
+ uint16_t lh_freelist; /* chunk head of free list */
+ uint8_t lh_pad2[12];
+ } l_hdr; /* 2 24-byte chunks */
+
+ uint16_t l_hash[ZAP_LEAF_HASH_NUMENTRIES];
+ /* 170 24-byte chunks plus 16 bytes leftover space */
+
+ union zap_leaf_chunk {
+ struct zap_leaf_entry {
+ uint8_t le_type; /* always ZAP_LEAF_ENTRY */
+ uint8_t le_int_size; /* size of ints */
+ uint16_t le_next; /* next entry in hash chain */
+ uint16_t le_name_chunk; /* first chunk of the name */
+ uint16_t le_name_length; /* bytes in name, incl null */
+ uint16_t le_value_chunk; /* first chunk of the value */
+ uint16_t le_value_length; /* value length in ints */
+ uint32_t le_cd; /* collision differentiator */
+ uint64_t le_hash; /* hash value of the name */
+ } l_entry;
+ struct zap_leaf_array {
+ uint8_t la_type;
+ uint8_t la_array[ZAP_LEAF_ARRAY_BYTES];
+ uint16_t la_next; /* next blk or CHAIN_END */
+ } l_array;
+ struct zap_leaf_free {
+ uint8_t lf_type; /* always ZAP_LEAF_FREE */
+ uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES];
+ uint16_t lf_next; /* next in free list, or CHAIN_END */
+ } l_free;
+ } l_chunk[ZAP_LEAF_NUMCHUNKS];
+} zap_leaf_phys_t;
+
+typedef struct zap_leaf {
+ krwlock_t l_rwlock; /* only used on head of chain */
+ uint64_t l_blkid; /* 1<<ZAP_BLOCK_SHIFT byte block off */
+ struct zap_leaf *l_next; /* next in chain */
+ dmu_buf_t *l_dbuf;
+ zap_leaf_phys_t *l_phys;
+} zap_leaf_t;
+
+
+typedef struct zap_entry_handle {
+ /* below is set by zap_leaf.c and is public to zap.c */
+ uint64_t zeh_num_integers;
+ uint64_t zeh_hash;
+ uint32_t zeh_cd;
+ uint8_t zeh_integer_size;
+
+ /* below is private to zap_leaf.c */
+ uint16_t zeh_fakechunk;
+ uint16_t *zeh_chunkp;
+ zap_leaf_t *zeh_head_leaf;
+ zap_leaf_t *zeh_found_leaf;
+} zap_entry_handle_t;
+
+/*
+ * Return a handle to the named entry, or ENOENT if not found. The hash
+ * value must equal zap_hash(name).
+ */
+extern int zap_leaf_lookup(zap_leaf_t *l,
+ const char *name, uint64_t h, zap_entry_handle_t *zeh);
+
+/*
+ * Return a handle to the entry with this hash+cd, or the entry with the
+ * next closest hash+cd.
+ */
+extern int zap_leaf_lookup_closest(zap_leaf_t *l,
+ uint64_t hash, uint32_t cd, zap_entry_handle_t *zeh);
+
+/*
+ * Read the first num_integers in the attribute. Integer size
+ * conversion will be done without sign extension. Return EINVAL if
+ * integer_size is too small. Return EOVERFLOW if there are more than
+ * num_integers in the attribute.
+ */
+extern int zap_entry_read(const zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, void *buf);
+
+extern int zap_entry_read_name(const zap_entry_handle_t *zeh,
+ uint16_t buflen, char *buf);
+
+/*
+ * Replace the value of an existing entry.
+ *
+ * zap_entry_update may fail if it runs out of space (ENOSPC).
+ */
+extern int zap_entry_update(zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, const void *buf);
+
+/*
+ * Remove an entry.
+ */
+extern void zap_entry_remove(zap_entry_handle_t *zeh);
+
+/*
+ * Create an entry. An equal entry must not exist, and this entry must
+ * belong in this leaf (according to its hash value). Fills in the
+ * entry handle on success. Returns 0 on success or ENOSPC on failure.
+ */
+extern int zap_entry_create(zap_leaf_t *l,
+ const char *name, uint64_t h, uint32_t cd,
+ uint8_t integer_size, uint64_t num_integers, const void *buf,
+ zap_entry_handle_t *zeh);
+
+/*
+ * Other stuff.
+ */
+
+extern void zap_leaf_init(zap_leaf_t *l);
+extern void zap_leaf_byteswap(zap_leaf_phys_t *buf);
+
+extern zap_leaf_t *zap_leaf_split(struct zap *zap, zap_leaf_t *l, dmu_tx_t *tx);
+
+extern int zap_leaf_merge(zap_leaf_t *l, zap_leaf_t *sibling);
+
+extern zap_leaf_t *zap_leaf_chainmore(zap_leaf_t *l, zap_leaf_t *nl);
+
+extern int zap_leaf_advance(zap_leaf_t *l, zap_cursor_t *zc);
+
+extern void zap_stats_leaf(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZAP_LEAF_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_acl.h b/usr/src/uts/common/fs/zfs/sys/zfs_acl.h
new file mode 100644
index 0000000000..0050316eb5
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_acl.h
@@ -0,0 +1,113 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FS_ZFS_ACL_H
+#define _SYS_FS_ZFS_ACL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef _KERNEL
+#include <sys/isa_defs.h>
+#include <sys/types32.h>
+#endif
+#include <sys/acl.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct znode_phys;
+
+#define ACCESS_UNDETERMINED -1
+
+#define ACE_SLOT_CNT 6
+
+typedef struct zfs_znode_acl {
+ uint64_t z_acl_extern_obj; /* ext acl pieces */
+ uint32_t z_acl_count; /* Number of ACEs */
+ uint16_t z_acl_version; /* acl version */
+ uint16_t z_acl_state; /* goop */
+ ace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */
+} zfs_znode_acl_t;
+
+#define ACL_DATA_ALLOCED 0x1
+
+/*
+ * Max ACL size is prepended deny for all entries + the
+ * canonical six tacked on the end.
+ */
+#define MAX_ACL_SIZE (MAX_ACL_ENTRIES * 2 + 6)
+
+typedef struct zfs_acl {
+ int z_slots; /* number of allocated slots for ACEs */
+ int z_acl_count;
+ uint_t z_state;
+ ace_t *z_acl;
+} zfs_acl_t;
+
+#define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt))
+
+/*
+ * Property values for acl_mode and acl_inherit.
+ *
+ * acl_mode can take discard, noallow, groupmask and passthrough.
+ * whereas acl_inherit has secure instead of groupmask.
+ */
+
+#define DISCARD 0
+#define NOALLOW 1
+#define GROUPMASK 2
+#define PASSTHROUGH 3
+#define SECURE 4
+
+struct znode;
+
+#ifdef _KERNEL
+void zfs_perm_init(struct znode *, struct znode *, int, vattr_t *,
+ dmu_tx_t *, cred_t *);
+int zfs_getacl(struct znode *, vsecattr_t *, cred_t *);
+int zfs_mode_update(struct znode *, uint64_t, dmu_tx_t *);
+int zfs_setacl(struct znode *, vsecattr_t *, cred_t *);
+void zfs_acl_rele(void *);
+void zfs_ace_byteswap(ace_t *, int);
+extern int zfs_zaccess(struct znode *, int, cred_t *);
+extern int zfs_zaccess_rwx(struct znode *, mode_t, cred_t *);
+extern int zfs_acl_access(struct znode *, int, cred_t *);
+int zfs_acl_chmod_setattr(struct znode *, uint64_t, dmu_tx_t *);
+int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *);
+int zfs_zaccess_rename(struct znode *, struct znode *,
+ struct znode *, struct znode *, cred_t *cr);
+int zfs_zaccess_v4_perm(struct znode *, int, cred_t *);
+void zfs_acl_free(zfs_acl_t *);
+zfs_acl_t *zfs_acl_node_read(struct znode *);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* _SYS_FS_ZFS_ACL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_context.h b/usr/src/uts/common/fs/zfs/sys/zfs_context.h
new file mode 100644
index 0000000000..2f0e3e792d
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_context.h
@@ -0,0 +1,71 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZFS_CONTEXT_H
+#define _SYS_ZFS_CONTEXT_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/note.h>
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/atomic.h>
+#include <sys/sysmacros.h>
+#include <sys/bitmap.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/taskq.h>
+#include <sys/buf.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/cpuvar.h>
+#include <sys/kobj.h>
+#include <sys/conf.h>
+#include <sys/disp.h>
+#include <sys/debug.h>
+#include <sys/random.h>
+#include <sys/byteorder.h>
+#include <sys/systm.h>
+#include <sys/list.h>
+#include <sys/uio.h>
+#include <sys/dirent.h>
+#include <sys/time.h>
+#include <vm/seg_kmem.h>
+#include <sys/zone.h>
+#include <sys/uio.h>
+#include <sys/zfs_debug.h>
+
+#define CPU_SEQID (CPU->cpu_seqid)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_CONTEXT_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ctldir.h b/usr/src/uts/common/fs/zfs/sys/zfs_ctldir.h
new file mode 100644
index 0000000000..78d82ccbe2
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ctldir.h
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZFS_CTLDIR_H
+#define _ZFS_CTLDIR_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/pathname.h>
+#include <sys/vnode.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZFS_CTLDIR_NAME ".zfs"
+
+#define zfs_has_ctldir(zdp) \
+ ((zdp)->z_id == (zdp)->z_zfsvfs->z_root && \
+ ((zdp)->z_zfsvfs->z_ctldir != NULL))
+#define zfs_show_ctldir(zdp) \
+ (zfs_has_ctldir(zdp) && \
+ ((zdp)->z_zfsvfs->z_show_ctldir))
+
+void zfsctl_create(zfsvfs_t *);
+void zfsctl_destroy(zfsvfs_t *);
+vnode_t *zfsctl_root(znode_t *);
+void zfsctl_init(void);
+void zfsctl_fini(void);
+
+int zfsctl_rename_snapshot(const char *from, const char *to);
+int zfsctl_destroy_snapshot(const char *snapname, int force);
+int zfsctl_umount_snapshots(vfs_t *, int, cred_t *);
+
+int zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
+ int flags, vnode_t *rdir, cred_t *cr);
+
+int zfsctl_make_fid(zfsvfs_t *zfsvfsp, uint64_t object, uint32_t gen,
+ fid_t *fidp);
+int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp);
+
+#define ZFSCTL_INO_ROOT 0x1
+#define ZFSCTL_INO_SNAPDIR 0x2
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFS_CTLDIR_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_debug.h b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h
new file mode 100644
index 0000000000..07eb3d2da8
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h
@@ -0,0 +1,73 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZFS_DEBUG_H
+#define _SYS_ZFS_DEBUG_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+/*
+ * ZFS debugging
+ */
+
+#if defined(DEBUG) || !defined(_KERNEL)
+#define ZFS_DEBUG
+#endif
+
+extern int zfs_flags;
+
+#define ZFS_DEBUG_DPRINTF 0x0001
+#define ZFS_DEBUG_DBUF_VERIFY 0x0002
+#define ZFS_DEBUG_DNODE_VERIFY 0x0004
+#define ZFS_DEBUG_SNAPNAMES 0x0008
+
+#ifdef ZFS_DEBUG
+extern void __dprintf(const char *file, const char *func,
+ int line, const char *fmt, ...);
+#define dprintf(...) \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) \
+ __dprintf(__FILE__, __func__, __LINE__, __VA_ARGS__)
+#else
+#define dprintf(...) ((void)0)
+#endif /* ZFS_DEBUG */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_DEBUG_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_dir.h b/usr/src/uts/common/fs/zfs/sys/zfs_dir.h
new file mode 100644
index 0000000000..8ab760f618
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_dir.h
@@ -0,0 +1,70 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FS_ZFS_DIR_H
+#define _SYS_FS_ZFS_DIR_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/zfs_znode.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* zfs_dirent_lock() flags */
+#define ZNEW 0x0001 /* entry should not exist */
+#define ZEXISTS 0x0002 /* entry should exist */
+#define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */
+#define ZXATTR 0x0008 /* we want the xattr dir */
+#define ZRENAMING 0x0010 /* znode is being renamed */
+
+/* mknode flags */
+#define IS_ROOT_NODE 0x01 /* create a root node */
+#define IS_XATTR 0x02 /* create an extended attribute node */
+#define IS_REPLAY 0x04 /* we are replaying intent log */
+
+extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **,
+ int);
+extern void zfs_dirent_unlock(zfs_dirlock_t *);
+extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int);
+extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int, int *);
+extern int zfs_dirlook(znode_t *, char *, vnode_t **);
+extern void zfs_mknode(znode_t *, vattr_t *, uint64_t *,
+ dmu_tx_t *, cred_t *, uint_t, znode_t **, int);
+extern void zfs_rmnode(znode_t *);
+extern boolean_t zfs_dirempty(znode_t *);
+extern void zfs_dq_add(znode_t *, dmu_tx_t *);
+extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr);
+extern int zfs_get_xattrdir(znode_t *, vnode_t **, cred_t *);
+extern int zfs_make_xattrdir(znode_t *, vattr_t *, vnode_t **, cred_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_DIR_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
new file mode 100644
index 0000000000..cbe8bbc5cb
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -0,0 +1,187 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZFS_IOCTL_H
+#define _SYS_ZFS_IOCTL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/cred.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZFS_DRIVER_NAME "zfs"
+#define ZFS_DS_TYPE "zfs"
+
+/*
+ * Property values for snapdir
+ */
+#define HIDDEN 0
+#define VISIBLE 1
+
+typedef struct zfs_stats {
+ uint64_t zs_atime;
+ uint64_t zs_recordsize;
+ uint64_t zs_readonly;
+ uint64_t zs_devices;
+ uint64_t zs_exec;
+ uint64_t zs_setuid;
+ uint64_t zs_snapdir;
+ uint64_t zs_acl_mode;
+ uint64_t zs_acl_inherit;
+ char zs_mountpoint[MAXNAMELEN];
+ char zs_atime_setpoint[MAXNAMELEN];
+ char zs_recordsize_setpoint[MAXNAMELEN];
+ char zs_readonly_setpoint[MAXNAMELEN];
+ char zs_devices_setpoint[MAXNAMELEN];
+ char zs_setuid_setpoint[MAXNAMELEN];
+ char zs_exec_setpoint[MAXNAMELEN];
+ char zs_mountpoint_setpoint[MAXNAMELEN];
+ char zs_sharenfs[MAXNAMELEN];
+ char zs_sharenfs_setpoint[MAXNAMELEN];
+ char zs_snapdir_setpoint[MAXNAMELEN];
+ char zs_acl_mode_setpoint[MAXNAMELEN];
+ char zs_acl_inherit_setpoint[MAXNAMELEN];
+} zfs_stats_t;
+
+#define DMU_BACKUP_VERSION (1ULL)
+#define DMU_BACKUP_MAGIC 0x2F5bacbacULL
+
+/*
+ * zfs ioctl command structure
+ */
+typedef struct dmu_replay_record {
+ enum {
+ DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
+ DRR_WRITE, DRR_FREE, DRR_END,
+ } drr_type;
+ uint32_t drr_pad;
+ union {
+ struct drr_begin {
+ uint64_t drr_magic;
+ uint64_t drr_version;
+ uint64_t drr_creation_time;
+ dmu_objset_type_t drr_type;
+ uint32_t drr_pad;
+ uint64_t drr_toguid;
+ uint64_t drr_fromguid;
+ char drr_toname[MAXNAMELEN];
+ } drr_begin;
+ struct drr_end {
+ uint64_t drr_checksum;
+ } drr_end;
+ struct drr_object {
+ uint64_t drr_object;
+ dmu_object_type_t drr_type;
+ dmu_object_type_t drr_bonustype;
+ uint32_t drr_blksz;
+ uint32_t drr_bonuslen;
+ uint8_t drr_checksum;
+ uint8_t drr_compress;
+ uint8_t drr_pad[6];
+ } drr_object;
+ struct drr_freeobjects {
+ uint64_t drr_firstobj;
+ uint64_t drr_numobjs;
+ } drr_freeobjects;
+ struct drr_write {
+ uint64_t drr_object;
+ dmu_object_type_t drr_type;
+ uint32_t drr_pad;
+ uint64_t drr_offset;
+ uint64_t drr_length;
+ } drr_write;
+ struct drr_free {
+ uint64_t drr_object;
+ uint64_t drr_offset;
+ uint64_t drr_length;
+ } drr_free;
+ } drr_u;
+} dmu_replay_record_t;
+
+typedef struct zfs_cmd {
+ char zc_name[MAXNAMELEN];
+ char zc_prop_name[MAXNAMELEN];
+ char zc_prop_value[MAXPATHLEN];
+ char zc_root[MAXPATHLEN];
+ char zc_filename[MAXPATHLEN];
+ uint32_t zc_intsz;
+ uint32_t zc_numints;
+ uint64_t zc_pool_guid;
+ uint64_t zc_config_src; /* really (char *) */
+ uint64_t zc_config_src_size;
+ uint64_t zc_config_dst; /* really (char *) */
+ uint64_t zc_config_dst_size;
+ uint64_t zc_cookie;
+ uint64_t zc_cred;
+ uint64_t zc_dev;
+ uint64_t zc_volsize;
+ uint64_t zc_volblocksize;
+ uint64_t zc_objset_type;
+ zfs_stats_t zc_zfs_stats;
+ dmu_object_info_t zc_object_info;
+ dmu_objset_stats_t zc_objset_stats;
+ struct drr_begin zc_begin_record;
+} zfs_cmd_t;
+
+#ifdef _KERNEL
+
+extern dev_info_t *zfs_dip;
+
+extern int zfs_secpolicy_write(const char *dataset, const char *, cred_t *cr);
+extern int zfs_busy(void);
+
+extern int zvol_check_volsize(zfs_cmd_t *zc);
+extern int zvol_check_volblocksize(zfs_cmd_t *zc);
+extern int zvol_get_stats(zfs_cmd_t *zc, objset_t *os);
+extern void zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx);
+extern int zvol_create_minor(zfs_cmd_t *zc);
+extern int zvol_remove_minor(zfs_cmd_t *zc);
+extern int zvol_set_volsize(zfs_cmd_t *zc);
+extern int zvol_set_volblocksize(zfs_cmd_t *zc);
+extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr);
+extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr);
+extern int zvol_strategy(buf_t *bp);
+extern int zvol_read(dev_t dev, uio_t *uiop, cred_t *cr);
+extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr);
+extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr);
+extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr);
+extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr,
+ int *rvalp);
+extern int zvol_busy(void);
+extern void zvol_init(void);
+extern void zvol_fini(void);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_IOCTL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h b/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h
new file mode 100644
index 0000000000..cd0700f641
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h
@@ -0,0 +1,116 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FS_ZFS_VFSOPS_H
+#define _SYS_FS_ZFS_VFSOPS_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/isa_defs.h>
+#include <sys/types32.h>
+#include <sys/list.h>
+#include <sys/vfs.h>
+#include <sys/zil.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct zfs_delete_list {
+ kmutex_t z_mutex;
+ kcondvar_t z_cv;
+ kcondvar_t z_quiesce_cv;
+ uint8_t z_drained;
+ uint8_t z_draining;
+ uint32_t z_thread_target;
+ uint32_t z_thread_count;
+ uint64_t z_znode_count;
+ list_t z_znodes;
+} zfs_delete_t;
+
+typedef struct zfsvfs zfsvfs_t;
+
+struct zfsvfs {
+ vfs_t *z_vfs; /* generic fs struct */
+ zfsvfs_t *z_parent; /* parent fs */
+ objset_t *z_os; /* objset reference */
+ uint64_t z_root; /* id of root znode */
+ uint64_t z_dqueue; /* delete queue */
+ uint64_t z_max_blksz; /* maximum block size for files */
+ uint64_t z_assign; /* TXG_NOWAIT or set by zil_replay() */
+ zilog_t *z_log; /* intent log pointer */
+ uint_t z_acl_mode; /* acl chmod/mode behavior */
+ uint_t z_acl_inherit; /* acl inheritance behavior */
+ boolean_t z_atime; /* enable atimes mount option */
+ boolean_t z_unmounted1; /* unmounted phase 1 */
+ boolean_t z_unmounted2; /* unmounted phase 2 */
+ uint32_t z_op_cnt; /* vnode/vfs operations ref count */
+ krwlock_t z_um_lock; /* rw lock for umount phase 2 */
+ zfs_delete_t z_delete_head; /* zfs delete list */
+ list_t z_all_znodes; /* all vnodes in the fs */
+ kmutex_t z_znodes_lock; /* lock for z_all_znodes */
+ vnode_t *z_ctldir; /* .zfs directory pointer */
+ boolean_t z_show_ctldir; /* expose .zfs in the root dir */
+ boolean_t z_issnap; /* true if this is a snapshot */
+#define ZFS_OBJ_MTX_SZ 64
+ kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */
+};
+
+/*
+ * The total file ID size is limited to 12 bytes (including the length
+ * field) in the NFSv2 protocol. For historical reasons, this same limit
+ * is currently being imposed by the Solaris NFSv3 implementation...
+ * although the protocol actually permits a maximum of 64 bytes. It will
+ * not be possible to expand beyond 12 bytes without abandoning support
+ * of NFSv2 and making some changes to the Solaris NFSv3 implementation.
+ *
+ * For the time being, we will partition up the available space as follows:
+ * 2 bytes fid length (required)
+ * 6 bytes object number (48 bits)
+ * 4 bytes generation number (32 bits)
+ * We reserve only 48 bits for the object number, as this is the limit
+ * currently defined and imposed by the DMU.
+ */
+typedef struct zfid_short {
+ uint16_t zf_len;
+ uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */
+ uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */
+} zfid_short_t;
+
+typedef struct zfid_long {
+ zfid_short_t z_fid;
+ uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */
+ uint8_t zf_setgen[4]; /* gen[i] = gen >> (8 * i) */
+} zfid_long_t;
+
+#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t))
+#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t))
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_VFSOPS_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
new file mode 100644
index 0000000000..d3f28df4cd
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
@@ -0,0 +1,283 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FS_ZFS_ZNODE_H
+#define _SYS_FS_ZFS_ZNODE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef _KERNEL
+#include <sys/isa_defs.h>
+#include <sys/types32.h>
+#include <sys/list.h>
+#include <sys/dmu.h>
+#include <sys/zfs_vfsops.h>
+#endif
+#include <sys/zfs_acl.h>
+#include <sys/zil.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Define special zfs pflags
+ */
+#define ZFS_XATTR 0x1 /* is an extended attribute */
+#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */
+
+#define MASTER_NODE_OBJ 1
+
+/*
+ * special attributes for master node.
+ */
+
+#define ZFS_FSID "FSID"
+#define ZFS_DELETE_QUEUE "DELETE_QUEUE"
+#define ZFS_ROOT_OBJ "ROOT"
+#define ZFS_VERSION_OBJ "VERSION"
+#define ZFS_PROP_BLOCKPERPAGE "BLOCKPERPAGE"
+#define ZFS_PROP_NOGROWBLOCKS "NOGROWBLOCKS"
+
+#define ZFS_FLAG_BLOCKPERPAGE 0x1
+#define ZFS_FLAG_NOGROWBLOCKS 0x2
+
+/*
+ * ZFS version - rev'd whenever an incompatible on-disk format change
+ * occurs. Independent of SPA/DMU/ZAP versioning.
+ */
+
+#define ZFS_VERSION 1ULL
+
+#define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE)
+
+/* Path component length */
+/*
+ * The generic fs code uses MAXNAMELEN to represent
+ * what the largest component length is. Unfortunately,
+ * this length includes the terminating NULL. ZFS needs
+ * to tell the users via pathconf() and statvfs() what the
+ * true maximum length of a component is, excluding the NULL.
+ */
+#define ZFS_MAXNAMELEN (MAXNAMELEN - 1)
+
+/*
+ * This is the persistent portion of the znode. It is stored
+ * in the "bonus buffer" of the file. Short symbolic links
+ * are also stored in the bonus buffer.
+ */
+typedef struct znode_phys {
+ uint64_t zp_atime[2]; /* 0 - last file access time */
+ uint64_t zp_mtime[2]; /* 16 - last file modification time */
+ uint64_t zp_ctime[2]; /* 32 - last file change time */
+ uint64_t zp_crtime[2]; /* 48 - creation time */
+ uint64_t zp_gen; /* 64 - generation (txg of creation) */
+ uint64_t zp_mode; /* 72 - file mode bits */
+ uint64_t zp_size; /* 80 - size of file */
+ uint64_t zp_parent; /* 88 - directory parent (`..') */
+ uint64_t zp_links; /* 96 - number of links to file */
+ uint64_t zp_xattr; /* 104 - DMU object for xattrs */
+ uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */
+ uint64_t zp_flags; /* 120 - persistent flags */
+ uint64_t zp_uid; /* 128 - file owner */
+ uint64_t zp_gid; /* 136 - owning group */
+ uint64_t zp_pad[4]; /* 144 - future */
+ zfs_znode_acl_t zp_acl; /* 176 - 263 ACL */
+ /*
+ * Data may pad out any remaining bytes in the znode buffer, eg:
+ *
+ * |<---------------------- dnode_phys (512) ------------------------>|
+ * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->|
+ * |<---- znode (264) ---->|<---- data (56) ---->|
+ *
+ * At present, we only use this space to store symbolic links.
+ */
+} znode_phys_t;
+
+/*
+ * Directory entry locks control access to directory entries.
+ * They are used to protect creates, deletes, and renames.
+ * Each directory znode has a mutex and a list of locked names.
+ */
+#ifdef _KERNEL
+typedef struct zfs_dirlock {
+ char *dl_name; /* directory entry being locked */
+ uint32_t dl_sharecnt; /* 0 if exclusive, > 0 if shared */
+ uint16_t dl_namesize; /* set if dl_name was allocated */
+ kcondvar_t dl_cv; /* wait for entry to be unlocked */
+ struct znode *dl_dzp; /* directory znode */
+ struct zfs_dirlock *dl_next; /* next in z_dirlocks list */
+} zfs_dirlock_t;
+
+struct zcache_state;
+
+typedef struct znode {
+ struct zfsvfs *z_zfsvfs;
+ vnode_t *z_vnode;
+ list_node_t z_list_node; /* deleted znodes */
+ uint64_t z_id; /* object ID for this znode */
+ kmutex_t z_lock; /* znode modification lock */
+ krwlock_t z_map_lock; /* page map lock */
+ krwlock_t z_grow_lock; /* grow block size lock */
+ krwlock_t z_append_lock; /* append-mode lock */
+ zfs_dirlock_t *z_dirlocks; /* directory entry lock list */
+ uint8_t z_active; /* znode is in use */
+ uint8_t z_reap; /* reap file at last reference */
+ uint8_t z_atime_dirty; /* atime needs to be synced */
+ uint8_t z_dbuf_held; /* Is z_dbuf already held? */
+ uint_t z_mapcnt; /* number of memory maps to file */
+ uint_t z_blksz; /* block size in bytes */
+ uint_t z_seq; /* modification sequence number */
+ uint64_t z_last_itx; /* last ZIL itx on this znode */
+ kmutex_t z_acl_lock; /* acl data lock */
+ list_node_t z_link_node; /* all znodes in fs link */
+ list_node_t z_zcache_node;
+ struct zcache_state *z_zcache_state;
+ uint64_t z_zcache_access;
+
+ /*
+ * These are dmu managed fields.
+ */
+ znode_phys_t *z_phys; /* pointer to persistent znode */
+ dmu_buf_t *z_dbuf; /* buffer containing the z_phys */
+} znode_t;
+
+/*
+ * The grow_lock is only applicable to "regular" files.
+ * The parent_lock is only applicable to directories.
+ */
+#define z_parent_lock z_grow_lock
+
+/*
+ * Convert between znode pointers and vnode pointers
+ */
+#define ZTOV(ZP) ((ZP)->z_vnode)
+#define VTOZ(VP) ((znode_t *)(VP)->v_data)
+
+/*
+ * ZFS_ENTER() is called on entry to each ZFS vnode and vfs operation.
+ * ZFS_EXIT() must be called before exiting the vop.
+ */
+#define ZFS_ENTER(zfsvfs) \
+ { \
+ atomic_add_32(&(zfsvfs)->z_op_cnt, 1); \
+ if ((zfsvfs)->z_unmounted1) { \
+ ZFS_EXIT(zfsvfs); \
+ return (EIO); \
+ } \
+ }
+#define ZFS_EXIT(zfsvfs) atomic_add_32(&(zfsvfs)->z_op_cnt, -1)
+
+/*
+ * Macros for dealing with dmu_buf_hold
+ */
+#define ZFS_OBJ_HASH(obj_num) (obj_num & (ZFS_OBJ_MTX_SZ - 1))
+#define ZFS_OBJ_MUTEX(zp) \
+ (&zp->z_zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(zp->z_id)])
+#define ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \
+ mutex_enter(&zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]);
+
+#define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \
+ mutex_exit(&zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(obj_num)])
+
+/*
+ * Macros to encode/decode ZFS stored time values from/to struct timespec
+ */
+#define ZFS_TIME_ENCODE(tp, stmp) \
+{ \
+ stmp[0] = (uint64_t)(tp)->tv_sec; \
+ stmp[1] = (uint64_t)(tp)->tv_nsec; \
+}
+
+#define ZFS_TIME_DECODE(tp, stmp) \
+{ \
+ (tp)->tv_sec = (time_t)stmp[0]; \
+ (tp)->tv_nsec = (long)stmp[1]; \
+}
+
+/*
+ * Timestamp defines
+ */
+#define ACCESSED (AT_ATIME)
+#define STATE_CHANGED (AT_CTIME)
+#define CONTENT_MODIFIED (AT_MTIME | AT_CTIME)
+
+#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \
+ if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \
+ zfs_time_stamper(zp, ACCESSED, NULL)
+
+extern int zfs_init_fs(zfsvfs_t *, znode_t **, cred_t *);
+extern void zfs_set_dataprop(objset_t *);
+extern void zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx);
+extern void zfs_time_stamper(znode_t *, uint_t, dmu_tx_t *);
+extern void zfs_time_stamper_locked(znode_t *, uint_t, dmu_tx_t *);
+extern int zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *);
+extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, dmu_tx_t *,
+ cred_t *cr);
+extern void zfs_znode_init(void);
+extern void zfs_znode_fini(void);
+extern znode_t *zfs_znode_alloc(zfsvfs_t *, dmu_buf_t *, uint64_t, int);
+extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **);
+extern void zfs_zinactive(znode_t *);
+extern void zfs_znode_delete(znode_t *, dmu_tx_t *);
+extern void zfs_znode_free(znode_t *);
+extern int zfs_delete_thread_target(zfsvfs_t *zfsvfs, int nthreads);
+extern void zfs_delete_wait_empty(zfsvfs_t *zfsvfs);
+extern void zfs_zcache_flush(zfsvfs_t *zfsvf);
+extern void zfs_remove_op_tables();
+extern int zfs_create_op_tables();
+extern int zfs_sync(vfs_t *vfsp, short flag, cred_t *cr);
+
+extern uint64_t zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *dzp, znode_t *zp, char *name);
+extern uint64_t zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *dzp, char *name);
+extern uint64_t zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *dzp, znode_t *zp, char *name);
+extern uint64_t zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *dzp, znode_t *zp, char *name, char *link);
+extern uint64_t zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp);
+extern uint64_t zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, offset_t off, ssize_t len, int ioflag, uio_t *uio);
+extern uint64_t zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, uint64_t off, uint64_t len);
+extern uint64_t zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, vattr_t *vap, uint_t mask_applied);
+extern uint64_t zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, int aclcnt, ace_t *z_ace);
+
+extern zil_get_data_t zfs_get_data;
+extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
+extern int zfsfstype;
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_ZNODE_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zil.h b/usr/src/uts/common/fs/zfs/sys/zil.h
new file mode 100644
index 0000000000..a03dcc6bc9
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zil.h
@@ -0,0 +1,242 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZIL_H
+#define _SYS_ZIL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Intent log format:
+ *
+ * Each objset has its own intent log. The log header (zil_header_t)
+ * for objset N's intent log is kept in the Nth object of the SPA's
+ * intent_log objset. The log header points to a chain of log blocks,
+ * each of which contains log records (i.e., transactions) followed by
+ * a log block trailer (zil_trailer_t). The format of a log record
+ * depends on the record (or transaction) type, but all records begin
+ * with a common structure that defines the type, length, and txg.
+ */
+
+/*
+ * Intent log header - this on disk structure holds fields to manage
+ * the log. All fields are 64 bit to easily handle cross architectures.
+ */
+typedef struct zil_header {
+	uint64_t zh_claim_txg;	/* txg in which log blocks were claimed */
+	uint64_t zh_replay_seq;	/* highest replayed sequence number */
+	blkptr_t zh_log;	/* log chain */
+	uint64_t zit_pad[6];	/* reserved; NOTE(review): zit_ prefix looks like a slip for zh_ */
+} zil_header_t;
+
+/*
+ * Log block trailer - structure at the end of the header and each log block
+ *
+ * The zit_bt contains a zbt_cksum which for the intent log is
+ * the sequence number of this log block. A seq of 0 is invalid.
+ * The zbt_cksum is checked by the SPA against the sequence
+ * number passed in the blk_cksum field of the blkptr_t
+ */
+typedef struct zil_trailer {
+ uint64_t zit_pad;
+ blkptr_t zit_next_blk; /* next block in chain */
+ uint64_t zit_nused; /* bytes in log block used */
+ zio_block_tail_t zit_bt; /* block trailer */
+} zil_trailer_t;
+
+#define ZIL_MIN_BLKSZ 4096
+#define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE
+#define ZIL_BLK_DATA_SZ(lwb) ((lwb)->lwb_sz - sizeof (zil_trailer_t))
+
+/*
+ * Intent log transaction types and record structures
+ */
+#define TX_CREATE 1 /* Create file */
+#define TX_MKDIR 2 /* Make directory */
+#define TX_MKXATTR 3 /* Make XATTR directory */
+#define TX_SYMLINK 4 /* Create symbolic link to a file */
+#define TX_REMOVE 5 /* Remove file */
+#define TX_RMDIR 6 /* Remove directory */
+#define TX_LINK 7 /* Create hard link to a file */
+#define TX_RENAME 8 /* Rename a file */
+#define TX_WRITE 9 /* File write */
+#define TX_TRUNCATE 10 /* Truncate a file */
+#define TX_SETATTR 11 /* Set file attributes */
+#define TX_ACL 12 /* Set acl */
+#define TX_MAX_TYPE 13 /* Max transaction type */
+
+/*
+ * Format of log records.
+ * The fields are carefully defined to allow them to be aligned
+ * and sized the same on sparc & intel architectures.
+ * Each log record has a common structure at the beginning.
+ */
+typedef struct { /* common log record header */
+ uint64_t lrc_txtype; /* intent log transaction type */
+ uint64_t lrc_reclen; /* transaction record length */
+ uint64_t lrc_txg; /* dmu transaction group number */
+ uint64_t lrc_seq; /* intent log sequence number */
+} lr_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_doid; /* object id of directory */
+ uint64_t lr_foid; /* object id of created file object */
+ uint64_t lr_mode; /* mode of object */
+ uint64_t lr_uid; /* uid of object */
+ uint64_t lr_gid; /* gid of object */
+ uint64_t lr_gen; /* generation (txg of creation) */
+ uint64_t lr_crtime[2]; /* creation time */
+ uint64_t lr_rdev; /* rdev of object to create */
+ /* name of object to create follows this */
+ /* for symlinks, link content follows name */
+} lr_create_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_doid; /* obj id of directory */
+ /* name of object to remove follows this */
+} lr_remove_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_doid; /* obj id of directory */
+ uint64_t lr_link_obj; /* obj id of link */
+ /* name of object to link follows this */
+} lr_link_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_sdoid; /* obj id of source directory */
+ uint64_t lr_tdoid; /* obj id of target directory */
+ /* 2 strings: names of source and destination follow this */
+} lr_rename_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* file object to write */
+ uint64_t lr_offset; /* offset to write to */
+ uint64_t lr_length; /* user data length to write */
+ uint64_t lr_blkoff; /* offset represented by lr_blkptr */
+ blkptr_t lr_blkptr; /* spa block pointer for replay */
+ /* write data will follow for small writes */
+} lr_write_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* object id of file to truncate */
+ uint64_t lr_offset; /* offset to truncate from */
+ uint64_t lr_length; /* length to truncate */
+} lr_truncate_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* file object to change attributes */
+ uint64_t lr_mask; /* mask of attributes to set */
+ uint64_t lr_mode; /* mode to set */
+ uint64_t lr_uid; /* uid to set */
+ uint64_t lr_gid; /* gid to set */
+ uint64_t lr_size; /* size to set */
+ uint64_t lr_atime[2]; /* access time */
+ uint64_t lr_mtime[2]; /* modification time */
+} lr_setattr_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* obj id of file */
+ uint64_t lr_aclcnt; /* number of acl entries */
+ /* lr_aclcnt number of ace_t entries follow this */
+} lr_acl_t;
+
+/*
+ * ZIL structure definitions, interface function prototype and globals.
+ */
+
+/*
+ * ZFS intent log transaction structure
+ */
+typedef struct itx {
+ list_node_t itx_node; /* linkage on zl_itx_list */
+ void *itx_private; /* type-specific opaque data */
+ uint8_t itx_data_copied; /* TX_WRITE only: write data already */
+ /* copied into itx data buffer */
+ lr_t itx_lr; /* common part of log record */
+ /* followed by type-specific part of lr_xx_t and its immediate data */
+} itx_t;
+
+typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
+ uint64_t txg);
+typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
+ uint64_t txg);
+typedef int zil_replay_func_t();
+typedef int zil_get_data_t(void *arg, lr_write_t *lr);
+
+extern void zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
+ zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);
+
+extern void zil_init(void);
+extern void zil_fini(void);
+
+extern zilog_t *zil_alloc(objset_t *os, zil_header_t *zh_phys);
+extern void zil_free(zilog_t *zilog);
+
+extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data);
+extern void zil_close(zilog_t *zilog);
+
+extern void zil_replay(objset_t *os, void *arg, uint64_t *txgp,
+ zil_replay_func_t *replay_func[TX_MAX_TYPE], void (*rm_wait)(void *));
+extern void zil_destroy(zilog_t *zilog);
+
+extern itx_t *zil_itx_create(int txtype, size_t lrsize);
+extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
+
+extern void zil_commit(zilog_t *zilog, uint64_t seq, int ioflag);
+
+extern void zil_claim(char *osname, void *txarg);
+extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
+extern void zil_clean(zilog_t *zilog);
+
+extern int zil_suspend(zilog_t *zilog);
+extern void zil_resume(zilog_t *zilog);
+
+extern int zil_disable;
+extern int zil_always;
+extern int zil_purge;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zil_impl.h b/usr/src/uts/common/fs/zfs/sys/zil_impl.h
new file mode 100644
index 0000000000..6286fc5aa3
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zil_impl.h
@@ -0,0 +1,111 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZIL_IMPL_H
+#define _SYS_ZIL_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zil.h>
+#include <sys/dmu_objset.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum lwb_state_type {
+ UNWRITTEN, /* buffer yet to be written */
+ SEQ_INCOMPLETE, /* buffer written, but there's an unwritten buffer in */
+ /* the sequence before this */
+ SEQ_COMPLETE, /* no unwritten buffers before this */
+} lwb_state_t;
+
+/*
+ * Log write buffer.
+ */
+typedef struct lwb {
+ zilog_t *lwb_zilog; /* back pointer to log struct */
+ blkptr_t lwb_blk; /* on disk address of this log blk */
+ int lwb_nused; /* # used bytes in buffer */
+ int lwb_sz; /* size of block and buffer */
+ char *lwb_buf; /* log write buffer */
+ uint64_t lwb_max_txg; /* highest txg in this lwb */
+ uint64_t lwb_seq; /* highest log record seq number */
+ txg_handle_t lwb_txgh; /* txg handle for txg_exit() */
+ list_node_t lwb_node; /* zilog->zl_lwb_list linkage */
+ lwb_state_t lwb_state; /* buffer state */
+} lwb_t;
+
+/*
+ * [vdev, seq] element for use in flushing device write caches
+ */
+typedef struct zil_vdev {
+ uint64_t vdev; /* device written */
+ uint64_t seq; /* itx sequence */
+ list_node_t vdev_seq_node; /* zilog->zl_vdev_list linkage */
+} zil_vdev_t;
+
+/*
+ * Stable storage intent log management structure. One per dataset.
+ */
+struct zilog {
+ kmutex_t zl_lock; /* protects most zilog_t fields */
+ struct dsl_pool *zl_dmu_pool; /* DSL pool */
+ spa_t *zl_spa; /* handle for read/write log */
+ zil_header_t *zl_header; /* log header buffer */
+ objset_t *zl_os; /* object set we're logging */
+ zil_get_data_t *zl_get_data; /* callback to get object content */
+ uint64_t zl_itx_seq; /* itx sequence number */
+ uint64_t zl_ss_seq; /* last tx on stable storage */
+ uint64_t zl_destroy_txg; /* txg of last zil_destroy() */
+ uint64_t zl_replay_seq[TXG_SIZE]; /* seq of last replayed rec */
+ uint32_t zl_suspend; /* log suspend count */
+ kcondvar_t zl_cv_write; /* for waiting to write to log */
+ kcondvar_t zl_cv_seq; /* for committing a sequence */
+ uint8_t zl_stop_replay; /* don't replay any further */
+ uint8_t zl_stop_sync; /* for debugging */
+ uint8_t zl_writer; /* boolean: write setup in progress */
+ uint8_t zl_log_error; /* boolean: log write error */
+ list_t zl_itx_list; /* in-memory itx list */
+ uint64_t zl_itx_list_sz; /* total size of records on list */
+ uint64_t zl_prev_blk_sz; /* previous log block size */
+ list_t zl_lwb_list; /* in-flight log write list */
+ list_t zl_vdev_list; /* list of [vdev, seq] pairs */
+ taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */
+ avl_tree_t zl_dva_tree; /* track DVAs during log parse */
+ kmutex_t zl_destroy_lock; /* serializes zil_destroy() calls */
+};
+
+typedef struct zil_dva_node {
+ dva_t zn_dva;
+ avl_node_t zn_node;
+} zil_dva_node_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIL_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
new file mode 100644
index 0000000000..5d3227e546
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -0,0 +1,298 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZIO_H
+#define _ZIO_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/dkio.h>
+#include <sys/fs/zfs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define	ZBT_MAGIC	0x210da7ab10c7a11ULL	/* zio data block tail */
+
+typedef struct zio_block_tail {
+	uint64_t	zbt_magic;	/* for validation, endianness */
+	zio_cksum_t	zbt_cksum;	/* 256-bit checksum */
+} zio_block_tail_t;
+
+/*
+ * Gang block headers are self-checksumming and contain an array
+ * of block pointers.
+ */
+#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE
+#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \
+ sizeof (zio_block_tail_t)) / sizeof (blkptr_t))
+#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \
+ sizeof (zio_block_tail_t) - \
+ (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
+ sizeof (uint64_t))
+
+#define ZIO_GET_DVA(zio) (&(zio)->io_bp->blk_dva[(zio)->io_dva_index])
+#define ZIO_GET_IOSIZE(zio) \
+ (DVA_GET_GANG(ZIO_GET_DVA(zio)) ? \
+ SPA_GANGBLOCKSIZE : BP_GET_PSIZE((zio)->io_bp))
+
+typedef struct zio_gbh {
+ blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS];
+ uint64_t zg_filler[SPA_GBH_FILLER];
+ zio_block_tail_t zg_tail;
+} zio_gbh_phys_t;
+
+enum zio_checksum {
+ ZIO_CHECKSUM_INHERIT = 0,
+ ZIO_CHECKSUM_ON,
+ ZIO_CHECKSUM_OFF,
+ ZIO_CHECKSUM_LABEL,
+ ZIO_CHECKSUM_GANG_HEADER,
+ ZIO_CHECKSUM_ZILOG,
+ ZIO_CHECKSUM_FLETCHER_2,
+ ZIO_CHECKSUM_FLETCHER_4,
+ ZIO_CHECKSUM_SHA256,
+ ZIO_CHECKSUM_FUNCTIONS
+};
+
+#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_2
+#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON
+
+enum zio_compress {
+ ZIO_COMPRESS_INHERIT = 0,
+ ZIO_COMPRESS_ON,
+ ZIO_COMPRESS_OFF,
+ ZIO_COMPRESS_LZJB,
+ ZIO_COMPRESS_FUNCTIONS
+};
+
+#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB
+#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF
+
+#define ZIO_PRIORITY_NOW (zio_priority_table[0])
+#define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1])
+#define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2])
+#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[3])
+#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[4])
+#define ZIO_PRIORITY_FREE (zio_priority_table[5])
+#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[6])
+#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[7])
+#define ZIO_PRIORITY_RESILVER (zio_priority_table[8])
+#define ZIO_PRIORITY_SCRUB (zio_priority_table[9])
+#define ZIO_PRIORITY_TABLE_SIZE 10
+
+#define ZIO_FLAG_MUSTSUCCEED 0x0000
+#define ZIO_FLAG_CANFAIL 0x0001
+#define ZIO_FLAG_FAILFAST 0x0002
+#define ZIO_FLAG_CONFIG_HELD 0x0004
+
+#define ZIO_FLAG_DONT_CACHE 0x0010
+#define ZIO_FLAG_DONT_QUEUE 0x0020
+#define ZIO_FLAG_DONT_PROPAGATE 0x0040
+#define ZIO_FLAG_DONT_RETRY 0x0080
+
+#define ZIO_FLAG_PHYSICAL 0x0100
+#define ZIO_FLAG_IO_BYPASS 0x0200
+#define ZIO_FLAG_IO_REPAIR 0x0400
+#define ZIO_FLAG_SPECULATIVE 0x0800
+
+#define ZIO_FLAG_RESILVER 0x1000
+#define ZIO_FLAG_SCRUB 0x2000
+
+#define ZIO_FLAG_GANG_INHERIT \
+ (ZIO_FLAG_CANFAIL | \
+ ZIO_FLAG_FAILFAST | \
+ ZIO_FLAG_CONFIG_HELD | \
+ ZIO_FLAG_DONT_RETRY | \
+ ZIO_FLAG_IO_REPAIR | \
+ ZIO_FLAG_SPECULATIVE | \
+ ZIO_FLAG_RESILVER | \
+ ZIO_FLAG_SCRUB)
+
+#define ZIO_FLAG_VDEV_INHERIT \
+ (ZIO_FLAG_GANG_INHERIT | \
+ ZIO_FLAG_DONT_CACHE | \
+ ZIO_FLAG_PHYSICAL)
+
+/*
+ * We'll take the unused errno 'EBADE' (from the Convergent graveyard)
+ * to indicate checksum errors.
+ */
+#define ECKSUM EBADE
+
+typedef struct zio zio_t;
+typedef void zio_done_func_t(zio_t *zio);
+typedef struct zio_transform zio_transform_t;
+
+extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE];
+extern char *zio_type_name[ZIO_TYPES];
+
+struct zio {
+ /* Core information about this I/O */
+ zio_t *io_parent;
+ zio_t *io_root;
+ spa_t *io_spa;
+ int io_checksum;
+ int io_compress;
+ int io_dva_index;
+ uint64_t io_txg;
+ blkptr_t *io_bp;
+ blkptr_t io_bp_copy;
+ zio_t *io_child;
+ zio_t *io_sibling_prev;
+ zio_t *io_sibling_next;
+ zio_transform_t *io_transform_stack;
+
+ /* Callback info */
+ zio_done_func_t *io_done;
+ void *io_private;
+ blkptr_t io_bp_orig;
+
+ /* Data represented by this I/O */
+ void *io_data;
+ uint64_t io_size;
+
+ /* Stuff for the vdev stack */
+ vdev_t *io_vd;
+ void *io_vsd;
+ uint64_t io_offset;
+ uint64_t io_deadline;
+ uint64_t io_timestamp;
+ avl_node_t io_offset_node;
+ avl_node_t io_deadline_node;
+ avl_tree_t *io_vdev_tree;
+ zio_t *io_delegate_list;
+ zio_t *io_delegate_next;
+ zio_t *io_retry_next;
+ list_node_t io_pending;
+
+ /* Internal pipeline state */
+ int io_flags;
+ uint8_t io_type;
+ uint8_t io_stage;
+ uint8_t io_stalled;
+ uint8_t io_priority;
+ struct dk_callback io_dk_callback;
+ int io_cmd;
+ int io_retries;
+ int io_error;
+ uint32_t io_numerrors;
+ uint32_t io_pipeline;
+ uint32_t io_async_stages;
+ uint64_t io_children_notready;
+ uint64_t io_children_notdone;
+ void *io_waiter;
+ kmutex_t io_lock;
+ kcondvar_t io_cv;
+};
+
+extern zio_t *zio_null(zio_t *pio, spa_t *spa,
+ zio_done_func_t *done, void *private, int flags);
+
+extern zio_t *zio_root(spa_t *spa,
+ zio_done_func_t *done, void *private, int flags);
+
+extern zio_t *zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
+ uint64_t size, zio_done_func_t *done, void *private,
+ int priority, int flags);
+
+extern zio_t *zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+ uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+ zio_done_func_t *done, void *private, int priority, int flags);
+
+extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
+ uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+ zio_done_func_t *done, void *private, int priority, int flags);
+
+extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ zio_done_func_t *done, void *private);
+
+extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ zio_done_func_t *done, void *private);
+
+extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
+ zio_done_func_t *done, void *private, int priority, int flags);
+
+extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *data, int checksum,
+ zio_done_func_t *done, void *private, int priority, int flags);
+
+extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *data, int checksum,
+ zio_done_func_t *done, void *private, int priority, int flags);
+
+extern int zio_alloc_blk(spa_t *spa, int checksum, uint64_t size,
+ blkptr_t *bp, uint64_t txg);
+extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg);
+
+extern int zio_wait(zio_t *zio);
+extern void zio_nowait(zio_t *zio);
+
+extern void *zio_buf_alloc(size_t size);
+extern void zio_buf_free(void *buf, size_t size);
+
+/*
+ * Move an I/O to the next stage of the pipeline and execute that stage.
+ * There's no locking on io_stage because there's no legitimate way for
+ * multiple threads to be attempting to process the same I/O.
+ */
+extern void zio_next_stage(zio_t *zio);
+extern void zio_next_stage_async(zio_t *zio);
+extern void zio_wait_children_done(zio_t *zio);
+
+/*
+ * Delegate I/O to a child vdev.
+ */
+extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
+ uint64_t offset, void *data, uint64_t size, int type, int priority,
+ int flags, zio_done_func_t *done, void *private);
+
+extern void zio_vdev_io_bypass(zio_t *zio);
+extern void zio_vdev_io_reissue(zio_t *zio);
+extern void zio_vdev_io_redone(zio_t *zio);
+
+extern void zio_checksum_verified(zio_t *zio);
+extern void zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp);
+
+extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent);
+extern uint8_t zio_compress_select(uint8_t child, uint8_t parent);
+
+/*
+ * Initial setup and teardown.
+ */
+extern void zio_init(void);
+extern void zio_fini(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZIO_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h
new file mode 100644
index 0000000000..ba3dc48d28
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZIO_CHECKSUM_H
+#define _SYS_ZIO_CHECKSUM_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Signature for checksum functions.
+ */
+typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
+
+/*
+ * Information about each checksum function.
+ */
+typedef struct zio_checksum_info {
+ zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */
+ int ci_correctable; /* number of correctable bits */
+ int ci_zbt; /* uses zio block tail? */
+ char *ci_name; /* descriptive name */
+} zio_checksum_info_t;
+
+extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS];
+
+/*
+ * Checksum routines.
+ */
+extern zio_checksum_t fletcher_2_native;
+extern zio_checksum_t fletcher_4_native;
+
+extern zio_checksum_t fletcher_2_byteswap;
+extern zio_checksum_t fletcher_4_byteswap;
+
+extern zio_checksum_t zio_checksum_SHA256;
+
+extern void zio_checksum(uint_t checksum, zio_cksum_t *zcp,
+ void *data, uint64_t size);
+extern int zio_checksum_error(zio_t *zio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIO_CHECKSUM_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zio_compress.h b/usr/src/uts/common/fs/zfs/sys/zio_compress.h
new file mode 100644
index 0000000000..7eddf1e8d1
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zio_compress.h
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZIO_COMPRESS_H
+#define _SYS_ZIO_COMPRESS_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Common signature for all zio compress/decompress functions.
+ */
+typedef size_t zio_compress_func_t(void *src, void *dst,
+ size_t s_len, size_t d_len);
+typedef int zio_decompress_func_t(void *src, void *dst,
+ size_t s_len, size_t d_len);
+
+/*
+ * Information about each compression function.
+ */
+typedef struct zio_compress_info {
+ zio_compress_func_t *ci_compress;
+ zio_decompress_func_t *ci_decompress;
+ char *ci_name;
+} zio_compress_info_t;
+
+extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS];
+
+/*
+ * Compression routines.
+ */
+extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len);
+extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len);
+
+/*
+ * Compress and decompress data if necessary.
+ */
+extern int zio_compress_data(int cpfunc, void *src, uint64_t srcsize,
+ void **destp, uint64_t *destsizep, uint64_t *destbufsizep);
+extern int zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
+ void *dest, uint64_t destsize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIO_COMPRESS_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zio_impl.h b/usr/src/uts/common/fs/zfs/sys/zio_impl.h
new file mode 100644
index 0000000000..0b2b07de29
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zio_impl.h
@@ -0,0 +1,208 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZIO_IMPL_H
+#define _ZIO_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * I/O Groups: pipeline stage definitions.
+ */
+
+typedef enum zio_stage {
+ ZIO_STAGE_OPEN = 0, /* RWFCI */
+ ZIO_STAGE_WAIT_CHILDREN_READY, /* RWFCI */
+
+ ZIO_STAGE_WRITE_COMPRESS, /* -W--- */
+ ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */
+
+ ZIO_STAGE_GANG_PIPELINE, /* -WFC- */
+
+ ZIO_STAGE_GET_GANG_HEADER, /* -WFC- */
+ ZIO_STAGE_REWRITE_GANG_MEMBERS, /* -W--- */
+ ZIO_STAGE_FREE_GANG_MEMBERS, /* --F-- */
+ ZIO_STAGE_CLAIM_GANG_MEMBERS, /* ---C- */
+
+ ZIO_STAGE_DVA_ALLOCATE, /* -W--- */
+ ZIO_STAGE_DVA_FREE, /* --F-- */
+ ZIO_STAGE_DVA_CLAIM, /* ---C- */
+
+ ZIO_STAGE_GANG_CHECKSUM_GENERATE, /* -W--- */
+
+ ZIO_STAGE_READY, /* RWFCI */
+
+ ZIO_STAGE_DVA_TRANSLATE, /* RW--- */
+
+ ZIO_STAGE_VDEV_IO_SETUP, /* RW--I */
+ ZIO_STAGE_VDEV_IO_START, /* RW--I */
+ ZIO_STAGE_VDEV_IO_DONE, /* RW--I */
+ ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */
+
+ ZIO_STAGE_WAIT_CHILDREN_DONE, /* RWFCI */
+
+ ZIO_STAGE_CHECKSUM_VERIFY, /* R---- */
+ ZIO_STAGE_READ_GANG_MEMBERS, /* R---- */
+ ZIO_STAGE_READ_DECOMPRESS, /* R---- */
+
+ ZIO_STAGE_DONE /* RWFCI */
+} zio_stage_t;
+
+/*
+ * The stages for which there's some performance value in going async.
+ * When compression is enabled, ZIO_STAGE_WRITE_COMPRESS is ORed in as well.
+ */
+#define ZIO_ASYNC_PIPELINE_STAGES \
+ ((1U << ZIO_STAGE_CHECKSUM_GENERATE) | \
+ (1U << ZIO_STAGE_VDEV_IO_DONE) | \
+ (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \
+ (1U << ZIO_STAGE_READ_DECOMPRESS))
+
+#define ZIO_VDEV_IO_PIPELINE \
+ ((1U << ZIO_STAGE_VDEV_IO_SETUP) | \
+ (1U << ZIO_STAGE_VDEV_IO_START) | \
+ (1U << ZIO_STAGE_VDEV_IO_DONE) | \
+ (1U << ZIO_STAGE_VDEV_IO_ASSESS))
+
+#define ZIO_READ_PHYS_PIPELINE \
+ ((1U << ZIO_STAGE_OPEN) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_READY) | \
+ ZIO_VDEV_IO_PIPELINE | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_READ_PIPELINE \
+ ((1U << ZIO_STAGE_DVA_TRANSLATE) | \
+ ZIO_READ_PHYS_PIPELINE)
+
+#define ZIO_WRITE_PHYS_PIPELINE \
+ ((1U << ZIO_STAGE_OPEN) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_CHECKSUM_GENERATE) | \
+ (1U << ZIO_STAGE_READY) | \
+ ZIO_VDEV_IO_PIPELINE | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_WRITE_COMMON_PIPELINE \
+ ((1U << ZIO_STAGE_DVA_TRANSLATE) | \
+ ZIO_WRITE_PHYS_PIPELINE)
+
+#define ZIO_WRITE_PIPELINE \
+ ((1U << ZIO_STAGE_WRITE_COMPRESS) | \
+ ZIO_WRITE_COMMON_PIPELINE)
+
+#define ZIO_GANG_STAGES \
+ ((1U << ZIO_STAGE_GET_GANG_HEADER) | \
+ (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \
+ (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \
+ (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \
+ (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \
+ (1U << ZIO_STAGE_READ_GANG_MEMBERS))
+
+#define ZIO_REWRITE_PIPELINE \
+ ((1U << ZIO_STAGE_GANG_PIPELINE) | \
+ (1U << ZIO_STAGE_GET_GANG_HEADER) | \
+ (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \
+ (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \
+ ZIO_WRITE_COMMON_PIPELINE)
+
+#define ZIO_WRITE_ALLOCATE_PIPELINE \
+ ((1U << ZIO_STAGE_DVA_ALLOCATE) | \
+ ZIO_WRITE_COMMON_PIPELINE)
+
+#define ZIO_GANG_FREE_STAGES \
+ ((1U << ZIO_STAGE_GET_GANG_HEADER) | \
+ (1U << ZIO_STAGE_FREE_GANG_MEMBERS))
+
+#define ZIO_FREE_PIPELINE \
+ ((1U << ZIO_STAGE_OPEN) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_GANG_PIPELINE) | \
+ (1U << ZIO_STAGE_GET_GANG_HEADER) | \
+ (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \
+ (1U << ZIO_STAGE_DVA_FREE) | \
+ (1U << ZIO_STAGE_READY) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_CLAIM_PIPELINE \
+ ((1U << ZIO_STAGE_OPEN) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_GANG_PIPELINE) | \
+ (1U << ZIO_STAGE_GET_GANG_HEADER) | \
+ (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \
+ (1U << ZIO_STAGE_DVA_CLAIM) | \
+ (1U << ZIO_STAGE_READY) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_IOCTL_PIPELINE \
+ ((1U << ZIO_STAGE_OPEN) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_READY) | \
+ ZIO_VDEV_IO_PIPELINE | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_WAIT_FOR_CHILDREN_PIPELINE \
+ ((1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_READY) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE \
+ ((1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_VDEV_CHILD_PIPELINE \
+ (ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE | \
+ ZIO_VDEV_IO_PIPELINE)
+
+#define ZIO_ERROR_PIPELINE_MASK \
+ ZIO_WAIT_FOR_CHILDREN_PIPELINE
+
+struct zio_transform {
+ void *zt_data;
+ uint64_t zt_size;
+ uint64_t zt_bufsize;
+ zio_transform_t *zt_next;
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZIO_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c
new file mode 100644
index 0000000000..81ab16cd3d
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/txg.c
@@ -0,0 +1,583 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/txg_impl.h>
+#include <sys/dmu_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/callb.h>
+
+/*
+ * Pool-wide transaction groups.
+ */
+
+static void txg_sync_thread(dsl_pool_t *dp);
+static void txg_quiesce_thread(dsl_pool_t *dp);
+static void txg_timelimit_thread(dsl_pool_t *dp);
+
+int txg_time = 5; /* max 5 seconds worth of delta per txg */
+
+/*
+ * Prepare the txg subsystem: zero the pool's tx_state_t, allocate the
+ * per-CPU entry counters, and make 'txg' the initially open group.
+ */
+void
+txg_init(dsl_pool_t *dp, uint64_t txg)
+{
+	tx_state_t *tx = &dp->dp_tx;
+
+	bzero(tx, sizeof (tx_state_t));
+
+	/* Per-CPU counters let txg_hold_open() avoid a pool-wide lock. */
+	tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);
+
+	rw_init(&tx->tx_suspend, NULL, RW_DEFAULT, NULL);
+
+	tx->tx_open_txg = txg;
+}
+
+/*
+ * Close down the txg subsystem and release its resources.  The three
+ * service threads must already be stopped (see txg_sync_stop()).
+ */
+void
+txg_fini(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+
+	ASSERT(tx->tx_threads == 0);
+
+	rw_destroy(&tx->tx_suspend);
+
+	kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));
+
+	bzero(tx, sizeof (tx_state_t));
+}
+
+/*
+ * Start syncing transaction groups by launching the three service
+ * threads: quiesce, sync, and the time-limit (txg push) thread.
+ */
+void
+txg_sync_start(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+
+	mutex_enter(&tx->tx_sync_lock);
+
+	dprintf("pool %p\n", dp);
+
+	ASSERT(tx->tx_threads == 0);
+
+	/* One thread per txg state transition, plus the timer. */
+	tx->tx_threads = 3;
+
+	tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
+	    dp, 0, &p0, TS_RUN, minclsyspri);
+
+	tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread,
+	    dp, 0, &p0, TS_RUN, minclsyspri);
+
+	tx->tx_timelimit_thread = thread_create(NULL, 0, txg_timelimit_thread,
+	    dp, 0, &p0, TS_RUN, minclsyspri);
+
+	mutex_exit(&tx->tx_sync_lock);
+}
+
+/* Register the calling service thread with CPR and take tx_sync_lock. */
+static void
+txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr)
+{
+	CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG);
+	mutex_enter(&tx->tx_sync_lock);
+}
+
+/*
+ * Clear the thread pointer, wake txg_sync_stop(), and terminate the
+ * calling thread.  Never returns.
+ */
+static void
+txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
+{
+	ASSERT(*tpp != NULL);
+	*tpp = NULL;
+	tx->tx_threads--;
+	cv_broadcast(&tx->tx_exit_cv);
+	CALLB_CPR_EXIT(cpr);		/* drops &tx->tx_sync_lock */
+	thread_exit();
+}
+
+/*
+ * CPR-safe wait on 'cv'; if 'secmax' is nonzero, wait at most that many
+ * seconds.  Called with tx_sync_lock held; returns with it held.
+ */
+static void
+txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, int secmax)
+{
+	CALLB_CPR_SAFE_BEGIN(cpr);
+
+	if (secmax)
+		(void) cv_timedwait(cv, &tx->tx_sync_lock, lbolt + secmax * hz);
+	else
+		cv_wait(cv, &tx->tx_sync_lock);
+
+	CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
+}
+
+/*
+ * Stop syncing transaction groups: drain outstanding work, then wake
+ * all three service threads and wait for each to exit.
+ */
+void
+txg_sync_stop(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+
+	dprintf("pool %p\n", dp);
+	/*
+	 * Finish off any work in progress.
+	 */
+	ASSERT(tx->tx_threads == 3);
+	txg_wait_synced(dp, 0);
+
+	/*
+	 * Wake all 3 sync threads (one per state) and wait for them to die.
+	 */
+	mutex_enter(&tx->tx_sync_lock);
+
+	ASSERT(tx->tx_threads == 3);
+
+	tx->tx_exiting = 1;
+
+	cv_broadcast(&tx->tx_quiesce_more_cv);
+	cv_broadcast(&tx->tx_quiesce_done_cv);
+	cv_broadcast(&tx->tx_sync_more_cv);
+	cv_broadcast(&tx->tx_timeout_exit_cv);
+
+	while (tx->tx_threads != 0)
+		cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock);
+
+	/* Reset so the threads can be restarted by txg_sync_start(). */
+	tx->tx_exiting = 0;
+
+	mutex_exit(&tx->tx_sync_lock);
+}
+
+/*
+ * Take a hold on the currently open txg; returns its number and fills
+ * in the caller's handle.  NOTE: this CPU's tc_lock is intentionally
+ * left held until txg_rele_to_quiesce() is called.
+ */
+uint64_t
+txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
+{
+	tx_state_t *tx = &dp->dp_tx;
+	tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
+	uint64_t txg;
+
+	mutex_enter(&tc->tc_lock);
+
+	txg = tx->tx_open_txg;
+	tc->tc_count[txg & TXG_MASK]++;
+
+	th->th_cpu = tc;
+	th->th_txg = txg;
+
+	return (txg);
+}
+
+/* Drop the tc_lock taken by txg_hold_open(); the hold itself remains. */
+void
+txg_rele_to_quiesce(txg_handle_t *th)
+{
+	tx_cpu_t *tc = th->th_cpu;
+
+	mutex_exit(&tc->tc_lock);
+}
+
+/*
+ * Release the hold; when the last hold on this txg drops, wake anyone
+ * (i.e. txg_quiesce()) waiting for the group to drain.
+ */
+void
+txg_rele_to_sync(txg_handle_t *th)
+{
+	tx_cpu_t *tc = th->th_cpu;
+	int g = th->th_txg & TXG_MASK;
+
+	mutex_enter(&tc->tc_lock);
+	ASSERT(tc->tc_count[g] != 0);
+	if (--tc->tc_count[g] == 0)
+		cv_broadcast(&tc->tc_cv[g]);
+	mutex_exit(&tc->tc_lock);
+
+	th->th_cpu = NULL;	/* defensive */
+}
+
+/*
+ * Close 'txg' to new entrants by advancing tx_open_txg, then wait for
+ * every existing hold on it to be released via txg_rele_to_sync().
+ * Called by the quiesce thread with tx_sync_lock dropped.
+ */
+static void
+txg_quiesce(dsl_pool_t *dp, uint64_t txg)
+{
+	tx_state_t *tx = &dp->dp_tx;
+	int g = txg & TXG_MASK;
+	int c;
+
+	/*
+	 * Grab all tx_cpu locks so nobody else can get into this txg.
+	 */
+	for (c = 0; c < max_ncpus; c++)
+		mutex_enter(&tx->tx_cpu[c].tc_lock);
+
+	ASSERT(txg == tx->tx_open_txg);
+	tx->tx_open_txg++;
+
+	/*
+	 * Now that we've incremented tx_open_txg, we can let threads
+	 * enter the next transaction group.
+	 */
+	for (c = 0; c < max_ncpus; c++)
+		mutex_exit(&tx->tx_cpu[c].tc_lock);
+
+	/*
+	 * Quiesce the transaction group by waiting for everyone to txg_exit().
+	 */
+	for (c = 0; c < max_ncpus; c++) {
+		tx_cpu_t *tc = &tx->tx_cpu[c];
+		mutex_enter(&tc->tc_lock);
+		while (tc->tc_count[g] != 0)
+			cv_wait(&tc->tc_cv[g], &tc->tc_lock);
+		mutex_exit(&tc->tc_lock);
+	}
+}
+
+/*
+ * Service thread: waits for the quiesce thread to hand off a quiesced
+ * txg, then syncs it via spa_sync().  tx_suspend is write-held while
+ * tx_syncing_txg/tx_synced_txg are updated so txg_suspend() readers
+ * see a consistent view.
+ */
+static void
+txg_sync_thread(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+	callb_cpr_t cpr;
+
+	txg_thread_enter(tx, &cpr);
+
+	for (;;) {
+		uint64_t txg;
+
+		/*
+		 * We sync when there's someone waiting on us, or the
+		 * quiesce thread has handed off a txg to us.
+		 */
+		while (!tx->tx_exiting &&
+		    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
+		    tx->tx_quiesced_txg == 0) {
+			dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
+			    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
+			txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, 0);
+		}
+
+		/*
+		 * Wait until the quiesce thread hands off a txg to us,
+		 * prompting it to do so if necessary.
+		 */
+		while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
+			if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
+				tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
+			cv_broadcast(&tx->tx_quiesce_more_cv);
+			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
+		}
+
+		if (tx->tx_exiting)
+			txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
+
+		rw_enter(&tx->tx_suspend, RW_WRITER);
+
+		/*
+		 * Consume the quiesced txg which has been handed off to
+		 * us.  This may cause the quiescing thread to now be
+		 * able to quiesce another txg, so we must signal it.
+		 */
+		txg = tx->tx_quiesced_txg;
+		tx->tx_quiesced_txg = 0;
+		tx->tx_syncing_txg = txg;
+		cv_broadcast(&tx->tx_quiesce_more_cv);
+		rw_exit(&tx->tx_suspend);
+
+		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+		    txg, tx->tx_quiesce_txg_waiting,
+		    tx->tx_sync_txg_waiting);
+		/* Drop the lock while syncing -- spa_sync() can be slow. */
+		mutex_exit(&tx->tx_sync_lock);
+		spa_sync(dp->dp_spa, txg);
+		mutex_enter(&tx->tx_sync_lock);
+		rw_enter(&tx->tx_suspend, RW_WRITER);
+		tx->tx_synced_txg = txg;
+		tx->tx_syncing_txg = 0;
+		rw_exit(&tx->tx_suspend);
+		cv_broadcast(&tx->tx_sync_done_cv);
+	}
+}
+
+/*
+ * Service thread: quiesces the open txg when requested and hands the
+ * quiesced group off to the sync thread, one txg at a time.
+ */
+static void
+txg_quiesce_thread(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+	callb_cpr_t cpr;
+
+	txg_thread_enter(tx, &cpr);
+
+	for (;;) {
+		uint64_t txg;
+
+		/*
+		 * We quiesce when there's someone waiting on us.
+		 * However, we can only have one txg in "quiescing" or
+		 * "quiesced, waiting to sync" state.  So we wait until
+		 * the "quiesced, waiting to sync" txg has been consumed
+		 * by the sync thread.
+		 */
+		while (!tx->tx_exiting &&
+		    (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
+		    tx->tx_quiesced_txg != 0))
+			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);
+
+		if (tx->tx_exiting)
+			txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);
+
+		txg = tx->tx_open_txg;
+		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+		    txg, tx->tx_quiesce_txg_waiting,
+		    tx->tx_sync_txg_waiting);
+		/* txg_quiesce() blocks on active holders; drop the lock. */
+		mutex_exit(&tx->tx_sync_lock);
+		txg_quiesce(dp, txg);
+		mutex_enter(&tx->tx_sync_lock);
+
+		/*
+		 * Hand this txg off to the sync thread.
+		 */
+		dprintf("quiesce done, handing off txg %llu\n", txg);
+		tx->tx_quiesced_txg = txg;
+		cv_broadcast(&tx->tx_sync_more_cv);
+		cv_broadcast(&tx->tx_quiesce_done_cv);
+	}
+}
+
+/*
+ * Block until 'txg' (or, if txg == 0, the currently open txg) has been
+ * synced to disk, prodding the sync thread as needed.
+ */
+void
+txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
+{
+	tx_state_t *tx = &dp->dp_tx;
+
+	mutex_enter(&tx->tx_sync_lock);
+	ASSERT(tx->tx_threads == 3);
+	if (txg == 0)
+		txg = tx->tx_open_txg;
+	if (tx->tx_sync_txg_waiting < txg)
+		tx->tx_sync_txg_waiting = txg;
+	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+	    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
+	while (tx->tx_synced_txg < txg) {
+		dprintf("broadcasting sync more "
+		    "tx_synced=%llu waiting=%llu dp=%p\n",
+		    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
+		cv_broadcast(&tx->tx_sync_more_cv);
+		cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
+	}
+	mutex_exit(&tx->tx_sync_lock);
+}
+
+/*
+ * Block until 'txg' (or, if txg == 0, the next txg) is the open group,
+ * prodding the quiesce thread to push out the current one.
+ */
+void
+txg_wait_open(dsl_pool_t *dp, uint64_t txg)
+{
+	tx_state_t *tx = &dp->dp_tx;
+
+	mutex_enter(&tx->tx_sync_lock);
+	ASSERT(tx->tx_threads == 3);
+	if (txg == 0)
+		txg = tx->tx_open_txg + 1;
+	if (tx->tx_quiesce_txg_waiting < txg)
+		tx->tx_quiesce_txg_waiting = txg;
+	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+	    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
+	while (tx->tx_open_txg < txg) {
+		cv_broadcast(&tx->tx_quiesce_more_cv);
+		cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
+	}
+	mutex_exit(&tx->tx_sync_lock);
+}
+
+/*
+ * Service thread: roughly every txg_time seconds, requests that the
+ * open txg be pushed out, bounding the amount of change per group.
+ */
+static void
+txg_timelimit_thread(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+	callb_cpr_t cpr;
+
+	txg_thread_enter(tx, &cpr);
+
+	while (!tx->tx_exiting) {
+		uint64_t txg = tx->tx_open_txg + 1;
+
+		/* Sleep for up to txg_time seconds (or until stop). */
+		txg_thread_wait(tx, &cpr, &tx->tx_timeout_exit_cv, txg_time);
+
+		if (tx->tx_quiesce_txg_waiting < txg)
+			tx->tx_quiesce_txg_waiting = txg;
+
+		while (!tx->tx_exiting && tx->tx_open_txg < txg) {
+			dprintf("pushing out %llu\n", txg);
+			cv_broadcast(&tx->tx_quiesce_more_cv);
+			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
+		}
+	}
+	txg_thread_exit(tx, &cpr, &tx->tx_timelimit_thread);
+}
+
+/*
+ * Returns nonzero if someone is waiting for the open txg to advance
+ * but it hasn't yet (i.e. the quiesce pipeline is behind).
+ */
+int
+txg_stalled(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+	return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg);
+}
+
+/* Block sync-state transitions; paired with txg_resume(). */
+void
+txg_suspend(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+	/* XXX some code paths suspend when they are already suspended! */
+	rw_enter(&tx->tx_suspend, RW_READER);
+}
+
+/* Release the hold taken by txg_suspend(). */
+void
+txg_resume(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+	rw_exit(&tx->tx_suspend);
+}
+
+/*
+ * Per-txg object lists.  Objects embed a txg_node_t at 'offset'; each
+ * node carries one link and membership flag per in-flight txg state.
+ */
+void
+txg_list_create(txg_list_t *tl, size_t offset)
+{
+	int t;
+
+	mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	/* Byte offset of the embedded txg_node_t within each object. */
+	tl->tl_offset = offset;
+
+	for (t = 0; t < TXG_SIZE; t++)
+		tl->tl_head[t] = NULL;
+}
+
+/* Destroy a txg list; all per-txg sublists must already be empty. */
+void
+txg_list_destroy(txg_list_t *tl)
+{
+	int t;
+
+	for (t = 0; t < TXG_SIZE; t++)
+		ASSERT(txg_list_empty(tl, t));
+
+	mutex_destroy(&tl->tl_lock);
+}
+
+/* Returns nonzero if the sublist for 'txg' has no entries. */
+int
+txg_list_empty(txg_list_t *tl, uint64_t txg)
+{
+	return (tl->tl_head[txg & TXG_MASK] == NULL);
+}
+
+/*
+ * Add an entry to the list.
+ * Returns 0 if it's a new entry, 1 if it's already there.
+ */
+int
+txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
+{
+	int t = txg & TXG_MASK;
+	/* Locate the txg_node_t embedded in the object. */
+	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+	int already_on_list;
+
+	mutex_enter(&tl->tl_lock);
+	already_on_list = tn->tn_member[t];
+	if (!already_on_list) {
+		/* Push onto the head of this txg's singly linked list. */
+		tn->tn_member[t] = 1;
+		tn->tn_next[t] = tl->tl_head[t];
+		tl->tl_head[t] = tn;
+	}
+	mutex_exit(&tl->tl_lock);
+
+	return (already_on_list);
+}
+
+/*
+ * Remove the head of the list and return it, or NULL if the list for
+ * this txg is empty.
+ */
+void *
+txg_list_remove(txg_list_t *tl, uint64_t txg)
+{
+	int t = txg & TXG_MASK;
+	txg_node_t *tn;
+	void *p = NULL;
+
+	mutex_enter(&tl->tl_lock);
+	if ((tn = tl->tl_head[t]) != NULL) {
+		/* Convert the embedded node back to the object pointer. */
+		p = (char *)tn - tl->tl_offset;
+		tl->tl_head[t] = tn->tn_next[t];
+		tn->tn_next[t] = NULL;
+		tn->tn_member[t] = 0;
+	}
+	mutex_exit(&tl->tl_lock);
+
+	return (p);
+}
+
+/*
+ * Remove a specific item from the list and return it, or NULL if 'p'
+ * is not on the list for this txg.
+ */
+void *
+txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
+{
+	int t = txg & TXG_MASK;
+	txg_node_t *tn, **tp;
+
+	mutex_enter(&tl->tl_lock);
+
+	/* Linear scan, tracking the link to patch on removal. */
+	for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) {
+		if ((char *)tn - tl->tl_offset == p) {
+			*tp = tn->tn_next[t];
+			tn->tn_next[t] = NULL;
+			tn->tn_member[t] = 0;
+			mutex_exit(&tl->tl_lock);
+			return (p);
+		}
+	}
+
+	mutex_exit(&tl->tl_lock);
+
+	return (NULL);
+}
+
+/*
+ * Returns nonzero if 'p' is on the list for 'txg'.  Reads the flag
+ * without tl_lock, so the result is only advisory under concurrency.
+ */
+int
+txg_list_member(txg_list_t *tl, void *p, uint64_t txg)
+{
+	int t = txg & TXG_MASK;
+	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+
+	return (tn->tn_member[t]);
+}
+
+/*
+ * Walk a txg list -- only safe if you know it's not changing.
+ */
+void *
+txg_list_head(txg_list_t *tl, uint64_t txg)
+{
+	int t = txg & TXG_MASK;
+	txg_node_t *tn = tl->tl_head[t];
+
+	return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
+}
+
+/* Successor of 'p' on the list for 'txg'; same caveat as above. */
+void *
+txg_list_next(txg_list_t *tl, void *p, uint64_t txg)
+{
+	int t = txg & TXG_MASK;
+	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+
+	tn = tn->tn_next[t];
+
+	return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
+}
diff --git a/usr/src/uts/common/fs/zfs/uberblock.c b/usr/src/uts/common/fs/zfs/uberblock.c
new file mode 100644
index 0000000000..63bff0ae4b
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/uberblock.c
@@ -0,0 +1,67 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/uberblock_impl.h>
+#include <sys/vdev_impl.h>
+
+/* Keep the uberblock version in a variable so we can get at it with mdb */
+static uint64_t uberblock_version = UBERBLOCK_VERSION;
+
+/*
+ * Validate an uberblock read from disk, byteswapping it in place first
+ * if it was written with the opposite endianness.  Returns 0 on
+ * success, EINVAL on bad magic, ENOTSUP on unsupported version.
+ */
+int
+uberblock_verify(uberblock_t *ub)
+{
+	if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC))
+		byteswap_uint64_array(ub, sizeof (uberblock_t));
+
+	if (ub->ub_magic != UBERBLOCK_MAGIC)
+		return (EINVAL);
+
+	if (ub->ub_version != UBERBLOCK_VERSION)
+		return (ENOTSUP);
+
+	return (0);
+}
+
+/*
+ * Update the uberblock and return a boolean value indicating whether
+ * anything changed in this transaction group.
+ */
+int
+uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg)
+{
+	ASSERT(ub->ub_txg < txg);
+
+	ub->ub_magic = UBERBLOCK_MAGIC;
+	ub->ub_version = UBERBLOCK_VERSION;
+	ub->ub_txg = txg;
+	ub->ub_guid_sum = rvd->vdev_guid_sum;
+	ub->ub_timestamp = gethrestime_sec();
+
+	/* "Changed" means the root block pointer was born in this txg. */
+	return (ub->ub_rootbp.blk_birth == txg);
+}
diff --git a/usr/src/uts/common/fs/zfs/unique.c b/usr/src/uts/common/fs/zfs/unique.c
new file mode 100644
index 0000000000..56fbddd78e
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/unique.c
@@ -0,0 +1,107 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <sys/unique.h>
+
+/* Global set of unique values currently in use, ordered by value. */
+static avl_tree_t unique_avl;
+static kmutex_t unique_mtx;
+
+/* One reserved value; un_link threads it into unique_avl. */
+typedef struct unique {
+	avl_node_t un_link;
+	uint64_t un_value;
+} unique_t;
+
+#define	UNIQUE_MASK ((1ULL << UNIQUE_BITS) - 1)
+
+/* AVL comparator: total order on un_value. */
+static int
+unique_compare(const void *a, const void *b)
+{
+	const unique_t *una = a;
+	const unique_t *unb = b;
+
+	if (una->un_value < unb->un_value)
+		return (-1);
+	if (una->un_value > unb->un_value)
+		return (+1);
+	return (0);
+}
+
+/* Set up the global AVL tree of in-use unique values. */
+void
+unique_init(void)
+{
+	avl_create(&unique_avl, unique_compare,
+	    sizeof (unique_t), offsetof(unique_t, un_link));
+}
+
+/* Allocate a brand-new unique value (0 forces random generation). */
+uint64_t
+unique_create(void)
+{
+	return (unique_insert(0));
+}
+
+/*
+ * Reserve 'value' as unique; if it is 0, out of range, or already
+ * taken, pick random UNIQUE_MASK-bounded values until one is free.
+ * Returns the value actually reserved.
+ */
+uint64_t
+unique_insert(uint64_t value)
+{
+	avl_index_t idx;
+	unique_t *un = kmem_alloc(sizeof (unique_t), KM_SLEEP);
+
+	un->un_value = value;
+
+	mutex_enter(&unique_mtx);
+	while (un->un_value == 0 || un->un_value & ~UNIQUE_MASK ||
+	    avl_find(&unique_avl, un, &idx)) {
+		/*
+		 * Safe to drop the lock while regenerating: 'un' is not
+		 * in the tree yet, and the loop re-checks under the lock.
+		 */
+		mutex_exit(&unique_mtx);
+		(void) random_get_pseudo_bytes((void*)&un->un_value,
+		    sizeof (un->un_value));
+		un->un_value &= UNIQUE_MASK;
+		mutex_enter(&unique_mtx);
+	}
+
+	/* 'idx' was set by the avl_find() that returned NULL above. */
+	avl_insert(&unique_avl, un, idx);
+	mutex_exit(&unique_mtx);
+
+	return (un->un_value);
+}
+
+/*
+ * Release a previously reserved unique value.  Silently ignores values
+ * that are not currently reserved.
+ */
+void
+unique_remove(uint64_t value)
+{
+	unique_t un_tofind;
+	unique_t *un;
+
+	un_tofind.un_value = value;
+	mutex_enter(&unique_mtx);
+	un = avl_find(&unique_avl, &un_tofind, NULL);
+	if (un != NULL) {
+		avl_remove(&unique_avl, un);
+		kmem_free(un, sizeof (unique_t));
+	}
+	mutex_exit(&unique_mtx);
+}
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
new file mode 100644
index 0000000000..990c690bff
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -0,0 +1,1738 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/vdev_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/space_map.h>
+#include <sys/zio.h>
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device management.
+ */
+
+/* All known vdev types; terminated by NULL for vdev_getops(). */
+static vdev_ops_t *vdev_ops_table[] = {
+	&vdev_root_ops,
+	&vdev_raidz_ops,
+	&vdev_mirror_ops,
+	&vdev_replacing_ops,
+	&vdev_disk_ops,
+	&vdev_file_ops,
+	&vdev_missing_ops,
+	NULL
+};
+
+/*
+ * Given a vdev type, return the appropriate ops vector.
+ * Returns NULL (the table sentinel) for an unknown type string.
+ */
+static vdev_ops_t *
+vdev_getops(const char *type)
+{
+	vdev_ops_t *ops, **opspp;
+
+	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
+		if (strcmp(ops->vdev_op_type, type) == 0)
+			break;
+
+	return (ops);
+}
+
+/*
+ * Default asize function: return the MAX of psize with the asize of
+ * all children.  This is what's used by anything other than RAID-Z.
+ */
+uint64_t
+vdev_default_asize(vdev_t *vd, uint64_t psize)
+{
+	/* Round the physical size up to this vdev's allocation unit. */
+	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_ashift);
+	uint64_t csize;
+	uint64_t c;
+
+	for (c = 0; c < vd->vdev_children; c++) {
+		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
+		asize = MAX(asize, csize);
+	}
+
+	return (asize);
+}
+
+/* Return top-level vdev 'vdev' of the pool, or NULL if out of range. */
+vdev_t *
+vdev_lookup_top(spa_t *spa, uint64_t vdev)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+
+	if (vdev < rvd->vdev_children)
+		return (rvd->vdev_child[vdev]);
+
+	return (NULL);
+}
+
+/* Depth-first search of the subtree rooted at 'vd' for a device path. */
+vdev_t *
+vdev_lookup_by_path(vdev_t *vd, const char *path)
+{
+	int c;
+	vdev_t *mvd;
+
+	if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
+		return (vd);
+
+	for (c = 0; c < vd->vdev_children; c++)
+		if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
+		    NULL)
+			return (mvd);
+
+	return (NULL);
+}
+
+/*
+ * Depth-first search for a guid; only leaf vdevs (no children) are
+ * matched.
+ */
+vdev_t *
+vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
+{
+	int c;
+	vdev_t *mvd;
+
+	if (vd->vdev_children == 0 && vd->vdev_guid == guid)
+		return (vd);
+
+	for (c = 0; c < vd->vdev_children; c++)
+		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
+		    NULL)
+			return (mvd);
+
+	return (NULL);
+}
+
+/*
+ * Link 'cvd' into 'pvd' at slot cvd->vdev_id, growing the child array
+ * if needed, and propagate its guid sum up through all ancestors.
+ * A NULL parent just detaches cvd (used for the root vdev).
+ */
+void
+vdev_add_child(vdev_t *pvd, vdev_t *cvd)
+{
+	size_t oldsize, newsize;
+	uint64_t id = cvd->vdev_id;
+	vdev_t **newchild;
+
+	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
+	ASSERT(cvd->vdev_parent == NULL);
+
+	cvd->vdev_parent = pvd;
+
+	if (pvd == NULL)
+		return;
+
+	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
+
+	/* Grow the child array to cover slot 'id'; holes stay NULL. */
+	oldsize = pvd->vdev_children * sizeof (vdev_t *);
+	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
+	newsize = pvd->vdev_children * sizeof (vdev_t *);
+
+	newchild = kmem_zalloc(newsize, KM_SLEEP);
+	if (pvd->vdev_child != NULL) {
+		bcopy(pvd->vdev_child, newchild, oldsize);
+		kmem_free(pvd->vdev_child, oldsize);
+	}
+
+	pvd->vdev_child = newchild;
+	pvd->vdev_child[id] = cvd;
+
+	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
+	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
+
+	/*
+	 * Walk up all ancestors to update guid sum.
+	 */
+	for (; pvd != NULL; pvd = pvd->vdev_parent)
+		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
+}
+
+/*
+ * Unlink 'cvd' from 'pvd', freeing the child array if it becomes
+ * entirely empty, and subtract its guid sum from all ancestors.
+ */
+void
+vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
+{
+	int c;
+	uint_t id = cvd->vdev_id;
+
+	ASSERT(cvd->vdev_parent == pvd);
+
+	if (pvd == NULL)
+		return;
+
+	ASSERT(id < pvd->vdev_children);
+	ASSERT(pvd->vdev_child[id] == cvd);
+
+	pvd->vdev_child[id] = NULL;
+	cvd->vdev_parent = NULL;
+
+	/* If no children remain, release the array entirely. */
+	for (c = 0; c < pvd->vdev_children; c++)
+		if (pvd->vdev_child[c])
+			break;
+
+	if (c == pvd->vdev_children) {
+		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
+		pvd->vdev_child = NULL;
+		pvd->vdev_children = 0;
+	}
+
+	/*
+	 * Walk up all ancestors to update guid sum.
+	 */
+	for (; pvd != NULL; pvd = pvd->vdev_parent)
+		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
+}
+
+/*
+ * Remove any holes in the child array, renumbering each surviving
+ * child's vdev_id to its new (dense) slot.
+ */
+void
+vdev_compact_children(vdev_t *pvd)
+{
+	vdev_t **newchild, *cvd;
+	int oldc = pvd->vdev_children;
+	int newc, c;
+
+	ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER));
+
+	/* First pass: count survivors. */
+	for (c = newc = 0; c < oldc; c++)
+		if (pvd->vdev_child[c])
+			newc++;
+
+	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
+
+	/* Second pass: pack and renumber. */
+	for (c = newc = 0; c < oldc; c++) {
+		if ((cvd = pvd->vdev_child[c]) != NULL) {
+			newchild[newc] = cvd;
+			cvd->vdev_id = newc++;
+		}
+	}
+
+	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
+	pvd->vdev_child = newchild;
+	pvd->vdev_children = newc;
+}
+
+/*
+ * Allocate and minimally initialize a vdev_t.  A guid of 0 means
+ * "generate a new nonzero random guid".  The vdev starts CLOSED.
+ */
+static vdev_t *
+vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
+{
+	vdev_t *vd;
+
+	while (guid == 0)
+		guid = spa_get_random(-1ULL);
+
+	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
+
+	vd->vdev_spa = spa;
+	vd->vdev_id = id;
+	vd->vdev_guid = guid;
+	vd->vdev_guid_sum = guid;	/* no children yet */
+	vd->vdev_ops = ops;
+	vd->vdev_state = VDEV_STATE_CLOSED;
+
+	mutex_init(&vd->vdev_io_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&vd->vdev_io_cv, NULL, CV_DEFAULT, NULL);
+	list_create(&vd->vdev_io_pending, sizeof (zio_t),
+	    offsetof(zio_t, io_pending));
+	mutex_init(&vd->vdev_dirty_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
+	/* DTL = dirty time log; both maps cover the full offset range. */
+	space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
+	space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
+	txg_list_create(&vd->vdev_ms_list,
+	    offsetof(struct metaslab, ms_txg_node));
+	txg_list_create(&vd->vdev_dtl_list,
+	    offsetof(struct vdev, vdev_dtl_node));
+	vd->vdev_stat.vs_timestamp = gethrtime();
+
+	return (vd);
+}
+
+/*
+ * Free a vdev_t that has been removed from service: release its
+ * strings, txg lists, DTL space maps, and locks, then the vdev itself.
+ */
+static void
+vdev_free_common(vdev_t *vd)
+{
+	if (vd->vdev_path)
+		spa_strfree(vd->vdev_path);
+	if (vd->vdev_devid)
+		spa_strfree(vd->vdev_devid);
+
+	txg_list_destroy(&vd->vdev_ms_list);
+	txg_list_destroy(&vd->vdev_dtl_list);
+	mutex_enter(&vd->vdev_dtl_lock);
+	/* Vacate before destroy so the maps are empty when torn down. */
+	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
+	space_map_destroy(&vd->vdev_dtl_map);
+	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
+	space_map_destroy(&vd->vdev_dtl_scrub);
+	mutex_exit(&vd->vdev_dtl_lock);
+	mutex_destroy(&vd->vdev_dtl_lock);
+	mutex_destroy(&vd->vdev_dirty_lock);
+	list_destroy(&vd->vdev_io_pending);
+	mutex_destroy(&vd->vdev_io_lock);
+	cv_destroy(&vd->vdev_io_cv);
+
+	kmem_free(vd, sizeof (vdev_t));
+}
+
+/*
+ * Allocate a new vdev.  The 'alloctype' is used to control whether we are
+ * creating a new vdev or loading an existing one - the behavior is slightly
+ * different for each case.  Returns NULL if the nvlist is malformed or
+ * (on load) its id/guid don't check out.
+ */
+vdev_t *
+vdev_alloc(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype)
+{
+	vdev_ops_t *ops;
+	char *type;
+	uint64_t guid = 0;
+	vdev_t *vd;
+
+	ASSERT(spa_config_held(spa, RW_WRITER));
+
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
+		return (NULL);
+
+	if ((ops = vdev_getops(type)) == NULL)
+		return (NULL);
+
+	/*
+	 * If this is a load, get the vdev guid from the nvlist.
+	 * Otherwise, vdev_alloc_common() will generate one for us.
+	 */
+	if (alloctype == VDEV_ALLOC_LOAD) {
+		uint64_t label_id;
+
+		/* The on-label id must match the caller's expected slot. */
+		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
+		    label_id != id)
+			return (NULL);
+
+		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+			return (NULL);
+	}
+
+	vd = vdev_alloc_common(spa, id, guid, ops);
+
+	/* Dup the strings: the lookup returns pointers into the nvlist. */
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
+		vd->vdev_path = spa_strdup(vd->vdev_path);
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
+		vd->vdev_devid = spa_strdup(vd->vdev_devid);
+
+	/*
+	 * If we're a top-level vdev, try to load the allocation parameters.
+	 */
+	if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
+		    &vd->vdev_ms_array);
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
+		    &vd->vdev_ms_shift);
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
+		    &vd->vdev_ashift);
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
+		    &vd->vdev_asize);
+	}
+
+	/*
+	 * If we're a leaf vdev, try to load the DTL object.
+	 */
+	if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) {
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
+		    &vd->vdev_dtl.smo_object);
+	}
+
+	/*
+	 * Add ourselves to the parent's list of children.
+	 */
+	vdev_add_child(parent, vd);
+
+	return (vd);
+}
+
+/*
+ * Tear down a vdev and its entire subtree: close it, detach it from
+ * the dirty-config list and its parent, and free all state.
+ */
+void
+vdev_free(vdev_t *vd)
+{
+	int c;
+
+	/*
+	 * vdev_free() implies closing the vdev first.  This is simpler than
+	 * trying to ensure complicated semantics for all callers.
+	 */
+	vdev_close(vd);
+
+	/*
+	 * It's possible to free a vdev that's been added to the dirty
+	 * list when in the middle of spa_vdev_add().  Handle that case
+	 * correctly here.
+	 */
+	if (vd->vdev_is_dirty)
+		vdev_config_clean(vd);
+
+	/*
+	 * Free all children.
+	 */
+	for (c = 0; c < vd->vdev_children; c++)
+		vdev_free(vd->vdev_child[c]);
+
+	/* Each child's vdev_free() removed itself from our child array. */
+	ASSERT(vd->vdev_child == NULL);
+	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
+
+	/*
+	 * Discard allocation state.
+	 */
+	if (vd == vd->vdev_top)
+		vdev_metaslab_fini(vd);
+
+	ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
+	ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
+
+	/*
+	 * Remove this vdev from its parent's child list.
+	 */
+	vdev_remove_child(vd->vdev_parent, vd);
+
+	ASSERT(vd->vdev_parent == NULL);
+
+	vdev_free_common(vd);
+}
+
+/*
+ * Transfer top-level vdev state from svd to tvd.  Used when inserting
+ * or removing a mirror/replacing vdev changes which vdev is top-level;
+ * metaslab state, stats, per-txg lists, and dirty flags all move over.
+ */
+static void
+vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
+{
+	spa_t *spa = svd->vdev_spa;
+	metaslab_t *msp;
+	vdev_t *vd;
+	int t;
+
+	ASSERT(tvd == tvd->vdev_top);
+
+	tvd->vdev_ms_array = svd->vdev_ms_array;
+	tvd->vdev_ms_shift = svd->vdev_ms_shift;
+	tvd->vdev_ms_count = svd->vdev_ms_count;
+
+	svd->vdev_ms_array = 0;
+	svd->vdev_ms_shift = 0;
+	svd->vdev_ms_count = 0;
+
+	tvd->vdev_mg = svd->vdev_mg;
+	/* Repoint the metaslab group at its new owner. */
+	tvd->vdev_mg->mg_vd = tvd;
+	tvd->vdev_ms = svd->vdev_ms;
+	tvd->vdev_smo = svd->vdev_smo;
+
+	svd->vdev_mg = NULL;
+	svd->vdev_ms = NULL;
+	svd->vdev_smo = NULL;
+
+	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
+	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
+
+	svd->vdev_stat.vs_alloc = 0;
+	svd->vdev_stat.vs_space = 0;
+
+	/* Migrate all per-txg bookkeeping from svd to tvd. */
+	for (t = 0; t < TXG_SIZE; t++) {
+		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
+			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
+		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
+			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
+		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
+			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
+		tvd->vdev_dirty[t] = svd->vdev_dirty[t];
+		svd->vdev_dirty[t] = 0;
+	}
+
+	if (svd->vdev_is_dirty) {
+		vdev_config_clean(svd);
+		vdev_config_dirty(tvd);
+	}
+
+	ASSERT(svd->vdev_io_retry == NULL);
+	ASSERT(list_is_empty(&svd->vdev_io_pending));
+}
+
+/* Recursively set vdev_top to 'tvd' throughout the subtree at 'vd'. */
+static void
+vdev_top_update(vdev_t *tvd, vdev_t *vd)
+{
+	int c;
+
+	if (vd == NULL)
+		return;
+
+	vd->vdev_top = tvd;
+
+	for (c = 0; c < vd->vdev_children; c++)
+		vdev_top_update(tvd, vd->vdev_child[c]);
+}
+
+/*
+ * Add a mirror/replacing vdev above an existing vdev.  'cvd' becomes
+ * the sole child of the new interposed vdev, which takes over cvd's
+ * old slot (and its top-level state, if cvd was top-level).
+ */
+vdev_t *
+vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
+{
+	spa_t *spa = cvd->vdev_spa;
+	vdev_t *pvd = cvd->vdev_parent;
+	vdev_t *mvd;
+
+	ASSERT(spa_config_held(spa, RW_WRITER));
+
+	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
+	vdev_remove_child(pvd, cvd);
+	vdev_add_child(pvd, mvd);
+	/* cvd becomes child 0 of mvd (vdev_children is 0 here). */
+	cvd->vdev_id = mvd->vdev_children;
+	vdev_add_child(mvd, cvd);
+	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
+
+	/* The interposed vdev inherits the child's geometry and state. */
+	mvd->vdev_asize = cvd->vdev_asize;
+	mvd->vdev_ashift = cvd->vdev_ashift;
+	mvd->vdev_state = cvd->vdev_state;
+
+	if (mvd == mvd->vdev_top)
+		vdev_top_transfer(cvd, mvd);
+
+	return (mvd);
+}
+
+/*
+ * Remove a 1-way mirror/replacing vdev from the tree: splice 'cvd'
+ * into its grandparent's slot and free the interposed vdev.
+ */
+void
+vdev_remove_parent(vdev_t *cvd)
+{
+	vdev_t *mvd = cvd->vdev_parent;
+	vdev_t *pvd = mvd->vdev_parent;
+
+	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
+
+	ASSERT(mvd->vdev_children == 1);
+	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
+	    mvd->vdev_ops == &vdev_replacing_ops);
+
+	vdev_remove_child(mvd, cvd);
+	vdev_remove_child(pvd, mvd);
+	/* cvd takes over mvd's old slot in the grandparent. */
+	cvd->vdev_id = mvd->vdev_id;
+	vdev_add_child(pvd, cvd);
+	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
+
+	if (cvd == cvd->vdev_top)
+		vdev_top_transfer(mvd, cvd);
+
+	ASSERT(mvd->vdev_children == 0);
+	vdev_free(mvd);
+}
+
+/*
+ * (Re)initialize the metaslab arrays of a top-level vdev to cover its
+ * current asize, preserving any existing metaslabs and creating new
+ * ones for the added range.  txg == 0 means "loading an existing pool"
+ * and triggers reading the space-map objects from the MOS.
+ */
+void
+vdev_metaslab_init(vdev_t *vd, uint64_t txg)
+{
+	spa_t *spa = vd->vdev_spa;
+	metaslab_class_t *mc = spa_metaslab_class_select(spa);
+	uint64_t c;
+	uint64_t oldc = vd->vdev_ms_count;
+	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
+	space_map_obj_t *smo = vd->vdev_smo;
+	metaslab_t **mspp = vd->vdev_ms;
+
+	dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);
+
+	ASSERT(oldc <= newc);
+
+	vd->vdev_smo = kmem_zalloc(newc * sizeof (*smo), KM_SLEEP);
+	vd->vdev_ms = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
+	vd->vdev_ms_count = newc;
+
+	if (vd->vdev_mg == NULL) {
+		if (txg == 0) {
+			dmu_buf_t *db;
+			uint64_t *ms_array;
+
+			ms_array = kmem_zalloc(newc * sizeof (uint64_t),
+			    KM_SLEEP);
+
+			/* Read the array of space-map object numbers. */
+			dmu_read(spa->spa_meta_objset, vd->vdev_ms_array,
+			    0, newc * sizeof (uint64_t), ms_array);
+
+			for (c = 0; c < newc; c++) {
+				/* 0 == no space map written for this slot. */
+				if (ms_array[c] == 0)
+					continue;
+				db = dmu_bonus_hold(spa->spa_meta_objset,
+				    ms_array[c]);
+				dmu_buf_read(db);
+				ASSERT3U(db->db_size, ==, sizeof (*smo));
+				bcopy(db->db_data, &vd->vdev_smo[c],
+				    db->db_size);
+				ASSERT3U(vd->vdev_smo[c].smo_object, ==,
+				    ms_array[c]);
+				dmu_buf_rele(db);
+			}
+			kmem_free(ms_array, newc * sizeof (uint64_t));
+		}
+		vd->vdev_mg = metaslab_group_create(mc, vd);
+	}
+
+	/* Carry existing metaslabs over into the new (larger) arrays. */
+	for (c = 0; c < oldc; c++) {
+		vd->vdev_smo[c] = smo[c];
+		vd->vdev_ms[c] = mspp[c];
+		mspp[c]->ms_smo = &vd->vdev_smo[c];
+	}
+
+	for (c = oldc; c < newc; c++)
+		metaslab_init(vd->vdev_mg, &vd->vdev_smo[c], &vd->vdev_ms[c],
+		    c << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
+
+	if (oldc != 0) {
+		kmem_free(smo, oldc * sizeof (*smo));
+		kmem_free(mspp, oldc * sizeof (*mspp));
+	}
+
+}
+
+/*
+ * Tear down a vdev's metaslabs and free the vdev_ms/vdev_smo arrays.
+ * Safe to call when the arrays were never allocated.
+ */
+void
+vdev_metaslab_fini(vdev_t *vd)
+{
+ uint64_t m;
+ uint64_t count = vd->vdev_ms_count;
+
+ if (vd->vdev_ms != NULL) {
+ for (m = 0; m < count; m++)
+ metaslab_fini(vd->vdev_ms[m]);
+ kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
+ vd->vdev_ms = NULL;
+ }
+
+ if (vd->vdev_smo != NULL) {
+ kmem_free(vd->vdev_smo, count * sizeof (space_map_obj_t));
+ vd->vdev_smo = NULL;
+ }
+}
+
+/*
+ * Prepare a virtual device for access.
+ *
+ * Opens the device via its vdev_op_open method, updates vdev_state
+ * (HEALTHY/DEGRADED/CANT_OPEN/OFFLINE), and validates or records the
+ * device's usable size (asize) and alignment shift (ashift).
+ * Returns 0 on success or an errno on failure.
+ */
+int
+vdev_open(vdev_t *vd)
+{
+ int error;
+ vdev_knob_t *vk;
+ int c;
+ uint64_t osize = 0;
+ uint64_t asize, psize;
+ uint64_t ashift = -1ULL;
+
+ ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
+ vd->vdev_state == VDEV_STATE_CANT_OPEN ||
+ vd->vdev_state == VDEV_STATE_OFFLINE);
+
+ /*
+ * Decay fault-injection state across reopens: halve the remaining
+ * fault count, or clear any other fault mode entirely.
+ */
+ if (vd->vdev_fault_mode == VDEV_FAULT_COUNT)
+ vd->vdev_fault_arg >>= 1;
+ else
+ vd->vdev_fault_mode = VDEV_FAULT_NONE;
+
+ vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
+
+ /* Reset every tunable knob to its default, clamped to [min, max]. */
+ for (vk = vdev_knob_next(NULL); vk != NULL; vk = vdev_knob_next(vk)) {
+ uint64_t *valp = (uint64_t *)((char *)vd + vk->vk_offset);
+
+ *valp = vk->vk_default;
+ *valp = MAX(*valp, vk->vk_min);
+ *valp = MIN(*valp, vk->vk_max);
+ }
+
+ /* Leaf vdevs get a read-ahead cache and an I/O scheduling queue. */
+ if (vd->vdev_ops->vdev_op_leaf) {
+ vdev_cache_init(vd);
+ vdev_queue_init(vd);
+ vd->vdev_cache_active = B_TRUE;
+ }
+
+ if (vd->vdev_offline) {
+ ASSERT(vd->vdev_children == 0);
+ dprintf("OFFLINE: %s = ENXIO\n", vdev_description(vd));
+ vd->vdev_state = VDEV_STATE_OFFLINE;
+ return (ENXIO);
+ }
+
+ error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
+
+ dprintf("%s = %d, osize %llu, state = %d\n",
+ vdev_description(vd), error, osize, vd->vdev_state);
+
+ if (error) {
+ dprintf("%s in %s failed to open, error %d, aux %d\n",
+ vdev_description(vd),
+ vdev_description(vd->vdev_parent),
+ error,
+ vd->vdev_stat.vs_aux);
+
+ vd->vdev_state = VDEV_STATE_CANT_OPEN;
+ return (error);
+ }
+
+ /* Healthy unless any child failed to come up fully. */
+ vd->vdev_state = VDEV_STATE_HEALTHY;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY)
+ vd->vdev_state = VDEV_STATE_DEGRADED;
+
+ osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
+
+ /*
+ * Leaves must reserve room for the front and back vdev labels;
+ * interior vdevs report the size their children already netted out.
+ */
+ if (vd->vdev_children == 0) {
+ if (osize < SPA_MINDEVSIZE) {
+ vd->vdev_state = VDEV_STATE_CANT_OPEN;
+ vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
+ return (EOVERFLOW);
+ }
+ psize = osize;
+ asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
+ } else {
+ if (osize < SPA_MINDEVSIZE -
+ (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
+ vd->vdev_state = VDEV_STATE_CANT_OPEN;
+ vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
+ return (EOVERFLOW);
+ }
+ psize = 0;
+ asize = osize;
+ }
+
+ vd->vdev_psize = psize;
+
+ if (vd->vdev_asize == 0) {
+ /*
+ * This is the first-ever open, so use the computed values.
+ */
+ vd->vdev_asize = asize;
+ vd->vdev_ashift = ashift;
+ } else {
+ /*
+ * Make sure the alignment requirement hasn't increased.
+ */
+ if (ashift > vd->vdev_ashift) {
+ dprintf("%s: ashift grew\n", vdev_description(vd));
+ vd->vdev_state = VDEV_STATE_CANT_OPEN;
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+
+ /*
+ * Make sure the device hasn't shrunk.
+ */
+ if (asize < vd->vdev_asize) {
+ dprintf("%s: device shrank\n", vdev_description(vd));
+ vd->vdev_state = VDEV_STATE_CANT_OPEN;
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+
+ /*
+ * If all children are healthy and the asize has increased,
+ * then we've experienced dynamic LUN growth.
+ */
+ if (vd->vdev_state == VDEV_STATE_HEALTHY &&
+ asize > vd->vdev_asize) {
+ dprintf("%s: device grew\n", vdev_description(vd));
+ vd->vdev_asize = asize;
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Close a virtual device.
+ *
+ * Requires that no I/O is pending on the vdev.  Tears down the cache
+ * and queue created by vdev_open() and moves the state to OFFLINE or
+ * CLOSED depending on the vdev's offline setting.
+ */
+void
+vdev_close(vdev_t *vd)
+{
+ ASSERT3P(list_head(&vd->vdev_io_pending), ==, NULL);
+
+ vd->vdev_ops->vdev_op_close(vd);
+
+ if (vd->vdev_cache_active) {
+ vdev_cache_fini(vd);
+ vdev_queue_fini(vd);
+ vd->vdev_cache_active = B_FALSE;
+ }
+
+ if (vd->vdev_offline)
+ vd->vdev_state = VDEV_STATE_OFFLINE;
+ else
+ vd->vdev_state = VDEV_STATE_CLOSED;
+}
+
+/*
+ * Close and reopen a top-level vdev (or, if vd is the root vdev,
+ * every top-level vdev in the pool).  If 'rq' is non-NULL, the caller
+ * receives ownership of the vdev's pending retry I/O list.
+ * Afterwards the root vdev's state is recomputed as the minimum of
+ * its children's states.
+ */
+void
+vdev_reopen(vdev_t *vd, zio_t **rq)
+{
+ vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+ int c;
+
+ if (vd == rvd) {
+ ASSERT(rq == NULL);
+ for (c = 0; c < rvd->vdev_children; c++)
+ vdev_reopen(rvd->vdev_child[c], NULL);
+ return;
+ }
+
+ /* only valid for top-level vdevs */
+ ASSERT3P(vd, ==, vd->vdev_top);
+
+ /*
+ * vdev_state can change when spa_config_lock is held as writer,
+ * or when it's held as reader and we're doing a vdev_reopen().
+ * To handle the latter case, we grab rvd's io_lock to serialize
+ * reopens. This ensures that there's never more than one vdev
+ * state changer active at a time.
+ */
+ mutex_enter(&rvd->vdev_io_lock);
+
+ /* Wait for in-flight I/O to drain before closing the device. */
+ mutex_enter(&vd->vdev_io_lock);
+ while (list_head(&vd->vdev_io_pending) != NULL)
+ cv_wait(&vd->vdev_io_cv, &vd->vdev_io_lock);
+ vdev_close(vd);
+ (void) vdev_open(vd);
+ if (rq != NULL) {
+ *rq = vd->vdev_io_retry;
+ vd->vdev_io_retry = NULL;
+ }
+ mutex_exit(&vd->vdev_io_lock);
+
+ /*
+ * Reassess root vdev's health.
+ */
+ rvd->vdev_state = VDEV_STATE_HEALTHY;
+ for (c = 0; c < rvd->vdev_children; c++) {
+ uint64_t state = rvd->vdev_child[c]->vdev_state;
+ rvd->vdev_state = MIN(rvd->vdev_state, state);
+ }
+
+ mutex_exit(&rvd->vdev_io_lock);
+}
+
+/*
+ * Open a vdev for the first time as part of pool/vdev creation and
+ * write initial labels to every leaf.  Unlike normal opens, any
+ * component that fails to open fails the whole create.
+ * Returns 0 on success or an errno (the vdev is closed on failure).
+ */
+int
+vdev_create(vdev_t *vd, uint64_t txg)
+{
+ int error;
+
+ /*
+ * Normally, partial opens (e.g. of a mirror) are allowed.
+ * For a create, however, we want to fail the request if
+ * there are any components we can't open.
+ */
+ error = vdev_open(vd);
+
+ if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
+ vdev_close(vd);
+ return (error ? error : ENXIO);
+ }
+
+ /*
+ * Recursively initialize all labels.
+ */
+ if ((error = vdev_label_init(vd, txg)) != 0) {
+ vdev_close(vd);
+ return (error);
+ }
+
+ return (0);
+}
+
+/*
+ * This is the latter half of vdev_create(). It is distinct because it
+ * involves initiating transactions in order to do metaslab creation.
+ * For creation, we want to try to create all vdevs at once and then undo it
+ * if anything fails; this is much harder if we have pending transactions.
+ */
+void
+vdev_init(vdev_t *vd, uint64_t txg)
+{
+ /*
+ * Aim for roughly 200 metaslabs per vdev.
+ */
+ vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
+ vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
+
+ /*
+ * Initialize the vdev's metaslabs.
+ */
+ vdev_metaslab_init(vd, txg);
+}
+
+/*
+ * Mark a vdev's top-level ancestor dirty for the given txg with the
+ * given VDD_* flags, adding it to the per-pool vdev txg list the first
+ * time any flag is set for that txg.
+ */
+void
+vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg)
+{
+ vdev_t *tvd = vd->vdev_top;
+
+ mutex_enter(&tvd->vdev_dirty_lock);
+ if ((tvd->vdev_dirty[txg & TXG_MASK] & flags) != flags) {
+ tvd->vdev_dirty[txg & TXG_MASK] |= flags;
+ (void) txg_list_add(&tvd->vdev_spa->spa_vdev_txg_list,
+ tvd, txg);
+ }
+ mutex_exit(&tvd->vdev_dirty_lock);
+}
+
+/*
+ * Add the range [txg, txg + size) to a DTL (dirty time log) space map,
+ * unless it is already wholly contained.
+ */
+void
+vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
+{
+ mutex_enter(sm->sm_lock);
+ if (!space_map_contains(sm, txg, size))
+ space_map_add(sm, txg, size);
+ mutex_exit(sm->sm_lock);
+}
+
+/*
+ * Return nonzero if the DTL space map contains [txg, txg + size).
+ */
+int
+vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
+{
+ int dirty;
+
+ /*
+ * Quick test without the lock -- covers the common case that
+ * there are no dirty time segments.
+ */
+ if (sm->sm_space == 0)
+ return (0);
+
+ mutex_enter(sm->sm_lock);
+ dirty = space_map_contains(sm, txg, size);
+ mutex_exit(sm->sm_lock);
+
+ return (dirty);
+}
+
+/*
+ * Reassess DTLs after a config change or scrub completion.
+ *
+ * For a leaf: excise everything below scrub_txg from the DTL and fold in
+ * whatever couldn't be scrubbed, then (if txg != 0) schedule the DTL for
+ * syncing.  For an interior vdev: rebuild its DTLs as the union of its
+ * children's, recursing first.  Caller holds the config lock as writer.
+ */
+void
+vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
+{
+ int c;
+
+ ASSERT(spa_config_held(vd->vdev_spa, RW_WRITER));
+
+ if (vd->vdev_children == 0) {
+ mutex_enter(&vd->vdev_dtl_lock);
+ /*
+ * We've successfully scrubbed everything up to scrub_txg.
+ * Therefore, excise all old DTLs up to that point, then
+ * fold in the DTLs for everything we couldn't scrub.
+ */
+ if (scrub_txg != 0) {
+ space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
+ space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
+ }
+ if (scrub_done)
+ space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
+ mutex_exit(&vd->vdev_dtl_lock);
+ if (txg != 0) {
+ vdev_t *tvd = vd->vdev_top;
+ vdev_dirty(tvd, VDD_DTL, txg);
+ (void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
+ }
+ return;
+ }
+
+ /* Interior vdev: recompute DTLs from scratch as union of children. */
+ mutex_enter(&vd->vdev_dtl_lock);
+ space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
+ space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
+ mutex_enter(&vd->vdev_dtl_lock);
+ space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
+ space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
+ mutex_exit(&vd->vdev_dtl_lock);
+ }
+}
+
+/*
+ * Load a leaf vdev's DTL from disk: read the space map object header
+ * out of its bonus buffer, then load the allocated segments into the
+ * in-core vdev_dtl_map.  Returns 0 if there is no DTL object, else the
+ * space_map_load() result.
+ */
+static int
+vdev_dtl_load(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ space_map_obj_t *smo = &vd->vdev_dtl;
+ dmu_buf_t *db;
+ int error;
+
+ ASSERT(vd->vdev_children == 0);
+
+ if (smo->smo_object == 0)
+ return (0);
+
+ db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
+ dmu_buf_read(db);
+ ASSERT3U(db->db_size, ==, sizeof (*smo));
+ bcopy(db->db_data, smo, db->db_size);
+ dmu_buf_rele(db);
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ error = space_map_load(&vd->vdev_dtl_map, smo, SM_ALLOC,
+ spa->spa_meta_objset, smo->smo_objsize, smo->smo_alloc);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ return (error);
+}
+
+/*
+ * Write a leaf vdev's DTL out to disk for the given txg.
+ *
+ * If the vdev has been detached, its DTL object is freed instead.
+ * Otherwise the old on-disk contents are freed and the current in-core
+ * DTL is copied (under a private lock, so vdev_dtl_lock is held only
+ * briefly) into a temporary space map which is then synced, and the
+ * updated space map header is written back to the bonus buffer.
+ */
+void
+vdev_dtl_sync(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ space_map_obj_t *smo = &vd->vdev_dtl;
+ space_map_t *sm = &vd->vdev_dtl_map;
+ space_map_t smsync;
+ kmutex_t smlock;
+ avl_tree_t *t = &sm->sm_root;
+ space_seg_t *ss;
+ dmu_buf_t *db;
+ dmu_tx_t *tx;
+
+ dprintf("%s in txg %llu pass %d\n",
+ vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ if (vd->vdev_detached) {
+ if (smo->smo_object != 0) {
+ int err = dmu_object_free(spa->spa_meta_objset,
+ smo->smo_object, tx);
+ ASSERT3U(err, ==, 0);
+ smo->smo_object = 0;
+ }
+ dmu_tx_commit(tx);
+ return;
+ }
+
+ if (smo->smo_object == 0) {
+ ASSERT(smo->smo_objsize == 0);
+ ASSERT(smo->smo_alloc == 0);
+ /* First DTL write for this vdev: allocate the object. */
+ smo->smo_object = dmu_object_alloc(spa->spa_meta_objset,
+ DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
+ DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
+ ASSERT(smo->smo_object != 0);
+ vdev_config_dirty(vd->vdev_top);
+ }
+
+ /* Rewrite from scratch: free the old on-disk representation. */
+ dmu_free_range(spa->spa_meta_objset, smo->smo_object,
+ 0, smo->smo_objsize, tx);
+
+ mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
+
+ space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
+ &smlock);
+
+ mutex_enter(&smlock);
+
+ /* Snapshot the live DTL into the private map under vdev_dtl_lock. */
+ mutex_enter(&vd->vdev_dtl_lock);
+ for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss))
+ space_map_add(&smsync, ss->ss_start, ss->ss_end - ss->ss_start);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ smo->smo_objsize = 0;
+ smo->smo_alloc = smsync.sm_space;
+
+ space_map_sync(&smsync, NULL, smo, SM_ALLOC, spa->spa_meta_objset, tx);
+ space_map_destroy(&smsync);
+
+ mutex_exit(&smlock);
+ mutex_destroy(&smlock);
+
+ /* Persist the updated space map header in the bonus buffer. */
+ db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
+ dmu_buf_will_dirty(db, tx);
+ ASSERT3U(db->db_size, ==, sizeof (*smo));
+ bcopy(smo, db->db_data, db->db_size);
+ dmu_buf_rele(db);
+
+ dmu_tx_commit(tx);
+}
+
+/*
+ * Load a vdev subtree's persistent state during pool open/import.
+ *
+ * Recursively loads children, validates each leaf's label (pool GUID,
+ * vdev GUID, pool state), initializes metaslabs for top-level vdevs,
+ * and loads each leaf's DTL.  Most validation failures are reported by
+ * setting the vdev state to CANT_OPEN and returning 0; EBADF is
+ * returned only when the pool was exported/destroyed behind our back,
+ * so the caller can drop it from the config cache.
+ */
+int
+vdev_load(vdev_t *vd, int import)
+{
+ spa_t *spa = vd->vdev_spa;
+ int c, error;
+ nvlist_t *label;
+ uint64_t guid, state;
+
+ dprintf("loading %s\n", vdev_description(vd));
+
+ /*
+ * Recursively load all children.
+ */
+ for (c = 0; c < vd->vdev_children; c++)
+ if ((error = vdev_load(vd->vdev_child[c], import)) != 0)
+ return (error);
+
+ /*
+ * If this is a leaf vdev, make sure it agrees with its disk labels.
+ */
+ if (vd->vdev_ops->vdev_op_leaf) {
+
+ if (vdev_is_dead(vd))
+ return (0);
+
+ /*
+ * XXX state transitions don't propagate to parent here.
+ * Also, merely setting the state isn't sufficient because
+ * it's not persistent; a vdev_reopen() would make us
+ * forget all about it.
+ */
+ if ((label = vdev_label_read_config(vd)) == NULL) {
+ dprintf("can't load label config\n");
+ vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ return (0);
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
+ &guid) != 0 || guid != spa_guid(spa)) {
+ dprintf("bad or missing pool GUID (%llu)\n", guid);
+ vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ return (0);
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) ||
+ guid != vd->vdev_guid) {
+ dprintf("bad or missing vdev guid (%llu != %llu)\n",
+ guid, vd->vdev_guid);
+ vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ return (0);
+ }
+
+ /*
+ * If we find a vdev with a matching pool guid and vdev guid,
+ * but the pool state is not active, it indicates that the user
+ * exported or destroyed the pool without affecting the config
+ * cache (if / was mounted readonly, for example). In this
+ * case, immediately return EBADF so the caller can remove it
+ * from the config.
+ */
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ &state)) {
+ dprintf("missing pool state\n");
+ vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ return (0);
+ }
+
+ if (state != POOL_STATE_ACTIVE &&
+ (!import || state != POOL_STATE_EXPORTED)) {
+ dprintf("pool state not active (%llu)\n", state);
+ nvlist_free(label);
+ return (EBADF);
+ }
+
+ nvlist_free(label);
+ }
+
+ /*
+ * If this is a top-level vdev, make sure its allocation parameters
+ * exist and initialize its metaslabs.
+ */
+ if (vd == vd->vdev_top) {
+
+ if (vd->vdev_ms_array == 0 ||
+ vd->vdev_ms_shift == 0 ||
+ vd->vdev_ashift == 0 ||
+ vd->vdev_asize == 0) {
+ vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ return (0);
+ }
+
+ vdev_metaslab_init(vd, 0);
+ }
+
+ /*
+ * If this is a leaf vdev, load its DTL.
+ */
+ if (vd->vdev_ops->vdev_op_leaf) {
+ error = vdev_dtl_load(vd);
+ if (error) {
+ dprintf("can't load DTL for %s, error %d\n",
+ vdev_description(vd), error);
+ vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ return (0);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Post-sync cleanup for a vdev: run metaslab_sync_done() on every
+ * metaslab that was synced in this (now clean) txg.
+ */
+void
+vdev_sync_done(vdev_t *vd, uint64_t txg)
+{
+ metaslab_t *msp;
+
+ dprintf("%s txg %llu\n", vdev_description(vd), txg);
+
+ while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
+ metaslab_sync_done(msp, txg);
+}
+
+/*
+ * Sync-context work for a newly added top-level vdev: allocate its
+ * on-disk metaslab object array (if not yet allocated) and dirty the
+ * vdev config so the addition is recorded.
+ */
+void
+vdev_add_sync(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ ASSERT(vd == vd->vdev_top);
+
+ if (vd->vdev_ms_array == 0)
+ vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
+ DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
+
+ ASSERT(vd->vdev_ms_array != 0);
+
+ vdev_config_dirty(vd);
+
+ dmu_tx_commit(tx);
+}
+
+/*
+ * Sync a top-level vdev for the given txg: consume and clear its dirty
+ * flags, handle a pending add (VDD_ADD), sync all dirty metaslabs and
+ * child DTLs, then requeue the vdev for post-sync cleanup.
+ */
+void
+vdev_sync(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *lvd;
+ metaslab_t *msp;
+ uint8_t *dirtyp = &vd->vdev_dirty[txg & TXG_MASK];
+ uint8_t dirty = *dirtyp;
+
+ /* Atomically snapshot and clear this txg's dirty flags. */
+ mutex_enter(&vd->vdev_dirty_lock);
+ *dirtyp &= ~(VDD_ALLOC | VDD_FREE | VDD_ADD | VDD_DTL);
+ mutex_exit(&vd->vdev_dirty_lock);
+
+ dprintf("%s txg %llu pass %d\n",
+ vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
+
+ if (dirty & VDD_ADD)
+ vdev_add_sync(vd, txg);
+
+ while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL)
+ metaslab_sync(msp, txg);
+
+ while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
+ vdev_dtl_sync(lvd, txg);
+
+ /* Schedule vdev_sync_done() for when this txg is clean. */
+ (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
+}
+
+/*
+ * Convert a physical size to the allocated size for this vdev type
+ * (e.g. accounting for replication overhead) via the vdev op vector.
+ */
+uint64_t
+vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
+{
+ return (vd->vdev_ops->vdev_op_asize(vd, psize));
+}
+
+/*
+ * Dispatch an I/O to the vdev-type-specific start routine.
+ */
+void
+vdev_io_start(zio_t *zio)
+{
+ zio->io_vd->vdev_ops->vdev_op_io_start(zio);
+}
+
+/*
+ * Dispatch I/O completion to the vdev-type-specific done routine.
+ */
+void
+vdev_io_done(zio_t *zio)
+{
+ zio->io_vd->vdev_ops->vdev_op_io_done(zio);
+}
+
+/*
+ * Return a human-readable description of a vdev for messages:
+ * its device path if it has one, the pool name for the root vdev,
+ * otherwise the vdev type name.  Never returns NULL.
+ */
+const char *
+vdev_description(vdev_t *vd)
+{
+ if (vd == NULL || vd->vdev_ops == NULL)
+ return ("<unknown>");
+
+ if (vd->vdev_path != NULL)
+ return (vd->vdev_path);
+
+ if (vd->vdev_parent == NULL)
+ return (spa_name(vd->vdev_spa));
+
+ return (vd->vdev_ops->vdev_op_type);
+}
+
+/*
+ * Bring the vdev identified by 'path' back online: clear its offline
+ * flag and error counters, reopen its top-level vdev, and kick off a
+ * resilver to catch up on anything missed while offline.
+ * Returns 0 on success or ENODEV if no vdev matches the path.
+ */
+int
+vdev_online(spa_t *spa, const char *path)
+{
+ vdev_t *vd;
+
+ spa_config_enter(spa, RW_WRITER);
+
+ if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
+ spa_config_exit(spa);
+ return (ENODEV);
+ }
+
+ dprintf("ONLINE: %s\n", vdev_description(vd));
+
+ vd->vdev_offline = B_FALSE;
+
+ /*
+ * Clear the error counts. The idea is that you expect to see all
+ * zeroes when everything is working, so if you've just onlined a
+ * device, you don't want to keep hearing about errors from before.
+ */
+ vd->vdev_stat.vs_read_errors = 0;
+ vd->vdev_stat.vs_write_errors = 0;
+ vd->vdev_stat.vs_checksum_errors = 0;
+
+ vdev_reopen(vd->vdev_top, NULL);
+
+ spa_config_exit(spa);
+
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+ return (0);
+}
+
+/*
+ * Take the vdev identified by 'path' offline.
+ * Fails with ENODEV if the path doesn't match a vdev, or EBUSY if
+ * offlining it would lose data (non-empty DTL) or render the top-level
+ * vdev unusable (in which case the change is rolled back).
+ */
+int
+vdev_offline(spa_t *spa, const char *path)
+{
+ vdev_t *vd;
+
+ spa_config_enter(spa, RW_WRITER);
+
+ if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
+ spa_config_exit(spa);
+ return (ENODEV);
+ }
+
+ dprintf("OFFLINE: %s\n", vdev_description(vd));
+
+ /*
+ * If this device's top-level vdev has a non-empty DTL,
+ * don't allow the device to be offlined.
+ *
+ * XXX -- we should make this more precise by allowing the offline
+ * as long as the remaining devices don't have any DTL holes.
+ */
+ if (vd->vdev_top->vdev_dtl_map.sm_space != 0) {
+ spa_config_exit(spa);
+ return (EBUSY);
+ }
+
+ /*
+ * Set this device to offline state and reopen its top-level vdev.
+ * If this action results in the top-level vdev becoming unusable,
+ * undo it and fail the request.
+ */
+ vd->vdev_offline = B_TRUE;
+ vdev_reopen(vd->vdev_top, NULL);
+ if (vdev_is_dead(vd->vdev_top)) {
+ vd->vdev_offline = B_FALSE;
+ vdev_reopen(vd->vdev_top, NULL);
+ spa_config_exit(spa);
+ return (EBUSY);
+ }
+
+ spa_config_exit(spa);
+
+ return (0);
+}
+
+/*
+ * Configure fault injection for the vdev identified by 'path':
+ * 'mode' selects the VDEV_FAULT_* behavior, 'mask' the zio types to
+ * affect, and 'arg' the mode-specific parameter (e.g. count or rate).
+ * Returns ENODEV if the path doesn't match a vdev.
+ */
+int
+vdev_error_setup(spa_t *spa, const char *path, int mode, int mask, uint64_t arg)
+{
+ vdev_t *vd;
+
+ spa_config_enter(spa, RW_WRITER);
+
+ if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
+ spa_config_exit(spa);
+ return (ENODEV);
+ }
+
+ vd->vdev_fault_mode = mode;
+ vd->vdev_fault_mask = mask;
+ vd->vdev_fault_arg = arg;
+
+ spa_config_exit(spa);
+
+ return (0);
+}
+
+/*
+ * Return nonzero if the vdev is unusable (CANT_OPEN or worse).
+ */
+int
+vdev_is_dead(vdev_t *vd)
+{
+ return (vd->vdev_state <= VDEV_STATE_CANT_OPEN);
+}
+
+/*
+ * Apply any configured fault injection to 'zio'.  Returns the error to
+ * inject (EIO) or 0 to let the I/O proceed normally.  In COUNT mode the
+ * remaining fault count is decremented and the mode self-clears when
+ * it reaches zero.
+ */
+int
+vdev_error_inject(vdev_t *vd, zio_t *zio)
+{
+ int error = 0;
+
+ if (vd->vdev_fault_mode == VDEV_FAULT_NONE)
+ return (0);
+
+ /* Only inject faults for the configured zio types. */
+ if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0)
+ return (0);
+
+ switch (vd->vdev_fault_mode) {
+ case VDEV_FAULT_RANDOM:
+ if (spa_get_random(vd->vdev_fault_arg) == 0)
+ error = EIO;
+ break;
+
+ case VDEV_FAULT_COUNT:
+ if ((int64_t)--vd->vdev_fault_arg <= 0)
+ vd->vdev_fault_mode = VDEV_FAULT_NONE;
+ error = EIO;
+ break;
+ }
+
+ if (error != 0) {
+ dprintf("returning %d for type %d on %s state %d offset %llx\n",
+ error, zio->io_type, vdev_description(vd),
+ vd->vdev_state, zio->io_offset);
+ }
+
+ return (error);
+}
+
+/*
+ * Get statistics for the given vdev.
+ *
+ * Copies vdev_stat into *vs under vdev_stat_lock, converting the stored
+ * timestamp into an elapsed time.  For the root vdev, the per-type I/O
+ * and error counts are additionally aggregated over all top-level vdevs.
+ */
+void
+vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
+{
+ vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+ int c, t;
+
+ mutex_enter(&vd->vdev_stat_lock);
+ bcopy(&vd->vdev_stat, vs, sizeof (*vs));
+ vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
+ vs->vs_state = vd->vdev_state;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ /*
+ * If we're getting stats on the root vdev, aggregate the I/O counts
+ * over all top-level vdevs (i.e. the direct children of the root).
+ */
+ if (vd == rvd) {
+ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *cvd = rvd->vdev_child[c];
+ vdev_stat_t *cvs = &cvd->vdev_stat;
+
+ mutex_enter(&vd->vdev_stat_lock);
+ for (t = 0; t < ZIO_TYPES; t++) {
+ vs->vs_ops[t] += cvs->vs_ops[t];
+ vs->vs_bytes[t] += cvs->vs_bytes[t];
+ }
+ vs->vs_read_errors += cvs->vs_read_errors;
+ vs->vs_write_errors += cvs->vs_write_errors;
+ vs->vs_checksum_errors += cvs->vs_checksum_errors;
+ vs->vs_scrub_examined += cvs->vs_scrub_examined;
+ vs->vs_scrub_errors += cvs->vs_scrub_errors;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+ }
+}
+
+/*
+ * Update a vdev's statistics on I/O completion.
+ *
+ * On success, bump op/byte counts and repair/self-heal byte totals.
+ * On failure, bump the appropriate error counter (unless the I/O was
+ * speculative or the vdev is already dead) and, for leaf writes,
+ * record the txg in the relevant DTLs so the data can be resilvered.
+ */
+void
+vdev_stat_update(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *pvd;
+ uint64_t txg = zio->io_txg;
+ vdev_stat_t *vs = &vd->vdev_stat;
+ zio_type_t type = zio->io_type;
+ int flags = zio->io_flags;
+
+ if (zio->io_error == 0) {
+ if (!(flags & ZIO_FLAG_IO_BYPASS)) {
+ mutex_enter(&vd->vdev_stat_lock);
+ vs->vs_ops[type]++;
+ vs->vs_bytes[type] += zio->io_size;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+ if ((flags & ZIO_FLAG_IO_REPAIR) &&
+ zio->io_delegate_list == NULL) {
+ mutex_enter(&vd->vdev_stat_lock);
+ if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))
+ vs->vs_scrub_repaired += zio->io_size;
+ else
+ vs->vs_self_healed += zio->io_size;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+ return;
+ }
+
+ /* Speculative (read-ahead) failures don't count as errors. */
+ if (flags & ZIO_FLAG_SPECULATIVE)
+ return;
+
+ if (!vdev_is_dead(vd)) {
+ mutex_enter(&vd->vdev_stat_lock);
+ if (type == ZIO_TYPE_READ) {
+ if (zio->io_error == ECKSUM)
+ vs->vs_checksum_errors++;
+ else
+ vs->vs_read_errors++;
+ }
+ if (type == ZIO_TYPE_WRITE)
+ vs->vs_write_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+
+ if (type == ZIO_TYPE_WRITE) {
+ /* DTL updates apply only to leaf vdevs with a known txg. */
+ if (txg == 0 || vd->vdev_children != 0)
+ return;
+ if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
+ ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+ for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
+ vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
+ }
+ if (!(flags & ZIO_FLAG_IO_REPAIR)) {
+ vdev_t *tvd = vd->vdev_top;
+ if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
+ return;
+ vdev_dirty(tvd, VDD_DTL, txg);
+ (void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
+ for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
+ vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
+ }
+ }
+}
+
+/*
+ * Recursively update scrub statistics for a vdev subtree.
+ * POOL_SCRUB_NONE records the end of a scrub (completion status and end
+ * time); any other type records the start of a new scrub and resets the
+ * progress counters.
+ */
+void
+vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
+{
+ int c;
+ vdev_stat_t *vs = &vd->vdev_stat;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
+
+ mutex_enter(&vd->vdev_stat_lock);
+
+ if (type == POOL_SCRUB_NONE) {
+ /*
+ * Update completion and end time. Leave everything else alone
+ * so we can report what happened during the previous scrub.
+ */
+ vs->vs_scrub_complete = complete;
+ vs->vs_scrub_end = gethrestime_sec();
+ } else {
+ vs->vs_scrub_type = type;
+ vs->vs_scrub_complete = 0;
+ vs->vs_scrub_examined = 0;
+ vs->vs_scrub_repaired = 0;
+ vs->vs_scrub_errors = 0;
+ vs->vs_scrub_start = gethrestime_sec();
+ vs->vs_scrub_end = 0;
+ }
+
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
+/*
+ * Report checksum errors that a vdev didn't realize it made.
+ * This can happen, for example, when RAID-Z combinatorial reconstruction
+ * infers that one of its components returned bad data.
+ */
+void
+vdev_checksum_error(zio_t *zio, vdev_t *vd)
+{
+ dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
+ vdev_description(vd));
+
+ /* Speculative I/O errors are not charged against the vdev. */
+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+}
+
+/*
+ * Update the in-core space usage stats for this vdev and the root vdev.
+ *
+ * Applies the (signed, passed as uint64_t) deltas to this top-level
+ * vdev and walks up to every ancestor, so totals stay consistent at
+ * each level of the tree.
+ */
+void
+vdev_space_update(vdev_t *vd, uint64_t space_delta, uint64_t alloc_delta)
+{
+ ASSERT(vd == vd->vdev_top);
+
+ do {
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_space += space_delta;
+ vd->vdev_stat.vs_alloc += alloc_delta;
+ mutex_exit(&vd->vdev_stat_lock);
+ } while ((vd = vd->vdev_parent) != NULL);
+}
+
+/*
+ * Various knobs to tune a vdev.
+ *
+ * Each entry is { name, description, min, max, default, offset }, where
+ * 'offset' locates the uint64_t field within struct vdev that the knob
+ * controls.  vdev_open() resets every knob to its clamped default.
+ */
+static vdev_knob_t vdev_knob[] = {
+ {
+ "cache_size",
+ "size of the read-ahead cache",
+ 0,
+ 1ULL << 30,
+ 10ULL << 20,
+ offsetof(struct vdev, vdev_cache.vc_size)
+ },
+ {
+ "cache_bshift",
+ "log2 of cache blocksize",
+ SPA_MINBLOCKSHIFT,
+ SPA_MAXBLOCKSHIFT,
+ 16,
+ offsetof(struct vdev, vdev_cache.vc_bshift)
+ },
+ {
+ "cache_max",
+ "largest block size to cache",
+ 0,
+ SPA_MAXBLOCKSIZE,
+ 1ULL << 14,
+ offsetof(struct vdev, vdev_cache.vc_max)
+ },
+ {
+ "min_pending",
+ "minimum pending I/Os to the disk",
+ 1,
+ 10000,
+ 2,
+ offsetof(struct vdev, vdev_queue.vq_min_pending)
+ },
+ {
+ "max_pending",
+ "maximum pending I/Os to the disk",
+ 1,
+ 10000,
+ 35,
+ offsetof(struct vdev, vdev_queue.vq_max_pending)
+ },
+ {
+ "agg_limit",
+ "maximum size of aggregated I/Os",
+ 0,
+ SPA_MAXBLOCKSIZE,
+ SPA_MAXBLOCKSIZE,
+ offsetof(struct vdev, vdev_queue.vq_agg_limit)
+ },
+ {
+ "time_shift",
+ "deadline = pri + (lbolt >> time_shift)",
+ 0,
+ 63,
+ 4,
+ offsetof(struct vdev, vdev_queue.vq_time_shift)
+ },
+ {
+ "ramp_rate",
+ "exponential I/O issue ramp-up rate",
+ 1,
+ 10000,
+ 2,
+ offsetof(struct vdev, vdev_queue.vq_ramp_rate)
+ },
+};
+
+/*
+ * Iterate over the vdev_knob[] table: pass NULL to get the first entry,
+ * a previous entry to get the next, NULL return marks the end.
+ */
+vdev_knob_t *
+vdev_knob_next(vdev_knob_t *vk)
+{
+ if (vk == NULL)
+ return (vdev_knob);
+
+ if (++vk == vdev_knob + sizeof (vdev_knob) / sizeof (vdev_knob_t))
+ return (NULL);
+
+ return (vk);
+}
+
+/*
+ * Mark a top-level vdev's config as dirty, placing it on the dirty list
+ * so that it will be written out next time the vdev configuration is synced.
+ * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
+ */
+void
+vdev_config_dirty(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ int c;
+
+ if (vd == rvd) {
+ for (c = 0; c < rvd->vdev_children; c++)
+ vdev_config_dirty(rvd->vdev_child[c]);
+ } else {
+ ASSERT(vd == vd->vdev_top);
+
+ /* Idempotent: only list the vdev once. */
+ if (!vd->vdev_is_dirty) {
+ list_insert_head(&spa->spa_dirty_list, vd);
+ vd->vdev_is_dirty = B_TRUE;
+ }
+ }
+}
+
+/*
+ * Remove a top-level vdev from the pool's config dirty list.
+ * The vdev must currently be marked dirty.
+ */
+void
+vdev_config_clean(vdev_t *vd)
+{
+ ASSERT(vd->vdev_is_dirty);
+
+ list_remove(&vd->vdev_spa->spa_dirty_list, vd);
+ vd->vdev_is_dirty = B_FALSE;
+}
+
+/*
+ * Set a vdev's state, updating any parent's state as well.
+ *
+ * After recording the new state and aux reason, the parent's faulted
+ * and degraded child counts are recomputed and handed to the parent's
+ * vdev_op_state_change method so interior vdev state stays consistent.
+ */
+void
+vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux)
+{
+ if (state == vd->vdev_state)
+ return;
+
+ vd->vdev_state = state;
+ vd->vdev_stat.vs_aux = aux;
+
+ if (vd->vdev_parent != NULL) {
+ int c;
+ int degraded = 0, faulted = 0;
+ vdev_t *parent, *child;
+
+ parent = vd->vdev_parent;
+ for (c = 0; c < parent->vdev_children; c++) {
+ child = parent->vdev_child[c];
+ if (child->vdev_state <= VDEV_STATE_CANT_OPEN)
+ faulted++;
+ else if (child->vdev_state == VDEV_STATE_DEGRADED)
+ degraded++;
+ }
+
+ vd->vdev_parent->vdev_ops->vdev_op_state_change(
+ vd->vdev_parent, faulted, degraded);
+ }
+}
diff --git a/usr/src/uts/common/fs/zfs/vdev_cache.c b/usr/src/uts/common/fs/zfs/vdev_cache.c
new file mode 100644
index 0000000000..e1e7c1a36f
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_cache.c
@@ -0,0 +1,374 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+
+/*
+ * Virtual device read-ahead caching.
+ *
+ * This file implements a simple LRU read-ahead cache. When the DMU reads
+ * a given block, it will often want other, nearby blocks soon thereafter.
+ * We take advantage of this by reading a larger disk region and caching
+ * the result. In the best case, this can turn 256 back-to-back 512-byte
+ * reads into a single 128k read followed by 255 cache hits; this reduces
+ * latency dramatically. In the worst case, it can turn an isolated 512-byte
+ * read into a 128k read, which doesn't affect latency all that much but is
+ * terribly wasteful of bandwidth. A more intelligent version of the cache
+ * could keep track of access patterns and not do read-ahead unless it sees
+ * at least two temporally close I/Os to the same region. It could also
+ * take advantage of semantic information about the I/O. And it could use
+ * something faster than an AVL tree; that was chosen solely for convenience.
+ *
+ * There are five cache operations: allocate, fill, read, write, evict.
+ *
+ * (1) Allocate. This reserves a cache entry for the specified region.
+ * We separate the allocate and fill operations so that multiple threads
+ * don't generate I/O for the same cache miss.
+ *
+ * (2) Fill. When the I/O for a cache miss completes, the fill routine
+ * places the data in the previously allocated cache entry.
+ *
+ * (3) Read. Read data from the cache.
+ *
+ * (4) Write. Update cache contents after write completion.
+ *
+ * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry
+ * if the total cache size exceeds vc_size.
+ */
+
+/*
+ * AVL comparator for vc_offset_tree: order cache entries by device offset.
+ */
+static int
+vdev_cache_offset_compare(const void *a1, const void *a2)
+{
+	const vdev_cache_entry_t *ve1 = a1;
+	const vdev_cache_entry_t *ve2 = a2;
+
+	if (ve1->ve_offset < ve2->ve_offset)
+		return (-1);
+	if (ve1->ve_offset > ve2->ve_offset)
+		return (1);
+	return (0);
+}
+
+/*
+ * AVL comparator for vc_lastused_tree: order entries by last-use time,
+ * so avl_first() yields the least-recently-used entry for eviction.
+ */
+static int
+vdev_cache_lastused_compare(const void *a1, const void *a2)
+{
+	const vdev_cache_entry_t *ve1 = a1;
+	const vdev_cache_entry_t *ve2 = a2;
+
+	if (ve1->ve_lastused < ve2->ve_lastused)
+		return (-1);
+	if (ve1->ve_lastused > ve2->ve_lastused)
+		return (1);
+
+	/*
+	 * Among equally old entries, sort by offset to ensure uniqueness.
+	 */
+	return (vdev_cache_offset_compare(a1, a2));
+}
+
+/*
+ * Evict the specified entry from the cache: unlink it from both AVL
+ * trees and free its data buffer.  Caller must hold vc_lock, and the
+ * entry must not have a fill I/O in flight.
+ */
+static void
+vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
+{
+	ASSERT(MUTEX_HELD(&vc->vc_lock));
+	ASSERT(ve->ve_fill_io == NULL);
+	ASSERT(ve->ve_data != NULL);
+
+	dprintf("evicting %p, off %llx, LRU %llu, age %lu, hits %u, stale %u\n",
+	    vc, ve->ve_offset, ve->ve_lastused, lbolt - ve->ve_lastused,
+	    ve->ve_hits, ve->ve_missed_update);
+
+	avl_remove(&vc->vc_lastused_tree, ve);
+	avl_remove(&vc->vc_offset_tree, ve);
+	zio_buf_free(ve->ve_data, vc->vc_blocksize);
+	kmem_free(ve, sizeof (vdev_cache_entry_t));
+}
+
+/*
+ * Allocate an entry in the cache.  At this point we don't have the data,
+ * we're just creating a placeholder so that multiple threads don't all
+ * go off and read the same blocks.  Caller must hold vc_lock.  Returns
+ * NULL if the cache is disabled or the LRU victim is still being filled.
+ */
+static vdev_cache_entry_t *
+vdev_cache_allocate(zio_t *zio)
+{
+	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+	uint64_t offset = P2ALIGN(zio->io_offset, vc->vc_blocksize);
+	vdev_cache_entry_t *ve;
+
+	ASSERT(MUTEX_HELD(&vc->vc_lock));
+
+	/* vc_size == 0 means caching is disabled for this vdev. */
+	if (vc->vc_size == 0)
+		return (NULL);
+
+	/*
+	 * If adding a new entry would exceed the cache size,
+	 * evict the oldest entry (LRU).
+	 */
+	if ((avl_numnodes(&vc->vc_lastused_tree) << vc->vc_bshift) >
+	    vc->vc_size) {
+		ve = avl_first(&vc->vc_lastused_tree);
+		if (ve->ve_fill_io != NULL) {
+			/* Victim is mid-fill; can't evict it safely. */
+			dprintf("can't evict in %p, still filling\n", vc);
+			return (NULL);
+		}
+		ASSERT(ve->ve_hits != 0);
+		vdev_cache_evict(vc, ve);
+	}
+
+	ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
+	ve->ve_offset = offset;
+	ve->ve_lastused = lbolt;
+	ve->ve_data = zio_buf_alloc(vc->vc_blocksize);
+
+	avl_add(&vc->vc_offset_tree, ve);
+	avl_add(&vc->vc_lastused_tree, ve);
+
+	return (ve);
+}
+
+/*
+ * Satisfy a read zio from a filled cache entry: refresh the entry's
+ * LRU position and copy the requested sub-range into the zio's buffer.
+ * Caller must hold vc_lock.
+ */
+static void
+vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
+{
+	uint64_t cache_phase = P2PHASE(zio->io_offset, vc->vc_blocksize);
+
+	ASSERT(MUTEX_HELD(&vc->vc_lock));
+	ASSERT(ve->ve_fill_io == NULL);
+
+	/* Re-key in the LRU tree only if the timestamp actually changes. */
+	if (ve->ve_lastused != lbolt) {
+		avl_remove(&vc->vc_lastused_tree, ve);
+		ve->ve_lastused = lbolt;
+		avl_add(&vc->vc_lastused_tree, ve);
+	}
+
+	ve->ve_hits++;
+	bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size);
+}
+
+/*
+ * Fill a previously allocated cache entry with data.  This is the done
+ * callback of the cache-fill child I/O issued by vdev_cache_read(); it
+ * also completes every read zio that was delegated to the fill while it
+ * was in flight.
+ */
+static void
+vdev_cache_fill(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_cache_t *vc = &vd->vdev_cache;
+	vdev_cache_entry_t *ve = zio->io_private;
+	zio_t *dio;
+
+	ASSERT(zio->io_size == vc->vc_blocksize);
+
+	/*
+	 * Add data to the cache.
+	 */
+	mutex_enter(&vc->vc_lock);
+
+	ASSERT(ve->ve_fill_io == zio);
+	ASSERT(ve->ve_offset == zio->io_offset);
+	ASSERT(ve->ve_data == zio->io_data);
+
+	ve->ve_fill_io = NULL;
+
+	/*
+	 * Even if this cache line was invalidated by a missed write update,
+	 * any reads that were queued up before the missed update are still
+	 * valid, so we can satisfy them from this line before we evict it.
+	 */
+	for (dio = zio->io_delegate_list; dio; dio = dio->io_delegate_next)
+		vdev_cache_hit(vc, ve, dio);
+
+	if (zio->io_error || ve->ve_missed_update)
+		vdev_cache_evict(vc, ve);
+
+	mutex_exit(&vc->vc_lock);
+
+	/* Advance each delegated zio, propagating the fill's error code. */
+	while ((dio = zio->io_delegate_list) != NULL) {
+		zio->io_delegate_list = dio->io_delegate_next;
+		dio->io_delegate_next = NULL;
+		dio->io_error = zio->io_error;
+		zio_next_stage(dio);
+	}
+}
+
+/*
+ * Read data from the cache.  Returns 0 on cache hit, errno on a miss.
+ * On a miss we may allocate an entry, issue a cache-fill child I/O, and
+ * delegate this zio to it; the errno returns tell the caller to issue
+ * the read to the device normally.
+ */
+int
+vdev_cache_read(zio_t *zio)
+{
+	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+	vdev_cache_entry_t *ve, ve_search;
+	uint64_t cache_offset = P2ALIGN(zio->io_offset, vc->vc_blocksize);
+	uint64_t cache_phase = P2PHASE(zio->io_offset, vc->vc_blocksize);
+	zio_t *fio;
+
+	ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+	if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
+		return (EINVAL);
+
+	/* Reads larger than vc_max are not worth caching. */
+	if (zio->io_size > vc->vc_max)
+		return (EOVERFLOW);
+
+	/*
+	 * If the I/O straddles two or more cache blocks, don't cache it.
+	 */
+	if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1,
+	    vc->vc_blocksize))
+		return (EXDEV);
+
+	ASSERT(cache_phase + zio->io_size <= vc->vc_blocksize);
+
+	mutex_enter(&vc->vc_lock);
+
+	ve_search.ve_offset = cache_offset;
+	ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);
+
+	if (ve != NULL) {
+		/* Entry was invalidated by a write during its fill. */
+		if (ve->ve_missed_update) {
+			mutex_exit(&vc->vc_lock);
+			return (ESTALE);
+		}
+
+		/*
+		 * Fill still in flight: chain this zio onto the fill I/O's
+		 * delegate list; vdev_cache_fill() will complete it.
+		 */
+		if ((fio = ve->ve_fill_io) != NULL) {
+			zio->io_delegate_next = fio->io_delegate_list;
+			fio->io_delegate_list = zio;
+			zio_vdev_io_bypass(zio);
+			mutex_exit(&vc->vc_lock);
+			return (0);
+		}
+
+		/* Plain cache hit: copy the data and bypass device I/O. */
+		vdev_cache_hit(vc, ve, zio);
+		zio_vdev_io_bypass(zio);
+
+		mutex_exit(&vc->vc_lock);
+		zio_next_stage(zio);
+		return (0);
+	}
+
+	ve = vdev_cache_allocate(zio);
+
+	if (ve == NULL) {
+		mutex_exit(&vc->vc_lock);
+		return (ENOMEM);
+	}
+
+	/*
+	 * Miss: read the entire cache block; the entry will be completed
+	 * (and this zio satisfied) by vdev_cache_fill().
+	 */
+	fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset,
+	    ve->ve_data, vc->vc_blocksize, ZIO_TYPE_READ,
+	    ZIO_PRIORITY_CACHE_FILL,
+	    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY,
+	    vdev_cache_fill, ve);
+
+	ve->ve_fill_io = fio;
+	fio->io_delegate_list = zio;
+	zio_vdev_io_bypass(zio);
+
+	mutex_exit(&vc->vc_lock);
+	zio_nowait(fio);
+
+	return (0);
+}
+
+/*
+ * Update cache contents upon write completion so cached data stays
+ * coherent with the device.  Entries whose fill is still in flight
+ * cannot be patched in place, so they are marked missed_update and
+ * will be evicted when the fill completes.
+ */
+void
+vdev_cache_write(zio_t *zio)
+{
+	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+	vdev_cache_entry_t *ve, ve_search;
+	uint64_t io_start = zio->io_offset;
+	uint64_t io_end = io_start + zio->io_size;
+	uint64_t min_offset = P2ALIGN(io_start, vc->vc_blocksize);
+	uint64_t max_offset = P2ROUNDUP(io_end, vc->vc_blocksize);
+	avl_index_t where;
+
+	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+
+	mutex_enter(&vc->vc_lock);
+
+	/* Find the first cache entry that could overlap the write range. */
+	ve_search.ve_offset = min_offset;
+	ve = avl_find(&vc->vc_offset_tree, &ve_search, &where);
+
+	if (ve == NULL)
+		ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER);
+
+	while (ve != NULL && ve->ve_offset < max_offset) {
+		/* Clip the copy to the overlap of the write and entry. */
+		uint64_t start = MAX(ve->ve_offset, io_start);
+		uint64_t end = MIN(ve->ve_offset + vc->vc_blocksize, io_end);
+
+		if (ve->ve_fill_io != NULL) {
+			ve->ve_missed_update = 1;
+		} else {
+			bcopy((char *)zio->io_data + start - io_start,
+			    ve->ve_data + start - ve->ve_offset, end - start);
+		}
+		ve = AVL_NEXT(&vc->vc_offset_tree, ve);
+	}
+	mutex_exit(&vc->vc_lock);
+}
+
+/*
+ * Initialize a vdev's read-ahead cache: its lock, the two AVL indexes
+ * (by offset and by last-use time), and the derived block size.
+ */
+void
+vdev_cache_init(vdev_t *vd)
+{
+	vdev_cache_t *vc = &vd->vdev_cache;
+
+	mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare,
+	    sizeof (vdev_cache_entry_t),
+	    offsetof(struct vdev_cache_entry, ve_offset_node));
+
+	avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare,
+	    sizeof (vdev_cache_entry_t),
+	    offsetof(struct vdev_cache_entry, ve_lastused_node));
+
+	/* vc_bshift is log2 of the cache block size. */
+	vc->vc_blocksize = 1ULL << vc->vc_bshift;
+}
+
+/*
+ * Tear down a vdev's read-ahead cache: evict every remaining entry,
+ * then destroy the AVL trees and the lock.
+ */
+void
+vdev_cache_fini(vdev_t *vd)
+{
+	vdev_cache_t *vc = &vd->vdev_cache;
+	vdev_cache_entry_t *ve;
+
+	mutex_enter(&vc->vc_lock);
+	while ((ve = avl_first(&vc->vc_offset_tree)) != NULL)
+		vdev_cache_evict(vc, ve);
+	mutex_exit(&vc->vc_lock);
+
+	avl_destroy(&vc->vc_offset_tree);
+	avl_destroy(&vc->vc_lastused_tree);
+
+	mutex_destroy(&vc->vc_lock);
+}
diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c
new file mode 100644
index 0000000000..9255ecf03e
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_disk.c
@@ -0,0 +1,307 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_disk.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/sunddi.h>
+
+/*
+ * Virtual device vector for disks.
+ */
+
+extern ldi_ident_t zfs_li;
+
+/*
+ * A buf_t with its owning zio attached: the buf is handed to
+ * ldi_strategy() and the zio recovered in the biodone callback
+ * (vdev_disk_io_intr) by casting the buf pointer back.
+ */
+typedef struct vdev_disk_buf {
+	buf_t vdb_buf;
+	zio_t *vdb_io;
+} vdev_disk_buf_t;
+
+/*
+ * Open a disk-backed vdev via LDI, trying path first (whole-disk "s0"
+ * then the literal path) and falling back to the stored devid if the
+ * device appears to have moved.  On success, returns 0 and sets *psize
+ * to the device size and *ashift to the minimum allocation shift.
+ * On failure, returns errno with vs_aux indicating the failure class.
+ */
+static int
+vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+{
+	vdev_disk_t *dvd;
+	int error;
+
+	/*
+	 * We must have a pathname, and it must be absolute.
+	 */
+	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (EINVAL);
+	}
+
+	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
+
+	/*
+	 * When opening a disk device, we want to preserve the user's original
+	 * intent. We always want to open the device by the path the user gave
+	 * us, even if it is one of multiple paths to the same device. But we
+	 * also want to be able to survive disks being removed/recabled.
+	 * Therefore the sequence of opening devices is:
+	 *
+	 * 1. Try opening the device by path.
+	 *
+	 *	a. First append "s0" to see if this is a whole disk
+	 *	b. Fall back to path otherwise
+	 *
+	 * 2. If the devid of the device matches the stored value, return
+	 *    success.
+	 *
+	 * 3. Otherwise, the device may have moved.  Try opening the device
+	 *    by the devid instead.
+	 *
+	 */
+	if (vd->vdev_devid != NULL) {
+		/* Decode the stored devid string into devid + minor name. */
+		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
+		    &dvd->vd_minor) != 0) {
+			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+			return (EINVAL);
+		}
+	}
+
+	error = EINVAL;		/* presume failure */
+
+	if (vd->vdev_path != NULL) {
+		size_t len = strlen(vd->vdev_path) + 3;
+		char *buf = kmem_alloc(len, KM_SLEEP);
+		ddi_devid_t devid;
+
+		(void) snprintf(buf, len, "%ss0", vd->vdev_path);
+
+		/*
+		 * Try whole disk first, then slice name.
+		 */
+		if ((error = ldi_open_by_name(buf, spa_mode, kcred,
+		    &dvd->vd_lh, zfs_li)) != 0)
+			error = ldi_open_by_name(vd->vdev_path,
+			    spa_mode, kcred, &dvd->vd_lh, zfs_li);
+
+		kmem_free(buf, len);
+
+		/*
+		 * Compare the devid to the stored value.
+		 */
+		if (error == 0 && vd->vdev_devid != NULL &&
+		    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
+			/* Mismatch: wrong device at this path; close it. */
+			if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
+				error = EINVAL;
+				(void) ldi_close(dvd->vd_lh, spa_mode, kcred);
+				dvd->vd_lh = NULL;
+			}
+			ddi_devid_free(devid);
+		}
+	}
+
+	/*
+	 * If we were unable to open by path, or the devid check fails, open by
+	 * devid instead.
+	 */
+	if (error != 0 && vd->vdev_devid != NULL)
+		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
+		    spa_mode, kcred, &dvd->vd_lh, zfs_li);
+
+	if (error) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (error);
+	}
+
+	/*
+	 * Determine the actual size of the device.
+	 */
+	if (ldi_get_size(dvd->vd_lh, psize) != 0) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (EINVAL);
+	}
+
+	*ashift = SPA_MINBLOCKSHIFT;
+
+	return (0);
+}
+
+/*
+ * Close a disk-backed vdev, releasing the devid/minor strings, the LDI
+ * handle, and the vdev_disk_t itself.  Safe to call on a vdev whose
+ * open failed partway through (each field is individually checked).
+ */
+static void
+vdev_disk_close(vdev_t *vd)
+{
+	vdev_disk_t *dvd = vd->vdev_tsd;
+
+	if (dvd == NULL)
+		return;
+
+	dprintf("removing disk %s, devid %s\n",
+	    vd->vdev_path ? vd->vdev_path : "<none>",
+	    vd->vdev_devid ? vd->vdev_devid : "<none>");
+
+	if (dvd->vd_minor != NULL)
+		ddi_devid_str_free(dvd->vd_minor);
+
+	if (dvd->vd_devid != NULL)
+		ddi_devid_free(dvd->vd_devid);
+
+	if (dvd->vd_lh != NULL)
+		(void) ldi_close(dvd->vd_lh, spa_mode, kcred);
+
+	kmem_free(dvd, sizeof (vdev_disk_t));
+	vd->vdev_tsd = NULL;
+}
+
+/*
+ * biodone callback for disk I/O: recover the zio from the enclosing
+ * vdev_disk_buf_t, translate the buf status into io_error (a short
+ * transfer with no error code becomes EIO), and resume the pipeline.
+ */
+static void
+vdev_disk_io_intr(buf_t *bp)
+{
+	vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp;
+	zio_t *zio = vdb->vdb_io;
+
+	if ((zio->io_error = geterror(bp)) == 0 && bp->b_resid != 0)
+		zio->io_error = EIO;
+
+	kmem_free(vdb, sizeof (vdev_disk_buf_t));
+
+	/* Interrupt context: advance the zio asynchronously. */
+	zio_next_stage_async(zio);
+}
+
+/*
+ * Completion callback for asynchronous ioctls (e.g. cache flush):
+ * record the result and resume the zio pipeline.
+ */
+static void
+vdev_disk_ioctl_done(void *zio_arg, int error)
+{
+	zio_t *zio = zio_arg;
+
+	zio->io_error = error;
+
+	zio_next_stage_async(zio);
+}
+
+/*
+ * Issue a zio to a disk vdev.  Ioctls (currently only write-cache
+ * flush) are dispatched via ldi_ioctl(); reads may be satisfied by the
+ * vdev cache; everything else goes through the vdev queue and then to
+ * ldi_strategy() as a buf.
+ */
+static void
+vdev_disk_io_start(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_disk_t *dvd = vd->vdev_tsd;
+	vdev_disk_buf_t *vdb;
+	buf_t *bp;
+	int flags, error;
+
+	if (zio->io_type == ZIO_TYPE_IOCTL) {
+		zio_vdev_io_bypass(zio);
+
+		/* XXPOLICY */
+		if (vdev_is_dead(vd)) {
+			zio->io_error = ENXIO;
+			zio_next_stage_async(zio);
+			return;
+		}
+
+		switch (zio->io_cmd) {
+
+		case DKIOCFLUSHWRITECACHE:
+
+			zio->io_dk_callback.dkc_callback = vdev_disk_ioctl_done;
+			zio->io_dk_callback.dkc_cookie = zio;
+
+			error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
+			    (uintptr_t)&zio->io_dk_callback,
+			    FKIOCTL, kcred, NULL);
+
+			if (error == 0) {
+				/*
+				 * The ioctl will be done asynchronously,
+				 * and will call vdev_disk_ioctl_done()
+				 * upon completion.
+				 */
+				return;
+			}
+			zio->io_error = error;
+			break;
+
+		default:
+			zio->io_error = ENOTSUP;
+		}
+
+		zio_next_stage_async(zio);
+		return;
+	}
+
+	/* Reads may be satisfied from (or delegated to) the vdev cache. */
+	if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
+		return;
+
+	/* Queue the I/O; NULL means it was deferred or aggregated. */
+	if ((zio = vdev_queue_io(zio)) == NULL)
+		return;
+
+	flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
+	flags |= B_BUSY | B_NOCACHE;
+	if (zio->io_flags & ZIO_FLAG_FAILFAST)
+		flags |= B_FAILFAST;
+
+	vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP);
+
+	vdb->vdb_io = zio;
+	bp = &vdb->vdb_buf;
+
+	bioinit(bp);
+	bp->b_flags = flags;
+	bp->b_bcount = zio->io_size;
+	bp->b_un.b_addr = zio->io_data;
+	bp->b_lblkno = lbtodb(zio->io_offset);
+	bp->b_bufsize = zio->io_size;
+	bp->b_iodone = (int (*)())vdev_disk_io_intr;
+
+	/* XXPOLICY */
+	error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio);
+	if (error) {
+		/* Complete the buf locally so io_intr runs and cleans up. */
+		zio->io_error = error;
+		bioerror(bp, error);
+		bp->b_resid = bp->b_bcount;
+		bp->b_iodone(bp);
+		return;
+	}
+
+	error = ldi_strategy(dvd->vd_lh, bp);
+	/* ldi_strategy() will return non-zero only on programming errors */
+	ASSERT(error == 0);
+}
+
+/*
+ * I/O completion for disk vdevs: notify the vdev queue, keep the vdev
+ * cache coherent with completed writes, and advance the pipeline.
+ */
+static void
+vdev_disk_io_done(zio_t *zio)
+{
+	vdev_queue_io_done(zio);
+
+	if (zio->io_type == ZIO_TYPE_WRITE)
+		vdev_cache_write(zio);
+
+	zio_next_stage(zio);
+}
+
+/* Virtual device operations vector for physical disks (kernel). */
+vdev_ops_t vdev_disk_ops = {
+	vdev_disk_open,
+	vdev_disk_close,
+	vdev_default_asize,
+	vdev_disk_io_start,
+	vdev_disk_io_done,
+	NULL,			/* no state-change callback for leaves */
+	VDEV_TYPE_DISK,		/* name of this vdev type */
+	B_TRUE			/* leaf vdev */
+};
diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c
new file mode 100644
index 0000000000..a789008e17
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_file.c
@@ -0,0 +1,223 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_file.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device vector for files.
+ */
+
+/*
+ * Open a file-backed vdev.  The path must be absolute; it is opened
+ * relative to the global zone's root.  On success, returns 0 and sets
+ * *psize to the file size and *ashift to the minimum allocation shift;
+ * on failure, returns errno with vs_aux describing the failure.
+ */
+static int
+vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+{
+	vdev_file_t *vf;
+	vnode_t *vp;
+	vattr_t vattr;
+	int error;
+
+	/*
+	 * We must have a pathname, and it must be absolute.
+	 */
+	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (EINVAL);
+	}
+
+	vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
+
+#ifdef _KERNEL
+	/*
+	 * When using a file vdev in kernel context, the underlying filesystem
+	 * will already be caching the data.  Don't cache it again here.
+	 */
+	vd->vdev_cache.vc_size = 0;
+#endif
+
+	/*
+	 * We always open the files from the root of the global zone, even if
+	 * we're in a local zone.  If the user has gotten to this point, the
+	 * administrator has already decided that the pool should be available
+	 * to local zone users, so the underlying devices should be as well.
+	 */
+	ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
+	error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, spa_mode | FOFFMAX,
+	    0, &vp, 0, 0, rootdir);
+
+	if (error) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (error);
+	}
+
+	vf->vf_vnode = vp;
+
+#ifdef _KERNEL
+	/*
+	 * Make sure it's a regular file.
+	 */
+	if (vp->v_type != VREG) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (ENODEV);
+	}
+#endif
+
+	/*
+	 * Determine the physical size of the file.
+	 */
+	vattr.va_mask = AT_SIZE;
+	error = VOP_GETATTR(vp, &vattr, 0, kcred);
+	if (error) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (error);
+	}
+
+	*psize = vattr.va_size;
+	*ashift = SPA_MINBLOCKSHIFT;
+
+	return (0);
+}
+
+/*
+ * Close a file-backed vdev: invalidate cached pages, close and release
+ * the vnode, and free the vdev_file_t.  Safe on a partially-opened vdev.
+ */
+static void
+vdev_file_close(vdev_t *vd)
+{
+	vdev_file_t *vf = vd->vdev_tsd;
+
+	if (vf == NULL)
+		return;
+
+	if (vf->vf_vnode != NULL) {
+		(void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred);
+		(void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred);
+		VN_RELE(vf->vf_vnode);
+	}
+
+	kmem_free(vf, sizeof (vdev_file_t));
+	vd->vdev_tsd = NULL;
+}
+
+/*
+ * Issue a zio to a file vdev.  Cache-flush ioctls map to VOP_FSYNC();
+ * reads may be satisfied by the vdev cache; everything else goes
+ * through the vdev queue and is performed synchronously via vn_rdwr().
+ */
+static void
+vdev_file_io_start(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_file_t *vf = vd->vdev_tsd;
+	ssize_t resid;
+	int error;
+
+	if (zio->io_type == ZIO_TYPE_IOCTL) {
+		zio_vdev_io_bypass(zio);
+
+		/* XXPOLICY */
+		if (vdev_is_dead(vd)) {
+			zio->io_error = ENXIO;
+			zio_next_stage_async(zio);
+			return;
+		}
+
+		switch (zio->io_cmd) {
+		case DKIOCFLUSHWRITECACHE:
+			/* For a file, "flush write cache" is an fsync. */
+			zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
+			    kcred);
+			dprintf("fsync(%s) = %d\n", vdev_description(vd),
+			    zio->io_error);
+			break;
+		default:
+			zio->io_error = ENOTSUP;
+		}
+
+		zio_next_stage_async(zio);
+		return;
+	}
+
+	/* Reads may be satisfied from (or delegated to) the vdev cache. */
+	if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
+		return;
+
+	/* Queue the I/O; NULL means it was deferred or aggregated. */
+	if ((zio = vdev_queue_io(zio)) == NULL)
+		return;
+
+	/* XXPOLICY */
+	error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio);
+	if (error) {
+		zio->io_error = error;
+		zio_next_stage_async(zio);
+		return;
+	}
+
+	zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
+	    UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data,
+	    zio->io_size, zio->io_offset, UIO_SYSSPACE,
+	    0, RLIM64_INFINITY, kcred, &resid);
+
+	/* A short transfer with no error code is treated as ENOSPC. */
+	if (resid != 0 && zio->io_error == 0)
+		zio->io_error = ENOSPC;
+
+	zio_next_stage_async(zio);
+}
+
+/*
+ * I/O completion for file vdevs: notify the vdev queue, keep the vdev
+ * cache coherent with completed writes, and advance the pipeline.
+ */
+static void
+vdev_file_io_done(zio_t *zio)
+{
+	vdev_queue_io_done(zio);
+
+	if (zio->io_type == ZIO_TYPE_WRITE)
+		vdev_cache_write(zio);
+
+	zio_next_stage(zio);
+}
+
+/* Virtual device operations vector for regular files. */
+vdev_ops_t vdev_file_ops = {
+	vdev_file_open,
+	vdev_file_close,
+	vdev_default_asize,
+	vdev_file_io_start,
+	vdev_file_io_done,
+	NULL,			/* no state-change callback for leaves */
+	VDEV_TYPE_FILE,		/* name of this vdev type */
+	B_TRUE			/* leaf vdev */
+};
+
+/*
+ * From userland we access disks just like files.
+ * (The kernel vdev_disk_ops lives in vdev_disk.c; this userland
+ * definition simply reuses the file vdev routines.)
+ */
+#ifndef _KERNEL
+
+vdev_ops_t vdev_disk_ops = {
+	vdev_file_open,
+	vdev_file_close,
+	vdev_default_asize,
+	vdev_file_io_start,
+	vdev_file_io_done,
+	NULL,			/* no state-change callback for leaves */
+	VDEV_TYPE_DISK,		/* name of this vdev type */
+	B_TRUE			/* leaf vdev */
+};
+
+#endif
diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c
new file mode 100644
index 0000000000..6671a68fa9
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c
@@ -0,0 +1,848 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Virtual Device Labels
+ * ---------------------
+ *
+ * The vdev label serves several distinct purposes:
+ *
+ * 1. Uniquely identify this device as part of a ZFS pool and confirm its
+ * identity within the pool.
+ *
+ * 2. Verify that all the devices given in a configuration are present
+ * within the pool.
+ *
+ * 3. Determine the uberblock for the pool.
+ *
+ * 4. In case of an import operation, determine the configuration of the
+ * toplevel vdev of which it is a part.
+ *
+ * 5. If an import operation cannot find all the devices in the pool,
+ * provide enough information to the administrator to determine which
+ * devices are missing.
+ *
+ * It is important to note that while the kernel is responsible for writing the
+ * label, it only consumes the information in the first three cases. The
+ * latter information is only consumed in userland when determining the
+ * configuration to import a pool.
+ *
+ *
+ * Label Organization
+ * ------------------
+ *
+ * Before describing the contents of the label, it's important to understand how
+ * the labels are written and updated with respect to the uberblock.
+ *
+ * When the pool configuration is altered, either because it was newly created
+ * or a device was added, we want to update all the labels such that we can deal
+ * with fatal failure at any point. To this end, each disk has two labels which
+ * are updated before and after the uberblock is synced. Assuming we have
+ * labels and an uberblock with the following transaction groups:
+ *
+ * L1 UB L2
+ * +------+ +------+ +------+
+ * | | | | | |
+ * | t10 | | t10 | | t10 |
+ * | | | | | |
+ * +------+ +------+ +------+
+ *
+ * In this stable state, the labels and the uberblock were all updated within
+ * the same transaction group (10). Each label is mirrored and checksummed, so
+ * that we can detect when we fail partway through writing the label.
+ *
+ * In order to identify which labels are valid, the labels are written in the
+ * following manner:
+ *
+ * 1. For each vdev, update 'L1' to the new label
+ * 2. Update the uberblock
+ * 3. For each vdev, update 'L2' to the new label
+ *
+ * Given arbitrary failure, we can determine the correct label to use based on
+ * the transaction group. If we fail after updating L1 but before updating the
+ * UB, we will notice that L1's transaction group is greater than the uberblock,
+ * so L2 must be valid. If we fail after writing the uberblock but before
+ * writing L2, we will notice that L2's transaction group is less than L1, and
+ * therefore L1 is valid.
+ *
+ * Another added complexity is that not every label is updated when the config
+ * is synced. If we add a single device, we do not want to have to re-write
+ * every label for every device in the pool. This means that both L1 and L2 may
+ * be older than the pool uberblock, because the necessary information is stored
+ * on another vdev.
+ *
+ *
+ * On-disk Format
+ * --------------
+ *
+ * The vdev label consists of two distinct parts, and is wrapped within the
+ * vdev_label_t structure.  The label includes 8k of padding to permit legacy
+ * VTOC disk labels; the padding is otherwise ignored.
+ *
+ * The first half of the label is a packed nvlist which contains pool wide
+ * properties, per-vdev properties, and configuration information. It is
+ * described in more detail below.
+ *
+ * The latter half of the label consists of a redundant array of uberblocks.
+ * These uberblocks are updated whenever a transaction group is committed,
+ * or when the configuration is updated. When a pool is loaded, we scan each
+ * vdev for the 'best' uberblock.
+ *
+ *
+ * Configuration Information
+ * -------------------------
+ *
+ * The nvlist describing the pool and vdev contains the following elements:
+ *
+ * version ZFS on-disk version
+ * name Pool name
+ * state Pool state
+ * txg Transaction group in which this label was written
+ * pool_guid Unique identifier for this pool
+ * vdev_tree An nvlist describing vdev tree.
+ *
+ * Each leaf device label also contains the following:
+ *
+ * top_guid Unique ID for top-level vdev in which this is contained
+ * guid Unique ID for the leaf vdev
+ *
+ * The 'vs' configuration follows the format described in 'spa_config.c'.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/metaslab.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Basic routines to read and write from a vdev label.
+ * Used throughout the rest of this file.
+ *
+ * Compute the byte offset of label l within a device of physical size
+ * psize: the first half of the labels sit at the front of the device,
+ * the second half at the end.
+ */
+uint64_t
+vdev_label_offset(uint64_t psize, int l, uint64_t offset)
+{
+	return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
+	    0 : psize - VDEV_LABELS * sizeof (vdev_label_t)));
+}
+
+/*
+ * Issue an asynchronous physical read of `size' bytes at `offset'
+ * within label l of leaf vdev vd, as a child of the given root zio.
+ */
+static void
+vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
+	uint64_t size, zio_done_func_t *done, void *private)
+{
+	ASSERT(vd->vdev_children == 0);
+
+	zio_nowait(zio_read_phys(zio, vd,
+	    vdev_label_offset(vd->vdev_psize, l, offset),
+	    size, buf, ZIO_CHECKSUM_LABEL, done, private,
+	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_SPECULATIVE |
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_DONT_RETRY));
+}
+
+/*
+ * Issue an asynchronous physical write of `size' bytes at `offset'
+ * within label l of leaf vdev vd, as a child of the given root zio.
+ */
+static void
+vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
+	uint64_t size, zio_done_func_t *done, void *private)
+{
+	ASSERT(vd->vdev_children == 0);
+
+	zio_nowait(zio_write_phys(zio, vd,
+	    vdev_label_offset(vd->vdev_psize, l, offset),
+	    size, buf, ZIO_CHECKSUM_LABEL, done, private,
+	    ZIO_PRIORITY_SYNC_WRITE,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_DONT_RETRY));
+}
+
+/*
+ * Generate the nvlist representing this vdev's config: type, id, guid,
+ * optional path/devid, top-level metaslab and allocation parameters,
+ * the DTL object, optional stats, and (recursively) all children.
+ * Caller owns the returned nvlist and must nvlist_free() it.
+ */
+nvlist_t *
+vdev_config_generate(vdev_t *vd, int getstats)
+{
+	nvlist_t *nv = NULL;
+
+	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) == 0);
+
+	VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
+	    vd->vdev_ops->vdev_op_type) == 0);
+	VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id) == 0);
+	VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0);
+
+	if (vd->vdev_path != NULL)
+		VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PATH,
+		    vd->vdev_path) == 0);
+
+	if (vd->vdev_devid != NULL)
+		VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID,
+		    vd->vdev_devid) == 0);
+
+	/* Metaslab and allocation parameters exist only on top-level vdevs. */
+	if (vd == vd->vdev_top) {
+		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
+		    vd->vdev_ms_array) == 0);
+		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
+		    vd->vdev_ms_shift) == 0);
+		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT,
+		    vd->vdev_ashift) == 0);
+		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
+		    vd->vdev_asize) == 0);
+	}
+
+	if (vd->vdev_dtl.smo_object != 0)
+		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
+		    vd->vdev_dtl.smo_object) == 0);
+
+	if (getstats) {
+		vdev_stat_t vs;
+		vdev_get_stats(vd, &vs);
+		VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS,
+		    (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0);
+	}
+
+	/* Interior vdevs: recurse and attach the children array. */
+	if (!vd->vdev_ops->vdev_op_leaf) {
+		nvlist_t **child;
+		int c;
+
+		child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
+		    KM_SLEEP);
+
+		for (c = 0; c < vd->vdev_children; c++)
+			child[c] = vdev_config_generate(vd->vdev_child[c],
+			    getstats);
+
+		VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+		    child, vd->vdev_children) == 0);
+
+		for (c = 0; c < vd->vdev_children; c++)
+			nvlist_free(child[c]);
+
+		kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
+	}
+
+	return (nv);
+}
+
+/*
+ * Read and unpack a vdev's label config, trying each of the labels in
+ * turn and accepting the first one that checksums, unpacks, and has a
+ * matching on-disk version.  Returns NULL if the vdev is dead or no
+ * label is usable; otherwise the caller owns the returned nvlist.
+ */
+nvlist_t *
+vdev_label_read_config(vdev_t *vd)
+{
+	nvlist_t *config = NULL;
+	vdev_phys_t *vp;
+	uint64_t version;
+	zio_t *zio;
+	int l;
+
+	if (vdev_is_dead(vd))
+		return (NULL);
+
+	vp = zio_buf_alloc(sizeof (vdev_phys_t));
+
+	for (l = 0; l < VDEV_LABELS; l++) {
+
+		zio = zio_root(vd->vdev_spa, NULL, NULL,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD);
+
+		vdev_label_read(zio, vd, l, vp,
+		    offsetof(vdev_label_t, vl_vdev_phys),
+		    sizeof (vdev_phys_t), NULL, NULL);
+
+		/* Accept the first label that reads, unpacks, and matches. */
+		if (zio_wait(zio) == 0 &&
+		    nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist),
+		    &config, 0) == 0 &&
+		    nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+		    &version) == 0 &&
+		    version == UBERBLOCK_VERSION)
+			break;
+
+		/* This label was bad; discard and try the next one. */
+		if (config != NULL) {
+			nvlist_free(config);
+			config = NULL;
+		}
+	}
+
+	zio_buf_free(vp, sizeof (vdev_phys_t));
+
+	return (config);
+}
+
+/*
+ * Initialize the on-disk labels of every leaf under vd.
+ *
+ * Interior vdevs simply recurse; each leaf is first checked for an
+ * existing active-pool label (EBUSY if found), then has its vdev_phys,
+ * boot header, and uberblock ring written to all labels in parallel.
+ * crtxg is the pool creation txg; crtxg == 0 skips the in-use check,
+ * which is the convention used for device removal.  Returns EIO if a
+ * leaf is unreachable, EINVAL if the config nvlist cannot be packed.
+ */
+int
+vdev_label_init(vdev_t *vd, uint64_t crtxg)
+{
+ spa_t *spa = vd->vdev_spa;
+ nvlist_t *label;
+ vdev_phys_t *vp;
+ vdev_boot_header_t *vb;
+ uberblock_phys_t *ubphys;
+ zio_t *zio;
+ int l, c, n;
+ char *buf;
+ size_t buflen;
+ int error;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ if ((error = vdev_label_init(vd->vdev_child[c], crtxg)) != 0)
+ return (error);
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (0);
+
+ /*
+ * Make sure each leaf device is writable, and zero its initial content.
+ * Along the way, also make sure that no leaf is already in use.
+ * Note that it's important to do this sequentially, not in parallel,
+ * so that we catch cases of multiple use of the same leaf vdev in
+ * the vdev we're creating -- e.g. mirroring a disk with itself.
+ */
+ if (vdev_is_dead(vd))
+ return (EIO);
+
+ /*
+ * Check whether this device is already in use.
+ * Ignore the check if crtxg == 0, which we use for device removal.
+ */
+ if (crtxg != 0 && (label = vdev_label_read_config(vd)) != NULL) {
+ uint64_t version, state, pool_guid, device_guid, txg;
+ uint64_t mycrtxg = 0;
+
+ (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
+ &mycrtxg);
+
+ /*
+ * The device is busy only if its label is current, names an
+ * active pool whose guid pair we know, and either the label
+ * has a real txg or it was stamped with this same crtxg
+ * (i.e. the same create operation saw this leaf twice).
+ */
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION,
+ &version) == 0 && version == UBERBLOCK_VERSION &&
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ &state) == 0 && state == POOL_STATE_ACTIVE &&
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
+ &pool_guid) == 0 &&
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
+ &device_guid) == 0 &&
+ spa_guid_exists(pool_guid, device_guid) &&
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
+ &txg) == 0 && (txg != 0 || mycrtxg == crtxg)) {
+ dprintf("vdev %s in use, pool_state %d\n",
+ vdev_description(vd), state);
+ nvlist_free(label);
+ return (EBUSY);
+ }
+ nvlist_free(label);
+ }
+
+ /*
+ * The device isn't in use, so initialize its label.
+ */
+ vp = zio_buf_alloc(sizeof (vdev_phys_t));
+ bzero(vp, sizeof (vdev_phys_t));
+
+ /*
+ * Generate a label describing the pool and our top-level vdev.
+ * We mark it as being from txg 0 to indicate that it's not
+ * really part of an active pool just yet. The labels will
+ * be written again with a meaningful txg by spa_sync().
+ */
+ label = spa_config_generate(spa, vd, 0ULL, 0);
+
+ /*
+ * Add our creation time. This allows us to detect multiple vdev
+ * uses as described above, and automatically expires if we fail.
+ */
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG, crtxg) == 0);
+
+ buf = vp->vp_nvlist;
+ buflen = sizeof (vp->vp_nvlist);
+
+ if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, 0) != 0) {
+ nvlist_free(label);
+ zio_buf_free(vp, sizeof (vdev_phys_t));
+ return (EINVAL);
+ }
+
+ /*
+ * Initialize boot block header.
+ */
+ vb = zio_buf_alloc(sizeof (vdev_boot_header_t));
+ bzero(vb, sizeof (vdev_boot_header_t));
+ vb->vb_magic = VDEV_BOOT_MAGIC;
+ vb->vb_version = VDEV_BOOT_VERSION;
+ vb->vb_offset = VDEV_BOOT_OFFSET;
+ vb->vb_size = VDEV_BOOT_SIZE;
+
+ /*
+ * Initialize uberblock template.
+ */
+ ubphys = zio_buf_alloc(sizeof (uberblock_phys_t));
+ bzero(ubphys, sizeof (uberblock_phys_t));
+ ubphys->ubp_uberblock = spa->spa_uberblock;
+ ubphys->ubp_uberblock.ub_txg = 0;
+
+ /*
+ * Write everything in parallel.
+ */
+ zio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+
+ for (l = 0; l < VDEV_LABELS; l++) {
+
+ vdev_label_write(zio, vd, l, vp,
+ offsetof(vdev_label_t, vl_vdev_phys),
+ sizeof (vdev_phys_t), NULL, NULL);
+
+ vdev_label_write(zio, vd, l, vb,
+ offsetof(vdev_label_t, vl_boot_header),
+ sizeof (vdev_boot_header_t), NULL, NULL);
+
+ for (n = 0; n < VDEV_UBERBLOCKS; n++) {
+
+ vdev_label_write(zio, vd, l, ubphys,
+ offsetof(vdev_label_t, vl_uberblock[n]),
+ sizeof (uberblock_phys_t), NULL, NULL);
+
+ }
+ }
+
+ error = zio_wait(zio);
+
+ nvlist_free(label);
+ zio_buf_free(ubphys, sizeof (uberblock_phys_t));
+ zio_buf_free(vb, sizeof (vdev_boot_header_t));
+ zio_buf_free(vp, sizeof (vdev_phys_t));
+
+ return (error);
+}
+
+/*
+ * ==========================================================================
+ * uberblock load/sync
+ * ==========================================================================
+ */
+
+/*
+ * Consider the following situation: txg is safely synced to disk. We've
+ * written the first uberblock for txg + 1, and then we lose power. When we
+ * come back up, we fail to see the uberblock for txg + 1 because, say,
+ * it was on a mirrored device and the replica to which we wrote txg + 1
+ * is now offline. If we then make some changes and sync txg + 1, and then
+ * the missing replica comes back, then for a few seconds we'll have two
+ * conflicting uberblocks on disk with the same txg. The solution is simple:
+ * among uberblocks with equal txg, choose the one with the latest timestamp.
+ */
+static int
+vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
+{
+ /* Primary key: transaction group (higher is newer). */
+ if (ub1->ub_txg < ub2->ub_txg)
+ return (-1);
+ if (ub1->ub_txg > ub2->ub_txg)
+ return (1);
+
+ /* Tiebreaker: wall-clock timestamp, per the scenario above. */
+ if (ub1->ub_timestamp < ub2->ub_timestamp)
+ return (-1);
+ if (ub1->ub_timestamp > ub2->ub_timestamp)
+ return (1);
+
+ return (0);
+}
+
+/*
+ * Completion callback for the label uberblock reads issued by
+ * vdev_uberblock_load().  If the buffer holds a valid uberblock that
+ * beats the best one seen so far (per vdev_uberblock_compare), record
+ * it in *ubbest under spa_uberblock_lock.  Frees the read buffer that
+ * was allocated per-I/O by the caller.
+ */
+static void
+vdev_uberblock_load_done(zio_t *zio)
+{
+ uberblock_phys_t *ubphys = zio->io_data;
+ uberblock_t *ub = &ubphys->ubp_uberblock;
+ uberblock_t *ubbest = zio->io_private;
+ spa_t *spa = zio->io_spa;
+
+ ASSERT3U(zio->io_size, ==, sizeof (uberblock_phys_t));
+
+ if (uberblock_verify(ub) == 0) {
+ /* Callbacks run concurrently; serialize updates to *ubbest. */
+ mutex_enter(&spa->spa_uberblock_lock);
+ if (vdev_uberblock_compare(ub, ubbest) > 0)
+ *ubbest = *ub;
+ mutex_exit(&spa->spa_uberblock_lock);
+ }
+
+ zio_buf_free(zio->io_data, zio->io_size);
+}
+
+/*
+ * Asynchronously read every uberblock slot in every label of every
+ * reachable leaf under vd, attached to the caller's zio.  Each read
+ * gets its own buffer; vdev_uberblock_load_done() folds the winners
+ * into *ubbest and frees the buffers.  The caller waits on zio.
+ */
+void
+vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest)
+{
+ int l, c, n;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_uberblock_load(zio, vd->vdev_child[c], ubbest);
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return;
+
+ if (vdev_is_dead(vd))
+ return;
+
+ for (l = 0; l < VDEV_LABELS; l++) {
+ for (n = 0; n < VDEV_UBERBLOCKS; n++) {
+ vdev_label_read(zio, vd, l,
+ zio_buf_alloc(sizeof (uberblock_phys_t)),
+ offsetof(vdev_label_t, vl_uberblock[n]),
+ sizeof (uberblock_phys_t),
+ vdev_uberblock_load_done, ubbest);
+ }
+ }
+}
+
+/*
+ * Write the uberblock to all labels of all leaves of the specified vdev.
+ */
+
+/*
+ * Per-write completion callback: counts successful uberblock writes in
+ * the root zio's private uint64, so the caller can distinguish total
+ * failure from partial success.
+ */
+static void
+vdev_uberblock_sync_done(zio_t *zio)
+{
+ uint64_t *good_writes = zio->io_root->io_private;
+
+ if (zio->io_error == 0)
+ atomic_add_64(good_writes, 1);
+}
+
+/*
+ * Recursively issue uberblock writes to every reachable leaf under vd.
+ * The destination slot is chosen by txg modulo the ring size (the mask
+ * assumes VDEV_UBERBLOCKS is a power of two), so successive txgs rotate
+ * through the ring rather than overwriting the previous uberblock.
+ */
+static void
+vdev_uberblock_sync(zio_t *zio, uberblock_phys_t *ubphys, vdev_t *vd,
+ uint64_t txg)
+{
+ int l, c, n;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_uberblock_sync(zio, ubphys, vd->vdev_child[c], txg);
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return;
+
+ if (vdev_is_dead(vd))
+ return;
+
+ n = txg & (VDEV_UBERBLOCKS - 1);
+
+ ASSERT(ubphys->ubp_uberblock.ub_txg == txg);
+
+ /* Write the same slot in every label copy of this leaf. */
+ for (l = 0; l < VDEV_LABELS; l++)
+ vdev_label_write(zio, vd, l, ubphys,
+ offsetof(vdev_label_t, vl_uberblock[n]),
+ sizeof (uberblock_phys_t), vdev_uberblock_sync_done, NULL);
+
+ dprintf("vdev %s in txg %llu\n", vdev_description(vd), txg);
+}
+
+/*
+ * Sync the uberblock *ub to every leaf under uvd for the given txg.
+ * Succeeds if at least one write lands (partial success suppresses the
+ * error); returns EIO if nothing was written at all, which can happen
+ * without an explicit error when every vdev is in the CANT_OPEN state.
+ */
+static int
+vdev_uberblock_sync_tree(spa_t *spa, uberblock_t *ub, vdev_t *uvd, uint64_t txg)
+{
+ uberblock_phys_t *ubphys;
+ uint64_t *good_writes;
+ zio_t *zio;
+ int error;
+
+ ubphys = zio_buf_alloc(sizeof (uberblock_phys_t));
+ bzero(ubphys, sizeof (uberblock_phys_t));
+ ubphys->ubp_uberblock = *ub;
+
+ /* Shared success counter, bumped by vdev_uberblock_sync_done(). */
+ good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+
+ zio = zio_root(spa, NULL, good_writes,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+
+ vdev_uberblock_sync(zio, ubphys, uvd, txg);
+
+ error = zio_wait(zio);
+
+ if (error && *good_writes != 0) {
+ dprintf("partial success: good_writes = %llu\n", *good_writes);
+ error = 0;
+ }
+
+ /*
+ * It's possible to have no good writes and no error if every vdev is in
+ * the CANT_OPEN state.
+ */
+ if (*good_writes == 0 && error == 0)
+ error = EIO;
+
+ kmem_free(good_writes, sizeof (uint64_t));
+ zio_buf_free(ubphys, sizeof (uberblock_phys_t));
+
+ return (error);
+}
+
+/*
+ * Sync out an individual vdev.
+ */
+
+/*
+ * Per-label-write completion callback: counts successful writes in the
+ * root zio's private uint64 (same pattern as vdev_uberblock_sync_done).
+ */
+static void
+vdev_sync_label_done(zio_t *zio)
+{
+ uint64_t *good_writes = zio->io_root->io_private;
+
+ if (zio->io_error == 0)
+ atomic_add_64(good_writes, 1);
+}
+
+/*
+ * Write label copy l on every reachable leaf under vd, stamped with
+ * txg.  A fresh config nvlist is generated per leaf and packed into a
+ * vdev_phys_t; if packing fails the write for that leaf is silently
+ * skipped (the good-writes counter then reflects the omission).
+ */
+static void
+vdev_sync_label(zio_t *zio, vdev_t *vd, int l, uint64_t txg)
+{
+ nvlist_t *label;
+ vdev_phys_t *vp;
+ char *buf;
+ size_t buflen;
+ int c;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_sync_label(zio, vd->vdev_child[c], l, txg);
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return;
+
+ if (vdev_is_dead(vd))
+ return;
+
+ /*
+ * Generate a label describing the top-level config to which we belong.
+ */
+ label = spa_config_generate(vd->vdev_spa, vd, txg, 0);
+
+ vp = zio_buf_alloc(sizeof (vdev_phys_t));
+ bzero(vp, sizeof (vdev_phys_t));
+
+ buf = vp->vp_nvlist;
+ buflen = sizeof (vp->vp_nvlist);
+
+ if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, 0) == 0)
+ vdev_label_write(zio, vd, l, vp,
+ offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t),
+ vdev_sync_label_done, NULL);
+
+ zio_buf_free(vp, sizeof (vdev_phys_t));
+ nvlist_free(label);
+
+ dprintf("%s label %d txg %llu\n", vdev_description(vd), l, txg);
+}
+
+/*
+ * Synchronously write label copy l for the top-level vdev vd (and all
+ * of its leaves) at txg.  As with the uberblock sync, partial success
+ * is treated as success; ENODEV is returned if no label write landed
+ * and no other error was reported.
+ */
+static int
+vdev_sync_labels(vdev_t *vd, int l, uint64_t txg)
+{
+ uint64_t *good_writes;
+ zio_t *zio;
+ int error;
+
+ ASSERT(vd == vd->vdev_top);
+
+ good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+
+ zio = zio_root(vd->vdev_spa, NULL, good_writes,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+
+ /*
+ * Recursively kick off writes to all labels.
+ */
+ vdev_sync_label(zio, vd, l, txg);
+
+ error = zio_wait(zio);
+
+ if (error && *good_writes != 0) {
+ dprintf("partial success: good_writes = %llu\n", *good_writes);
+ error = 0;
+ }
+
+ if (*good_writes == 0 && error == 0)
+ error = ENODEV;
+
+ kmem_free(good_writes, sizeof (uint64_t));
+
+ return (error);
+}
+
+/*
+ * Sync the entire vdev configuration.
+ *
+ * The order of operations is carefully crafted to ensure that
+ * if the system panics or loses power at any time, the state on disk
+ * is still transactionally consistent. The in-line comments below
+ * describe the failure semantics at each stage.
+ *
+ * Moreover, it is designed to be idempotent: if spa_sync_labels() fails
+ * at any time, you can just call it again, and it will resume its work.
+ */
+int
+spa_sync_labels(spa_t *spa, uint64_t txg)
+{
+ uberblock_t *ub = &spa->spa_uberblock;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd, *uvd;
+ zio_t *zio;
+ int c, l, error;
+
+ ASSERT(ub->ub_txg <= txg);
+
+ /*
+ * If this isn't a resync due to I/O errors, and nothing changed
+ * in this transaction group, and the vdev configuration hasn't changed,
+ * and this isn't an explicit sync-all, then there's nothing to do.
+ */
+ if (ub->ub_txg < txg && uberblock_update(ub, rvd, txg) == B_FALSE &&
+ list_is_empty(&spa->spa_dirty_list)) {
+ dprintf("nothing to sync in %s in txg %llu\n",
+ spa_name(spa), txg);
+ return (0);
+ }
+
+ /* A frozen pool never advances its on-disk state past the freeze. */
+ if (txg > spa_freeze_txg(spa))
+ return (0);
+
+ dprintf("syncing %s txg %llu\n", spa_name(spa), txg);
+
+ /*
+ * Flush the write cache of every disk that's been written to
+ * in this transaction group. This ensures that all blocks
+ * written in this txg will be committed to stable storage
+ * before any uberblock that references them.
+ */
+ zio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+ for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd;
+ vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) {
+ zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
+ NULL, NULL, ZIO_PRIORITY_NOW,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
+ }
+ (void) zio_wait(zio);
+
+ /*
+ * Sync out the even labels (L0, L2) for every dirty vdev. If the
+ * system dies in the middle of this process, that's OK: all of the
+ * even labels that made it to disk will be newer than any uberblock,
+ * and will therefore be considered invalid. The odd labels (L1, L3),
+ * which have not yet been touched, will still be valid.
+ */
+ for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
+ vd = list_next(&spa->spa_dirty_list, vd)) {
+ for (l = 0; l < VDEV_LABELS; l++) {
+ if (l & 1)
+ continue;
+ if ((error = vdev_sync_labels(vd, l, txg)) != 0)
+ return (error);
+ }
+ }
+
+ /*
+ * Flush the new labels to disk. This ensures that all even-label
+ * updates are committed to stable storage before the uberblock update.
+ */
+ zio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+ for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
+ vd = list_next(&spa->spa_dirty_list, vd)) {
+ zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
+ NULL, NULL, ZIO_PRIORITY_NOW,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
+ }
+ (void) zio_wait(zio);
+
+ /*
+ * If there are any dirty vdevs, sync the uberblock to all vdevs.
+ * Otherwise, pick one top-level vdev at random.
+ */
+ if (!list_is_empty(&spa->spa_dirty_list))
+ uvd = rvd;
+ else
+ uvd = rvd->vdev_child[spa_get_random(rvd->vdev_children)];
+
+ /*
+ * Sync the uberblocks. If the system dies in the middle of this
+ * step, there are two cases to consider, and the on-disk state
+ * is consistent either way:
+ *
+ * (1) If none of the new uberblocks made it to disk, then the
+ * previous uberblock will be the newest, and the odd labels
+ * (which had not yet been touched) will be valid with respect
+ * to that uberblock.
+ *
+ * (2) If one or more new uberblocks made it to disk, then they
+ * will be the newest, and the even labels (which had all
+ * been successfully committed) will be valid with respect
+ * to the new uberblocks.
+ */
+ if ((error = vdev_uberblock_sync_tree(spa, ub, uvd, txg)) != 0)
+ return (error);
+
+ /*
+ * Flush the uberblocks to disk. This ensures that the odd labels
+ * are no longer needed (because the new uberblocks and the even
+ * labels are safely on disk), so it is safe to overwrite them.
+ */
+ (void) zio_wait(zio_ioctl(NULL, spa, uvd, DKIOCFLUSHWRITECACHE,
+ NULL, NULL, ZIO_PRIORITY_NOW,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
+
+ /*
+ * Sync out odd labels for every dirty vdev. If the system dies
+ * in the middle of this process, the even labels and the new
+ * uberblocks will suffice to open the pool. The next time
+ * the pool is opened, the first thing we'll do -- before any
+ * user data is modified -- is mark every vdev dirty so that
+ * all labels will be brought up to date.
+ */
+ for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
+ vd = list_next(&spa->spa_dirty_list, vd)) {
+ for (l = 0; l < VDEV_LABELS; l++) {
+ if ((l & 1) == 0)
+ continue;
+ if ((error = vdev_sync_labels(vd, l, txg)) != 0)
+ return (error);
+ }
+ }
+
+ /*
+ * Flush the new labels to disk. This ensures that all odd-label
+ * updates are committed to stable storage before the next
+ * transaction group begins.
+ */
+ zio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+ for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
+ vd = list_next(&spa->spa_dirty_list, vd)) {
+ zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
+ NULL, NULL, ZIO_PRIORITY_NOW,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
+ }
+ (void) zio_wait(zio);
+
+ /*
+ * Clear the dirty list.
+ */
+ while (!list_is_empty(&spa->spa_dirty_list))
+ vdev_config_clean(list_head(&spa->spa_dirty_list));
+
+#ifdef DEBUG
+ for (c = 0; c < rvd->vdev_children; c++) {
+ ASSERT(rvd->vdev_child[c]->vdev_is_dirty == 0);
+ }
+#endif
+
+ return (0);
+}
diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c
new file mode 100644
index 0000000000..45eb7ce78b
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c
@@ -0,0 +1,414 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device vector for mirroring.
+ */
+
+/* Per-child I/O bookkeeping, one entry per mirror child (see io_vsd). */
+typedef struct mirror_map {
+ int mm_error; /* last I/O error seen from this child */
+ short mm_tried; /* nonzero once an I/O was issued to this child */
+ short mm_skipped; /* passed over (dead child or stale DTL) */
+} mirror_map_t;
+
+/*
+ * Allocate a zeroed mirror map with one slot per child and stash it in
+ * the zio's vendor-specific data; freed by vdev_mirror_map_free().
+ */
+static mirror_map_t *
+vdev_mirror_map_alloc(zio_t *zio)
+{
+ zio->io_vsd = kmem_zalloc(zio->io_vd->vdev_children *
+ sizeof (mirror_map_t), KM_SLEEP);
+ return (zio->io_vsd);
+}
+
+/* Release the per-child map allocated by vdev_mirror_map_alloc(). */
+static void
+vdev_mirror_map_free(zio_t *zio)
+{
+ kmem_free(zio->io_vsd,
+ zio->io_vd->vdev_children * sizeof (mirror_map_t));
+ zio->io_vsd = NULL;
+}
+
+/*
+ * Open all children of a mirror.  The mirror's usable size is the
+ * minimum child asize; the open succeeds as long as at least one child
+ * opened, with the last child-open error returned only if all failed.
+ */
+static int
+vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
+{
+ vdev_t *cvd;
+ uint64_t c;
+ int numerrors = 0;
+ int ret, lasterror = 0;
+
+ if (vd->vdev_children == 0) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ cvd = vd->vdev_child[c];
+
+ if ((ret = vdev_open(cvd)) != 0) {
+ lasterror = ret;
+ numerrors++;
+ continue;
+ }
+
+ *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
+ /*
+ * NOTE(review): *ashift simply takes the last opened
+ * child's value, so it assumes all children share an
+ * ashift -- confirm, or take the max across children.
+ */
+ *ashift = cvd->vdev_ashift;
+ }
+
+ if (numerrors == vd->vdev_children) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+ return (lasterror);
+ }
+
+ return (0);
+}
+
+/* Close every child of the mirror. */
+static void
+vdev_mirror_close(vdev_t *vd)
+{
+ uint64_t c;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_close(vd->vdev_child[c]);
+}
+
+/* Record a completed child I/O's result in its mirror-map slot. */
+static void
+vdev_mirror_child_done(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_private;
+
+ mm->mm_error = zio->io_error;
+ mm->mm_tried = 1;
+ mm->mm_skipped = 0;
+}
+
+/*
+ * Completion for a scrub read issued to one child: on success, copy
+ * this child's data into the parent zio's buffer (under the parent's
+ * lock, since siblings may complete concurrently), then free the
+ * per-child buffer and record the result like vdev_mirror_child_done.
+ */
+static void
+vdev_mirror_scrub_done(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_private;
+
+ if (zio->io_error == 0) {
+ zio_t *pio = zio->io_parent;
+ mutex_enter(&pio->io_lock);
+ bcopy(zio->io_data, pio->io_data, pio->io_size);
+ mutex_exit(&pio->io_lock);
+ }
+
+ zio_buf_free(zio->io_data, zio->io_size);
+
+ mm->mm_error = zio->io_error;
+ mm->mm_tried = 1;
+ mm->mm_skipped = 0;
+}
+
+/*
+ * Try to find a child whose DTL doesn't contain the block we want to read.
+ * If we can't, try the read on any vdev we haven't already tried.
+ * Returns the chosen child index, or -1 if every child has failed.
+ */
+static int
+vdev_mirror_child_select(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+ vdev_t *vd = zio->io_vd;
+ vdev_t *cvd;
+ uint64_t txg = zio->io_txg;
+ int i, c;
+
+ ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg);
+
+ /*
+ * Select the child we'd like to read from absent any errors.
+ * The current policy is to alternate sides at 8M granularity.
+ * XXX -- investigate other policies for read distribution.
+ */
+ c = (zio->io_offset >> (SPA_MAXBLOCKSHIFT + 6)) % vd->vdev_children;
+
+ /*
+ * If this is a replacing vdev, always try child 0 (the source) first.
+ */
+ if (vd->vdev_ops == &vdev_replacing_ops)
+ c = 0;
+
+ /*
+ * Try to find a child whose DTL doesn't contain the block to read.
+ * If a child is known to be completely inaccessible (indicated by
+ * vdev_is_dead() returning B_TRUE), don't even try.
+ */
+ for (i = 0; i < vd->vdev_children; i++, c++) {
+ if (c >= vd->vdev_children)
+ c = 0; /* wrap around from the preferred start */
+ if (mm[c].mm_tried || mm[c].mm_skipped)
+ continue;
+ cvd = vd->vdev_child[c];
+ if (vdev_is_dead(cvd)) {
+ mm[c].mm_error = ENXIO;
+ mm[c].mm_tried = 1; /* don't even try */
+ mm[c].mm_skipped = 1;
+ continue;
+ }
+ if (!vdev_dtl_contains(&cvd->vdev_dtl_map, txg, 1))
+ return (c);
+ mm[c].mm_error = ESTALE;
+ mm[c].mm_skipped = 1;
+ }
+
+ /*
+ * Every device is either missing or has this txg in its DTL.
+ * If we don't have any sibling replicas to consult, look for
+ * any child we haven't already tried before giving up.
+ */
+ if (vd == vd->vdev_top || vd->vdev_parent->vdev_children <= 1) {
+ for (c = 0; c < vd->vdev_children; c++) {
+ if (!mm[c].mm_tried)
+ return (c);
+ }
+ }
+
+ /*
+ * Every child failed. There's no place left to look.
+ */
+ return (-1);
+}
+
+/*
+ * I/O start for the mirror vdev.  Scrub reads fan out to every child;
+ * normal reads pick one child via vdev_mirror_child_select(); writes
+ * fan out to all children (or just the new side of a replacing vdev
+ * during resilver).  Children are issued asynchronously and the parent
+ * waits for them in vdev_mirror_io_done().
+ */
+static void
+vdev_mirror_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ mirror_map_t *mm;
+ int c, children;
+
+ mm = vdev_mirror_map_alloc(zio);
+
+ if (zio->io_type == ZIO_TYPE_READ) {
+ if (zio->io_flags & ZIO_FLAG_SCRUB) {
+ /*
+ * For scrubbing reads we need to allocate a read
+ * buffer for each child and issue reads to all
+ * children. If any child succeeds, it will copy its
+ * data into zio->io_data in vdev_mirror_scrub_done.
+ */
+ for (c = 0; c < vd->vdev_children; c++) {
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ vd->vdev_child[c], zio->io_offset,
+ zio_buf_alloc(zio->io_size), zio->io_size,
+ zio->io_type, zio->io_priority,
+ ZIO_FLAG_CANFAIL, vdev_mirror_scrub_done,
+ &mm[c]));
+ }
+ zio_wait_children_done(zio);
+ return;
+ }
+ /*
+ * For normal reads just pick one child.
+ */
+ c = vdev_mirror_child_select(zio);
+ children = (c >= 0); /* zero children if nothing selectable */
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+
+ /*
+ * If this is a resilvering I/O to a replacing vdev,
+ * only the last child should be written -- unless the
+ * first child happens to have a DTL entry here as well.
+ * All other writes go to all children.
+ */
+ if ((zio->io_flags & ZIO_FLAG_RESILVER) &&
+ vd->vdev_ops == &vdev_replacing_ops &&
+ !vdev_dtl_contains(&vd->vdev_child[0]->vdev_dtl_map,
+ zio->io_txg, 1)) {
+ c = vd->vdev_children - 1;
+ children = 1;
+ } else {
+ c = 0;
+ children = vd->vdev_children;
+ }
+ }
+
+ while (children--) {
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ vd->vdev_child[c], zio->io_offset, zio->io_data,
+ zio->io_size, zio->io_type, zio->io_priority,
+ ZIO_FLAG_CANFAIL, vdev_mirror_child_done, &mm[c]));
+ c++;
+ }
+
+ zio_wait_children_done(zio);
+}
+
+/*
+ * I/O completion for the mirror vdev.  Tallies per-child results from
+ * the mirror map; writes succeed if any child succeeded, failed reads
+ * are retried on the next eligible child, and a successful read may
+ * trigger self-healing writes to children that returned bad data.
+ */
+static void
+vdev_mirror_io_done(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *cvd;
+ mirror_map_t *mm = zio->io_vsd;
+ int c;
+ int good_copies = 0;
+ int unexpected_errors = 0;
+
+ ASSERT(mm != NULL);
+
+ zio->io_error = 0;
+ zio->io_numerrors = 0;
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ if (mm[c].mm_tried && mm[c].mm_error == 0) {
+ good_copies++;
+ continue;
+ }
+
+ /*
+ * We preserve any EIOs because those may be worth retrying;
+ * whereas ECKSUM and ENXIO are more likely to be persistent.
+ */
+ if (mm[c].mm_error) {
+ if (zio->io_error != EIO)
+ zio->io_error = mm[c].mm_error;
+ if (!mm[c].mm_skipped)
+ unexpected_errors++;
+ zio->io_numerrors++;
+ }
+ }
+
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ /*
+ * XXX -- for now, treat partial writes as success.
+ */
+ /* XXPOLICY */
+ if (good_copies != 0)
+ zio->io_error = 0;
+ ASSERT(mm != NULL);
+ vdev_mirror_map_free(zio);
+ zio_next_stage(zio);
+ return;
+ }
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+ /*
+ * If we don't have a good copy yet, keep trying other children.
+ */
+ /* XXPOLICY */
+ if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
+ ASSERT(c >= 0 && c < vd->vdev_children);
+ cvd = vd->vdev_child[c];
+ dprintf("%s: retrying i/o (err=%d) on child %s\n",
+ vdev_description(zio->io_vd), zio->io_error,
+ vdev_description(cvd));
+ zio->io_error = 0;
+ zio_vdev_io_redone(zio);
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp, cvd,
+ zio->io_offset, zio->io_data, zio->io_size,
+ ZIO_TYPE_READ, zio->io_priority, ZIO_FLAG_CANFAIL,
+ vdev_mirror_child_done, &mm[c]));
+ zio_wait_children_done(zio);
+ return;
+ }
+
+ /* XXPOLICY */
+ if (good_copies)
+ zio->io_error = 0;
+ else
+ ASSERT(zio->io_error != 0);
+
+ /* Self-heal only when the pool is writable and repair is warranted. */
+ if (good_copies && (spa_mode & FWRITE) &&
+ (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
+ /*
+ * Use the good data we have in hand to repair damaged children.
+ */
+ for (c = 0; c < vd->vdev_children; c++) {
+ /*
+ * Don't rewrite known good children.
+ * Not only is it unnecessary, it could
+ * actually be harmful: if the system lost
+ * power while rewriting the only good copy,
+ * there would be no good copies left!
+ */
+ cvd = vd->vdev_child[c];
+
+ if (mm[c].mm_error == 0) {
+ if (mm[c].mm_tried)
+ continue;
+ if (!vdev_dtl_contains(&cvd->vdev_dtl_map,
+ zio->io_txg, 1))
+ continue;
+ mm[c].mm_error = ESTALE;
+ }
+
+ dprintf("%s resilvered %s @ 0x%llx error %d\n",
+ vdev_description(vd),
+ vdev_description(cvd),
+ zio->io_offset, mm[c].mm_error);
+
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp, cvd,
+ zio->io_offset, zio->io_data, zio->io_size,
+ ZIO_TYPE_WRITE, zio->io_priority,
+ ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE, NULL, NULL));
+ }
+ }
+
+ vdev_mirror_map_free(zio);
+ zio_next_stage(zio);
+}
+
+/*
+ * Map child fault/degrade counts onto the mirror's state: all children
+ * faulted -> CANT_OPEN; any faulted or degraded -> DEGRADED; else HEALTHY.
+ */
+static void
+vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
+{
+ if (faulted == vd->vdev_children)
+ vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS);
+ else if (degraded + faulted != 0)
+ vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ else
+ vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+}
+
+/* Operations vector for N-way mirror vdevs. */
+vdev_ops_t vdev_mirror_ops = {
+ vdev_mirror_open,
+ vdev_mirror_close,
+ vdev_default_asize,
+ vdev_mirror_io_start,
+ vdev_mirror_io_done,
+ vdev_mirror_state_change,
+ VDEV_TYPE_MIRROR, /* name of this vdev type */
+ B_FALSE /* not a leaf vdev */
+};
+
+/*
+ * Operations vector for "replacing" vdevs: same machinery as the
+ * mirror, but reads prefer child 0 (the original) and resilver writes
+ * target only the new side (see vdev_mirror_io_start/child_select).
+ */
+vdev_ops_t vdev_replacing_ops = {
+ vdev_mirror_open,
+ vdev_mirror_close,
+ vdev_default_asize,
+ vdev_mirror_io_start,
+ vdev_mirror_io_done,
+ vdev_mirror_state_change,
+ VDEV_TYPE_REPLACING, /* name of this vdev type */
+ B_FALSE /* not a leaf vdev */
+};
diff --git a/usr/src/uts/common/fs/zfs/vdev_missing.c b/usr/src/uts/common/fs/zfs/vdev_missing.c
new file mode 100644
index 0000000000..b35f4a5bcd
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_missing.c
@@ -0,0 +1,89 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * The 'missing' vdev is a special vdev type used only during import. It
+ * signifies a placeholder in the root vdev for some vdev that we know is
+ * missing. We pass it down to the kernel to allow the rest of the
+ * configuration to be parsed and an attempt made to open all available devices.
+ * Because its GUID is always 0, we know that the guid sum will mismatch and we
+ * won't be able to open the pool anyway.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+
+/* ARGSUSED */
+static int
+vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+{
+ /*
+ * Really this should just fail. But then the root vdev will be in the
+ * faulted state with VDEV_AUX_NO_REPLICAS, when what we really want is
+ * VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we
+ * will fail the GUID sum check before ever trying to open the pool.
+ */
+ *psize = SPA_MINDEVSIZE; /* smallest plausible size/shift */
+ *ashift = SPA_MINBLOCKSHIFT;
+ return (0);
+}
+
+/* ARGSUSED */
+/* Nothing was opened, so there is nothing to close. */
+static void
+vdev_missing_close(vdev_t *vd)
+{
+}
+
+/* ARGSUSED */
+/* A missing vdev can never do I/O; fail every request with ENOTSUP. */
+static void
+vdev_missing_io_start(zio_t *zio)
+{
+ zio->io_error = ENOTSUP;
+ zio_next_stage_async(zio);
+}
+
+/* ARGSUSED */
+/* No completion work to do; just advance the pipeline. */
+static void
+vdev_missing_io_done(zio_t *zio)
+{
+ zio_next_stage(zio);
+}
+
+/* Operations vector for the import-time "missing" placeholder vdev. */
+vdev_ops_t vdev_missing_ops = {
+ vdev_missing_open,
+ vdev_missing_close,
+ vdev_default_asize,
+ vdev_missing_io_start,
+ vdev_missing_io_done,
+ NULL, /* no state-change callback needed */
+ VDEV_TYPE_MISSING, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c
new file mode 100644
index 0000000000..09831e1504
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c
@@ -0,0 +1,286 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/avl.h>
+
+/*
+ * Virtual device vector for disk I/O scheduling.
+ */
+/*
+ * AVL comparator: order zios by deadline, then by offset, and finally by
+ * the node addresses themselves so distinct zios never compare equal.
+ */
+int
+vdev_queue_deadline_compare(const void *x1, const void *x2)
+{
+	const zio_t *z1 = x1;
+	const zio_t *z2 = x2;
+
+	if (z1->io_deadline < z2->io_deadline)
+		return (-1);
+	if (z1->io_deadline > z2->io_deadline)
+		return (1);
+
+	if (z1->io_offset < z2->io_offset)
+		return (-1);
+	if (z1->io_offset > z2->io_offset)
+		return (1);
+
+	/* Tie-break on the pointers so the AVL tree allows duplicates. */
+	if (z1 < z2)
+		return (-1);
+	if (z1 > z2)
+		return (1);
+
+	return (0);
+}
+
+/*
+ * AVL comparator: order zios by device offset, tie-breaking on the node
+ * addresses so distinct zios never compare equal.
+ */
+int
+vdev_queue_offset_compare(const void *x1, const void *x2)
+{
+	const zio_t *z1 = x1;
+	const zio_t *z2 = x2;
+
+	if (z1->io_offset < z2->io_offset)
+		return (-1);
+	if (z1->io_offset > z2->io_offset)
+		return (1);
+
+	if (z1 < z2)
+		return (-1);
+	if (z1 > z2)
+		return (1);
+
+	return (0);
+}
+
+/*
+ * Set up the per-vdev I/O queue: a tree ordered by deadline, per-type
+ * (read/write) trees ordered by offset, and a tree of in-flight I/Os.
+ */
+void
+vdev_queue_init(vdev_t *vd)
+{
+	vdev_queue_t *vq = &vd->vdev_queue;
+
+	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare,
+	    sizeof (zio_t), offsetof(struct zio, io_deadline_node));
+
+	avl_create(&vq->vq_read_tree, vdev_queue_offset_compare,
+	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
+
+	avl_create(&vq->vq_write_tree, vdev_queue_offset_compare,
+	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
+
+	avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
+	    sizeof (zio_t), offsetof(struct zio, io_offset_node))
+
+/* Tear down everything created by vdev_queue_init(). */
+void
+vdev_queue_fini(vdev_t *vd)
+{
+	vdev_queue_t *vq = &vd->vdev_queue;
+
+	avl_destroy(&vq->vq_deadline_tree);
+	avl_destroy(&vq->vq_read_tree);
+	avl_destroy(&vq->vq_write_tree);
+	avl_destroy(&vq->vq_pending_tree);
+
+	mutex_destroy(&vq->vq_lock);
+}
+
+/*
+ * Completion callback for an aggregated I/O: copy read data back out to
+ * each delegated child zio, propagate the aggregate's error to each,
+ * advance each child to its next stage, and free the staging buffer.
+ */
+static void
+vdev_queue_agg_io_done(zio_t *aio)
+{
+	zio_t *dio;
+	uint64_t offset = 0;
+
+	while ((dio = aio->io_delegate_list) != NULL) {
+		if (aio->io_type == ZIO_TYPE_READ)
+			bcopy((char *)aio->io_data + offset, dio->io_data,
+			    dio->io_size);
+		offset += dio->io_size;
+		aio->io_delegate_list = dio->io_delegate_next;
+		dio->io_delegate_next = NULL;
+		dio->io_error = aio->io_error;
+		zio_next_stage(dio);
+	}
+	/* The delegates must exactly tile the aggregate buffer. */
+	ASSERT3U(offset, ==, aio->io_size);
+
+	zio_buf_free(aio->io_data, aio->io_size);
+}
+
+/* True iff 'nio' begins exactly where 'io' ends (physically contiguous). */
+#define	IS_ADJACENT(io, nio) \
+	((io)->io_offset + (io)->io_size == (nio)->io_offset)
+
+/* Signature of the function used to actually issue a selected zio. */
+typedef void zio_issue_func_t(zio_t *);
+
+/*
+ * Select the next I/O to issue. Starting from the zio with the earliest
+ * deadline, greedily aggregate physically adjacent zios of the same type
+ * (in both directions) up to vq_agg_limit bytes. Returns NULL when the
+ * pending limit is reached or nothing is queued; otherwise returns the
+ * zio to issue and sets *funcp to the function that must issue it
+ * (zio_nowait for an aggregate, zio_next_stage for a single zio).
+ * Caller must hold vq_lock.
+ */
+static zio_t *
+vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit,
+	zio_issue_func_t **funcp)
+{
+	zio_t *fio, *lio, *aio, *dio;
+	avl_tree_t *tree;
+	uint64_t size;
+
+	ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+	*funcp = NULL;
+
+	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
+	    avl_numnodes(&vq->vq_deadline_tree) == 0)
+		return (NULL);
+
+	fio = lio = avl_first(&vq->vq_deadline_tree);
+
+	tree = fio->io_vdev_tree;
+	size = fio->io_size;
+
+	/* Walk backward in offset order, prepending adjacent zios. */
+	while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) &&
+	    size + dio->io_size <= vq->vq_agg_limit) {
+		dio->io_delegate_next = fio;
+		fio = dio;
+		size += dio->io_size;
+	}
+
+	/* Walk forward in offset order, appending adjacent zios. */
+	while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) &&
+	    size + dio->io_size <= vq->vq_agg_limit) {
+		lio->io_delegate_next = dio;
+		lio = dio;
+		size += dio->io_size;
+	}
+
+	if (fio != lio) {
+		/*
+		 * We found something to aggregate: build one child I/O
+		 * covering the whole contiguous range and delegate the
+		 * originals to it.
+		 */
+		char *buf = zio_buf_alloc(size);
+		uint64_t offset = 0;
+		int nagg = 0;
+
+		ASSERT(size <= vq->vq_agg_limit);
+
+		aio = zio_vdev_child_io(fio, NULL, fio->io_vd,
+		    fio->io_offset, buf, size, fio->io_type,
+		    ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE |
+		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE,
+		    vdev_queue_agg_io_done, NULL);
+
+		aio->io_delegate_list = fio;
+
+		for (dio = fio; dio != NULL; dio = dio->io_delegate_next) {
+			ASSERT(dio->io_type == aio->io_type);
+			if (dio->io_type == ZIO_TYPE_WRITE)
+				bcopy(dio->io_data, buf + offset, dio->io_size);
+			offset += dio->io_size;
+			avl_remove(&vq->vq_deadline_tree, dio);
+			avl_remove(tree, dio);
+			zio_vdev_io_bypass(dio);
+			nagg++;
+		}
+
+		ASSERT(offset == size);
+
+		dprintf("%5s  T=%llu off=%8llx agg=%3d "
+		    "old=%5llx new=%5llx\n",
+		    zio_type_name[fio->io_type],
+		    fio->io_deadline, fio->io_offset, nagg, fio->io_size, size);
+
+		avl_add(&vq->vq_pending_tree, aio);
+
+		*funcp = zio_nowait;
+		return (aio);
+	}
+
+	/* No aggregation possible: issue the deadline-first zio alone. */
+	avl_remove(&vq->vq_deadline_tree, fio);
+	avl_remove(tree, fio);
+
+	avl_add(&vq->vq_pending_tree, fio);
+
+	*funcp = zio_next_stage;
+
+	return (fio);
+}
+
+/*
+ * Queue an incoming read or write zio. The deadline is the issue
+ * timestamp scaled down by vq_time_shift and biased by priority, so
+ * lower-priority I/O sorts later. If fewer than vq_min_pending I/Os
+ * are in flight, the next eligible I/O is issued. Returns the zio the
+ * caller should continue driving, or NULL if it was handed off (an
+ * aggregate was issued via zio_nowait instead).
+ */
+zio_t *
+vdev_queue_io(zio_t *zio)
+{
+	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+	zio_t *nio;
+	zio_issue_func_t *func;
+
+	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+
+	/* Already queued once -- don't queue (or cache) it again. */
+	if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
+		return (zio);
+
+	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
+
+	if (zio->io_type == ZIO_TYPE_READ)
+		zio->io_vdev_tree = &vq->vq_read_tree;
+	else
+		zio->io_vdev_tree = &vq->vq_write_tree;
+
+	mutex_enter(&vq->vq_lock);
+
+	zio->io_deadline = (zio->io_timestamp >> vq->vq_time_shift) +
+	    zio->io_priority;
+
+	avl_add(&vq->vq_deadline_tree, zio);
+	avl_add(zio->io_vdev_tree, zio);
+
+	nio = vdev_queue_io_to_issue(vq, vq->vq_min_pending, &func);
+
+	mutex_exit(&vq->vq_lock);
+
+	if (nio == NULL || func != zio_nowait)
+		return (nio);
+
+	/* An aggregate must be launched here; the caller never sees it. */
+	func(nio);
+	return (NULL);
+}
+
+/*
+ * Called when a queued zio completes: remove it from the pending tree,
+ * then issue up to vq_ramp_rate more I/Os, subject to vq_max_pending.
+ * vq_lock is dropped around each issue since func() may re-enter the
+ * queue.
+ */
+void
+vdev_queue_io_done(zio_t *zio)
+{
+	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+	zio_t *nio;
+	zio_issue_func_t *func;
+	int i;
+
+	mutex_enter(&vq->vq_lock);
+
+	avl_remove(&vq->vq_pending_tree, zio);
+
+	for (i = 0; i < vq->vq_ramp_rate; i++) {
+		nio = vdev_queue_io_to_issue(vq, vq->vq_max_pending, &func);
+		if (nio == NULL)
+			break;
+		mutex_exit(&vq->vq_lock);
+		/* Single zios were bypassed earlier; restart their pipeline. */
+		if (func == zio_next_stage)
+			zio_vdev_io_reissue(nio);
+		func(nio);
+		mutex_enter(&vq->vq_lock);
+	}
+
+	mutex_exit(&vq->vq_lock);
+}
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c
new file mode 100644
index 0000000000..54547a3c97
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c
@@ -0,0 +1,599 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device vector for RAID-Z.
+ */
+
+/*
+ * We currently allow up to two-way replication (i.e. single-fault
+ * reconstruction) models in RAID-Z vdevs. The blocks in such vdevs
+ * must all be multiples of two times the leaf vdev blocksize.
+ */
+#define	VDEV_RAIDZ_ALIGN	2ULL
+
+/* Per-column state for one logical RAID-Z I/O. */
+typedef struct raidz_col {
+	uint64_t rc_col;	/* child vdev index for this column */
+	uint64_t rc_offset;	/* device offset of this column */
+	uint64_t rc_size;	/* I/O size for this column */
+	void *rc_data;		/* column data buffer */
+	int rc_error;		/* error from this column's child I/O */
+	short rc_tried;		/* nonzero once the child I/O was issued */
+	short rc_skipped;	/* nonzero if the error was anticipated */
+} raidz_col_t;
+
+/* Layout of one logical RAID-Z I/O across the child vdevs. */
+typedef struct raidz_map {
+	uint64_t rm_cols;	/* number of columns in use */
+	uint64_t rm_bigcols;	/* leading columns carrying an extra sector */
+	uint64_t rm_asize;	/* allocated size, parity included */
+	int rm_missing_child;	/* column known bad before I/O, or -1 */
+	int rm_type;		/* RAIDZ_SINGLE or RAIDZ_PARITY */
+	int rm_firstdatacol;	/* index of the first data column */
+	raidz_col_t rm_col[1];	/* variable-length array of columns */
+} raidz_map_t;
+
+#define	RAIDZ_SINGLE	0	/* no parity; pure striping */
+#define	RAIDZ_PARITY	1	/* single parity, kept in column 0 */
+
+/*
+ * Map a logical I/O onto the child vdevs: compute each column's child
+ * index, device offset and size, allocate fresh buffers for the parity
+ * columns, and carve the zio's own buffer into the data columns. The
+ * map is hung off zio->io_vsd and freed by vdev_raidz_map_free().
+ */
+static raidz_map_t *
+vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
+    int raid_type)
+{
+	raidz_map_t *rm;
+	uint64_t b = zio->io_offset >> unit_shift;	/* starting sector */
+	uint64_t s = zio->io_size >> unit_shift;	/* size in sectors */
+	uint64_t f = b % dcols;				/* first column */
+	uint64_t o = (b / dcols) << unit_shift;		/* row offset */
+	uint64_t q, r, c, bc, col, acols, coff;
+	int firstdatacol;
+
+	switch (raid_type) {
+	case RAIDZ_SINGLE:
+		q = s / dcols;
+		r = s - q * dcols;
+		bc = r;
+		firstdatacol = 0;
+		break;
+	case RAIDZ_PARITY:
+		q = s / (dcols - 1);
+		r = s - q * (dcols - 1);
+		bc = r + !!r;	/* the parity column is always "big" */
+		firstdatacol = 1;
+		break;
+	default:
+		/*
+		 * Previously an unrecognized type fell through with q, r,
+		 * bc and firstdatacol uninitialized; fail loudly instead.
+		 */
+		cmn_err(CE_PANIC, "invalid RAID-Z type %d", raid_type);
+		/* NOTREACHED */
+	}
+
+	acols = (q == 0 ? bc : dcols);
+
+	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
+
+	rm->rm_cols = acols;
+	rm->rm_bigcols = bc;
+	rm->rm_asize = 0;
+	rm->rm_missing_child = -1;
+	rm->rm_type = raid_type;
+	rm->rm_firstdatacol = firstdatacol;
+
+	for (c = 0; c < acols; c++) {
+		col = f + c;
+		coff = o;
+		if (col >= dcols) {
+			col -= dcols;
+			coff += 1ULL << unit_shift;
+		}
+		rm->rm_col[c].rc_col = col;
+		rm->rm_col[c].rc_offset = coff;
+		rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
+		rm->rm_col[c].rc_data = NULL;
+		rm->rm_col[c].rc_error = 0;
+		rm->rm_col[c].rc_tried = 0;
+		rm->rm_col[c].rc_skipped = 0;
+		rm->rm_asize += rm->rm_col[c].rc_size;
+	}
+
+	rm->rm_asize = P2ROUNDUP(rm->rm_asize, VDEV_RAIDZ_ALIGN << unit_shift);
+
+	/* Parity columns get private buffers; data maps the zio's buffer. */
+	for (c = 0; c < rm->rm_firstdatacol; c++)
+		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
+
+	rm->rm_col[c].rc_data = zio->io_data;
+
+	for (c = c + 1; c < acols; c++)
+		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
+		    rm->rm_col[c - 1].rc_size;
+
+	if (raid_type == RAIDZ_PARITY) {
+		/*
+		 * To prevent hot parity disks, switch the parity and data
+		 * columns every 1MB.
+		 */
+		ASSERT(rm->rm_cols >= 2);
+		ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
+
+		if (zio->io_offset & (1ULL << 20)) {
+			col = rm->rm_col[0].rc_col;
+			o = rm->rm_col[0].rc_offset;
+			rm->rm_col[0].rc_col = rm->rm_col[1].rc_col;
+			rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
+			rm->rm_col[1].rc_col = col;
+			rm->rm_col[1].rc_offset = o;
+		}
+	}
+
+	zio->io_vsd = rm;
+	return (rm);
+}
+
+/*
+ * Free the parity buffers and the map itself; the data columns point
+ * into zio->io_data, which the zio owns.
+ */
+static void
+vdev_raidz_map_free(zio_t *zio)
+{
+	raidz_map_t *rm = zio->io_vsd;
+	int c;
+
+	for (c = 0; c < rm->rm_firstdatacol; c++)
+		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
+
+	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
+	zio->io_vsd = NULL;
+}
+
+/*
+ * Rebuild column x as the XOR of all other columns. Since single
+ * parity is a plain XOR, the same routine both regenerates lost data
+ * and generates parity (reconstructing column 0).
+ */
+static void
+vdev_raidz_reconstruct(raidz_map_t *rm, int x)
+{
+	uint64_t *dst, *src, count, xsize, csize;
+	int i, c;
+
+	for (c = 0; c < rm->rm_cols; c++) {
+		if (c == x)
+			continue;
+		src = rm->rm_col[c].rc_data;
+		dst = rm->rm_col[x].rc_data;
+		csize = rm->rm_col[c].rc_size;
+		xsize = rm->rm_col[x].rc_size;
+		count = MIN(csize, xsize) / sizeof (uint64_t);
+		if (c == !x) {
+			/*
+			 * The initial copy happens at either c == 0 or c == 1.
+			 * Both of these columns are 'big' columns, so we'll
+			 * definitely initialize all of column x.
+			 */
+			ASSERT3U(xsize, <=, csize);
+			for (i = 0; i < count; i++)
+				*dst++ = *src++;
+		} else {
+			for (i = 0; i < count; i++)
+				*dst++ ^= *src++;
+		}
+	}
+}
+
+/*
+ * Open all children; the group's usable size is the smallest child's
+ * size times the number of children (columns). More than one failed
+ * child means the single-parity group cannot be opened.
+ */
+static int
+vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
+{
+	vdev_t *cvd;
+	int c, error;
+	int lasterror = 0;
+	int numerrors = 0;
+
+	/*
+	 * XXX -- minimum children should be raid-type-specific
+	 */
+	if (vd->vdev_children < 2) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (EINVAL);
+	}
+
+	for (c = 0; c < vd->vdev_children; c++) {
+		cvd = vd->vdev_child[c];
+
+		if ((error = vdev_open(cvd)) != 0) {
+			lasterror = error;
+			numerrors++;
+			continue;
+		}
+
+		/* Track the minimum child size (the -1/+1 avoids underflow). */
+		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
+		*ashift = cvd->vdev_ashift;
+	}
+
+	*asize *= vd->vdev_children;
+
+	if (numerrors > 1) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+		return (lasterror);
+	}
+
+	return (0);
+}
+
+/* Close every child vdev in the RAID-Z group. */
+static void
+vdev_raidz_close(vdev_t *vd)
+{
+	int c;
+
+	for (c = 0; c < vd->vdev_children; c++)
+		vdev_close(vd->vdev_child[c]);
+}
+
+/*
+ * Convert a logical (physical-data) size into the space actually
+ * consumed: add one parity sector per (cols - 1) data sectors, then
+ * round up to the RAID-Z allocation alignment.
+ */
+static uint64_t
+vdev_raidz_asize(vdev_t *vd, uint64_t psize)
+{
+	uint64_t asize;
+	uint64_t cols = vd->vdev_children;
+
+	/*
+	 * These calculations assume RAIDZ_PARITY.
+	 */
+	asize = psize >> vd->vdev_ashift;
+	asize += (asize + cols - 2) / (cols - 1);
+	asize = P2ROUNDUP(asize, VDEV_RAIDZ_ALIGN) << vd->vdev_ashift;
+
+	return (asize);
+}
+
+/* Record a child I/O's result in its column's state. */
+static void
+vdev_raidz_child_done(zio_t *zio)
+{
+	raidz_col_t *rc = zio->io_private;
+
+	rc->rc_error = zio->io_error;
+	rc->rc_tried = 1;
+	rc->rc_skipped = 0;
+}
+
+/* Free the private copy of repair data once the rewrite completes. */
+static void
+vdev_raidz_repair_done(zio_t *zio)
+{
+	zio_buf_free(zio->io_data, zio->io_size);
+}
+
+/*
+ * Start a RAID-Z I/O. Writes generate parity and fan out to every
+ * column. Reads skip columns that are known dead or stale (recording
+ * which one, for later reconstruction) and, unless a column is already
+ * missing or this is a scrub, do not bother reading the parity.
+ */
+static void
+vdev_raidz_io_start(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_t *cvd;
+	blkptr_t *bp = zio->io_bp;
+	raidz_map_t *rm;
+	raidz_col_t *rc;
+	int c;
+
+	rm = vdev_raidz_map_alloc(zio, vd->vdev_ashift, vd->vdev_children,
+	    RAIDZ_PARITY);
+
+	/* Sanity-check the map against what the block pointer claims. */
+	if (DVA_GET_GANG(ZIO_GET_DVA(zio))) {
+		ASSERT3U(rm->rm_asize, ==,
+		    vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE));
+		ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
+	} else {
+		ASSERT3U(rm->rm_asize, ==, DVA_GET_ASIZE(ZIO_GET_DVA(zio)));
+		ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
+	}
+
+	if (zio->io_type == ZIO_TYPE_WRITE) {
+
+		/*
+		 * Generate RAID parity in virtual column 0.
+		 */
+		vdev_raidz_reconstruct(rm, 0);
+
+		for (c = 0; c < rm->rm_cols; c++) {
+			rc = &rm->rm_col[c];
+			cvd = vd->vdev_child[rc->rc_col];
+			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+			    rc->rc_offset, rc->rc_data, rc->rc_size,
+			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
+			    vdev_raidz_child_done, rc));
+		}
+		zio_wait_children_done(zio);
+		return;
+	}
+
+	ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+	for (c = rm->rm_cols - 1; c >= 0; c--) {
+		rc = &rm->rm_col[c];
+		cvd = vd->vdev_child[rc->rc_col];
+		if (vdev_is_dead(cvd)) {
+			rm->rm_missing_child = c;
+			rc->rc_error = ENXIO;
+			rc->rc_tried = 1;	/* don't even try */
+			rc->rc_skipped = 1;
+			continue;
+		}
+		if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
+			rm->rm_missing_child = c;
+			rc->rc_error = ESTALE;
+			rc->rc_skipped = 1;
+			continue;
+		}
+		if (c >= rm->rm_firstdatacol || rm->rm_missing_child != -1 ||
+		    (zio->io_flags & ZIO_FLAG_SCRUB)) {
+			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+			    rc->rc_offset, rc->rc_data, rc->rc_size,
+			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
+			    vdev_raidz_child_done, rc));
+		}
+	}
+
+	zio_wait_children_done(zio);
+}
+
+/*
+ * Complete a RAID-Z I/O. Writes succeed if no more columns failed than
+ * parity can cover. Reads are verified by checksum and, on failure,
+ * recovered in escalating steps: reconstruct the single expected-bad
+ * column; re-read every column not yet tried; reconstruct each column
+ * that reported an error; finally try substituting every column in turn
+ * (combinatorial reconstruction). Whenever good data is in hand, any
+ * damaged children are rewritten before the map is freed.
+ */
+static void
+vdev_raidz_io_done(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_t *cvd;
+	raidz_map_t *rm = zio->io_vsd;
+	raidz_col_t *rc;
+	blkptr_t *bp = zio->io_bp;
+	int unexpected_errors = 0;
+	int c;
+
+	ASSERT(bp != NULL);	/* XXX need to add code to enforce this */
+
+	zio->io_error = 0;
+	zio->io_numerrors = 0;
+
+	for (c = 0; c < rm->rm_cols; c++) {
+		rc = &rm->rm_col[c];
+
+		/*
+		 * We preserve any EIOs because those may be worth retrying;
+		 * whereas ECKSUM and ENXIO are more likely to be persistent.
+		 */
+		if (rc->rc_error) {
+			if (zio->io_error != EIO)
+				zio->io_error = rc->rc_error;
+			if (!rc->rc_skipped)
+				unexpected_errors++;
+			zio->io_numerrors++;
+		}
+	}
+
+	if (zio->io_type == ZIO_TYPE_WRITE) {
+		/*
+		 * If this is not a failfast write, and we were able to
+		 * write enough columns to reconstruct the data, good enough.
+		 */
+		/* XXPOLICY */
+		if (zio->io_numerrors <= rm->rm_firstdatacol &&
+		    !(zio->io_flags & ZIO_FLAG_FAILFAST))
+			zio->io_error = 0;
+
+		vdev_raidz_map_free(zio);
+		zio_next_stage(zio);
+		return;
+	}
+
+	ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+	/*
+	 * If there were no I/O errors, and the data checksums correctly,
+	 * the read is complete.
+	 */
+	/* XXPOLICY */
+	if (zio->io_numerrors == 0 && zio_checksum_error(zio) == 0) {
+		ASSERT(unexpected_errors == 0);
+		ASSERT(zio->io_error == 0);
+
+		/*
+		 * We know the data's good. If we read the parity,
+		 * verify that it's good as well. If not, fix it.
+		 */
+		for (c = 0; c < rm->rm_firstdatacol; c++) {
+			void *orig;
+			rc = &rm->rm_col[c];
+			if (!rc->rc_tried)
+				continue;
+			orig = zio_buf_alloc(rc->rc_size);
+			bcopy(rc->rc_data, orig, rc->rc_size);
+			vdev_raidz_reconstruct(rm, c);
+			if (bcmp(orig, rc->rc_data, rc->rc_size) != 0) {
+				vdev_checksum_error(zio,
+				    vd->vdev_child[rc->rc_col]);
+				rc->rc_error = ECKSUM;
+				unexpected_errors++;
+			}
+			zio_buf_free(orig, rc->rc_size);
+		}
+		goto done;
+	}
+
+	/*
+	 * If there was exactly one I/O error, it's the one we expected,
+	 * and the reconstructed data checksums, the read is complete.
+	 * This happens when one child is offline and vdev_fault_assess()
+	 * knows it, or when one child has stale data and the DTL knows it.
+	 */
+	if (zio->io_numerrors == 1 && (c = rm->rm_missing_child) != -1) {
+		rc = &rm->rm_col[c];
+		ASSERT(unexpected_errors == 0);
+		ASSERT(rc->rc_error == ENXIO || rc->rc_error == ESTALE);
+		vdev_raidz_reconstruct(rm, c);
+		if (zio_checksum_error(zio) == 0) {
+			zio->io_error = 0;
+			goto done;
+		}
+	}
+
+	/*
+	 * This isn't a typical error -- either we got a read error or
+	 * more than one child claimed a problem. Read every block we
+	 * haven't already so we can try combinatorial reconstruction.
+	 */
+	unexpected_errors = 1;
+	rm->rm_missing_child = -1;
+
+	for (c = 0; c < rm->rm_cols; c++)
+		if (!rm->rm_col[c].rc_tried)
+			break;
+
+	if (c != rm->rm_cols) {
+		/* Some columns were never read; read them and come back. */
+		zio->io_error = 0;
+		zio_vdev_io_redone(zio);
+		for (c = 0; c < rm->rm_cols; c++) {
+			rc = &rm->rm_col[c];
+			if (rc->rc_tried)
+				continue;
+			zio_nowait(zio_vdev_child_io(zio, NULL,
+			    vd->vdev_child[rc->rc_col],
+			    rc->rc_offset, rc->rc_data, rc->rc_size,
+			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
+			    vdev_raidz_child_done, rc));
+		}
+		zio_wait_children_done(zio);
+		return;
+	}
+
+	/*
+	 * If there were more errors than parity disks, give up.
+	 */
+	if (zio->io_numerrors > rm->rm_firstdatacol) {
+		ASSERT(zio->io_error != 0);
+		goto done;
+	}
+
+	/*
+	 * The number of I/O errors is correctable. Correct them here.
+	 */
+	ASSERT(zio->io_numerrors <= rm->rm_firstdatacol);
+	for (c = 0; c < rm->rm_cols; c++) {
+		rc = &rm->rm_col[c];
+		ASSERT(rc->rc_tried);
+		if (rc->rc_error) {
+			vdev_raidz_reconstruct(rm, c);
+			if (zio_checksum_error(zio) == 0)
+				zio->io_error = 0;
+			else
+				zio->io_error = rc->rc_error;
+			goto done;
+		}
+	}
+
+	/*
+	 * There were no I/O errors, but the data doesn't checksum.
+	 * Try all permutations to see if we can find one that does.
+	 */
+	ASSERT(zio->io_numerrors == 0);
+	for (c = 0; c < rm->rm_cols; c++) {
+		void *orig;
+		rc = &rm->rm_col[c];
+
+		orig = zio_buf_alloc(rc->rc_size);
+		bcopy(rc->rc_data, orig, rc->rc_size);
+		vdev_raidz_reconstruct(rm, c);
+
+		if (zio_checksum_error(zio) == 0) {
+			zio_buf_free(orig, rc->rc_size);
+			zio->io_error = 0;
+			/*
+			 * If this child didn't know that it returned bad data,
+			 * inform it.
+			 */
+			if (rc->rc_tried && rc->rc_error == 0)
+				vdev_checksum_error(zio,
+				    vd->vdev_child[rc->rc_col]);
+			rc->rc_error = ECKSUM;
+			goto done;
+		}
+
+		/* That wasn't it; restore the column and try the next one. */
+		bcopy(orig, rc->rc_data, rc->rc_size);
+		zio_buf_free(orig, rc->rc_size);
+	}
+
+	/*
+	 * All combinations failed to checksum.
+	 */
+	zio->io_error = ECKSUM;
+
+done:
+	zio_checksum_verified(zio);
+
+	if (zio->io_error == 0 && (spa_mode & FWRITE) &&
+	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
+		/*
+		 * Use the good data we have in hand to repair damaged children.
+		 */
+		for (c = 0; c < rm->rm_cols; c++) {
+			rc = &rm->rm_col[c];
+			cvd = vd->vdev_child[rc->rc_col];
+
+			if (rc->rc_error) {
+				/*
+				 * Make a copy of the data because we're
+				 * going to free the RAID-Z map below.
+				 */
+				void *data = zio_buf_alloc(rc->rc_size);
+				bcopy(rc->rc_data, data, rc->rc_size);
+
+				dprintf("%s resilvered %s @ 0x%llx error %d\n",
+				    vdev_description(vd),
+				    vdev_description(cvd),
+				    zio->io_offset, rc->rc_error);
+
+				zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+				    rc->rc_offset, data, rc->rc_size,
+				    ZIO_TYPE_WRITE, zio->io_priority,
+				    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL |
+				    ZIO_FLAG_DONT_PROPAGATE,
+				    vdev_raidz_repair_done, NULL));
+			}
+		}
+	}
+
+	vdev_raidz_map_free(zio);
+	zio_next_stage(zio);
+}
+
+/*
+ * More faults than single parity can cover means the group can't open;
+ * any fault or degradation at all leaves it degraded.
+ */
+static void
+vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
+{
+	if (faulted > 1)
+		vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS);
+	else if (degraded + faulted != 0)
+		vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+	else
+		vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+}
+
+vdev_ops_t vdev_raidz_ops = {
+	vdev_raidz_open,
+	vdev_raidz_close,
+	vdev_raidz_asize,
+	vdev_raidz_io_start,
+	vdev_raidz_io_done,
+	vdev_raidz_state_change,
+	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
+	B_FALSE			/* not a leaf vdev */
+};
diff --git a/usr/src/uts/common/fs/zfs/vdev_root.c b/usr/src/uts/common/fs/zfs/vdev_root.c
new file mode 100644
index 0000000000..4e44b5bb05
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_root.c
@@ -0,0 +1,98 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device vector for the pool's root vdev.
+ */
+
+/*
+ * Open every top-level child; the root's size is the sum of the
+ * children and its alignment the maximum of theirs. Any child failure
+ * fails the open with the last error seen.
+ */
+static int
+vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
+{
+	vdev_t *cvd;
+	int c, error;
+	int lasterror = 0;
+
+	if (vd->vdev_children == 0) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (EINVAL);
+	}
+
+	for (c = 0; c < vd->vdev_children; c++) {
+		cvd = vd->vdev_child[c];
+
+		if ((error = vdev_open(cvd)) != 0) {
+			lasterror = error;
+			continue;
+		}
+
+		*asize += cvd->vdev_asize;
+		*ashift = MAX(*ashift, cvd->vdev_ashift);
+	}
+
+	if (lasterror)
+		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+
+	return (lasterror);
+}
+
+/* Close every top-level child of the root vdev. */
+static void
+vdev_root_close(vdev_t *vd)
+{
+	int c;
+
+	for (c = 0; c < vd->vdev_children; c++)
+		vdev_close(vd->vdev_child[c]);
+}
+
+/*
+ * The root tolerates no faulted children: any fault makes the pool
+ * unopenable, any degradation degrades it.
+ */
+static void
+vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
+{
+	if (faulted > 0)
+		vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS);
+	else if (degraded != 0)
+		vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+	else
+		vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+}
+
+vdev_ops_t vdev_root_ops = {
+	vdev_root_open,
+	vdev_root_close,
+	vdev_default_asize,
+	NULL,			/* io_start - not applicable to the root */
+	NULL,			/* io_done - not applicable to the root */
+	vdev_root_state_change,
+	VDEV_TYPE_ROOT,		/* name of this vdev type */
+	B_FALSE			/* not a leaf vdev */
+};
diff --git a/usr/src/uts/common/fs/zfs/zap.c b/usr/src/uts/common/fs/zfs/zap.c
new file mode 100644
index 0000000000..1eddb9c250
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zap.c
@@ -0,0 +1,1010 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+
+/*
+ * This file contains the top half of the zfs directory structure
+ * implementation. The bottom half is in zap_leaf.c.
+ *
+ * The zdir is an extendable hash data structure. There is a table of
+ * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
+ * each a constant size and hold a variable number of directory entries.
+ * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
+ *
+ * The pointer table holds a power of 2 number of pointers.
+ * (1<<zap_t->zd_data->zd_phys->zd_prefix_len). The bucket pointed to
+ * by the pointer at index i in the table holds entries whose hash value
+ * has a zd_prefix_len-bit prefix.
+ */
+
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zfs_context.h>
+#include <sys/zap.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+
+#define MIN_FREE (ZAP_LEAF_NUMCHUNKS*9/10)
+
+static void zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx);
+static int zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx);
+static zap_leaf_t *zap_get_leaf_byblk(zap_t *zap, uint64_t blkid,
+ dmu_tx_t *tx, krw_t lt);
+static void zap_put_leaf(zap_leaf_t *l);
+static void zap_leaf_pageout(dmu_buf_t *db, void *vl);
+
+
+/*
+ * Byteswap one fat-zap block. The first word identifies the block
+ * type (in either byte order); leaf blocks have structure and get the
+ * full leaf byteswap, everything else is a flat array of uint64_t.
+ */
+void
+fzap_byteswap(void *vbuf, size_t size)
+{
+	uint64_t block_type;
+
+	ASSERT(size == (1<<ZAP_BLOCK_SHIFT));
+	block_type = *(uint64_t *)vbuf;
+
+	switch (block_type) {
+	case ZBT_LEAF:
+	case BSWAP_64(ZBT_LEAF):
+		zap_leaf_byteswap(vbuf);
+		return;
+	case ZBT_HEADER:
+	case BSWAP_64(ZBT_HEADER):
+	default:
+		/* it's a ptrtbl block */
+		byteswap_uint64_array(vbuf, 1<<ZAP_BLOCK_SHIFT);
+		return;
+	}
+}
+
+/*
+ * Convert a microzap into a fat zap: rewrite the header block as a
+ * zap_phys_t with an embedded pointer table whose entries all point at
+ * block 1, then initialize block 1 as the first (empty) leaf.
+ * Caller must hold zap_rwlock as writer.
+ */
+void
+fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
+{
+	dmu_buf_t *db;
+	zap_leaf_t *l;
+	int i;
+	zap_phys_t *zp;
+
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+	zap->zap_ismicro = FALSE;
+
+	(void) dmu_buf_update_user(zap->zap_dbuf, zap, zap,
+	    &zap->zap_f.zap_phys, zap_pageout);
+
+	mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
+
+	zp = zap->zap_f.zap_phys;
+	/*
+	 * explicitly zero it since it might be coming from an
+	 * initialized microzap
+	 */
+	ASSERT3U(sizeof (zap_phys_t), ==, zap->zap_dbuf->db_size);
+	bzero(zp, sizeof (zap_phys_t));
+	zp->zap_block_type = ZBT_HEADER;
+	zp->zap_magic = ZAP_MAGIC;
+
+	zp->zap_ptrtbl.zt_shift = ZAP_PTRTBL_MIN_SHIFT;
+
+	zp->zap_freeblk = 2;		/* block 1 will be the first leaf */
+	zp->zap_num_leafs = 1;
+	zp->zap_num_entries = 0;
+	zp->zap_salt = zap->zap_salt;
+
+	for (i = 0; i < (1<<ZAP_PTRTBL_MIN_SHIFT); i++)
+		zp->zap_leafs[i] = 1;	/* block 1 will be the first leaf */
+
+	/*
+	 * set up block 1 - the first leaf
+	 */
+	db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	    1<<ZAP_BLOCK_SHIFT);
+	dmu_buf_will_dirty(db, tx);
+
+	/* Temporary in-core leaf, used only to format the block. */
+	l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+	l->l_dbuf = db;
+	l->l_phys = db->db_data;
+
+	zap_leaf_init(l);
+
+	kmem_free(l, sizeof (zap_leaf_t));
+	dmu_buf_rele(db);
+}
+
+/*
+ * Try to obtain the zap rwlock as writer (it may already be held).
+ * On success the zap's dbuf is dirtied; returns nonzero on success,
+ * 0 if the tryupgrade raced and failed.
+ */
+static int
+zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
+{
+	if (RW_WRITE_HELD(&zap->zap_rwlock))
+		return (1);
+	if (rw_tryupgrade(&zap->zap_rwlock)) {
+		dmu_buf_will_dirty(zap->zap_dbuf, tx);
+		return (1);
+	}
+	return (0);
+}
+
+/*
+ * Generic routines for dealing with the pointer & cookie tables.
+ */
+
+/*
+ * Grow a zap table to twice its size, incrementally: each call copies
+ * exactly one old block into two new blocks (duplicating every entry
+ * via transfer_func). The first call allocates the new region; the
+ * call that copies the final block frees the old region and commits
+ * the new zt_blk/zt_numblks/zt_shift. Caller must hold zap_rwlock as
+ * writer.
+ */
+static void
+zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
+    void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
+    dmu_tx_t *tx)
+{
+	uint64_t b, newblk;
+	dmu_buf_t *db_old, *db_new;
+	int hepb = 1<<(ZAP_BLOCK_SHIFT-4);
+	/* hepb = half the number of entries in a block */
+
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+	ASSERT(tbl->zt_blk != 0);
+	ASSERT(tbl->zt_numblks > 0);
+
+	if (tbl->zt_nextblk != 0) {
+		/* A grow is already in progress; continue it. */
+		newblk = tbl->zt_nextblk;
+	} else {
+		newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2, tx);
+		tbl->zt_nextblk = newblk;
+		ASSERT3U(tbl->zt_blks_copied, ==, 0);
+		dmu_prefetch(zap->zap_objset, zap->zap_object,
+		    tbl->zt_blk << ZAP_BLOCK_SHIFT, tbl->zt_numblks <<
+		    ZAP_BLOCK_SHIFT);
+	}
+
+	/*
+	 * Copy the ptrtbl from the old to new location, leaving the odd
+	 * entries blank as we go.
+	 */
+
+	b = tbl->zt_blks_copied;
+	db_old = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	    (tbl->zt_blk + b) << ZAP_BLOCK_SHIFT);
+	dmu_buf_read(db_old);
+
+	/* first half of entries in old[b] go to new[2*b+0] */
+	db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	    (newblk + 2*b+0) << ZAP_BLOCK_SHIFT);
+	dmu_buf_will_dirty(db_new, tx);
+	transfer_func(db_old->db_data, db_new->db_data, hepb);
+	dmu_buf_rele(db_new);
+
+	/* second half of entries in old[b] go to new[2*b+1] */
+	db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	    (newblk + 2*b+1) << ZAP_BLOCK_SHIFT);
+	dmu_buf_will_dirty(db_new, tx);
+	transfer_func((uint64_t *)db_old->db_data + hepb,
+	    db_new->db_data, hepb);
+	dmu_buf_rele(db_new);
+
+	dmu_buf_rele(db_old);
+
+	tbl->zt_blks_copied++;
+
+	dprintf("copied block %llu of %llu\n",
+	    tbl->zt_blks_copied, tbl->zt_numblks);
+
+	if (tbl->zt_blks_copied == tbl->zt_numblks) {
+		/* Migration complete: free the old region, commit the new. */
+		dmu_free_range(zap->zap_objset, zap->zap_object,
+		    tbl->zt_blk << ZAP_BLOCK_SHIFT,
+		    tbl->zt_numblks << ZAP_BLOCK_SHIFT, tx);
+
+		tbl->zt_blk = newblk;
+		tbl->zt_numblks *= 2;
+		tbl->zt_shift++;
+		tbl->zt_nextblk = 0;
+		tbl->zt_blks_copied = 0;
+
+		dprintf("finished; numblocks now %llu (%lluk entries)\n",
+		    tbl->zt_numblks, 1<<(tbl->zt_shift-10));
+	}
+}
+
+/*
+ * Store 'val' at index 'idx' of a zap table, returning the previous
+ * value. If a grow is in progress (zt_nextblk != 0), also mirror the
+ * store into both duplicated entries (2*idx and 2*idx+1) of the new
+ * region so the migration stays consistent.
+ */
+static uint64_t
+zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
+    dmu_tx_t *tx)
+{
+	uint64_t blk, off, oldval;
+	dmu_buf_t *db;
+
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+	ASSERT(tbl->zt_blk != 0);
+
+	dprintf("storing %llx at index %llx\n", val, idx);
+
+	blk = idx >> (ZAP_BLOCK_SHIFT-3);
+	off = idx & ((1<<(ZAP_BLOCK_SHIFT-3))-1);
+
+	db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	    (tbl->zt_blk + blk) << ZAP_BLOCK_SHIFT);
+	dmu_buf_will_dirty(db, tx);
+	oldval = ((uint64_t *)db->db_data)[off];
+	((uint64_t *)db->db_data)[off] = val;
+	dmu_buf_rele(db);
+
+	if (tbl->zt_nextblk != 0) {
+		idx *= 2;
+		blk = idx >> (ZAP_BLOCK_SHIFT-3);
+		off = idx & ((1<<(ZAP_BLOCK_SHIFT-3))-1);
+
+		db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+		    (tbl->zt_nextblk + blk) << ZAP_BLOCK_SHIFT);
+		dmu_buf_will_dirty(db, tx);
+		((uint64_t *)db->db_data)[off] = val;
+		((uint64_t *)db->db_data)[off+1] = val;
+		dmu_buf_rele(db);
+	}
+
+	return (oldval);
+}
+
+/*
+ * Read the value at index 'idx' of a zap table. Reads always go to
+ * the old region, which remains authoritative until a grow completes.
+ */
+static uint64_t
+zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx)
+{
+	uint64_t blk, off, val;
+	dmu_buf_t *db;
+
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+	blk = idx >> (ZAP_BLOCK_SHIFT-3);
+	off = idx & ((1<<(ZAP_BLOCK_SHIFT-3))-1);
+
+	db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	    (tbl->zt_blk + blk) << ZAP_BLOCK_SHIFT);
+	dmu_buf_read(db);
+	val = ((uint64_t *)db->db_data)[off];
+	dmu_buf_rele(db);
+	return (val);
+}
+
+/*
+ * Routines for growing the ptrtbl.
+ */
+
+/*
+ * Table-doubling transfer function for the ptrtbl: each source entry is
+ * duplicated into two adjacent destination entries (both halves of the
+ * split hash range initially point at the same leaf).
+ */
+static void
+zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
+{
+ int i;
+ for (i = 0; i < n; i++) {
+ uint64_t lb = src[i];
+ dst[2*i+0] = lb;
+ dst[2*i+1] = lb;
+ }
+}
+
+/*
+ * Double the size of the pointer table.  The first growth moves the
+ * ptrtbl out of the zap header block into its own block; subsequent
+ * growths go through the generic zap_table_grow() path.  Silently a
+ * no-op once the ptrtbl reaches 2^32 entries.
+ */
+static void
+zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
+{
+ /* hard cap: hash prefix is at most 32 bits here */
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == 32)
+ return;
+
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+ /*
+ * The ptrtbl can no longer be contained in the
+ * header block. Give it its own entire block, which
+ * will quadruple the size of the ptrtbl.
+ */
+ uint64_t newblk;
+ dmu_buf_t *db_new;
+
+ ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
+ ZAP_PTRTBL_MIN_SHIFT);
+ ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0);
+
+ newblk = zap_allocate_blocks(zap, 1, tx);
+ db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ newblk << ZAP_BLOCK_SHIFT);
+
+ dmu_buf_will_dirty(db_new, tx);
+ zap_ptrtbl_transfer(zap->zap_f.zap_phys->zap_leafs,
+ db_new->db_data, 1 << ZAP_PTRTBL_MIN_SHIFT);
+ dmu_buf_rele(db_new);
+
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk;
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1;
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++;
+
+ /* new shift must exactly describe one block of entries */
+ ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks <<
+ (ZAP_BLOCK_SHIFT-3));
+ } else {
+ zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+ zap_ptrtbl_transfer, tx);
+ }
+}
+
+/*
+ * Adjust the on-disk entry count by "delta" (may be negative) under
+ * zap_num_entries_mtx, dirtying the header dbuf in this tx first.
+ */
+static void
+zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
+{
+ dmu_buf_will_dirty(zap->zap_dbuf, tx);
+ mutex_enter(&zap->zap_f.zap_num_entries_mtx);
+
+ /* a negative delta must not underflow the count */
+ ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta);
+
+ zap->zap_f.zap_phys->zap_num_entries += delta;
+
+ mutex_exit(&zap->zap_f.zap_num_entries_mtx);
+}
+
+/*
+ * Allocate "nblocks" contiguous new blocks from the zap object by
+ * atomically advancing zap_freeblk; returns the first allocated block
+ * number.  NOTE(review): when the write lock IS held the header was
+ * presumably already dirtied by the caller (see zap_create_leaf), hence
+ * the dirty call only on the !RW_WRITE_HELD path -- confirm.
+ */
+uint64_t
+zap_allocate_blocks(zap_t *zap, int nblocks, dmu_tx_t *tx)
+{
+ uint64_t newblk;
+ ASSERT(tx != NULL);
+ if (!RW_WRITE_HELD(&zap->zap_rwlock)) {
+ dmu_buf_will_dirty(zap->zap_dbuf, tx);
+ }
+ newblk = atomic_add_64_nv(&zap->zap_f.zap_phys->zap_freeblk, nblocks) -
+ nblocks;
+ return (newblk);
+}
+
+
+/*
+ * This function doesn't increment zap_num_leafs because it's used to
+ * allocate a leaf chain, which doesn't count against zap_num_leafs.
+ * The directory must be held exclusively for this tx.
+ *
+ * Returns a new, initialized, write-locked in-core leaf backed by a
+ * freshly allocated block; caller releases via zap_put_leaf().
+ */
+zap_leaf_t *
+zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
+{
+ void *winner;
+ zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
+
+ ASSERT(tx != NULL);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ /* hence we already dirtied zap->zap_dbuf */
+
+ rw_init(&l->l_rwlock, 0, 0, 0);
+ rw_enter(&l->l_rwlock, RW_WRITER);
+ l->l_blkid = zap_allocate_blocks(zap, 1, tx);
+ l->l_next = NULL;
+ l->l_dbuf = NULL;
+ l->l_phys = NULL;
+
+ l->l_dbuf = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ l->l_blkid << ZAP_BLOCK_SHIFT);
+ /* brand-new block: nobody can have attached a user before us */
+ winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout);
+ ASSERT(winner == NULL);
+ dmu_buf_will_dirty(l->l_dbuf, tx);
+
+ zap_leaf_init(l);
+
+ return (l);
+}
+
+/* ARGSUSED */
+/*
+ * Drop our lock and hold on a leaf.  The underlying block is NOT freed
+ * (see the XXX below); the in-core zap_leaf_t is reclaimed later via
+ * the zap_leaf_pageout dbuf-user callback.
+ */
+void
+zap_destroy_leaf(zap_t *zap, zap_leaf_t *l, dmu_tx_t *tx)
+{
+ /* uint64_t offset = l->l_blkid << ZAP_BLOCK_SHIFT; */
+ rw_exit(&l->l_rwlock);
+ dmu_buf_rele(l->l_dbuf);
+ /* XXX there are still holds on this block, so we can't free it? */
+ /* dmu_free_range(zap->zap_objset, zap->zap_object, */
+ /* offset, 1<<ZAP_BLOCK_SHIFT, tx); */
+}
+
+/*
+ * Return the number of entries in a fat zap via *count.  Always
+ * succeeds (returns 0).
+ */
+int
+fzap_count(zap_t *zap, uint64_t *count)
+{
+ ASSERT(!zap->zap_ismicro);
+ mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
+ *count = zap->zap_f.zap_phys->zap_num_entries;
+ mutex_exit(&zap->zap_f.zap_num_entries_mtx);
+ return (0);
+}
+
+/*
+ * Routines for obtaining zap_leaf_t's
+ */
+
+/*
+ * Release a leaf and every chained leaf hanging off it: unlock each
+ * l_rwlock and drop each dbuf hold.  The next pointer is captured
+ * before the release since the in-core leaf may be paged out.
+ */
+static void
+zap_put_leaf(zap_leaf_t *l)
+{
+ zap_leaf_t *nl = l->l_next;
+ while (nl) {
+ zap_leaf_t *nnl = nl->l_next;
+ rw_exit(&nl->l_rwlock);
+ dmu_buf_rele(nl->l_dbuf);
+ nl = nnl;
+ }
+ rw_exit(&l->l_rwlock);
+ dmu_buf_rele(l->l_dbuf);
+}
+
+_NOTE(ARGSUSED(0))
+/*
+ * dbuf user-eviction callback: tear down the in-core zap_leaf_t when
+ * its backing dbuf is evicted.  Also called directly (db == NULL) by
+ * zap_open_leaf() on the loser of a set-user race.
+ */
+static void
+zap_leaf_pageout(dmu_buf_t *db, void *vl)
+{
+ zap_leaf_t *l = vl;
+
+ rw_destroy(&l->l_rwlock);
+ kmem_free(l, sizeof (zap_leaf_t));
+}
+
+/*
+ * Construct an in-core zap_leaf_t for an existing leaf block and attach
+ * it to the dbuf as its user data.  If another thread attached one
+ * first, free ours and return the winner's.  Returned leaf is unlocked.
+ */
+static zap_leaf_t *
+zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
+{
+ zap_leaf_t *l, *winner;
+
+ ASSERT(blkid != 0);
+
+ l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
+ rw_init(&l->l_rwlock, 0, 0, 0);
+ rw_enter(&l->l_rwlock, RW_WRITER);
+ l->l_blkid = blkid;
+ l->l_next = NULL;
+ l->l_dbuf = db;
+ l->l_phys = NULL;
+
+ /* returns non-NULL iff someone else attached a user first */
+ winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout);
+
+ rw_exit(&l->l_rwlock);
+ if (winner != NULL) {
+ /* someone else set it first */
+ zap_leaf_pageout(NULL, l);
+ l = winner;
+ }
+
+ return (l);
+}
+
+/*
+ * Hold and lock (per "lt") the single leaf at block "blkid", creating
+ * the in-core leaf on first access.  Dirties the block in "tx" when
+ * taken as writer.  Does not follow the leaf chain.
+ */
+static zap_leaf_t *
+zap_get_leaf_byblk_impl(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt)
+{
+ dmu_buf_t *db;
+ zap_leaf_t *l;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ blkid << ZAP_BLOCK_SHIFT);
+
+ ASSERT3U(db->db_object, ==, zap->zap_object);
+ ASSERT3U(db->db_offset, ==, blkid << ZAP_BLOCK_SHIFT);
+ ASSERT3U(db->db_size, ==, 1 << ZAP_BLOCK_SHIFT);
+ ASSERT(blkid != 0);
+
+ dmu_buf_read(db);
+ l = dmu_buf_get_user(db);
+
+ if (l == NULL)
+ l = zap_open_leaf(blkid, db);
+
+ rw_enter(&l->l_rwlock, lt);
+ /*
+ * Must lock before dirtying, otherwise l->l_phys could change,
+ * causing ASSERT below to fail.
+ */
+ if (lt == RW_WRITER)
+ dmu_buf_will_dirty(db, tx);
+ ASSERT3U(l->l_blkid, ==, blkid);
+ ASSERT3P(l->l_dbuf, ==, db);
+ ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data);
+ ASSERT3U(l->lh_block_type, ==, ZBT_LEAF);
+ ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC);
+
+ return (l);
+}
+
+/*
+ * Like zap_get_leaf_byblk_impl(), but also brings in and links every
+ * chained overflow leaf (lh_next) so the whole chain is held/locked.
+ * Release the chain with zap_put_leaf().
+ */
+static zap_leaf_t *
+zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt)
+{
+ zap_leaf_t *l, *nl;
+
+ l = zap_get_leaf_byblk_impl(zap, blkid, tx, lt);
+
+ nl = l;
+ while (nl->lh_next != 0) {
+ zap_leaf_t *nnl;
+ nnl = zap_get_leaf_byblk_impl(zap, nl->lh_next, tx, lt);
+ nl->l_next = nnl;
+ nl = nnl;
+ }
+
+ return (l);
+}
+
+/*
+ * Map a pointer-table index to the leaf block it references, reading
+ * either the header-embedded table or the external ptrtbl.
+ */
+static uint64_t
+zap_idx_to_blk(zap_t *zap, uint64_t idx)
+{
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+ /* ptrtbl still lives inside the zap header block */
+ ASSERT3U(idx, <,
+ (1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift));
+ return (zap->zap_f.zap_phys->zap_leafs[idx]);
+ } else {
+ return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+ idx));
+ }
+}
+
+/*
+ * Set pointer-table entry "idx" to reference leaf block "blk".
+ * NOTE(review): tests zt_blk == 0 where zap_idx_to_blk tests
+ * zt_numblks == 0 -- presumably equivalent (both are 0 until the
+ * ptrtbl leaves the header); confirm.
+ */
+static void
+zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
+{
+ ASSERT(tx != NULL);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) {
+ zap->zap_f.zap_phys->zap_leafs[idx] = blk;
+ } else {
+ (void) zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+ idx, blk, tx);
+ }
+}
+
+/*
+ * Find, hold, and lock (per "lt") the leaf (chain) responsible for
+ * hash value "h": top zt_shift bits of h index the ptrtbl.
+ */
+static zap_leaf_t *
+zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt)
+{
+ uint64_t idx;
+ zap_leaf_t *l;
+
+ ASSERT(zap->zap_dbuf == NULL ||
+ zap->zap_f.zap_phys == zap->zap_dbuf->db_data);
+ ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC);
+ idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+ l = zap_get_leaf_byblk(zap, zap_idx_to_blk(zap, idx), tx, lt);
+
+ /* the leaf we landed on must own this hash prefix */
+ ASSERT3U(ZAP_HASH_IDX(h, l->lh_prefix_len), ==, l->lh_prefix);
+
+ return (l);
+}
+
+
+/*
+ * Make room for more entries destined for "hash": either split leaf
+ * "l" into two (updating the sibling's ptrtbl entries) or, if the leaf
+ * already owns a single ptrtbl slot, chain another overflow leaf onto
+ * it.  May drop and re-acquire the directory lock to upgrade to
+ * writer, in which case the world may have changed underneath us.
+ * Returns the (locked) leaf now covering "hash".
+ */
+static zap_leaf_t *
+zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx)
+{
+ zap_leaf_t *nl;
+ int prefix_diff, i, err;
+ uint64_t sibling;
+
+ ASSERT3U(l->lh_prefix_len, <=,
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ ASSERT3U(ZAP_HASH_IDX(hash, l->lh_prefix_len), ==, l->lh_prefix);
+
+ if (zap_tryupgradedir(zap, tx) == 0) {
+ /* failed to upgrade */
+ int old_prefix_len = l->lh_prefix_len;
+ objset_t *os = zap->zap_objset;
+ uint64_t object = zap->zap_object;
+
+ /* drop everything and re-lock the directory as writer */
+ zap_put_leaf(l);
+ zap_unlockdir(zap);
+ err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, &zap);
+ ASSERT3U(err, ==, 0);
+ ASSERT(!zap->zap_ismicro);
+ l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
+
+ if (l->lh_prefix_len != old_prefix_len)
+ /* it split while our locks were down */
+ return (l);
+ }
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ if (l->lh_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
+ /* There's only one pointer to us. Chain on another leaf blk. */
+ (void) zap_leaf_chainmore(l, zap_create_leaf(zap, tx));
+ dprintf("chaining leaf %x/%d\n", l->lh_prefix,
+ l->lh_prefix_len);
+ return (l);
+ }
+
+ ASSERT3U(ZAP_HASH_IDX(hash, l->lh_prefix_len), ==, l->lh_prefix);
+
+ /* There's more than one pointer to us. Split this leaf. */
+ nl = zap_leaf_split(zap, l, tx);
+
+ /* set sibling pointers */
+ prefix_diff =
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - l->lh_prefix_len;
+ sibling = (ZAP_HASH_IDX(hash, l->lh_prefix_len) | 1) << prefix_diff;
+ /* all 2^prefix_diff ptrtbl slots of the sibling range -> new leaf */
+ for (i = 0; i < (1ULL<<prefix_diff); i++) {
+ ASSERT3U(zap_idx_to_blk(zap, sibling+i), ==, l->l_blkid);
+ zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx);
+ /* dprintf("set %d to %u %x\n", sibling+i, nl->l_blkid, nl); */
+ }
+
+ zap->zap_f.zap_phys->zap_num_leafs++;
+
+ if (hash & (1ULL << (64 - l->lh_prefix_len))) {
+ /* we want the sibling */
+ zap_put_leaf(l);
+ l = nl;
+ } else {
+ zap_put_leaf(nl);
+ }
+
+ return (l);
+}
+
+/*
+ * Release leaf "l"; if it is nearly full (or chained) and already owns
+ * a single ptrtbl slot, grow the pointer table now so the next insert
+ * can split instead of chaining.  May upgrade the directory lock,
+ * retrying from scratch if the upgrade had to drop locks.
+ */
+static void
+zap_put_leaf_maybe_grow_ptrtbl(zap_t *zap,
+ zap_leaf_t *l, dmu_tx_t *tx)
+{
+ int shift, err;
+
+again:
+ shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
+
+ if (l->lh_prefix_len == shift &&
+ (l->l_next != NULL || l->lh_nfree < MIN_FREE)) {
+ /* this leaf will soon make us grow the pointer table */
+
+ if (zap_tryupgradedir(zap, tx) == 0) {
+ objset_t *os = zap->zap_objset;
+ uint64_t zapobj = zap->zap_object;
+ uint64_t blkid = l->l_blkid;
+
+ /* upgrade failed: drop, re-lock as writer, retry */
+ zap_put_leaf(l);
+ zap_unlockdir(zap);
+ err = zap_lockdir(os, zapobj, tx,
+ RW_WRITER, FALSE, &zap);
+ ASSERT3U(err, ==, 0);
+ l = zap_get_leaf_byblk(zap, blkid, tx, RW_READER);
+ goto again;
+ }
+
+ zap_put_leaf(l);
+ zap_grow_ptrtbl(zap, tx);
+ } else {
+ zap_put_leaf(l);
+ }
+}
+
+
+/*
+ * Validate a caller-supplied value description: integer_size must be a
+ * C integer width, and the total byte count must neither wrap uint64
+ * nor exceed DMU_MAX_ACCESS.  Returns 0 or EINVAL.
+ */
+static int
+fzap_checksize(uint64_t integer_size, uint64_t num_integers)
+{
+ /* Only integer sizes supported by C */
+ switch (integer_size) {
+ case 1:
+ case 2:
+ case 4:
+ case 8:
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ /* Make sure we won't overflow */
+ if (integer_size * num_integers < num_integers)
+ return (EINVAL);
+ if (integer_size * num_integers > DMU_MAX_ACCESS)
+ return (EINVAL);
+
+ return (0);
+}
+
+/*
+ * Routines for manipulating attributes.
+ */
+/*
+ * Look up "name" and copy its value into "buf" (num_integers values of
+ * integer_size bytes).  Returns 0, ENOENT, EINVAL on bad sizes, or
+ * EOVERFLOW/EINVAL from zap_entry_read on a size mismatch.
+ */
+int
+fzap_lookup(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+ zap_leaf_t *l;
+ int err;
+ uint64_t hash;
+ zap_entry_handle_t zeh;
+
+ err = fzap_checksize(integer_size, num_integers);
+ if (err != 0)
+ return (err);
+
+ hash = zap_hash(zap, name);
+ l = zap_deref_leaf(zap, hash, NULL, RW_READER);
+ err = zap_leaf_lookup(l, name, hash, &zeh);
+ if (err != 0)
+ goto out;
+ err = zap_entry_read(&zeh, integer_size, num_integers, buf);
+out:
+ zap_put_leaf(l);
+ return (err);
+}
+
+/*
+ * Add a new entry with collision differentiator "cd" (ZAP_MAXCD means
+ * "pick one for me").  Expands the leaf and retries on EAGAIN.  On
+ * return, if "lp" is non-NULL the (locked) leaf is handed back to the
+ * caller instead of being released.  Returns 0, EEXIST, or an error
+ * from zap_entry_create.
+ */
+int
+fzap_add_cd(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers,
+ const void *val, uint32_t cd, dmu_tx_t *tx, zap_leaf_t **lp)
+{
+ zap_leaf_t *l;
+ uint64_t hash;
+ int err;
+ zap_entry_handle_t zeh;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ ASSERT(!zap->zap_ismicro);
+ ASSERT(fzap_checksize(integer_size, num_integers) == 0);
+
+ hash = zap_hash(zap, name);
+ l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
+retry:
+ err = zap_leaf_lookup(l, name, hash, &zeh);
+ if (err == 0) {
+ err = EEXIST;
+ goto out;
+ }
+ ASSERT(err == ENOENT);
+
+ /* XXX If this leaf is chained, split it if we can. */
+ err = zap_entry_create(l, name, hash, cd,
+ integer_size, num_integers, val, &zeh);
+
+ if (err == 0) {
+ zap_increment_num_entries(zap, 1, tx);
+ } else if (err == EAGAIN) {
+ /* no room: split/chain the leaf and try again */
+ l = zap_expand_leaf(zap, l, hash, tx);
+ goto retry;
+ }
+
+out:
+ if (lp)
+ *lp = l;
+ else
+ zap_put_leaf(l);
+ return (err);
+}
+
+/*
+ * Public add: validates sizes, lets fzap_add_cd() pick the collision
+ * differentiator, then releases the leaf (possibly growing the ptrtbl
+ * if the leaf is close to full).
+ */
+int
+fzap_add(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx)
+{
+ int err;
+ zap_leaf_t *l;
+
+ err = fzap_checksize(integer_size, num_integers);
+ if (err != 0)
+ return (err);
+
+ err = fzap_add_cd(zap, name, integer_size, num_integers,
+ val, ZAP_MAXCD, tx, &l);
+
+ zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx);
+ return (err);
+}
+
+/*
+ * Create the entry if absent, otherwise replace its value in place.
+ * Expands the leaf and retries on EAGAIN; releases the leaf (possibly
+ * growing the ptrtbl) before returning.
+ */
+int
+fzap_update(zap_t *zap, const char *name,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+ zap_leaf_t *l;
+ uint64_t hash;
+ int err, create;
+ zap_entry_handle_t zeh;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ err = fzap_checksize(integer_size, num_integers);
+ if (err != 0)
+ return (err);
+
+ hash = zap_hash(zap, name);
+ l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
+retry:
+ err = zap_leaf_lookup(l, name, hash, &zeh);
+ create = (err == ENOENT);
+ ASSERT(err == 0 || err == ENOENT);
+
+ /* XXX If this leaf is chained, split it if we can. */
+
+ if (create) {
+ err = zap_entry_create(l, name, hash, ZAP_MAXCD,
+ integer_size, num_integers, val, &zeh);
+ if (err == 0)
+ zap_increment_num_entries(zap, 1, tx);
+ } else {
+ err = zap_entry_update(&zeh, integer_size, num_integers, val);
+ }
+
+ if (err == EAGAIN) {
+ /* no room: split/chain the leaf and try again */
+ l = zap_expand_leaf(zap, l, hash, tx);
+ goto retry;
+ }
+
+ zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx);
+ return (err);
+}
+
+/*
+ * Report the stored value's integer size and count for "name".  Either
+ * out-pointer may be NULL.  Returns 0 or ENOENT.
+ */
+int
+fzap_length(zap_t *zap, const char *name,
+ uint64_t *integer_size, uint64_t *num_integers)
+{
+ zap_leaf_t *l;
+ int err;
+ uint64_t hash;
+ zap_entry_handle_t zeh;
+
+ hash = zap_hash(zap, name);
+ l = zap_deref_leaf(zap, hash, NULL, RW_READER);
+ err = zap_leaf_lookup(l, name, hash, &zeh);
+ if (err != 0)
+ goto out;
+
+ if (integer_size)
+ *integer_size = zeh.zeh_integer_size;
+ if (num_integers)
+ *num_integers = zeh.zeh_num_integers;
+out:
+ zap_put_leaf(l);
+ return (err);
+}
+
+/*
+ * Remove the entry for "name", decrementing the entry count on
+ * success.  Returns 0 or ENOENT.
+ */
+int
+fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx)
+{
+ zap_leaf_t *l;
+ uint64_t hash;
+ int err;
+ zap_entry_handle_t zeh;
+
+ hash = zap_hash(zap, name);
+ l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
+ err = zap_leaf_lookup(l, name, hash, &zeh);
+ if (err == 0) {
+ zap_entry_remove(&zeh);
+ zap_increment_num_entries(zap, -1, tx);
+ }
+ zap_put_leaf(l);
+ dprintf("fzap_remove: ds=%p obj=%llu name=%s err=%d\n",
+ zap->zap_objset, zap->zap_object, name, err);
+ return (err);
+}
+
+/*
+ * Reverse lookup: linearly scan the zap for the first entry whose
+ * first integer equals "value" and copy its name into "name" (caller's
+ * buffer must be large enough for any stored name).  Returns 0 on a
+ * match or the cursor error (e.g. ENOENT when exhausted).
+ */
+int
+zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name)
+{
+ zap_cursor_t zc;
+ zap_attribute_t *za;
+ int err;
+
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+ for (zap_cursor_init(&zc, os, zapobj);
+ (err = zap_cursor_retrieve(&zc, za)) == 0;
+ zap_cursor_advance(&zc)) {
+ if (za->za_first_integer == value) {
+ (void) strcpy(name, za->za_name);
+ break;
+ }
+ }
+ kmem_free(za, sizeof (zap_attribute_t));
+ return (err);
+}
+
+
+/*
+ * Routines for iterating over the attributes.
+ */
+
+/*
+ * Fill in *za with the entry at or after the cursor's (zc_hash, zc_cd)
+ * position, advancing the cursor past exhausted leaves.  Returns 0 and
+ * updates the cursor to the found entry, or ENOENT when the hash space
+ * is exhausted (cursor parked at -1).
+ */
+int
+fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
+{
+ int err = ENOENT;
+ zap_entry_handle_t zeh;
+ zap_leaf_t *l;
+
+ /* retrieve the next entry at or after zc_hash/zc_cd */
+ /* if no entry, return ENOENT */
+
+again:
+ l = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER);
+ err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
+
+ if (err == ENOENT) {
+ /* skip to the first hash value beyond this leaf's range */
+ uint64_t nocare = (1ULL << (64 - l->lh_prefix_len)) - 1;
+ zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
+ zc->zc_cd = 0;
+ if (l->lh_prefix_len == 0 || zc->zc_hash == 0) {
+ /* wrapped: no more leaves to visit */
+ zc->zc_hash = -1ULL;
+ } else {
+ zap_put_leaf(l);
+ goto again;
+ }
+ }
+
+ if (err == 0) {
+ zc->zc_hash = zeh.zeh_hash;
+ zc->zc_cd = zeh.zeh_cd;
+ za->za_integer_length = zeh.zeh_integer_size;
+ za->za_num_integers = zeh.zeh_num_integers;
+ if (zeh.zeh_num_integers == 0) {
+ za->za_first_integer = 0;
+ } else {
+ err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
+ ASSERT(err == 0 || err == EOVERFLOW);
+ }
+ err = zap_entry_read_name(&zeh,
+ sizeof (za->za_name), za->za_name);
+ ASSERT(err == 0);
+ }
+ zap_put_leaf(l);
+ return (err);
+}
+
+
+/*
+ * Accumulate leaf statistics for every distinct leaf referenced by a
+ * run of "len" pointer-table entries; consecutive duplicate pointers
+ * (a leaf spanning several slots) are counted once via "lastblk".
+ */
+static void
+zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
+{
+ int i;
+ uint64_t lastblk = 0;
+
+ /*
+ * NB: if a leaf has more pointers than an entire ptrtbl block
+ * can hold, then it'll be accounted for more than once, since
+ * we won't have lastblk.
+ */
+ for (i = 0; i < len; i++) {
+ zap_leaf_t *l;
+
+ if (tbl[i] == lastblk)
+ continue;
+ lastblk = tbl[i];
+
+ l = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER);
+
+ zap_stats_leaf(zap, l, zs);
+ zap_put_leaf(l);
+ }
+}
+
+/*
+ * Gather fat-zap statistics into *zs: header counters plus per-leaf
+ * stats walked through the ptrtbl (embedded or external; the external
+ * table is prefetched first).
+ */
+void
+fzap_get_stats(zap_t *zap, zap_stats_t *zs)
+{
+ zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
+ zs->zs_blocksize = 1ULL << ZAP_BLOCK_SHIFT;
+ zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs;
+ zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries;
+ zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk;
+
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+ /* the ptrtbl is entirely in the header block. */
+ zap_stats_ptrtbl(zap, zap->zap_f.zap_phys->zap_leafs,
+ 1 << ZAP_PTRTBL_MIN_SHIFT, zs);
+ } else {
+ int b;
+
+ dmu_prefetch(zap->zap_objset, zap->zap_object,
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << ZAP_BLOCK_SHIFT,
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks <<
+ ZAP_BLOCK_SHIFT);
+
+ for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
+ b++) {
+ dmu_buf_t *db;
+
+ db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) <<
+ ZAP_BLOCK_SHIFT);
+ dmu_buf_read(db);
+ zap_stats_ptrtbl(zap, db->db_data,
+ 1<<(ZAP_BLOCK_SHIFT-3), zs);
+ dmu_buf_rele(db);
+ }
+ }
+}
diff --git a/usr/src/uts/common/fs/zfs/zap_leaf.c b/usr/src/uts/common/fs/zfs/zap_leaf.c
new file mode 100644
index 0000000000..82b786d05a
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zap_leaf.c
@@ -0,0 +1,883 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * The 512-byte leaf is broken into 32 16-byte chunks.
+ * chunk number n means l_chunk[n], even though the header precedes it.
+ * the names are stored null-terminated.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zap.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+
+#define CHAIN_END 0xffff /* end of the chunk chain */
+
+/* somewhat arbitrary, could go up to around 100k ... */
+#define MAX_ARRAY_BYTES (8<<10)
+
+#define NCHUNKS(bytes) (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES)
+
+/*
+ * XXX This will >> by a negative number when
+ * lh_prefix_len > 64-ZAP_LEAF_HASH_SHIFT.
+ */
+#define LEAF_HASH(l, h) \
+ ((ZAP_LEAF_HASH_NUMENTRIES-1) & \
+ ((h) >> (64 - ZAP_LEAF_HASH_SHIFT-(l)->lh_prefix_len)))
+
+#define LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
+
+/* #define MEMCHECK */
+
+
+/*
+ * Trivial memset replacement (byte-at-a-time fill of n bytes).
+ */
+static void
+zap_memset(void *a, int c, size_t n)
+{
+ char *cp = a;
+ char *cpend = cp + n;
+
+ while (cp < cpend)
+ *cp++ = c;
+}
+
+/*
+ * Store "value" at "addr" as a native integer of "len" (1/2/4/8)
+ * bytes.  Any other length asserts.
+ */
+static void
+stv(int len, void *addr, uint64_t value)
+{
+ switch (len) {
+ case 1:
+ *(uint8_t *)addr = value;
+ return;
+ case 2:
+ *(uint16_t *)addr = value;
+ return;
+ case 4:
+ *(uint32_t *)addr = value;
+ return;
+ case 8:
+ *(uint64_t *)addr = value;
+ return;
+ }
+ ASSERT(!"bad int len");
+}
+
+/*
+ * Load a native integer of "len" (1/2/4/8) bytes from "addr",
+ * zero-extended to 64 bits.  Any other length asserts and returns a
+ * poison value.
+ */
+static uint64_t
+ldv(int len, const void *addr)
+{
+ switch (len) {
+ case 1:
+ return (*(uint8_t *)addr);
+ case 2:
+ return (*(uint16_t *)addr);
+ case 4:
+ return (*(uint32_t *)addr);
+ case 8:
+ return (*(uint64_t *)addr);
+ }
+ ASSERT(!"bad int len");
+ return (0xFEEDFACEDEADBEEF);
+}
+
+/*
+ * Byteswap an entire on-disk leaf block in place: header, hash table,
+ * then each chunk according to its type (entry / free / array).
+ */
+void
+zap_leaf_byteswap(zap_leaf_phys_t *buf)
+{
+ int i;
+
+ buf->l_hdr.lhr_block_type = BSWAP_64(buf->l_hdr.lhr_block_type);
+ buf->l_hdr.lhr_next = BSWAP_64(buf->l_hdr.lhr_next);
+ buf->l_hdr.lhr_prefix = BSWAP_64(buf->l_hdr.lhr_prefix);
+ buf->l_hdr.lhr_magic = BSWAP_32(buf->l_hdr.lhr_magic);
+ buf->l_hdr.lhr_nfree = BSWAP_16(buf->l_hdr.lhr_nfree);
+ buf->l_hdr.lhr_nentries = BSWAP_16(buf->l_hdr.lhr_nentries);
+ buf->l_hdr.lhr_prefix_len = BSWAP_16(buf->l_hdr.lhr_prefix_len);
+ buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist);
+
+ for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES; i++)
+ buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
+
+ for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) {
+ struct zap_leaf_entry *le;
+
+ /* lf_type is a single byte, readable before any swap */
+ switch (buf->l_chunk[i].l_free.lf_type) {
+ case ZAP_LEAF_ENTRY:
+ le = &buf->l_chunk[i].l_entry;
+
+ le->le_type = BSWAP_8(le->le_type);
+ le->le_int_size = BSWAP_8(le->le_int_size);
+ le->le_next = BSWAP_16(le->le_next);
+ le->le_name_chunk = BSWAP_16(le->le_name_chunk);
+ le->le_name_length = BSWAP_16(le->le_name_length);
+ le->le_value_chunk = BSWAP_16(le->le_value_chunk);
+ le->le_value_length = BSWAP_16(le->le_value_length);
+ le->le_cd = BSWAP_32(le->le_cd);
+ le->le_hash = BSWAP_64(le->le_hash);
+ break;
+ case ZAP_LEAF_FREE:
+ buf->l_chunk[i].l_free.lf_type =
+ BSWAP_8(buf->l_chunk[i].l_free.lf_type);
+ buf->l_chunk[i].l_free.lf_next =
+ BSWAP_16(buf->l_chunk[i].l_free.lf_next);
+ break;
+ case ZAP_LEAF_ARRAY:
+ /* zap_leaf_array */
+ buf->l_chunk[i].l_array.la_type =
+ BSWAP_8(buf->l_chunk[i].l_array.la_type);
+ buf->l_chunk[i].l_array.la_next =
+ BSWAP_16(buf->l_chunk[i].l_array.la_next);
+ /* la_array doesn't need swapping */
+ break;
+ default:
+ ASSERT(!"bad leaf type");
+ }
+ }
+}
+
+/*
+ * Initialize a freshly allocated leaf: empty header, all hash buckets
+ * set to CHAIN_END, and every chunk linked onto the freelist.
+ */
+void
+zap_leaf_init(zap_leaf_t *l)
+{
+ int i;
+
+ ASSERT3U(sizeof (zap_leaf_phys_t), ==, l->l_dbuf->db_size);
+ zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header));
+ zap_memset(&l->l_phys->l_hash, CHAIN_END, sizeof (l->l_phys->l_hash));
+ for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) {
+ l->l_phys->l_chunk[i].l_free.lf_type = ZAP_LEAF_FREE;
+ l->l_phys->l_chunk[i].l_free.lf_next = i+1;
+ }
+ l->l_phys->l_chunk[ZAP_LEAF_NUMCHUNKS-1].l_free.lf_next = CHAIN_END;
+ l->lh_block_type = ZBT_LEAF;
+ l->lh_magic = ZAP_LEAF_MAGIC;
+ l->lh_nfree = ZAP_LEAF_NUMCHUNKS;
+}
+
+/*
+ * Insert overflow leaf "nl" immediately after "l" in both the in-core
+ * (l_next) and on-disk (lh_next blkid) chains; "nl" inherits l's hash
+ * prefix.  Returns nl.
+ */
+zap_leaf_t *
+zap_leaf_chainmore(zap_leaf_t *l, zap_leaf_t *nl)
+{
+ nl->lh_prefix = l->lh_prefix;
+ nl->lh_prefix_len = l->lh_prefix_len;
+ nl->l_next = l->l_next;
+ l->l_next = nl;
+ nl->lh_next = l->lh_next;
+ l->lh_next = nl->l_blkid;
+ return (nl);
+}
+
+/*
+ * Routines which manipulate leaf chunks (l_chunk[]).
+ */
+
+/*
+ * Pop a chunk off the leaf's freelist and return its index.  Caller
+ * must have verified lh_nfree > 0 (asserted here).
+ */
+static uint16_t
+zap_leaf_chunk_alloc(zap_leaf_t *l)
+{
+ int chunk;
+
+ ASSERT(l->lh_nfree > 0);
+
+ chunk = l->l_phys->l_hdr.lh_freelist;
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
+ ASSERT3U(l->l_phys->l_chunk[chunk].l_free.lf_type, ==, ZAP_LEAF_FREE);
+
+ l->l_phys->l_hdr.lh_freelist = l->l_phys->l_chunk[chunk].l_free.lf_next;
+
+#ifdef MEMCHECK
+ /* poison freshly allocated chunks to catch stale-data use */
+ zap_memset(&l->l_phys->l_chunk[chunk], 0xa1,
+ sizeof (l->l_phys->l_chunk[chunk]));
+#endif
+
+ l->lh_nfree--;
+
+ return (chunk);
+}
+
+/*
+ * Return chunk "chunk" to the head of the leaf's freelist.  Double
+ * free is caught by the lf_type assertion.
+ */
+static void
+zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
+{
+ struct zap_leaf_free *zlf = &l->l_phys->l_chunk[chunk].l_free;
+ ASSERT3U(l->lh_nfree, <, ZAP_LEAF_NUMCHUNKS);
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
+ ASSERT(zlf->lf_type != ZAP_LEAF_FREE);
+
+#ifdef MEMCHECK
+ /* poison freed chunks to catch use-after-free */
+ zap_memset(&l->l_phys->l_chunk[chunk], 0xf4,
+ sizeof (l->l_phys->l_chunk[chunk]));
+#endif
+
+ zlf->lf_type = ZAP_LEAF_FREE;
+ zlf->lf_next = l->l_phys->l_hdr.lh_freelist;
+ bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
+ l->l_phys->l_hdr.lh_freelist = chunk;
+
+ l->lh_nfree++;
+}
+
+
+/*
+ * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
+ */
+
+/*
+ * Serialize num_integers values of integer_size bytes from "buf" into
+ * a chain of ZAP_LEAF_ARRAY chunks, big-endian byte order, and return
+ * the head chunk index.  Caller guarantees enough free chunks exist.
+ */
+static uint16_t
+zap_leaf_array_create(const zap_entry_handle_t *zeh, const char *buf,
+ int integer_size, int num_integers)
+{
+ uint16_t chunk_head;
+ uint16_t *chunkp = &chunk_head;
+ int byten = 0;
+ uint64_t value;
+ int shift = (integer_size-1)*8;
+ int len = num_integers;
+ zap_leaf_t *l = zeh->zeh_found_leaf;
+
+ ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES);
+
+ while (len > 0) {
+ uint16_t chunk = zap_leaf_chunk_alloc(l);
+ struct zap_leaf_array *la = &l->l_phys->l_chunk[chunk].l_array;
+ int i;
+
+ la->la_type = ZAP_LEAF_ARRAY;
+ for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
+ /* start of a new integer: load it once */
+ if (byten == 0)
+ value = ldv(integer_size, buf);
+ /* emit most-significant byte first */
+ la->la_array[i] = (value & (0xff << shift)) >> shift;
+ value <<= 8;
+ if (++byten == integer_size) {
+ byten = 0;
+ buf += integer_size;
+ if (--len == 0)
+ break;
+ }
+ }
+
+ /* link this chunk onto the chain */
+ *chunkp = chunk;
+ chunkp = &la->la_next;
+ }
+ *chunkp = CHAIN_END;
+
+ return (chunk_head);
+}
+
+/*
+ * Free an entire array-chunk chain and set *chunkp to CHAIN_END.
+ */
+static void
+zap_leaf_array_free(zap_entry_handle_t *zeh, uint16_t *chunkp)
+{
+ uint16_t chunk = *chunkp;
+ zap_leaf_t *l = zeh->zeh_found_leaf;
+
+ *chunkp = CHAIN_END;
+
+ while (chunk != CHAIN_END) {
+ int nextchunk = l->l_phys->l_chunk[chunk].l_array.la_next;
+ ASSERT3U(l->l_phys->l_chunk[chunk].l_array.la_type, ==,
+ ZAP_LEAF_ARRAY);
+ zap_leaf_chunk_free(l, chunk);
+ chunk = nextchunk;
+ }
+}
+
+/* array_len and buf_len are in integers, not bytes */
+/*
+ * Deserialize an array-chunk chain into "buf": reassemble big-endian
+ * integers of array_int_len bytes and store them as integers of
+ * buf_int_len bytes (widening only; asserted).  Copies at most
+ * min(array_len, buf_len) integers.
+ */
+static void
+zap_leaf_array_read(const zap_entry_handle_t *zeh, uint16_t chunk,
+ int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
+ char *buf)
+{
+ int len = MIN(array_len, buf_len);
+ int byten = 0;
+ uint64_t value = 0;
+ zap_leaf_t *l = zeh->zeh_found_leaf;
+
+ ASSERT3U(array_int_len, <=, buf_int_len);
+
+ while (len > 0) {
+ struct zap_leaf_array *la = &l->l_phys->l_chunk[chunk].l_array;
+ int i;
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
+ for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
+ value = (value << 8) | la->la_array[i];
+ byten++;
+ if (byten == array_int_len) {
+ stv(buf_int_len, buf, value);
+ byten = 0;
+ len--;
+ if (len == 0)
+ return;
+ buf += buf_int_len;
+ }
+ }
+ chunk = la->la_next;
+ }
+}
+
+/*
+ * Only to be used on 8-bit arrays.
+ * array_len is actual len in bytes (not encoded le_value_length).
+ * buf is null-terminated.
+ *
+ * Returns TRUE iff the chunk-chain contents equal the first array_len
+ * bytes of buf.
+ */
+static int
+zap_leaf_array_equal(const zap_entry_handle_t *zeh, int chunk,
+ int array_len, const char *buf)
+{
+ int bseen = 0;
+ zap_leaf_t *l = zeh->zeh_found_leaf;
+
+ while (bseen < array_len) {
+ struct zap_leaf_array *la = &l->l_phys->l_chunk[chunk].l_array;
+ int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
+ if (bcmp(la->la_array, buf + bseen, toread))
+ break;
+ chunk = la->la_next;
+ bseen += toread;
+ }
+ return (bseen == array_len);
+}
+
+/*
+ * Routines which manipulate leaf entries.
+ */
+
+/*
+ * Search leaf "l" (and its overflow chain) for the entry matching
+ * "name" at hash "h".  On success fills in *zeh and returns 0;
+ * otherwise ENOENT.
+ */
+int
+zap_leaf_lookup(zap_leaf_t *l,
+ const char *name, uint64_t h, zap_entry_handle_t *zeh)
+{
+ uint16_t *chunkp;
+ struct zap_leaf_entry *le;
+
+ zeh->zeh_head_leaf = l;
+
+again:
+ ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC);
+
+ /* walk this leaf's hash-bucket chain for h */
+ for (chunkp = LEAF_HASH_ENTPTR(l, h);
+ *chunkp != CHAIN_END; chunkp = &le->le_next) {
+ uint16_t chunk = *chunkp;
+ le = &l->l_phys->l_chunk[chunk].l_entry;
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
+ ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
+
+ if (le->le_hash != h)
+ continue;
+
+ /* zeh_found_leaf is needed by zap_leaf_array_equal */
+ zeh->zeh_found_leaf = l;
+ if (zap_leaf_array_equal(zeh, le->le_name_chunk,
+ le->le_name_length, name)) {
+ zeh->zeh_num_integers = le->le_value_length;
+ zeh->zeh_integer_size = le->le_int_size;
+ zeh->zeh_cd = le->le_cd;
+ zeh->zeh_hash = le->le_hash;
+ zeh->zeh_chunkp = chunkp;
+ zeh->zeh_found_leaf = l;
+ return (0);
+ }
+ }
+
+ /* not here; try the next chained leaf */
+ if (l->l_next) {
+ l = l->l_next;
+ goto again;
+ }
+
+ return (ENOENT);
+}
+
+/* Return (h1,cd1 >= h2,cd2) -- lexicographic compare of (hash, cd) */
+static int
+hcd_gteq(uint64_t h1, uint32_t cd1, uint64_t h2, uint32_t cd2)
+{
+ if (h1 > h2)
+ return (TRUE);
+ if (h1 == h2 && cd1 >= cd2)
+ return (TRUE);
+ return (FALSE);
+}
+
+/*
+ * Find the entry with the smallest (hash, cd) >= (h, cd) in leaf "l"
+ * and its chain (used by cursor iteration).  On success fills in *zeh
+ * -- via zeh_fakechunk, since no chain pointer is tracked -- and
+ * returns 0; ENOENT if no such entry.
+ */
+int
+zap_leaf_lookup_closest(zap_leaf_t *l,
+ uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
+{
+ uint16_t chunk;
+ uint64_t besth = -1ULL;
+ uint32_t bestcd = ZAP_MAXCD;
+ uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES-1;
+ uint16_t lh;
+ struct zap_leaf_entry *le;
+
+ zeh->zeh_head_leaf = l;
+
+again:
+ ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC);
+
+ /* scan buckets from h's bucket up to the best match found so far */
+ for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
+ for (chunk = l->l_phys->l_hash[lh];
+ chunk != CHAIN_END; chunk = le->le_next) {
+ le = &l->l_phys->l_chunk[chunk].l_entry;
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
+ ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
+
+ /* candidate must be >= (h,cd) and <= current best */
+ if (hcd_gteq(le->le_hash, le->le_cd, h, cd) &&
+ hcd_gteq(besth, bestcd, le->le_hash, le->le_cd)) {
+ ASSERT3U(bestlh, >=, lh);
+ bestlh = lh;
+ besth = le->le_hash;
+ bestcd = le->le_cd;
+
+ zeh->zeh_num_integers = le->le_value_length;
+ zeh->zeh_integer_size = le->le_int_size;
+ zeh->zeh_cd = le->le_cd;
+ zeh->zeh_hash = le->le_hash;
+ zeh->zeh_fakechunk = chunk;
+ zeh->zeh_chunkp = &zeh->zeh_fakechunk;
+ zeh->zeh_found_leaf = l;
+ }
+ }
+ }
+
+ if (l->l_next) {
+ l = l->l_next;
+ goto again;
+ }
+
+ return (bestcd == ZAP_MAXCD ? ENOENT : 0);
+}
+
+/*
+ * Copy the entry's value into "buf" as num_integers integers of
+ * integer_size bytes.  EINVAL if the stored integers are wider than
+ * requested; EOVERFLOW (after a partial copy) if the caller's buffer
+ * holds fewer integers than are stored.
+ */
+int
+zap_entry_read(const zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, void *buf)
+{
+ struct zap_leaf_entry *le;
+
+ le = &zeh->zeh_found_leaf->l_phys->l_chunk[*zeh->zeh_chunkp].l_entry;
+ ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
+
+ if (le->le_int_size > integer_size)
+ return (EINVAL);
+
+ zap_leaf_array_read(zeh, le->le_value_chunk, le->le_int_size,
+ le->le_value_length, integer_size, num_integers, buf);
+
+ if (zeh->zeh_num_integers > num_integers)
+ return (EOVERFLOW);
+ return (0);
+
+}
+
+/*
+ * Copy the entry's (NUL-terminated) name into buf, at most buflen
+ * bytes.  EOVERFLOW if the name was truncated.
+ */
+int
+zap_entry_read_name(const zap_entry_handle_t *zeh, uint16_t buflen, char *buf)
+{
+ struct zap_leaf_entry *le;
+
+ le = &zeh->zeh_found_leaf->l_phys->l_chunk[*zeh->zeh_chunkp].l_entry;
+ ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
+
+ zap_leaf_array_read(zeh, le->le_name_chunk, 1,
+ le->le_name_length, 1, buflen, buf);
+ if (le->le_name_length > buflen)
+ return (EOVERFLOW);
+ return (0);
+}
+
+/*
+ * Replace the entry's value in place within its current leaf.  Returns
+ * EAGAIN if this leaf lacks the extra chunks the new value needs (the
+ * caller then expands the leaf and retries).
+ */
+int
+zap_entry_update(zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, const void *buf)
+{
+ int delta_chunks;
+ struct zap_leaf_entry *le;
+ le = &zeh->zeh_found_leaf->l_phys->l_chunk[*zeh->zeh_chunkp].l_entry;
+
+ delta_chunks = NCHUNKS(num_integers * integer_size) -
+ NCHUNKS(le->le_value_length * le->le_int_size);
+
+ if (zeh->zeh_found_leaf->lh_nfree < delta_chunks)
+ return (EAGAIN);
+
+ /*
+ * We should search other chained leaves (via
+ * zap_entry_remove,create?) otherwise returning EAGAIN will
+ * just send us into an infinite loop if we have to chain
+ * another leaf block, rather than being able to split this
+ * block.
+ */
+
+ zap_leaf_array_free(zeh, &le->le_value_chunk);
+ le->le_value_chunk =
+ zap_leaf_array_create(zeh, buf, integer_size, num_integers);
+ /*
+ * NOTE(review): MAX_ARRAY_BYTES+1 looks like an "oversized"
+ * sentinel, though zap_leaf_array_create asserts total size
+ * < MAX_ARRAY_BYTES -- confirm whether this branch is reachable.
+ */
+ le->le_value_length = (num_integers*integer_size > MAX_ARRAY_BYTES) ?
+ (MAX_ARRAY_BYTES + 1) : (num_integers);
+ le->le_int_size = integer_size;
+ return (0);
+}
+
+/*
+ * Remove the entry from its leaf: free its name and value chunk
+ * chains, unlink it from the hash bucket, and free the entry chunk.
+ * Must not be called on a handle from lookup_closest (fakechunk).
+ */
+void
+zap_entry_remove(zap_entry_handle_t *zeh)
+{
+ uint16_t entry_chunk;
+ struct zap_leaf_entry *le;
+ zap_leaf_t *l = zeh->zeh_found_leaf;
+
+ ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
+
+ entry_chunk = *zeh->zeh_chunkp;
+ le = &l->l_phys->l_chunk[entry_chunk].l_entry;
+ ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
+
+ zap_leaf_array_free(zeh, &le->le_name_chunk);
+ zap_leaf_array_free(zeh, &le->le_value_chunk);
+
+ /* unlink from the hash chain, then release the entry chunk itself */
+ *zeh->zeh_chunkp = le->le_next;
+ zap_leaf_chunk_free(l, entry_chunk);
+
+ l->lh_nentries--;
+}
+
+/*
+ * Create a new entry (name -> value array) in leaf chain 'l'.  If
+ * cd == ZAP_MAXCD, pick the lowest collision differentiator for
+ * hash 'h' not used anywhere in the chain; otherwise use the given
+ * cd as-is.  Returns ENAMETOOLONG/E2BIG for oversize entries,
+ * ENOSPC when all cd values for this hash are taken, EAGAIN when no
+ * leaf in the chain has room (caller must split or chain another
+ * leaf), and fills in *zeh on success.
+ */
+int
+zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
+ uint8_t integer_size, uint64_t num_integers, const void *buf,
+ zap_entry_handle_t *zeh)
+{
+ uint16_t chunk;
+ uint16_t *chunkp;
+ struct zap_leaf_entry *le;
+ uint64_t namelen, valuelen;
+ int numchunks;
+
+ valuelen = integer_size * num_integers;
+ namelen = strlen(name) + 1;
+ ASSERT(namelen >= 2);
+
+ zeh->zeh_head_leaf = l;
+
+ if (namelen > MAXNAMELEN)
+ return (ENAMETOOLONG);
+ /* find the first leaf in the chain that has sufficient free space */
+ numchunks = 1 + NCHUNKS(namelen) + NCHUNKS(valuelen);
+ if (numchunks > ZAP_LEAF_NUMCHUNKS)
+ return (E2BIG);
+
+ if (cd == ZAP_MAXCD) {
+ /* search every leaf in the chain for each candidate cd */
+ for (cd = 0; cd < ZAP_MAXCD; cd++) {
+ zap_leaf_t *ll;
+ for (ll = l; ll; ll = ll->l_next) {
+ for (chunk = *LEAF_HASH_ENTPTR(ll, h);
+ chunk != CHAIN_END; chunk = le->le_next) {
+ le = &ll->l_phys->l_chunk
+ [chunk].l_entry;
+ if (le->le_hash == h &&
+ le->le_cd == cd) {
+ break;
+ }
+ }
+ /*
+ * if this cd is in use, no need to
+ * check more chained leafs
+ */
+ if (chunk != CHAIN_END)
+ break;
+ }
+ /* If this cd is not in use, we are good. */
+ if (chunk == CHAIN_END)
+ break;
+ }
+ /* If we tried all the cd's, we lose. */
+ if (cd == ZAP_MAXCD)
+ return (ENOSPC);
+ }
+
+ for (; l; l = l->l_next)
+ if (l->lh_nfree >= numchunks)
+ break;
+ if (l == NULL)
+ return (EAGAIN);
+
+ zeh->zeh_found_leaf = l;
+
+ /* make the entry */
+ chunk = zap_leaf_chunk_alloc(l);
+ le = &l->l_phys->l_chunk[chunk].l_entry;
+ le->le_type = ZAP_LEAF_ENTRY;
+ le->le_name_chunk = zap_leaf_array_create(zeh, name, 1, namelen);
+ le->le_name_length = namelen;
+ le->le_value_chunk =
+ zap_leaf_array_create(zeh, buf, integer_size, num_integers);
+ /* oversize values store the sentinel length, as in zap_entry_update */
+ le->le_value_length = (num_integers*integer_size > MAX_ARRAY_BYTES) ?
+ (MAX_ARRAY_BYTES + 1) : (num_integers);
+ le->le_int_size = integer_size;
+ le->le_hash = h;
+ le->le_cd = cd;
+
+ /* link it into the hash chain */
+ chunkp = LEAF_HASH_ENTPTR(l, h);
+ le->le_next = *chunkp;
+ *chunkp = chunk;
+
+ l->lh_nentries++;
+
+ zeh->zeh_num_integers = num_integers;
+ zeh->zeh_integer_size = le->le_int_size;
+ zeh->zeh_cd = le->le_cd;
+ zeh->zeh_hash = le->le_hash;
+ zeh->zeh_chunkp = chunkp;
+
+ return (0);
+}
+
+/*
+ * Routines for transferring entries between leafs.
+ */
+
+/*
+ * Link the entry stored in chunk 'entry' onto the head of the
+ * hash-bucket chain that its le_hash maps to within leaf 'l'.
+ */
+static void
+zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
+{
+ struct zap_leaf_entry *le;
+ uint16_t *bucketp;
+
+ le = &l->l_phys->l_chunk[entry].l_entry;
+ bucketp = LEAF_HASH_ENTPTR(l, le->le_hash);
+ le->le_next = *bucketp;
+ *bucketp = entry;
+}
+
+/*
+ * Rebuild leaf 'l's hash table from scratch by re-inserting every
+ * live entry chunk.  No-op if the leaf holds no entries.
+ */
+static void
+zap_leaf_rehash_entries(zap_leaf_t *l)
+{
+ int i;
+
+ if (l->lh_nentries == 0)
+ return;
+
+ /* break existing hash chains */
+ zap_memset(l->l_phys->l_hash, CHAIN_END, sizeof (l->l_phys->l_hash));
+
+ for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) {
+ struct zap_leaf_entry *le = &l->l_phys->l_chunk[i].l_entry;
+ if (le->le_type != ZAP_LEAF_ENTRY)
+ continue;
+ zap_leaf_rehash_entry(l, i);
+ }
+}
+
+/*
+ * Move a chunk array (name or value data) headed by 'chunk' from
+ * leaf 'l' into leaf 'nl', copying each chunk and freeing the
+ * original.  Returns the head chunk of the new array in 'nl'.
+ */
+static uint16_t
+zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
+{
+ uint16_t new_chunk;
+ uint16_t *nchunkp = &new_chunk;
+
+ while (chunk != CHAIN_END) {
+ uint16_t nchunk = zap_leaf_chunk_alloc(nl);
+ struct zap_leaf_array *nla =
+ &nl->l_phys->l_chunk[nchunk].l_array;
+ struct zap_leaf_array *la =
+ &l->l_phys->l_chunk[chunk].l_array;
+ int nextchunk = la->la_next;
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
+ ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS);
+
+ *nla = *la;
+
+ zap_leaf_chunk_free(l, chunk);
+ chunk = nextchunk;
+ /* patch the previous copy's la_next to point at this one */
+ *nchunkp = nchunk;
+ nchunkp = &nla->la_next;
+ }
+ *nchunkp = CHAIN_END;
+ return (new_chunk);
+}
+
+/*
+ * Move one entry from leaf 'l' into the destination leaf chain
+ * headed by 'nhl', chaining a fresh leaf onto the destination if
+ * none has enough free chunks.
+ */
+static void
+zap_leaf_transfer_entry(zap_t *zap, zap_leaf_t *l, int entry, zap_leaf_t *nhl,
+ dmu_tx_t *tx)
+{
+ zap_leaf_t *nl;
+ struct zap_leaf_entry *le, *nle;
+ uint16_t chunk, nchunks;
+
+ le = &l->l_phys->l_chunk[entry].l_entry;
+ ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
+
+ /* find a leaf in the destination leaf chain with enough free space */
+ nchunks = 1 + NCHUNKS(le->le_name_length) +
+ NCHUNKS(le->le_value_length * le->le_int_size);
+ for (nl = nhl; nl; nl = nl->l_next)
+ if (nl->lh_nfree >= nchunks)
+ break;
+ if (nl == NULL) {
+ nl = zap_leaf_chainmore(nhl, zap_create_leaf(zap, tx));
+ dprintf("transfer_entry: chaining leaf %x/%d\n",
+ nl->lh_prefix, nl->lh_prefix_len);
+ }
+
+ chunk = zap_leaf_chunk_alloc(nl);
+ nle = &nl->l_phys->l_chunk[chunk].l_entry;
+ *nle = *le;
+
+ zap_leaf_rehash_entry(nl, chunk);
+
+ /* move the name and value chunk arrays along with the entry */
+ nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
+ nle->le_value_chunk =
+ zap_leaf_transfer_array(l, le->le_value_chunk, nl);
+
+ zap_leaf_chunk_free(l, entry);
+
+ l->lh_nentries--;
+ nl->lh_nentries++;
+}
+
+/*
+ * Transfer entries whose hash bit 'bit' is 1 to nl1, and 0 to nl0.
+ * Ignore leaf chaining in source (l), but chain in destinations.
+ * We'll re-chain all the entries in l as we go along.
+ */
+/*
+ * Transfer entries whose hash bit 'bit' is 1 to nl1, and 0 to nl0.
+ * Ignore leaf chaining in source (l), but chain in destinations.
+ * We'll re-chain all the entries in l as we go along.
+ */
+static void
+zap_leaf_transfer_entries(zap_t *zap, zap_leaf_t *l,
+ zap_leaf_t *nl0, zap_leaf_t *nl1, int bit, dmu_tx_t *tx)
+{
+ int i;
+
+ ASSERT(bit < 64 && bit >= 0);
+ /* break existing hash chains */
+ zap_memset(l->l_phys->l_hash, CHAIN_END, sizeof (l->l_phys->l_hash));
+
+ /* destinations must have valid hash tables before we insert */
+ if (nl0 != l)
+ zap_leaf_rehash_entries(nl0);
+ if (nl1 != nl0)
+ zap_leaf_rehash_entries(nl1);
+
+ for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) {
+ struct zap_leaf_entry *le = &l->l_phys->l_chunk[i].l_entry;
+ if (le->le_type != ZAP_LEAF_ENTRY)
+ continue;
+
+ /*
+ * We could find entries via hashtable instead. That
+ * would be O(hashents+numents) rather than
+ * O(numblks+numents), but this accesses memory more
+ * sequentially, and when we're called, the block is
+ * usually pretty full.
+ */
+
+ if (le->le_hash & (1ULL << bit)) {
+ zap_leaf_transfer_entry(zap, l, i, nl1, tx);
+ } else {
+ if (nl0 == l)
+ zap_leaf_rehash_entry(l, i);
+ else
+ zap_leaf_transfer_entry(zap, l, i, nl0, tx);
+ }
+ }
+
+}
+
+/*
+ * nl will contain the entries whose hash prefix ends in 1
+ * handles leaf chaining
+ */
+/*
+ * nl will contain the entries whose hash prefix ends in 1
+ * handles leaf chaining
+ */
+zap_leaf_t *
+zap_leaf_split(zap_t *zap, zap_leaf_t *hl, dmu_tx_t *tx)
+{
+ zap_leaf_t *l = hl;
+ /* the hash bit newly covered by the one-longer prefix */
+ int bit = 64 - 1 - hl->lh_prefix_len;
+ zap_leaf_t *nl = zap_create_leaf(zap, tx);
+
+ /* set new prefix and prefix_len */
+ hl->lh_prefix <<= 1;
+ hl->lh_prefix_len++;
+ nl->lh_prefix = hl->lh_prefix | 1;
+ nl->lh_prefix_len = hl->lh_prefix_len;
+
+ /* transfer odd entries from first leaf in hl chain to nl */
+ zap_leaf_transfer_entries(zap, hl, hl, nl, bit, tx);
+
+ /* take rest of chain off hl */
+ l = hl->l_next;
+ hl->l_next = NULL;
+ hl->lh_next = 0;
+
+ /* transfer even entries from hl chain back to hl, odd entries to nl */
+ while (l) {
+ zap_leaf_t *next = l->l_next;
+ zap_leaf_transfer_entries(zap, l, hl, nl, bit, tx);
+ zap_destroy_leaf(zap, l, tx);
+ l = next;
+ }
+
+ return (nl);
+}
+
+/*
+ * Accumulate histogram statistics for leaf 'l' and its chain into
+ * *zs: pointer-table coverage, entries per block, block fullness,
+ * chunks per entry, and hash-bucket chain lengths.
+ */
+void
+zap_stats_leaf(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
+{
+ int n, nchained = 0;
+
+ n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - l->lh_prefix_len;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_leafs_with_2n_pointers[n]++;
+
+ do {
+ int i;
+
+ n = l->lh_nentries/5;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_blocks_with_n5_entries[n]++;
+
+ /* fullness in tenths, derived from the free chunk count */
+ n = ((1<<ZAP_BLOCK_SHIFT) -
+ l->lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
+ (1<<ZAP_BLOCK_SHIFT);
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_blocks_n_tenths_full[n]++;
+
+ for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES; i++) {
+ int nentries = 0;
+ int chunk = l->l_phys->l_hash[i];
+
+ while (chunk != CHAIN_END) {
+ struct zap_leaf_entry *le =
+ &l->l_phys->l_chunk[chunk].l_entry;
+
+ n = 1 + NCHUNKS(le->le_name_length) +
+ NCHUNKS(le->le_value_length *
+ le->le_int_size);
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_entries_using_n_chunks[n]++;
+
+ chunk = le->le_next;
+ nentries++;
+ }
+
+ n = nentries;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_buckets_with_n_entries[n]++;
+ }
+
+ nchained++;
+ l = l->l_next;
+ } while (l);
+
+ n = nchained-1;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_leafs_with_n_chained[n]++;
+}
diff --git a/usr/src/uts/common/fs/zfs/zap_micro.c b/usr/src/uts/common/fs/zfs/zap_micro.c
new file mode 100644
index 0000000000..998b67c50f
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zap_micro.c
@@ -0,0 +1,823 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zfs_context.h>
+#include <sys/zap.h>
+#include <sys/zap_impl.h>
+#include <sys/avl.h>
+
+
+static uint64_t mzap_write_cookie(zap_t *zap, uint64_t cookie,
+ uint64_t entptr);
+static void mzap_upgrade(zap_t *zap, dmu_tx_t *tx);
+
+
+/*
+ * Byteswap an entire microzap block in place: the two 64-bit header
+ * words, then the value and cd fields of every chunk.
+ */
+static void
+mzap_byteswap(mzap_phys_t *buf, size_t size)
+{
+ int i;
+ int nent = (size / MZAP_ENT_LEN) - 1;
+
+ buf->mz_block_type = BSWAP_64(buf->mz_block_type);
+ buf->mz_salt = BSWAP_64(buf->mz_salt);
+ for (i = 0; i < nent; i++) {
+ mzap_ent_phys_t *mze = &buf->mz_chunk[i];
+
+ mze->mze_value = BSWAP_64(mze->mze_value);
+ mze->mze_cd = BSWAP_32(mze->mze_cd);
+ }
+}
+
+/*
+ * Byteswap a zap block.  Microzap blocks are identified by their
+ * leading block-type word (checked in either byte order); anything
+ * else is handed to the fatzap code.
+ */
+void
+zap_byteswap(void *buf, size_t size)
+{
+ uint64_t block_type;
+
+ block_type = *(uint64_t *)buf;
+
+ switch (block_type) {
+ case ZBT_MICRO:
+ case BSWAP_64(ZBT_MICRO):
+ /* ASSERT(magic == ZAP_LEAF_MAGIC); */
+ mzap_byteswap(buf, size);
+ return;
+ default:
+ ASSERT(size == (1<<ZAP_BLOCK_SHIFT));
+ fzap_byteswap(buf, size);
+ return;
+ }
+}
+
+/*
+ * AVL comparator: order microzap entries by hash, breaking ties
+ * with the collision differentiator (cd).
+ */
+static int
+mze_compare(const void *arg1, const void *arg2)
+{
+ const mzap_ent_t *a = arg1;
+ const mzap_ent_t *b = arg2;
+
+ if (a->mze_hash != b->mze_hash)
+ return (a->mze_hash < b->mze_hash ? -1 : +1);
+ if (a->mze_phys.mze_cd != b->mze_phys.mze_cd)
+ return (a->mze_phys.mze_cd < b->mze_phys.mze_cd ? -1 : +1);
+ return (0);
+}
+
+/*
+ * Add an in-core AVL node mirroring on-disk microzap chunk
+ * 'chunkid'.  Caller must hold the zap rwlock as writer.
+ */
+static void
+mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep)
+{
+ mzap_ent_t *mze;
+
+ ASSERT(zap->zap_ismicro);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ ASSERT(mzep->mze_cd < ZAP_MAXCD);
+ ASSERT3U(zap_hash(zap, mzep->mze_name), ==, hash);
+
+ mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
+ mze->mze_chunkid = chunkid;
+ mze->mze_hash = hash;
+ mze->mze_phys = *mzep;
+ avl_add(&zap->zap_m.zap_avl, mze);
+}
+
+/*
+ * Look up 'name' in the in-core AVL tree: seek to the first node
+ * with the given hash, then scan all same-hash nodes for a string
+ * match.  Returns NULL if not found, including when the name could
+ * never fit in a microzap entry.
+ */
+static mzap_ent_t *
+mze_find(zap_t *zap, const char *name, uint64_t hash)
+{
+ mzap_ent_t mze_tofind;
+ mzap_ent_t *mze;
+ avl_index_t idx;
+ avl_tree_t *avl = &zap->zap_m.zap_avl;
+
+ ASSERT(zap->zap_ismicro);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ ASSERT3U(zap_hash(zap, name), ==, hash);
+
+ if (strlen(name) >= sizeof (mze_tofind.mze_phys.mze_name))
+ return (NULL);
+
+ /* cd 0 sorts first, so this lands at/before all same-hash nodes */
+ mze_tofind.mze_hash = hash;
+ mze_tofind.mze_phys.mze_cd = 0;
+
+ mze = avl_find(avl, &mze_tofind, &idx);
+ if (mze == NULL)
+ mze = avl_nearest(avl, idx, AVL_AFTER);
+ for (; mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
+ if (strcmp(name, mze->mze_phys.mze_name) == 0)
+ return (mze);
+ }
+ return (NULL);
+}
+
+/*
+ * Return the lowest collision differentiator not yet used by any
+ * entry with this hash.  Relies on same-hash AVL nodes being
+ * sorted by cd, so the first gap in the sequence is the answer.
+ */
+static uint32_t
+mze_find_unused_cd(zap_t *zap, uint64_t hash)
+{
+ mzap_ent_t mze_tofind;
+ mzap_ent_t *mze;
+ avl_index_t idx;
+ avl_tree_t *avl = &zap->zap_m.zap_avl;
+ uint32_t cd;
+
+ ASSERT(zap->zap_ismicro);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ mze_tofind.mze_hash = hash;
+ mze_tofind.mze_phys.mze_cd = 0;
+
+ cd = 0;
+ for (mze = avl_find(avl, &mze_tofind, &idx);
+ mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
+ if (mze->mze_phys.mze_cd != cd)
+ break;
+ cd++;
+ }
+
+ return (cd);
+}
+
+/*
+ * Unlink a microzap entry from the in-core AVL tree and free it.
+ * Caller must hold the zap rwlock as writer.
+ */
+static void
+mze_remove(zap_t *zap, mzap_ent_t *mze)
+{
+ avl_tree_t *avl = &zap->zap_m.zap_avl;
+
+ ASSERT(zap->zap_ismicro);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ avl_remove(avl, mze);
+ kmem_free(mze, sizeof (*mze));
+}
+
+/*
+ * Free every node in the in-core AVL tree and tear the tree down.
+ * (Fix: parenthesize the assignment used as the loop condition so
+ * the intent is explicit and -Wparentheses is satisfied.)
+ */
+static void
+mze_destroy(zap_t *zap)
+{
+ mzap_ent_t *mze;
+ void *avlcookie = NULL;
+
+ while ((mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie))
+ != NULL)
+ kmem_free(mze, sizeof (mzap_ent_t));
+ avl_destroy(&zap->zap_m.zap_avl);
+}
+
+/*
+ * Construct the in-core zap_t for object 'obj' from its first block.
+ * If another thread races us and attaches a zap_t to the dbuf first,
+ * return the winner's and discard ours.  For a microzap, mirror
+ * every populated on-disk chunk into the AVL tree.
+ */
+static zap_t *
+mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
+{
+ zap_t *winner;
+ zap_t *zap;
+ int i;
+
+ ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
+
+ zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
+ rw_init(&zap->zap_rwlock, 0, 0, 0);
+ rw_enter(&zap->zap_rwlock, RW_WRITER);
+ zap->zap_objset = os;
+ zap->zap_object = obj;
+ zap->zap_dbuf = db;
+
+ if (((uint64_t *)db->db_data)[0] != ZBT_MICRO) {
+ mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
+ } else {
+ zap->zap_ismicro = TRUE;
+ }
+
+ /*
+ * Make sure that zap_ismicro is set before we let others see
+ * it, because zap_lockdir() checks zap_ismicro without the lock
+ * held.
+ */
+ winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_pageout);
+
+ if (winner != NULL) {
+ kmem_free(zap, sizeof (zap_t));
+ return (winner);
+ }
+
+ if (zap->zap_ismicro) {
+ zap->zap_salt = zap->zap_m.zap_phys->mz_salt;
+ zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
+ avl_create(&zap->zap_m.zap_avl, mze_compare,
+ sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
+
+ for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
+ mzap_ent_phys_t *mze =
+ &zap->zap_m.zap_phys->mz_chunk[i];
+ /* a NUL first name byte marks an unused chunk */
+ if (mze->mze_name[0]) {
+ zap->zap_m.zap_num_entries++;
+ mze_insert(zap, i,
+ zap_hash(zap, mze->mze_name), mze);
+ }
+ }
+ } else {
+ zap->zap_salt = zap->zap_f.zap_phys->zap_salt;
+ }
+ rw_exit(&zap->zap_rwlock);
+ return (zap);
+}
+
+/*
+ * Find (creating, if needed) the in-core zap for object 'obj' and
+ * return it locked in *zapp.  'lti' is the lock type the caller
+ * wants; if 'fatreader' is set, a fat zap may be locked as reader
+ * even when a micro zap would need the writer lock.  When a write
+ * tx is supplied and the micro zap is full, grow its block or
+ * upgrade it to a fat zap.  Always returns 0 today; callers check
+ * the error anyway for future-proofing.
+ */
+int
+zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
+    krw_t lti, int fatreader, zap_t **zapp)
+{
+ zap_t *zap;
+ dmu_buf_t *db;
+ krw_t lt;
+ int err;
+
+ *zapp = NULL;
+
+ db = dmu_buf_hold(os, obj, 0);
+
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(db, &doi);
+ ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
+ }
+#endif
+
+ /*
+ * The zap can deal with EIO here, but its callers don't yet, so
+ * spare them by doing a mustsucceed read.
+ */
+ dmu_buf_read(db);
+
+ zap = dmu_buf_get_user(db);
+ if (zap == NULL)
+ zap = mzap_open(os, obj, db);
+
+ /*
+ * We're checking zap_ismicro without the lock held, in order to
+ * tell what type of lock we want.  Once we have some sort of
+ * lock, see if it really is the right type.  In practice this
+ * can only be different if it was upgraded from micro to fat,
+ * and micro wanted WRITER but fat only needs READER.
+ */
+ lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
+ rw_enter(&zap->zap_rwlock, lt);
+ if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
+ /* it was upgraded, now we only need reader */
+ ASSERT(lt == RW_WRITER);
+ /*
+ * Bug fix: the desired-lock expression must be fully
+ * parenthesized.  Previously '==' bound tighter than
+ * '?:', so the ASSERT evaluated
+ * (RW_READER == cond) ? RW_READER : lti
+ * and did not check the intended condition at all.
+ */
+ ASSERT(RW_READER ==
+ ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
+ rw_downgrade(&zap->zap_rwlock);
+ lt = RW_READER;
+ }
+
+ zap->zap_objset = os;
+
+ if (lt == RW_WRITER)
+ dmu_buf_will_dirty(db, tx);
+
+ ASSERT3P(zap->zap_dbuf, ==, db);
+
+ ASSERT(!zap->zap_ismicro ||
+ zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
+ if (zap->zap_ismicro && tx &&
+ zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
+ /* the microzap is full: grow the block, or go fat */
+ uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
+ if (newsz > MZAP_MAX_BLKSZ) {
+ dprintf("upgrading obj %llu: num_entries=%u\n",
+ obj, zap->zap_m.zap_num_entries);
+ mzap_upgrade(zap, tx);
+ *zapp = zap;
+ return (0);
+ }
+ err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
+ ASSERT3U(err, ==, 0);
+ zap->zap_m.zap_num_chunks =
+ db->db_size / MZAP_ENT_LEN - 1;
+ }
+
+ *zapp = zap;
+ return (0);
+}
+
+/*
+ * Drop the zap lock and release the directory dbuf that
+ * zap_lockdir() acquired.
+ */
+void
+zap_unlockdir(zap_t *zap)
+{
+ dmu_buf_t *db = zap->zap_dbuf;
+
+ rw_exit(&zap->zap_rwlock);
+ dmu_buf_rele(db);
+}
+
+/*
+ * Convert a full microzap into a fatzap: snapshot the current
+ * block, reformat the object as a fatzap, then re-add every entry
+ * preserving its original collision differentiator.
+ * (Fix: the loop previously declared an inner 'int err' shadowing
+ * the outer one; the shadow is removed.)
+ */
+static void
+mzap_upgrade(zap_t *zap, dmu_tx_t *tx)
+{
+ mzap_phys_t *mzp;
+ int i, sz, nchunks, err;
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ /* copy the microzap block aside before reformatting it */
+ sz = zap->zap_dbuf->db_size;
+ mzp = kmem_alloc(sz, KM_SLEEP);
+ bcopy(zap->zap_dbuf->db_data, mzp, sz);
+ nchunks = zap->zap_m.zap_num_chunks;
+
+ err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
+ 1ULL << ZAP_BLOCK_SHIFT, 0, tx);
+ ASSERT(err == 0);
+
+ dprintf("upgrading obj=%llu with %u chunks\n",
+ zap->zap_object, nchunks);
+ mze_destroy(zap);
+
+ fzap_upgrade(zap, tx);
+
+ for (i = 0; i < nchunks; i++) {
+ mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
+ if (mze->mze_name[0] == 0)
+ continue;
+ dprintf("adding %s=%llu\n",
+ mze->mze_name, mze->mze_value);
+ err = fzap_add_cd(zap,
+ mze->mze_name, 8, 1, &mze->mze_value,
+ mze->mze_cd, tx, NULL);
+ ASSERT3U(err, ==, 0);
+ }
+ kmem_free(mzp, sz);
+}
+
+/*
+ * Salted CRC64 hash of 'name'; only the high ZAP_HASHBITS bits are
+ * kept (the low bits are cleared below for the cursor's cd field).
+ */
+uint64_t
+zap_hash(zap_t *zap, const char *name)
+{
+ const uint8_t *cp;
+ uint8_t c;
+ uint64_t crc = zap->zap_salt;
+
+ ASSERT(crc != 0);
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+ for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
+
+ /*
+ * Only use 28 bits, since we need 4 bits in the cookie for the
+ * collision differentiator. We MUST use the high bits, since
+ * those are the ones that we first pay attention to when
+ * choosing the bucket.
+ */
+ crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
+
+ return (crc);
+}
+
+
+/*
+ * Format object 'obj' as an empty microzap: set the block type and
+ * a guaranteed-nonzero per-object hash salt.
+ */
+static void
+mzap_create_impl(objset_t *os, uint64_t obj, dmu_tx_t *tx)
+{
+ dmu_buf_t *db;
+ mzap_phys_t *zp;
+
+ db = dmu_buf_hold(os, obj, 0);
+
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(db, &doi);
+ ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
+ }
+#endif
+
+ dmu_buf_will_dirty(db, tx);
+ zp = db->db_data;
+ zp->mz_block_type = ZBT_MICRO;
+ /* the trailing |1 keeps the salt nonzero, as zap_hash asserts */
+ zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
+ ASSERT(zp->mz_salt != 0);
+ dmu_buf_rele(db);
+}
+
+/*
+ * Claim object number 'obj' and initialize it as an empty microzap.
+ * Returns the error from dmu_object_claim(), or 0 on success.
+ */
+int
+zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ int err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
+
+ if (err == 0)
+ mzap_create_impl(os, obj, tx);
+ return (err);
+}
+
+/*
+ * Allocate a fresh object, initialize it as an empty microzap, and
+ * return its object number.
+ */
+uint64_t
+zap_create(objset_t *os, dmu_object_type_t ot,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ uint64_t obj;
+
+ obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
+ mzap_create_impl(os, obj, tx);
+ return (obj);
+}
+
+/*
+ * Destroy a zap object, releasing both its on-disk and in-core
+ * state.  Returns the error from dmu_object_free().
+ */
+int
+zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
+{
+ /*
+ * dmu_object_free will free the object number and free the
+ * data. Freeing the data will cause our pageout function to be
+ * called, which will destroy our data (zap_leaf_t's and zap_t).
+ */
+
+ return (dmu_object_free(os, zapobj, tx));
+}
+
+_NOTE(ARGSUSED(0))
+/*
+ * dbuf user-eviction callback: tear down in-core zap state when the
+ * directory block is evicted or the object is destroyed.
+ */
+void
+zap_pageout(dmu_buf_t *db, void *vmzap)
+{
+ zap_t *zap = vmzap;
+
+ rw_destroy(&zap->zap_rwlock);
+
+ if (zap->zap_ismicro) {
+ mze_destroy(zap);
+ }
+
+ kmem_free(zap, sizeof (zap_t));
+}
+
+
+/*
+ * Store the number of entries in the zap object in *count.
+ */
+int
+zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
+{
+ zap_t *zap;
+ int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
+
+ if (err != 0)
+ return (err);
+ if (zap->zap_ismicro)
+ *count = zap->zap_m.zap_num_entries;
+ else
+ err = fzap_count(zap, count);
+ zap_unlockdir(zap);
+ return (err);
+}
+
+/*
+ * Routines for manipulating attributes.
+ */
+
+/*
+ * Read the value of attribute 'name' into 'buf'.  Microzap entries
+ * are always a single uint64_t, so other requested shapes yield
+ * EINVAL or EOVERFLOW there.  Returns ENOENT if the name is absent.
+ */
+int
+zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+ zap_t *zap;
+ int err;
+ mzap_ent_t *mze;
+
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
+ if (err)
+ return (err);
+ if (!zap->zap_ismicro) {
+ err = fzap_lookup(zap, name,
+ integer_size, num_integers, buf);
+ } else {
+ mze = mze_find(zap, name, zap_hash(zap, name));
+ if (mze == NULL) {
+ err = ENOENT;
+ } else {
+ if (num_integers < 1)
+ err = EOVERFLOW;
+ else if (integer_size != 8)
+ err = EINVAL;
+ else
+ *(uint64_t *)buf = mze->mze_phys.mze_value;
+ }
+ }
+ zap_unlockdir(zap);
+ return (err);
+}
+
+/*
+ * Report the value shape (integer size and count) of attribute
+ * 'name'; microzap values are always one 8-byte integer.  Either
+ * out-parameter may be NULL.  Returns ENOENT if absent.
+ */
+int
+zap_length(objset_t *os, uint64_t zapobj, const char *name,
+ uint64_t *integer_size, uint64_t *num_integers)
+{
+ zap_t *zap;
+ int err;
+ mzap_ent_t *mze;
+
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
+ if (err)
+ return (err);
+ if (!zap->zap_ismicro) {
+ err = fzap_length(zap, name, integer_size, num_integers);
+ } else {
+ mze = mze_find(zap, name, zap_hash(zap, name));
+ if (mze == NULL) {
+ err = ENOENT;
+ } else {
+ if (integer_size)
+ *integer_size = 8;
+ if (num_integers)
+ *num_integers = 1;
+ }
+ }
+ zap_unlockdir(zap);
+ return (err);
+}
+
+/*
+ * Insert a new name=value pair into the microzap, scanning for a
+ * free chunk starting at the allocation rotor and wrapping around
+ * once.  Caller guarantees the name is absent and a chunk is free.
+ */
+static void
+mzap_addent(zap_t *zap, const char *name, uint64_t hash, uint64_t value)
+{
+ int i;
+ int start = zap->zap_m.zap_alloc_next;
+ uint32_t cd;
+
+ dprintf("obj=%llu %s=%llu\n", zap->zap_object, name, value);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+#ifdef ZFS_DEBUG
+ /* debug-only: verify the caller's no-duplicate guarantee */
+ for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
+ mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
+ ASSERT(strcmp(name, mze->mze_name) != 0);
+ }
+#endif
+
+ cd = mze_find_unused_cd(zap, hash);
+ /* given the limited size of the microzap, this can't happen */
+ ASSERT(cd != ZAP_MAXCD);
+
+again:
+ for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
+ mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
+ if (mze->mze_name[0] == 0) {
+ mze->mze_value = value;
+ mze->mze_cd = cd;
+ (void) strcpy(mze->mze_name, name);
+ zap->zap_m.zap_num_entries++;
+ zap->zap_m.zap_alloc_next = i+1;
+ if (zap->zap_m.zap_alloc_next ==
+ zap->zap_m.zap_num_chunks)
+ zap->zap_m.zap_alloc_next = 0;
+ mze_insert(zap, i, hash, mze);
+ return;
+ }
+ }
+ if (start != 0) {
+ start = 0;
+ goto again;
+ }
+ ASSERT(!"out of entries!");
+}
+
+/*
+ * Add a new attribute; fails with EEXIST if already present.
+ * Values that don't fit microzap constraints (a single 8-byte
+ * integer, short name) force an upgrade to a fatzap first.
+ */
+int
+zap_add(objset_t *os, uint64_t zapobj, const char *name,
+ int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+ mzap_ent_t *mze;
+ const uint64_t *intval = val;
+ uint64_t hash;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap);
+ if (err)
+ return (err);
+ if (!zap->zap_ismicro) {
+ err = fzap_add(zap, name, integer_size, num_integers, val, tx);
+ } else if (integer_size != 8 || num_integers != 1 ||
+ strlen(name) >= MZAP_NAME_LEN) {
+ dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
+ zapobj, integer_size, num_integers, name);
+ mzap_upgrade(zap, tx);
+ err = fzap_add(zap, name, integer_size, num_integers, val, tx);
+ } else {
+ hash = zap_hash(zap, name);
+ mze = mze_find(zap, name, hash);
+ if (mze != NULL) {
+ err = EEXIST;
+ } else {
+ mzap_addent(zap, name, hash, *intval);
+ }
+ }
+ zap_unlockdir(zap);
+ return (err);
+}
+
+/*
+ * Set name=val in the zap object, creating the entry if it does not
+ * exist.  Only an 8-byte single-integer value with a short name
+ * fits in a microzap; any other shape forces an upgrade to a fatzap
+ * first.  Returns 0 on success or an error from the fatzap layer.
+ */
+int
+zap_update(objset_t *os, uint64_t zapobj, const char *name,
+    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ mzap_ent_t *mze;
+ const uint64_t *intval = val;
+ uint64_t hash;
+ int err;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap);
+ if (err)
+ return (err);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ if (!zap->zap_ismicro) {
+ err = fzap_update(zap, name,
+ integer_size, num_integers, val, tx);
+ } else if (integer_size != 8 || num_integers != 1 ||
+ strlen(name) >= MZAP_NAME_LEN) {
+ dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
+ zapobj, integer_size, num_integers, name);
+ mzap_upgrade(zap, tx);
+ err = fzap_update(zap, name,
+ integer_size, num_integers, val, tx);
+ } else {
+ hash = zap_hash(zap, name);
+ mze = mze_find(zap, name, hash);
+ if (mze != NULL) {
+ /* keep the in-core AVL copy and disk in sync */
+ mze->mze_phys.mze_value = *intval;
+ zap->zap_m.zap_phys->mz_chunk
+ [mze->mze_chunkid].mze_value = *intval;
+ } else {
+ mzap_addent(zap, name, hash, *intval);
+ }
+ }
+ zap_unlockdir(zap);
+ /*
+ * Bug fix: this function previously returned 0 unconditionally,
+ * silently discarding any error from fzap_update().
+ */
+ return (err);
+}
+
+/*
+ * Remove attribute 'name'; returns ENOENT if it is absent.  For a
+ * microzap both the on-disk chunk and the in-core AVL node are
+ * cleared.
+ */
+int
+zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+ mzap_ent_t *mze;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap);
+ if (err)
+ return (err);
+ if (!zap->zap_ismicro) {
+ err = fzap_remove(zap, name, tx);
+ } else {
+ mze = mze_find(zap, name, zap_hash(zap, name));
+ if (mze == NULL) {
+ dprintf("fail: %s\n", name);
+ err = ENOENT;
+ } else {
+ dprintf("success: %s\n", name);
+ zap->zap_m.zap_num_entries--;
+ bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
+ sizeof (mzap_ent_phys_t));
+ mze_remove(zap, mze);
+ }
+ }
+ zap_unlockdir(zap);
+ return (err);
+}
+
+
+/*
+ * Routines for iterating over the attributes.
+ */
+
+/*
+ * Initialize a cursor to iterate the given zap object from the
+ * beginning.
+ */
+void
+zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
+{
+ zc->zc_hash = 0;
+ zc->zc_cd = 0;
+ zc->zc_objset = os;
+ zc->zc_zapobj = zapobj;
+}
+
+/*
+ * We want to keep the high 32 bits of the cursor zero if we can, so
+ * that 32-bit programs can access this. So use a small hash value so
+ * we can fit 4 bits of cd into the 32-bit cursor.
+ *
+ * [ 32 zero bits | 4-bit collision differentiator | 28-bit hash value ]
+ */
+/*
+ * Restore a cursor from the value produced by
+ * zap_cursor_serialize().  -1 means EOF; a corrupt cd field is
+ * reset to 0.
+ */
+void
+zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
+ uint64_t serialized)
+{
+ zc->zc_objset = os;
+ zc->zc_zapobj = zapobj;
+ if (serialized == -1ULL) {
+ zc->zc_hash = -1ULL;
+ zc->zc_cd = 0;
+ } else {
+ zc->zc_hash = serialized << (64-ZAP_HASHBITS);
+ zc->zc_cd = serialized >> ZAP_HASHBITS;
+ if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */
+ zc->zc_cd = 0;
+ }
+}
+
+/*
+ * Pack the cursor position into one 64-bit value: the cd field
+ * above the shifted-down hash.  -1 marks end-of-iteration.
+ */
+uint64_t
+zap_cursor_serialize(zap_cursor_t *zc)
+{
+ uint64_t ser;
+
+ if (zc->zc_hash == -1ULL)
+ return (-1ULL);
+ ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0);
+ ASSERT(zc->zc_cd < ZAP_MAXCD);
+ ser = zc->zc_hash >> (64-ZAP_HASHBITS);
+ ser |= (uint64_t)zc->zc_cd << ZAP_HASHBITS;
+ return (ser);
+}
+
+/*
+ * Fetch the attribute at (or after) the current cursor position
+ * into *za and move the cursor onto that entry.  Returns ENOENT
+ * when iteration is exhausted.
+ */
+int
+zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
+{
+ zap_t *zap;
+ int err;
+ avl_index_t idx;
+ mzap_ent_t mze_tofind;
+ mzap_ent_t *mze;
+
+ if (zc->zc_hash == -1ULL)
+ return (ENOENT);
+
+ err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
+ RW_READER, TRUE, &zap);
+ if (err)
+ return (err);
+ if (!zap->zap_ismicro) {
+ err = fzap_cursor_retrieve(zap, zc, za);
+ } else {
+ err = ENOENT;
+
+ /* find the first AVL node at or after (hash, cd) */
+ mze_tofind.mze_hash = zc->zc_hash;
+ mze_tofind.mze_phys.mze_cd = zc->zc_cd;
+
+ mze = avl_find(&zap->zap_m.zap_avl, &mze_tofind, &idx);
+ ASSERT(mze == NULL || 0 == bcmp(&mze->mze_phys,
+ &zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
+ sizeof (mze->mze_phys)));
+ if (mze == NULL)
+ mze = avl_nearest(&zap->zap_m.zap_avl, idx, AVL_AFTER);
+
+ if (mze) {
+ za->za_integer_length = 8;
+ za->za_num_integers = 1;
+ za->za_first_integer = mze->mze_phys.mze_value;
+ (void) strcpy(za->za_name, mze->mze_phys.mze_name);
+ zc->zc_hash = mze->mze_hash;
+ zc->zc_cd = mze->mze_phys.mze_cd;
+ err = 0;
+ } else {
+ zc->zc_hash = -1ULL;
+ }
+ }
+ zap_unlockdir(zap);
+ return (err);
+}
+
+/*
+ * Advance the cursor to the next possible position: bump the cd,
+ * and when it wraps, step to the next hash value.  A hash wrap to
+ * zero marks end-of-iteration (-1).
+ */
+void
+zap_cursor_advance(zap_cursor_t *zc)
+{
+ if (zc->zc_hash == -1ULL)
+ return;
+ if (++zc->zc_cd < ZAP_MAXCD)
+ return;
+ zc->zc_cd = 0;
+ zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS);
+ if (zc->zc_hash == 0) /* EOF */
+ zc->zc_hash = -1ULL;
+}
+
+/*
+ * Fill *zs with statistics about the zap object: a microzap reports
+ * just its single block, a fatzap gathers the full histograms.
+ */
+int
+zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
+{
+ int err;
+ zap_t *zap;
+
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
+ if (err)
+ return (err);
+
+ bzero(zs, sizeof (zap_stats_t));
+
+ if (zap->zap_ismicro) {
+ zs->zs_blocksize = zap->zap_dbuf->db_size;
+ zs->zs_num_entries = zap->zap_m.zap_num_entries;
+ zs->zs_num_blocks = 1;
+ } else {
+ fzap_get_stats(zap, zs);
+ }
+ zap_unlockdir(zap);
+ return (0);
+}
diff --git a/usr/src/uts/common/fs/zfs/zfs.conf b/usr/src/uts/common/fs/zfs/zfs.conf
new file mode 100644
index 0000000000..09881909b8
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs.conf
@@ -0,0 +1,28 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+name="zfs" parent="pseudo";
diff --git a/usr/src/uts/common/fs/zfs/zfs_acl.c b/usr/src/uts/common/fs/zfs/zfs_acl.c
new file mode 100644
index 0000000000..960de720d1
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_acl.c
@@ -0,0 +1,1537 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/fs/zfs.h>
+#include <sys/mode.h>
+#include <sys/policy.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <util/qsort.h>
+#include "fs/fs_subr.h"
+#include <acl/acl_common.h>
+
+#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE
+#define DENY ACE_ACCESS_DENIED_ACE_TYPE
+
+#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP)
+#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \
+ ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE)
+#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+#define WRITE_MASK (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS| \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|ACE_WRITE_OWNER)
+
+#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \
+ ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE)
+
+#define SECURE_NO_INHERIT (ACE_WRITE_ACL|ACE_WRITE_OWNER)
+
+#define OGE_PAD 6 /* traditional owner/group/everyone ACES */
+
+static int zfs_ace_can_use(znode_t *zp, ace_t *);
+
+/*
+ * Allocate a zfs_acl_t with room for "slots" ACEs.  With slots == 0 the
+ * caller is expected to point z_acl at external storage (see
+ * zfs_acl_node_read_internal()); z_state then records that the ACE array
+ * is not separately allocated and must not be freed.
+ */
+static zfs_acl_t *
+zfs_acl_alloc(int slots)
+{
+	zfs_acl_t *aclp;
+
+	aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP);
+	if (slots != 0) {
+		aclp->z_acl = kmem_alloc(ZFS_ACL_SIZE(slots), KM_SLEEP);
+		aclp->z_acl_count = 0;
+		aclp->z_state = ACL_DATA_ALLOCED;
+	} else {
+		aclp->z_state = 0;
+	}
+	aclp->z_slots = slots;
+	return (aclp);
+}
+
+/*
+ * Free a zfs_acl_t.  The ACE array is released only if it was allocated
+ * by us (ACL_DATA_ALLOCED); otherwise it aliases znode phys storage.
+ */
+void
+zfs_acl_free(zfs_acl_t *aclp)
+{
+	if (aclp->z_state == ACL_DATA_ALLOCED) {
+		kmem_free(aclp->z_acl, ZFS_ACL_SIZE(aclp->z_slots));
+	}
+	kmem_free(aclp, sizeof (zfs_acl_t));
+}
+
+/*
+ * Collapse an NFSv4 access mask into unix rwx "other" bits
+ * (S_IROTH/S_IWOTH/S_IXOTH); callers shift as needed.
+ */
+static uint32_t
+zfs_v4_to_unix(uint32_t access_mask)
+{
+	uint32_t new_mask = 0;
+
+	if (access_mask & (ACE_READ_DATA | ACE_LIST_DIRECTORY))
+		new_mask |= S_IROTH;
+	if (access_mask & (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_ADD_FILE))
+		new_mask |= S_IWOTH;
+	if (access_mask & (ACE_EXECUTE|ACE_READ_NAMED_ATTRS))
+		new_mask |= S_IXOTH;
+
+	return (new_mask);
+}
+
+/*
+ * Convert unix access mask to v4 access mask
+ */
+static uint32_t
+zfs_unix_to_v4(uint32_t access_mask)
+{
+ uint32_t new_mask = 0;
+
+ if (access_mask & 01)
+ new_mask |= (ACE_EXECUTE);
+ if (access_mask & 02) {
+ new_mask |= (ACE_WRITE_DATA);
+ } if (access_mask & 04) {
+ new_mask |= ACE_READ_DATA;
+ }
+ return (new_mask);
+}
+
+/*
+ * Initialize a single ACE in place.  "entry_type" is stored in a_flags
+ * (ACE_OWNER, OWNING_GROUP, ACE_EVERYONE, or 0 for an explicit who).
+ */
+static void
+zfs_set_ace(ace_t *zacep, uint32_t access_mask, int access_type,
+    uid_t uid, int entry_type)
+{
+	zacep->a_access_mask = access_mask;
+	zacep->a_type = access_type;
+	zacep->a_who = uid;
+	zacep->a_flags = entry_type;
+}
+
+/*
+ * Derive the unix mode implied by an ACL.
+ *
+ * Walks the ACEs in order; the first ACE that mentions a given rwx bit
+ * for a given class wins (tracked in "seen"), with ALLOW setting the bit
+ * and DENY leaving it clear.  everyone@ entries apply to all three
+ * classes.  The file-type and setuid/setgid/sticky bits are preserved
+ * from the existing zp_mode.
+ */
+static uint64_t
+zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
+{
+	int	i;
+	int	entry_type;
+	mode_t	mode = (zp->z_phys->zp_mode &
+	    (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
+	mode_t	seen = 0;
+	ace_t 	*acep;
+
+	for (i = 0, acep = aclp->z_acl;
+	    i != aclp->z_acl_count; i++, acep++) {
+		/* 0xf040 masks the entry-type bits of a_flags */
+		entry_type = (acep->a_flags & 0xf040);
+		if (entry_type == ACE_OWNER) {
+			if ((acep->a_access_mask & ACE_READ_DATA) &&
+			    (!(seen & S_IRUSR))) {
+				seen |= S_IRUSR;
+				if (acep->a_type == ALLOW) {
+					mode |= S_IRUSR;
+				}
+			}
+			if ((acep->a_access_mask & ACE_WRITE_DATA) &&
+			    (!(seen & S_IWUSR))) {
+				seen |= S_IWUSR;
+				if (acep->a_type == ALLOW) {
+					mode |= S_IWUSR;
+				}
+			}
+			if ((acep->a_access_mask & ACE_EXECUTE) &&
+			    (!(seen & S_IXUSR))) {
+				seen |= S_IXUSR;
+				if (acep->a_type == ALLOW) {
+					mode |= S_IXUSR;
+				}
+			}
+		} else if (entry_type == OWNING_GROUP) {
+			if ((acep->a_access_mask & ACE_READ_DATA) &&
+			    (!(seen & S_IRGRP))) {
+				seen |= S_IRGRP;
+				if (acep->a_type == ALLOW) {
+					mode |= S_IRGRP;
+				}
+			}
+			if ((acep->a_access_mask & ACE_WRITE_DATA) &&
+			    (!(seen & S_IWGRP))) {
+				seen |= S_IWGRP;
+				if (acep->a_type == ALLOW) {
+					mode |= S_IWGRP;
+				}
+			}
+			if ((acep->a_access_mask & ACE_EXECUTE) &&
+			    (!(seen & S_IXGRP))) {
+				seen |= S_IXGRP;
+				if (acep->a_type == ALLOW) {
+					mode |= S_IXGRP;
+				}
+			}
+		} else if (entry_type == ACE_EVERYONE) {
+			/* everyone@ settles any still-unseen bit in all classes */
+			if ((acep->a_access_mask & ACE_READ_DATA)) {
+				if (!(seen & S_IRUSR)) {
+					seen |= S_IRUSR;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IRUSR;
+					}
+				}
+				if (!(seen & S_IRGRP)) {
+					seen |= S_IRGRP;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IRGRP;
+					}
+				}
+				if (!(seen & S_IROTH)) {
+					seen |= S_IROTH;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IROTH;
+					}
+				}
+			}
+			if ((acep->a_access_mask & ACE_WRITE_DATA)) {
+				if (!(seen & S_IWUSR)) {
+					seen |= S_IWUSR;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IWUSR;
+					}
+				}
+				if (!(seen & S_IWGRP)) {
+					seen |= S_IWGRP;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IWGRP;
+					}
+				}
+				if (!(seen & S_IWOTH)) {
+					seen |= S_IWOTH;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IWOTH;
+					}
+				}
+			}
+			if ((acep->a_access_mask & ACE_EXECUTE)) {
+				if (!(seen & S_IXUSR)) {
+					seen |= S_IXUSR;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IXUSR;
+					}
+				}
+				if (!(seen & S_IXGRP)) {
+					seen |= S_IXGRP;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IXGRP;
+					}
+				}
+				if (!(seen & S_IXOTH)) {
+					seen |= S_IXOTH;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IXOTH;
+					}
+				}
+			}
+		}
+	}
+	return (mode);
+}
+
+/*
+ * Wrap the ACL embedded in the znode phys.  The returned aclp aliases
+ * zp_acl.z_ace_data (z_state == 0), so zfs_acl_free() will not free it.
+ */
+static zfs_acl_t *
+zfs_acl_node_read_internal(znode_t *zp)
+{
+	zfs_acl_t	*aclp;
+
+	aclp = zfs_acl_alloc(0);
+	aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count;
+	aclp->z_acl = &zp->z_phys->zp_acl.z_ace_data[0];
+
+	return (aclp);
+}
+
+/*
+ * Read an external acl object.
+ */
+zfs_acl_t *
+zfs_acl_node_read(znode_t *zp)
+{
+ uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj;
+ zfs_acl_t *aclp;
+
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+ if (zp->z_phys->zp_acl.z_acl_extern_obj == 0)
+ return (zfs_acl_node_read_internal(zp));
+
+ aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_count);
+
+ dmu_read(zp->z_zfsvfs->z_os, extacl, 0,
+ ZFS_ACL_SIZE(zp->z_phys->zp_acl.z_acl_count), aclp->z_acl);
+
+ aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count;
+
+ return (aclp);
+}
+
+/*
+ * Validate a caller-supplied ACE array.
+ *
+ * Checks the count, the type of each entry (normalizing a_who to -1 for
+ * owner@/group@/everyone@ entries), and inheritance-flag sanity (only
+ * directories may carry inherit flags; INHERIT_ONLY/NO_PROPAGATE require
+ * a file or directory inherit flag).  *inherit is set to 1 if any ACE is
+ * inheritable.  Returns B_TRUE if the ACL is acceptable.
+ */
+static boolean_t
+zfs_acl_valid(znode_t *zp, ace_t *uace, int aclcnt, int *inherit)
+{
+	ace_t 	*acep;
+	int i;
+
+	*inherit = 0;
+
+	if (aclcnt > MAX_ACL_ENTRIES || aclcnt <= 0) {
+		return (B_FALSE);
+	}
+
+	for (i = 0, acep = uace; i != aclcnt; i++, acep++) {
+
+		/*
+		 * first check type of entry
+		 */
+
+		switch (acep->a_flags & 0xf040) {
+		case ACE_OWNER:
+			acep->a_who = -1;
+			break;
+		case (ACE_IDENTIFIER_GROUP | ACE_GROUP):
+		case ACE_IDENTIFIER_GROUP:
+			if (acep->a_flags & ACE_GROUP) {
+				acep->a_who = -1;
+			}
+			break;
+		case ACE_EVERYONE:
+			acep->a_who = -1;
+			break;
+		}
+
+		/*
+		 * next check inheritance level flags
+		 */
+
+		if (acep->a_type != ALLOW && acep->a_type != DENY)
+			return (B_FALSE);
+
+		/*
+		 * Only directories should have inheritance flags.
+		 */
+		if (ZTOV(zp)->v_type != VDIR && (acep->a_flags &
+		    (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE|
+		    ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE))) {
+			return (B_FALSE);
+		}
+
+		if (acep->a_flags &
+		    (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))
+			*inherit = 1;
+
+		if (acep->a_flags &
+		    (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) {
+			if ((acep->a_flags & (ACE_FILE_INHERIT_ACE|
+			    ACE_DIRECTORY_INHERIT_ACE)) == 0) {
+				return (B_FALSE);
+			}
+		}
+	}
+
+	return (B_TRUE);
+}
+/*
+ * common code for setting acl's.
+ *
+ * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl.
+ * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's
+ * already checked the acl and knows whether to inherit.
+ */
+int
+zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, dmu_tx_t *tx, int *ihp)
+{
+ int inherit = 0;
+ int error;
+ znode_phys_t *zphys = zp->z_phys;
+ zfs_znode_acl_t *zacl = &zphys->zp_acl;
+ uint32_t acl_phys_size = ZFS_ACL_SIZE(aclp->z_acl_count);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t aoid = zphys->zp_acl.z_acl_extern_obj;
+
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+ if (ihp)
+ inherit = *ihp; /* already determined by caller */
+ else if (!zfs_acl_valid(zp, aclp->z_acl,
+ aclp->z_acl_count, &inherit)) {
+ return (EINVAL);
+ }
+
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+
+ /*
+ * Will ACL fit internally?
+ */
+ if (aclp->z_acl_count > ACE_SLOT_CNT) {
+ if (aoid == 0) {
+ aoid = dmu_object_alloc(zfsvfs->z_os,
+ DMU_OT_ACL, acl_phys_size, DMU_OT_NONE, 0, tx);
+ } else {
+ (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid,
+ acl_phys_size, 0, tx);
+ }
+ zphys->zp_acl.z_acl_extern_obj = aoid;
+ zphys->zp_acl.z_acl_count = aclp->z_acl_count;
+ dmu_write(zfsvfs->z_os, aoid, 0,
+ acl_phys_size, aclp->z_acl, tx);
+ } else {
+ /*
+ * Migrating back embedded?
+ */
+ if (zphys->zp_acl.z_acl_extern_obj) {
+ error = dmu_object_free(zfsvfs->z_os,
+ zp->z_phys->zp_acl.z_acl_extern_obj, tx);
+ if (error)
+ return (error);
+ zphys->zp_acl.z_acl_extern_obj = 0;
+ }
+ bcopy(aclp->z_acl, zacl->z_ace_data,
+ aclp->z_acl_count * sizeof (ace_t));
+ zacl->z_acl_count = aclp->z_acl_count;
+ }
+ if (inherit)
+ zp->z_phys->zp_flags |= ZFS_INHERIT_ACE;
+ else
+ zp->z_phys->zp_flags &= ~ZFS_INHERIT_ACE;
+
+ zphys->zp_mode = zfs_mode_compute(zp, aclp);
+ zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+
+ return (0);
+}
+
+/*
+ * Create space for slots_needed ACEs to be append
+ * to aclp.
+ */
+static void
+zfs_acl_append(zfs_acl_t *aclp, int slots_needed)
+{
+ ace_t *newacep;
+ ace_t *oldaclp;
+ int slot_cnt;
+ int slots_left = aclp->z_slots - aclp->z_acl_count;
+
+ if (aclp->z_state == ACL_DATA_ALLOCED)
+ ASSERT(aclp->z_slots >= aclp->z_acl_count);
+ if (slots_left < slots_needed || aclp->z_state != ACL_DATA_ALLOCED) {
+ slot_cnt = aclp->z_slots + 1 + (slots_needed - slots_left);
+ newacep = kmem_alloc(ZFS_ACL_SIZE(slot_cnt), KM_SLEEP);
+ bcopy(aclp->z_acl, newacep,
+ ZFS_ACL_SIZE(aclp->z_acl_count));
+ oldaclp = aclp->z_acl;
+ if (aclp->z_state == ACL_DATA_ALLOCED)
+ kmem_free(oldaclp, ZFS_ACL_SIZE(aclp->z_slots));
+ aclp->z_acl = newacep;
+ aclp->z_slots = slot_cnt;
+ aclp->z_state = ACL_DATA_ALLOCED;
+ }
+}
+
+/*
+ * Remove "slot" ACE from aclp
+ */
+static void
+zfs_ace_remove(zfs_acl_t *aclp, int slot)
+{
+ if (aclp->z_acl_count > 1) {
+ (void) memmove(&aclp->z_acl[slot],
+ &aclp->z_acl[slot +1], sizeof (ace_t) *
+ (--aclp->z_acl_count - slot));
+ } else
+ aclp->z_acl_count--;
+}
+
+/*
+ * Update access mask for prepended ACE
+ *
+ * This applies the "groupmask" value for aclmode property.
+ */
+static void
+zfs_acl_prepend_fixup(ace_t *acep, ace_t *origacep, mode_t mode, uid_t owner)
+{
+
+ int rmask, wmask, xmask;
+ int user_ace;
+
+ user_ace = (!(acep->a_flags &
+ (ACE_OWNER|ACE_GROUP|ACE_IDENTIFIER_GROUP)));
+
+ if (user_ace && (acep->a_who == owner)) {
+ rmask = S_IRUSR;
+ wmask = S_IWUSR;
+ xmask = S_IXUSR;
+ } else {
+ rmask = S_IRGRP;
+ wmask = S_IWGRP;
+ xmask = S_IXGRP;
+ }
+
+ if (origacep->a_access_mask & ACE_READ_DATA) {
+ if (mode & rmask)
+ acep->a_access_mask &= ~ACE_READ_DATA;
+ else
+ acep->a_access_mask |= ACE_READ_DATA;
+ }
+
+ if (origacep->a_access_mask & ACE_WRITE_DATA) {
+ if (mode & wmask)
+ acep->a_access_mask &= ~ACE_WRITE_DATA;
+ else
+ acep->a_access_mask |= ACE_WRITE_DATA;
+ }
+
+ if (origacep->a_access_mask & ACE_APPEND_DATA) {
+ if (mode & wmask)
+ acep->a_access_mask &= ~ACE_APPEND_DATA;
+ else
+ acep->a_access_mask |= ACE_APPEND_DATA;
+ }
+
+ if (origacep->a_access_mask & ACE_EXECUTE) {
+ if (mode & xmask)
+ acep->a_access_mask &= ~ACE_EXECUTE;
+ else
+ acep->a_access_mask |= ACE_EXECUTE;
+ }
+}
+
+/*
+ * Apply mode to canonical six ACEs.
+ */
+static void
+zfs_acl_fixup_canonical_six(zfs_acl_t *aclp, mode_t mode)
+{
+ int cnt;
+ ace_t *acep;
+
+ cnt = aclp->z_acl_count -1;
+ acep = aclp->z_acl;
+
+ /*
+ * Fixup final ACEs to match the mode
+ */
+
+ ASSERT(cnt >= 5);
+ adjust_ace_pair(&acep[cnt - 1], mode); /* everyone@ */
+ adjust_ace_pair(&acep[cnt - 3], (mode & 0070) >> 3); /* group@ */
+ adjust_ace_pair(&acep[cnt - 5], (mode & 0700) >> 6); /* owner@ */
+}
+
+
+/*
+ * Does an ACE exactly match the given allow/deny type, entry type
+ * (the 0xf040 bits of a_flags), and access mask?
+ */
+static int
+zfs_acl_ace_match(ace_t *acep, int allow_deny, int type, int mask)
+{
+	return (acep->a_access_mask == mask && acep->a_type == allow_deny &&
+	    ((acep->a_flags & 0xf040) == type));
+}
+
+/*
+ * Can prepended ACE be reused?
+ */
+static int
+zfs_reuse_deny(ace_t *acep, int i)
+{
+ int okay_masks;
+
+ if (i < 1)
+ return (B_FALSE);
+
+ if (acep[i-1].a_type != DENY)
+ return (B_FALSE);
+
+ if (acep[i-1].a_flags != (acep[i].a_flags & ACE_IDENTIFIER_GROUP))
+ return (B_FALSE);
+
+ okay_masks = (acep[i].a_access_mask & OKAY_MASK_BITS);
+
+ if (acep[i-1].a_access_mask & ~okay_masks)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+/*
+ * Create space to prepend an ACE
+ */
+static void
+zfs_acl_prepend(zfs_acl_t *aclp, int i)
+{
+ ace_t *oldaclp = NULL;
+ ace_t *to, *from;
+ int slots_left = aclp->z_slots - aclp->z_acl_count;
+ int oldslots;
+ int need_free = 0;
+
+ if (aclp->z_state == ACL_DATA_ALLOCED)
+ ASSERT(aclp->z_slots >= aclp->z_acl_count);
+
+ if (slots_left == 0 || aclp->z_state != ACL_DATA_ALLOCED) {
+
+ to = kmem_alloc(ZFS_ACL_SIZE(aclp->z_acl_count +
+ OGE_PAD), KM_SLEEP);
+ if (aclp->z_state == ACL_DATA_ALLOCED)
+ need_free++;
+ from = aclp->z_acl;
+ oldaclp = aclp->z_acl;
+ (void) memmove(to, from,
+ sizeof (ace_t) * aclp->z_acl_count);
+ aclp->z_state = ACL_DATA_ALLOCED;
+ } else {
+ from = aclp->z_acl;
+ to = aclp->z_acl;
+ }
+
+
+ (void) memmove(&to[i + 1], &from[i],
+ sizeof (ace_t) * (aclp->z_acl_count - i));
+
+ if (oldaclp) {
+ aclp->z_acl = to;
+ oldslots = aclp->z_slots;
+ aclp->z_slots = aclp->z_acl_count + OGE_PAD;
+ if (need_free)
+ kmem_free(oldaclp, ZFS_ACL_SIZE(oldslots));
+ }
+
+}
+
+/*
+ * Prepend deny ACE
+ */
+static void
+zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, int i,
+ mode_t mode)
+{
+ ace_t *acep;
+
+ zfs_acl_prepend(aclp, i);
+
+ acep = aclp->z_acl;
+ zfs_set_ace(&acep[i], 0, DENY, acep[i + 1].a_who,
+ (acep[i + 1].a_flags & 0xf040));
+ zfs_acl_prepend_fixup(&acep[i], &acep[i+1], mode, zp->z_phys->zp_uid);
+ aclp->z_acl_count++;
+}
+
+/*
+ * Split an inherited ACE into inherit_only ACE
+ * and original ACE with inheritance flags stripped off.
+ */
+static void
+zfs_acl_split_ace(zfs_acl_t *aclp, int i)
+{
+ ace_t *acep = aclp->z_acl;
+
+ zfs_acl_prepend(aclp, i);
+ acep = aclp->z_acl;
+ acep[i] = acep[i + 1];
+ acep[i].a_flags |= ACE_INHERIT_ONLY_ACE;
+ acep[i + 1].a_flags &= ~ALL_INHERIT;
+ aclp->z_acl_count++;
+}
+
+/*
+ * Are ACES started at index i, the canonical six ACES?
+ */
+static int
+zfs_have_canonical_six(zfs_acl_t *aclp, int i)
+{
+ ace_t *acep = aclp->z_acl;
+
+ if ((zfs_acl_ace_match(&acep[i],
+ DENY, ACE_OWNER, 0) &&
+ zfs_acl_ace_match(&acep[i + 1], ALLOW, ACE_OWNER,
+ OWNER_ALLOW_MASK) && zfs_acl_ace_match(&acep[i + 2],
+ DENY, OWNING_GROUP, 0) && zfs_acl_ace_match(&acep[i + 3],
+ ALLOW, OWNING_GROUP, 0) && zfs_acl_ace_match(&acep[i + 4],
+ DENY, ACE_EVERYONE, EVERYONE_DENY_MASK) &&
+ zfs_acl_ace_match(&acep[i + 5], ALLOW, ACE_EVERYONE,
+ EVERYONE_ALLOW_MASK))) {
+ return (1);
+ } else {
+ return (0);
+ }
+}
+
+/*
+ * Apply step 1g, to group entries
+ *
+ * Need to deal with corner case where group may have
+ * greater permissions than owner. If so then limit
+ * group permissions, based on what extra permissions
+ * group has.
+ */
+static void
+zfs_fixup_group_entries(ace_t *acep, mode_t mode)
+{
+ mode_t extramode = (mode >> 3) & 07;
+ mode_t ownermode = (mode >> 6);
+
+ if (acep[0].a_flags & ACE_IDENTIFIER_GROUP) {
+
+ extramode &= ~ownermode;
+
+ if (extramode) {
+ if (extramode & 04) {
+ acep[0].a_access_mask &= ~ACE_READ_DATA;
+ acep[1].a_access_mask &= ~ACE_READ_DATA;
+ }
+ if (extramode & 02) {
+ acep[0].a_access_mask &=
+ ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+ acep[1].a_access_mask &=
+ ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+ }
+ if (extramode & 01) {
+ acep[0].a_access_mask &= ~ACE_EXECUTE;
+ acep[1].a_access_mask &= ~ACE_EXECUTE;
+ }
+ }
+ }
+}
+
+/*
+ * Apply the chmod algorithm as described
+ * in PSARC/2002/240
+ */
+static int
+zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp,
+ dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ ace_t *acep;
+ int i;
+ int error;
+ int entry_type;
+ int reuse_deny;
+ int need_canonical_six = 1;
+
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+
+ i = 0;
+ while (i < aclp->z_acl_count) {
+ acep = aclp->z_acl;
+ entry_type = (acep[i].a_flags & 0xf040);
+
+ if ((acep[i].a_type != ALLOW && acep[i].a_type != DENY) ||
+ (acep[i].a_flags & ACE_INHERIT_ONLY_ACE)) {
+ i++;
+ continue;
+ }
+
+
+ if (zfsvfs->z_acl_mode == DISCARD) {
+ zfs_ace_remove(aclp, i);
+ continue;
+ }
+
+ /*
+ * Need to split ace into two?
+ */
+ if ((acep[i].a_flags & (ACE_FILE_INHERIT_ACE|
+ ACE_DIRECTORY_INHERIT_ACE)) &&
+ (!(acep[i].a_flags & ACE_INHERIT_ONLY_ACE))) {
+ zfs_acl_split_ace(aclp, i);
+ i++;
+ continue;
+ }
+
+ if (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE ||
+ (entry_type == OWNING_GROUP)) {
+ acep[i].a_access_mask &= ~OGE_CLEAR;
+ i++;
+ continue;
+
+ } else {
+ if (acep[i].a_type == ALLOW) {
+
+ /*
+ * Check preceding ACE if any, to see
+ * if we need to prepend a DENY ACE.
+ * This is only applicable when the acl_mode
+ * property == groupmask.
+ */
+ if (zfsvfs->z_acl_mode == GROUPMASK) {
+
+ reuse_deny = zfs_reuse_deny(acep, i);
+
+ if (reuse_deny == B_FALSE) {
+ zfs_acl_prepend_deny(zp, aclp,
+ i, mode);
+ i++;
+ acep = aclp->z_acl;
+ } else {
+ zfs_acl_prepend_fixup(
+ &acep[i - 1],
+ &acep[i], mode,
+ zp->z_phys->zp_uid);
+ }
+ zfs_fixup_group_entries(&acep[i - 1],
+ mode);
+ }
+ }
+ i++;
+ }
+ }
+
+ /*
+ * Check out last six aces, if we have six.
+ */
+
+ if (aclp->z_acl_count >= 6) {
+ i = aclp->z_acl_count - 6;
+
+ if (zfs_have_canonical_six(aclp, i)) {
+ need_canonical_six = 0;
+ }
+ }
+
+ if (need_canonical_six) {
+
+ zfs_acl_append(aclp, 6);
+ i = aclp->z_acl_count;
+ acep = aclp->z_acl;
+ zfs_set_ace(&acep[i++], 0, DENY, -1, ACE_OWNER);
+ zfs_set_ace(&acep[i++], OWNER_ALLOW_MASK, ALLOW, -1, ACE_OWNER);
+ zfs_set_ace(&acep[i++], 0, DENY, -1, OWNING_GROUP);
+ zfs_set_ace(&acep[i++], 0, ALLOW, -1, OWNING_GROUP);
+ zfs_set_ace(&acep[i++], EVERYONE_DENY_MASK,
+ DENY, -1, ACE_EVERYONE);
+ zfs_set_ace(&acep[i++], EVERYONE_ALLOW_MASK,
+ ALLOW, -1, ACE_EVERYONE);
+ aclp->z_acl_count += 6;
+ }
+
+ zfs_acl_fixup_canonical_six(aclp, mode);
+
+ zp->z_phys->zp_mode = mode;
+ error = zfs_aclset_common(zp, aclp, tx, NULL);
+ return (error);
+}
+
+
+/*
+ * Read the znode's current ACL and apply the chmod algorithm for
+ * "mode" to it within the supplied tx.  Caller must hold z_lock.
+ */
+int
+zfs_acl_chmod_setattr(znode_t *zp, uint64_t mode, dmu_tx_t *tx)
+{
+	zfs_acl_t *aclp;
+	int error;
+
+	ASSERT(MUTEX_HELD(&zp->z_lock));
+	mutex_enter(&zp->z_acl_lock);
+	aclp = zfs_acl_node_read(zp);
+	error = zfs_acl_chmod(zp, mode, aclp, tx);
+	mutex_exit(&zp->z_acl_lock);
+	zfs_acl_free(aclp);
+	return (error);
+}
+
+/*
+ * strip off write_owner and write_acl
+ */
+static void
+zfs_securemode_update(zfsvfs_t *zfsvfs, ace_t *acep)
+{
+ if ((zfsvfs->z_acl_inherit == SECURE) &&
+ acep->a_type == ALLOW)
+ acep->a_access_mask &= ~SECURE_NO_INHERIT;
+}
+
+/*
+ * inherit inheritable ACEs from parent
+ */
+static zfs_acl_t *
+zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ ace_t *pacep;
+ ace_t *acep;
+ int ace_cnt = 0;
+ int pace_cnt;
+ int i, j;
+ zfs_acl_t *aclp = NULL;
+
+ i = j = 0;
+ pace_cnt = paclp->z_acl_count;
+ pacep = paclp->z_acl;
+ if (zfsvfs->z_acl_inherit != DISCARD) {
+ for (i = 0; i != pace_cnt; i++) {
+
+ if (zfsvfs->z_acl_inherit == NOALLOW &&
+ pacep[i].a_type == ALLOW)
+ continue;
+
+ if (zfs_ace_can_use(zp, &pacep[i])) {
+ ace_cnt++;
+ if (!(pacep[i].a_flags &
+ ACE_NO_PROPAGATE_INHERIT_ACE))
+ ace_cnt++;
+ }
+ }
+ }
+
+ aclp = zfs_acl_alloc(ace_cnt + OGE_PAD);
+ if (ace_cnt && zfsvfs->z_acl_inherit != DISCARD) {
+ acep = aclp->z_acl;
+ pacep = paclp->z_acl;
+ for (i = 0; i != pace_cnt; i++) {
+
+ if (zfsvfs->z_acl_inherit == NOALLOW &&
+ pacep[i].a_type == ALLOW)
+ continue;
+
+ if (zfs_ace_can_use(zp, &pacep[i])) {
+ /*
+ * Now create entry for inherited ace
+ */
+ acep[j] = pacep[i];
+
+ if (pacep[i].a_flags &
+ ACE_NO_PROPAGATE_INHERIT_ACE) {
+ acep[j].a_flags &= ~ALL_INHERIT;
+ j++;
+ continue;
+ }
+
+ if (pacep[i].a_type != ALLOW &&
+ pacep[i].a_type != DENY) {
+ zfs_securemode_update(zfsvfs, &acep[j]);
+ j++;
+ continue;
+ }
+
+ if (ZTOV(zp)->v_type != VDIR) {
+ acep[j].a_flags &= ~ALL_INHERIT;
+ zfs_securemode_update(zfsvfs, &acep[j]);
+ j++;
+ continue;
+ }
+
+ ASSERT(ZTOV(zp)->v_type == VDIR);
+
+ /*
+ * If we are inheriting an ACE targeted for
+ * only files, then leave the inherit_only
+ * one for future propagation.
+ */
+ if ((acep[j].a_flags & (ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE)) !=
+ ACE_FILE_INHERIT_ACE)
+ acep[j].a_flags &=
+ ~ACE_INHERIT_ONLY_ACE;
+
+ zfs_securemode_update(zfsvfs, &acep[j]);
+ j++;
+ }
+ }
+ }
+ aclp->z_acl_count = j;
+ ASSERT(aclp->z_slots >= aclp->z_acl_count);
+
+ return (aclp);
+}
+
+/*
+ * Create file system object initial permissions
+ * including inheritable ACEs.
+ */
+void
+zfs_perm_init(znode_t *zp, znode_t *parent, int flag,
+ vattr_t *vap, dmu_tx_t *tx, cred_t *cr)
+{
+ uint64_t mode;
+ uid_t uid;
+ gid_t gid;
+ int error;
+ int pull_down;
+ zfs_acl_t *aclp, *paclp;
+
+ mode = MAKEIMODE(vap->va_type, vap->va_mode);
+
+ /*
+ * Determine uid and gid.
+ */
+ if ((flag & (IS_ROOT_NODE | IS_REPLAY)) ||
+ ((flag & IS_XATTR) && (vap->va_type == VDIR))) {
+ uid = vap->va_uid;
+ gid = vap->va_gid;
+ } else {
+ uid = crgetuid(cr);
+ if ((vap->va_mask & AT_GID) &&
+ ((vap->va_gid == parent->z_phys->zp_gid) ||
+ groupmember(vap->va_gid, cr) ||
+ secpolicy_vnode_create_gid(cr)))
+ gid = vap->va_gid;
+ else
+ gid = (parent->z_phys->zp_mode & S_ISGID) ?
+ parent->z_phys->zp_gid : crgetgid(cr);
+ }
+
+ /*
+ * If we're creating a directory, and the parent directory has the
+ * set-GID bit set, set in on the new directory.
+ * Otherwise, if the user is neither privileged nor a member of the
+ * file's new group, clear the file's set-GID bit.
+ */
+
+ if ((parent->z_phys->zp_mode & S_ISGID) && (vap->va_type == VDIR))
+ mode |= S_ISGID;
+ else {
+ if ((mode & S_ISGID) &&
+ secpolicy_vnode_setids_setgids(cr, gid) != 0)
+ mode &= ~S_ISGID;
+ }
+
+ zp->z_phys->zp_uid = uid;
+ zp->z_phys->zp_gid = gid;
+ zp->z_phys->zp_mode = mode;
+
+ mutex_enter(&parent->z_lock);
+ pull_down = (parent->z_phys->zp_flags & ZFS_INHERIT_ACE);
+ if (pull_down) {
+ mutex_enter(&parent->z_acl_lock);
+ paclp = zfs_acl_node_read(parent);
+ mutex_exit(&parent->z_acl_lock);
+ aclp = zfs_acl_inherit(zp, paclp);
+ zfs_acl_free(paclp);
+ } else {
+ aclp = zfs_acl_alloc(6);
+ }
+ mutex_exit(&parent->z_lock);
+ mutex_enter(&zp->z_lock);
+ mutex_enter(&zp->z_acl_lock);
+ error = zfs_acl_chmod(zp, mode, aclp, tx);
+ mutex_exit(&zp->z_lock);
+ mutex_exit(&zp->z_acl_lock);
+ ASSERT3U(error, ==, 0);
+ zfs_acl_free(aclp);
+}
+
+/*
+ * Can use be used for inheritance
+ */
+static int
+zfs_ace_can_use(znode_t *zp, ace_t *acep)
+{
+ int vtype = ZTOV(zp)->v_type;
+
+ int iflags = (acep->a_flags & 0xf);
+
+ if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE))
+ return (1);
+
+ else if (iflags & ACE_FILE_INHERIT_ACE)
+ return (1);
+
+ return (0);
+}
+
+/*
+ * Retrieve a files ACL
+ */
+int
+zfs_getacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr)
+{
+ zfs_acl_t *aclp;
+ ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
+ int error;
+
+ if (error = zfs_zaccess(zp, ACE_READ_ACL, cr)) {
+ /*
+ * If owner of file then allow reading of the
+ * ACL.
+ */
+ if (crgetuid(cr) != zp->z_phys->zp_uid)
+ return (error);
+ }
+
+ if (mask == 0)
+ return (ENOSYS);
+
+ mutex_enter(&zp->z_acl_lock);
+
+ aclp = zfs_acl_node_read(zp);
+
+ if (mask & VSA_ACECNT) {
+ vsecp->vsa_aclcnt = aclp->z_acl_count;
+ }
+
+ if (mask & VSA_ACE) {
+ vsecp->vsa_aclentp = kmem_alloc(aclp->z_acl_count *
+ sizeof (ace_t), KM_SLEEP);
+ bcopy(aclp->z_acl, vsecp->vsa_aclentp,
+ aclp->z_acl_count * sizeof (ace_t));
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+
+ zfs_acl_free(aclp);
+
+ return (0);
+}
+
+/*
+ * Set a files ACL
+ */
+int
+zfs_setacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ ace_t *acep = vsecp->vsa_aclentp;
+ int aclcnt = vsecp->vsa_aclcnt;
+ ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
+ dmu_tx_t *tx;
+ int error;
+ int inherit;
+ zfs_acl_t *aclp;
+ uint64_t seq = 0;
+
+ if (mask == 0)
+ return (EINVAL);
+
+ if (!zfs_acl_valid(zp, acep, aclcnt, &inherit))
+ return (EINVAL);
+top:
+ error = zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr);
+ if (error == EACCES || error == ACCESS_UNDETERMINED) {
+ if ((error = secpolicy_vnode_setdac(cr,
+ zp->z_phys->zp_uid)) != 0) {
+ return (error);
+ }
+ } else if (error) {
+ return (error == EROFS ? error : EPERM);
+ }
+
+ mutex_enter(&zp->z_lock);
+ mutex_enter(&zp->z_acl_lock);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+
+ if (zp->z_phys->zp_acl.z_acl_extern_obj) {
+ dmu_tx_hold_write(tx, zp->z_phys->zp_acl.z_acl_extern_obj,
+ 0, ZFS_ACL_SIZE(aclcnt));
+ } else if (aclcnt > ACE_SLOT_CNT) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, ZFS_ACL_SIZE(aclcnt));
+ }
+
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ dmu_tx_abort(tx);
+
+ mutex_exit(&zp->z_acl_lock);
+ mutex_exit(&zp->z_lock);
+
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+ goto top;
+ }
+ return (error);
+ }
+
+ aclp = zfs_acl_alloc(aclcnt);
+ bcopy(acep, aclp->z_acl, sizeof (ace_t) * aclcnt);
+ aclp->z_acl_count = aclcnt;
+ error = zfs_aclset_common(zp, aclp, tx, &inherit);
+ ASSERT(error == 0);
+
+ zfs_acl_free(aclp);
+ seq = zfs_log_acl(zilog, tx, TX_ACL, zp, aclcnt, acep);
+ dmu_tx_commit(tx);
+done:
+ mutex_exit(&zp->z_acl_lock);
+ mutex_exit(&zp->z_lock);
+
+ zil_commit(zilog, seq, 0);
+
+ return (error);
+}
+
+/*
+ * Evaluate one ACE against the wanted mode.  *working_mode accumulates
+ * rights granted so far.  Returns 0 once all wanted rights are granted,
+ * EACCES on a matching DENY, or ACCESS_UNDETERMINED otherwise.
+ */
+static int
+zfs_ace_access(ace_t *zacep, int mode_wanted, int *working_mode)
+{
+	if ((*working_mode & mode_wanted) == mode_wanted) {
+		return (0);
+	}
+
+	if (zacep->a_access_mask & mode_wanted) {
+		if (zacep->a_type == ALLOW) {
+			*working_mode |= (mode_wanted & zacep->a_access_mask);
+			if ((*working_mode & mode_wanted) == mode_wanted)
+				return (0);
+		} else if (zacep->a_type == DENY) {
+			return (EACCES);
+		}
+	}
+
+	/*
+	 * haven't been specifically denied at this point
+	 * so return UNDETERMINED.
+	 */
+
+	return (ACCESS_UNDETERMINED);
+}
+
+
+/*
+ * Core NFSv4-style access check: walk the ACL in order, evaluating each
+ * applicable ACE (owner@, group entries, everyone@, or explicit user)
+ * until access is granted or denied.  *working_mode returns the rights
+ * accumulated from ALLOW ACEs.  Returns 0, EACCES, EROFS (write to a
+ * read-only fs), EIO (corrupt entry type), or ACCESS_UNDETERMINED.
+ * ZIL replay (z_assign >= TXG_INITIAL) bypasses the check entirely.
+ */
+static int
+zfs_zaccess_common(znode_t *zp, int v4_mode, int *working_mode, cred_t *cr)
+{
+	zfs_acl_t	*aclp;
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	ace_t		*zacep;
+	gid_t		gid;
+	int		mode_wanted = v4_mode;
+	int		cnt;
+	int		i;
+	int		access_deny = ACCESS_UNDETERMINED;
+	uint_t		entry_type;
+	uid_t		uid = crgetuid(cr);
+
+	*working_mode = 0;
+
+	if (zfsvfs->z_assign >= TXG_INITIAL)		/* ZIL replay */
+		return (0);
+
+	if ((v4_mode & WRITE_MASK) &&
+	    (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
+	    (!IS_DEVVP(ZTOV(zp)))) {
+		return (EROFS);
+	}
+
+	mutex_enter(&zp->z_acl_lock);
+
+	aclp = zfs_acl_node_read(zp);
+
+	zacep = aclp->z_acl;
+	cnt = aclp->z_acl_count;
+
+	for (i = 0; i != cnt; i++) {
+
+		if (zacep[i].a_flags & ACE_INHERIT_ONLY_ACE)
+			continue;
+
+		entry_type = (zacep[i].a_flags & 0xf040);
+		switch (entry_type) {
+		case ACE_OWNER:
+			if (uid == zp->z_phys->zp_uid) {
+				access_deny = zfs_ace_access(&zacep[i],
+				    mode_wanted, working_mode);
+			}
+			break;
+		case (ACE_IDENTIFIER_GROUP | ACE_GROUP):
+		case ACE_IDENTIFIER_GROUP:
+			/*
+			 * Owning group gid is in znode not ACL
+			 */
+			if (entry_type == (ACE_IDENTIFIER_GROUP | ACE_GROUP))
+				gid = zp->z_phys->zp_gid;
+			else
+				gid = zacep[i].a_who;
+
+			if (groupmember(gid, cr)) {
+				access_deny = zfs_ace_access(&zacep[i],
+				    mode_wanted, working_mode);
+			}
+			break;
+		case ACE_EVERYONE:
+			access_deny = zfs_ace_access(&zacep[i],
+			    mode_wanted, working_mode);
+			break;
+
+		/* USER Entry */
+		default:
+			if (entry_type == 0) {
+				if (uid == zacep[i].a_who) {
+					access_deny = zfs_ace_access(&zacep[i],
+					    mode_wanted, working_mode);
+				}
+				break;
+			}
+			/* unrecognized entry type: treat ACL as corrupt */
+			zfs_acl_free(aclp);
+			mutex_exit(&zp->z_acl_lock);
+			return (EIO);
+		}
+
+		if (access_deny != ACCESS_UNDETERMINED)
+			break;
+
+	}
+
+	mutex_exit(&zp->z_acl_lock);
+	zfs_acl_free(aclp);
+
+	return (access_deny);
+}
+
+
+/*
+ * Determine whether Access should be granted/denied, invoking least
+ * priv subsytem when a deny is determined.
+ */
+int
+zfs_zaccess(znode_t *zp, int mode, cred_t *cr)
+{
+ int working_mode = 0;
+ int error;
+ int is_attr;
+ znode_t *xzp;
+ znode_t *check_zp = zp;
+
+ is_attr = ((zp->z_phys->zp_flags & ZFS_XATTR) &&
+ (ZTOV(zp)->v_type == VDIR));
+
+ /*
+ * If attribute then validate against base file
+ */
+ if (is_attr) {
+ if ((error = zfs_zget(zp->z_zfsvfs,
+ zp->z_phys->zp_parent, &xzp)) != 0) {
+ return (error);
+ }
+ check_zp = xzp;
+ /*
+ * fixup mode to map to xattr perms
+ */
+
+ if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
+ mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+ mode |= ACE_WRITE_NAMED_ATTRS;
+ }
+
+ if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
+ mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
+ mode |= ACE_READ_NAMED_ATTRS;
+ }
+ }
+
+ error = zfs_zaccess_common(check_zp, mode, &working_mode, cr);
+
+ if (error == EROFS) {
+ if (is_attr)
+ VN_RELE(ZTOV(xzp));
+ return (error);
+ }
+
+ if (error || (working_mode != mode)) {
+ error = secpolicy_vnode_access(cr, ZTOV(check_zp),
+ check_zp->z_phys->zp_uid, ~zfs_v4_to_unix(working_mode));
+ }
+
+ if (is_attr)
+ VN_RELE(ZTOV(xzp));
+
+ return (error);
+}
+
+/*
+ * Special zaccess function to check for special nfsv4 perm.
+ * doesn't call secpolicy_vnode_access() for failure, since that
+ * would probably be the wrong policy function to call.
+ * instead its up to the caller to handle that situation.
+ */
+
+int
+zfs_zaccess_v4_perm(znode_t *zp, int mode, cred_t *cr)
+{
+ int working_mode = 0;
+ return (zfs_zaccess_common(zp, mode, &working_mode, cr));
+}
+
+/*
+ * Translate tradition unix VREAD/VWRITE/VEXEC mode into
+ * native ACL format and call zfs_zaccess()
+ */
+int
+zfs_zaccess_rwx(znode_t *zp, mode_t mode, cred_t *cr)
+{
+ int v4_mode = zfs_unix_to_v4(mode >> 6);
+
+ return (zfs_zaccess(zp, v4_mode, cr));
+}
+
+/*
+ * Determine whether Access should be granted/deny, without
+ * consulting least priv subsystem.
+ *
+ *
+ * The following chart is the recommended NFSv4 enforcement for
+ * ability to delete an object.
+ *
+ * -------------------------------------------------------
+ * | Parent Dir | Target Object Permissions |
+ * | permissions | |
+ * -------------------------------------------------------
+ * | | ACL Allows | ACL Denies| Delete |
+ * | | Delete | Delete | unspecified|
+ * -------------------------------------------------------
+ * | ACL Allows | Permit | Permit | Permit |
+ * | DELETE_CHILD | |
+ * -------------------------------------------------------
+ * | ACL Denies | Permit | Deny | Deny |
+ * | DELETE_CHILD | | | |
+ * -------------------------------------------------------
+ * | ACL specifies | | | |
+ * | only allow | Permit | Permit | Permit |
+ * | write and | | | |
+ * | execute | | | |
+ * -------------------------------------------------------
+ * | ACL denies | | | |
+ * | write and | Permit | Deny | Deny |
+ * | execute | | | |
+ * -------------------------------------------------------
+ * ^
+ * |
+ * No search privilege, can't even look up file?
+ *
+ */
+int
+zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
+{
+ int dzp_working_mode = 0;
+ int zp_working_mode = 0;
+ int dzp_error, zp_error;
+
+ /*
+ * Arghh, this check is going to require a couple of questions
+ * to be asked. We want specific DELETE permissions to
+ * take precedence over WRITE/EXECUTE. We don't
+ * want an ACL such as this to mess us up.
+ * user:sloar:write_data:deny,user:sloar:delete:allow
+ *
+ * However, deny permissions may ultimately be overridden
+ * by secpolicy_vnode_access().
+ */
+
+ dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD,
+ &dzp_working_mode, cr);
+ zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, cr);
+
+ if (dzp_error == EROFS || zp_error == EROFS)
+ return (dzp_error);
+
+ /*
+ * First handle the first row
+ */
+ if (dzp_working_mode & ACE_DELETE_CHILD)
+ return (0);
+
+ /*
+ * Second row
+ */
+
+ if (zp_working_mode & ACE_DELETE)
+ return (0);
+
+ /*
+ * Third Row
+ */
+
+ dzp_error = zfs_zaccess_common(dzp, ACE_WRITE_DATA|ACE_EXECUTE,
+ &dzp_working_mode, cr);
+
+ if (dzp_error == EROFS)
+ return (dzp_error);
+
+ if (dzp_working_mode & (ACE_WRITE_DATA|ACE_EXECUTE))
+ return (0);
+
+ /*
+ * Fourth Row
+ */
+
+ if (((dzp_working_mode & (ACE_WRITE_DATA|ACE_EXECUTE)) == 0) &&
+ (zp_working_mode & ACE_DELETE))
+ return (0);
+
+ return (secpolicy_vnode_access(cr, ZTOV(zp), dzp->z_phys->zp_uid,
+ S_IWRITE|S_IEXEC));
+}
+
+/*
+ * Access check for rename: delete permission on the source (and on any
+ * existing target), plus add_file/add_subdirectory on the source's
+ * directory, plus the sticky-bit removal check.
+ */
+int
+zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
+    znode_t *tzp, cred_t *cr)
+{
+	int add_perm;
+	int error;
+
+	add_perm = (ZTOV(szp)->v_type == VDIR) ?
+	    ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE;
+
+	/*
+	 * Rename permissions are combination of delete permission +
+	 * add file/subdir permission.
+	 */
+
+	/*
+	 * first make sure we do the delete portion.
+	 *
+	 * If that succeeds then check for add_file/add_subdir permissions
+	 */
+
+	if (error = zfs_zaccess_delete(sdzp, szp, cr))
+		return (error);
+
+	/*
+	 * If we have a tzp, see if we can delete it?
+	 */
+	if (tzp) {
+		if (error = zfs_zaccess_delete(tdzp, tzp, cr))
+			return (error);
+	}
+
+	/*
+	 * Now check for add permissions
+	 */
+	if (error = zfs_zaccess(sdzp, add_perm, cr))
+		return (error);
+
+	error = zfs_sticky_remove_access(sdzp, szp, cr);
+
+	return (error);
+}
diff --git a/usr/src/uts/common/fs/zfs/zfs_byteswap.c b/usr/src/uts/common/fs/zfs/zfs_byteswap.c
new file mode 100644
index 0000000000..e1e857aa44
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_byteswap.c
@@ -0,0 +1,99 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/vfs.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_acl.h>
+
+/*
+ * Byteswap an array of ACEs in place.
+ *
+ *	ace	- first entry of the array
+ *	ace_cnt	- number of entries to swap
+ */
+void
+zfs_ace_byteswap(ace_t *ace, int ace_cnt)
+{
+	int i;
+
+	for (i = 0; i != ace_cnt; i++, ace++) {
+		ace->a_who = BSWAP_32(ace->a_who);
+		ace->a_access_mask = BSWAP_32(ace->a_access_mask);
+		ace->a_flags = BSWAP_16(ace->a_flags);
+		ace->a_type = BSWAP_16(ace->a_type);
+	}
+}
+
+/* ARGSUSED */
+void
+zfs_acl_byteswap(void *buf, size_t size)
+{
+	int cnt;
+
+	/*
+	 * Arggh, since we don't know how many ACEs are in
+	 * the array, we have to swap the entire block
+	 */
+
+	/* any trailing partial ace_t bytes are ignored by the division */
+	cnt = size / sizeof (ace_t);
+
+	zfs_ace_byteswap((ace_t *)buf, cnt);
+}
+
+/*
+ * Byteswap an on-disk znode_phys_t in place, field by field, including
+ * the embedded ACL header and its ACE_SLOT_CNT inline ACEs.
+ */
+void
+zfs_znode_byteswap(void *buf, size_t size)
+{
+	znode_phys_t *zp = buf;
+
+	ASSERT(size >= sizeof (znode_phys_t));
+
+	zp->zp_crtime[0] = BSWAP_64(zp->zp_crtime[0]);
+	zp->zp_crtime[1] = BSWAP_64(zp->zp_crtime[1]);
+	zp->zp_atime[0] = BSWAP_64(zp->zp_atime[0]);
+	zp->zp_atime[1] = BSWAP_64(zp->zp_atime[1]);
+	zp->zp_mtime[0] = BSWAP_64(zp->zp_mtime[0]);
+	zp->zp_mtime[1] = BSWAP_64(zp->zp_mtime[1]);
+	zp->zp_ctime[0] = BSWAP_64(zp->zp_ctime[0]);
+	zp->zp_ctime[1] = BSWAP_64(zp->zp_ctime[1]);
+	zp->zp_gen = BSWAP_64(zp->zp_gen);
+	zp->zp_mode = BSWAP_64(zp->zp_mode);
+	zp->zp_size = BSWAP_64(zp->zp_size);
+	zp->zp_parent = BSWAP_64(zp->zp_parent);
+	zp->zp_links = BSWAP_64(zp->zp_links);
+	zp->zp_xattr = BSWAP_64(zp->zp_xattr);
+	zp->zp_rdev = BSWAP_64(zp->zp_rdev);
+	zp->zp_flags = BSWAP_64(zp->zp_flags);
+	zp->zp_uid = BSWAP_64(zp->zp_uid);
+	zp->zp_gid = BSWAP_64(zp->zp_gid);
+	zp->zp_pad[0] = BSWAP_64(zp->zp_pad[0]);
+	zp->zp_pad[1] = BSWAP_64(zp->zp_pad[1]);
+	zp->zp_pad[2] = BSWAP_64(zp->zp_pad[2]);
+	zp->zp_pad[3] = BSWAP_64(zp->zp_pad[3]);
+
+	/* embedded ACL header and inline ACE array */
+	zp->zp_acl.z_acl_extern_obj = BSWAP_64(zp->zp_acl.z_acl_extern_obj);
+	zp->zp_acl.z_acl_count = BSWAP_32(zp->zp_acl.z_acl_count);
+	zp->zp_acl.z_acl_version = BSWAP_16(zp->zp_acl.z_acl_version);
+	zp->zp_acl.z_acl_state = BSWAP_16(zp->zp_acl.z_acl_state);
+	zfs_ace_byteswap(&zp->zp_acl.z_ace_data[0], ACE_SLOT_CNT);
+}
diff --git a/usr/src/uts/common/fs/zfs/zfs_ctldir.c b/usr/src/uts/common/fs/zfs/zfs_ctldir.c
new file mode 100644
index 0000000000..229b042c4a
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_ctldir.c
@@ -0,0 +1,936 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * ZFS control directory (a.k.a. ".zfs")
+ *
+ * This directory provides a common location for all ZFS meta-objects.
+ * Currently, this is only the 'snapshot' directory, but this may expand in the
+ * future. The elements are built using the GFS primitives, as the hierarchy
+ * does not actually exist on disk.
+ *
+ * For 'snapshot', we don't want to have all snapshots always mounted, because
+ * this would take up a huge amount of space in /etc/mnttab. We have three
+ * types of objects:
+ *
+ * ctldir ------> snapshotdir -------> snapshot
+ * |
+ * |
+ * V
+ * mounted fs
+ *
+ * The 'snapshot' node contains just enough information to lookup '..' and act
+ * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we
+ * perform an automount of the underlying filesystem and return the
+ * corresponding vnode.
+ *
+ * All mounts are handled automatically by the kernel, but unmounts are
+ * (currently) handled from user land. The main reason is that there is no
+ * reliable way to auto-unmount the filesystem when it's "no longer in use".
+ * When the user unmounts a filesystem, we call zfsctl_unmount(), which
+ * unmounts any snapshots within the snapshot directory.
+ */
+
+#include <fs/fs_subr.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/gfs.h>
+#include <sys/stat.h>
+#include <sys/dmu.h>
+#include <sys/mount.h>
+
+/*
+ * One entry per mounted snapshot, kept in the snapdir's sd_snaps AVL
+ * tree, sorted by name.
+ */
+typedef struct {
+	char *se_name;		/* snapshot name (heap-allocated copy) */
+	vnode_t *se_root;	/* root vnode covering the mounted snapshot */
+	avl_node_t se_node;	/* linkage in zfsctl_snapdir_t.sd_snaps */
+} zfs_snapentry_t;
+
+/*
+ * AVL comparator for zfs_snapentry_t, ordered by snapshot name.
+ * Normalizes strcmp()'s result to exactly -1/0/+1 as avl_create()
+ * requires.
+ */
+static int
+snapentry_compare(const void *a, const void *b)
+{
+	const zfs_snapentry_t *sa = a;
+	const zfs_snapentry_t *sb = b;
+	int ret = strcmp(sa->se_name, sb->se_name);
+
+	if (ret < 0)
+		return (-1);
+	else if (ret > 0)
+		return (1);
+	else
+		return (0);
+}
+
+vnodeops_t *zfsctl_ops_root;
+vnodeops_t *zfsctl_ops_snapdir;
+vnodeops_t *zfsctl_ops_snapshot;
+
+static const fs_operation_def_t zfsctl_tops_root[];
+static const fs_operation_def_t zfsctl_tops_snapdir[];
+static const fs_operation_def_t zfsctl_tops_snapshot[];
+
+static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
+static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
+
+static gfs_opsvec_t zfsctl_opsvec[] = {
+ { ".zfs", zfsctl_tops_root, &zfsctl_ops_root },
+ { ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir },
+ { ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot },
+ { NULL }
+};
+
+/*
+ * Common .zfs node: the GFS directory state plus the inode/object id
+ * used for getattr and fid generation.
+ */
+typedef struct zfsctl_node {
+	gfs_dir_t zc_gfs_private;	/* must be first: GFS dir state */
+	uint64_t zc_id;			/* inode number / objset id */
+} zfsctl_node_t;
+
+/*
+ * The '.zfs/snapshot' directory: a zfsctl_node_t plus the tree of
+ * currently mounted snapshots, protected by sd_lock.
+ */
+typedef struct zfsctl_snapdir {
+	zfsctl_node_t sd_node;		/* must be first: base node */
+	kmutex_t sd_lock;		/* protects sd_snaps */
+	avl_tree_t sd_snaps;		/* zfs_snapentry_t, by name */
+} zfsctl_snapdir_t;
+
+/*
+ * Root directory elements. We have only a single static entry, 'snapshot'.
+ */
+static gfs_dirent_t zfsctl_root_entries[] = {
+ { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
+ { NULL }
+};
+
+/* include . and .. in the calculation */
+#define NROOT_ENTRIES ((sizeof (zfsctl_root_entries) / \
+ sizeof (gfs_dirent_t)) + 1)
+
+
+/*
+ * Initialize the various GFS pieces we'll need to create and manipulate .zfs
+ * directories. This is called from the ZFS init routine, and initializes the
+ * vnode ops vectors that we'll be using.
+ */
+void
+zfsctl_init(void)
+{
+	/* builds zfsctl_ops_* from the zfsctl_tops_* templates */
+	VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0);
+}
+
+/*
+ * Tear down the vnode ops vectors created by zfsctl_init().  Called
+ * from the ZFS module fini path.
+ */
+void
+zfsctl_fini(void)
+{
+	/*
+	 * Remove vfsctl vnode ops
+	 */
+	if (zfsctl_ops_root)
+		vn_freevnodeops(zfsctl_ops_root);
+	if (zfsctl_ops_snapdir)
+		vn_freevnodeops(zfsctl_ops_snapdir);
+	if (zfsctl_ops_snapshot)
+		vn_freevnodeops(zfsctl_ops_snapshot);
+
+	zfsctl_ops_root = NULL;
+	zfsctl_ops_snapdir = NULL;
+	zfsctl_ops_snapshot = NULL;
+}
+
+/*
+ * Return the inode number associated with the 'snapshot' directory.
+ * GFS readdir callback for the root; 'snapshot' is the only static
+ * entry, so index must be 0.
+ */
+/* ARGSUSED */
+static ino64_t
+zfsctl_root_inode_cb(vnode_t *vp, int index)
+{
+	ASSERT(index == 0);
+	return (ZFSCTL_INO_SNAPDIR);
+}
+
+/*
+ * Create the '.zfs' directory. This directory is cached as part of the VFS
+ * structure. This results in a hold on the vfs_t. The code in zfs_umount()
+ * therefore checks against a vfs_count of 2 instead of 1. This reference
+ * is removed when the ctldir is destroyed in the unmount.
+ */
+void
+zfsctl_create(zfsvfs_t *zfsvfs)
+{
+	vnode_t *vp;
+	zfsctl_node_t *zcp;
+
+	ASSERT(zfsvfs->z_ctldir == NULL);
+
+	vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
+	    zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
+	    zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
+	zcp = vp->v_data;
+	zcp->zc_id = ZFSCTL_INO_ROOT;
+
+	/*
+	 * We're only faking the fact that we have a root of a filesystem for
+	 * the sake of the GFS interfaces. Undo the flag manipulation it did
+	 * for us.
+	 */
+	vp->v_flag &= ~(VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT);
+
+	zfsvfs->z_ctldir = vp;
+}
+
+/*
+ * Destroy the '.zfs' directory. Only called when the filesystem is
+ * unmounted, and there are no more references. Release the vnode,
+ * which will release the hold on the vfs structure.
+ */
+void
+zfsctl_destroy(zfsvfs_t *zfsvfs)
+{
+	/* the cached reference from zfsctl_create() must be the last one */
+	ASSERT(zfsvfs->z_ctldir->v_count == 1);
+	VN_RELE(zfsvfs->z_ctldir);
+	zfsvfs->z_ctldir = NULL;
+}
+
+/*
+ * Given a root znode, retrieve the associated .zfs directory.
+ * Add a hold to the vnode and return it; the caller is responsible
+ * for the VN_RELE.
+ */
+vnode_t *
+zfsctl_root(znode_t *zp)
+{
+	ASSERT(zfs_has_ctldir(zp));
+	VN_HOLD(zp->z_zfsvfs->z_ctldir);
+	return (zp->z_zfsvfs->z_ctldir);
+}
+
+/*
+ * Common open routine. Disallow any write access; everything under
+ * .zfs is read-only.
+ */
+/* ARGSUSED */
+static int
+zfsctl_common_open(vnode_t **vpp, int flags, cred_t *cr)
+{
+	if (flags & FWRITE)
+		return (EACCES);
+
+	return (0);
+}
+
+/*
+ * Common close routine. Nothing to do here: no per-open state is kept.
+ */
+/* ARGSUSED */
+static int
+zfsctl_common_close(vnode_t *vpp, int flags, int count, offset_t off,
+    cred_t *cr)
+{
+	return (0);
+}
+
+/*
+ * Common access routine. Disallow writes; read and execute are always
+ * permitted regardless of credentials.
+ */
+/* ARGSUSED */
+static int
+zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr)
+{
+	if (mode & VWRITE)
+		return (EACCES);
+
+	return (0);
+}
+
+/*
+ * Common getattr function. Fill in basic information shared by all
+ * .zfs nodes; callers are expected to fill in va_nodeid, va_nlink
+ * and va_size themselves.
+ */
+static void
+zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
+{
+	timestruc_t now;
+
+	vap->va_uid = 0;
+	vap->va_gid = 0;
+	vap->va_rdev = 0;
+	/*
+	 * We are a purely virtual object, so we have no
+	 * blocksize or allocated blocks.
+	 */
+	vap->va_blksize = 0;
+	vap->va_nblocks = 0;
+	vap->va_seq = 0;
+	vap->va_fsid = vp->v_vfsp->vfs_dev;
+	/* world r-x, no write bits anywhere: .zfs is read-only */
+	vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
+	    S_IROTH | S_IXOTH;
+	vap->va_type = VDIR;
+	/*
+	 * We live in the now.
+	 */
+	gethrestime(&now);
+	vap->va_mtime = vap->va_ctime = vap->va_atime = now;
+}
+
+/*
+ * Common VOP_FID for .zfs nodes.  Encode the node's object id into a
+ * short-form zfid with a generation of 0.  Returns ENOSPC (with the
+ * required length filled in) if the caller's fid buffer is too small.
+ */
+static int
+zfsctl_common_fid(vnode_t *vp, fid_t *fidp)
+{
+	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+	zfsctl_node_t *zcp = vp->v_data;
+	uint64_t object = zcp->zc_id;
+	zfid_short_t *zfid;
+	int i;
+
+	ZFS_ENTER(zfsvfs);
+
+	if (fidp->fid_len < SHORT_FID_LEN) {
+		fidp->fid_len = SHORT_FID_LEN;
+		/* must drop the teardown lock on this early return, too */
+		ZFS_EXIT(zfsvfs);
+		return (ENOSPC);
+	}
+
+	zfid = (zfid_short_t *)fidp;
+
+	zfid->zf_len = SHORT_FID_LEN;
+
+	for (i = 0; i < sizeof (zfid->zf_object); i++)
+		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+	/* .zfs znodes always have a generation number of 0 */
+	for (i = 0; i < sizeof (zfid->zf_gen); i++)
+		zfid->zf_gen[i] = 0;
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * .zfs inode namespace
+ *
+ * We need to generate unique inode numbers for all files and directories
+ * within the .zfs pseudo-filesystem. We use the following scheme:
+ *
+ * ENTRY ZFSCTL_INODE
+ * .zfs 1
+ * .zfs/snapshot 2
+ * .zfs/snapshot/<snap> objectid(snap)
+ */
+
+#define ZFSCTL_INO_SNAP(id) (id)
+
+/*
+ * Get root directory attributes.  Static inode number, link/size count
+ * derived from the fixed entry table (plus '.' and '..').
+ */
+/* ARGSUSED */
+static int
+zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+
+	ZFS_ENTER(zfsvfs);
+	vap->va_nodeid = ZFSCTL_INO_ROOT;
+	vap->va_nlink = vap->va_size = NROOT_ENTRIES;
+
+	zfsctl_common_getattr(vp, vap);
+	ZFS_EXIT(zfsvfs);
+
+	return (0);
+}
+
+/*
+ * Special case the handling of "..": '.zfs' hangs off the filesystem
+ * root, so '..' resolves to the real root vnode rather than through
+ * the GFS directory machinery.
+ */
+/* ARGSUSED */
+int
+zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
+    int flags, vnode_t *rdir, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+	int err;
+
+	ZFS_ENTER(zfsvfs);
+
+	if (strcmp(nm, "..") == 0) {
+		err = VFS_ROOT(dvp->v_vfsp, vpp);
+	} else {
+		err = gfs_dir_lookup(dvp, nm, vpp);
+	}
+
+	ZFS_EXIT(zfsvfs);
+
+	return (err);
+}
+
+/* vnode operations template for the '.zfs' root directory */
+static const fs_operation_def_t zfsctl_tops_root[] = {
+	{ VOPNAME_OPEN, zfsctl_common_open },
+	{ VOPNAME_CLOSE, zfsctl_common_close },
+	{ VOPNAME_IOCTL, fs_inval },
+	{ VOPNAME_GETATTR, zfsctl_root_getattr },
+	{ VOPNAME_ACCESS, zfsctl_common_access },
+	{ VOPNAME_READDIR, gfs_vop_readdir },
+	{ VOPNAME_LOOKUP, zfsctl_root_lookup },
+	{ VOPNAME_SEEK, fs_seek },
+	{ VOPNAME_INACTIVE, (fs_generic_func_p) gfs_vop_inactive },
+	{ VOPNAME_FID, zfsctl_common_fid },
+	{ NULL }
+};
+
+/*
+ * Build the full dataset name "pool/fs@name" for snapshot 'name' of the
+ * filesystem backing vp, into zname (a buffer of at least 'len' bytes).
+ * Returns ENAMETOOLONG if the result would not fit.
+ */
+static int
+zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
+{
+	objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
+
+	dmu_objset_name(os, zname);
+	(void) strcat(zname, "@");
+	if (strlen(zname) + strlen(name) >= len)
+		return (ENAMETOOLONG);
+	(void) strcat(zname, name);
+	return (0);
+}
+
+/*
+ * Unmount the snapshot named 'name' under snapdir dvp and free its AVL
+ * entry.  The caller must hold sd_lock.  Returns 0 on success, ENOENT
+ * if the snapshot is not currently mounted, or the vn_vfswlock()/
+ * dounmount() error.
+ */
+static int
+zfsctl_unmount_snap(vnode_t *dvp, const char *name, int force, cred_t *cr)
+{
+	zfsctl_snapdir_t *sdp = dvp->v_data;
+	zfs_snapentry_t search, *sep;
+	avl_index_t where;
+	int err;
+
+	ASSERT(MUTEX_HELD(&sdp->sd_lock));
+
+	search.se_name = (char *)name;
+	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL)
+		return (ENOENT);
+
+	ASSERT(vn_ismntpt(sep->se_root));
+
+	/* this will be dropped by dounmount() */
+	if ((err = vn_vfswlock(sep->se_root)) != 0)
+		return (err);
+
+	VN_HOLD(sep->se_root);
+	if ((err = dounmount(vn_mountedvfs(sep->se_root), force, kcred)) != 0) {
+		/* don't leak the hold taken above when the unmount fails */
+		VN_RELE(sep->se_root);
+		return (err);
+	}
+	ASSERT(sep->se_root->v_count == 1);
+	/* bypass VN_RELE: we hold sd_lock, see zfsctl_umount_snapshots() */
+	gfs_vop_inactive(sep->se_root, cr);
+
+	avl_remove(&sdp->sd_snaps, sep);
+	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+	kmem_free(sep, sizeof (zfs_snapentry_t));
+
+	return (0);
+}
+
+
+/*
+ * Rename the in-kernel state for a mounted snapshot: re-key its AVL
+ * entry under the new name 'nm', and rewrite the tails of the vfs
+ * mountpoint and resource strings to match.  Caller holds sd_lock.
+ */
+static int
+zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
+{
+	avl_index_t where;
+	vfs_t *vfsp;
+	refstr_t *pathref;
+	char newpath[MAXNAMELEN];
+	const char *oldpath;
+	char *tail;
+	int err;
+
+	ASSERT(MUTEX_HELD(&sdp->sd_lock));
+	ASSERT(sep != NULL);
+
+	vfsp = vn_mountedvfs(sep->se_root);
+	ASSERT(vfsp != NULL);
+
+	if (err = vfs_lock(vfsp))
+		return (err);
+
+	/*
+	 * Change the name in the AVL tree.
+	 */
+	avl_remove(&sdp->sd_snaps, sep);
+	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
+	(void) strcpy(sep->se_name, nm);
+	/* new name must not already be present */
+	VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
+	avl_insert(&sdp->sd_snaps, sep, where);
+
+	/*
+	 * Change the current mountpoint info:
+	 *	- update the tail of the mntpoint path
+	 *	- update the tail of the resource path
+	 */
+	pathref = vfs_getmntpoint(vfsp);
+	oldpath = refstr_value(pathref);
+	/* mountpoint is ".../.zfs/snapshot/<snap>"; keep up to last '/' */
+	VERIFY((tail = strrchr(oldpath, '/')) != NULL);
+	ASSERT((tail - oldpath) + strlen(nm) + 2 < MAXNAMELEN);
+	(void) strncpy(newpath, oldpath, tail - oldpath + 1);
+	(void) strcat(newpath, nm);
+	refstr_rele(pathref);
+	vfs_setmntpoint(vfsp, newpath);
+
+	pathref = vfs_getresource(vfsp);
+	oldpath = refstr_value(pathref);
+	/* resource is "pool/fs@<snap>"; keep up to the '@' */
+	VERIFY((tail = strrchr(oldpath, '@')) != NULL);
+	ASSERT((tail - oldpath) + strlen(nm) + 2 < MAXNAMELEN);
+	(void) strncpy(newpath, oldpath, tail - oldpath + 1);
+	(void) strcat(newpath, nm);
+	refstr_rele(pathref);
+	vfs_setresource(vfsp, newpath);
+
+	vfs_unlock(vfsp);
+	return (0);
+}
+
+/*
+ * VOP_RENAME for '.zfs/snapshot': rename snapshot snm to tnm.  Updates
+ * the in-kernel snapentry/vfs state (if the snapshot is mounted) and
+ * then renames the underlying dataset.  Snapshots cannot be moved out
+ * of the snapshot directory.
+ */
+static int
+zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
+    cred_t *cr)
+{
+	zfsctl_snapdir_t *sdp = sdvp->v_data;
+	zfs_snapentry_t search, *sep;
+	avl_index_t where;
+	char from[MAXNAMELEN], to[MAXNAMELEN];
+	int err;
+
+	VERIFY(zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from) == 0);
+	err = zfs_secpolicy_write(from, NULL, cr);
+	if (err)
+		return (err);
+
+	/*
+	 * Cannot move snapshots out of the snapdir.
+	 */
+	if (sdvp != tdvp)
+		return (EINVAL);
+
+	if (strcmp(snm, tnm) == 0)
+		return (0);
+
+	mutex_enter(&sdp->sd_lock);
+
+	/*
+	 * An unmounted snapshot has no AVL entry; in that case skip
+	 * straight to renaming the dataset itself.
+	 */
+	search.se_name = (char *)snm;
+	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
+		err = zfsctl_rename_snap(sdp, sep, tnm);
+		if (err) {
+			mutex_exit(&sdp->sd_lock);
+			return (err);
+		}
+	}
+
+
+	VERIFY(zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to) == 0);
+	err = dmu_objset_rename(from, to);
+
+	mutex_exit(&sdp->sd_lock);
+
+	return (err);
+}
+
+/*
+ * VOP_RMDIR for '.zfs/snapshot': unmount and destroy snapshot 'name'.
+ *
+ * NOTE(review): if the snapshot is not currently mounted,
+ * zfsctl_unmount_snap() returns ENOENT and the dataset is never
+ * destroyed -- confirm whether unmounted snapshots are expected to be
+ * removable through this path.
+ */
+/* ARGSUSED */
+static int
+zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
+{
+	zfsctl_snapdir_t *sdp = dvp->v_data;
+	char snapname[MAXNAMELEN];
+	int err;
+
+	VERIFY(zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname) == 0);
+	err = zfs_secpolicy_write(snapname, NULL, cr);
+	if (err)
+		return (err);
+
+	mutex_enter(&sdp->sd_lock);
+
+	err = zfsctl_unmount_snap(dvp, name, 0, cr);
+	if (err) {
+		mutex_exit(&sdp->sd_lock);
+		return (err);
+	}
+
+	err = dmu_objset_destroy(snapname);
+
+	mutex_exit(&sdp->sd_lock);
+
+	return (err);
+}
+
+/*
+ * Lookup entry point for the 'snapshot' directory. Try to open the
+ * snapshot if it exist, creating the pseudo filesystem vnode as necessary.
+ * Perform a mount of the associated dataset on top of the vnode.
+ */
+/* ARGSUSED */
+static int
+zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
+    int flags, vnode_t *rdir, cred_t *cr)
+{
+	zfsctl_snapdir_t *sdp = dvp->v_data;
+	objset_t *snap;
+	char snapname[MAXNAMELEN];
+	char *mountpoint;
+	zfs_snapentry_t *sep, search;
+	struct mounta margs;
+	vfs_t *vfsp;
+	size_t mountpoint_len;
+	avl_index_t where;
+	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+	int err;
+
+	ASSERT(dvp->v_type == VDIR);
+
+	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0)
+		return (0);
+
+	/*
+	 * If we get a recursive call, that means we got called
+	 * from the domount() code while it was trying to look up the
+	 * spec (which looks like a local path for zfs). We need to
+	 * add some flag to domount() to tell it not to do this lookup.
+	 */
+	if (MUTEX_HELD(&sdp->sd_lock))
+		return (ENOENT);
+
+	ZFS_ENTER(zfsvfs);
+
+	/*
+	 * Build the full dataset name ("pool/fs@snap") up front.  The
+	 * domount code below needs it on *both* paths that reach it --
+	 * the fresh-lookup path and the remount path (snapshot unmounted
+	 * behind our backs).  Building it only on the fresh-lookup path
+	 * would leave margs.spec pointing at uninitialized stack memory
+	 * for the remount case.
+	 */
+	VERIFY(zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname) == 0);
+
+	mutex_enter(&sdp->sd_lock);
+	search.se_name = (char *)nm;
+	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
+		*vpp = sep->se_root;
+		VN_HOLD(*vpp);
+		/*
+		 * If the snapshot was unmounted behind our backs, remount it.
+		 */
+		if (!vn_ismntpt(*vpp))
+			goto domount;
+		VERIFY(traverse(vpp) == 0);
+		mutex_exit(&sdp->sd_lock);
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/*
+	 * The requested snapshot is not currently mounted, look it up.
+	 */
+	if (dmu_objset_open(snapname, DMU_OST_ZFS,
+	    DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) {
+		mutex_exit(&sdp->sd_lock);
+		ZFS_EXIT(zfsvfs);
+		return (ENOENT);
+	}
+
+	sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
+	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
+	(void) strcpy(sep->se_name, nm);
+	*vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
+	avl_insert(&sdp->sd_snaps, sep, where);
+
+	dmu_objset_close(snap);
+domount:
+	mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) +
+	    strlen("/.zfs/snapshot/") + strlen(nm) + 1;
+	mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
+	(void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
+	    refstr_value(dvp->v_vfsp->vfs_mntpt), nm);
+
+	margs.spec = snapname;
+	margs.dir = mountpoint;
+	margs.flags = MS_SYSSPACE | MS_NOMNTTAB;
+	margs.fstype = "zfs";
+	margs.dataptr = NULL;
+	margs.datalen = 0;
+	margs.optptr = NULL;
+	margs.optlen = 0;
+
+	err = domount("zfs", &margs, *vpp, kcred, &vfsp);
+	ASSERT3U(err, ==, 0);
+
+	kmem_free(mountpoint, mountpoint_len);
+
+	VFS_RELE(vfsp);
+
+	/*
+	 * Fix up the root vnode.
+	 */
+	VERIFY(traverse(vpp) == 0);
+	ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
+	VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
+	(*vpp)->v_vfsp = zfsvfs->z_vfs;
+	(*vpp)->v_flag &= ~VROOT;
+	mutex_exit(&sdp->sd_lock);
+	ZFS_EXIT(zfsvfs);
+
+	return (0);
+}
+
+/*
+ * GFS readdir callback for '.zfs/snapshot': return the next snapshot
+ * name/objset-id after cookie *offp, or set *eofp when the snapshot
+ * list is exhausted.
+ */
+/* ARGSUSED */
+static int
+zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
+    offset_t *offp, offset_t *nextp, void *data)
+{
+	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+	char snapname[MAXNAMELEN];
+	uint64_t id, cookie;
+
+	ZFS_ENTER(zfsvfs);
+
+	cookie = *offp;
+	if (dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id,
+	    &cookie) == ENOENT) {
+		*eofp = 1;
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	(void) strcpy(dp->d_name, snapname);
+	dp->d_ino = ZFSCTL_INO_SNAP(id);
+	*nextp = cookie;
+
+	ZFS_EXIT(zfsvfs);
+
+	return (0);
+}
+
+/*
+ * Construct the '.zfs/snapshot' directory vnode under parent pvp,
+ * initializing its lock and (empty) tree of mounted snapshots.
+ */
+vnode_t *
+zfsctl_mknode_snapdir(vnode_t *pvp)
+{
+	vnode_t *vp;
+	zfsctl_snapdir_t *sdp;
+
+	vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp,
+	    zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
+	    zfsctl_snapdir_readdir_cb, NULL);
+	sdp = vp->v_data;
+	sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
+	mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
+	avl_create(&sdp->sd_snaps, snapentry_compare,
+	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
+	return (vp);
+}
+
+/*
+ * VOP_GETATTR for '.zfs/snapshot': link/size count is the number of
+ * currently mounted snapshots plus '.' and '..'.
+ */
+/* ARGSUSED */
+static int
+zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+	zfsctl_snapdir_t *sdp = vp->v_data;
+
+	ZFS_ENTER(zfsvfs);
+	zfsctl_common_getattr(vp, vap);
+	vap->va_nodeid = gfs_file_inode(vp);
+	vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
+	ZFS_EXIT(zfsvfs);
+
+	return (0);
+}
+
+/*
+ * VOP_INACTIVE for '.zfs/snapshot': tear down the snapdir state and
+ * let GFS free the vnode.  All snapshots must already be unmounted
+ * (and their entries freed) by this point.
+ */
+static void
+zfsctl_snapdir_inactive(vnode_t *vp, cred_t *cr)
+{
+	zfsctl_snapdir_t *sdp = vp->v_data;
+
+	ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
+	mutex_destroy(&sdp->sd_lock);
+	avl_destroy(&sdp->sd_snaps);
+	gfs_vop_inactive(vp, cr);
+}
+
+/* vnode operations template for the '.zfs/snapshot' directory */
+static const fs_operation_def_t zfsctl_tops_snapdir[] = {
+	{ VOPNAME_OPEN, zfsctl_common_open },
+	{ VOPNAME_CLOSE, zfsctl_common_close },
+	{ VOPNAME_IOCTL, fs_inval },
+	{ VOPNAME_GETATTR, zfsctl_snapdir_getattr },
+	{ VOPNAME_ACCESS, zfsctl_common_access },
+	{ VOPNAME_RENAME, zfsctl_snapdir_rename },
+	{ VOPNAME_RMDIR, zfsctl_snapdir_remove },
+	{ VOPNAME_READDIR, gfs_vop_readdir },
+	{ VOPNAME_LOOKUP, zfsctl_snapdir_lookup },
+	{ VOPNAME_SEEK, fs_seek },
+	{ VOPNAME_INACTIVE, (fs_generic_func_p) zfsctl_snapdir_inactive },
+	{ VOPNAME_FID, zfsctl_common_fid },
+	{ NULL }
+};
+
+/*
+ * Construct the vnode that acts as the mountpoint for a snapshot with
+ * the given objset id, under parent snapdir pvp.
+ */
+static vnode_t *
+zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
+{
+	vnode_t *vp;
+	zfsctl_node_t *zcp;
+
+	vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp,
+	    zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
+	zcp = vp->v_data;
+	zcp->zc_id = objset;
+
+	return (vp);
+}
+
+/*
+ * VOP_INACTIVE for a snapshot mountpoint vnode: remove the snapshot's
+ * entry from the parent snapdir's AVL tree and free it, then let GFS
+ * free the vnode.  Bails out if another reference appeared while we
+ * were acquiring sd_lock.
+ */
+static void
+zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr)
+{
+	zfsctl_snapdir_t *sdp;
+	zfs_snapentry_t *sep, *next;
+	vnode_t *dvp;
+
+	VERIFY(gfs_dir_lookup(vp, "..", &dvp) == 0);
+	sdp = dvp->v_data;
+
+	mutex_enter(&sdp->sd_lock);
+
+	if (vp->v_count > 1) {
+		/* raced with a new hold; not actually inactive */
+		mutex_exit(&sdp->sd_lock);
+		return;
+	}
+	ASSERT(!vn_ismntpt(vp));
+
+	sep = avl_first(&sdp->sd_snaps);
+	while (sep != NULL) {
+		next = AVL_NEXT(&sdp->sd_snaps, sep);
+
+		if (sep->se_root == vp) {
+			avl_remove(&sdp->sd_snaps, sep);
+			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+			kmem_free(sep, sizeof (zfs_snapentry_t));
+			break;
+		}
+		sep = next;
+	}
+	/* every snapshot vnode must have had an entry in the tree */
+	ASSERT(sep != NULL);
+
+	mutex_exit(&sdp->sd_lock);
+	VN_RELE(dvp);
+
+	gfs_vop_inactive(vp, cr);
+}
+
+
+/*
+ * These VP's should never see the light of day. They should always
+ * be covered.  Only INACTIVE is needed, to clean up the snapentry when
+ * the covered vnode goes away.
+ */
+static const fs_operation_def_t zfsctl_tops_snapshot[] = {
+	/* braced entries, matching the other op tables in this file */
+	{ VOPNAME_INACTIVE, (fs_generic_func_p) zfsctl_snapshot_inactive },
+	{ NULL }
+};
+
+/*
+ * Find the zfsvfs_t of the mounted snapshot with the given objset id
+ * under vfsp's '.zfs/snapshot' directory.  Used for fid-based lookups
+ * (e.g. NFS file handles).  Returns EINVAL if no such snapshot is
+ * currently mounted.
+ */
+int
+zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	vnode_t *dvp, *vp;
+	zfsctl_snapdir_t *sdp;
+	zfsctl_node_t *zcp;
+	zfs_snapentry_t *sep;
+	int error;
+
+	ASSERT(zfsvfs->z_ctldir != NULL);
+	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
+	    NULL, 0, NULL, kcred);
+	if (error != 0)
+		return (error);
+	sdp = dvp->v_data;
+
+	/* linear scan: the tree is keyed by name, not by objset id */
+	mutex_enter(&sdp->sd_lock);
+	sep = avl_first(&sdp->sd_snaps);
+	while (sep != NULL) {
+		vp = sep->se_root;
+		zcp = vp->v_data;
+		if (zcp->zc_id == objsetid)
+			break;
+
+		sep = AVL_NEXT(&sdp->sd_snaps, sep);
+	}
+
+	if (sep != NULL) {
+		VN_HOLD(vp);
+		/* cross into the mounted snapshot filesystem */
+		error = traverse(&vp);
+		if (error == 0)
+			*zfsvfsp = VTOZ(vp)->z_zfsvfs;
+		VN_RELE(vp);
+	} else {
+		error = EINVAL;
+	}
+
+	mutex_exit(&sdp->sd_lock);
+	VN_RELE(dvp);
+
+	return (error);
+}
+
+/*
+ * Unmount any snapshots for the given filesystem. This is called from
+ * zfs_umount() - if we have a ctldir, then go through and unmount all the
+ * snapshots.  On the first failure we return that error with some
+ * snapshots possibly still mounted.
+ */
+int
+zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	vnode_t *dvp, *svp;
+	zfsctl_snapdir_t *sdp;
+	zfs_snapentry_t *sep, *next;
+	int error;
+
+	ASSERT(zfsvfs->z_ctldir != NULL);
+	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
+	    NULL, 0, NULL, cr);
+	if (error != 0)
+		return (error);
+	sdp = dvp->v_data;
+
+	mutex_enter(&sdp->sd_lock);
+
+	sep = avl_first(&sdp->sd_snaps);
+	while (sep != NULL) {
+		svp = sep->se_root;
+		/* grab the successor before we free sep below */
+		next = AVL_NEXT(&sdp->sd_snaps, sep);
+
+		/*
+		 * If this snapshot is not mounted, then it must
+		 * have just been unmounted by somebody else, and
+		 * will be cleaned up by zfsctl_snapdir_inactive().
+		 */
+		if (vn_ismntpt(svp)) {
+			/* vfswlock is dropped by dounmount() */
+			if ((error = vn_vfswlock(svp)) != 0)
+				goto out;
+
+			VN_HOLD(svp);
+			error = dounmount(vn_mountedvfs(svp), fflags, cr);
+			if (error) {
+				VN_RELE(svp);
+				goto out;
+			}
+
+			avl_remove(&sdp->sd_snaps, sep);
+			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+			kmem_free(sep, sizeof (zfs_snapentry_t));
+
+			/*
+			 * We can't use VN_RELE(), as that will try to
+			 * invoke zfsctl_snapdir_inactive(), and that
+			 * would lead to an attempt to re-grab the sd_lock.
+			 */
+			ASSERT3U(svp->v_count, ==, 1);
+			gfs_vop_inactive(svp, cr);
+		}
+		sep = next;
+	}
+out:
+	mutex_exit(&sdp->sd_lock);
+	VN_RELE(dvp);
+
+	return (error);
+}
diff --git a/usr/src/uts/common/fs/zfs/zfs_dir.c b/usr/src/uts/common/fs/zfs/zfs_dir.c
new file mode 100644
index 0000000000..6df89ad0c4
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_dir.c
@@ -0,0 +1,853 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/mode.h>
+#include <sys/kmem.h>
+#include <sys/uio.h>
+#include <sys/pathname.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/unistd.h>
+#include <sys/random.h>
+#include <sys/policy.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/fs/zfs.h>
+#include "fs/fs_subr.h"
+#include <sys/zap.h>
+#include <sys/dmu.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ctldir.h>
+
+/*
+ * Lock a directory entry. A dirlock on <dzp, name> protects that name
+ * in dzp's directory zap object. As long as you hold a dirlock, you can
+ * assume two things: (1) dzp cannot be reaped, and (2) no other thread
+ * can change the zap entry for (i.e. link or unlink) this name.
+ *
+ * Input arguments:
+ * dzp - znode for directory
+ * name - name of entry to lock
+ * flag - ZNEW: if the entry already exists, fail with EEXIST.
+ * ZEXISTS: if the entry does not exist, fail with ENOENT.
+ * ZSHARED: allow concurrent access with other ZSHARED callers.
+ * ZXATTR: we want dzp's xattr directory
+ *
+ * Output arguments:
+ * zpp - pointer to the znode for the entry (NULL if there isn't one)
+ * dlpp - pointer to the dirlock for this entry (NULL on error)
+ *
+ * Return value: 0 on success or errno on failure.
+ *
+ * NOTE: Always checks for, and rejects, '.' and '..'.
+ */
+int
+zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
+ int flag)
+{
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zfs_dirlock_t *dl;
+ uint64_t zoid;
+ int error;
+
+ *zpp = NULL;
+ *dlpp = NULL;
+
+ /*
+ * Verify that we are not trying to lock '.', '..', or '.zfs'
+ *
+ * NOTE(review): this expression relies on '&&' binding tighter
+ * than '||' (gcc -Wparentheses flags it); explicit parens would
+ * be clearer. Also note it returns EEXIST for these names even
+ * when the caller passed ZEXISTS (a lookup) - callers appear to
+ * tolerate this, but confirm before relying on the errno.
+ */
+ if (name[0] == '.' &&
+ (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) ||
+ zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)
+ return (EEXIST);
+
+ /*
+ * Wait until there are no locks on this name.
+ */
+ mutex_enter(&dzp->z_lock);
+ for (;;) {
+ /* Directory already marked for reaping: no new entries. */
+ if (dzp->z_reap) {
+ mutex_exit(&dzp->z_lock);
+ return (ENOENT);
+ }
+ for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next)
+ if (strcmp(name, dl->dl_name) == 0)
+ break;
+ if (dl == NULL) {
+ /*
+ * Allocate a new dirlock and add it to the list.
+ */
+ dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
+ cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
+ /*
+ * dl_name aliases the caller's buffer until a second
+ * ZSHARED holder forces a private copy (below), so
+ * the caller's name must stay valid while locked.
+ */
+ dl->dl_name = name;
+ dl->dl_sharecnt = 0;
+ dl->dl_namesize = 0;
+ dl->dl_dzp = dzp;
+ dl->dl_next = dzp->z_dirlocks;
+ dzp->z_dirlocks = dl;
+ break;
+ }
+ if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
+ break;
+ cv_wait(&dl->dl_cv, &dzp->z_lock);
+ }
+
+ if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
+ /*
+ * We're the second shared reference to dl. Make a copy of
+ * dl_name in case the first thread goes away before we do.
+ * Note that we initialize the new name before storing its
+ * pointer into dl_name, because the first thread may load
+ * dl->dl_name at any time. He'll either see the old value,
+ * which is his, or the new shared copy; either is OK.
+ */
+ dl->dl_namesize = strlen(dl->dl_name) + 1;
+ name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
+ bcopy(dl->dl_name, name, dl->dl_namesize);
+ dl->dl_name = name;
+ }
+
+ mutex_exit(&dzp->z_lock);
+
+ /*
+ * We have a dirlock on the name. (Note that it is the dirlock,
+ * not the dzp's z_lock, that protects the name in the zap object.)
+ * See if there's an object by this name; if so, put a hold on it.
+ */
+ if (flag & ZXATTR) {
+ zoid = dzp->z_phys->zp_xattr;
+ error = (zoid == 0 ? ENOENT : 0);
+ } else {
+ /* zap value is one 8-byte integer: the entry's object id */
+ error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, &zoid);
+ }
+ if (error) {
+ if (error != ENOENT || (flag & ZEXISTS)) {
+ zfs_dirent_unlock(dl);
+ return (error);
+ }
+ } else {
+ if (flag & ZNEW) {
+ zfs_dirent_unlock(dl);
+ return (EEXIST);
+ }
+ error = zfs_zget(zfsvfs, zoid, zpp);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ return (error);
+ }
+ }
+
+ *dlpp = dl;
+
+ return (0);
+}
+
+/*
+ * Unlock this directory entry and wake anyone who was waiting for it.
+ */
+void
+zfs_dirent_unlock(zfs_dirlock_t *dl)
+{
+ znode_t *dzp = dl->dl_dzp;
+ zfs_dirlock_t **prev_dl, *cur_dl;
+
+ mutex_enter(&dzp->z_lock);
+ /* Fast path: other ZSHARED holders remain; just drop our reference. */
+ if (dl->dl_sharecnt > 1) {
+ dl->dl_sharecnt--;
+ mutex_exit(&dzp->z_lock);
+ return;
+ }
+ /* Last holder: unlink dl from dzp's singly-linked dirlock list. */
+ prev_dl = &dzp->z_dirlocks;
+ while ((cur_dl = *prev_dl) != dl)
+ prev_dl = &cur_dl->dl_next;
+ *prev_dl = dl->dl_next;
+ cv_broadcast(&dl->dl_cv);
+ mutex_exit(&dzp->z_lock);
+
+ /* Free outside z_lock; dl_namesize != 0 means we own a name copy. */
+ if (dl->dl_namesize != 0)
+ kmem_free(dl->dl_name, dl->dl_namesize);
+ cv_destroy(&dl->dl_cv);
+ kmem_free(dl, sizeof (*dl));
+}
+
+/*
+ * Look up an entry in a directory.
+ *
+ * NOTE: '.' and '..' are handled as special cases because
+ * no directory entries are actually stored for them. If this is
+ * the root of a filesystem, then '.zfs' is also treated as a
+ * special pseudo-directory.
+ */
+int
+zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp)
+{
+ zfs_dirlock_t *dl;
+ znode_t *zp;
+ int error = 0;
+
+ /* "" and "." both resolve to the directory itself. */
+ if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+ *vpp = ZTOV(dzp);
+ VN_HOLD(*vpp);
+ } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ /*
+ * If we are a snapshot mounted under .zfs, return
+ * the vp for the snapshot directory.
+ */
+ if (zfsvfs->z_parent != zfsvfs) {
+ /* kcred: internal lookup, not on behalf of a user */
+ error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
+ "snapshot", vpp, NULL, 0, NULL, kcred);
+ return (error);
+ }
+ /* z_parent_lock keeps zp_parent stable across a rename. */
+ rw_enter(&dzp->z_parent_lock, RW_READER);
+ error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp);
+ if (error == 0)
+ *vpp = ZTOV(zp);
+ rw_exit(&dzp->z_parent_lock);
+ } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
+ *vpp = zfsctl_root(dzp);
+ } else {
+ /* Ordinary entry: take a shared dirlock while we zget it. */
+ error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS | ZSHARED);
+ if (error == 0) {
+ *vpp = ZTOV(zp);
+ zfs_dirent_unlock(dl);
+ }
+ }
+
+ return (error);
+}
+
+/*
+ * Format 'x' as a minimal lowercase hex string, built backwards into the
+ * caller-supplied 17-byte buffer (16 digits + NUL). Returns a pointer to
+ * the first digit *within* namebuf - not necessarily namebuf itself.
+ */
+static char *
+zfs_dq_hexname(char namebuf[17], uint64_t x)
+{
+ char *name = &namebuf[16];
+ const char digits[16] = "0123456789abcdef";
+
+ *name = '\0';
+ do {
+ *--name = digits[x & 0xf];
+ x >>= 4;
+ } while (x != 0);
+
+ return (name);
+}
+
+/*
+ * Add zp to the delete queue ZAP, keyed by the hex form of its object id.
+ * Caller must already have marked the znode for reaping (z_reap set,
+ * link count zero) and must hold an assigned transaction.
+ */
+void
+zfs_dq_add(znode_t *zp, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ char obj_name[17];
+ int error;
+
+ ASSERT(zp->z_reap);
+ ASSERT3U(zp->z_phys->zp_links, ==, 0);
+
+ error = zap_add(zfsvfs->z_os, zfsvfs->z_dqueue,
+ zfs_dq_hexname(obj_name, zp->z_id), 8, 1, &zp->z_id, tx);
+ /* Can't fail: the caller reserved the zap space in the tx. */
+ ASSERT3U(error, ==, 0);
+}
+
+/*
+ * Delete the entire contents of a directory. Return a count
+ * of the number of entries that could not be deleted.
+ *
+ * NOTE: this function assumes that the directory is inactive,
+ * so there is no need to lock its entries before deletion.
+ * Also, it assumes the directory contents is *only* regular
+ * files.
+ */
+static int
+zfs_purgedir(znode_t *dzp)
+{
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ znode_t *xzp;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zfs_dirlock_t dl;
+ int skipped = 0;
+ int error;
+
+ ASSERT(dzp->z_active == 0);
+
+ for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
+ (error = zap_cursor_retrieve(&zc, &zap)) == 0;
+ zap_cursor_advance(&zc)) {
+ error = zfs_zget(zfsvfs, zap.za_first_integer, &xzp);
+ ASSERT3U(error, ==, 0);
+
+ ASSERT((ZTOV(xzp)->v_type == VREG) ||
+ (ZTOV(xzp)->v_type == VLNK));
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, dzp->z_id);
+ dmu_tx_hold_zap(tx, dzp->z_id, -1);
+ dmu_tx_hold_bonus(tx, xzp->z_id);
+ dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ /* Couldn't assign a tx; leave this entry for a retry. */
+ dmu_tx_abort(tx);
+ VN_RELE(ZTOV(xzp));
+ skipped += 1;
+ continue;
+ }
+ /*
+ * Fake a stack dirlock: the directory is inactive, so no
+ * one can race us, and zfs_link_destroy() only reads
+ * dl_dzp and dl_name.
+ */
+ bzero(&dl, sizeof (dl));
+ dl.dl_dzp = dzp;
+ dl.dl_name = zap.za_name;
+
+ error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
+ ASSERT3U(error, ==, 0);
+ dmu_tx_commit(tx);
+
+ VN_RELE(ZTOV(xzp));
+ }
+ /* Loop only exits when the cursor is exhausted. */
+ ASSERT(error == ENOENT);
+ return (skipped);
+}
+
+/*
+ * Special function to requeue the znodes for deletion that were
+ * in progress when we either crashed or umounted the file system.
+ */
+static void
+zfs_drain_dq(zfsvfs_t *zfsvfs)
+{
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ dmu_object_info_t doi;
+ znode_t *zp;
+ int error;
+
+ /*
+ * Interate over the contents of the delete queue.
+ */
+ for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_dqueue);
+ zap_cursor_retrieve(&zc, &zap) == 0;
+ zap_cursor_advance(&zc)) {
+
+ /*
+ * Need some helpers?
+ */
+ if (zfs_delete_thread_target(zfsvfs, -1) != 0)
+ return;
+
+ /*
+ * See what kind of object we have in queue
+ */
+
+ error = dmu_object_info(zfsvfs->z_os,
+ zap.za_first_integer, &doi);
+ if (error != 0)
+ continue;
+
+ ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
+ (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
+ /*
+ * We need to re-mark these queue entries for reaping,
+ * so we pull them back into core and set zp->z_reap.
+ */
+ error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
+
+ /*
+ * We may pick up znodes that are already marked for reaping.
+ * This could happen during the purge of an extended attribute
+ * directory. All we need to do is skip over them, since they
+ * are already in the system to be processed by the taskq.
+ */
+ if (error != 0) {
+ continue;
+ }
+ zp->z_reap = 1;
+ /* VN_RELE on a reaped znode kicks off its actual deletion. */
+ VN_RELE(ZTOV(zp));
+ /*
+ * NOTE(review): this 'break' stops the drain after the
+ * first successfully re-marked entry, leaving the rest of
+ * the queue unprocessed on this pass - confirm whether
+ * that is intentional (e.g. deletion of this entry is
+ * expected to re-trigger the drain) or a latent bug.
+ */
+ break;
+ }
+}
+
+/*
+ * Body of a delete worker thread: optionally drain the on-disk delete
+ * queue once per filesystem, then loop pulling znodes off the in-core
+ * list and reaping them until the thread target drops below our count.
+ * Runs with zd->z_mutex held except while doing actual work.
+ */
+void
+zfs_delete_thread(void *arg)
+{
+ zfsvfs_t *zfsvfs = arg;
+ zfs_delete_t *zd = &zfsvfs->z_delete_head;
+ znode_t *zp;
+ callb_cpr_t cprinfo;
+
+ /* Register with CPR (suspend/resume) so cv_wait below is safe. */
+ CALLB_CPR_INIT(&cprinfo, &zd->z_mutex, callb_generic_cpr, "zfs_delete");
+
+ mutex_enter(&zd->z_mutex);
+
+ /* First thread in performs the one-time drain of the on-disk queue. */
+ if (!zd->z_drained && !zd->z_draining) {
+ zd->z_draining = B_TRUE;
+ mutex_exit(&zd->z_mutex);
+ zfs_drain_dq(zfsvfs);
+ mutex_enter(&zd->z_mutex);
+ zd->z_draining = B_FALSE;
+ zd->z_drained = B_TRUE;
+ cv_broadcast(&zd->z_quiesce_cv);
+ }
+
+ while (zd->z_thread_count <= zd->z_thread_target) {
+ zp = list_head(&zd->z_znodes);
+ if (zp == NULL) {
+ ASSERT(zd->z_znode_count == 0);
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ cv_wait(&zd->z_cv, &zd->z_mutex);
+ CALLB_CPR_SAFE_END(&cprinfo, &zd->z_mutex);
+ continue;
+ }
+ ASSERT(zd->z_znode_count != 0);
+ list_remove(&zd->z_znodes, zp);
+ if (--zd->z_znode_count == 0)
+ cv_broadcast(&zd->z_quiesce_cv);
+ /* Drop the mutex for the expensive part. */
+ mutex_exit(&zd->z_mutex);
+ zfs_rmnode(zp);
+ /* Re-evaluate how many worker threads the workload needs. */
+ (void) zfs_delete_thread_target(zfsvfs, -1);
+ mutex_enter(&zd->z_mutex);
+ }
+
+ /* Last thread out wakes anyone waiting in zfs_delete_thread_target(0). */
+ ASSERT(zd->z_thread_count != 0);
+ if (--zd->z_thread_count == 0)
+ cv_broadcast(&zd->z_cv);
+
+ CALLB_CPR_EXIT(&cprinfo); /* NB: drops z_mutex */
+ thread_exit();
+}
+
+/* Tunable: one delete thread per 2^11 (2048) queued znodes. */
+static int zfs_work_per_thread_shift = 11; /* 2048 (2^11) per thread */
+
+/*
+ * Set the target number of delete threads to 'nthreads'.
+ * If nthreads == -1, choose a number based on current workload.
+ * If nthreads == 0, don't return until the threads have exited.
+ */
+int
+zfs_delete_thread_target(zfsvfs_t *zfsvfs, int nthreads)
+{
+ zfs_delete_t *zd = &zfsvfs->z_delete_head;
+
+ mutex_enter(&zd->z_mutex);
+
+ if (nthreads == -1) {
+ /* Target already forced to 0 (e.g. unmounting): refuse. */
+ if (zd->z_thread_target == 0) {
+ mutex_exit(&zd->z_mutex);
+ return (EBUSY);
+ }
+ /* Scale with queue depth, clamp to [1, 2*ncpus]. */
+ nthreads = zd->z_znode_count >> zfs_work_per_thread_shift;
+ nthreads = MIN(nthreads, ncpus << 1);
+ nthreads = MAX(nthreads, 1);
+ /* One extra thread while the on-disk queue is draining. */
+ nthreads += !!zd->z_draining;
+ }
+
+ zd->z_thread_target = nthreads;
+
+ /* Spin up workers until the count meets the target. */
+ while (zd->z_thread_count < zd->z_thread_target) {
+ (void) thread_create(NULL, 0, zfs_delete_thread, zfsvfs,
+ 0, &p0, TS_RUN, minclsyspri);
+ zd->z_thread_count++;
+ }
+
+ /* nthreads == 0: wait here until every worker has exited. */
+ while (zd->z_thread_count > zd->z_thread_target && nthreads == 0) {
+ cv_broadcast(&zd->z_cv);
+ cv_wait(&zd->z_cv, &zd->z_mutex);
+ }
+
+ mutex_exit(&zd->z_mutex);
+
+ return (0);
+}
+
+/*
+ * Wait until everything that's been queued has been deleted.
+ * Blocks until the initial drain has completed and the in-core
+ * list of znodes to reap is empty.
+ */
+void
+zfs_delete_wait_empty(zfsvfs_t *zfsvfs)
+{
+ zfs_delete_t *zd = &zfsvfs->z_delete_head;
+
+ mutex_enter(&zd->z_mutex);
+ /* Workers must exist, or nothing would ever wake us. */
+ ASSERT(zd->z_thread_target != 0);
+ while (!zd->z_drained || zd->z_znode_count != 0) {
+ ASSERT(zd->z_thread_target != 0);
+ cv_wait(&zd->z_quiesce_cv, &zd->z_mutex);
+ }
+ mutex_exit(&zd->z_mutex);
+}
+
+/*
+ * Physically reclaim a reaped znode: purge an xattr directory's
+ * contents, queue any xattr dir for deletion, remove the znode from
+ * the delete queue and free its object(s) - all in one transaction.
+ * On tx failure the znode is requeued to the in-core delete list.
+ */
+void
+zfs_rmnode(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os = zfsvfs->z_os;
+ znode_t *xzp = NULL;
+ char obj_name[17];
+ dmu_tx_t *tx;
+ uint64_t acl_obj;
+ int error;
+
+ ASSERT(zp->z_active == 0);
+ ASSERT(ZTOV(zp)->v_count == 0);
+ ASSERT(zp->z_phys->zp_links == 0);
+
+ /*
+ * If this is an attribute directory, purge its contents.
+ */
+ if (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_XATTR))
+ if (zfs_purgedir(zp) != 0) {
+ zfs_delete_t *delq = &zfsvfs->z_delete_head;
+ /*
+ * Add this back to the delete list to be retried later.
+ *
+ * XXX - this could just busy loop on us...
+ */
+ mutex_enter(&delq->z_mutex);
+ list_insert_tail(&delq->z_znodes, zp);
+ delq->z_znode_count++;
+ mutex_exit(&delq->z_mutex);
+ return;
+ }
+
+ /*
+ * If the file has extended attributes, unlink the xattr dir.
+ */
+ if (zp->z_phys->zp_xattr) {
+ error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
+ ASSERT(error == 0);
+ }
+
+ /* An externally-stored ACL object must be freed along with zp. */
+ acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
+
+ /*
+ * Set up the transaction.
+ */
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
+ dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, -1);
+ if (xzp) {
+ dmu_tx_hold_bonus(tx, xzp->z_id);
+ dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1);
+ }
+ if (acl_obj)
+ dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ zfs_delete_t *delq = &zfsvfs->z_delete_head;
+
+ dmu_tx_abort(tx);
+ /*
+ * Add this back to the delete list to be retried later.
+ *
+ * XXX - this could just busy loop on us...
+ */
+ mutex_enter(&delq->z_mutex);
+ list_insert_tail(&delq->z_znodes, zp);
+ delq->z_znode_count++;
+ mutex_exit(&delq->z_mutex);
+ return;
+ }
+
+ if (xzp) {
+ dmu_buf_will_dirty(xzp->z_dbuf, tx);
+ mutex_enter(&xzp->z_lock);
+ xzp->z_reap = 1; /* mark xzp for deletion */
+ xzp->z_phys->zp_links = 0; /* no more links to it */
+ mutex_exit(&xzp->z_lock);
+ zfs_dq_add(xzp, tx); /* add xzp to delete queue */
+ }
+
+ /*
+ * Remove this znode from delete queue
+ */
+ error = zap_remove(os, zfsvfs->z_dqueue,
+ zfs_dq_hexname(obj_name, zp->z_id), tx);
+ ASSERT3U(error, ==, 0);
+
+ zfs_znode_delete(zp, tx);
+
+ dmu_tx_commit(tx);
+
+ /* Release our zget hold; this kicks off xzp's own deletion. */
+ if (xzp)
+ VN_RELE(ZTOV(xzp));
+}
+
+/*
+ * Link zp into dl. Can only fail if zp has been reaped.
+ *
+ * Caller holds the dirlock and an assigned tx covering zp's and
+ * dzp's bonus buffers plus the directory zap. ZRENAMING skips the
+ * link-count bump since a rename moves an existing link.
+ */
+int
+zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
+{
+ znode_t *dzp = dl->dl_dzp;
+ vnode_t *vp = ZTOV(zp);
+ int zp_is_dir = (vp->v_type == VDIR);
+ int error;
+
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+ mutex_enter(&zp->z_lock);
+
+ if (!(flag & ZRENAMING)) {
+ if (zp->z_reap) { /* no new links to reaped zp */
+ ASSERT(!(flag & (ZNEW | ZEXISTS)));
+ mutex_exit(&zp->z_lock);
+ return (ENOENT);
+ }
+ zp->z_phys->zp_links++;
+ }
+ zp->z_phys->zp_parent = dzp->z_id; /* dzp is now zp's parent */
+
+ /* A brand-new znode's ctime was just set at creation; skip it. */
+ if (!(flag & ZNEW))
+ zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+ mutex_exit(&zp->z_lock);
+
+ dmu_buf_will_dirty(dzp->z_dbuf, tx);
+ mutex_enter(&dzp->z_lock);
+ dzp->z_phys->zp_size++; /* one dirent added */
+ dzp->z_phys->zp_links += zp_is_dir; /* ".." link from zp */
+ zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
+ mutex_exit(&dzp->z_lock);
+
+ /* Insert name -> object-id (one 8-byte int) into the directory zap. */
+ error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name,
+ 8, 1, &zp->z_id, tx);
+ ASSERT(error == 0);
+
+ return (0);
+}
+
+/*
+ * Unlink zp from dl, and mark zp for reaping if this was the last link.
+ * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
+ * If 'reaped_ptr' is NULL, we put reaped znodes on the delete queue.
+ * If it's non-NULL, we use it to indicate whether the znode needs reaping,
+ * and it's the caller's job to do it.
+ */
+int
+zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
+ int *reaped_ptr)
+{
+ znode_t *dzp = dl->dl_dzp;
+ vnode_t *vp = ZTOV(zp);
+ int zp_is_dir = (vp->v_type == VDIR);
+ int reaped = 0;
+ int error;
+
+ if (!(flag & ZRENAMING)) {
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+
+ if (vn_vfswlock(vp)) /* prevent new mounts on zp */
+ return (EBUSY);
+
+ if (vn_ismntpt(vp)) { /* don't remove mount point */
+ vn_vfsunlock(vp);
+ return (EBUSY);
+ }
+
+ mutex_enter(&zp->z_lock);
+ if (zp_is_dir && !zfs_dirempty(zp)) { /* dir not empty */
+ mutex_exit(&zp->z_lock);
+ vn_vfsunlock(vp);
+ return (EEXIST);
+ }
+ /* A dir's floor is zp_is_dir (its own "." self-link). */
+ ASSERT(zp->z_phys->zp_links > zp_is_dir);
+ if (--zp->z_phys->zp_links == zp_is_dir) {
+ /* Last link gone: mark for reaping, zero the count. */
+ zp->z_reap = 1;
+ zp->z_phys->zp_links = 0;
+ reaped = 1;
+ } else {
+ zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+ }
+ mutex_exit(&zp->z_lock);
+ vn_vfsunlock(vp);
+ }
+
+ dmu_buf_will_dirty(dzp->z_dbuf, tx);
+ mutex_enter(&dzp->z_lock);
+ dzp->z_phys->zp_size--; /* one dirent removed */
+ dzp->z_phys->zp_links -= zp_is_dir; /* ".." link from zp */
+ zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
+ mutex_exit(&dzp->z_lock);
+
+ error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, tx);
+ ASSERT(error == 0);
+
+ if (reaped_ptr != NULL)
+ *reaped_ptr = reaped;
+ else if (reaped)
+ zfs_dq_add(zp, tx);
+
+ return (0);
+}
+
+/*
+ * Indicate whether the directory is empty. Works with or without z_lock
+ * held, but can only be consider a hint in the latter case. Returns true
+ * if only "." and ".." remain and there's no work in progress.
+ */
+boolean_t
+zfs_dirempty(znode_t *dzp)
+{
+ /* zp_size == 2 is the empty-dir baseline; no dirlocks = no work. */
+ return (dzp->z_phys->zp_size == 2 && dzp->z_dirlocks == 0);
+}
+
+/*
+ * Create the extended attribute directory for zp with attributes 'vap'
+ * and return its held vnode in *xvpp. Requires ACE_WRITE_NAMED_ATTRS
+ * on zp. May return ERESTART from dmu_tx_assign() when z_assign is
+ * TXG_NOWAIT; the caller (zfs_get_xattrdir) retries in that case.
+ */
+int
+zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ znode_t *xzp;
+ dmu_tx_t *tx;
+ uint64_t xoid;
+ int error;
+
+ *xvpp = NULL;
+
+ if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, cr))
+ return (error);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 0);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ dmu_tx_abort(tx);
+ return (error);
+ }
+ zfs_mknode(zp, vap, &xoid, tx, cr, IS_XATTR, &xzp, 0);
+ ASSERT(xzp->z_id == xoid);
+ ASSERT(xzp->z_phys->zp_parent == zp->z_id);
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+ /* Wire the new dir into zp as its xattr directory. */
+ zp->z_phys->zp_xattr = xoid;
+
+ /* Log return value deliberately ignored (best-effort intent log). */
+ (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "");
+ dmu_tx_commit(tx);
+
+ *xvpp = ZTOV(xzp);
+
+ return (0);
+}
+
+/*
+ * Return a znode for the extended attribute directory for zp.
+ * ** If the directory does not already exist, it is created **
+ *
+ * IN: zp - znode to obtain attribute directory from
+ * cr - credentials of caller
+ *
+ * OUT: xzpp - pointer to extended attribute znode
+ *
+ * RETURN: 0 on success
+ * error number on failure
+ */
+int
+zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ znode_t *xzp;
+ zfs_dirlock_t *dl;
+ vattr_t va;
+ int error;
+top:
+ /* ZXATTR: lock zp's xattr-dir slot rather than a named entry. */
+ error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR);
+ if (error)
+ return (error);
+
+ /* Fast path: xattr dir already exists. */
+ if (xzp != NULL) {
+ *xvpp = ZTOV(xzp);
+ zfs_dirent_unlock(dl);
+ return (0);
+ }
+
+ ASSERT(zp->z_phys->zp_xattr == 0);
+
+ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+ zfs_dirent_unlock(dl);
+ return (EROFS);
+ }
+
+ /*
+ * The ability to 'create' files in an attribute
+ * directory comes from the write_xattr permission on the base file.
+ *
+ * The ability to 'search' an attribute directory requires
+ * read_xattr permission on the base file.
+ *
+ * Once in a directory the ability to read/write attributes
+ * is controlled by the permissions on the attribute file.
+ */
+ va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID;
+ va.va_type = VDIR;
+ va.va_mode = S_IFDIR | 0755;
+ va.va_uid = (uid_t)zp->z_phys->zp_uid;
+ va.va_gid = (gid_t)zp->z_phys->zp_gid;
+
+ error = zfs_make_xattrdir(zp, &va, xvpp, cr);
+ zfs_dirent_unlock(dl);
+
+ /* TXG_NOWAIT assign lost the race; wait for the next txg and retry. */
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+ goto top;
+ }
+
+ return (error);
+}
+
+/*
+ * Decide whether it is okay to remove within a sticky directory.
+ *
+ * In sticky directories, write access is not sufficient;
+ * you can remove entries from a directory only if:
+ *
+ * you own the directory,
+ * you own the entry,
+ * the entry is a plain file and you have write access,
+ * or you are privileged (checked in secpolicy...).
+ *
+ * The function returns 0 if remove access is granted.
+ */
+int
+zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
+{
+ uid_t uid;
+
+ /* During ZIL replay the original access check already passed. */
+ if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL) /* ZIL replay */
+ return (0);
+
+ if ((zdp->z_phys->zp_mode & S_ISVTX) == 0 ||
+ (uid = crgetuid(cr)) == zdp->z_phys->zp_uid ||
+ uid == zp->z_phys->zp_uid ||
+ (ZTOV(zp)->v_type == VREG &&
+ zfs_zaccess(zp, ACE_WRITE_DATA, cr) == 0))
+ return (0);
+ else
+ return (secpolicy_vnode_remove(cr));
+}
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
new file mode 100644
index 0000000000..e8723ffe89
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -0,0 +1,1323 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/buf.h>
+#include <sys/modctl.h>
+#include <sys/open.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/vdev.h>
+#include <sys/dmu.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <sys/policy.h>
+#include <sys/zone.h>
+#include <sys/nvpair.h>
+#include <sys/pathname.h>
+#include <sys/mount.h>
+#include <sys/sdt.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ctldir.h>
+
+#include "zfs_namecheck.h"
+
+extern struct modlfs zfs_modlfs;
+
+extern void zfs_init(void);
+extern void zfs_fini(void);
+
+/* LDI identity and devinfo for the zfs pseudo-device. */
+ldi_ident_t zfs_li = NULL;
+dev_info_t *zfs_dip;
+
+/* Handler for one ZFS_IOC_* command. */
+typedef int zfs_ioc_func_t(zfs_cmd_t *);
+/* Security check: (dataset name, property name, caller's cred). */
+typedef int zfs_secpolicy_func_t(const char *, const char *, cred_t *);
+
+/*
+ * One entry of the ioctl dispatch table: the handler, its security
+ * policy, and what kind of name (if any) zc_name must validate as.
+ */
+typedef struct zfs_ioc_vec {
+ zfs_ioc_func_t *zvec_func;
+ zfs_secpolicy_func_t *zvec_secpolicy;
+ enum {
+ no_name, /* zc_name unused */
+ pool_name, /* zc_name must be a valid pool name */
+ dataset_name /* zc_name must be a valid dataset name */
+ } zvec_namecheck;
+} zfs_ioc_vec_t;
+
+/* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */
+/*
+ * Back end for the dprintf() debug macro: format the message into a
+ * fixed 256-byte buffer (vsnprintf truncates longer messages) and fire
+ * the zfs-dprintf DTrace probe with file/func/line/message.
+ */
+void
+__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
+{
+ const char *newfile;
+ char buf[256];
+ va_list adx;
+
+ /*
+ * Get rid of annoying "../common/" prefix to filename.
+ */
+ newfile = strrchr(file, '/');
+ if (newfile != NULL) {
+ newfile = newfile + 1; /* Get rid of leading / */
+ } else {
+ newfile = file;
+ }
+
+ va_start(adx, fmt);
+ (void) vsnprintf(buf, sizeof (buf), fmt, adx);
+ va_end(adx);
+
+ /*
+ * To get this data, use the zfs-dprintf probe as so:
+ * dtrace -q -n 'zfs-dprintf \
+ * /stringof(arg0) == "dbuf.c"/ \
+ * {printf("%s: %s", stringof(arg1), stringof(arg3))}'
+ * arg0 = file name
+ * arg1 = function name
+ * arg2 = line number
+ * arg3 = message
+ */
+ DTRACE_PROBE4(zfs__dprintf,
+ char *, newfile, char *, func, int, line, char *, buf);
+}
+
+/*
+ * Policy for top-level read operations (list pools). Requires no privileges,
+ * and can be used in the local zone, as there is no associated dataset.
+ * Always grants access.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_none(const char *unused1, const char *unused2, cred_t *cr)
+{
+ return (0);
+}
+
+/*
+ * Policy for dataset read operations (list children, get statistics). Requires
+ * no privileges, but must be visible in the local zone.
+ * Returns ENOENT (not EPERM) for invisible datasets so callers can't
+ * distinguish "hidden" from "nonexistent".
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_read(const char *dataset, const char *unused, cred_t *cr)
+{
+ if (INGLOBALZONE(curproc) ||
+ zone_dataset_visible(dataset, NULL))
+ return (0);
+
+ return (ENOENT);
+}
+
+/*
+ * Common zone check for write-type policies: verify the dataset is
+ * visible (and writable) from the caller's zone and that its 'zoned'
+ * property is consistent with where the caller is running.
+ * Returns 0 if the zone check passes, else ENOENT/EPERM.
+ */
+static int
+zfs_dozonecheck(const char *dataset, cred_t *cr)
+{
+ uint64_t zoned;
+ int writable = 1;
+
+ /*
+ * The dataset must be visible by this zone -- check this first
+ * so they don't see EPERM on something they shouldn't know about.
+ */
+ if (!INGLOBALZONE(curproc) &&
+ !zone_dataset_visible(dataset, &writable))
+ return (ENOENT);
+
+ if (dsl_prop_get_integer(dataset, "zoned", &zoned, NULL))
+ return (ENOENT);
+
+ if (INGLOBALZONE(curproc)) {
+ /*
+ * If the fs is zoned, only root can access it from the
+ * global zone.
+ * (secpolicy_zfs() returns nonzero when the caller
+ * lacks the privilege.)
+ */
+ if (secpolicy_zfs(cr) && zoned)
+ return (EPERM);
+ } else {
+ /*
+ * If we are in a local zone, the 'zoned' property must be set.
+ */
+ if (!zoned)
+ return (EPERM);
+
+ /* must be writable by this zone */
+ if (!writable)
+ return (EPERM);
+ }
+ return (0);
+}
+
+/*
+ * Policy for dataset write operations (create children, set properties, etc).
+ * Requires SYS_MOUNT privilege, and must be writable in the local zone.
+ */
+/* ARGSUSED */
+int
+zfs_secpolicy_write(const char *dataset, const char *unused, cred_t *cr)
+{
+ int error;
+
+ if (error = zfs_dozonecheck(dataset, cr))
+ return (error);
+
+ return (secpolicy_zfs(cr));
+}
+
+/*
+ * Policy for operations that want to write a dataset's parent:
+ * create, destroy, snapshot, clone, restore.
+ * Strips the trailing "@snap" or "/child" component and applies the
+ * write policy to what remains; ENOENT if there is no parent.
+ */
+static int
+zfs_secpolicy_parent(const char *dataset, const char *unused, cred_t *cr)
+{
+ char parentname[MAXNAMELEN];
+ char *cp;
+
+ /*
+ * Remove the @bla or /bla from the end of the name to get the parent.
+ *
+ * NOTE(review): strncpy does not NUL-terminate when
+ * strlen(dataset) >= MAXNAMELEN - presumably zc_name length is
+ * validated before the secpolicy runs; confirm at the dispatch site.
+ */
+ (void) strncpy(parentname, dataset, sizeof (parentname));
+ cp = strrchr(parentname, '@');
+ if (cp != NULL) {
+ cp[0] = '\0';
+ } else {
+ cp = strrchr(parentname, '/');
+ if (cp == NULL)
+ return (ENOENT);
+ cp[0] = '\0';
+
+ }
+
+ return (zfs_secpolicy_write(parentname, unused, cr));
+}
+
+/*
+ * Policy for setting a dataset property. Same as zfs_secpolicy_write,
+ * except the 'zoned' property itself may only be changed from the
+ * global zone (a local zone could otherwise unzone itself).
+ */
+static int
+zfs_secpolicy_setprop(const char *dataset, const char *prop, cred_t *cr)
+{
+ int error;
+
+ if (error = zfs_dozonecheck(dataset, cr))
+ return (error);
+
+ if (strcmp(prop, "zoned") == 0) {
+ /*
+ * Disallow setting of 'zoned' from within a local zone.
+ */
+ if (!INGLOBALZONE(curproc))
+ return (EPERM);
+ }
+
+ return (secpolicy_zfs(cr));
+}
+
+/*
+ * Security policy for setting the quota. This is the same as
+ * zfs_secpolicy_write, except that the local zone may not change the quota at
+ * the zone-property setpoint.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_quota(const char *dataset, const char *unused, cred_t *cr)
+{
+ int error;
+
+ if (error = zfs_dozonecheck(dataset, cr))
+ return (error);
+
+ if (!INGLOBALZONE(curproc)) {
+ uint64_t zoned;
+ char setpoint[MAXNAMELEN];
+ int dslen;
+ /*
+ * Unprivileged users are allowed to modify the quota
+ * on things *under* (ie. contained by) the thing they
+ * own.
+ */
+ if (dsl_prop_get_integer(dataset, "zoned", &zoned, setpoint))
+ return (EPERM);
+ if (!zoned) /* this shouldn't happen */
+ return (EPERM);
+ dslen = strlen(dataset);
+ /*
+ * dataset must be strictly longer than the setpoint name,
+ * i.e. strictly below where 'zoned' was set.
+ */
+ if (dslen <= strlen(setpoint))
+ return (EPERM);
+ }
+
+ return (secpolicy_zfs(cr));
+}
+
+/*
+ * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires
+ * SYS_CONFIG privilege, which is not available in a local zone.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_config(const char *unused, const char *unused2, cred_t *cr)
+{
+ if (secpolicy_sys_config(cr, B_FALSE) != 0)
+ return (EPERM);
+
+ return (0);
+}
+
+/*
+ * Returns the nvlist as specified by the user in the zfs_cmd_t.
+ * Copies zc_config_src_size bytes from the user address zc_config_src
+ * and unpacks them. On success the caller owns *nvp and must
+ * nvlist_free() it.
+ */
+static int
+get_config(zfs_cmd_t *zc, nvlist_t **nvp)
+{
+ char *packed;
+ size_t size;
+ int error;
+ nvlist_t *config = NULL;
+
+ /*
+ * Read in and unpack the user-supplied nvlist. By this point, we know
+ * that the user has the SYS_CONFIG privilege, so allocating arbitrary
+ * sized regions of memory should not be a problem.
+ */
+ if ((size = zc->zc_config_src_size) == 0)
+ return (EINVAL);
+
+ packed = kmem_alloc(size, KM_SLEEP);
+
+ if ((error = xcopyin((void *)(uintptr_t)zc->zc_config_src, packed,
+ size)) != 0) {
+ kmem_free(packed, size);
+ return (error);
+ }
+
+ if ((error = nvlist_unpack(packed, size, &config, 0)) != 0) {
+ kmem_free(packed, size);
+ return (error);
+ }
+
+ /* The packed buffer is no longer needed once unpacked. */
+ kmem_free(packed, size);
+
+ *nvp = config;
+ return (0);
+}
+
+/*
+ * ZFS_IOC_POOL_CREATE: create a pool named zc_name from the user-supplied
+ * vdev config nvlist; zc_root, if non-empty, is the alternate root.
+ */
+static int
+zfs_ioc_pool_create(zfs_cmd_t *zc)
+{
+ int error;
+ nvlist_t *config;
+
+ if ((error = get_config(zc, &config)) != 0)
+ return (error);
+
+ error = spa_create(zc->zc_name, config, zc->zc_root[0] == '\0' ?
+ NULL : zc->zc_root);
+
+ nvlist_free(config);
+
+ return (error);
+}
+
+/* ZFS_IOC_POOL_DESTROY: destroy the pool named zc_name. */
+static int
+zfs_ioc_pool_destroy(zfs_cmd_t *zc)
+{
+ return (spa_destroy(zc->zc_name));
+}
+
+/*
+ * ZFS_IOC_POOL_IMPORT: import a pool from the user-supplied config.
+ * The config's pool guid must match zc_pool_guid, guarding against a
+ * stale or mismatched config from userland.
+ */
+static int
+zfs_ioc_pool_import(zfs_cmd_t *zc)
+{
+ int error;
+ nvlist_t *config;
+ uint64_t guid;
+
+ if ((error = get_config(zc, &config)) != 0)
+ return (error);
+
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
+ guid != zc->zc_pool_guid)
+ error = EINVAL;
+ else
+ error = spa_import(zc->zc_name, config,
+ zc->zc_root[0] == '\0' ? NULL : zc->zc_root);
+
+ nvlist_free(config);
+
+ return (error);
+}
+
+/* ZFS_IOC_POOL_EXPORT: export the pool named zc_name. */
+static int
+zfs_ioc_pool_export(zfs_cmd_t *zc)
+{
+ return (spa_export(zc->zc_name));
+}
+
+/*
+ * ZFS_IOC_POOL_CONFIGS: pack all pool configs (newer than the cookie in
+ * zc_cookie) into the user's buffer at zc_config_dst. EEXIST signals
+ * "nothing new". On ENOMEM, zc_config_dst_size is still set to the
+ * required size so userland can retry with a larger buffer.
+ */
+static int
+zfs_ioc_pool_configs(zfs_cmd_t *zc)
+{
+ nvlist_t *configs;
+ char *packed = NULL;
+ size_t size = 0;
+ int error;
+
+ if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL)
+ return (EEXIST);
+
+ VERIFY(nvlist_pack(configs, &packed, &size, NV_ENCODE_NATIVE, 0) == 0);
+
+ if (size > zc->zc_config_dst_size)
+ error = ENOMEM;
+ else
+ error = xcopyout(packed, (void *)(uintptr_t)zc->zc_config_dst,
+ size);
+
+ zc->zc_config_dst_size = size;
+
+ kmem_free(packed, size);
+ nvlist_free(configs);
+
+ return (error);
+}
+
+/* ZFS_IOC_POOL_GUID: return the guid of the pool zc_name in zc_pool_guid. */
+static int
+zfs_ioc_pool_guid(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error == 0) {
+ zc->zc_pool_guid = spa_guid(spa);
+ spa_close(spa, FTAG);
+ }
+ return (error);
+}
+
+/*
+ * ZFS_IOC_POOL_STATS: pack the named pool's config/stats nvlist into the
+ * user buffer. spa_get_stats() may hand back a config even on error
+ * (e.g. for a faulted pool), so the copyout happens whenever config is
+ * non-NULL; zc_config_dst_size is always updated for buffer resizing.
+ */
+static int
+zfs_ioc_pool_stats(zfs_cmd_t *zc)
+{
+ nvlist_t *config;
+ char *packed = NULL;
+ size_t size = 0;
+ int error;
+
+ error = spa_get_stats(zc->zc_name, &config);
+
+ if (config != NULL) {
+ VERIFY(nvlist_pack(config, &packed, &size,
+ NV_ENCODE_NATIVE, 0) == 0);
+
+ if (size > zc->zc_config_dst_size)
+ error = ENOMEM;
+ else if (xcopyout(packed, (void *)(uintptr_t)zc->zc_config_dst,
+ size))
+ error = EFAULT;
+
+ zc->zc_config_dst_size = size;
+
+ kmem_free(packed, size);
+ nvlist_free(config);
+ } else {
+ ASSERT(error != 0);
+ }
+
+ return (error);
+}
+
+/*
+ * Try to import the given pool, returning pool stats as appropriate so that
+ * user land knows which devices are available and overall pool health.
+ */
+static int
+zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
+{
+ nvlist_t *tryconfig, *config;
+ char *packed = NULL;
+ size_t size = 0;
+ int error;
+
+ if ((error = get_config(zc, &tryconfig)) != 0) /* unpack candidate config from user land */
+ return (error);
+
+ config = spa_tryimport(tryconfig); /* dry-run import; returns resulting config or NULL */
+
+ nvlist_free(tryconfig);
+
+ if (config == NULL)
+ return (EINVAL);
+
+ VERIFY(nvlist_pack(config, &packed, &size, NV_ENCODE_NATIVE, 0) == 0);
+
+ if (size > zc->zc_config_dst_size) /* caller's buffer too small */
+ error = ENOMEM;
+ else
+ error = xcopyout(packed, (void *)(uintptr_t)zc->zc_config_dst,
+ size);
+
+ zc->zc_config_dst_size = size; /* report actual size for retry */
+
+ kmem_free(packed, size);
+ nvlist_free(config);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_scrub(zfs_cmd_t *zc) /* ioctl: start a scrub; scrub type is in zc_cookie */
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error == 0) {
+ error = spa_scrub(spa, zc->zc_cookie, B_FALSE);
+ spa_close(spa, FTAG);
+ }
+ return (error);
+}
+
+static int
+zfs_ioc_pool_freeze(zfs_cmd_t *zc) /* ioctl: freeze the pool (test facility) */
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error == 0) {
+ spa_freeze(spa);
+ spa_close(spa, FTAG);
+ }
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_add(zfs_cmd_t *zc) /* ioctl: add the vdevs described in the packed config to the pool */
+{
+ spa_t *spa;
+ int error;
+ nvlist_t *config;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ if ((error = get_config(zc, &config)) == 0) {
+ error = spa_vdev_add(spa, config);
+ nvlist_free(config);
+ }
+
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_ioc_vdev_remove(zfs_cmd_t *zc) /* ioctl: vdev removal not yet implemented */
+{
+ return (ENOTSUP);
+}
+
+static int
+zfs_ioc_vdev_online(zfs_cmd_t *zc) /* ioctl: bring the vdev at zc_prop_value (device path) online */
+{
+ spa_t *spa;
+ char *path = zc->zc_prop_value;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+ error = vdev_online(spa, path);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_offline(zfs_cmd_t *zc) /* ioctl: take the vdev at zc_prop_value (device path) offline */
+{
+ spa_t *spa;
+ char *path = zc->zc_prop_value;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+ error = vdev_offline(spa, path);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_attach(zfs_cmd_t *zc) /* ioctl: attach new vdev (packed config) to existing one at path */
+{
+ spa_t *spa;
+ char *path = zc->zc_prop_value;
+ int replacing = zc->zc_cookie; /* nonzero: replace rather than mirror */
+ nvlist_t *config;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ if ((error = get_config(zc, &config)) == 0) {
+ error = spa_vdev_attach(spa, path, config, replacing);
+ nvlist_free(config);
+ }
+
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_detach(zfs_cmd_t *zc) /* ioctl: detach the vdev at zc_prop_value from its mirror */
+{
+ spa_t *spa;
+ char *path = zc->zc_prop_value;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = spa_vdev_detach(spa, path, 0, B_FALSE);
+
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_get_stats(zfs_cmd_t *zc) /* fill zc_zfs_stats with each DSL property's value and setpoint */
+{
+ char *name = zc->zc_name;
+ zfs_stats_t *zs = &zc->zc_zfs_stats;
+ int error;
+
+ bzero(zs, sizeof (zfs_stats_t));
+
+ if ((error = dsl_prop_get_integer(name, "atime", /* first failing lookup aborts the chain */
+ &zs->zs_atime, zs->zs_atime_setpoint)) != 0 ||
+ (error = dsl_prop_get_integer(name, "recordsize",
+ &zs->zs_recordsize, zs->zs_recordsize_setpoint)) != 0 ||
+ (error = dsl_prop_get_integer(name, "readonly",
+ &zs->zs_readonly, zs->zs_readonly_setpoint)) != 0 ||
+ (error = dsl_prop_get_integer(name, "devices",
+ &zs->zs_devices, zs->zs_devices_setpoint)) != 0 ||
+ (error = dsl_prop_get_integer(name, "setuid",
+ &zs->zs_setuid, zs->zs_setuid_setpoint)) != 0 ||
+ (error = dsl_prop_get_integer(name, "exec",
+ &zs->zs_exec, zs->zs_exec_setpoint)) != 0 ||
+ (error = dsl_prop_get_string(name, "mountpoint", zs->zs_mountpoint,
+ sizeof (zs->zs_mountpoint), zs->zs_mountpoint_setpoint)) != 0 ||
+ (error = dsl_prop_get_string(name, "sharenfs", zs->zs_sharenfs,
+ sizeof (zs->zs_sharenfs), zs->zs_sharenfs_setpoint)) != 0 ||
+ (error = dsl_prop_get_integer(name, "aclmode",
+ &zs->zs_acl_mode, zs->zs_acl_mode_setpoint)) != 0 ||
+ (error = dsl_prop_get_integer(name, "snapdir",
+ &zs->zs_snapdir, zs->zs_snapdir_setpoint)) != 0 ||
+ (error = dsl_prop_get_integer(name, "aclinherit",
+ &zs->zs_acl_inherit, zs->zs_acl_inherit_setpoint)) != 0)
+ return (error);
+
+ return (0);
+}
+
+static int
+zfs_ioc_objset_stats(zfs_cmd_t *zc) /* ioctl: fill zc_objset_stats (plus per-type stats) for a dataset */
+{
+ objset_t *os = NULL;
+ int error;
+
+retry:
+ error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &os);
+ if (error != 0) {
+ /*
+ * This is ugly: dmu_objset_open() can return EBUSY if
+ * the objset is held exclusively. Fortunately this hold is
+ * only for a short while, so we retry here.
+ * This avoids user code having to handle EBUSY,
+ * for example for a "zfs list".
+ */
+ if (error == EBUSY) {
+ delay(1); /* back off one tick before retrying */
+ goto retry;
+ }
+ return (error);
+ }
+
+ dmu_objset_stats(os, &zc->zc_objset_stats);
+
+ switch (zc->zc_objset_stats.dds_type) { /* no default: other types return the generic stats only */
+
+ case DMU_OST_ZFS:
+ error = zfs_get_stats(zc);
+ break;
+
+ case DMU_OST_ZVOL:
+ error = zvol_get_stats(zc, os);
+ break;
+ }
+
+ dmu_objset_close(os);
+ return (error);
+}
+
+static int
+zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
+{
+ dsl_dir_t *dd;
+ zap_cursor_t cursor;
+ zap_attribute_t attr;
+ int error;
+ char *p;
+
+ dd = dsl_dir_open(zc->zc_name, FTAG, NULL);
+ if (dd == NULL)
+ return (ESRCH);
+
+ if (dd->dd_phys->dd_child_dir_zapobj == 0) {
+ dsl_dir_close(dd, FTAG);
+ return (ESRCH);
+ }
+
+ p = strrchr(zc->zc_name, '/');
+ if (p == NULL || p[1] != '\0')
+ (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
+ p = zc->zc_name + strlen(zc->zc_name);
+
+ do {
+ zap_cursor_init_serialized(&cursor, dd->dd_pool->dp_meta_objset,
+ dd->dd_phys->dd_child_dir_zapobj, zc->zc_cookie);
+
+ error = zap_cursor_retrieve(&cursor, &attr);
+ if (error == ENOENT)
+ error = ESRCH;
+ if (error != 0) {
+ dsl_dir_close(dd, FTAG);
+ *p = '\0';
+ return (error);
+ }
+
+ (void) strlcpy(p, attr.za_name, sizeof (zc->zc_name) -
+ (p - zc->zc_name));
+
+ zap_cursor_advance(&cursor);
+ zc->zc_cookie = zap_cursor_serialize(&cursor);
+
+ } while (!INGLOBALZONE(curproc) &&
+ !zone_dataset_visible(zc->zc_name, NULL));
+
+ dsl_dir_close(dd, FTAG);
+
+ /*
+ * If it's a hidden dataset, don't try to get stats for it.
+ * User land will skip over it.
+ */
+ if (strchr(zc->zc_name, '$') != NULL)
+ return (0);
+
+ error = zfs_ioc_objset_stats(zc); /* will just fill in the stats */
+ return (error);
+}
+
+static int
+zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) /* ioctl: iterate snapshots of a dataset; ESRCH at end */
+{
+ zap_cursor_t cursor;
+ zap_attribute_t attr;
+ dsl_dataset_t *ds;
+ int error;
+
+retry:
+ error = dsl_dataset_open(zc->zc_name,
+ DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds);
+ if (error) {
+ /*
+ * This is ugly: dsl_dataset_open() can return EBUSY if
+ * the objset is held exclusively. Fortunately this hold is
+ * only for a short while, so we retry here.
+ * This avoids user code having to handle EBUSY,
+ * for example for a "zfs list -s".
+ */
+ if (error == EBUSY) {
+ delay(1); /* back off one tick before retrying */
+ goto retry;
+ }
+ if (error == ENOENT)
+ return (ESRCH); /* normalize "no such dataset" to end-of-list */
+ return (error);
+ }
+
+ /*
+ * If ds_snapnames_zapobj is 0, someone is trying to iterate over
+ * snapshots of a snapshot. In this case, pretend that it has no
+ * snapshots; otherwise zap_cursor_retrieve() will blow up.
+ */
+ if (ds->ds_phys->ds_snapnames_zapobj == 0) {
+ error = ESRCH;
+ goto out;
+ }
+
+ zap_cursor_init_serialized(&cursor,
+ ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_phys->ds_snapnames_zapobj, zc->zc_cookie); /* resume from caller's cookie */
+
+ error = zap_cursor_retrieve(&cursor, &attr);
+ if (error == ENOENT)
+ error = ESRCH; /* normalize end-of-iteration to ESRCH */
+ if (error != 0)
+ goto out;
+
+ if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= /* build "dataset@snap"; reject overflow */
+ sizeof (zc->zc_name) ||
+ strlcat(zc->zc_name, attr.za_name, sizeof (zc->zc_name)) >=
+ sizeof (zc->zc_name)) {
+ error = ENAMETOOLONG;
+ goto out;
+ }
+
+ zap_cursor_advance(&cursor);
+ zc->zc_cookie = zap_cursor_serialize(&cursor); /* hand next position back to user land */
+
+ error = zfs_ioc_objset_stats(zc); /* will just fill in the stats */
+
+out:
+ dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_set_prop(zfs_cmd_t *zc) /* ioctl: set a DSL property (name/size/count/value from zc) */
+{
+ return (dsl_prop_set(zc->zc_name, zc->zc_prop_name,
+ zc->zc_intsz, zc->zc_numints, zc->zc_prop_value));
+}
+
+static int
+zfs_ioc_set_quota(zfs_cmd_t *zc) /* ioctl: set dataset quota; new value in zc_cookie */
+{
+ return (dsl_dir_set_quota(zc->zc_name, zc->zc_cookie));
+}
+
+static int
+zfs_ioc_set_reservation(zfs_cmd_t *zc) /* ioctl: set dataset reservation; new value in zc_cookie */
+{
+ return (dsl_dir_set_reservation(zc->zc_name, zc->zc_cookie));
+}
+
+static int
+zfs_ioc_set_volsize(zfs_cmd_t *zc) /* ioctl: resize a zvol */
+{
+ return (zvol_set_volsize(zc));
+}
+
+static int
+zfs_ioc_set_volblocksize(zfs_cmd_t *zc) /* ioctl: change a zvol's block size */
+{
+ return (zvol_set_volblocksize(zc));
+}
+
+static int
+zfs_ioc_create_minor(zfs_cmd_t *zc) /* ioctl: create the /dev minor node for a zvol */
+{
+ return (zvol_create_minor(zc));
+}
+
+static int
+zfs_ioc_remove_minor(zfs_cmd_t *zc) /* ioctl: remove a zvol's /dev minor node */
+{
+ return (zvol_remove_minor(zc));
+}
+
+/*
+ * Search the vfs list for a specified resource. Returns a pointer to it
+ * or NULL if no suitable entry is found. The caller of this routine
+ * is responsible for releasing the returned vfs pointer.
+ */
+static vfs_t *
+zfs_get_vfs(const char *resource)
+{
+ struct vfs *vfsp;
+ struct vfs *vfs_found = NULL;
+
+ vfs_list_read_lock(); /* hold the list lock while walking the circular vfs list */
+ vfsp = rootvfs;
+ do {
+ if (strcmp(refstr_value(vfsp->vfs_resource), resource) == 0) {
+ VFS_HOLD(vfsp); /* take a hold the caller must VFS_RELE() */
+ vfs_found = vfsp;
+ break;
+ }
+ vfsp = vfsp->vfs_next;
+ } while (vfsp != rootvfs);
+ vfs_list_unlock();
+ return (vfs_found);
+}
+
+static void
+zfs_create_cb(objset_t *os, void *arg, dmu_tx_t *tx) /* dmu_objset_create() callback: set up a new ZPL fs */
+{
+ zfs_cmd_t *zc = arg;
+ zfs_create_fs(os, (cred_t *)(uintptr_t)zc->zc_cred, tx);
+}
+
+static int
+zfs_ioc_create(zfs_cmd_t *zc) /* ioctl: create a dataset, snapshot, or clone depending on inputs */
+{
+ objset_t *clone;
+ int error = 0;
+ void (*cbfunc)(objset_t *os, void *arg, dmu_tx_t *tx);
+ dmu_objset_type_t type = zc->zc_objset_type;
+
+ switch (type) { /* pick the per-type setup callback */
+
+ case DMU_OST_ZFS:
+ cbfunc = zfs_create_cb;
+ break;
+
+ case DMU_OST_ZVOL:
+ cbfunc = zvol_create_cb;
+ break;
+
+ default:
+ return (EINVAL);
+ }
+
+ if (zc->zc_filename[0] != '\0') {
+ /*
+ * We're creating a clone of an existing snapshot.
+ */
+ zc->zc_filename[sizeof (zc->zc_filename) - 1] = '\0'; /* guarantee NUL termination of user data */
+ if (dataset_namecheck(zc->zc_filename, NULL, NULL) != 0)
+ return (EINVAL);
+
+ error = dmu_objset_open(zc->zc_filename, type,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &clone);
+ if (error)
+ return (error);
+ error = dmu_objset_create(zc->zc_name, type, clone, NULL, NULL);
+ dmu_objset_close(clone);
+ } else if (strchr(zc->zc_name, '@') != 0) {
+ /*
+ * We're taking a snapshot of an existing dataset.
+ */
+ error = dmu_objset_create(zc->zc_name, type, NULL, NULL, NULL);
+ } else {
+ /*
+ * We're creating a new dataset.
+ */
+ if (type == DMU_OST_ZVOL) {
+ if ((error = zvol_check_volsize(zc)) != 0) /* validate size/blocksize before creating */
+ return (error);
+ if ((error = zvol_check_volblocksize(zc)) != 0)
+ return (error);
+ }
+ error = dmu_objset_create(zc->zc_name, type, NULL, cbfunc, zc);
+ }
+ return (error);
+}
+
+static int
+zfs_ioc_destroy(zfs_cmd_t *zc) /* ioctl: destroy a dataset; mounted snapshots are unmounted first */
+{
+ if (strchr(zc->zc_name, '@') != NULL &&
+ zc->zc_objset_type == DMU_OST_ZFS) {
+ vfs_t *vfsp;
+ int err;
+
+ /*
+ * Snapshots under .zfs control must be unmounted
+ * before they can be destroyed.
+ */
+ if ((vfsp = zfs_get_vfs(zc->zc_name)) != NULL) {
+ /*
+ * Always force the unmount for snapshots.
+ */
+ int flag = MS_FORCE;
+
+ if ((err = vn_vfswlock(vfsp->vfs_vnodecovered)) != 0) {
+ VFS_RELE(vfsp);
+ return (err);
+ }
+ VFS_RELE(vfsp); /* NOTE(review): vfsp is still passed to dounmount() after the release — presumably the vfswlock keeps it live; confirm */
+ if ((err = dounmount(vfsp, flag, kcred)) != 0)
+ return (err);
+ }
+ }
+
+ return (dmu_objset_destroy(zc->zc_name));
+}
+
+static int
+zfs_ioc_rollback(zfs_cmd_t *zc) /* ioctl: roll a dataset back to its most recent snapshot */
+{
+ return (dmu_objset_rollback(zc->zc_name));
+}
+
+static int
+zfs_ioc_rename(zfs_cmd_t *zc) /* ioctl: rename zc_name to zc_prop_value; mounted snapshots unmounted first */
+{
+ zc->zc_prop_value[sizeof (zc->zc_prop_value) - 1] = '\0'; /* guarantee NUL termination of user data */
+ if (dataset_namecheck(zc->zc_prop_value, NULL, NULL) != 0)
+ return (EINVAL);
+
+ if (strchr(zc->zc_name, '@') != NULL &&
+ zc->zc_objset_type == DMU_OST_ZFS) {
+ vfs_t *vfsp;
+ int err;
+
+ /*
+ * Snapshots under .zfs control must be unmounted
+ * before they can be renamed.
+ */
+ if ((vfsp = zfs_get_vfs(zc->zc_name)) != NULL) {
+ /*
+ * Always force the unmount for snapshots.
+ */
+ int flag = MS_FORCE;
+
+ if ((err = vn_vfswlock(vfsp->vfs_vnodecovered)) != 0) {
+ VFS_RELE(vfsp);
+ return (err);
+ }
+ VFS_RELE(vfsp); /* NOTE(review): vfsp passed to dounmount() after release — same pattern as zfs_ioc_destroy(); confirm lifetime */
+ if ((err = dounmount(vfsp, flag, kcred)) != 0)
+ return (err);
+ }
+ }
+
+ return (dmu_objset_rename(zc->zc_name, zc->zc_prop_value));
+}
+
+static int
+zfs_ioc_recvbackup(zfs_cmd_t *zc) /* ioctl: receive a backup stream from the fd in zc_cookie */
+{
+ file_t *fp;
+ int error, fd;
+
+ fd = zc->zc_cookie;
+ fp = getf(fd); /* hold the caller's file; released below */
+ if (fp == NULL)
+ return (EBADF);
+ error = dmu_recvbackup(&zc->zc_begin_record, &zc->zc_cookie,
+ fp->f_vnode, fp->f_offset);
+ releasef(fd);
+ return (error);
+}
+
+static int
+zfs_ioc_sendbackup(zfs_cmd_t *zc) /* ioctl: send a (possibly incremental) backup stream to a fd */
+{
+ objset_t *fromsnap = NULL;
+ objset_t *tosnap;
+ file_t *fp;
+ int error;
+
+ error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &tosnap);
+ if (error)
+ return (error);
+
+ if (zc->zc_prop_value[0] != '\0') { /* non-empty: incremental send from this snapshot */
+ error = dmu_objset_open(zc->zc_prop_value, DMU_OST_ANY,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &fromsnap);
+ if (error) {
+ dmu_objset_close(tosnap);
+ return (error);
+ }
+ }
+
+ fp = getf(zc->zc_cookie); /* destination fd */
+ if (fp == NULL) {
+ dmu_objset_close(tosnap);
+ if (fromsnap)
+ dmu_objset_close(fromsnap);
+ return (EBADF);
+ }
+
+ error = dmu_sendbackup(tosnap, fromsnap, fp->f_vnode);
+
+ releasef(zc->zc_cookie);
+ if (fromsnap)
+ dmu_objset_close(fromsnap);
+ dmu_objset_close(tosnap);
+ return (error);
+}
+
+static zfs_ioc_vec_t zfs_ioc_vec[] = { /* indexed by (cmd - ZFS_IOC); order must match the ZFS_IOC_* command numbering */
+ { zfs_ioc_pool_create, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_destroy, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_import, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_export, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_configs, zfs_secpolicy_none, no_name },
+ { zfs_ioc_pool_guid, zfs_secpolicy_read, pool_name },
+ { zfs_ioc_pool_stats, zfs_secpolicy_read, pool_name },
+ { zfs_ioc_pool_tryimport, zfs_secpolicy_config, no_name },
+ { zfs_ioc_pool_scrub, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_freeze, zfs_secpolicy_config, no_name },
+ { zfs_ioc_vdev_add, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_vdev_remove, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_vdev_online, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_vdev_offline, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_vdev_attach, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_vdev_detach, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_objset_stats, zfs_secpolicy_read, dataset_name },
+ { zfs_ioc_dataset_list_next, zfs_secpolicy_read, dataset_name },
+ { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, dataset_name },
+ { zfs_ioc_set_prop, zfs_secpolicy_setprop, dataset_name },
+ { zfs_ioc_set_quota, zfs_secpolicy_quota, dataset_name },
+ { zfs_ioc_set_reservation, zfs_secpolicy_write, dataset_name },
+ { zfs_ioc_set_volsize, zfs_secpolicy_config, dataset_name },
+ { zfs_ioc_set_volblocksize, zfs_secpolicy_config, dataset_name },
+ { zfs_ioc_create_minor, zfs_secpolicy_config, dataset_name },
+ { zfs_ioc_remove_minor, zfs_secpolicy_config, dataset_name },
+ { zfs_ioc_create, zfs_secpolicy_parent, dataset_name },
+ { zfs_ioc_destroy, zfs_secpolicy_parent, dataset_name },
+ { zfs_ioc_rollback, zfs_secpolicy_write, dataset_name },
+ { zfs_ioc_rename, zfs_secpolicy_write, dataset_name },
+ { zfs_ioc_recvbackup, zfs_secpolicy_write, dataset_name },
+ { zfs_ioc_sendbackup, zfs_secpolicy_write, dataset_name },
+};
+
+static int
+zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) /* ioctl dispatcher for /dev/zfs (minor 0); zvols go to zvol_ioctl() */
+{
+ zfs_cmd_t *zc;
+ uint_t vec;
+ int error;
+
+ if (getminor(dev) != 0)
+ return (zvol_ioctl(dev, cmd, arg, flag, cr, rvalp));
+
+ vec = cmd - ZFS_IOC; /* unsigned: commands below ZFS_IOC wrap and fail the bound check */
+
+ if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
+ return (EINVAL);
+
+ zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+
+ error = xcopyin((void *)arg, zc, sizeof (zfs_cmd_t));
+
+ if (error == 0) { /* security policy check before anything else touches the request */
+ zc->zc_cred = (uintptr_t)cr;
+ zc->zc_dev = dev;
+ error = zfs_ioc_vec[vec].zvec_secpolicy(zc->zc_name,
+ zc->zc_prop_name, cr);
+ }
+
+ /*
+ * Ensure that all pool/dataset names are valid before we pass down to
+ * the lower layers.
+ */
+ if (error == 0) {
+ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; /* guarantee NUL termination of user data */
+ switch (zfs_ioc_vec[vec].zvec_namecheck) {
+ case pool_name:
+ if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
+ error = EINVAL;
+ break;
+
+ case dataset_name:
+ if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
+ error = EINVAL;
+ break;
+ }
+ }
+
+ if (error == 0)
+ error = zfs_ioc_vec[vec].zvec_func(zc);
+
+ if (error == 0 || error == ENOMEM) { /* ENOMEM still copies out so callers learn the required size */
+ int rc = xcopyout(zc, (void *)arg, sizeof (zfs_cmd_t));
+ if (error == 0)
+ error = rc;
+ }
+
+ kmem_free(zc, sizeof (zfs_cmd_t));
+ return (error);
+}
+
+static int
+zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) /* DDI attach: create the /dev/zfs control node (minor 0) */
+{
+ if (cmd != DDI_ATTACH)
+ return (DDI_FAILURE);
+
+ if (ddi_create_minor_node(dip, "zfs", S_IFCHR, 0,
+ DDI_PSEUDO, 0) == DDI_FAILURE)
+ return (DDI_FAILURE);
+
+ zfs_dip = dip; /* cached for zfs_info() lookups */
+
+ ddi_report_dev(dip);
+
+ return (DDI_SUCCESS);
+}
+
+static int
+zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) /* DDI detach: refuse while any pool, fs, or zvol is busy */
+{
+ if (spa_busy() || zfs_busy() || zvol_busy())
+ return (DDI_FAILURE);
+
+ if (cmd != DDI_DETACH)
+ return (DDI_FAILURE);
+
+ zfs_dip = NULL;
+
+ ddi_prop_remove_all(dip);
+ ddi_remove_minor_node(dip, NULL); /* NULL removes all minor nodes for this dip */
+
+ return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+static int
+zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) /* DDI getinfo: map dev_t to devinfo/instance */
+{
+ switch (infocmd) {
+ case DDI_INFO_DEVT2DEVINFO:
+ *result = zfs_dip;
+ return (DDI_SUCCESS);
+
+ case DDI_INFO_DEVT2INSTANCE:
+ *result = (void *)(uintptr_t)getminor((dev_t)arg);
+ return (DDI_SUCCESS);
+ }
+
+ return (DDI_FAILURE);
+}
+
+/*
+ * OK, so this is a little weird.
+ *
+ * /dev/zfs is the control node, i.e. minor 0.
+ * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0.
+ *
+ * /dev/zfs has basically nothing to do except serve up ioctls,
+ * so most of the standard driver entry points are in zvol.c.
+ */
+static struct cb_ops zfs_cb_ops = {
+ zvol_open, /* open */
+ zvol_close, /* close */
+ zvol_strategy, /* strategy */
+ nodev, /* print */
+ nodev, /* dump */
+ zvol_read, /* read */
+ zvol_write, /* write */
+ zfsdev_ioctl, /* ioctl */
+ nodev, /* devmap */
+ nodev, /* mmap */
+ nodev, /* segmap */
+ nochpoll, /* poll */
+ ddi_prop_op, /* prop_op */
+ NULL, /* streamtab */
+ D_NEW | D_MP | D_64BIT, /* Driver compatibility flag */
+ CB_REV, /* version */
+ zvol_aread, /* async read */
+ zvol_awrite, /* async write */
+};
+
+static struct dev_ops zfs_dev_ops = { /* driver entry points for the zfs pseudo device */
+ DEVO_REV, /* version */
+ 0, /* refcnt */
+ zfs_info, /* info */
+ nulldev, /* identify */
+ nulldev, /* probe */
+ zfs_attach, /* attach */
+ zfs_detach, /* detach */
+ nodev, /* reset */
+ &zfs_cb_ops, /* driver operations */
+ NULL /* no bus operations */
+};
+
+static struct modldrv zfs_modldrv = { /* driver half of the module linkage */
+ &mod_driverops, "ZFS storage pool version 1", &zfs_dev_ops
+};
+
+static struct modlinkage modlinkage = { /* this module is both a filesystem (zfs_modlfs) and a driver */
+ MODREV_1,
+ (void *)&zfs_modlfs,
+ (void *)&zfs_modldrv,
+ NULL
+};
+
+int
+_init(void) /* module load: install linkage, then bring up spa/zfs/zvol subsystems */
+{
+ int error;
+
+ if ((error = mod_install(&modlinkage)) != 0)
+ return (error);
+
+ error = ldi_ident_from_mod(&modlinkage, &zfs_li);
+ ASSERT(error == 0); /* NOTE(review): only checked on DEBUG kernels; failure is ignored otherwise */
+
+ spa_init(FREAD | FWRITE);
+ zfs_init();
+ zvol_init();
+
+ return (0);
+}
+
+int
+_fini(void) /* module unload: refuse while busy, then tear down in reverse order of _init() */
+{
+ int error;
+
+ if (spa_busy() || zfs_busy() || zvol_busy())
+ return (EBUSY);
+
+ if ((error = mod_remove(&modlinkage)) != 0)
+ return (error);
+
+ zvol_fini();
+ zfs_fini();
+ spa_fini();
+
+ ldi_ident_release(zfs_li);
+ zfs_li = NULL;
+
+ return (error);
+}
+
+int
+_info(struct modinfo *modinfop) /* module info entry point */
+{
+ return (mod_info(&modlinkage, modinfop));
+}
diff --git a/usr/src/uts/common/fs/zfs/zfs_log.c b/usr/src/uts/common/fs/zfs/zfs_log.c
new file mode 100644
index 0000000000..dbfd87f67a
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_log.c
@@ -0,0 +1,337 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/thread.h>
+#include <sys/file.h>
+#include <sys/vfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_dir.h>
+#include <sys/zil.h>
+#include <sys/byteorder.h>
+#include <sys/policy.h>
+#include <sys/stat.h>
+#include <sys/mode.h>
+#include <sys/acl.h>
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/ddi.h>
+
+/*
+ * All the functions in this file are used to construct the log entries
+ * to record transactions. They allocate an intent log transaction
+ * structure (itx_t) and save within it all the information necessary to
+ * possibly replay the transaction. The itx is then assigned a sequence
+ * number and inserted in the in-memory list anchored in the zilog.
+ */
+
+/*
+ * zfs_log_create() is used to handle TX_CREATE, TX_MKDIR and TX_MKXATTR
+ * transactions.
+ */
+uint64_t
+zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype, /* log a create; returns assigned itx sequence (0 if no zilog) */
+ znode_t *dzp, znode_t *zp, char *name)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_create_t *lr;
+ size_t namesize = strlen(name) + 1; /* include the terminating NUL in the record */
+
+ if (zilog == NULL)
+ return (0);
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
+ lr = (lr_create_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id; /* directory object id */
+ lr->lr_foid = zp->z_id; /* new file object id */
+ lr->lr_mode = zp->z_phys->zp_mode;
+ lr->lr_uid = zp->z_phys->zp_uid;
+ lr->lr_gid = zp->z_phys->zp_gid;
+ lr->lr_gen = zp->z_phys->zp_gen;
+ lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
+ lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
+ lr->lr_rdev = zp->z_phys->zp_rdev;
+ bcopy(name, (char *)(lr + 1), namesize); /* name is appended after the fixed record */
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ dzp->z_last_itx = seq; /* remember last itx on both znodes for replay ordering */
+ zp->z_last_itx = seq;
+ return (seq);
+}
+
+/*
+ * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions.
+ */
+uint64_t
+zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype, /* returns assigned itx sequence (0 if no zilog) */
+ znode_t *dzp, char *name)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_remove_t *lr;
+ size_t namesize = strlen(name) + 1; /* include the terminating NUL */
+
+ if (zilog == NULL)
+ return (0);
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
+ lr = (lr_remove_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id; /* directory object id */
+ bcopy(name, (char *)(lr + 1), namesize); /* name is appended after the fixed record */
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ dzp->z_last_itx = seq;
+ return (seq);
+}
+
+/*
+ * zfs_log_link() handles TX_LINK transactions.
+ */
+uint64_t
+zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype, /* returns assigned itx sequence (0 if no zilog) */
+ znode_t *dzp, znode_t *zp, char *name)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_link_t *lr;
+ size_t namesize = strlen(name) + 1; /* include the terminating NUL */
+
+ if (zilog == NULL)
+ return (0);
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
+ lr = (lr_link_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id; /* directory object id */
+ lr->lr_link_obj = zp->z_id; /* object being linked */
+ bcopy(name, (char *)(lr + 1), namesize); /* link name appended after the fixed record */
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ dzp->z_last_itx = seq;
+ zp->z_last_itx = seq;
+ return (seq);
+}
+
+/*
+ * zfs_log_symlink() handles TX_SYMLINK transactions.
+ */
+uint64_t
+zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype, /* returns assigned itx sequence (0 if no zilog) */
+ znode_t *dzp, znode_t *zp, char *name, char *link)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_create_t *lr;
+ size_t namesize = strlen(name) + 1; /* both strings carry their NUL */
+ size_t linksize = strlen(link) + 1;
+
+ if (zilog == NULL)
+ return (0);
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize);
+ lr = (lr_create_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id; /* directory object id */
+ lr->lr_foid = zp->z_id; /* new symlink object id */
+ lr->lr_mode = zp->z_phys->zp_mode;
+ lr->lr_uid = zp->z_phys->zp_uid;
+ lr->lr_gid = zp->z_phys->zp_gid;
+ lr->lr_gen = zp->z_phys->zp_gen;
+ lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
+ lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
+ bcopy(name, (char *)(lr + 1), namesize); /* layout: record, then name, then link target */
+ bcopy(link, (char *)(lr + 1) + namesize, linksize);
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ dzp->z_last_itx = seq;
+ zp->z_last_itx = seq;
+ return (seq);
+}
+
+/*
+ * zfs_log_rename() handles TX_RENAME transactions.
+ */
+uint64_t
+zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype, /* returns assigned itx sequence (0 if no zilog) */
+ znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_rename_t *lr;
+ size_t snamesize = strlen(sname) + 1; /* both names carry their NUL */
+ size_t dnamesize = strlen(dname) + 1;
+
+ if (zilog == NULL)
+ return (0);
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
+ lr = (lr_rename_t *)&itx->itx_lr;
+ lr->lr_sdoid = sdzp->z_id; /* source directory object id */
+ lr->lr_tdoid = tdzp->z_id; /* target directory object id */
+ bcopy(sname, (char *)(lr + 1), snamesize); /* layout: record, source name, dest name */
+ bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize);
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ sdzp->z_last_itx = seq; /* all three involved znodes record the sequence */
+ tdzp->z_last_itx = seq;
+ szp->z_last_itx = seq;
+ return (seq);
+}
+
+/*
+ * zfs_log_write() handles TX_WRITE transactions.
+ *
+ * We store data in the log buffers if it is small enough.
+ * Otherwise we flush the data out via dmu_sync().
+ */
+ssize_t zfs_immediate_write_sz = 65536; /* tunable: max write size stored inline in the log record */
+
+uint64_t
+zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, offset_t off, ssize_t len, int ioflag, uio_t *uio)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_write_t *lr;
+ int dlen, err;
+
+ if (zilog == NULL || zp->z_reap) /* skip logging if znode is being reaped */
+ return (0);
+
+ dlen = (len <= zfs_immediate_write_sz ? len : 0); /* 0 means data will come via dmu_sync() later */
+ itx = zil_itx_create(txtype, sizeof (*lr) + dlen);
+ itx->itx_data_copied = 0;
+ if ((ioflag & FDSYNC) && (dlen != 0)) { /* synchronous small write: try to capture data now */
+ err = xcopyin(uio->uio_iov->iov_base - len,
+ (char *)itx + offsetof(itx_t, itx_lr) + sizeof (*lr),
+ len);
+ /*
+ * copyin shouldn't fault as we've already successfully
+ * copied it to a dmu buffer. However if it does we'll get
+ * the data from the dmu later.
+ */
+ if (!err)
+ itx->itx_data_copied = 1;
+ }
+ lr = (lr_write_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ lr->lr_offset = off;
+ lr->lr_length = len;
+ lr->lr_blkoff = 0;
+ BP_ZERO(&lr->lr_blkptr); /* block pointer filled in later if data is dmu_sync'd */
+
+ itx->itx_private = zp->z_zfsvfs;
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ zp->z_last_itx = seq;
+ return (seq);
+}
+
+/*
+ * zfs_log_truncate() handles TX_TRUNCATE transactions.
+ */
+uint64_t
+zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, /* returns assigned itx sequence (0 if no zilog) */
+ znode_t *zp, uint64_t off, uint64_t len)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_truncate_t *lr;
+
+ if (zilog == NULL || zp->z_reap) /* skip logging if znode is being reaped */
+ return (0);
+
+ itx = zil_itx_create(txtype, sizeof (*lr));
+ lr = (lr_truncate_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ lr->lr_offset = off;
+ lr->lr_length = len;
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ zp->z_last_itx = seq;
+ return (seq);
+}
+
+/*
+ * zfs_log_setattr() handles TX_SETATTR transactions.
+ */
+uint64_t
+zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, /* returns assigned itx sequence (0 if no zilog) */
+ znode_t *zp, vattr_t *vap, uint_t mask_applied)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_setattr_t *lr;
+
+ if (zilog == NULL || zp->z_reap) /* skip logging if znode is being reaped */
+ return (0);
+
+ itx = zil_itx_create(txtype, sizeof (*lr));
+ lr = (lr_setattr_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ lr->lr_mask = (uint64_t)mask_applied; /* which attributes were actually applied */
+ lr->lr_mode = (uint64_t)vap->va_mode;
+ lr->lr_uid = (uint64_t)vap->va_uid;
+ lr->lr_gid = (uint64_t)vap->va_gid;
+ lr->lr_size = (uint64_t)vap->va_size;
+ ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime);
+ ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime);
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ zp->z_last_itx = seq;
+ return (seq);
+}
+
+/*
+ * zfs_log_acl() handles TX_ACL transactions.
+ */
+uint64_t
+zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, int txtype, /* returns assigned itx sequence (0 if no zilog) */
+ znode_t *zp, int aclcnt, ace_t *z_ace)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_acl_t *lr;
+
+ if (zilog == NULL || zp->z_reap) /* skip logging if znode is being reaped */
+ return (0);
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + aclcnt * sizeof (ace_t));
+ lr = (lr_acl_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ lr->lr_aclcnt = (uint64_t)aclcnt;
+ bcopy(z_ace, (ace_t *)(lr + 1), aclcnt * sizeof (ace_t)); /* ACEs appended after the fixed record */
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ zp->z_last_itx = seq;
+ return (seq);
+}
diff --git a/usr/src/uts/common/fs/zfs/zfs_replay.c b/usr/src/uts/common/fs/zfs/zfs_replay.c
new file mode 100644
index 0000000000..cd5a3848cb
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_replay.c
@@ -0,0 +1,337 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/thread.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <sys/vfs.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/spa.h>
+#include <sys/zil.h>
+#include <sys/byteorder.h>
+#include <sys/stat.h>
+#include <sys/mode.h>
+#include <sys/acl.h>
+#include <sys/atomic.h>
+#include <sys/cred.h>
+
+/*
+ * Functions to replay ZFS intent log (ZIL) records
+ * The functions are called through a function vector (zfs_replay_vector)
+ * which is indexed by the transaction type.
+ */
+
+/*
+ * Initialize a vattr_t from fields carried in an intent-log record,
+ * so a replayed transaction can be pushed through the generic VOP
+ * interfaces (VOP_CREATE, VOP_MKDIR, VOP_SETATTR, ...).
+ */
+static void
+zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
+	uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
+{
+	bzero(vap, sizeof (*vap));
+	vap->va_mask = (uint_t)mask;
+	vap->va_type = IFTOVT(mode);	/* vnode type is encoded in the mode */
+	vap->va_mode = mode & MODEMASK;
+	vap->va_uid = (uid_t)uid;
+	vap->va_gid = (gid_t)gid;
+	vap->va_rdev = (dev_t)rdev;
+	vap->va_nodeid = nodeid;
+}
+
+/*
+ * Catch-all replay handler: slot 0 of zfs_replay_vector (no such
+ * transaction type) resolves here and fails with ENOTSUP.
+ */
+/* ARGSUSED */
+static int
+zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap)
+{
+	return (ENOTSUP);
+}
+
+/*
+ * Replay a TX_CREATE, TX_MKDIR, TX_MKXATTR or TX_SYMLINK record by
+ * re-issuing the corresponding VOP against the parent directory
+ * (lr_doid).  The new object's name is stored directly after the
+ * fixed-size record; for symlinks the link target follows the name.
+ */
+static int
+zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
+{
+	char *name = (char *)(lr + 1);	/* name follows lr_create_t */
+	char *link;			/* symlink content follows name */
+	znode_t *dzp;
+	vnode_t *vp = NULL;
+	vattr_t va;
+	int error;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	/* Look up the parent directory znode. */
+	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+		return (error);
+
+	zfs_init_vattr(&va, AT_TYPE | AT_MODE | AT_UID | AT_GID,
+	    lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);
+
+	/*
+	 * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
+	 * eventually end up in zfs_mknode(), which assigns the object's
+	 * creation time and generation number.  The generic VOP_CREATE()
+	 * doesn't have either concept, so we smuggle the values inside
+	 * the vattr's otherwise unused va_ctime and va_nblocks fields.
+	 */
+	ZFS_TIME_DECODE(&va.va_ctime, lr->lr_crtime);
+	va.va_nblocks = lr->lr_gen;
+
+	switch ((int)lr->lr_common.lrc_txtype) {
+	case TX_CREATE:
+		error = VOP_CREATE(ZTOV(dzp), name, &va, 0, 0, &vp, kcred, 0);
+		break;
+	case TX_MKDIR:
+		error = VOP_MKDIR(ZTOV(dzp), name, &va, &vp, kcred);
+		break;
+	case TX_MKXATTR:
+		error = zfs_make_xattrdir(dzp, &va, &vp, kcred);
+		break;
+	case TX_SYMLINK:
+		link = name + strlen(name) + 1;
+		error = VOP_SYMLINK(ZTOV(dzp), name, &va, link, kcred);
+		break;
+	default:
+		error = ENOTSUP;
+	}
+
+	/* Drop the hold the successful VOP returned on the new vnode. */
+	if (error == 0 && vp != NULL)
+		VN_RELE(vp);
+
+	VN_RELE(ZTOV(dzp));
+
+	return (error);
+}
+
+/*
+ * Replay a TX_REMOVE or TX_RMDIR record: remove the named entry
+ * (stored after the fixed record) from parent directory lr_doid.
+ */
+static int
+zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap)
+{
+	char *name = (char *)(lr + 1);	/* name follows lr_remove_t */
+	znode_t *dzp;
+	int error;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+		return (error);
+
+	switch ((int)lr->lr_common.lrc_txtype) {
+	case TX_REMOVE:
+		error = VOP_REMOVE(ZTOV(dzp), name, kcred);
+		break;
+	case TX_RMDIR:
+		error = VOP_RMDIR(ZTOV(dzp), name, NULL, kcred);
+		break;
+	default:
+		error = ENOTSUP;
+	}
+
+	VN_RELE(ZTOV(dzp));
+
+	return (error);
+}
+
+/*
+ * Replay a TX_LINK record: create a hard link named `name' in directory
+ * lr_doid pointing at object lr_link_obj.
+ */
+static int
+zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap)
+{
+	char *name = (char *)(lr + 1);	/* name follows lr_link_t */
+	znode_t *dzp, *zp;
+	int error;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	/* Directory to link into. */
+	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+		return (error);
+
+	/* Object being linked. */
+	if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) {
+		VN_RELE(ZTOV(dzp));
+		return (error);
+	}
+
+	error = VOP_LINK(ZTOV(dzp), ZTOV(zp), name, kcred);
+
+	VN_RELE(ZTOV(zp));
+	VN_RELE(ZTOV(dzp));
+
+	return (error);
+}
+
+/*
+ * Replay a TX_RENAME record.  The source and target names are stored
+ * back-to-back (NUL separated) after the fixed lr_rename_t record.
+ */
+static int
+zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap)
+{
+	char *sname = (char *)(lr + 1);	/* sname and tname follow lr_rename_t */
+	char *tname = sname + strlen(sname) + 1;
+	znode_t *sdzp, *tdzp;
+	int error;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	/* Source directory. */
+	if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0)
+		return (error);
+
+	/* Target directory. */
+	if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) {
+		VN_RELE(ZTOV(sdzp));
+		return (error);
+	}
+
+	error = VOP_RENAME(ZTOV(sdzp), sname, ZTOV(tdzp), tname, kcred);
+
+	VN_RELE(ZTOV(tdzp));
+	VN_RELE(ZTOV(sdzp));
+
+	return (error);
+}
+
+/*
+ * Replay a TX_WRITE record: rewrite lr_length bytes (stored after the
+ * fixed record) at lr_offset of object lr_foid via vn_rdwr().
+ */
+static int
+zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
+{
+	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
+	znode_t *zp;
+	int error;
+	ssize_t resid;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+		return (error);
+
+	/*
+	 * NOTE(review): resid (bytes not transferred) is not examined;
+	 * a short write would be reported as success — confirm intended.
+	 */
+	error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length,
+	    lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
+
+	VN_RELE(ZTOV(zp));
+
+	return (error);
+}
+
+/*
+ * Replay a TX_TRUNCATE record by issuing an F_FREESP space-free over
+ * [lr_offset, lr_offset + lr_length) through VOP_SPACE().
+ */
+static int
+zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap)
+{
+	znode_t *zp;
+	flock64_t fl;
+	int error;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+		return (error);
+
+	/* Describe the region to free as a write-lock flock64_t. */
+	bzero(&fl, sizeof (fl));
+	fl.l_type = F_WRLCK;
+	fl.l_whence = 0;
+	fl.l_start = lr->lr_offset;
+	fl.l_len = lr->lr_length;
+
+	error = VOP_SPACE(ZTOV(zp), F_FREESP, &fl, FWRITE | FOFFMAX,
+	    lr->lr_offset, kcred, NULL);
+
+	VN_RELE(ZTOV(zp));
+
+	return (error);
+}
+
+/*
+ * Replay a TX_SETATTR record: rebuild the vattr from the logged fields
+ * (mask, mode, uid, gid, size, atime, mtime) and re-apply it with
+ * VOP_SETATTR().
+ */
+static int
+zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap)
+{
+	znode_t *zp;
+	vattr_t va;
+	int error;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+		return (error);
+
+	zfs_init_vattr(&va, lr->lr_mask, lr->lr_mode,
+	    lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);
+
+	va.va_size = lr->lr_size;
+	ZFS_TIME_DECODE(&va.va_atime, lr->lr_atime);
+	ZFS_TIME_DECODE(&va.va_mtime, lr->lr_mtime);
+
+	error = VOP_SETATTR(ZTOV(zp), &va, 0, kcred, NULL);
+
+	VN_RELE(ZTOV(zp));
+
+	return (error);
+}
+
+/*
+ * Replay a TX_ACL record: re-apply the logged ACE array (stored after
+ * the fixed lr_acl_t record) to object lr_foid via VOP_SETSECATTR().
+ */
+static int
+zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap)
+{
+	ace_t *ace = (ace_t *)(lr + 1);	/* ace array follows lr_acl_t */
+	vsecattr_t vsa;
+	znode_t *zp;
+	int error;
+
+	if (byteswap) {
+		/* Both the fixed record and the trailing ACEs need swapping. */
+		byteswap_uint64_array(lr, sizeof (*lr));
+		zfs_ace_byteswap(ace, lr->lr_aclcnt);
+	}
+
+	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+		return (error);
+
+	bzero(&vsa, sizeof (vsa));
+	vsa.vsa_mask = VSA_ACE | VSA_ACECNT;
+	vsa.vsa_aclcnt = lr->lr_aclcnt;
+	vsa.vsa_aclentp = ace;
+
+	error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred);
+
+	VN_RELE(ZTOV(zp));
+
+	return (error);
+}
+
+/*
+ * Callback vectors for replaying records.
+ *
+ * Indexed by transaction type; the entry order must match the TX_*
+ * numbering, with slot 0 reserved for "no such transaction type".
+ */
+zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
+	zfs_replay_error,	/* 0 no such transaction type */
+	zfs_replay_create,	/* TX_CREATE */
+	zfs_replay_create,	/* TX_MKDIR */
+	zfs_replay_create,	/* TX_MKXATTR */
+	zfs_replay_create,	/* TX_SYMLINK */
+	zfs_replay_remove,	/* TX_REMOVE */
+	zfs_replay_remove,	/* TX_RMDIR */
+	zfs_replay_link,	/* TX_LINK */
+	zfs_replay_rename,	/* TX_RENAME */
+	zfs_replay_write,	/* TX_WRITE */
+	zfs_replay_truncate,	/* TX_TRUNCATE */
+	zfs_replay_setattr,	/* TX_SETATTR */
+	zfs_replay_acl,		/* TX_ACL */
+};
diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
new file mode 100644
index 0000000000..502bcf39bf
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
@@ -0,0 +1,1072 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/pathname.h>
+#include <sys/acl.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/mntent.h>
+#include <sys/mount.h>
+#include <sys/cmn_err.h>
+#include "fs/fs_subr.h"
+#include <sys/zfs_znode.h>
+#include <sys/zil.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dsl_prop.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/varargs.h>
+#include <sys/policy.h>
+#include <sys/atomic.h>
+#include <sys/mkdev.h>
+#include <sys/modctl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_ctldir.h>
+
+int zfsfstype;
+vfsops_t *zfs_vfsops = NULL;
+static major_t zfs_major;
+static minor_t zfs_minor;
+static kmutex_t zfs_dev_mtx;	/* serializes zfs_major/zfs_minor updates */
+
+static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
+static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
+static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
+static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp);
+static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp);
+static void zfs_freevfs(vfs_t *vfsp);
+static void zfs_objset_close(zfsvfs_t *zfsvfs);
+
+/* VFS operations for a normally mounted zfs filesystem. */
+static const fs_operation_def_t zfs_vfsops_template[] = {
+	VFSNAME_MOUNT, zfs_mount,
+	VFSNAME_UNMOUNT, zfs_umount,
+	VFSNAME_ROOT, zfs_root,
+	VFSNAME_STATVFS, zfs_statvfs,
+	VFSNAME_SYNC, (fs_generic_func_p) zfs_sync,
+	VFSNAME_VGET, zfs_vget,
+	VFSNAME_FREEVFS, (fs_generic_func_p) zfs_freevfs,
+	NULL, NULL
+};
+
+/*
+ * Reduced template used after forced unmount: only freevfs remains,
+ * everything else falls back to the default (erroring) behavior.
+ */
+static const fs_operation_def_t zfs_vfsops_eio_template[] = {
+	VFSNAME_FREEVFS, (fs_generic_func_p) zfs_freevfs,
+	NULL, NULL
+};
+
+/*
+ * We need to keep a count of active fs's.
+ * This is necessary to prevent our module
+ * from being unloaded after a umount -f
+ */
+static uint32_t zfs_active_fs_count = 0;
+
+/* Mutually exclusive mount options: setting one cancels the other. */
+static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
+static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };
+
+static mntopt_t mntopts[] = {
+	{ MNTOPT_XATTR, NULL, NULL, MO_NODISPLAY|MO_DEFAULT, NULL },
+	{ MNTOPT_NOATIME, noatime_cancel, NULL, MO_DEFAULT, NULL },
+	{ MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
+};
+
+static mntopts_t zfs_mntopts = {
+	sizeof (mntopts) / sizeof (mntopt_t),
+	mntopts
+};
+
+/*
+ * VFS sync entry point.  With a specific vfsp, commit that filesystem's
+ * intent log (or wait for txg sync when there is no ZIL); with a NULL
+ * vfsp, ask every pool to sync all dirty data.  Always returns 0.
+ */
+/*ARGSUSED*/
+int
+zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
+{
+	/*
+	 * Data integrity is job one.  We don't want a compromised kernel
+	 * writing to the storage pool, so we never sync during panic.
+	 */
+	if (panicstr)
+		return (0);
+
+	/*
+	 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS
+	 * to sync metadata, which they would otherwise cache indefinitely.
+	 * Semantically, the only requirement is that the sync be initiated.
+	 * The DMU syncs out txgs frequently, so there's nothing to do.
+	 */
+	if (flag & SYNC_ATTR)
+		return (0);
+
+	if (vfsp != NULL) {
+		/*
+		 * Sync a specific filesystem.
+		 */
+		zfsvfs_t *zfsvfs = vfsp->vfs_data;
+
+		ZFS_ENTER(zfsvfs);
+		if (zfsvfs->z_log != NULL)
+			zil_commit(zfsvfs->z_log, UINT64_MAX, FSYNC);
+		else
+			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+		ZFS_EXIT(zfsvfs);
+	} else {
+		/*
+		 * Sync all ZFS filesystems.  This is what happens when you
+		 * run sync(1M).  Unlike other filesystems, ZFS honors the
+		 * request by waiting for all pools to commit all dirty data.
+		 */
+		spa_sync_allpools();
+	}
+
+	return (0);
+}
+
+/*
+ * dsl property callback, registered for "atime" in zfs_mount():
+ * keeps the cached z_atime flag and the displayed atime/noatime
+ * mount options in sync with the property value.
+ */
+static void
+atime_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	if (newval == TRUE) {
+		zfsvfs->z_atime = TRUE;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
+	} else {
+		zfsvfs->z_atime = FALSE;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
+	}
+}
+
+/*
+ * dsl property callback for "recordsize": clamp out-of-range or
+ * non-power-of-two values to SPA_MAXBLOCKSIZE, then update both the
+ * cached max blocksize and the vfs block size.
+ */
+static void
+blksz_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	if (newval < SPA_MINBLOCKSIZE ||
+	    newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
+		newval = SPA_MAXBLOCKSIZE;
+
+	zfsvfs->z_max_blksz = newval;
+	zfsvfs->z_vfs->vfs_bsize = newval;
+}
+
+/*
+ * dsl property callback for "readonly": toggle VFS_RDONLY and the
+ * ro/rw mount options, and stop (0) or start (1) the delete thread
+ * accordingly.
+ */
+static void
+readonly_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	if (newval) {
+		/* XXX locking on vfs_flag? */
+		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
+		(void) zfs_delete_thread_target(zfsvfs, 0);
+	} else {
+		/* XXX locking on vfs_flag? */
+		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
+		(void) zfs_delete_thread_target(zfsvfs, 1);
+	}
+}
+
+/*
+ * dsl property callback for "devices": toggle VFS_NODEVICES and the
+ * devices/nodevices mount options.
+ */
+static void
+devices_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	if (newval == FALSE) {
+		zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0);
+	} else {
+		zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0);
+	}
+}
+
+/*
+ * dsl property callback for "setuid": toggle VFS_NOSETUID and the
+ * setuid/nosetuid mount options.
+ */
+static void
+setuid_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	if (newval == FALSE) {
+		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
+	} else {
+		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
+	}
+}
+
+/*
+ * dsl property callback for "exec": toggle VFS_NOEXEC and the
+ * exec/noexec mount options.
+ */
+static void
+exec_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	if (newval == FALSE) {
+		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
+	} else {
+		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
+	}
+}
+
+/* dsl property callback for "snapdir": cache .zfs visibility setting. */
+static void
+snapdir_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	zfsvfs->z_show_ctldir = newval;
+}
+
+/* dsl property callback for "aclmode": cache the new value. */
+static void
+acl_mode_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	zfsvfs->z_acl_mode = newval;
+}
+
+/* dsl property callback for "aclinherit": cache the new value. */
+static void
+acl_inherit_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	zfsvfs->z_acl_inherit = newval;
+}
+
+/*
+ * VFS mount entry point.
+ *
+ * Handles three cases: a remount (refresh temporary properties from the
+ * current VFS options and return), a snapshot mount (forced read-only,
+ * no ZIL replay), and a normal dataset mount (open the objset, replay
+ * the intent log, register dsl property callbacks, and re-apply any
+ * temporary mount-option overrides the registration clobbered).
+ */
+/*ARGSUSED*/
+static int
+zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = NULL;
+	znode_t *zp = NULL;
+	vnode_t *vp = NULL;
+	objset_t *os = NULL;
+	struct dsl_dataset *ds;
+	char *osname;
+	uint64_t readonly, recordsize;
+	pathname_t spn;
+	dev_t mount_dev;
+	major_t new_major;
+	int mode;
+	int error = 0;
+	uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ?
+	    UIO_SYSSPACE : UIO_USERSPACE;
+	int canwrite;
+
+	/* The mount point must be a directory. */
+	if (mvp->v_type != VDIR)
+		return (ENOTDIR);
+
+	/*
+	 * Unless this is a remount or an overlay mount, the mount point
+	 * must not be busy (held or a filesystem root).
+	 */
+	mutex_enter(&mvp->v_lock);
+	if ((uap->flags & MS_REMOUNT) == 0 &&
+	    (uap->flags & MS_OVERLAY) == 0 &&
+	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
+		mutex_exit(&mvp->v_lock);
+		return (EBUSY);
+	}
+	mutex_exit(&mvp->v_lock);
+
+	/*
+	 * ZFS does not support passing unparsed data in via MS_DATA.
+	 * Users should use the MS_OPTIONSTR interface; this means
+	 * that all option parsing is already done and the options struct
+	 * can be interrogated.
+	 */
+	if ((uap->flags & MS_DATA) && uap->datalen > 0)
+		return (EINVAL);
+
+	/*
+	 * When doing a remount, we simply refresh our temporary properties
+	 * according to those options set in the current VFS options.
+	 */
+	if (uap->flags & MS_REMOUNT) {
+		zfsvfs = vfsp->vfs_data;
+
+		if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
+			readonly_changed_cb(zfsvfs, B_TRUE);
+		else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
+			/* A snapshot can never be remounted read-write. */
+			if (dmu_objset_is_snapshot(zfsvfs->z_os))
+				return (EROFS);
+			readonly_changed_cb(zfsvfs, B_FALSE);
+		}
+
+		/* "nosuid" implies both nodevices and nosetuid. */
+		if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
+			devices_changed_cb(zfsvfs, B_FALSE);
+			setuid_changed_cb(zfsvfs, B_FALSE);
+		} else {
+			if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
+				devices_changed_cb(zfsvfs, B_FALSE);
+			else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
+				devices_changed_cb(zfsvfs, B_TRUE);
+
+			if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
+				setuid_changed_cb(zfsvfs, B_FALSE);
+			else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
+				setuid_changed_cb(zfsvfs, B_TRUE);
+		}
+
+		if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
+			exec_changed_cb(zfsvfs, B_FALSE);
+		else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
+			exec_changed_cb(zfsvfs, B_TRUE);
+
+		return (0);
+	}
+
+	/*
+	 * Get the objset name (the "special" mount argument).
+	 */
+	if (error = pn_get(uap->spec, fromspace, &spn))
+		return (error);
+
+	osname = spn.pn_path;
+
+	if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
+		goto out;
+
+	/*
+	 * Refuse to mount a filesystem if we are in a local zone and the
+	 * dataset is not visible.
+	 */
+	if (!INGLOBALZONE(curproc) &&
+	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
+		error = EPERM;
+		goto out;
+	}
+
+	/*
+	 * Initialize the zfs-specific filesystem structure.
+	 * Should probably make this a kmem cache, shuffle fields,
+	 * and just bzero up to z_hold_mtx[].
+	 */
+	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+	zfsvfs->z_vfs = vfsp;
+	zfsvfs->z_parent = zfsvfs;
+	zfsvfs->z_assign = TXG_NOWAIT;
+	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
+	zfsvfs->z_show_ctldir = VISIBLE;
+
+	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
+	    offsetof(znode_t, z_link_node));
+	rw_init(&zfsvfs->z_um_lock, NULL, RW_DEFAULT, NULL);
+
+	/*
+	 * Initialize the generic filesystem structure.
+	 */
+	vfsp->vfs_bcount = 0;
+	vfsp->vfs_data = NULL;
+
+	/*
+	 * Create a unique device for the mount: scan minors under the
+	 * current major until an unused device is found; if the minor
+	 * space is exhausted, allocate a fresh major and start over.
+	 */
+	do {
+		ASSERT3U(zfs_minor, <=, MAXMIN32);
+		/*
+		 * NOTE(review): zfs_minor is read here without holding
+		 * zfs_dev_mtx (it is only held for the updates below) —
+		 * confirm this race is benign.
+		 */
+		int start = zfs_minor;
+		do {
+			mutex_enter(&zfs_dev_mtx);
+			zfs_minor++;
+			if (zfs_minor > MAXMIN32)
+				zfs_minor = 0;
+			mount_dev = makedevice(zfs_major, zfs_minor);
+			mutex_exit(&zfs_dev_mtx);
+		} while (vfs_devismounted(mount_dev) && zfs_minor != start);
+		if (zfs_minor == start) {
+			/*
+			 * We are using all ~262,000 minor numbers
+			 * for the current major number.  Create a
+			 * new major number.
+			 */
+			if ((new_major = getudev()) == (major_t)-1) {
+				cmn_err(CE_WARN,
+				    "zfs_mount: Can't get unique"
+				    " major device number.");
+				/*
+				 * NOTE(review): error is still 0 here, so
+				 * this goto takes the success path at out:
+				 * (including VN_RELE of a NULL vp) — looks
+				 * like it should set an errno; confirm.
+				 */
+				goto out;
+			}
+			mutex_enter(&zfs_dev_mtx);
+			zfs_major = new_major;
+			zfs_minor = 0;
+			mutex_exit(&zfs_dev_mtx);
+		} else {
+			break;
+		}
+		/* CONSTANTCONDITION */
+	} while (1);
+
+	ASSERT(vfs_devismounted(mount_dev) == 0);
+
+	/* Fall back to the maximum block size if the property is unreadable. */
+	if (dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL) != 0)
+		recordsize = SPA_MAXBLOCKSIZE;
+
+	vfsp->vfs_dev = mount_dev;
+	vfsp->vfs_fstype = zfsfstype;
+	vfsp->vfs_bsize = recordsize;
+	vfsp->vfs_flag |= VFS_NOTRUNC;
+	vfsp->vfs_data = zfsvfs;
+
+	error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL);
+	if (error)
+		goto out;
+
+	if (readonly)
+		mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
+	else
+		mode = DS_MODE_PRIMARY;
+
+	/* Open the objset; retry read-only if a writable open fails EROFS. */
+	error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
+	if (error == EROFS) {
+		mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
+		error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
+		    &zfsvfs->z_os);
+	}
+	os = zfsvfs->z_os;
+
+	if (error)
+		goto out;
+
+	if (error = zfs_init_fs(zfsvfs, &zp, cr))
+		goto out;
+
+	if (dmu_objset_is_snapshot(os)) {
+		/* Snapshots are always read-only, no atime, no ZIL. */
+		ASSERT(mode & DS_MODE_READONLY);
+		atime_changed_cb(zfsvfs, B_FALSE);
+		readonly_changed_cb(zfsvfs, B_TRUE);
+		zfsvfs->z_issnap = B_TRUE;
+	} else {
+		int do_readonly = FALSE, readonly;
+		int do_setuid = FALSE, setuid;
+		int do_exec = FALSE, exec;
+		int do_devices = FALSE, devices;
+
+		/*
+		 * Start a delete thread running.
+		 */
+		(void) zfs_delete_thread_target(zfsvfs, 1);
+
+		/*
+		 * Parse and replay the intent log.
+		 */
+		zil_replay(os, zfsvfs, &zfsvfs->z_assign, zfs_replay_vector,
+		    (void (*)(void *))zfs_delete_wait_empty);
+
+		if (!zil_disable)
+			zfsvfs->z_log = zil_open(os, zfs_get_data);
+
+		/*
+		 * The act of registering our callbacks will destroy any mount
+		 * options we may have.  In order to enable temporary overrides
+		 * of mount options, we stash away the current values and
+		 * restore them after we register the callbacks.
+		 */
+		if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
+			readonly = B_TRUE;
+			do_readonly = B_TRUE;
+		} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
+			readonly = B_FALSE;
+			do_readonly = B_TRUE;
+		}
+		if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
+			devices = B_FALSE;
+			setuid = B_FALSE;
+			do_devices = B_TRUE;
+			do_setuid = B_TRUE;
+		} else {
+			if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
+				devices = B_FALSE;
+				do_devices = B_TRUE;
+			} else if (vfs_optionisset(vfsp,
+			    MNTOPT_DEVICES, NULL)) {
+				devices = B_TRUE;
+				do_devices = B_TRUE;
+			}
+
+			if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
+				setuid = B_FALSE;
+				do_setuid = B_TRUE;
+			} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
+				setuid = B_TRUE;
+				do_setuid = B_TRUE;
+			}
+		}
+		if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
+			exec = B_FALSE;
+			do_exec = B_TRUE;
+		} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
+			exec = B_TRUE;
+			do_exec = B_TRUE;
+		}
+
+		/*
+		 * Register property callbacks.
+		 */
+		ds = dmu_objset_ds(os);
+		VERIFY(dsl_prop_register(ds, "atime", atime_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_register(ds, "recordsize", blksz_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_register(ds, "readonly", readonly_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_register(ds, "devices", devices_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_register(ds, "setuid", setuid_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_register(ds, "exec", exec_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_register(ds, "snapdir", snapdir_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_register(ds, "aclmode", acl_mode_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_register(ds, "aclinherit",
+		    acl_inherit_changed_cb, zfsvfs) == 0);
+
+
+		/*
+		 * Invoke our callbacks to restore temporary mount options.
+		 */
+		if (do_readonly)
+			readonly_changed_cb(zfsvfs, readonly);
+		if (do_setuid)
+			setuid_changed_cb(zfsvfs, setuid);
+		if (do_exec)
+			exec_changed_cb(zfsvfs, exec);
+		if (do_devices)
+			devices_changed_cb(zfsvfs, devices);
+	}
+
+	vp = ZTOV(zp);
+	/* Create the '.zfs' control directory for non-snapshot mounts. */
+	if (!zfsvfs->z_issnap)
+		zfsctl_create(zfsvfs);
+out:
+	if (error) {
+		if (zp)
+			VN_RELE(vp);
+
+		if (zfsvfs) {
+			if (os)
+				dmu_objset_close(os);
+			kmem_free(zfsvfs, sizeof (zfsvfs_t));
+		}
+	} else {
+		atomic_add_32(&zfs_active_fs_count, 1);
+		VN_RELE(vp);
+	}
+
+	pn_free(&spn);
+	return (error);
+}
+
+/*
+ * VFS statvfs entry point: fill in statvfs64 from the objset statistics,
+ * reporting sizes in terms of f_frsize (the smallest supported block).
+ */
+static int
+zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	dmu_objset_stats_t dstats;
+	dev32_t d32;
+
+	ZFS_ENTER(zfsvfs);
+
+	dmu_objset_stats(zfsvfs->z_os, &dstats);
+
+	/*
+	 * The underlying storage pool actually uses multiple block sizes.
+	 * We report the fragsize as the smallest block size we support,
+	 * and we report our blocksize as the filesystem's maximum blocksize.
+	 */
+	statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT;
+	statp->f_bsize = zfsvfs->z_max_blksz;
+
+	/*
+	 * The following report "total" blocks of various kinds in the
+	 * file system, but reported in terms of f_frsize - the
+	 * "fragment" size.
+	 */
+
+	statp->f_blocks =
+	    (dstats.dds_space_refd + dstats.dds_available) >> SPA_MINBLOCKSHIFT;
+	statp->f_bfree = dstats.dds_available >> SPA_MINBLOCKSHIFT;
+	statp->f_bavail = statp->f_bfree; /* no root reservation */
+
+	/*
+	 * statvfs() should really be called statufs(), because it assumes
+	 * static metadata.  ZFS doesn't preallocate files, so the best
+	 * we can do is report the max that could possibly fit in f_files,
+	 * and that minus the number actually used in f_ffree.
+	 * For f_ffree, report the smaller of the number of object available
+	 * and the number of blocks (each object will take at least a block).
+	 */
+	statp->f_ffree = MIN(dstats.dds_objects_avail, statp->f_bfree);
+	statp->f_favail = statp->f_ffree;	/* no "root reservation" */
+	statp->f_files = statp->f_ffree + dstats.dds_objects_used;
+
+	(void) cmpldev(&d32, vfsp->vfs_dev);
+	statp->f_fsid = d32;
+
+	/*
+	 * We're a zfs filesystem.
+	 */
+	(void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name);
+
+	statp->f_flag = 0;
+
+	statp->f_namemax = ZFS_MAXNAMELEN;
+
+	/*
+	 * We have all of 32 characters to stuff a string here.
+	 * Is there anything useful we could/should provide?
+	 */
+	bzero(statp->f_fstr, sizeof (statp->f_fstr));
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * VFS root entry point: return a held vnode for the filesystem's root
+ * znode (z_root).  *vpp is set only on success.
+ */
+static int
+zfs_root(vfs_t *vfsp, vnode_t **vpp)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	znode_t *rootzp;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+
+	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
+	if (error == 0)
+		*vpp = ZTOV(rootzp);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * VFS unmount entry point.
+ *
+ * Forced unmount (MS_FORCE) marks the vfs unmounted, waits for all
+ * in-flight zfs threads to drain (z_op_cnt), and closes the objset.
+ * A normal unmount first flushes state, stops the delete threads, and
+ * refuses (EBUSY) if any vnodes beyond the expected holds are active.
+ */
+/*ARGSUSED*/
+static int
+zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	int ret;
+
+	if ((ret = secpolicy_fs_unmount(cr, vfsp)) != 0)
+		return (ret);
+
+	/*
+	 * Unmount any snapshots mounted under .zfs before unmounting the
+	 * dataset itself.
+	 */
+	if (zfsvfs->z_ctldir != NULL &&
+	    (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
+		return (ret);
+
+	if (fflag & MS_FORCE) {
+		vfsp->vfs_flag |= VFS_UNMOUNTED;
+		zfsvfs->z_unmounted1 = B_TRUE;
+
+		/*
+		 * Wait for all zfs threads to leave zfs.
+		 * Grabbing a rwlock as reader in all vops and
+		 * as writer here doesn't work because it too easy to get
+		 * multiple reader enters as zfs can re-enter itself.
+		 * This can lead to deadlock if there is an intervening
+		 * rw_enter as writer.
+		 * So a file system threads ref count (z_op_cnt) is used.
+		 * A polling loop on z_op_cnt may seem inefficient, but
+		 * - this saves all threads on exit from having to grab a
+		 * mutex in order to cv_signal
+		 * - only occurs on forced unmount in the rare case when
+		 * there are outstanding threads within the file system.
+		 */
+		while (zfsvfs->z_op_cnt) {
+			delay(1);
+		}
+
+		zfs_objset_close(zfsvfs);
+
+		return (0);
+	}
+
+	zfs_zcache_flush(zfsvfs);
+
+	/*
+	 * Stop all delete threads.
+	 */
+	(void) zfs_delete_thread_target(zfsvfs, 0);
+
+	/*
+	 * Check the number of active vnodes in the file system.
+	 * Our count is maintained in the vfs structure, but the number
+	 * is off by 1 to indicate a hold on the vfs structure itself.
+	 *
+	 * The '.zfs' directory maintains a reference of its own, and any active
+	 * references underneath are reflected in the vnode count.
+	 */
+	if (zfsvfs->z_ctldir == NULL) {
+		if (vfsp->vfs_count > 1) {
+			/* Busy: restart the delete thread if we stopped it. */
+			if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0)
+				(void) zfs_delete_thread_target(zfsvfs, 1);
+			return (EBUSY);
+		}
+	} else {
+		if (vfsp->vfs_count > 2 ||
+		    (zfsvfs->z_ctldir->v_count > 1 && !(fflag & MS_FORCE))) {
+			if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0)
+				(void) zfs_delete_thread_target(zfsvfs, 1);
+			return (EBUSY);
+		}
+	}
+
+	vfsp->vfs_flag |= VFS_UNMOUNTED;
+	zfs_objset_close(zfsvfs);
+
+	/*
+	 * We can now safely destroy the '.zfs' directory node, which will
+	 * release its hold on the vfs_t.
+	 */
+	if (zfsvfs->z_ctldir != NULL)
+		zfsctl_destroy(zfsvfs);
+
+	return (0);
+}
+
+/*
+ * VFS vget entry point: translate an NFS-style file identifier into a
+ * held vnode.  Long fids carry an objset id (for snapshots under .zfs)
+ * and redirect to that snapshot's zfsvfs; short fids identify an object
+ * plus a generation number which must match the znode's.
+ */
+static int
+zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	znode_t *zp;
+	uint64_t object = 0;
+	uint64_t fid_gen = 0;
+	uint64_t gen_mask;
+	uint64_t zp_gen;
+	int i, err;
+
+	*vpp = NULL;
+
+	ZFS_ENTER(zfsvfs);
+
+	if (fidp->fid_len == LONG_FID_LEN) {
+		zfid_long_t *zlfid = (zfid_long_t *)fidp;
+		uint64_t objsetid = 0;
+		uint64_t setgen = 0;
+
+		/* Decode the little-endian byte arrays into integers. */
+		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
+
+		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
+
+		ZFS_EXIT(zfsvfs);
+
+		/* Switch to the snapshot's own zfsvfs before continuing. */
+		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
+		if (err)
+			return (EINVAL);
+		ZFS_ENTER(zfsvfs);
+	}
+
+	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
+		zfid_short_t *zfid = (zfid_short_t *)fidp;
+
+		for (i = 0; i < sizeof (zfid->zf_object); i++)
+			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
+
+		for (i = 0; i < sizeof (zfid->zf_gen); i++)
+			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
+	} else {
+		ZFS_EXIT(zfsvfs);
+		return (EINVAL);
+	}
+
+	/* A zero fid_gen means we are in the .zfs control directories */
+	if (fid_gen == 0 &&
+	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
+		*vpp = zfsvfs->z_ctldir;
+		ASSERT(*vpp != NULL);
+		if (object == ZFSCTL_INO_SNAPDIR) {
+			VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
+			    0, NULL, NULL) == 0);
+		} else {
+			VN_HOLD(*vpp);
+		}
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/* Mask for valid generation bits; i is sizeof (zf_gen) here. */
+	gen_mask = -1ULL >> (64 - 8 * i);
+
+	/*
+	 * NOTE(review): fid_gen is uint64_t but printed with %u in both
+	 * dprintfs below — format/argument mismatch; confirm.
+	 */
+	dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
+	if (err = zfs_zget(zfsvfs, object, &zp)) {
+		ZFS_EXIT(zfsvfs);
+		return (err);
+	}
+	zp_gen = zp->z_phys->zp_gen & gen_mask;
+	if (zp_gen == 0)
+		zp_gen = 1;
+	/* Reject stale handles: reaped znode or generation mismatch. */
+	if (zp->z_reap || zp_gen != fid_gen) {
+		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
+		VN_RELE(ZTOV(zp));
+		ZFS_EXIT(zfsvfs);
+		return (EINVAL);
+	}
+
+	*vpp = ZTOV(zp);
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * Tear down all objset state at unmount time: stop the delete
+ * threads, drop every outstanding dbuf hold, unregister the dataset
+ * property callbacks, close the intent log, and finally close the
+ * objset itself.  Handles both normal and forced unmount.
+ */
+static void
+zfs_objset_close(zfsvfs_t *zfsvfs)
+{
+	zfs_delete_t *zd = &zfsvfs->z_delete_head;
+	znode_t *zp, *nextzp;
+	objset_t *os = zfsvfs->z_os;
+	struct dsl_dataset *ds;
+
+	/*
+	 * Stop all delete threads.
+	 */
+	(void) zfs_delete_thread_target(zfsvfs, 0);
+
+	/*
+	 * For forced unmount, at this point all vops except zfs_inactive
+	 * are erroring EIO. We need to now suspend zfs_inactive threads
+	 * while we are freeing dbufs before switching zfs_inactive
+	 * to use behaviour without a objset.
+	 */
+	rw_enter(&zfsvfs->z_um_lock, RW_WRITER);
+
+	zfs_zcache_flush(zfsvfs);
+
+	/*
+	 * Release all delete-in-progress znodes.
+	 * They will be processed when the file system remounts.
+	 */
+	mutex_enter(&zd->z_mutex);
+	while (zp = list_head(&zd->z_znodes)) {
+		list_remove(&zd->z_znodes, zp);
+		zp->z_dbuf_held = 0;
+		dmu_buf_rele(zp->z_dbuf);
+	}
+	mutex_exit(&zd->z_mutex);
+
+	/*
+	 * Release all holds on dbufs.
+	 * Note, although we have stopped all other vop threads and
+	 * zfs_inactive(), the dmu can callback via znode_pageout_func()
+	 * which can zfs_znode_free() the znode.
+	 * So we lock z_all_znodes; search the list for a held dbuf;
+	 * drop the lock (we know zp can't disappear while we hold a
+	 * dbuf lock); release the dbuf; then regrab the lock and
+	 * restart the scan from the head.
+	 */
+	mutex_enter(&zfsvfs->z_znodes_lock);
+	for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = nextzp) {
+		nextzp = list_next(&zfsvfs->z_all_znodes, zp);
+		if (zp->z_dbuf_held) {
+			/* dbufs should only be held when force unmounting */
+			zp->z_dbuf_held = 0;
+			mutex_exit(&zfsvfs->z_znodes_lock);
+			dmu_buf_rele(zp->z_dbuf);
+			/* Start again */
+			mutex_enter(&zfsvfs->z_znodes_lock);
+			nextzp = list_head(&zfsvfs->z_all_znodes);
+		}
+	}
+	mutex_exit(&zfsvfs->z_znodes_lock);
+
+	/*
+	 * Unregister properties.  Only datasets (not snapshots) have
+	 * property callbacks registered at mount time.
+	 */
+	if (!dmu_objset_is_snapshot(os)) {
+		ds = dmu_objset_ds(os);
+
+		VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_unregister(ds, "aclinherit",
+		    acl_inherit_changed_cb, zfsvfs) == 0);
+	}
+
+	/*
+	 * Make the dmu drop all its dbuf holds so that zfs_inactive
+	 * can then safely free znode/vnodes.
+	 */
+	txg_wait_synced(dmu_objset_pool(os), 0);
+
+	/*
+	 * Switch zfs_inactive to behaviour without an objset.
+	 * It just tosses cached pages and frees the znode & vnode.
+	 * Then re-enable zfs_inactive threads in that new behaviour.
+	 */
+	zfsvfs->z_unmounted2 = B_TRUE;
+	rw_exit(&zfsvfs->z_um_lock);	/* re-enable any zfs_inactive threads */
+
+	/*
+	 * Close the zil. Can't close the zil while zfs_inactive
+	 * threads are blocked as zil_close can call zfs_inactive.
+	 */
+	if (zfsvfs->z_log) {
+		zil_close(zfsvfs->z_log);
+		zfsvfs->z_log = NULL;
+	}
+
+	/*
+	 * Finally close the objset
+	 */
+	dmu_objset_close(os);
+
+}
+
+/*
+ * VFS_FREEVFS() entry point: drop the active-filesystem count
+ * consulted by zfs_busy() and release the per-mount zfsvfs_t
+ * allocated at mount time.
+ */
+static void
+zfs_freevfs(vfs_t *vfsp)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+
+	atomic_add_32(&zfs_active_fs_count, -1);
+	kmem_free(zfsvfs, sizeof (zfsvfs_t));
+}
+
+/*
+ * VFS_INIT() initialization.  Note that there is no VFS_FINI(),
+ * so we can't safely do any non-idempotent initialization here.
+ * Leave that to zfs_init() and zfs_fini(), which are called
+ * from the module's _init() and _fini() entry points.
+ *
+ * Registers the vfsops/vnodeops tables and reserves the device
+ * major number shared by all zfs mounts.
+ */
+/*ARGSUSED*/
+static int
+zfs_vfsinit(int fstype, char *name)
+{
+	int error;
+
+	zfsfstype = fstype;
+
+	/*
+	 * Setup vfsops and vnodeops tables.
+	 */
+	error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops);
+	if (error != 0) {
+		/*
+		 * Fail immediately: continuing with a bad vfsops
+		 * registration would leave the fs half-installed.
+		 */
+		cmn_err(CE_WARN, "zfs: bad vfs ops template");
+		return (error);
+	}
+
+	error = zfs_create_op_tables();
+	if (error) {
+		zfs_remove_op_tables();
+		cmn_err(CE_WARN, "zfs: bad vnode ops template");
+		(void) vfs_freevfsops_by_type(zfsfstype);
+		return (error);
+	}
+
+	mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+	/*
+	 * unique major number for all zfs mounts
+	 */
+	if ((zfs_major = getudev()) == (major_t)-1) {
+		cmn_err(CE_WARN,
+		    "zfs_vfsinit: Can't get unique device number.");
+		zfs_remove_op_tables();
+		(void) vfs_freevfsops_by_type(zfsfstype);
+		/*
+		 * 'error' is 0 at this point (all previous calls
+		 * succeeded), so return an explicit errno rather than
+		 * falsely reporting success.
+		 */
+		return (ENXIO);
+	}
+	zfs_minor = 0;
+
+	return (0);
+}
+
+/*
+ * Module-load initialization (called from _init()): set up the
+ * .zfs control directory structures, then the znode cache and
+ * vnode ops tables.
+ */
+void
+zfs_init(void)
+{
+	/* .zfs directory (snapshot) infrastructure first. */
+	zfsctl_init();
+
+	/* Then the znode cache, vnode ops, etc. */
+	zfs_znode_init();
+}
+
+/*
+ * Module-unload teardown (called from _fini()): undo zfs_init().
+ */
+void
+zfs_fini(void)
+{
+	zfsctl_fini();
+	zfs_znode_fini();
+}
+
+/*
+ * Report whether any zfs file systems are currently mounted, so the
+ * module framework can refuse to unload while the fs is in use.
+ */
+int
+zfs_busy(void)
+{
+	if (zfs_active_fs_count == 0)
+		return (0);
+	return (1);
+}
+
+/*
+ * File system registration table: version, fs name, init routine,
+ * capability flags, and the supported mount options.
+ */
+static vfsdef_t vfw = {
+	VFSDEF_VERSION,
+	MNTTYPE_ZFS,
+	zfs_vfsinit,
+	VSW_HASPROTO | VSW_CANRWRO | VSW_CANREMOUNT | VSW_VOLATILEDEV,
+	&zfs_mntopts
+};
+
+/*
+ * Module linkage information handed to the kernel module framework.
+ */
+struct modlfs zfs_modlfs = {
+	&mod_fsops, "ZFS filesystem version 1", &vfw
+};
diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c
new file mode 100644
index 0000000000..eb9964aa20
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c
@@ -0,0 +1,3663 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/taskq.h>
+#include <sys/uio.h>
+#include <sys/vmsystm.h>
+#include <sys/atomic.h>
+#include <vm/seg_vn.h>
+#include <vm/pvn.h>
+#include <vm/as.h>
+#include <sys/mman.h>
+#include <sys/pathname.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/refcount.h> /* temporary for debugging purposes */
+#include <sys/dbuf.h>
+#include <sys/zap.h>
+#include <sys/dirent.h>
+#include <sys/policy.h>
+#include <sys/sunddi.h>
+#include <sys/filio.h>
+#include "fs/fs_subr.h"
+#include <sys/zfs_ctldir.h>
+
+/*
+ * Programming rules.
+ *
+ * Each vnode op performs some logical unit of work. To do this, the ZPL must
+ * properly lock its in-core state, create a DMU transaction, do the work,
+ * record this work in the intent log (ZIL), commit the DMU transaction,
+ * and wait the the intent log to commit if it's is a synchronous operation.
+ * Morover, the vnode ops must work in both normal and log replay context.
+ * The ordering of events is important to avoid deadlocks and references
+ * to freed memory. The example below illustrates the following Big Rules:
+ *
+ * (1) A check must be made in each zfs thread for a mounted file system.
+ * This is done avoiding races using ZFS_ENTER(zfsvfs).
+ * A ZFS_EXIT(zfsvfs) is needed before all returns.
+ *
+ * (2) VN_RELE() should always be the last thing except for zil_commit()
+ * and ZFS_EXIT(). This is for 3 reasons:
+ * First, if it's the last reference, the vnode/znode
+ * can be freed, so the zp may point to freed memory. Second, the last
+ * reference will call zfs_zinactive(), which may induce a lot of work --
+ * pushing cached pages (which requires z_grow_lock) and syncing out
+ * cached atime changes. Third, zfs_zinactive() may require a new tx,
+ * which could deadlock the system if you were already holding one.
+ *
+ * (3) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
+ * In normal operation, this will be TXG_NOWAIT. During ZIL replay,
+ * it will be a specific txg. Either way, dmu_tx_assign() never blocks.
+ * This is critical because we don't want to block while holding locks.
+ * Note, in particular, that if a lock is sometimes acquired before
+ * the tx assigns, and sometimes after (e.g. z_lock), then failing to
+ * use a non-blocking assign can deadlock the system. The scenario:
+ *
+ * Thread A has grabbed a lock before calling dmu_tx_assign().
+ * Thread B is in an already-assigned tx, and blocks for this lock.
+ * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
+ * forever, because the previous txg can't quiesce until B's tx commits.
+ *
+ * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
+ * then drop all locks, call txg_wait_open(), and try again.
+ *
+ * (4) If the operation succeeded, generate the intent log entry for it
+ * before dropping locks. This ensures that the ordering of events
+ * in the intent log matches the order in which they actually occurred.
+ *
+ * (5) At the end of each vnode op, the DMU tx must always commit,
+ * regardless of whether there were any errors.
+ *
+ * (6) After dropping all locks, invoke zil_commit(zilog, seq, ioflag)
+ * to ensure that synchronous semantics are provided when necessary.
+ *
+ * In general, this is how things should be ordered in each vnode op:
+ *
+ * ZFS_ENTER(zfsvfs); // exit if unmounted
+ * top:
+ * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD())
+ * rw_enter(...); // grab any other locks you need
+ * tx = dmu_tx_create(...); // get DMU tx
+ * dmu_tx_hold_*(); // hold each object you might modify
+ * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign
+ * if (error) {
+ * dmu_tx_abort(tx); // abort DMU tx
+ * rw_exit(...); // drop locks
+ * zfs_dirent_unlock(dl); // unlock directory entry
+ * VN_RELE(...); // release held vnodes
+ * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ * txg_wait_open(dmu_objset_pool(os), 0);
+ * goto top;
+ * }
+ * ZFS_EXIT(zfsvfs); // finished in zfs
+ * return (error); // really out of space
+ * }
+ * error = do_real_work(); // do whatever this VOP does
+ * if (error == 0)
+ * seq = zfs_log_*(...); // on success, make ZIL entry
+ * dmu_tx_commit(tx); // commit DMU tx -- error or not
+ * rw_exit(...); // drop locks
+ * zfs_dirent_unlock(dl); // unlock directory entry
+ * VN_RELE(...); // release held vnodes
+ * zil_commit(zilog, seq, ioflag); // synchronous when necessary
+ * ZFS_EXIT(zfsvfs); // finished in zfs
+ * return (error); // done, report error
+ */
+
+/*
+ * VOP_OPEN() entry point: nothing to do at open time; always succeeds.
+ */
+/* ARGSUSED */
+static int
+zfs_open(vnode_t **vpp, int flag, cred_t *cr)
+{
+	return (0);
+}
+
+/*
+ * VOP_CLOSE() entry point: release any file locks and share
+ * reservations held by the closing process on this vnode.
+ */
+/* ARGSUSED */
+static int
+zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
+{
+	/*
+	 * Clean up any locks held by this process on the vp.
+	 */
+	cleanlocks(vp, ddi_get_pid(), 0);
+	cleanshares(vp, ddi_get_pid());
+
+	return (0);
+}
+
+/*
+ * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
+ * data (cmd == _FIO_SEEK_DATA).  "off" is an in/out parameter.
+ */
+static int
+zfs_holey(vnode_t *vp, int cmd, offset_t *off)
+{
+	znode_t *zp = VTOZ(vp);
+	uint64_t noff = (uint64_t)*off;	/* new offset */
+	uint64_t file_sz;
+	boolean_t hole;
+	int error;
+
+	/* Hold off file restructuring while we probe the object. */
+	rw_enter(&zp->z_grow_lock, RW_READER);
+	file_sz = zp->z_phys->zp_size;
+	if (noff >= file_sz) {
+		rw_exit(&zp->z_grow_lock);
+		return (ENXIO);
+	}
+
+	hole = (cmd == _FIO_SEEK_HOLE) ? B_TRUE : B_FALSE;
+
+	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
+	rw_exit(&zp->z_grow_lock);
+
+	if (error == ESRCH || noff > file_sz) {
+		/*
+		 * Nothing found before end of file.  A hole search
+		 * still succeeds, thanks to the implicit hole that
+		 * follows the last byte of every file.
+		 */
+		if (hole) {
+			*off = file_sz;
+			return (0);
+		}
+		return (ENXIO);
+	}
+
+	/* Never move the offset backwards. */
+	if (noff < *off)
+		return (error);
+	*off = noff;
+	return (error);
+}
+
+/*
+ * VOP_IOCTL() entry point.  Supports flushing the file system
+ * (_FIOFFS) and the lseek(2) hole/data probes (_FIO_SEEK_HOLE,
+ * _FIO_SEEK_DATA); any other command returns ENOTTY.
+ */
+/* ARGSUSED */
+static int
+zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
+    int *rvalp)
+{
+	offset_t off;
+	int error;
+	zfsvfs_t *zfsvfs;
+
+	switch (com) {
+	case _FIOFFS:
+		return (zfs_sync(vp->v_vfsp, 0, cred));
+
+	case _FIO_SEEK_DATA:
+	case _FIO_SEEK_HOLE:
+		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
+			return (EFAULT);
+
+		zfsvfs = VTOZ(vp)->z_zfsvfs;
+		ZFS_ENTER(zfsvfs);
+
+		/* offset parameter is in/out */
+		error = zfs_holey(vp, com, &off);
+		ZFS_EXIT(zfsvfs);
+		if (error)
+			return (error);
+		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
+			return (EFAULT);
+		return (0);
+	}
+	return (ENOTTY);
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages.  What this means:
+ *
+ * On Write:	If we find a memory mapped page, we write to *both*
+ *		the page and the dmu buffer.
+ *
+ * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
+ *	the file is memory mapped.
+ */
+static int
+mappedwrite(vnode_t *vp, uint64_t woff, int nbytes, uio_t *uio, dmu_tx_t *tx)
+{
+	znode_t	*zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int64_t	start, off;
+	int len = nbytes;
+	int error = 0;
+
+	start = uio->uio_loffset;
+	off = start & PAGEOFFSET;
+	/* Walk the range one page at a time. */
+	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
+		page_t *pp;
+		uint64_t bytes = MIN(PAGESIZE - off, len);
+
+		/*
+		 * We don't want a new page to "appear" in the middle of
+		 * the file update (because it may not get the write
+		 * update data), so we grab a lock to block
+		 * zfs_getpage().
+		 */
+		rw_enter(&zp->z_map_lock, RW_WRITER);
+		if (pp = page_lookup(vp, start, SE_SHARED)) {
+			caddr_t va;
+
+			/*
+			 * Once the page itself is locked (SE_SHARED),
+			 * the map lock can be dropped.
+			 */
+			rw_exit(&zp->z_map_lock);
+			va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1L);
+			error = uiomove(va+off, bytes, UIO_WRITE, uio);
+			if (error == 0) {
+				/* Mirror the update into the dmu buffer. */
+				dmu_write(zfsvfs->z_os, zp->z_id,
+				    woff, bytes, va+off, tx);
+			}
+			ppmapout(va);
+			page_unlock(pp);
+		} else {
+			/* No cached page: write the dmu buffer only. */
+			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
+			    woff, bytes, uio, tx);
+			rw_exit(&zp->z_map_lock);
+		}
+		len -= bytes;
+		woff += bytes;
+		off = 0;
+		if (error)
+			break;
+	}
+	return (error);
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages.  What this means:
+ *
+ * On Read:	We "read" preferentially from memory mapped pages,
+ *		else we default from the dmu buffer.
+ *
+ * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
+ *	the file is memory mapped.
+ *
+ * NOTE(review): "addr" appears to point at the caller's copy of the
+ * file data (a dmu buffer in zfs_read()) — confirm against callers.
+ */
+static int
+mappedread(vnode_t *vp, char *addr, int nbytes, uio_t *uio)
+{
+	int64_t	start, off, bytes;
+	int len = nbytes;
+	int error = 0;
+
+	start = uio->uio_loffset;
+	off = start & PAGEOFFSET;
+	/* Walk the range one page at a time. */
+	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
+		page_t *pp;
+
+		bytes = MIN(PAGESIZE - off, len);
+		if (pp = page_lookup(vp, start, SE_SHARED)) {
+			caddr_t va;
+
+			va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1L);
+			error = uiomove(va + off, bytes, UIO_READ, uio);
+			ppmapout(va);
+			page_unlock(pp);
+		} else {
+			/* XXX use dmu_read here? */
+			error = uiomove(addr, bytes, UIO_READ, uio);
+		}
+		len -= bytes;
+		addr += bytes;
+		off = 0;
+		if (error)
+			break;
+	}
+	return (error);
+}
+
+/*
+ * Upper bound on the number of bytes handled per dmu buffer batch
+ * in zfs_read().
+ */
+uint_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
+
+/*
+ * Read bytes from specified file into supplied buffer.
+ *
+ *	IN:	vp	- vnode of file to be read from.
+ *		uio	- structure supplying read location, range info,
+ *			  and return buffer.
+ *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
+ *		cr	- credentials of caller.
+ *
+ *	OUT:	uio	- updated offset and range, buffer filled.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Side Effects:
+ *	vp - atime updated if byte count > 0
+ */
+/* ARGSUSED */
+static int
+zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
+{
+	znode_t	*zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	uint64_t delta;
+	ssize_t	n, size, cnt, ndone;
+	int error, i, numbufs;
+	dmu_buf_t *dbp, **dbpp;
+
+	ZFS_ENTER(zfsvfs);
+
+	/*
+	 * Validate file offset
+	 */
+	if (uio->uio_loffset < (offset_t)0) {
+		ZFS_EXIT(zfsvfs);
+		return (EINVAL);
+	}
+
+	/*
+	 * Fasttrack empty reads
+	 */
+	if (uio->uio_resid == 0) {
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/*
+	 * Check for mandatory region locks
+	 */
+	if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
+		if (error = chklock(vp, FREAD,
+		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+	}
+
+	/*
+	 * If we're in FRSYNC mode, sync out this znode before reading it.
+	 */
+	zil_commit(zfsvfs->z_log, zp->z_last_itx, ioflag & FRSYNC);
+
+	/*
+	 * Make sure nobody restructures the file (changes block size)
+	 * in the middle of the read.
+	 */
+	rw_enter(&zp->z_grow_lock, RW_READER);
+	/*
+	 * If we are reading past end-of-file we can skip
+	 * to the end; but we might still need to set atime.
+	 */
+	if (uio->uio_loffset >= zp->z_phys->zp_size) {
+		cnt = 0;
+		error = 0;
+		goto out;
+	}
+
+	/* Total bytes to transfer, clipped at end of file. */
+	cnt = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
+
+	/* Transfer in chunks of at most zfs_read_chunk_size bytes. */
+	for (ndone = 0; ndone < cnt; ndone += zfs_read_chunk_size) {
+		ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
+		n = MIN(zfs_read_chunk_size,
+		    zp->z_phys->zp_size - uio->uio_loffset);
+		n = MIN(n, cnt);
+		dbpp = dmu_buf_hold_array(zfsvfs->z_os, zp->z_id,
+		    uio->uio_loffset, n, &numbufs);
+		if (error = dmu_buf_read_array_canfail(dbpp, numbufs)) {
+			dmu_buf_rele_array(dbpp, numbufs);
+			goto out;
+		}
+		/*
+		 * Compute the adjustment to align the dmu buffers
+		 * with the uio buffer.
+		 */
+		delta = uio->uio_loffset - dbpp[0]->db_offset;
+
+		for (i = 0; i < numbufs; i++) {
+			if (n < 0)
+				break;
+			dbp = dbpp[i];
+			size = dbp->db_size - delta;
+			/*
+			 * XXX -- this is correct, but may be suboptimal.
+			 * If the pages are all clean, we don't need to
+			 * go through mappedread().  Maybe the VMODSORT
+			 * stuff can help us here.
+			 */
+			if (vn_has_cached_data(vp)) {
+				error = mappedread(vp, (caddr_t)dbp->db_data +
+				    delta, (n < size ? n : size), uio);
+			} else {
+				error = uiomove((caddr_t)dbp->db_data + delta,
+				    (n < size ? n : size), UIO_READ, uio);
+			}
+			if (error) {
+				dmu_buf_rele_array(dbpp, numbufs);
+				goto out;
+			}
+			n -= dbp->db_size;
+			if (delta) {
+				n += delta;
+				delta = 0;
+			}
+		}
+		dmu_buf_rele_array(dbpp, numbufs);
+	}
+out:
+	rw_exit(&zp->z_grow_lock);
+
+	/* atime is stamped even for EOF reads and errors. */
+	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Fault in the pages of the first n bytes specified by the uio structure.
+ * 1 byte in each page is touched and the uio struct is unmodified.
+ * Any error will exit this routine as this is only a best
+ * attempt to get the pages resident.  This is a copy of ufs_trans_touch().
+ *
+ * This keeps slow page sources (e.g. NFS-backed user buffers) from
+ * stalling a transaction later in zfs_write().
+ */
+static void
+zfs_prefault_write(ssize_t n, struct uio *uio)
+{
+	struct iovec *iov;
+	ulong_t cnt, incr;
+	caddr_t p;
+	uint8_t tmp;
+
+	iov = uio->uio_iov;
+
+	while (n) {
+		cnt = MIN(iov->iov_len, n);
+		if (cnt == 0) {
+			/* empty iov entry */
+			iov++;
+			continue;
+		}
+		n -= cnt;
+		/*
+		 * touch each page in this segment.
+		 */
+		p = iov->iov_base;
+		while (cnt) {
+			switch (uio->uio_segflg) {
+			case UIO_USERSPACE:
+			case UIO_USERISPACE:
+				/* best-effort: abandon on fault */
+				if (fuword8(p, &tmp))
+					return;
+				break;
+			case UIO_SYSSPACE:
+				if (kcopy(p, &tmp, 1))
+					return;
+				break;
+			}
+			incr = MIN(cnt, PAGESIZE);
+			p += incr;
+			cnt -= incr;
+		}
+		/*
+		 * touch the last byte in case it straddles a page.
+		 */
+		p--;
+		switch (uio->uio_segflg) {
+		case UIO_USERSPACE:
+		case UIO_USERISPACE:
+			if (fuword8(p, &tmp))
+				return;
+			break;
+		case UIO_SYSSPACE:
+			if (kcopy(p, &tmp, 1))
+				return;
+			break;
+		}
+		iov++;
+	}
+}
+
+/*
+ * Write the bytes to a file.
+ *
+ *	IN:	vp	- vnode of file to be written to.
+ *		uio	- structure supplying write location, range info,
+ *			  and data buffer.
+ *		ioflag	- FAPPEND flag set if in append mode.
+ *		cr	- credentials of caller.
+ *
+ *	OUT:	uio	- updated offset and range.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	vp - ctime|mtime updated if byte count > 0
+ *
+ * Note: zfs_write() holds z_append_lock across calls to txg_wait_open().
+ * It has to because of the semantics of FAPPEND.  The implication is that
+ * we must never grab z_append_lock while in an assigned tx.
+ */
+/* ARGSUSED */
+static int
+zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
+{
+	znode_t	*zp = VTOZ(vp);
+	rlim64_t limit = uio->uio_llimit;
+	ssize_t	start_resid = uio->uio_resid;
+	ssize_t	tx_bytes;
+	uint64_t end_size;
+	dmu_tx_t *tx;
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	zilog_t	*zilog = zfsvfs->z_log;
+	uint64_t seq = 0;
+	offset_t woff;
+	ssize_t	n, nbytes;
+	int max_blksz = zfsvfs->z_max_blksz;
+	int need_append_lock, error;
+	krw_t grow_rw = RW_READER;
+
+	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
+		limit = MAXOFFSET_T;
+
+	n = start_resid;
+
+	/*
+	 * Fasttrack empty write
+	 */
+	if (n == 0)
+		return (0);
+
+	ZFS_ENTER(zfsvfs);
+
+	/*
+	 * Pre-fault the pages to ensure slow (eg NFS) pages don't hold up txg
+	 */
+	zfs_prefault_write(MIN(start_resid, SPA_MAXBLOCKSIZE), uio);
+
+	/*
+	 * If in append mode, set the io offset pointer to eof.
+	 */
+	need_append_lock = ioflag & FAPPEND;
+	if (need_append_lock) {
+		rw_enter(&zp->z_append_lock, RW_WRITER);
+		woff = uio->uio_loffset = zp->z_phys->zp_size;
+	} else {
+		woff = uio->uio_loffset;
+		/*
+		 * Validate file offset
+		 */
+		if (woff < 0) {
+			ZFS_EXIT(zfsvfs);
+			return (EINVAL);
+		}
+
+		/*
+		 * If this write could change the file length,
+		 * we need to synchronize with "appenders".
+		 */
+		if (woff < limit - n && woff + n > zp->z_phys->zp_size) {
+			need_append_lock = TRUE;
+			rw_enter(&zp->z_append_lock, RW_READER);
+		}
+	}
+
+	if (woff >= limit) {
+		error = EFBIG;
+		goto no_tx_done;
+	}
+
+	/* Clip the transfer to the resource limit. */
+	if ((woff + n) > limit || woff > (limit - n))
+		n = limit - woff;
+
+	/*
+	 * Check for mandatory region locks
+	 */
+	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
+	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0)
+		goto no_tx_done;
+top:
+	/*
+	 * Make sure nobody restructures the file (changes block size)
+	 * in the middle of the write.
+	 */
+	rw_enter(&zp->z_grow_lock, grow_rw);
+
+	end_size = MAX(zp->z_phys->zp_size, woff + n);
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_bonus(tx, zp->z_id);
+	dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
+	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	if (error) {
+		dmu_tx_abort(tx);
+		rw_exit(&zp->z_grow_lock);
+		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+			/* drop locks, wait for the next open txg, retry */
+			txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+			goto top;
+		}
+		goto no_tx_done;
+	}
+
+	if (end_size > zp->z_blksz &&
+	    (!ISP2(zp->z_blksz) || zp->z_blksz < max_blksz)) {
+		uint64_t new_blksz;
+		/*
+		 * This write will increase the file size beyond
+		 * the current block size so increase the block size.
+		 */
+		if (grow_rw == RW_READER && !rw_tryupgrade(&zp->z_grow_lock)) {
+			/*
+			 * Can't upgrade in place; retry from the top
+			 * taking the grow lock as writer.
+			 */
+			dmu_tx_commit(tx);
+			rw_exit(&zp->z_grow_lock);
+			grow_rw = RW_WRITER;
+			goto top;
+		}
+		if (zp->z_blksz > max_blksz) {
+			ASSERT(!ISP2(zp->z_blksz));
+			new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
+		} else {
+			new_blksz = MIN(end_size, max_blksz);
+		}
+		error = zfs_grow_blocksize(zp, new_blksz, tx);
+		if (error) {
+			tx_bytes = 0;
+			goto tx_done;
+		}
+	}
+
+	if (grow_rw == RW_WRITER) {
+		rw_downgrade(&zp->z_grow_lock);
+		grow_rw = RW_READER;
+	}
+
+	/*
+	 * The file data does not fit in the znode "cache", so we
+	 * will be writing to the file block data buffers.
+	 * Each buffer will be written in a separate transaction;
+	 * this keeps the intent log records small and allows us
+	 * to do more fine-grained space accounting.
+	 */
+	while (n > 0) {
+		/*
+		 * XXX - should we really limit each write to z_max_blksz?
+		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
+		 */
+		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
+		rw_enter(&zp->z_map_lock, RW_READER);
+
+		tx_bytes = uio->uio_resid;
+		if (vn_has_cached_data(vp)) {
+			rw_exit(&zp->z_map_lock);
+			error = mappedwrite(vp, woff, nbytes, uio, tx);
+		} else {
+			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
+			    woff, nbytes, uio, tx);
+			rw_exit(&zp->z_map_lock);
+		}
+		/* bytes actually consumed from the uio this pass */
+		tx_bytes -= uio->uio_resid;
+
+		if (error) {
+			/* XXX - do we need to "clean up" the dmu buffer? */
+			break;
+		}
+
+		ASSERT(tx_bytes == nbytes);
+
+		n -= nbytes;
+		if (n <= 0)
+			break;
+
+		/*
+		 * We have more work ahead of us, so wrap up this transaction
+		 * and start another.  Exact same logic as tx_done below.
+		 */
+		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) {
+			dmu_buf_will_dirty(zp->z_dbuf, tx);
+			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
+			    uio->uio_loffset);
+		}
+		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+		seq = zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes,
+		    ioflag, uio);
+		dmu_tx_commit(tx);
+
+		/* Pre-fault the next set of pages */
+		zfs_prefault_write(MIN(n, SPA_MAXBLOCKSIZE), uio);
+
+		/*
+		 * Start another transaction.
+		 */
+		woff = uio->uio_loffset;
+		tx = dmu_tx_create(zfsvfs->z_os);
+		dmu_tx_hold_bonus(tx, zp->z_id);
+		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
+		error = dmu_tx_assign(tx, zfsvfs->z_assign);
+		if (error) {
+			dmu_tx_abort(tx);
+			rw_exit(&zp->z_grow_lock);
+			if (error == ERESTART &&
+			    zfsvfs->z_assign == TXG_NOWAIT) {
+				txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+				goto top;
+			}
+			goto no_tx_done;
+		}
+	}
+
+tx_done:
+	/* tx_bytes is 0 only on the zfs_grow_blocksize() failure path. */
+	if (tx_bytes != 0) {
+		/*
+		 * Update the file size if it has changed; account
+		 * for possible concurrent updates.
+		 */
+		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) {
+			dmu_buf_will_dirty(zp->z_dbuf, tx);
+			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
+			    uio->uio_loffset);
+		}
+		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+		seq = zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes,
+		    ioflag, uio);
+	}
+	dmu_tx_commit(tx);
+
+	rw_exit(&zp->z_grow_lock);
+
+no_tx_done:
+
+	if (need_append_lock)
+		rw_exit(&zp->z_append_lock);
+
+	/*
+	 * If we're in replay mode, or we made no progress, return error.
+	 * Otherwise, it's at least a partial write, so it's successful.
+	 */
+	if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	zil_commit(zilog, seq, ioflag & (FSYNC | FDSYNC));
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ *
+ * Called with a lr_write_t whose lr_offset/lr_length describe the
+ * file range; fills in either the inline data (immediate write) or
+ * the block pointer (indirect write).
+ */
+int
+zfs_get_data(void *arg, lr_write_t *lr)
+{
+	zfsvfs_t *zfsvfs = arg;
+	objset_t *os = zfsvfs->z_os;
+	znode_t *zp;
+	uint64_t off = lr->lr_offset;
+	int dlen = lr->lr_length;  		/* length of user data */
+	int reclen = lr->lr_common.lrc_reclen;
+	int error = 0;
+
+	ASSERT(dlen != 0);
+
+	/*
+	 * Nothing to do if the file has been removed or truncated.
+	 */
+	if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
+		return (ENOENT);
+	if (off >= zp->z_phys->zp_size || zp->z_reap) {
+		VN_RELE(ZTOV(zp));
+		return (ENOENT);
+	}
+
+	/*
+	 * Write records come in two flavors: immediate and indirect.
+	 * For small writes it's cheaper to store the data with the
+	 * log record (immediate); for large writes it's cheaper to
+	 * sync the data and get a pointer to it (indirect) so that
+	 * we don't have to write the data twice.
+	 */
+	if (sizeof (lr_write_t) + dlen <= reclen) { /* immediate write */
+		rw_enter(&zp->z_grow_lock, RW_READER);
+		/* Copy the file data directly after the log record. */
+		dmu_buf_t *db = dmu_buf_hold(os, lr->lr_foid, off);
+		dmu_buf_read(db);
+		bcopy((char *)db->db_data + off - db->db_offset, lr + 1, dlen);
+		dmu_buf_rele(db);
+		rw_exit(&zp->z_grow_lock);
+	} else {
+		/*
+		 * We have to grab z_grow_lock as RW_WRITER because
+		 * dmu_sync() can't handle concurrent dbuf_dirty() (6313856).
+		 * z_grow_lock will be replaced with a range lock soon,
+		 * which will eliminate the concurrency hit, but dmu_sync()
+		 * really needs more thought.  It shouldn't have to rely on
+		 * the caller to provide MT safety.
+		 */
+		rw_enter(&zp->z_grow_lock, RW_WRITER);
+		txg_suspend(dmu_objset_pool(os));
+		error = dmu_sync(os, lr->lr_foid, off, &lr->lr_blkoff,
+		    &lr->lr_blkptr, lr->lr_common.lrc_txg);
+		txg_resume(dmu_objset_pool(os));
+		rw_exit(&zp->z_grow_lock);
+	}
+	VN_RELE(ZTOV(zp));
+	return (error);
+}
+
+/*
+ * VOP_ACCESS() entry point: check the requested access mode against
+ * the znode via the ZFS ACL machinery (zfs_zaccess_rwx()).
+ */
+/*ARGSUSED*/
+static int
+zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+	error = zfs_zaccess_rwx(zp, mode, cr);
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Lookup an entry in a directory, or an extended attribute directory.
+ * If it exists, return a held vnode reference for it.
+ *
+ *	IN:	dvp	- vnode of directory to search.
+ *		nm	- name of entry to lookup.
+ *		pnp	- full pathname to lookup [UNUSED].
+ *		flags	- LOOKUP_XATTR set if looking for an attribute.
+ *		rdir	- root directory vnode [UNUSED].
+ *		cr	- credentials of caller.
+ *
+ *	OUT:	vpp	- vnode of located entry, NULL if not found.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	NA
+ */
+/* ARGSUSED */
+static int
+zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
+    int flags, vnode_t *rdir, cred_t *cr)
+{
+
+	znode_t *zdp = VTOZ(dvp);
+	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
+	int	error;
+
+	ZFS_ENTER(zfsvfs);
+
+	*vpp = NULL;
+
+	if (flags & LOOKUP_XATTR) {
+		/*
+		 * We don't allow recursive attributes..
+		 * Maybe someday we will.
+		 */
+		if (zdp->z_phys->zp_flags & ZFS_XATTR) {
+			ZFS_EXIT(zfsvfs);
+			return (EINVAL);
+		}
+
+		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr)) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+
+		/*
+		 * Do we have permission to get into attribute directory?
+		 * On failure the hold taken above is dropped; callers
+		 * must check the error before using *vpp.
+		 */
+
+		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, cr)) {
+			VN_RELE(*vpp);
+		}
+
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Check accessibility of directory.
+	 */
+
+	if (error = zfs_zaccess(zdp, ACE_EXECUTE, cr)) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if ((error = zfs_dirlook(zdp, nm, vpp)) == 0) {
+
+		/*
+		 * Convert device special files
+		 */
+		if (IS_DEVVP(*vpp)) {
+			vnode_t	*svp;
+
+			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
+			VN_RELE(*vpp);
+			if (svp == NULL)
+				error = ENOSYS;
+			else
+				*vpp = svp;
+		}
+	}
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Attempt to create a new entry in a directory. If the entry
+ * already exists, truncate the file if permissible, else return
+ * an error. Return the vp of the created or trunc'd file.
+ *
+ * IN: dvp - vnode of directory to put new file entry in.
+ * name - name of new file entry.
+ * vap - attributes of new file.
+ * excl - flag indicating exclusive or non-exclusive mode.
+ * mode - mode to open file with.
+ * cr - credentials of caller.
+ * flag - large file flag [UNUSED].
+ *
+ * OUT: vpp - vnode of created or trunc'd entry.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated if new entry created
+ * vp - ctime|mtime always, atime if new
+ */
+/* ARGSUSED */
+static int
+zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
+ int mode, vnode_t **vpp, cred_t *cr, int flag)
+{
+ znode_t *zp, *dzp = VTOZ(dvp);
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ uint64_t seq = 0;
+ objset_t *os = zfsvfs->z_os;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ int error;
+ uint64_t zoid;
+
+ ZFS_ENTER(zfsvfs);
+
+ /*
+ * Restart point: we come back here (with all locks and the tx
+ * dropped) when dmu_tx_assign() returns ERESTART below.
+ */
+top:
+ *vpp = NULL;
+
+ if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
+ vap->va_mode &= ~VSVTX;
+
+ if (*name == '\0') {
+ /*
+ * Null component name refers to the directory itself.
+ */
+ VN_HOLD(dvp);
+ zp = dzp;
+ dl = NULL;
+ error = 0;
+ } else {
+ /* possible VN_HOLD(zp) */
+ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, 0)) {
+ if (strcmp(name, "..") == 0)
+ error = EISDIR;
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ /* Object id of any existing entry; -1ULL means we will create one. */
+ zoid = zp ? zp->z_id : -1ULL;
+
+ if (zp == NULL) {
+ /*
+ * Create a new file object and update the directory
+ * to reference it.
+ */
+ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
+ goto out;
+ }
+
+ /*
+ * We only support the creation of regular files in
+ * extended attribute directories.
+ */
+ if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
+ (vap->va_type != VREG)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+ dmu_tx_hold_bonus(tx, dzp->z_id);
+ dmu_tx_hold_zap(tx, dzp->z_id, 1);
+ if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, SPA_MAXBLOCKSIZE);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ dmu_tx_abort(tx);
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART &&
+ zfsvfs->z_assign == TXG_NOWAIT) {
+ txg_wait_open(dmu_objset_pool(os), 0);
+ goto top;
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
+ ASSERT(zp->z_id == zoid);
+ (void) zfs_link_create(dl, zp, tx, ZNEW);
+ seq = zfs_log_create(zilog, tx, TX_CREATE, dzp, zp, name);
+ dmu_tx_commit(tx);
+ } else {
+ /*
+ * A directory entry already exists for this name.
+ */
+ /*
+ * Can't truncate an existing file if in exclusive mode.
+ */
+ if (excl == EXCL) {
+ error = EEXIST;
+ goto out;
+ }
+ /*
+ * Can't open a directory for writing.
+ */
+ if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
+ error = EISDIR;
+ goto out;
+ }
+ /*
+ * Verify requested access to file.
+ */
+ if (mode && (error = zfs_zaccess_rwx(zp, mode, cr))) {
+ goto out;
+ }
+ /*
+ * Truncate regular files if requested.
+ */
+
+ /*
+ * Need to update dzp->z_seq?
+ */
+
+ mutex_enter(&dzp->z_lock);
+ dzp->z_seq++;
+ mutex_exit(&dzp->z_lock);
+
+ if ((ZTOV(zp)->v_type == VREG) && (zp->z_phys->zp_size != 0) &&
+ (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
+ /*
+ * Truncate the file.
+ */
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, zoid);
+ dmu_tx_hold_free(tx, zoid, 0, DMU_OBJECT_END);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ dmu_tx_abort(tx);
+ if (dl)
+ zfs_dirent_unlock(dl);
+ VN_RELE(ZTOV(zp));
+ if (error == ERESTART &&
+ zfsvfs->z_assign == TXG_NOWAIT) {
+ txg_wait_open(dmu_objset_pool(os), 0);
+ goto top;
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ /*
+ * Grab the grow_lock to serialize this change with
+ * respect to other file manipulations.
+ */
+ rw_enter(&zp->z_grow_lock, RW_WRITER);
+ error = zfs_freesp(zp, 0, 0, mode, tx, cr);
+ if (error == 0) {
+ zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+ seq = zfs_log_truncate(zilog, tx,
+ TX_TRUNCATE, zp, 0, 0);
+ }
+ rw_exit(&zp->z_grow_lock);
+ dmu_tx_commit(tx);
+ }
+ }
+out:
+
+ if (dl)
+ zfs_dirent_unlock(dl);
+
+ if (error) {
+ if (zp)
+ VN_RELE(ZTOV(zp));
+ } else {
+ *vpp = ZTOV(zp);
+ /*
+ * If vnode is for a device return a specfs vnode instead.
+ */
+ if (IS_DEVVP(*vpp)) {
+ struct vnode *svp;
+
+ svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
+ VN_RELE(*vpp);
+ if (svp == NULL) {
+ error = ENOSYS;
+ }
+ *vpp = svp;
+ }
+ }
+
+ /* seq is 0 when no intent-log record was generated above. */
+ zil_commit(zilog, seq, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Remove an entry from a directory.
+ *
+ * IN: dvp - vnode of directory to remove entry from.
+ * name - name of entry to remove.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dvp - ctime|mtime
+ * vp - ctime (if nlink > 0)
+ */
+static int
+zfs_remove(vnode_t *dvp, char *name, cred_t *cr)
+{
+ znode_t *zp, *dzp = VTOZ(dvp);
+ znode_t *xzp = NULL;
+ vnode_t *vp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ uint64_t seq = 0;
+ uint64_t acl_obj, xattr_obj;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ int may_delete_now, delete_now = FALSE;
+ int reaped;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+top:
+ /*
+ * Attempt to lock directory; fail if entry doesn't exist.
+ */
+ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ vp = ZTOV(zp);
+
+ if (error = zfs_zaccess_delete(dzp, zp, cr)) {
+ goto out;
+ }
+
+ /*
+ * Check the restrictions that apply on sticky directories.
+ */
+ if (error = zfs_sticky_remove_access(dzp, zp, cr))
+ goto out;
+
+ /*
+ * Need to use rmdir for removing directories.
+ */
+ if (vp->v_type == VDIR) {
+ error = EPERM;
+ goto out;
+ }
+
+ vnevent_remove(vp);
+
+ /*
+ * Snapshot whether we hold the only reference; this is re-checked
+ * under the assigned tx below before actually deleting in-line.
+ */
+ mutex_enter(&vp->v_lock);
+ may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
+ mutex_exit(&vp->v_lock);
+
+ /*
+ * We may delete the znode now, or we may put it on the delete queue;
+ * it depends on whether we're the last link, and on whether there are
+ * other holds on the vnode. So we dmu_tx_hold() the right things to
+ * allow for either case.
+ */
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, -1);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+ if (may_delete_now)
+ dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
+
+ /* are there any extended attributes? */
+ if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
+ /*
+ * XXX - There is a possibility that the delete
+ * of the parent file could succeed, but then we get
+ * an ENOSPC when we try to delete the xattrs...
+ * so we would need to re-try the deletes periodically
+ */
+ /* XXX - do we need this if we are deleting? */
+ dmu_tx_hold_bonus(tx, xattr_obj);
+ }
+
+ /* are there any additional acls */
+ if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
+ may_delete_now)
+ dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+
+ /* charge as an update -- would be nice not to charge at all */
+ dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, -1);
+
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ dmu_tx_abort(tx);
+ zfs_dirent_unlock(dl);
+ VN_RELE(vp);
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+ goto top;
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Remove the directory entry.
+ */
+ error = zfs_link_destroy(dl, zp, tx, 0, &reaped);
+
+ if (error) {
+ dmu_tx_commit(tx);
+ goto out;
+ }
+
+ /*
+ * reaped is set by zfs_link_destroy() — presumably when the last
+ * link to the object was removed; TODO confirm against zfs_dir.c.
+ */
+ if (reaped) {
+ /*
+ * Re-check under v_lock that nothing new appeared (hold,
+ * cached pages, xattr or external ACL) since we built the tx.
+ */
+ mutex_enter(&vp->v_lock);
+ delete_now = may_delete_now &&
+ vp->v_count == 1 && !vn_has_cached_data(vp) &&
+ zp->z_phys->zp_xattr == xattr_obj &&
+ zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
+ mutex_exit(&vp->v_lock);
+ }
+
+ if (delete_now) {
+ /* Last reference and last link: free the znode in this tx. */
+ if (zp->z_phys->zp_xattr) {
+ error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
+ ASSERT3U(error, ==, 0);
+ ASSERT3U(xzp->z_phys->zp_links, ==, 2);
+ dmu_buf_will_dirty(xzp->z_dbuf, tx);
+ mutex_enter(&xzp->z_lock);
+ xzp->z_reap = 1;
+ xzp->z_phys->zp_links = 0;
+ mutex_exit(&xzp->z_lock);
+ zfs_dq_add(xzp, tx);
+ zp->z_phys->zp_xattr = 0; /* probably unnecessary */
+ }
+ mutex_enter(&zp->z_lock);
+ mutex_enter(&vp->v_lock);
+ vp->v_count--;
+ ASSERT3U(vp->v_count, ==, 0);
+ mutex_exit(&vp->v_lock);
+ zp->z_active = 0;
+ mutex_exit(&zp->z_lock);
+ zfs_znode_delete(zp, tx);
+ VFS_RELE(zfsvfs->z_vfs);
+ } else if (reaped) {
+ zfs_dq_add(zp, tx);
+ }
+
+ seq = zfs_log_remove(zilog, tx, TX_REMOVE, dzp, name);
+
+ dmu_tx_commit(tx);
+out:
+ zfs_dirent_unlock(dl);
+
+ if (!delete_now) {
+ VN_RELE(vp);
+ } else if (xzp) {
+ /* this rele delayed to prevent nesting transactions */
+ VN_RELE(ZTOV(xzp));
+ }
+
+ zil_commit(zilog, seq, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Create a new directory and insert it into dvp using the name
+ * provided. Return a pointer to the inserted directory.
+ *
+ * IN: dvp - vnode of directory to add subdir to.
+ * dirname - name of new directory.
+ * vap - attributes of new directory.
+ * cr - credentials of caller.
+ *
+ * OUT: vpp - vnode of created directory.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated
+ * vp - ctime|mtime|atime updated
+ */
+static int
+zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
+{
+ znode_t *zp, *dzp = VTOZ(dvp);
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ uint64_t seq = 0;
+ zfs_dirlock_t *dl;
+ uint64_t zoid = 0;
+ dmu_tx_t *tx;
+ int error;
+
+ ASSERT(vap->va_type == VDIR);
+
+ ZFS_ENTER(zfsvfs);
+
+ /*
+ * Directories may not be created inside extended attribute
+ * directories (only regular files are; see zfs_create()).
+ */
+ if (dzp->z_phys->zp_flags & ZFS_XATTR) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+ /* Restart point after dmu_tx_assign() returns ERESTART below. */
+top:
+ *vpp = NULL;
+ if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, cr)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * First make sure the new directory doesn't exist.
+ */
+ if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, ZNEW)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Add a new entry to the directory.
+ */
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, 1);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 0);
+ if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, SPA_MAXBLOCKSIZE);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ dmu_tx_abort(tx);
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+ goto top;
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Create new node.
+ */
+ zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
+
+ /*
+ * Now put new name in parent dir.
+ */
+ (void) zfs_link_create(dl, zp, tx, ZNEW);
+
+ *vpp = ZTOV(zp);
+
+ seq = zfs_log_create(zilog, tx, TX_MKDIR, dzp, zp, dirname);
+ dmu_tx_commit(tx);
+
+ zfs_dirent_unlock(dl);
+
+ zil_commit(zilog, seq, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Remove a directory subdir entry. If the current working
+ * directory is the same as the subdir to be removed, the
+ * remove will fail.
+ *
+ * IN: dvp - vnode of directory to remove from.
+ * name - name of directory to be removed.
+ * cwd - vnode of current working directory.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated
+ */
+static int
+zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
+{
+ znode_t *dzp = VTOZ(dvp);
+ znode_t *zp;
+ vnode_t *vp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ uint64_t seq = 0;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ /* Restart point after dmu_tx_assign() returns ERESTART below. */
+top:
+ zp = NULL;
+
+ /*
+ * Attempt to lock directory; fail if entry doesn't exist.
+ */
+ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ vp = ZTOV(zp);
+
+ if (error = zfs_zaccess_delete(dzp, zp, cr)) {
+ goto out;
+ }
+
+ /*
+ * Check the restrictions that apply on sticky directories.
+ */
+ if (error = zfs_sticky_remove_access(dzp, zp, cr))
+ goto out;
+
+ if (vp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ }
+
+ if (vp == cwd) {
+ error = EINVAL;
+ goto out;
+ }
+
+ vnevent_rmdir(vp);
+
+ /*
+ * Grab a lock on the parent pointer make sure we play well
+ * with the treewalk and directory rename code.
+ */
+ rw_enter(&zp->z_parent_lock, RW_WRITER);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, 1);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+ /* Presumably charges the delete-queue zap update — TODO confirm. */
+ dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ dmu_tx_abort(tx);
+ rw_exit(&zp->z_parent_lock);
+ zfs_dirent_unlock(dl);
+ VN_RELE(vp);
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+ goto top;
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ error = zfs_link_destroy(dl, zp, tx, 0, NULL);
+
+ if (error == 0)
+ seq = zfs_log_remove(zilog, tx, TX_RMDIR, dzp, name);
+
+ dmu_tx_commit(tx);
+
+ rw_exit(&zp->z_parent_lock);
+out:
+ zfs_dirent_unlock(dl);
+
+ VN_RELE(vp);
+
+ zil_commit(zilog, seq, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Read as many directory entries as will fit into the provided
+ * buffer from the given directory cursor position (specified in
+ * the uio structure.
+ *
+ * IN: vp - vnode of directory to read.
+ * uio - structure supplying read location, range info,
+ * and return buffer.
+ * cr - credentials of caller.
+ *
+ * OUT: uio - updated offset and range, buffer filled.
+ * eofp - set to true if end-of-file detected.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - atime updated
+ *
+ * Note that the low 4 bits of the cookie returned by zap is always zero.
+ * This allows us to use the low range for "special" directory entries:
+ * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
+ * we use the offset 2 for the '.zfs' directory.
+ */
+/* ARGSUSED */
+static int
+zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp)
+{
+ znode_t *zp = VTOZ(vp);
+ iovec_t *iovp;
+ dirent64_t *odp;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ caddr_t outbuf;
+ size_t bufsize;
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ uint_t bytes_wanted;
+ ushort_t this_reclen;
+ uint64_t offset; /* must be unsigned; checks for < 1 */
+ off64_t *next;
+ int local_eof;
+ int outcount = 0;
+ int error = 0;
+
+ ZFS_ENTER(zfsvfs);
+
+ /*
+ * If we are not given an eof variable,
+ * use a local one.
+ */
+ if (eofp == NULL)
+ eofp = &local_eof;
+
+ /*
+ * Check for valid iov_len.
+ */
+ if (uio->uio_iov->iov_len <= 0) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ /*
+ * Quit if directory has been removed (posix)
+ */
+ if ((*eofp = zp->z_reap) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ /*
+ * Initialize the iterator cursor.
+ */
+ offset = uio->uio_loffset;
+ if (offset <= 3) {
+ /*
+ * Start iteration from the beginning of the directory.
+ */
+ zap_cursor_init(&zc, zfsvfs->z_os, zp->z_id);
+ } else {
+ /*
+ * The offset is a serialized cursor.
+ */
+ zap_cursor_init_serialized(&zc, zfsvfs->z_os, zp->z_id,
+ offset);
+ }
+
+ /*
+ * Get space to change directory entries into fs independent format.
+ * For a single in-kernel iovec we fill the caller's buffer directly;
+ * otherwise we fill a bounce buffer and uiomove() it out at the end.
+ */
+ iovp = uio->uio_iov;
+ bytes_wanted = iovp->iov_len;
+ if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
+ bufsize = bytes_wanted;
+ outbuf = kmem_alloc(bufsize, KM_SLEEP);
+ odp = (struct dirent64 *)outbuf;
+ } else {
+ bufsize = bytes_wanted;
+ odp = (struct dirent64 *)iovp->iov_base;
+ }
+
+ /*
+ * Transform to file-system independent format
+ */
+ outcount = 0;
+ while (outcount < bytes_wanted) {
+ /*
+ * Special case `.', `..', and `.zfs'.
+ */
+ if (offset == 0) {
+ (void) strcpy(zap.za_name, ".");
+ zap.za_first_integer = zp->z_id;
+ this_reclen = DIRENT64_RECLEN(1);
+ } else if (offset == 1) {
+ (void) strcpy(zap.za_name, "..");
+ zap.za_first_integer = zp->z_phys->zp_parent;
+ this_reclen = DIRENT64_RECLEN(2);
+ } else if (offset == 2 && zfs_show_ctldir(zp)) {
+ (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
+ zap.za_first_integer = ZFSCTL_INO_ROOT;
+ this_reclen =
+ DIRENT64_RECLEN(sizeof (ZFS_CTLDIR_NAME) - 1);
+ } else {
+ /*
+ * Grab next entry.
+ */
+ if (error = zap_cursor_retrieve(&zc, &zap)) {
+ if ((*eofp = (error == ENOENT)) != 0)
+ break;
+ else
+ goto update;
+ }
+
+ if (zap.za_integer_length != 8 ||
+ zap.za_num_integers != 1) {
+ cmn_err(CE_WARN, "zap_readdir: bad directory "
+ "entry, obj = %lld, offset = %lld\n",
+ (u_longlong_t)zp->z_id,
+ (u_longlong_t)offset);
+ error = ENXIO;
+ goto update;
+ }
+ this_reclen = DIRENT64_RECLEN(strlen(zap.za_name));
+ }
+
+ /*
+ * Will this entry fit in the buffer?
+ */
+ if (outcount + this_reclen > bufsize) {
+ /*
+ * Did we manage to fit anything in the buffer?
+ */
+ if (!outcount) {
+ error = EINVAL;
+ goto update;
+ }
+ break;
+ }
+ /*
+ * Add this entry:
+ */
+ odp->d_ino = (ino64_t)zap.za_first_integer;
+ odp->d_reclen = (ushort_t)this_reclen;
+ /* NOTE: d_off is the offset for the *next* entry */
+ next = &(odp->d_off);
+ (void) strncpy(odp->d_name, zap.za_name,
+ DIRENT64_NAMELEN(this_reclen));
+ outcount += this_reclen;
+ odp = (dirent64_t *)((intptr_t)odp + this_reclen);
+
+ ASSERT(outcount <= bufsize);
+
+ /* Prefetch znode */
+ dmu_prefetch(zfsvfs->z_os, zap.za_first_integer, 0, 0);
+
+ /*
+ * Move to the next entry, fill in the previous offset.
+ */
+ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
+ zap_cursor_advance(&zc);
+ offset = zap_cursor_serialize(&zc);
+ } else {
+ offset += 1;
+ }
+ /* Back-patch d_off of the entry emitted above. */
+ *next = offset;
+ }
+
+ if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
+ /* We filled the caller's buffer directly; just advance it. */
+ iovp->iov_base += outcount;
+ iovp->iov_len -= outcount;
+ uio->uio_resid -= outcount;
+ } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
+ /*
+ * Reset the pointer.
+ */
+ offset = uio->uio_loffset;
+ }
+
+update:
+ /* Free the bounce buffer only if we allocated one above. */
+ if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
+ kmem_free(outbuf, bufsize);
+
+ if (error == ENOENT)
+ error = 0;
+
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+
+ uio->uio_loffset = offset;
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr)
+{
+ znode_t *znode = VTOZ(vp);
+ zfsvfs_t *zfsp = znode->z_zfsvfs;
+
+ /*
+ * Force all intent-log records for this file, up to and including
+ * its last itx, out to stable storage before returning.
+ */
+ ZFS_ENTER(zfsp);
+ zil_commit(zfsp->z_log, znode->z_last_itx, FSYNC);
+ ZFS_EXIT(zfsp);
+ return (0);
+}
+
+/*
+ * Get the requested file attributes and place them in the provided
+ * vattr structure.
+ *
+ * IN: vp - vnode of file.
+ * vap - va_mask identifies requested attributes.
+ * flags - [UNUSED]
+ * cr - credentials of caller.
+ *
+ * OUT: vap - attribute values.
+ *
+ * RETURN: 0 (always succeeds)
+ */
+/* ARGSUSED */
+static int
+zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ znode_phys_t *pzp = zp->z_phys;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ /*
+ * Return all attributes. It's cheaper to provide the answer
+ * than to determine whether we were asked the question.
+ */
+ mutex_enter(&zp->z_lock);
+
+ vap->va_type = vp->v_type;
+ vap->va_mode = pzp->zp_mode & MODEMASK;
+ vap->va_uid = zp->z_phys->zp_uid;
+ vap->va_gid = zp->z_phys->zp_gid;
+ vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
+ vap->va_nodeid = zp->z_id;
+ vap->va_nlink = MIN(pzp->zp_links, UINT32_MAX); /* nlink_t limit! */
+ vap->va_size = pzp->zp_size;
+ vap->va_rdev = pzp->zp_rdev;
+ vap->va_seq = zp->z_seq;
+
+ ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
+ ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
+ ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
+
+ /*
+ * Owner should be allowed to always read_attributes
+ *
+ * NOTE(review): the zfs_zaccess() error is deliberately ignored
+ * when the caller's uid matches the file owner; only non-owners
+ * get the access error returned.
+ */
+ if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, cr)) {
+ if (zp->z_phys->zp_uid != crgetuid(cr)) {
+ mutex_exit(&zp->z_lock);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ mutex_exit(&zp->z_lock);
+
+ dmu_object_size_from_db(zp->z_dbuf, &vap->va_blksize, &vap->va_nblocks);
+
+ if (zp->z_blksz == 0) {
+ /*
+ * Block size hasn't been set; suggest maximal I/O transfers.
+ */
+ vap->va_blksize = zfsvfs->z_max_blksz;
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Set the file attributes to the values contained in the
+ * vattr structure.
+ *
+ * IN: vp - vnode of file to be modified.
+ * vap - new attribute values.
+ * flags - ATTR_UTIME set if non-default time values provided.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - ctime updated, mtime updated if size changed.
+ */
+/* ARGSUSED */
+static int
+zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ struct znode *zp = VTOZ(vp);
+ znode_phys_t *pzp = zp->z_phys;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ uint64_t seq = 0;
+ dmu_tx_t *tx;
+ uint_t mask = vap->va_mask;
+ uint_t mask_applied = 0;
+ vattr_t oldva;
+ uint64_t new_mode;
+ int have_grow_lock;
+ int need_policy = FALSE;
+ int err;
+
+ if (mask == 0)
+ return (0);
+
+ if (mask & AT_NOSET)
+ return (EINVAL);
+
+ if (mask & AT_SIZE && vp->v_type == VDIR)
+ return (EISDIR);
+
+ ZFS_ENTER(zfsvfs);
+
+ /* Restart point after dmu_tx_assign() returns ERESTART below. */
+top:
+ have_grow_lock = FALSE;
+
+ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+ ZFS_EXIT(zfsvfs);
+ return (EROFS);
+ }
+
+ /*
+ * First validate permissions
+ */
+
+ if (mask & AT_SIZE) {
+ err = zfs_zaccess(zp, ACE_WRITE_DATA, cr);
+ if (err) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ }
+
+ if (mask & (AT_ATIME|AT_MTIME))
+ need_policy = zfs_zaccess_v4_perm(zp, ACE_WRITE_ATTRIBUTES, cr);
+
+ if (mask & (AT_UID|AT_GID)) {
+ int idmask = (mask & (AT_UID|AT_GID));
+ int take_owner;
+ int take_group;
+
+ /*
+ * Take ownership or chgrp to group we are a member of
+ */
+
+ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
+ take_group = (mask & AT_GID) && groupmember(vap->va_gid, cr);
+
+ /*
+ * If both AT_UID and AT_GID are set then take_owner and
+ * take_group must both be set in order to allow taking
+ * ownership.
+ *
+ * Otherwise, send the check through secpolicy_vnode_setattr()
+ *
+ */
+
+ if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
+ ((idmask == AT_UID) && take_owner) ||
+ ((idmask == AT_GID) && take_group)) {
+ if (zfs_zaccess_v4_perm(zp, ACE_WRITE_OWNER, cr) == 0) {
+ /*
+ * Remove setuid/setgid for non-privileged users
+ */
+ if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 &&
+ secpolicy_vnode_setid_retain(cr,
+ (vap->va_mode & S_ISUID) != 0 &&
+ (mask & AT_UID) != 0 &&
+ vap->va_uid == 0) != 0) {
+ vap->va_mode = pzp->zp_mode;
+ vap->va_mask |= AT_MODE;
+ vap->va_mode &= ~(S_ISUID|S_ISGID);
+ }
+ } else {
+ need_policy = TRUE;
+ }
+ } else {
+ need_policy = TRUE;
+ }
+ }
+
+ if (mask & AT_MODE)
+ need_policy = TRUE;
+
+ if (need_policy) {
+ mutex_enter(&zp->z_lock);
+ oldva.va_mode = pzp->zp_mode;
+ oldva.va_uid = zp->z_phys->zp_uid;
+ oldva.va_gid = zp->z_phys->zp_gid;
+ mutex_exit(&zp->z_lock);
+ err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
+ (int (*)(void *, int, cred_t *))zfs_zaccess_rwx, zp);
+ if (err) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ }
+
+ /*
+ * secpolicy_vnode_setattr, or take ownership may have
+ * changed va_mask
+ */
+ mask = vap->va_mask;
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+
+ if (mask & AT_MODE) {
+
+ new_mode = (pzp->zp_mode & S_IFMT) | (vap->va_mode & ~S_IFMT);
+
+ if (zp->z_phys->zp_acl.z_acl_extern_obj)
+ dmu_tx_hold_write(tx,
+ pzp->zp_acl.z_acl_extern_obj, 0, SPA_MAXBLOCKSIZE);
+ else
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, ZFS_ACL_SIZE(MAX_ACL_SIZE));
+ }
+
+ if (mask & AT_SIZE) {
+ uint64_t off = vap->va_size;
+ /*
+ * Grab the grow_lock to serialize this change with
+ * respect to other file manipulations.
+ */
+ rw_enter(&zp->z_grow_lock, RW_WRITER);
+ have_grow_lock = TRUE;
+ if (off < zp->z_phys->zp_size)
+ dmu_tx_hold_free(tx, zp->z_id, off, DMU_OBJECT_END);
+ else if (zp->z_phys->zp_size &&
+ zp->z_blksz < zfsvfs->z_max_blksz && off > zp->z_blksz)
+ /* we will rewrite this block if we grow */
+ dmu_tx_hold_write(tx, zp->z_id, 0, zp->z_phys->zp_size);
+ }
+
+ err = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (err) {
+ dmu_tx_abort(tx);
+ if (have_grow_lock)
+ rw_exit(&zp->z_grow_lock);
+ if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+ goto top;
+ }
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+
+ /*
+ * Set each attribute requested.
+ * We group settings according to the locks they need to acquire.
+ *
+ * Note: you cannot set ctime directly, although it will be
+ * updated as a side-effect of calling this function.
+ */
+ if (mask & AT_SIZE) {
+ /*
+ * XXX - Note, we are not providing any open
+ * mode flags here (like FNDELAY), so we may
+ * block if there are locks present... this
+ * should be addressed in openat().
+ */
+ err = zfs_freesp(zp, vap->va_size, 0, 0, tx, cr);
+ if (err) {
+ mutex_enter(&zp->z_lock);
+ goto out;
+ }
+ mask_applied |= AT_SIZE;
+ }
+
+ mask_applied = mask; /* no errors after this point */
+
+ mutex_enter(&zp->z_lock);
+
+ if (mask & AT_MODE) {
+ err = zfs_acl_chmod_setattr(zp, new_mode, tx);
+ ASSERT3U(err, ==, 0);
+ }
+
+ /*
+ * NOTE(review): oldva is only initialized in the need_policy
+ * branch above, yet it is compared here unconditionally. If a
+ * chown/chgrp reaches this point with need_policy == FALSE
+ * (owner taking ownership with ACE_WRITE_OWNER), oldva.va_uid /
+ * oldva.va_gid appear to be read uninitialized — confirm and
+ * consider initializing oldva unconditionally.
+ */
+ if ((mask & AT_UID) && vap->va_uid != oldva.va_uid)
+ zp->z_phys->zp_uid = (uint64_t)vap->va_uid;
+
+ if ((mask & AT_GID) && vap->va_gid != oldva.va_gid)
+ zp->z_phys->zp_gid = (uint64_t)vap->va_gid;
+
+ if (mask & AT_ATIME)
+ ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
+
+ if (mask & AT_MTIME)
+ ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
+
+ if (mask_applied & AT_SIZE)
+ zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
+ else if (mask_applied != 0)
+ zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+
+out:
+ if (mask_applied != 0)
+ seq = zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap,
+ mask_applied);
+
+ mutex_exit(&zp->z_lock);
+
+ if (have_grow_lock)
+ rw_exit(&zp->z_grow_lock);
+
+ dmu_tx_commit(tx);
+
+ zil_commit(zilog, seq, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (err);
+}
+
+/*
+ * Search back through the directory tree, using the ".." entries.
+ * Lock each directory in the chain to prevent concurrent renames.
+ * Fail any attempt to move a directory into one of its own descendants.
+ * XXX - z_parent_lock can overlap with map or grow locks
+ */
+typedef struct zfs_zlock {
+ krwlock_t *zl_rwlock; /* lock we acquired */
+ znode_t *zl_znode; /* znode we held */
+ struct zfs_zlock *zl_next; /* next in list */
+} zfs_zlock_t;
+
+/*
+ * Walk from the target directory (tdzp) up toward the root, locking
+ * each parent, until we either reach the source directory (sdzp), the
+ * root, or discover that tdzp is a descendant of szp (EINVAL).
+ *
+ * Note: each zfs_zlock_t is linked into *zlpp BEFORE its lock is taken
+ * and before any error return, so the caller must always unwind with
+ * zfs_rename_unlock(zlpp), even when this function returns an error.
+ */
+static int
+zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
+{
+ zfs_zlock_t *zl;
+ znode_t *zp = tdzp;
+ uint64_t rootid = zp->z_zfsvfs->z_root;
+ uint64_t *oidp = &zp->z_id;
+ krwlock_t *rwlp = &szp->z_parent_lock;
+ krw_t rw = RW_WRITER;
+
+ /*
+ * First pass write-locks szp and compares to zp->z_id.
+ * Later passes read-lock zp and compare to zp->z_parent.
+ */
+ do {
+ zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
+ zl->zl_rwlock = rwlp;
+ zl->zl_znode = NULL;
+ zl->zl_next = *zlpp;
+ *zlpp = zl;
+
+ rw_enter(rwlp, rw);
+
+ if (*oidp == szp->z_id) /* We're a descendant of szp */
+ return (EINVAL);
+
+ if (*oidp == rootid) /* We've hit the top */
+ return (0);
+
+ if (rw == RW_READER) { /* i.e. not the first pass */
+ int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
+ if (error)
+ return (error);
+ zl->zl_znode = zp;
+ }
+ oidp = &zp->z_phys->zp_parent;
+ rwlp = &zp->z_parent_lock;
+ rw = RW_READER;
+
+ } while (zp->z_id != sdzp->z_id);
+
+ return (0);
+}
+
+/*
+ * Drop locks and release vnodes that were held by zfs_rename_lock().
+ */
+static void
+zfs_rename_unlock(zfs_zlock_t **zlpp)
+{
+ zfs_zlock_t *entry;
+
+ /* Unwind the chain: release any held znode, drop the lock, free. */
+ for (entry = *zlpp; entry != NULL; entry = *zlpp) {
+ if (entry->zl_znode != NULL)
+ VN_RELE(ZTOV(entry->zl_znode));
+ rw_exit(entry->zl_rwlock);
+ *zlpp = entry->zl_next;
+ kmem_free(entry, sizeof (*entry));
+ }
+}
+
+/*
+ * Move an entry from the provided source directory to the target
+ * directory. Change the entry name as indicated.
+ *
+ * IN: sdvp - Source directory containing the "old entry".
+ * snm - Old entry name.
+ * tdvp - Target directory to contain the "new entry".
+ * tnm - New entry name.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * sdvp,tdvp - ctime|mtime updated
+ */
+static int
+zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr)
+{
+ znode_t *tdzp, *szp, *tzp;
+ znode_t *sdzp = VTOZ(sdvp);
+ zfsvfs_t *zfsvfs = sdzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ uint64_t seq = 0;
+ vnode_t *realvp;
+ zfs_dirlock_t *sdl, *tdl;
+ dmu_tx_t *tx;
+ zfs_zlock_t *zl;
+ int cmp, serr, terr, error;
+
+ ZFS_ENTER(zfsvfs);
+
+ /*
+ * Make sure we have the real vp for the target directory.
+ */
+ if (VOP_REALVP(tdvp, &realvp) == 0)
+ tdvp = realvp;
+
+ if (tdvp->v_vfsp != sdvp->v_vfsp) {
+ ZFS_EXIT(zfsvfs);
+ return (EXDEV);
+ }
+
+ tdzp = VTOZ(tdvp);
+ /*
+ * Restart point after dmu_tx_assign() returns ERESTART below;
+ * all directory locks and holds are re-acquired from scratch.
+ */
+top:
+ szp = NULL;
+ tzp = NULL;
+ zl = NULL;
+
+ /*
+ * This is to prevent the creation of links into attribute space
+ * by renaming a linked file into/outof an attribute directory.
+ * See the comment in zfs_link() for why this is considered bad.
+ */
+ if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
+ (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ /*
+ * Lock source and target directory entries. To prevent deadlock,
+ * a lock ordering must be defined. We lock the directory with
+ * the smallest object id first, or if it's a tie, the one with
+ * the lexically first name.
+ */
+ if (sdzp->z_id < tdzp->z_id) {
+ cmp = -1;
+ } else if (sdzp->z_id > tdzp->z_id) {
+ cmp = 1;
+ } else {
+ cmp = strcmp(snm, tnm);
+ if (cmp == 0) {
+ /*
+ * POSIX: "If the old argument and the new argument
+ * both refer to links to the same existing file,
+ * the rename() function shall return successfully
+ * and perform no other action."
+ */
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+ }
+ if (cmp < 0) {
+ serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
+ terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
+ } else {
+ terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
+ serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
+ }
+
+ if (serr) {
+ /*
+ * Source entry invalid or not there.
+ */
+ if (!terr) {
+ zfs_dirent_unlock(tdl);
+ if (tzp)
+ VN_RELE(ZTOV(tzp));
+ }
+ if (strcmp(snm, "..") == 0)
+ serr = EINVAL;
+ ZFS_EXIT(zfsvfs);
+ return (serr);
+ }
+ if (terr) {
+ zfs_dirent_unlock(sdl);
+ VN_RELE(ZTOV(szp));
+ if (strcmp(tnm, "..") == 0)
+ terr = EINVAL;
+ ZFS_EXIT(zfsvfs);
+ return (terr);
+ }
+
+ /*
+ * Must have write access at the source to remove the old entry
+ * and write access at the target to create the new entry.
+ * Note that if target and source are the same, this can be
+ * done in a single check.
+ */
+
+ if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
+ goto out;
+
+ if (ZTOV(szp)->v_type == VDIR) {
+ /*
+ * Check to make sure rename is valid.
+ * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
+ * (zl is unwound at out: even if this fails partway.)
+ */
+ if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
+ goto out;
+ }
+
+ /*
+ * Does target exist?
+ */
+ if (tzp) {
+ /*
+ * Source and target must be the same type.
+ */
+ if (ZTOV(szp)->v_type == VDIR) {
+ if (ZTOV(tzp)->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ }
+ } else {
+ if (ZTOV(tzp)->v_type == VDIR) {
+ error = EISDIR;
+ goto out;
+ }
+ }
+ /*
+ * POSIX dictates that when the source and target
+ * entries refer to the same file object, rename
+ * must do nothing and exit without error.
+ */
+ if (szp->z_id == tzp->z_id) {
+ error = 0;
+ goto out;
+ }
+ }
+
+ vnevent_rename_src(ZTOV(szp));
+ if (tzp)
+ vnevent_rename_dest(ZTOV(tzp));
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */
+ dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */
+ if (sdzp != tdzp) {
+ dmu_tx_hold_zap(tx, sdzp->z_id, 1);
+ dmu_tx_hold_zap(tx, tdzp->z_id, 1);
+ dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */
+ } else {
+ dmu_tx_hold_zap(tx, sdzp->z_id, 2);
+ }
+ if (tzp) {
+ dmu_tx_hold_bonus(tx, tzp->z_id); /* nlink changes */
+ }
+ dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ dmu_tx_abort(tx);
+ if (zl != NULL)
+ zfs_rename_unlock(&zl);
+ zfs_dirent_unlock(sdl);
+ zfs_dirent_unlock(tdl);
+ VN_RELE(ZTOV(szp));
+ if (tzp)
+ VN_RELE(ZTOV(tzp));
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+ goto top;
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (tzp) /* Attempt to remove the existing target */
+ error = zfs_link_destroy(tdl, tzp, tx, 0, NULL);
+
+ if (error == 0) {
+ error = zfs_link_create(tdl, szp, tx, ZRENAMING);
+ if (error == 0) {
+ error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
+ ASSERT(error == 0);
+ seq = zfs_log_rename(zilog, tx, TX_RENAME,
+ sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
+ }
+ }
+
+ dmu_tx_commit(tx);
+out:
+ if (zl != NULL)
+ zfs_rename_unlock(&zl);
+
+ zfs_dirent_unlock(sdl);
+ zfs_dirent_unlock(tdl);
+
+ VN_RELE(ZTOV(szp));
+ if (tzp)
+ VN_RELE(ZTOV(tzp));
+
+ zil_commit(zilog, seq, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Insert the indicated symbolic reference entry into the directory.
+ *
+ * IN: dvp - Directory to contain new symbolic link.
+ * link - Name for new symlink entry.
+ * vap - Attributes of new entry.
+ * target - Target path of new symlink.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated
+ */
+static int
+zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr)
+{
+ znode_t *zp, *dzp = VTOZ(dvp);
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ uint64_t seq = 0;
+ uint64_t zoid;
+ int len = strlen(link);
+ int error;
+
+ ASSERT(vap->va_type == VLNK);
+
+ ZFS_ENTER(zfsvfs);
+ /* Restart point after dmu_tx_assign() returns ERESTART below. */
+top:
+ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (len > MAXPATHLEN) {
+ ZFS_EXIT(zfsvfs);
+ return (ENAMETOOLONG);
+ }
+
+ /*
+ * Attempt to lock directory; fail if entry already exists.
+ */
+ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZNEW)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
+ dmu_tx_hold_bonus(tx, dzp->z_id);
+ dmu_tx_hold_zap(tx, dzp->z_id, 1);
+ if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ dmu_tx_abort(tx);
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+ goto top;
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ dmu_buf_will_dirty(dzp->z_dbuf, tx);
+
+ /*
+ * Create a new object for the symlink.
+ * Put the link content into bonus buffer if it will fit;
+ * otherwise, store it just like any other file data.
+ */
+ zoid = 0;
+ if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
+ zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, len);
+ if (len != 0)
+ bcopy(link, zp->z_phys + 1, len);
+ } else {
+ dmu_buf_t *dbp;
+ zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
+
+ rw_enter(&zp->z_grow_lock, RW_WRITER);
+ error = zfs_grow_blocksize(zp, len, tx);
+ rw_exit(&zp->z_grow_lock);
+ /* On failure we still fall through to out: to commit the tx. */
+ if (error)
+ goto out;
+
+ dbp = dmu_buf_hold(zfsvfs->z_os, zoid, 0);
+ dmu_buf_will_dirty(dbp, tx);
+
+ ASSERT3U(len, <=, dbp->db_size);
+ bcopy(link, dbp->db_data, len);
+ dmu_buf_rele(dbp);
+ }
+ zp->z_phys->zp_size = len;
+
+ /*
+ * Insert the new object into the directory.
+ */
+ (void) zfs_link_create(dl, zp, tx, ZNEW);
+out:
+ if (error == 0)
+ seq = zfs_log_symlink(zilog, tx, TX_SYMLINK,
+ dzp, zp, name, link);
+
+ dmu_tx_commit(tx);
+
+ zfs_dirent_unlock(dl);
+
+ VN_RELE(ZTOV(zp));
+
+ zil_commit(zilog, seq, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Return, in the buffer contained in the provided uio structure,
+ * the symbolic path referred to by vp.
+ *
+ * IN: vp - vnode of symbolic link.
+ * uio - structure to contain the link path.
+ * cr - credentials of caller.
+ *
+ * OUT: uio - structure to contain the link path.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - atime updated
+ */
+/* ARGSUSED */
+static int
+zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	size_t bufsz;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+
+	bufsz = (size_t)zp->z_phys->zp_size;
+	/*
+	 * Short targets live in the znode's bonus buffer immediately
+	 * after the znode_phys_t; longer ones are stored as ordinary
+	 * file data (the mirror of the layout chosen in zfs_symlink()).
+	 */
+	if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
+		error = uiomove(zp->z_phys + 1,
+		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
+	} else {
+		dmu_buf_t *dbp = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0);
+		if ((error = dmu_buf_read_canfail(dbp)) != 0) {
+			dmu_buf_rele(dbp);
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+		error = uiomove(dbp->db_data,
+		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
+		dmu_buf_rele(dbp);
+	}
+
+	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Insert a new entry into directory tdvp referencing svp.
+ *
+ * IN: tdvp - Directory to contain new entry.
+ * svp - vnode of new entry.
+ * name - name of new entry.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * tdvp - ctime|mtime updated
+ * svp - ctime updated
+ */
+/* ARGSUSED */
+static int
+zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr)
+{
+	znode_t *dzp = VTOZ(tdvp);
+	znode_t *tzp, *szp;
+	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+	zilog_t *zilog = zfsvfs->z_log;
+	uint64_t seq = 0;
+	zfs_dirlock_t *dl;
+	dmu_tx_t *tx;
+	vnode_t *realvp;
+	int error;
+
+	ASSERT(tdvp->v_type == VDIR);
+
+	ZFS_ENTER(zfsvfs);
+
+	/* Peel off any stacked file system (e.g. lofs) to the real vnode. */
+	if (VOP_REALVP(svp, &realvp) == 0)
+		svp = realvp;
+
+	/* Hard links cannot span file systems. */
+	if (svp->v_vfsp != tdvp->v_vfsp) {
+		ZFS_EXIT(zfsvfs);
+		return (EXDEV);
+	}
+
+	szp = VTOZ(svp);
+top:
+	/*
+	 * We do not support links between attributes and non-attributes
+	 * because of the potential security risk of creating links
+	 * into "normal" file space in order to circumvent restrictions
+	 * imposed in attribute space.
+	 */
+	if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
+	    (dzp->z_phys->zp_flags & ZFS_XATTR)) {
+		ZFS_EXIT(zfsvfs);
+		return (EINVAL);
+	}
+
+	/*
+	 * POSIX dictates that we return EPERM here.
+	 * Better choices include ENOTSUP or EISDIR.
+	 */
+	if (svp->v_type == VDIR) {
+		ZFS_EXIT(zfsvfs);
+		return (EPERM);
+	}
+
+	/* Only the file's owner (or a privileged caller) may link to it. */
+	if ((uid_t)szp->z_phys->zp_uid != crgetuid(cr) &&
+	    secpolicy_basic_link(cr) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (EPERM);
+	}
+
+	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Attempt to lock directory; fail if entry already exists.
+	 */
+	if (error = zfs_dirent_lock(&dl, dzp, name, &tzp, ZNEW)) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_bonus(tx, szp->z_id);
+	dmu_tx_hold_zap(tx, dzp->z_id, 1);
+	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	if (error) {
+		dmu_tx_abort(tx);
+		zfs_dirent_unlock(dl);
+		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+			txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+			goto top;
+		}
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	error = zfs_link_create(dl, szp, tx, 0);
+
+	if (error == 0)
+		seq = zfs_log_link(zilog, tx, TX_LINK, dzp, szp, name);
+
+	dmu_tx_commit(tx);
+
+	zfs_dirent_unlock(dl);
+
+	zil_commit(zilog, seq, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Page-out handler installed once the file system has been forcibly
+ * unmounted: there is nowhere left to write the data, so the pages
+ * are simply invalidated and discarded.
+ */
+/* ARGSUSED */
+static int
+zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
+	size_t *lenp, int flags, cred_t *cr)
+{
+	int done_flags = B_INVAL | B_FORCE | B_ERROR;
+
+	pvn_write_done(pp, done_flags);
+	return (0);
+}
+
+/*
+ * Push a single dirty page to the DMU inside its own transaction
+ * and append a ZIL write record for it.
+ *
+ * OUT: offp/lenp - if non-NULL, set to the file offset and byte
+ * count actually written.
+ */
+/* ARGSUSED */
+static int
+zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
+	size_t *lenp, int flags, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	zilog_t *zilog = zfsvfs->z_log;
+	uint64_t seq = 0;
+	dmu_tx_t *tx;
+	u_offset_t off;
+	ssize_t len;
+	caddr_t va;
+	int err;
+
+top:
+	/* Block file restructuring (block size changes) during the write. */
+	rw_enter(&zp->z_grow_lock, RW_READER);
+
+	off = pp->p_offset;
+	/* Clip the final page to the file size. */
+	len = MIN(PAGESIZE, zp->z_phys->zp_size - off);
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_write(tx, zp->z_id, off, len);
+	dmu_tx_hold_bonus(tx, zp->z_id);
+	err = dmu_tx_assign(tx, zfsvfs->z_assign);
+	if (err != 0) {
+		dmu_tx_abort(tx);
+		rw_exit(&zp->z_grow_lock);
+		if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+			txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+			goto top;
+		}
+		/*
+		 * NOTE(review): this error return skips pvn_write_done(),
+		 * so the page appears to be left held — confirm the
+		 * caller copes, or whether an error write-done is needed.
+		 */
+		goto out;
+	}
+
+	/* Map the page into the kernel and copy it into the DMU. */
+	va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1);
+
+	dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
+
+	ppmapout(va);
+
+	zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+	seq = zfs_log_write(zilog, tx, TX_WRITE, zp, off, len, 0, NULL);
+	dmu_tx_commit(tx);
+
+	rw_exit(&zp->z_grow_lock);
+
+	pvn_write_done(pp, B_WRITE | flags);
+	if (offp)
+		*offp = off;
+	if (lenp)
+		*lenp = len;
+
+	zil_commit(zilog, seq, 0);
+out:
+	return (err);
+}
+
+/*
+ * Copy the portion of the file indicated from pages into the file.
+ * The pages are stored in a page list attached to the files vnode.
+ *
+ * IN: vp - vnode of file to push page data to.
+ * off - position in file to put data.
+ * len - amount of data to write.
+ * flags - flags to control the operation.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - ctime|mtime updated
+ */
+static int
+zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	page_t *pp;
+	size_t io_len;
+	u_offset_t io_off;
+	int error = 0;
+
+	ZFS_ENTER(zfsvfs);
+
+	ASSERT(zp->z_dbuf_held && zp->z_phys);
+
+	if (len == 0) {
+		/*
+		 * Search the entire vp list for pages >= off.
+		 */
+		error = pvn_vplist_dirty(vp, (u_offset_t)off, zfs_putapage,
+		    flags, cr);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (off > zp->z_phys->zp_size) {
+		/* past end of file */
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/* Clip the request to the current file size. */
+	len = MIN(len, zp->z_phys->zp_size - off);
+
+	io_off = off;
+	while (io_off < off + len) {
+		/*
+		 * Synchronous (or invalidating) requests may block for
+		 * the page lock; async requests must not.
+		 */
+		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
+			pp = page_lookup(vp, io_off,
+			    (flags & (B_INVAL | B_FREE)) ?
+				SE_EXCL : SE_SHARED);
+		} else {
+			pp = page_lookup_nowait(vp, io_off,
+			    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
+		}
+
+		if (pp != NULL && pvn_getdirty(pp, flags)) {
+			int err;
+
+			/*
+			 * Found a dirty page to push
+			 */
+			if (err =
+			    zfs_putapage(vp, pp, &io_off, &io_len, flags, cr))
+				error = err;
+		} else {
+			/* No dirty page here; advance by one page. */
+			io_len = PAGESIZE;
+		}
+		io_off += io_len;
+	}
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * VOP_INACTIVE handler: the last reference to the vnode is going
+ * away.  Push any cached pages, sync a dirty atime, then release
+ * the znode.  If the file system was forcibly unmounted, just toss
+ * the pages and free the znode directly.
+ */
+void
+zfs_inactive(vnode_t *vp, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int error;
+
+	rw_enter(&zfsvfs->z_um_lock, RW_READER);
+	if (zfsvfs->z_unmounted2) {
+		ASSERT(zp->z_dbuf_held == 0);
+
+		if (vn_has_cached_data(vp)) {
+			(void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
+			    B_INVAL, cr);
+		}
+
+		vp->v_count = 0; /* count arrives as 1 */
+		zfs_znode_free(zp);
+		rw_exit(&zfsvfs->z_um_lock);
+		VFS_RELE(zfsvfs->z_vfs);
+		return;
+	}
+
+	/*
+	 * Attempt to push any data in the page cache. If this fails
+	 * we will get kicked out later in zfs_zinactive().
+	 */
+	if (vn_has_cached_data(vp))
+		(void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL, cr);
+
+	/* Flush a deferred atime update unless the file is being reaped. */
+	if (zp->z_atime_dirty && zp->z_reap == 0) {
+		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+		dmu_tx_hold_bonus(tx, zp->z_id);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+		} else {
+			dmu_buf_will_dirty(zp->z_dbuf, tx);
+			mutex_enter(&zp->z_lock);
+			zp->z_atime_dirty = 0;
+			mutex_exit(&zp->z_lock);
+			dmu_tx_commit(tx);
+		}
+	}
+
+	zfs_zinactive(zp);
+	rw_exit(&zfsvfs->z_um_lock);
+}
+
+/*
+ * Validate a proposed seek target.
+ *
+ * IN: vp - vnode being seeked
+ * ooff - previous offset
+ * noffp - candidate new offset
+ *
+ * RETURN: 0 if the offset is acceptable, EINVAL otherwise.
+ */
+/* ARGSUSED */
+static int
+zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp)
+{
+	offset_t noff = *noffp;
+
+	/* Directory offsets are opaque cookies; accept anything. */
+	if (vp->v_type == VDIR)
+		return (0);
+
+	if (noff < 0 || noff > MAXOFFSET_T)
+		return (EINVAL);
+	return (0);
+}
+
+/*
+ * Pre-filter the generic locking function to trap attempts to place
+ * a mandatory lock on a memory mapped file.
+ */
+static int
+zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
+	flk_callback_t *flk_cbp, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	uint_t cnt = 1;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+
+	/*
+	 * If file is being mapped, disallow frlock. We set the mapcnt to
+	 * -1 here to signal that we are in the process of setting a lock.
+	 * This prevents a race with zfs_map().
+	 * XXX - well, sort of; since zfs_map() does not change z_mapcnt,
+	 * we could be in the middle of zfs_map() and still call fs_frlock().
+	 * Also, we are doing no checking in zfs_addmap() (where z_mapcnt
+	 * *is* manipulated).
+	 */
+	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
+	    (int)(cnt = atomic_cas_32(&zp->z_mapcnt, 0, -1)) > 0) {
+		ZFS_EXIT(zfsvfs);
+		return (EAGAIN);
+	}
+	error = fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr);
+	/*
+	 * If we installed the -1 "lock in progress" sentinel above
+	 * (cnt == 0), clear it now.  The restoring compare-and-swap
+	 * must NOT live inside the ASSERT(): asserts compile away in
+	 * non-DEBUG kernels, which would leave z_mapcnt stuck at -1
+	 * and permanently block future frlock/mmap attempts.
+	 */
+	if (cnt == 0) {
+		cnt = atomic_cas_32(&zp->z_mapcnt, -1, 0);
+		ASSERT((int)cnt == -1);
+	}
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * If we can't find a page in the cache, we will create a new page
+ * and fill it with file data. For efficiency, we may try to fill
+ * multiple pages at once (klustering).
+ *
+ * OUT: pl - null-terminated list of pages created/filled; *pl is
+ * set to NULL if another thread raced us to the page.
+ *
+ * RETURN: 0 on success, error from dmu_read_canfail() on failure
+ * (in which case the entire kluster is tossed).
+ */
+static int
+zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
+	caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
+{
+	znode_t *zp = VTOZ(vp);
+	page_t *pp, *cur_pp;
+	objset_t *os = zp->z_zfsvfs->z_os;
+	caddr_t va;
+	u_offset_t io_off, total;
+	uint64_t oid = zp->z_id;
+	size_t io_len;
+	int err;
+
+	/*
+	 * If we are only asking for a single page don't bother klustering.
+	 */
+	if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE ||
+	    off > zp->z_phys->zp_size) {
+		io_off = off;
+		io_len = PAGESIZE;
+		pp = page_create_va(vp, io_off, io_len, PG_WAIT, seg, addr);
+	} else {
+		/*
+		 * Try to fill a kluster of pages (a blocks worth).
+		 */
+		size_t klen;
+		u_offset_t koff;
+
+		if (!ISP2(zp->z_blksz)) {
+			/* Only one block in the file. */
+			klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
+			koff = 0;
+		} else {
+			klen = plsz;
+			koff = P2ALIGN(off, (u_offset_t)klen);
+		}
+		if (klen > zp->z_phys->zp_size)
+			klen = P2ROUNDUP(zp->z_phys->zp_size,
+			    (uint64_t)PAGESIZE);
+		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
+		    &io_len, koff, klen, 0);
+	}
+	if (pp == NULL) {
+		/*
+		 * Some other thread entered the page before us.
+		 * Return to zfs_getpage to retry the lookup.
+		 */
+		*pl = NULL;
+		return (0);
+	}
+
+	/*
+	 * Fill the pages in the kluster.
+	 */
+	cur_pp = pp;
+	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
+		ASSERT(io_off == cur_pp->p_offset);
+		va = ppmapin(cur_pp, PROT_READ | PROT_WRITE, (caddr_t)-1);
+		err = dmu_read_canfail(os, oid, io_off, PAGESIZE, va);
+		ppmapout(va);
+		if (err) {
+			/* On error, toss the entire kluster */
+			pvn_read_done(pp, B_ERROR);
+			return (err);
+		}
+		cur_pp = cur_pp->p_next;
+	}
+	/*
+	 * (The unused "out:" label that used to sit here was removed;
+	 * nothing in this function jumps to it, and it drew an
+	 * unused-label warning.)
+	 *
+	 * Fill in the page list array from the kluster. If
+	 * there are too many pages in the kluster, return
+	 * as many pages as possible starting from the desired
+	 * offset `off'.
+	 * NOTE: the page list will always be null terminated.
+	 */
+	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
+
+	return (0);
+}
+
+/*
+ * Return pointers to the pages for the file region [off, off + len]
+ * in the pl array. If plsz is greater than len, this function may
+ * also return page pointers from before or after the specified
+ * region (i.e. some region [off', off' + plsz]). These additional
+ * pages are only returned if they are already in the cache, or were
+ * created as part of a klustered read.
+ *
+ * IN: vp - vnode of file to get data from.
+ * off - position in file to get data from.
+ * len - amount of data to retrieve.
+ * plsz - length of provided page list.
+ * seg - segment to obtain pages for.
+ * addr - virtual address of fault.
+ * rw - mode of created pages.
+ * cr - credentials of caller.
+ *
+ * OUT: protp - protection mode of created pages.
+ * pl - list of pages created.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - atime updated
+ */
+/* ARGSUSED */
+static int
+zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
+	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
+	enum seg_rw rw, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	page_t *pp, **pl0 = pl;
+	int cnt = 0, need_unlock = 0, err = 0;
+
+	ZFS_ENTER(zfsvfs);
+
+	if (protp)
+		*protp = PROT_ALL;
+
+	ASSERT(zp->z_dbuf_held && zp->z_phys);
+
+	/* no faultahead (for now) */
+	if (pl == NULL) {
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/* can't fault past EOF */
+	if (off >= zp->z_phys->zp_size) {
+		ZFS_EXIT(zfsvfs);
+		return (EFAULT);
+	}
+
+	/*
+	 * Make sure nobody restructures the file (changes block size)
+	 * in the middle of the getpage.
+	 */
+	rw_enter(&zp->z_grow_lock, RW_READER);
+
+	/*
+	 * If we already own the lock, then we must be page faulting
+	 * in the middle of a write to this file (i.e., we are writing
+	 * to this file using data from a mapped region of the file).
+	 */
+	if (!rw_owner(&zp->z_map_lock)) {
+		rw_enter(&zp->z_map_lock, RW_WRITER);
+		need_unlock = TRUE;
+	}
+
+	/*
+	 * Loop through the requested range [off, off + len] looking
+	 * for pages. If we don't find a page, we will need to create
+	 * a new page and fill it with data from the file.
+	 */
+	while (len > 0) {
+		if (plsz < PAGESIZE)
+			break;
+		if (pp = page_lookup(vp, off, SE_SHARED)) {
+			*pl++ = pp;
+			off += PAGESIZE;
+			addr += PAGESIZE;
+			len -= PAGESIZE;
+			plsz -= PAGESIZE;
+		} else {
+			err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw);
+			/*
+			 * klustering may have changed our region
+			 * to be block aligned.
+			 */
+			if (((pp = *pl) != 0) && (off != pp->p_offset)) {
+				int delta = off - pp->p_offset;
+				len += delta;
+				off -= delta;
+				addr -= delta;
+			}
+			/* Walk past the pages zfs_fillpage() just added. */
+			while (*pl) {
+				pl++;
+				cnt++;
+				off += PAGESIZE;
+				addr += PAGESIZE;
+				plsz -= PAGESIZE;
+				if (len > PAGESIZE)
+					len -= PAGESIZE;
+				else
+					len = 0;
+			}
+		}
+		if (err)
+			goto out;
+	}
+
+	/*
+	 * Fill out the page array with any pages already in the cache.
+	 */
+	while (plsz > 0) {
+		pp = page_lookup_nowait(vp, off, SE_SHARED);
+		if (pp == NULL)
+			break;
+		*pl++ = pp;
+		off += PAGESIZE;
+		plsz -= PAGESIZE;
+	}
+
+	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+out:
+	if (err) {
+		/*
+		 * Release any pages we have locked.
+		 */
+		while (pl > pl0)
+			page_unlock(*--pl);
+	}
+	/* The returned page list is always NULL-terminated. */
+	*pl = NULL;
+
+	if (need_unlock)
+		rw_exit(&zp->z_map_lock);
+	rw_exit(&zp->z_grow_lock);
+
+	ZFS_EXIT(zfsvfs);
+	return (err);
+}
+
+/*
+ * VOP_MAP handler: validate the request and create a segvn mapping
+ * of the file into the caller's address space.
+ */
+static int
+zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
+	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	segvn_crargs_t vn_a;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+
+	if (vp->v_flag & VNOMAP) {
+		ZFS_EXIT(zfsvfs);
+		return (ENOSYS);
+	}
+
+	/* Reject negative offsets and ranges that wrap past MAXOFFSET_T. */
+	if (off < 0 || len > MAXOFFSET_T - off) {
+		ZFS_EXIT(zfsvfs);
+		return (ENXIO);
+	}
+
+	if (vp->v_type != VREG) {
+		ZFS_EXIT(zfsvfs);
+		return (ENODEV);
+	}
+
+	/*
+	 * If file is locked, disallow mapping.
+	 * XXX - since we don't modify z_mapcnt here, there is nothing
+	 * to stop a file lock being placed immediately after we complete
+	 * this check.
+	 */
+	if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
+		if (vn_has_flocks(vp) || zp->z_mapcnt == -1) {
+			ZFS_EXIT(zfsvfs);
+			return (EAGAIN);
+		}
+	}
+
+	as_rangelock(as);
+	if ((flags & MAP_FIXED) == 0) {
+		/* Let the VM system choose an address. */
+		map_addr(addrp, len, off, 1, flags);
+		if (*addrp == NULL) {
+			as_rangeunlock(as);
+			ZFS_EXIT(zfsvfs);
+			return (ENOMEM);
+		}
+	} else {
+		/*
+		 * User specified address - blow away any previous mappings
+		 */
+		(void) as_unmap(as, *addrp, len);
+	}
+
+	vn_a.vp = vp;
+	vn_a.offset = (u_offset_t)off;
+	vn_a.type = flags & MAP_TYPE;
+	vn_a.prot = prot;
+	vn_a.maxprot = maxprot;
+	vn_a.cred = cr;
+	vn_a.amp = NULL;
+	vn_a.flags = flags & ~MAP_TYPE;
+
+	error = as_map(as, *addrp, len, segvn_create, &vn_a);
+
+	as_rangeunlock(as);
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * VOP_ADDMAP handler: bump the mapped-page count by the number of
+ * pages this new mapping covers.
+ */
+/* ARGSUSED */
+static int
+zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
+	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+
+	/*
+	 * XXX - shouldn't we be checking for file locks here?
+	 */
+	ASSERT3U(zp->z_mapcnt, >=, 0);
+	atomic_add_32(&zp->z_mapcnt, btopr(len));
+	return (0);
+}
+
+/*
+ * VOP_DELMAP handler: drop the pages the departing mapping covered
+ * from the mapped-page count.
+ */
+/* ARGSUSED */
+static int
+zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
+	size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+
+	atomic_add_32(&zp->z_mapcnt, -btopr(len));
+	ASSERT3U(zp->z_mapcnt, >=, 0);
+	return (0);
+}
+
+/*
+ * Free or allocate space in a file. Currently, this function only
+ * supports the `F_FREESP' command. However, this command is somewhat
+ * misnamed, as its functionality includes the ability to allocate as
+ * well as free space.
+ *
+ * IN: vp - vnode of file to free data in.
+ * cmd - action to take (only F_FREESP supported).
+ * bfp - section of file to free/alloc.
+ * flag - current file open mode flags.
+ * offset - current file offset.
+ * cr - credentials of caller [UNUSED].
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - ctime|mtime updated
+ *
+ * NOTE: This function is limited in that it will only permit space to
+ * be freed at the end of a file. In essence, this function simply
+ * allows one to set the file size.
+ */
+/* ARGSUSED */
+static int
+zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
+	offset_t offset, cred_t *cr, caller_context_t *ct)
+{
+	dmu_tx_t *tx;
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	zilog_t *zilog = zfsvfs->z_log;
+	uint64_t seq = 0;
+	uint64_t off, len;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+
+top:
+	/*
+	 * The cmd/range validation is repeated on each pass through
+	 * the ERESTART retry loop below.
+	 */
+	if (cmd != F_FREESP) {
+		ZFS_EXIT(zfsvfs);
+		return (EINVAL);
+	}
+
+	if (error = convoff(vp, bfp, 0, offset)) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (bfp->l_len < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (EINVAL);
+	}
+
+	off = bfp->l_start;
+	len = bfp->l_len;
+	tx = dmu_tx_create(zfsvfs->z_os);
+	/*
+	 * Grab the grow_lock to serialize this change with
+	 * respect to other file size changes.
+	 */
+	dmu_tx_hold_bonus(tx, zp->z_id);
+	rw_enter(&zp->z_grow_lock, RW_WRITER);
+	if (off + len > zp->z_blksz && zp->z_blksz < zfsvfs->z_max_blksz &&
+	    off >= zp->z_phys->zp_size) {
+		/*
+		 * We are increasing the length of the file,
+		 * and this may mean a block size increase.
+		 */
+		dmu_tx_hold_write(tx, zp->z_id, 0,
+		    MIN(off + len, zfsvfs->z_max_blksz));
+	} else if (off < zp->z_phys->zp_size) {
+		/*
+		 * If len == 0, we are truncating the file.
+		 */
+		dmu_tx_hold_free(tx, zp->z_id, off, len ? len : DMU_OBJECT_END);
+	}
+
+	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	if (error) {
+		dmu_tx_abort(tx);
+		rw_exit(&zp->z_grow_lock);
+		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+			txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+			goto top;
+		}
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	error = zfs_freesp(zp, off, len, flag, tx, cr);
+
+	if (error == 0) {
+		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+		seq = zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
+	}
+
+	rw_exit(&zp->z_grow_lock);
+
+	dmu_tx_commit(tx);
+
+	zil_commit(zilog, seq, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Build an NFS-style file identifier for vp.  A long FID (which
+ * additionally encodes the objset id) is produced when this zfsvfs
+ * is not its own parent; otherwise a short FID (object + gen).
+ */
+static int
+zfs_fid(vnode_t *vp, fid_t *fidp)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	uint32_t gen = (uint32_t)zp->z_phys->zp_gen;
+	uint64_t object = zp->z_id;
+	zfid_short_t *zfid;
+	int size, i;
+
+	ZFS_ENTER(zfsvfs);
+
+	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
+	if (fidp->fid_len < size) {
+		fidp->fid_len = size;
+		/*
+		 * Fix: every other return in this file pairs ZFS_ENTER
+		 * with ZFS_EXIT; this error path previously returned
+		 * without releasing it.
+		 */
+		ZFS_EXIT(zfsvfs);
+		return (ENOSPC);
+	}
+
+	zfid = (zfid_short_t *)fidp;
+
+	zfid->zf_len = size;
+
+	for (i = 0; i < sizeof (zfid->zf_object); i++)
+		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+	/* Must have a non-zero generation number to distinguish from .zfs */
+	if (gen == 0)
+		gen = 1;
+	for (i = 0; i < sizeof (zfid->zf_gen); i++)
+		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
+
+	if (size == LONG_FID_LEN) {
+		uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
+		zfid_long_t *zlfid;
+
+		zlfid = (zfid_long_t *)fidp;
+
+		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
+
+		/* XXX - this should be the generation number for the objset */
+		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+			zlfid->zf_setgen[i] = 0;
+	}
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * Report pathconf(2)/fpathconf(2) values for vp.  Queries not
+ * handled here are delegated to the generic fs_pathconf().
+ */
+static int
+zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
+{
+	znode_t *zp, *xzp;
+	zfsvfs_t *zfsvfs;
+	zfs_dirlock_t *dl;
+	int error;
+
+	switch (cmd) {
+	case _PC_LINK_MAX:
+		*valp = ULONG_MAX;
+		return (0);
+
+	case _PC_FILESIZEBITS:
+		*valp = 64;
+		return (0);
+
+	case _PC_XATTR_EXISTS:
+		/*
+		 * Probe for a non-empty extended attribute directory;
+		 * ENOENT (no xattr dir at all) also means "no xattrs".
+		 */
+		zp = VTOZ(vp);
+		zfsvfs = zp->z_zfsvfs;
+		ZFS_ENTER(zfsvfs);
+		*valp = 0;
+		error = zfs_dirent_lock(&dl, zp, "", &xzp,
+		    ZXATTR | ZEXISTS | ZSHARED);
+		if (error == 0) {
+			zfs_dirent_unlock(dl);
+			if (!zfs_dirempty(xzp))
+				*valp = 1;
+			VN_RELE(ZTOV(xzp));
+		} else if (error == ENOENT) {
+			/*
+			 * If there aren't extended attributes, it's the
+			 * same as having zero of them.
+			 */
+			error = 0;
+		}
+		ZFS_EXIT(zfsvfs);
+		return (error);
+
+	case _PC_ACL_ENABLED:
+		*valp = _ACL_ACE_ENABLED;
+		return (0);
+
+	case _PC_MIN_HOLE_SIZE:
+		*valp = (ulong_t)SPA_MINBLOCKSIZE;
+		return (0);
+
+	default:
+		return (fs_pathconf(vp, cmd, valp, cr));
+	}
+}
+
+/*
+ * Fetch the ACL for vp into the caller-supplied vsecattr.
+ */
+/*ARGSUSED*/
+static int
+zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
+{
+	znode_t *node = VTOZ(vp);
+	zfsvfs_t *fsp = node->z_zfsvfs;
+	int err;
+
+	ZFS_ENTER(fsp);
+	err = zfs_getacl(node, vsecp, cr);
+	ZFS_EXIT(fsp);
+
+	return (err);
+}
+
+/*
+ * Replace the ACL on vp with the caller-supplied vsecattr.
+ */
+/*ARGSUSED*/
+static int
+zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
+{
+	znode_t *node = VTOZ(vp);
+	zfsvfs_t *fsp = node->z_zfsvfs;
+	int err;
+
+	ZFS_ENTER(fsp);
+	err = zfs_setacl(node, vsecp, cr);
+	ZFS_EXIT(fsp);
+	return (err);
+}
+
+/*
+ * Predeclare these here so that the compiler assumes that
+ * this is an "old style" function declaration that does
+ * not include arguments => we won't get type mismatch errors
+ * in the initializations that follow.
+ */
+static int zfs_inval();
+static int zfs_isdir();
+
+/* Stub vnode op that always fails with EINVAL (unsupported op). */
+static int
+zfs_inval()
+{
+	return (EINVAL);
+}
+
+/* Stub vnode op that always fails with EISDIR (op invalid on a dir). */
+static int
+zfs_isdir()
+{
+	return (EISDIR);
+}
+/*
+ * Directory vnode operations template
+ */
+vnodeops_t *zfs_dvnodeops;
+const fs_operation_def_t zfs_dvnodeops_template[] = {
+	VOPNAME_OPEN, zfs_open,
+	VOPNAME_CLOSE, zfs_close,
+	/* read(2)/write(2) on a directory fail with EISDIR */
+	VOPNAME_READ, zfs_isdir,
+	VOPNAME_WRITE, zfs_isdir,
+	VOPNAME_IOCTL, zfs_ioctl,
+	VOPNAME_GETATTR, zfs_getattr,
+	VOPNAME_SETATTR, zfs_setattr,
+	VOPNAME_ACCESS, zfs_access,
+	VOPNAME_LOOKUP, zfs_lookup,
+	VOPNAME_CREATE, zfs_create,
+	VOPNAME_REMOVE, zfs_remove,
+	VOPNAME_LINK, zfs_link,
+	VOPNAME_RENAME, zfs_rename,
+	VOPNAME_MKDIR, zfs_mkdir,
+	VOPNAME_RMDIR, zfs_rmdir,
+	VOPNAME_READDIR, zfs_readdir,
+	VOPNAME_SYMLINK, zfs_symlink,
+	VOPNAME_FSYNC, zfs_fsync,
+	VOPNAME_INACTIVE, (fs_generic_func_p) zfs_inactive,
+	VOPNAME_FID, zfs_fid,
+	VOPNAME_SEEK, zfs_seek,
+	VOPNAME_PATHCONF, zfs_pathconf,
+	VOPNAME_GETSECATTR, zfs_getsecattr,
+	VOPNAME_SETSECATTR, zfs_setsecattr,
+	NULL, NULL
+};
+
+/*
+ * Regular file vnode operations template.  Files additionally get
+ * the paging, mapping, locking and space-management entry points
+ * that directories do not.
+ */
+vnodeops_t *zfs_fvnodeops;
+const fs_operation_def_t zfs_fvnodeops_template[] = {
+	VOPNAME_OPEN, zfs_open,
+	VOPNAME_CLOSE, zfs_close,
+	VOPNAME_READ, zfs_read,
+	VOPNAME_WRITE, zfs_write,
+	VOPNAME_IOCTL, zfs_ioctl,
+	VOPNAME_GETATTR, zfs_getattr,
+	VOPNAME_SETATTR, zfs_setattr,
+	VOPNAME_ACCESS, zfs_access,
+	VOPNAME_LOOKUP, zfs_lookup,
+	VOPNAME_RENAME, zfs_rename,
+	VOPNAME_FSYNC, zfs_fsync,
+	VOPNAME_INACTIVE, (fs_generic_func_p)zfs_inactive,
+	VOPNAME_FID, zfs_fid,
+	VOPNAME_SEEK, zfs_seek,
+	VOPNAME_FRLOCK, zfs_frlock,
+	VOPNAME_SPACE, zfs_space,
+	VOPNAME_GETPAGE, zfs_getpage,
+	VOPNAME_PUTPAGE, zfs_putpage,
+	VOPNAME_MAP, (fs_generic_func_p) zfs_map,
+	VOPNAME_ADDMAP, (fs_generic_func_p) zfs_addmap,
+	VOPNAME_DELMAP, zfs_delmap,
+	VOPNAME_PATHCONF, zfs_pathconf,
+	VOPNAME_GETSECATTR, zfs_getsecattr,
+	VOPNAME_SETSECATTR, zfs_setsecattr,
+	VOPNAME_VNEVENT, fs_vnevent_support,
+	NULL, NULL
+};
+
+/*
+ * Symbolic link vnode operations template.  Symlinks support only
+ * attribute access, rename, and readlink.
+ */
+vnodeops_t *zfs_symvnodeops;
+const fs_operation_def_t zfs_symvnodeops_template[] = {
+	VOPNAME_GETATTR, zfs_getattr,
+	VOPNAME_SETATTR, zfs_setattr,
+	VOPNAME_ACCESS, zfs_access,
+	VOPNAME_RENAME, zfs_rename,
+	VOPNAME_READLINK, zfs_readlink,
+	VOPNAME_INACTIVE, (fs_generic_func_p) zfs_inactive,
+	VOPNAME_FID, zfs_fid,
+	VOPNAME_PATHCONF, zfs_pathconf,
+	VOPNAME_VNEVENT, fs_vnevent_support,
+	NULL, NULL
+};
+
+/*
+ * Extended attribute directory vnode operations template
+ * This template is identical to the directory vnodes
+ * operation template except for restricted operations:
+ * VOP_MKDIR()
+ * VOP_SYMLINK()
+ * Note that there are other restrictions embedded in:
+ * zfs_create() - restrict type to VREG
+ * zfs_link() - no links into/out of attribute space
+ * zfs_rename() - no moves into/out of attribute space
+ */
+vnodeops_t *zfs_xdvnodeops;
+const fs_operation_def_t zfs_xdvnodeops_template[] = {
+	VOPNAME_OPEN, zfs_open,
+	VOPNAME_CLOSE, zfs_close,
+	VOPNAME_IOCTL, zfs_ioctl,
+	VOPNAME_GETATTR, zfs_getattr,
+	VOPNAME_SETATTR, zfs_setattr,
+	VOPNAME_ACCESS, zfs_access,
+	VOPNAME_LOOKUP, zfs_lookup,
+	VOPNAME_CREATE, zfs_create,
+	VOPNAME_REMOVE, zfs_remove,
+	VOPNAME_LINK, zfs_link,
+	VOPNAME_RENAME, zfs_rename,
+	/* mkdir and symlink are disallowed inside attribute space */
+	VOPNAME_MKDIR, zfs_inval,
+	VOPNAME_RMDIR, zfs_rmdir,
+	VOPNAME_READDIR, zfs_readdir,
+	VOPNAME_SYMLINK, zfs_inval,
+	VOPNAME_FSYNC, zfs_fsync,
+	VOPNAME_INACTIVE, (fs_generic_func_p) zfs_inactive,
+	VOPNAME_FID, zfs_fid,
+	VOPNAME_SEEK, zfs_seek,
+	VOPNAME_PATHCONF, zfs_pathconf,
+	VOPNAME_GETSECATTR, zfs_getsecattr,
+	VOPNAME_SETSECATTR, zfs_setsecattr,
+	VOPNAME_VNEVENT, fs_vnevent_support,
+	NULL, NULL
+};
+
+/*
+ * Error vnode operations template: minimal ops so the vnode can
+ * still be torn down (inactive) and queried via pathconf.
+ */
+vnodeops_t *zfs_evnodeops;
+const fs_operation_def_t zfs_evnodeops_template[] = {
+	VOPNAME_INACTIVE, (fs_generic_func_p) zfs_inactive,
+	VOPNAME_PATHCONF, zfs_pathconf,
+	NULL, NULL
+};
diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c
new file mode 100644
index 0000000000..1ff11e29b8
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c
@@ -0,0 +1,1286 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/mntent.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/stat.h>
+#include <sys/mode.h>
+#include <sys/atomic.h>
+#include <vm/pvn.h>
+#include "fs/fs_subr.h"
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_znode.h>
+#include <sys/zap.h>
+#include <sys/dmu.h>
+#include <sys/fs/zfs.h>
+
+struct kmem_cache *znode_cache = NULL;
+
+/*
+ * Note that znodes can be in one of 2 states:
+ * ZCACHE_mru - recently used, currently cached
+ * ZCACHE_mfu - frequently used, currently cached
+ * When there are no active references to the znode, they
+ * are linked onto one of the lists in zcache. These are the
+ * only znodes that can be evicted.
+ */
+
+typedef struct zcache_state {
+ list_t list; /* linked list of evictable znodes in state */
+ uint64_t lcnt; /* total number of znodes in the linked list */
+ uint64_t cnt; /* total number of all znodes in this state */
+ uint64_t hits;
+ kmutex_t mtx;
+} zcache_state_t;
+
+/* The 2 states: */
+static zcache_state_t ZCACHE_mru;
+static zcache_state_t ZCACHE_mfu;
+
+static struct zcache {
+ zcache_state_t *mru;
+ zcache_state_t *mfu;
+ uint64_t p; /* Target size of mru */
+ uint64_t c; /* Target size of cache */
+ uint64_t c_max; /* Maximum target cache size */
+
+ /* performance stats */
+ uint64_t missed;
+ uint64_t evicted;
+ uint64_t skipped;
+} zcache;
+
+void zcache_kmem_reclaim(void);
+
+#define ZCACHE_MINTIME (hz>>4) /* 62 ms */
+
+/*
+ * Move the supplied znode to the indicated state. The mutex
+ * for the znode must be held by the caller.
+ */
+static void
+zcache_change_state(zcache_state_t *new_state, znode_t *zp)
+{
+ /* ASSERT(MUTEX_HELD(hash_mtx)); */
+ ASSERT(zp->z_active);
+
+ if (zp->z_zcache_state) {
+ ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
+ atomic_add_64(&zp->z_zcache_state->cnt, -1);
+ }
+ atomic_add_64(&new_state->cnt, 1);
+ zp->z_zcache_state = new_state;
+}
+
+static void
+zfs_zcache_evict(znode_t *zp, kmutex_t *hash_mtx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ ASSERT(zp->z_phys);
+ ASSERT(zp->z_dbuf_held);
+
+ zp->z_dbuf_held = 0;
+ mutex_exit(&zp->z_lock);
+ dmu_buf_rele(zp->z_dbuf);
+ mutex_exit(hash_mtx);
+ VFS_RELE(zfsvfs->z_vfs);
+}
+
+/*
+ * Evict znodes from list until we've removed the specified number
+ */
+static void
+zcache_evict_state(zcache_state_t *state, int64_t cnt, zfsvfs_t *zfsvfs)
+{
+ int znodes_evicted = 0;
+ znode_t *zp, *zp_prev;
+ kmutex_t *hash_mtx;
+
+ ASSERT(state == zcache.mru || state == zcache.mfu);
+
+ mutex_enter(&state->mtx);
+
+ for (zp = list_tail(&state->list); zp; zp = zp_prev) {
+ zp_prev = list_prev(&state->list, zp);
+ if (zfsvfs && zp->z_zfsvfs != zfsvfs)
+ continue;
+ hash_mtx = ZFS_OBJ_MUTEX(zp);
+ if (mutex_tryenter(hash_mtx)) {
+ mutex_enter(&zp->z_lock);
+ list_remove(&zp->z_zcache_state->list, zp);
+ zp->z_zcache_state->lcnt -= 1;
+ ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
+ atomic_add_64(&zp->z_zcache_state->cnt, -1);
+ zp->z_zcache_state = NULL;
+ zp->z_zcache_access = 0;
+ /* drops z_lock and hash_mtx */
+ zfs_zcache_evict(zp, hash_mtx);
+ znodes_evicted += 1;
+ atomic_add_64(&zcache.evicted, 1);
+ if (znodes_evicted >= cnt)
+ break;
+ } else {
+ atomic_add_64(&zcache.skipped, 1);
+ }
+ }
+ mutex_exit(&state->mtx);
+
+ if (znodes_evicted < cnt)
+ dprintf("only evicted %lld znodes from %x",
+ (longlong_t)znodes_evicted, state);
+}
+
+static void
+zcache_adjust(void)
+{
+ uint64_t mrucnt = zcache.mru->lcnt;
+ uint64_t mfucnt = zcache.mfu->lcnt;
+ uint64_t p = zcache.p;
+ uint64_t c = zcache.c;
+
+ if (mrucnt > p)
+ zcache_evict_state(zcache.mru, mrucnt - p, NULL);
+
+ if (mfucnt > 0 && mrucnt + mfucnt > c) {
+ int64_t toevict = MIN(mfucnt, mrucnt + mfucnt - c);
+ zcache_evict_state(zcache.mfu, toevict, NULL);
+ }
+}
+
+/*
+ * Flush all *evictable* data from the cache.
+ * NOTE: this will not touch "active" (i.e. referenced) data.
+ */
+void
+zfs_zcache_flush(zfsvfs_t *zfsvfs)
+{
+ zcache_evict_state(zcache.mru, zcache.mru->lcnt, zfsvfs);
+ zcache_evict_state(zcache.mfu, zcache.mfu->lcnt, zfsvfs);
+}
+
+static void
+zcache_try_grow(int64_t cnt)
+{
+ int64_t size;
+ /*
+ * If we're almost to the current target cache size,
+ * increment the target cache size
+ */
+ size = zcache.mru->lcnt + zcache.mfu->lcnt;
+ if ((zcache.c - size) <= 1) {
+ atomic_add_64(&zcache.c, cnt);
+ if (zcache.c > zcache.c_max)
+ zcache.c = zcache.c_max;
+ else if (zcache.p + cnt < zcache.c)
+ atomic_add_64(&zcache.p, cnt);
+ }
+}
+
+/*
+ * This routine is called whenever a znode is accessed.
+ */
+static void
+zcache_access(znode_t *zp, kmutex_t *hash_mtx)
+{
+ ASSERT(MUTEX_HELD(hash_mtx));
+
+ if (zp->z_zcache_state == NULL) {
+ /*
+ * This znode is not in the cache.
+ * Add the new znode to the MRU state.
+ */
+
+ zcache_try_grow(1);
+
+ ASSERT(zp->z_zcache_access == 0);
+ zp->z_zcache_access = lbolt;
+ zcache_change_state(zcache.mru, zp);
+ mutex_exit(hash_mtx);
+
+ /*
+ * If we are using less than 2/3 of our total target
+ * cache size, bump up the target size for the MRU
+ * list.
+ */
+ if (zcache.mru->lcnt + zcache.mfu->lcnt < zcache.c*2/3) {
+ zcache.p = zcache.mru->lcnt + zcache.c/6;
+ }
+
+ zcache_adjust();
+
+ atomic_add_64(&zcache.missed, 1);
+ } else if (zp->z_zcache_state == zcache.mru) {
+ /*
+ * This znode has been "accessed" only once so far,
+ * Move it to the MFU state.
+ */
+ if (lbolt > zp->z_zcache_access + ZCACHE_MINTIME) {
+ /*
+			 * More than 62ms have passed since we
+ * instantiated this buffer. Move it to the
+ * most frequently used state.
+ */
+ zp->z_zcache_access = lbolt;
+ zcache_change_state(zcache.mfu, zp);
+ }
+ atomic_add_64(&zcache.mru->hits, 1);
+ mutex_exit(hash_mtx);
+ } else {
+ ASSERT(zp->z_zcache_state == zcache.mfu);
+ /*
+ * This buffer has been accessed more than once.
+ * Keep it in the MFU state.
+ */
+ atomic_add_64(&zcache.mfu->hits, 1);
+ mutex_exit(hash_mtx);
+ }
+}
+
+static void
+zcache_init(void)
+{
+ zcache.c = 20;
+ zcache.c_max = 50;
+
+ zcache.mru = &ZCACHE_mru;
+ zcache.mfu = &ZCACHE_mfu;
+
+ list_create(&zcache.mru->list, sizeof (znode_t),
+ offsetof(znode_t, z_zcache_node));
+ list_create(&zcache.mfu->list, sizeof (znode_t),
+ offsetof(znode_t, z_zcache_node));
+}
+
+static void
+zcache_fini(void)
+{
+ zfs_zcache_flush(NULL);
+
+ list_destroy(&zcache.mru->list);
+ list_destroy(&zcache.mfu->list);
+}
+
+/*ARGSUSED*/
+static void
+znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr)
+{
+ znode_t *zp = user_ptr;
+ vnode_t *vp = ZTOV(zp);
+
+ if (vp->v_count == 0) {
+ vn_invalid(vp);
+ zfs_znode_free(zp);
+ }
+}
+
+/*ARGSUSED*/
+static int
+zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags)
+{
+ znode_t *zp = buf;
+
+ zp->z_vnode = vn_alloc(KM_SLEEP);
+ zp->z_vnode->v_data = (caddr_t)zp;
+ mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
+ rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
+ rw_init(&zp->z_grow_lock, NULL, RW_DEFAULT, NULL);
+ rw_init(&zp->z_append_lock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
+ zp->z_dbuf_held = 0;
+ zp->z_dirlocks = 0;
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+zfs_znode_cache_destructor(void *buf, void *cdarg)
+{
+ znode_t *zp = buf;
+
+ ASSERT(zp->z_dirlocks == 0);
+ mutex_destroy(&zp->z_lock);
+ rw_destroy(&zp->z_map_lock);
+ rw_destroy(&zp->z_grow_lock);
+ rw_destroy(&zp->z_append_lock);
+ mutex_destroy(&zp->z_acl_lock);
+
+ ASSERT(zp->z_dbuf_held == 0);
+ ASSERT(ZTOV(zp)->v_count == 0);
+ vn_free(ZTOV(zp));
+}
+
+void
+zfs_znode_init(void)
+{
+ /*
+ * Initialize zcache
+ */
+ ASSERT(znode_cache == NULL);
+ znode_cache = kmem_cache_create("zfs_znode_cache",
+ sizeof (znode_t), 0, zfs_znode_cache_constructor,
+ zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
+
+ zcache_init();
+}
+
+void
+zfs_znode_fini(void)
+{
+ zcache_fini();
+
+ /*
+ * Cleanup vfs & vnode ops
+ */
+ zfs_remove_op_tables();
+
+ /*
+ * Cleanup zcache
+ */
+ if (znode_cache)
+ kmem_cache_destroy(znode_cache);
+ znode_cache = NULL;
+}
+
+struct vnodeops *zfs_dvnodeops;
+struct vnodeops *zfs_fvnodeops;
+struct vnodeops *zfs_symvnodeops;
+struct vnodeops *zfs_xdvnodeops;
+struct vnodeops *zfs_evnodeops;
+
+void
+zfs_remove_op_tables()
+{
+ /*
+ * Remove vfs ops
+ */
+ ASSERT(zfsfstype);
+ (void) vfs_freevfsops_by_type(zfsfstype);
+ zfsfstype = 0;
+
+ /*
+ * Remove vnode ops
+ */
+ if (zfs_dvnodeops)
+ vn_freevnodeops(zfs_dvnodeops);
+ if (zfs_fvnodeops)
+ vn_freevnodeops(zfs_fvnodeops);
+ if (zfs_symvnodeops)
+ vn_freevnodeops(zfs_symvnodeops);
+ if (zfs_xdvnodeops)
+ vn_freevnodeops(zfs_xdvnodeops);
+ if (zfs_evnodeops)
+ vn_freevnodeops(zfs_evnodeops);
+
+ zfs_dvnodeops = NULL;
+ zfs_fvnodeops = NULL;
+ zfs_symvnodeops = NULL;
+ zfs_xdvnodeops = NULL;
+ zfs_evnodeops = NULL;
+}
+
+extern const fs_operation_def_t zfs_dvnodeops_template[];
+extern const fs_operation_def_t zfs_fvnodeops_template[];
+extern const fs_operation_def_t zfs_xdvnodeops_template[];
+extern const fs_operation_def_t zfs_symvnodeops_template[];
+extern const fs_operation_def_t zfs_evnodeops_template[];
+
+int
+zfs_create_op_tables()
+{
+ int error;
+
+ /*
+ * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
+	 * due to a failure to remove the 2nd modlinkage (zfs_modldrv).
+ * In this case we just return as the ops vectors are already set up.
+ */
+ if (zfs_dvnodeops)
+ return (0);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
+ &zfs_dvnodeops);
+ if (error)
+ return (error);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
+ &zfs_fvnodeops);
+ if (error)
+ return (error);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
+ &zfs_symvnodeops);
+ if (error)
+ return (error);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
+ &zfs_xdvnodeops);
+ if (error)
+ return (error);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
+ &zfs_evnodeops);
+
+ return (error);
+}
+
+/*
+ * zfs_init_fs - Initialize the zfsvfs struct and the file system
+ * incore "master" object. Verify version compatibility.
+ */
+int
+zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
+{
+ extern int zfsfstype;
+
+ objset_t *os = zfsvfs->z_os;
+ uint64_t zoid;
+ uint64_t version = ZFS_VERSION;
+ int i, error;
+ dmu_object_info_t doi;
+ dmu_objset_stats_t *stats;
+
+ *zpp = NULL;
+
+ /*
+ * XXX - hack to auto-create the pool root filesystem at
+ * the first attempted mount.
+ */
+ if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) {
+ dmu_tx_t *tx = dmu_tx_create(os);
+
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 3); /* master node */
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 1); /* delete queue */
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ ASSERT3U(error, ==, 0);
+ zfs_create_fs(os, cr, tx);
+ dmu_tx_commit(tx);
+ }
+
+ if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_VERSION_OBJ, 8, 1, &version)) {
+ return (EINVAL);
+ } else if (version != ZFS_VERSION) {
+ (void) printf("Mismatched versions: File system "
+ "is version %lld on-disk format, which is "
+ "incompatible with this software version %lld!",
+ (u_longlong_t)version, ZFS_VERSION);
+ return (ENOTSUP);
+ }
+
+ /*
+ * The fsid is 64 bits, composed of an 8-bit fs type, which
+ * separates our fsid from any other filesystem types, and a
+ * 56-bit objset unique ID. The objset unique ID is unique to
+ * all objsets open on this system, provided by unique_create().
+ * The 8-bit fs type must be put in the low bits of fsid[1]
+ * because that's where other Solaris filesystems put it.
+ */
+ stats = kmem_alloc(sizeof (dmu_objset_stats_t), KM_SLEEP);
+ dmu_objset_stats(os, stats);
+ ASSERT((stats->dds_fsid_guid & ~((1ULL<<56)-1)) == 0);
+ zfsvfs->z_vfs->vfs_fsid.val[0] = stats->dds_fsid_guid;
+ zfsvfs->z_vfs->vfs_fsid.val[1] = ((stats->dds_fsid_guid>>32) << 8) |
+ zfsfstype & 0xFF;
+ kmem_free(stats, sizeof (dmu_objset_stats_t));
+ stats = NULL;
+
+ if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zoid)) {
+ return (EINVAL);
+ }
+ ASSERT(zoid != 0);
+ zfsvfs->z_root = zoid;
+
+ /*
+ * Create the per mount vop tables.
+ */
+
+ /*
+ * Initialize zget mutex's
+ */
+ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+
+ error = zfs_zget(zfsvfs, zoid, zpp);
+ if (error)
+ return (error);
+ ASSERT3U((*zpp)->z_id, ==, zoid);
+
+ if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_DELETE_QUEUE, 8, 1, &zoid)) {
+ return (EINVAL);
+ }
+
+ zfsvfs->z_dqueue = zoid;
+
+ /*
+ * Initialize delete head structure
+ * Thread(s) will be started/stopped via
+ * readonly_changed_cb() depending
+ * on whether this is rw/ro mount.
+ */
+ list_create(&zfsvfs->z_delete_head.z_znodes,
+ sizeof (znode_t), offsetof(znode_t, z_list_node));
+
+ return (0);
+}
+
+/*
+ * Construct a new znode/vnode and initialize.
+ *
+ * This does not do a call to dmu_set_user() that is
+ * up to the caller to do, in case you don't want to
+ * return the znode
+ */
+znode_t *
+zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz)
+{
+ znode_t *zp;
+ vnode_t *vp;
+
+ zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+
+ ASSERT(zp->z_dirlocks == NULL);
+
+ zp->z_phys = db->db_data;
+ zp->z_zfsvfs = zfsvfs;
+ zp->z_active = 1;
+ zp->z_reap = 0;
+ zp->z_atime_dirty = 0;
+ zp->z_dbuf_held = 0;
+ zp->z_mapcnt = 0;
+ zp->z_last_itx = 0;
+ zp->z_dbuf = db;
+ zp->z_id = obj_num;
+ zp->z_blksz = blksz;
+ zp->z_seq = 0x7A4653;
+
+ bzero(&zp->z_zcache_node, sizeof (list_node_t));
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ list_insert_tail(&zfsvfs->z_all_znodes, zp);
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ vp = ZTOV(zp);
+ vn_reinit(vp);
+
+ vp->v_vfsp = zfsvfs->z_parent->z_vfs;
+ vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
+
+ switch (vp->v_type) {
+ case VDIR:
+ if (zp->z_phys->zp_flags & ZFS_XATTR) {
+ vn_setops(vp, zfs_xdvnodeops);
+ vp->v_flag |= V_XATTRDIR;
+ } else
+ vn_setops(vp, zfs_dvnodeops);
+ break;
+ case VBLK:
+ case VCHR:
+ vp->v_rdev = (dev_t)zp->z_phys->zp_rdev;
+ /*FALLTHROUGH*/
+ case VFIFO:
+ case VSOCK:
+ case VDOOR:
+ vn_setops(vp, zfs_fvnodeops);
+ break;
+ case VREG:
+ vp->v_flag |= VMODSORT;
+ vn_setops(vp, zfs_fvnodeops);
+ break;
+ case VLNK:
+ vn_setops(vp, zfs_symvnodeops);
+ break;
+ default:
+ vn_setops(vp, zfs_evnodeops);
+ break;
+ }
+
+ return (zp);
+}
+
+static void
+zfs_znode_dmu_init(znode_t *zp)
+{
+ znode_t *nzp;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ dmu_buf_t *db = zp->z_dbuf;
+
+ mutex_enter(&zp->z_lock);
+
+ nzp = dmu_buf_set_user(db, zp, &zp->z_phys, znode_pageout_func);
+
+ /*
+ * there should be no
+ * concurrent zgets on this object.
+ */
+ ASSERT3P(nzp, ==, NULL);
+
+ /*
+ * Slap on VROOT if we are the root znode
+ */
+ if (zp->z_id == zfsvfs->z_root) {
+ ZTOV(zp)->v_flag |= VROOT;
+ }
+
+ zp->z_zcache_state = NULL;
+ zp->z_zcache_access = 0;
+
+ ASSERT(zp->z_dbuf_held == 0);
+ zp->z_dbuf_held = 1;
+ VFS_HOLD(zfsvfs->z_vfs);
+ mutex_exit(&zp->z_lock);
+ vn_exists(ZTOV(zp));
+}
+
+/*
+ * Create a new DMU object to hold a zfs znode.
+ *
+ * IN: dzp - parent directory for new znode
+ * vap - file attributes for new znode
+ * tx - dmu transaction id for zap operations
+ * cr - credentials of caller
+ * flag - flags:
+ * IS_ROOT_NODE - new object will be root
+ * IS_XATTR - new object is an attribute
+ * IS_REPLAY - intent log replay
+ *
+ * OUT: oid - ID of created object
+ *
+ */
+void
+zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr,
+ uint_t flag, znode_t **zpp, int bonuslen)
+{
+ dmu_buf_t *dbp;
+ znode_phys_t *pzp;
+ znode_t *zp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ timestruc_t now;
+ uint64_t gen;
+ int err;
+
+ ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
+
+ if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */
+ *oid = vap->va_nodeid;
+ flag |= IS_REPLAY;
+ now = vap->va_ctime; /* see zfs_replay_create() */
+ gen = vap->va_nblocks; /* ditto */
+ } else {
+ *oid = 0;
+ gethrestime(&now);
+ gen = dmu_tx_get_txg(tx);
+ }
+
+ /*
+ * Create a new DMU object.
+ */
+ if (vap->va_type == VDIR) {
+ if (flag & IS_REPLAY) {
+ err = zap_create_claim(zfsvfs->z_os, *oid,
+ DMU_OT_DIRECTORY_CONTENTS,
+ DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ ASSERT3U(err, ==, 0);
+ } else {
+ *oid = zap_create(zfsvfs->z_os,
+ DMU_OT_DIRECTORY_CONTENTS,
+ DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ }
+ } else {
+ if (flag & IS_REPLAY) {
+ err = dmu_object_claim(zfsvfs->z_os, *oid,
+ DMU_OT_PLAIN_FILE_CONTENTS, 0,
+ DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ ASSERT3U(err, ==, 0);
+ } else {
+ *oid = dmu_object_alloc(zfsvfs->z_os,
+ DMU_OT_PLAIN_FILE_CONTENTS, 0,
+ DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ }
+ }
+ dbp = dmu_bonus_hold(zfsvfs->z_os, *oid);
+ dmu_buf_will_dirty(dbp, tx);
+
+ /*
+ * Initialize the znode physical data to zero.
+ */
+ ASSERT(dbp->db_size >= sizeof (znode_phys_t));
+ bzero(dbp->db_data, dbp->db_size);
+ pzp = dbp->db_data;
+
+ /*
+ * If this is the root, fix up the half-initialized parent pointer
+ * to reference the just-allocated physical data area.
+ */
+ if (flag & IS_ROOT_NODE) {
+ dzp->z_phys = pzp;
+ dzp->z_id = *oid;
+ }
+
+ /*
+ * If parent is an xattr, so am I.
+ */
+ if (dzp->z_phys->zp_flags & ZFS_XATTR)
+ flag |= IS_XATTR;
+
+ if (vap->va_type == VBLK || vap->va_type == VCHR) {
+ pzp->zp_rdev = vap->va_rdev;
+ }
+
+ if (vap->va_type == VDIR) {
+ pzp->zp_size = 2; /* contents ("." and "..") */
+ pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
+ }
+
+ pzp->zp_parent = dzp->z_id;
+ if (flag & IS_XATTR)
+ pzp->zp_flags |= ZFS_XATTR;
+
+ pzp->zp_gen = gen;
+
+ ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
+ ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
+
+ if (vap->va_mask & AT_ATIME) {
+ ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
+ } else {
+ ZFS_TIME_ENCODE(&now, pzp->zp_atime);
+ }
+
+ if (vap->va_mask & AT_MTIME) {
+ ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
+ } else {
+ ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
+ }
+
+ pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
+ zp = zfs_znode_alloc(zfsvfs, dbp, *oid, 0);
+
+ zfs_perm_init(zp, dzp, flag, vap, tx, cr);
+
+ if (zpp) {
+ kmutex_t *hash_mtx = ZFS_OBJ_MUTEX(zp);
+
+ mutex_enter(hash_mtx);
+ zfs_znode_dmu_init(zp);
+ zcache_access(zp, hash_mtx);
+ *zpp = zp;
+ } else {
+ ZTOV(zp)->v_count = 0;
+ dmu_buf_rele(dbp);
+ zfs_znode_free(zp);
+ }
+}
+
+int
+zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
+{
+ dmu_object_info_t doi;
+ dmu_buf_t *db;
+ znode_t *zp;
+
+ *zpp = NULL;
+
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
+
+ db = dmu_bonus_hold(zfsvfs->z_os, obj_num);
+ if (db == NULL) {
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (ENOENT);
+ }
+
+ dmu_object_info_from_db(db, &doi);
+ if (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ doi.doi_bonus_size < sizeof (znode_phys_t)) {
+ dmu_buf_rele(db);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (EINVAL);
+ }
+ dmu_buf_read(db);
+
+ ASSERT(db->db_object == obj_num);
+ ASSERT(db->db_offset == -1);
+ ASSERT(db->db_data != NULL);
+
+ zp = dmu_buf_get_user(db);
+
+ if (zp != NULL) {
+ mutex_enter(&zp->z_lock);
+
+ ASSERT3U(zp->z_id, ==, obj_num);
+ if (zp->z_reap) {
+ dmu_buf_rele(db);
+ mutex_exit(&zp->z_lock);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (ENOENT);
+ } else if (zp->z_dbuf_held) {
+ dmu_buf_rele(db);
+ } else {
+ zp->z_dbuf_held = 1;
+ VFS_HOLD(zfsvfs->z_vfs);
+ }
+
+ if (zp->z_active == 0) {
+ zp->z_active = 1;
+ if (list_link_active(&zp->z_zcache_node)) {
+ mutex_enter(&zp->z_zcache_state->mtx);
+ list_remove(&zp->z_zcache_state->list, zp);
+ zp->z_zcache_state->lcnt -= 1;
+ mutex_exit(&zp->z_zcache_state->mtx);
+ }
+ }
+ VN_HOLD(ZTOV(zp));
+ mutex_exit(&zp->z_lock);
+ zcache_access(zp, ZFS_OBJ_MUTEX(zp));
+ *zpp = zp;
+ return (0);
+ }
+
+ /*
+ * Not found create new znode/vnode
+ */
+ zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size);
+ ASSERT3U(zp->z_id, ==, obj_num);
+ zfs_znode_dmu_init(zp);
+ zcache_access(zp, ZFS_OBJ_MUTEX(zp));
+ *zpp = zp;
+ return (0);
+}
+
+void
+zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
+ if (zp->z_phys->zp_acl.z_acl_extern_obj) {
+ error = dmu_object_free(zfsvfs->z_os,
+ zp->z_phys->zp_acl.z_acl_extern_obj, tx);
+ ASSERT3U(error, ==, 0);
+ }
+ if (zp->z_zcache_state) {
+ ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
+ atomic_add_64(&zp->z_zcache_state->cnt, -1);
+ }
+ error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx);
+ ASSERT3U(error, ==, 0);
+ zp->z_dbuf_held = 0;
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
+ dmu_buf_rele(zp->z_dbuf);
+}
+
+void
+zfs_zinactive(znode_t *zp)
+{
+ vnode_t *vp = ZTOV(zp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t z_id = zp->z_id;
+
+ ASSERT(zp->z_dbuf_held && zp->z_phys);
+
+ /*
+	 * Don't allow a zfs_zget() while we're trying to release this znode
+ */
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
+
+ mutex_enter(&zp->z_lock);
+ mutex_enter(&vp->v_lock);
+ vp->v_count--;
+ if (vp->v_count > 0 || vn_has_cached_data(vp)) {
+ /*
+ * If the hold count is greater than zero, somebody has
+ * obtained a new reference on this znode while we were
+ * processing it here, so we are done. If we still have
+ * mapped pages then we are also done, since we don't
+ * want to inactivate the znode until the pages get pushed.
+ *
+ * XXX - if vn_has_cached_data(vp) is true, but count == 0,
+ * this seems like it would leave the znode hanging with
+ * no chance to go inactive...
+ */
+ mutex_exit(&vp->v_lock);
+ mutex_exit(&zp->z_lock);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+ return;
+ }
+ mutex_exit(&vp->v_lock);
+ zp->z_active = 0;
+
+ /*
+ * If this was the last reference to a file with no links,
+ * remove the file from the file system.
+ */
+ if (zp->z_reap) {
+ mutex_exit(&zp->z_lock);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+ ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
+ atomic_add_64(&zp->z_zcache_state->cnt, -1);
+ zp->z_zcache_state = NULL;
+ /* XATTR files are not put on the delete queue */
+ if (zp->z_phys->zp_flags & ZFS_XATTR) {
+ zfs_rmnode(zp);
+ } else {
+ mutex_enter(&zfsvfs->z_delete_head.z_mutex);
+ list_insert_tail(&zfsvfs->z_delete_head.z_znodes, zp);
+ zfsvfs->z_delete_head.z_znode_count++;
+ cv_broadcast(&zfsvfs->z_delete_head.z_cv);
+ mutex_exit(&zfsvfs->z_delete_head.z_mutex);
+ }
+ VFS_RELE(zfsvfs->z_vfs);
+ return;
+ }
+
+ /*
+ * If the file system for this znode is no longer mounted,
+ * evict the znode now, don't put it in the cache.
+ */
+ if (zfsvfs->z_unmounted1) {
+ zfs_zcache_evict(zp, ZFS_OBJ_MUTEX(zp));
+ return;
+ }
+
+ /* put znode on evictable list */
+ mutex_enter(&zp->z_zcache_state->mtx);
+ list_insert_head(&zp->z_zcache_state->list, zp);
+ zp->z_zcache_state->lcnt += 1;
+ mutex_exit(&zp->z_zcache_state->mtx);
+ mutex_exit(&zp->z_lock);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+}
+
+void
+zfs_znode_free(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ list_remove(&zfsvfs->z_all_znodes, zp);
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ kmem_cache_free(znode_cache, zp);
+}
+
+void
+zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
+{
+ timestruc_t now;
+
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+
+ gethrestime(&now);
+
+ if (tx) {
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+ zp->z_atime_dirty = 0;
+ zp->z_seq++;
+ } else {
+ zp->z_atime_dirty = 1;
+ }
+
+ if (flag & AT_ATIME)
+ ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);
+
+ if (flag & AT_MTIME)
+ ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
+
+ if (flag & AT_CTIME)
+ ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
+}
+
+/*
+ * Update the requested znode timestamps with the current time.
+ * If we are in a transaction, then go ahead and mark the znode
+ * dirty in the transaction so the timestamps will go to disk.
+ * Otherwise, we will get pushed next time the znode is updated
+ * in a transaction, or when this znode eventually goes inactive.
+ *
+ * Why is this OK?
+ * 1 - Only the ACCESS time is ever updated outside of a transaction.
+ * 2 - Multiple consecutive updates will be collapsed into a single
+ * znode update by the transaction grouping semantics of the DMU.
+ */
+void
+zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
+{
+ mutex_enter(&zp->z_lock);
+ zfs_time_stamper_locked(zp, flag, tx);
+ mutex_exit(&zp->z_lock);
+}
+
+/*
+ * Grow the block size for a file. This may involve migrating data
+ * from the bonus buffer into a data block (when we grow beyond the
+ * bonus buffer data area).
+ *
+ * IN: zp - znode of file to free data in.
+ * size - requested block size
+ * tx - open transaction.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * NOTE: this function assumes that the znode is write locked.
+ */
+int
+zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
+{
+ int error;
+ u_longlong_t dummy;
+
+ ASSERT(rw_write_held(&zp->z_grow_lock));
+
+ if (size <= zp->z_blksz)
+ return (0);
+ /*
+ * If the file size is already greater than the current blocksize,
+ * we will not grow. If there is more than one block in a file,
+ * the blocksize cannot change.
+ */
+ if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
+ return (0);
+
+ error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
+ size, 0, tx);
+ if (error == ENOTSUP)
+ return (0);
+ ASSERT3U(error, ==, 0);
+
+ /* What blocksize did we actually get? */
+ dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
+
+ return (0);
+}
+
+/*
+ * This is a dummy interface used when pvn_vplist_dirty() should *not*
+ * be calling back into the fs for a putpage(). E.g.: when truncating
+ * a file, the pages being "thrown away" don't need to be written out.
+ */
+/* ARGSUSED */
+static int
+zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
+ int flags, cred_t *cr)
+{
+ ASSERT(0);
+ return (0);
+}
+
+/*
+ * Free space in a file. Currently, this function only
+ * supports freeing space at the end of the file.
+ *
+ * IN: zp - znode of file to free data in.
+ * from - start of section to free.
+ * len - length of section to free (0 => to EOF).
+ * flag - current file open mode flags.
+ * tx - open transaction.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ */
+int
+zfs_freesp(znode_t *zp, uint64_t from, uint64_t len, int flag, dmu_tx_t *tx,
+ cred_t *cr)
+{
+ vnode_t *vp = ZTOV(zp);
+ uint64_t size = zp->z_phys->zp_size;
+ uint64_t end = from + len;
+ int have_grow_lock, error;
+
+ have_grow_lock = RW_WRITE_HELD(&zp->z_grow_lock);
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ if (len == 0 && size == from) {
+ return (0);
+ }
+
+ /*
+ * Check for any locks in the region to be freed.
+ */
+ if (MANDLOCK(vp, (mode_t)zp->z_phys->zp_mode)) {
+ uint64_t start;
+
+ if (size > from)
+ start = from;
+ else
+ start = size;
+ if (error = chklock(vp, FWRITE, start, 0, flag, NULL))
+ return (error);
+ }
+
+ if (end > zp->z_blksz && (!ISP2(zp->z_blksz) ||
+ zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
+ uint64_t new_blksz;
+ /*
+ * We are growing the file past the current block size.
+ */
+ if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
+ ASSERT(!ISP2(zp->z_blksz));
+ new_blksz = MIN(end, SPA_MAXBLOCKSIZE);
+ } else {
+ new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
+ }
+ error = zfs_grow_blocksize(zp, new_blksz, tx);
+ ASSERT(error == 0);
+ }
+ if (end > size || len == 0)
+ zp->z_phys->zp_size = end;
+ if (from > size)
+ return (0);
+
+ if (have_grow_lock)
+ rw_downgrade(&zp->z_grow_lock);
+ /*
+ * Clear any mapped pages in the truncated region.
+ */
+ rw_enter(&zp->z_map_lock, RW_WRITER);
+ if (vn_has_cached_data(vp)) {
+ page_t *pp;
+ uint64_t start = from & PAGEMASK;
+ int off = from & PAGEOFFSET;
+
+ if (off != 0 && (pp = page_lookup(vp, start, SE_SHARED))) {
+ /*
+ * We need to zero a partial page.
+ */
+ pagezero(pp, off, PAGESIZE - off);
+ start += PAGESIZE;
+ page_unlock(pp);
+ }
+ error = pvn_vplist_dirty(vp, start, zfs_no_putpage,
+ B_INVAL | B_TRUNC, cr);
+ ASSERT(error == 0);
+ }
+ rw_exit(&zp->z_map_lock);
+
+ if (!have_grow_lock)
+ rw_enter(&zp->z_grow_lock, RW_READER);
+
+ if (len == 0)
+ len = -1;
+ else if (end > size)
+ len = size - from;
+ dmu_free_range(zp->z_zfsvfs->z_os, zp->z_id, from, len, tx);
+
+ if (!have_grow_lock)
+ rw_exit(&zp->z_grow_lock);
+
+ return (0);
+}
+
+
+void
+zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx)
+{
+ zfsvfs_t zfsvfs;
+ uint64_t moid, doid, roid = 0;
+ uint64_t version = ZFS_VERSION;
+ int error;
+ znode_t *rootzp = NULL;
+ vnode_t *vp;
+ vattr_t vattr;
+
+ /*
+ * First attempt to create master node.
+ */
+ moid = MASTER_NODE_OBJ;
+ error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Set starting attributes.
+ */
+
+ error = zap_update(os, moid, ZFS_VERSION_OBJ, 8, 1, &version, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Create a delete queue.
+ */
+ doid = zap_create(os, DMU_OT_DELETE_QUEUE, DMU_OT_NONE, 0, tx);
+
+ error = zap_add(os, moid, ZFS_DELETE_QUEUE, 8, 1, &doid, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Create root znode. Create minimal znode/vnode/zfsvfs
+ * to allow zfs_mknode to work.
+ */
+ vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
+ vattr.va_type = VDIR;
+ vattr.va_mode = S_IFDIR|0755;
+ vattr.va_uid = 0;
+ vattr.va_gid = 3;
+
+ rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+ rootzp->z_zfsvfs = &zfsvfs;
+ rootzp->z_active = 1;
+ rootzp->z_reap = 0;
+ rootzp->z_atime_dirty = 0;
+ rootzp->z_dbuf_held = 0;
+
+ vp = ZTOV(rootzp);
+ vn_reinit(vp);
+ vp->v_type = VDIR;
+
+ bzero(&zfsvfs, sizeof (zfsvfs_t));
+
+ zfsvfs.z_os = os;
+ zfsvfs.z_assign = TXG_NOWAIT;
+ zfsvfs.z_parent = &zfsvfs;
+
+ mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
+ offsetof(znode_t, z_link_node));
+
+ zfs_mknode(rootzp, &vattr, &roid, tx, cr, IS_ROOT_NODE, NULL, 0);
+ ASSERT3U(rootzp->z_id, ==, roid);
+ error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &roid, tx);
+ ASSERT(error == 0);
+
+ ZTOV(rootzp)->v_count = 0;
+ kmem_cache_free(znode_cache, rootzp);
+}
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
new file mode 100644
index 0000000000..1adc8ca3df
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -0,0 +1,1242 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/arc.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+#include <sys/dsl_dataset.h>
+#include <sys/vdev.h>
+
+
+/*
+ * The zfs intent log (ZIL) saves transaction records of system calls
+ * that change the file system in memory with enough information
+ * to be able to replay them. These are stored in memory until
+ * either the DMU transaction group (txg) commits them to the stable pool
+ * and they can be discarded, or they are flushed to the stable log
+ * (also in the pool) due to a fsync, O_DSYNC or other synchronous
+ * requirement. In the event of a panic or power fail then those log
+ * records (transactions) are replayed.
+ *
+ * There is one ZIL per file system. Its on-disk (pool) format consists
+ * of 3 parts:
+ *
+ * - ZIL header
+ * - ZIL blocks
+ * - ZIL records
+ *
+ * A log record holds a system call transaction. Log blocks can
+ * hold many log records and the blocks are chained together.
+ * Each ZIL block contains a block pointer (blkptr_t) to the next
+ * ZIL block in the chain. The ZIL header points to the first
+ * block in the chain. Note there is not a fixed place in the pool
+ * to hold blocks. They are dynamically allocated and freed as
 * needed from the blocks available.
 */
+
/*
 * These global ZIL switches affect all pools
 */
int zil_disable = 0;	/* disable intent logging */
int zil_always = 0;	/* make every transaction synchronous */
int zil_purge = 0;	/* at pool open, just throw everything away */
int zil_noflush = 0;	/* don't flush write cache buffers on disks */

/* kmem cache for log write buffers (lwb_t); created in zil_init() */
static kmem_cache_t *zil_lwb_cache;
+
+static int
+zil_dva_compare(const void *x1, const void *x2)
+{
+ const dva_t *dva1 = x1;
+ const dva_t *dva2 = x2;
+
+ if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
+ return (-1);
+ if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2))
+ return (1);
+
+ if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2))
+ return (-1);
+ if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2))
+ return (1);
+
+ return (0);
+}
+
/*
 * Initialize an empty DVA tree, used to de-duplicate block claims
 * while walking the log chain (see zil_dva_tree_add()).
 */
static void
zil_dva_tree_init(avl_tree_t *t)
{
	avl_create(t, zil_dva_compare, sizeof (zil_dva_node_t),
	    offsetof(zil_dva_node_t, zn_node));
}
+
+static void
+zil_dva_tree_fini(avl_tree_t *t)
+{
+ zil_dva_node_t *zn;
+ void *cookie = NULL;
+
+ while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
+ kmem_free(zn, sizeof (zil_dva_node_t));
+
+ avl_destroy(t);
+}
+
+static int
+zil_dva_tree_add(avl_tree_t *t, dva_t *dva)
+{
+ zil_dva_node_t *zn;
+ avl_index_t where;
+
+ if (avl_find(t, dva, &where) != NULL)
+ return (EEXIST);
+
+ zn = kmem_alloc(sizeof (zil_dva_node_t), KM_SLEEP);
+ zn->zn_dva = *dva;
+ avl_insert(t, zn, where);
+
+ return (0);
+}
+
/*
 * Read a log block, make sure it's valid, and byteswap it if necessary.
 *
 * 'buf' must be at least BP_GET_LSIZE(bp) bytes.  Returns 0 on success;
 * a non-zero errno (ESTALE / ENOENT / EOVERFLOW / I/O error) terminates
 * the walk of the log chain in zil_parse().
 */
static int
zil_read_log_block(zilog_t *zilog, blkptr_t *bp, char *buf)
{
	uint64_t blksz = BP_GET_LSIZE(bp);
	/* the trailer lives in the last sizeof (zil_trailer_t) bytes */
	zil_trailer_t *ztp = (zil_trailer_t *)(buf + blksz) - 1;
	zio_cksum_t cksum;
	int error;

	error = zio_wait(zio_read(NULL, zilog->zl_spa, bp, buf, blksz,
	    NULL, NULL, ZIO_PRIORITY_SYNC_READ,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE));
	if (error) {
		dprintf_bp(bp, "zilog %p bp %p read failed, error %d: ",
		    zilog, bp, error);
		return (error);
	}

	if (BP_SHOULD_BYTESWAP(bp))
		byteswap_uint64_array(buf, blksz);

	/*
	 * Sequence numbers should be... sequential.  The checksum verifier for
	 * the next block should be: <logid[0], logid[1], objset id, seq + 1>.
	 * A mismatch means the pointer in the trailer is stale (left over
	 * from an earlier incarnation of the log), so the chain ends here.
	 */
	cksum = bp->blk_cksum;
	cksum.zc_word[3]++;	/* word[3] carries the per-block sequence */
	if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum)) != 0) {
		dprintf_bp(bp, "zilog %p bp %p stale pointer: ", zilog, bp);
		return (ESTALE);
	}

	if (BP_IS_HOLE(&ztp->zit_next_blk)) {
		dprintf_bp(bp, "zilog %p bp %p hole: ", zilog, bp);
		return (ENOENT);
	}

	/* a corrupt record-byte count would walk us off the buffer */
	if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t))) {
		dprintf("zilog %p bp %p nused exceeds blksz\n", zilog, bp);
		return (EOVERFLOW);
	}

	dprintf_bp(bp, "zilog %p bp %p good block: ", zilog, bp);

	return (0);
}
+
/*
 * Parse the intent log, and call parse_func for each valid record within.
 *
 * parse_blk_func (if non-NULL) is invoked once per block pointer — note
 * it is called even for the block on which zil_read_log_block() failed,
 * before the walk stops.  parse_lr_func (if non-NULL) is invoked for
 * each log record inside every successfully validated block.
 */
void
zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
    zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
{
	blkptr_t blk;
	char *lrbuf, *lrp;
	zil_trailer_t *ztp;
	int reclen, error;

	blk = zilog->zl_header->zh_log;
	if (BP_IS_HOLE(&blk))	/* empty log: nothing to walk */
		return;

	/*
	 * Starting at the block pointed to by zh_log we read the log chain.
	 * For each block in the chain we strongly check that block to
	 * ensure its validity.  We stop when an invalid block is found.
	 * For each block pointer in the chain we call parse_blk_func().
	 * For each record in each valid block we call parse_lr_func().
	 */
	zil_dva_tree_init(&zilog->zl_dva_tree);
	/* one max-size buffer is reused for every block in the chain */
	lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
	for (;;) {
		error = zil_read_log_block(zilog, &blk, lrbuf);

		if (parse_blk_func != NULL)
			parse_blk_func(zilog, &blk, arg, txg);

		if (error)
			break;

		/* advance to the next block via the trailer's pointer */
		ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
		blk = ztp->zit_next_blk;

		if (parse_lr_func == NULL)
			continue;

		for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) {
			lr_t *lr = (lr_t *)lrp;
			reclen = lr->lrc_reclen;
			ASSERT3U(reclen, >=, sizeof (lr_t));
			parse_lr_func(zilog, lr, arg, txg);
		}
	}
	zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
	zil_dva_tree_fini(&zilog->zl_dva_tree);
}
+
/* ARGSUSED */
/*
 * zil_parse() block callback for zil_claim(): claim one log block so
 * that pool-wide space accounting knows it is live.  Blocks born before
 * first_txg were already committed; the DVA tree filters duplicates.
 */
static void
zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
{
	spa_t *spa = zilog->zl_spa;
	int err;

	dprintf_bp(bp, "first_txg %llu: ", first_txg);

	/*
	 * Claim log block if not already committed and not already claimed.
	 */
	if (bp->blk_birth >= first_txg &&
	    zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) {
		err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL));
		ASSERT(err == 0);
	}
}
+
+static void
+zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
+{
+ if (lrc->lrc_txtype == TX_WRITE) {
+ lr_write_t *lr = (lr_write_t *)lrc;
+ zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg);
+ }
+}
+
/* ARGSUSED */
/*
 * zil_parse() block callback for zil_destroy(): free one log block
 * in the transaction group of 'tx'.
 */
static void
zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
{
	zio_free_blk(zilog->zl_spa, bp, dmu_tx_get_txg(tx));
}
+
+static void
+zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
+{
+ /*
+ * If we previously claimed it, we need to free it.
+ */
+ if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE) {
+ lr_write_t *lr = (lr_write_t *)lrc;
+ blkptr_t *bp = &lr->lr_blkptr;
+ if (bp->blk_birth >= claim_txg &&
+ !zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp))) {
+ (void) arc_free(NULL, zilog->zl_spa,
+ dmu_tx_get_txg(tx), bp, NULL, NULL, ARC_WAIT);
+ }
+ }
+}
+
/*
 * Create an on-disk intent log: allocate the first log block, seed its
 * checksum verifier, and queue the first log write buffer (lwb).
 * Waits for the creating txg to sync before returning so the header
 * update is stable on disk.
 */
static void
zil_create(zilog_t *zilog)
{
	lwb_t *lwb;
	uint64_t txg;
	dmu_tx_t *tx;
	blkptr_t blk;
	int error;

	ASSERT(zilog->zl_header->zh_claim_txg == 0);
	ASSERT(zilog->zl_header->zh_replay_seq == 0);

	/*
	 * Initialize the log header block.
	 */
	tx = dmu_tx_create(zilog->zl_os);
	(void) dmu_tx_assign(tx, TXG_WAIT);
	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
	txg = dmu_tx_get_txg(tx);

	/*
	 * Allocate the first log block and assign its checksum verifier.
	 */
	error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG,
	    ZIL_MIN_BLKSZ, &blk, txg);
	if (error == 0) {
		/* verifier = <random, random, objset id, seq 1> */
		ZIO_SET_CHECKSUM(&blk.blk_cksum,
		    spa_get_random(-1ULL), spa_get_random(-1ULL),
		    dmu_objset_id(zilog->zl_os), 1ULL);

		/*
		 * Allocate a log write buffer (lwb) for the first log block.
		 */
		lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
		lwb->lwb_zilog = zilog;
		lwb->lwb_blk = blk;
		lwb->lwb_nused = 0;
		lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk);
		lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
		lwb->lwb_max_txg = txg;
		lwb->lwb_seq = 0;
		lwb->lwb_state = UNWRITTEN;
		mutex_enter(&zilog->zl_lock);
		list_insert_tail(&zilog->zl_lwb_list, lwb);
		mutex_exit(&zilog->zl_lock);
	}

	dmu_tx_commit(tx);
	txg_wait_synced(zilog->zl_dmu_pool, txg);
}
+
/*
 * In one tx, free all log blocks and clear the log header.
 *
 * The actual header clearing happens in zil_sync() when it sees
 * zl_destroy_txg == txg; here we free the chain via zil_parse() and
 * wait for the txg to sync.  Serialized by zl_destroy_lock.
 */
void
zil_destroy(zilog_t *zilog)
{
	dmu_tx_t *tx;
	uint64_t txg;

	mutex_enter(&zilog->zl_destroy_lock);

	if (BP_IS_HOLE(&zilog->zl_header->zh_log)) {
		/* no on-disk log to destroy */
		mutex_exit(&zilog->zl_destroy_lock);
		return;
	}

	tx = dmu_tx_create(zilog->zl_os);
	(void) dmu_tx_assign(tx, TXG_WAIT);
	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
	txg = dmu_tx_get_txg(tx);

	zil_parse(zilog, zil_free_log_block, zil_free_log_record, tx,
	    zilog->zl_header->zh_claim_txg);
	zilog->zl_destroy_txg = txg;	/* zil_sync() clears the header */

	dmu_tx_commit(tx);
	txg_wait_synced(zilog->zl_dmu_pool, txg);

	mutex_exit(&zilog->zl_destroy_lock);
}
+
/*
 * Claim all blocks of the named dataset's intent log, if not already
 * claimed, so pool space accounting reflects them after import.
 * Called with an open claim transaction ('txarg') at pool open time.
 */
void
zil_claim(char *osname, void *txarg)
{
	dmu_tx_t *tx = txarg;
	uint64_t first_txg = dmu_tx_get_txg(tx);
	zilog_t *zilog;
	zil_header_t *zh;
	objset_t *os;
	int error;

	error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_STANDARD, &os);
	if (error) {
		/* best-effort: warn and skip this dataset */
		cmn_err(CE_WARN, "can't process intent log for %s", osname);
		return;
	}

	zilog = dmu_objset_zil(os);
	zh = zilog->zl_header;

	/*
	 * Claim all log blocks if we haven't already done so.
	 * zh_claim_txg != 0 marks the log as already claimed.
	 */
	ASSERT3U(zh->zh_claim_txg, <=, first_txg);
	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
		zh->zh_claim_txg = first_txg;
		zil_parse(zilog, zil_claim_log_block, zil_claim_log_record,
		    tx, first_txg);
		dsl_dataset_dirty(dmu_objset_ds(os), tx);
	}
	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
	dmu_objset_close(os);
}
+
+void
+zil_add_vdev(zilog_t *zilog, uint64_t vdev, uint64_t seq)
+{
+ zil_vdev_t *zv;
+
+ if (zil_noflush)
+ return;
+
+ ASSERT(MUTEX_HELD(&zilog->zl_lock));
+ zv = kmem_alloc(sizeof (zil_vdev_t), KM_SLEEP);
+ zv->vdev = vdev;
+ zv->seq = seq;
+ list_insert_tail(&zilog->zl_vdev_list, zv);
+}
+
+
/*
 * Issue DKIOCFLUSHWRITECACHE to every vdev that logged a sequence
 * number <= 'seq', de-duplicating per-vdev entries, then wait for all
 * flushes.  Caller holds zl_lock; it is dropped around the ioctl issue
 * and re-taken (the list head is re-read each iteration, so this is
 * safe against concurrent inserts).
 */
void
zil_flush_vdevs(zilog_t *zilog, uint64_t seq)
{
	vdev_t *vd;
	zil_vdev_t *zv, *zv2;
	zio_t *zio;
	spa_t *spa;
	uint64_t vdev;

	if (zil_noflush)
		return;

	ASSERT(MUTEX_HELD(&zilog->zl_lock));

	spa = zilog->zl_spa;
	zio = NULL;

	while ((zv = list_head(&zilog->zl_vdev_list)) != NULL &&
	    zv->seq <= seq) {
		vdev = zv->vdev;
		list_remove(&zilog->zl_vdev_list, zv);
		kmem_free(zv, sizeof (zil_vdev_t));

		/*
		 * remove all chained entries <= seq with same vdev
		 * (one flush per device is enough)
		 */
		zv = list_head(&zilog->zl_vdev_list);
		while (zv && zv->seq <= seq) {
			zv2 = list_next(&zilog->zl_vdev_list, zv);
			if (zv->vdev == vdev) {
				list_remove(&zilog->zl_vdev_list, zv);
				kmem_free(zv, sizeof (zil_vdev_t));
			}
			zv = zv2;
		}

		/* flush the write cache for this vdev */
		mutex_exit(&zilog->zl_lock);
		if (zio == NULL)
			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
		vd = vdev_lookup_top(spa, vdev);
		ASSERT(vd);
		(void) zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
		    NULL, NULL, ZIO_PRIORITY_NOW,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
		mutex_enter(&zilog->zl_lock);
	}

	/*
	 * Wait for all the flushes to complete.  Not all devices actually
	 * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
	 */
	if (zio != NULL)
		(void) zio_wait(zio);
}
+
/*
 * Function called when a log block write completes.
 *
 * Marks this lwb SEQ_COMPLETE and, because completions can arrive out
 * of order, walks forward through already-completed successors to push
 * zl_ss_seq (the highest sequence known stable) as far as possible.
 */
static void
zil_lwb_write_done(zio_t *zio)
{
	lwb_t *prev;
	lwb_t *lwb = zio->io_private;
	zilog_t *zilog = lwb->lwb_zilog;
	uint64_t max_seq;

	/*
	 * Now that we've written this log block, we have a stable pointer
	 * to the next block in the chain, so it's OK to let the txg in
	 * which we allocated the next block sync.
	 */
	txg_rele_to_sync(&lwb->lwb_txgh);

	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
	mutex_enter(&zilog->zl_lock);
	lwb->lwb_buf = NULL;	/* NULL buf marks this lwb fully written */
	if (zio->io_error) {
		zilog->zl_log_error = B_TRUE;
		mutex_exit(&zilog->zl_lock);
		cv_broadcast(&zilog->zl_cv_seq);
		return;
	}

	prev = list_prev(&zilog->zl_lwb_list, lwb);
	if (prev && prev->lwb_state != SEQ_COMPLETE) {
		/* There's an unwritten buffer in the chain before this one */
		lwb->lwb_state = SEQ_INCOMPLETE;
		mutex_exit(&zilog->zl_lock);
		return;
	}

	max_seq = lwb->lwb_seq;
	lwb->lwb_state = SEQ_COMPLETE;
	/*
	 * We must also follow up the chain for already written buffers
	 * to see if we can set zl_ss_seq even higher.
	 */
	while (lwb = list_next(&zilog->zl_lwb_list, lwb)) {
		if (lwb->lwb_state != SEQ_INCOMPLETE)
			break;
		lwb->lwb_state = SEQ_COMPLETE;
		/* lwb_seq will be zero if we've written an empty buffer */
		if (lwb->lwb_seq) {
			ASSERT3U(max_seq, <, lwb->lwb_seq);
			max_seq = lwb->lwb_seq;
		}
	}
	zilog->zl_ss_seq = MAX(max_seq, zilog->zl_ss_seq);
	mutex_exit(&zilog->zl_lock);
	cv_broadcast(&zilog->zl_cv_seq);	/* wake zil_commit() waiters */
}
+
/*
 * Start a log block write and advance to the next log block.
 * Calls are serialized.
 *
 * Returns the newly allocated next lwb, or NULL if the next block
 * could not be allocated (caller falls back to txg_wait_synced()).
 */
static lwb_t *
zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
{
	lwb_t *nlwb;
	zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
	uint64_t txg;
	uint64_t zil_blksz;
	int error;

	ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));

	/*
	 * Allocate the next block and save its address in this block
	 * before writing it in order to establish the log chain.
	 * Note that if the allocation of nlwb synced before we wrote
	 * the block that points at it (lwb), we'd leak it if we crashed.
	 * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done().
	 */
	txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh);
	txg_rele_to_quiesce(&lwb->lwb_txgh);

	/*
	 * Pick a ZIL blocksize based upon the size of the outstanding
	 * in-memory transactions, or if none the same size as the
	 * last block.
	 */
	if (zilog->zl_itx_list_sz) {
		zil_blksz = zilog->zl_itx_list_sz + sizeof (*ztp);
		zil_blksz = P2ROUNDUP(zil_blksz, ZIL_MIN_BLKSZ);
		if (zil_blksz > ZIL_MAX_BLKSZ)
			zil_blksz = ZIL_MAX_BLKSZ;
		zilog->zl_prev_blk_sz = zil_blksz;
	} else {
		zil_blksz = zilog->zl_prev_blk_sz;
	}

	error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG,
	    zil_blksz, &ztp->zit_next_blk, txg);
	if (error) {
		/* allocation failed; release the txg hold and give up */
		txg_rele_to_sync(&lwb->lwb_txgh);
		return (NULL);
	}

	ASSERT3U(ztp->zit_next_blk.blk_birth, ==, txg);
	ztp->zit_nused = lwb->lwb_nused;
	ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
	/* next block's verifier = this block's verifier with seq + 1 */
	ztp->zit_next_blk.blk_cksum = lwb->lwb_blk.blk_cksum;
	ztp->zit_next_blk.blk_cksum.zc_word[3]++;

	/*
	 * Allocate a new log write buffer (lwb).
	 */
	nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);

	nlwb->lwb_zilog = zilog;
	nlwb->lwb_blk = ztp->zit_next_blk;
	nlwb->lwb_nused = 0;
	nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
	nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
	nlwb->lwb_max_txg = txg;
	nlwb->lwb_seq = 0;
	nlwb->lwb_state = UNWRITTEN;

	/*
	 * Put new lwb at the end of the log chain,
	 * and record the vdev for later flushing
	 */
	mutex_enter(&zilog->zl_lock);
	list_insert_tail(&zilog->zl_lwb_list, nlwb);
	zil_add_vdev(zilog, DVA_GET_VDEV(BP_IDENTITY(&(lwb->lwb_blk))),
	    lwb->lwb_seq);
	mutex_exit(&zilog->zl_lock);

	/*
	 * write the old log block
	 */
	dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
	zio_nowait(zio_rewrite(NULL, zilog->zl_spa, ZIO_CHECKSUM_ZILOG, 0,
	    &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz, zil_lwb_write_done, lwb,
	    ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED));

	return (nlwb);
}
+
/*
 * Copy one in-memory intent log transaction (itx) into the current log
 * write buffer, starting a new block if it doesn't fit.  For TX_WRITE
 * records the data (or its block pointer) is fetched via zl_get_data.
 * Returns the lwb to continue with, or NULL if block allocation failed.
 */
static lwb_t *
zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
{
	lr_t *lrc = &itx->itx_lr; /* common log record */
	uint64_t seq = lrc->lrc_seq;
	uint64_t txg = lrc->lrc_txg;
	uint64_t reclen = lrc->lrc_reclen;
	int error;

	if (lwb == NULL)
		return (NULL);
	ASSERT(lwb->lwb_buf != NULL);

	/*
	 * If it's a write, fetch the data or get its blkptr as appropriate.
	 */
	if (lrc->lrc_txtype == TX_WRITE) {
		lr_write_t *lr = (lr_write_t *)lrc;
		if (txg > spa_freeze_txg(zilog->zl_spa))
			txg_wait_synced(zilog->zl_dmu_pool, txg);

		if (!itx->itx_data_copied &&
		    (error = zilog->zl_get_data(itx->itx_private, lr)) != 0) {
			/*
			 * ENOENT/EALREADY mean the record is satisfiable
			 * without data; any other error falls back to
			 * waiting for the txg to sync.  Either way the
			 * record is considered stable at 'seq'.
			 */
			if (error != ENOENT && error != EALREADY) {
				txg_wait_synced(zilog->zl_dmu_pool, txg);
				mutex_enter(&zilog->zl_lock);
				zilog->zl_ss_seq = MAX(seq, zilog->zl_ss_seq);
				zil_add_vdev(zilog,
				    DVA_GET_VDEV(BP_IDENTITY(&(lr->lr_blkptr))),
				    seq);
				mutex_exit(&zilog->zl_lock);
				return (lwb);
			}
			mutex_enter(&zilog->zl_lock);
			zil_add_vdev(zilog,
			    DVA_GET_VDEV(BP_IDENTITY(&(lr->lr_blkptr))), seq);
			mutex_exit(&zilog->zl_lock);
			return (lwb);
		}
	}

	/*
	 * If this record won't fit in the current log block, start a new one.
	 */
	if (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)) {
		lwb = zil_lwb_write_start(zilog, lwb);
		if (lwb == NULL)
			return (NULL);
		if (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)) {
			/* record is bigger than any block; fall back to sync */
			txg_wait_synced(zilog->zl_dmu_pool, txg);
			mutex_enter(&zilog->zl_lock);
			zilog->zl_ss_seq = MAX(seq, zilog->zl_ss_seq);
			mutex_exit(&zilog->zl_lock);
			return (lwb);
		}
	}

	bcopy(lrc, lwb->lwb_buf + lwb->lwb_nused, reclen);
	lwb->lwb_nused += reclen;
	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
	ASSERT3U(lwb->lwb_seq, <, seq);
	lwb->lwb_seq = seq;
	ASSERT3U(lwb->lwb_nused, <=, ZIL_BLK_DATA_SZ(lwb));
	ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0);

	return (lwb);
}
+
+itx_t *
+zil_itx_create(int txtype, size_t lrsize)
+{
+ itx_t *itx;
+
+ lrsize = P2ROUNDUP(lrsize, sizeof (uint64_t));
+
+ itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
+ itx->itx_lr.lrc_txtype = txtype;
+ itx->itx_lr.lrc_reclen = lrsize;
+ itx->itx_lr.lrc_seq = 0; /* defensive */
+
+ return (itx);
+}
+
+uint64_t
+zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
+{
+ uint64_t seq;
+
+ ASSERT(itx->itx_lr.lrc_seq == 0);
+
+ mutex_enter(&zilog->zl_lock);
+ list_insert_tail(&zilog->zl_itx_list, itx);
+ zilog->zl_itx_list_sz += itx->itx_lr.lrc_reclen;
+ itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
+ itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq;
+ mutex_exit(&zilog->zl_lock);
+
+ return (seq);
+}
+
/*
 * Free up all in-memory intent log transactions that have now been synced.
 * Also advances zl_ss_seq past the freed records and wakes waiters,
 * since a synced record is by definition stable.
 */
static void
zil_itx_clean(zilog_t *zilog)
{
	uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa);
	uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa);
	uint64_t max_seq = 0;
	itx_t *itx;

	mutex_enter(&zilog->zl_lock);
	/* list is in txg order, so stop at the first unsynced itx */
	while ((itx = list_head(&zilog->zl_itx_list)) != NULL &&
	    itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) {
		list_remove(&zilog->zl_itx_list, itx);
		zilog->zl_itx_list_sz -= itx->itx_lr.lrc_reclen;
		ASSERT3U(max_seq, <, itx->itx_lr.lrc_seq);
		max_seq = itx->itx_lr.lrc_seq;
		kmem_free(itx, offsetof(itx_t, itx_lr)
		    + itx->itx_lr.lrc_reclen);
	}
	if (max_seq > zilog->zl_ss_seq) {
		zilog->zl_ss_seq = max_seq;
		cv_broadcast(&zilog->zl_cv_seq);
	}
	mutex_exit(&zilog->zl_lock);
}
+
/*
 * Dispatch zil_itx_clean() to the per-zilog taskq if there are any
 * queued itxs that might now be freeable.
 */
void
zil_clean(zilog_t *zilog)
{
	/*
	 * Check for any log blocks that can be freed.
	 * Log blocks are only freed when the log block allocation and
	 * log records contained within are both known to be committed.
	 */
	mutex_enter(&zilog->zl_lock);
	if (list_head(&zilog->zl_itx_list) != NULL)
		(void) taskq_dispatch(zilog->zl_clean_taskq,
		    (void (*)(void *))zil_itx_clean, zilog, TQ_NOSLEEP);
	mutex_exit(&zilog->zl_lock);
}
+
/*
 * Push zfs transactions to stable storage up to the supplied sequence number.
 *
 * Only one thread writes at a time (zl_writer election below); others
 * wait on zl_cv_write.  On log-write error or with no lwb (suspended
 * log), falls back to txg_wait_synced() to honor the sync semantics.
 */
void
zil_commit(zilog_t *zilog, uint64_t seq, int ioflag)
{
	uint64_t txg;
	uint64_t max_seq;
	uint64_t reclen;
	itx_t *itx;
	lwb_t *lwb;
	spa_t *spa;

	/* nothing to do unless a sync-semantics flag (or zil_always) is set */
	if (zilog == NULL || seq == 0 ||
	    ((ioflag & (FSYNC | FDSYNC | FRSYNC)) == 0 && !zil_always))
		return;

	spa = zilog->zl_spa;
	mutex_enter(&zilog->zl_lock);

	seq = MIN(seq, zilog->zl_itx_seq);	/* cap seq at largest itx seq */

	/*
	 * Writer election: return early if already stable, otherwise
	 * either become the writer or wait for the current one.
	 */
	for (;;) {
		if (zilog->zl_ss_seq >= seq) {	/* already on stable storage */
			cv_signal(&zilog->zl_cv_write);
			mutex_exit(&zilog->zl_lock);
			return;
		}

		if (zilog->zl_writer == B_FALSE)	/* no one writing, do it */
			break;

		cv_wait(&zilog->zl_cv_write, &zilog->zl_lock);
	}

	zilog->zl_writer = B_TRUE;
	max_seq = 0;

	if (zilog->zl_suspend) {
		lwb = NULL;	/* suspended: force the sync fallback below */
	} else {
		lwb = list_tail(&zilog->zl_lwb_list);
		if (lwb == NULL) {
			/* first use: create the on-disk log */
			mutex_exit(&zilog->zl_lock);
			zil_create(zilog);
			mutex_enter(&zilog->zl_lock);
			lwb = list_tail(&zilog->zl_lwb_list);
		}
	}

	/*
	 * Loop through in-memory log transactions filling log blocks,
	 * until we reach the given sequence number and there's no more
	 * room in the write buffer.
	 */
	for (;;) {
		itx = list_head(&zilog->zl_itx_list);
		if (itx == NULL)
			break;

		reclen = itx->itx_lr.lrc_reclen;
		if ((itx->itx_lr.lrc_seq > seq) &&
		    ((lwb == NULL) || (lwb->lwb_nused + reclen >
		    ZIL_BLK_DATA_SZ(lwb))))
			break;

		list_remove(&zilog->zl_itx_list, itx);
		txg = itx->itx_lr.lrc_txg;
		ASSERT(txg);

		mutex_exit(&zilog->zl_lock);
		if (txg > spa_last_synced_txg(spa) ||
		    txg > spa_freeze_txg(spa))
			lwb = zil_lwb_commit(zilog, itx, lwb);
		else
			max_seq = itx->itx_lr.lrc_seq;	/* already synced */
		kmem_free(itx, offsetof(itx_t, itx_lr)
		    + itx->itx_lr.lrc_reclen);
		mutex_enter(&zilog->zl_lock);
		zilog->zl_itx_list_sz -= reclen;
	}

	mutex_exit(&zilog->zl_lock);

	/* write the last block out */
	if (lwb != NULL && lwb->lwb_nused != 0)
		lwb = zil_lwb_write_start(zilog, lwb);

	/* wake up others waiting to start a write */
	mutex_enter(&zilog->zl_lock);
	zilog->zl_writer = B_FALSE;
	cv_signal(&zilog->zl_cv_write);

	if (max_seq > zilog->zl_ss_seq) {
		zilog->zl_ss_seq = max_seq;
		cv_broadcast(&zilog->zl_cv_seq);
	}
	/*
	 * Wait if necessary for our seq to be committed.
	 */
	if (lwb) {
		while (zilog->zl_ss_seq < seq && zilog->zl_log_error == 0)
			cv_wait(&zilog->zl_cv_seq, &zilog->zl_lock);
		zil_flush_vdevs(zilog, seq);
	}
	if (zilog->zl_log_error || lwb == NULL) {
		/*
		 * Log write failed or log is suspended: fall back to
		 * txg_wait_synced(), after which everything is stable.
		 */
		zilog->zl_log_error = 0;
		max_seq = zilog->zl_itx_seq;
		mutex_exit(&zilog->zl_lock);
		txg_wait_synced(zilog->zl_dmu_pool, 0);
		mutex_enter(&zilog->zl_lock);
		zilog->zl_ss_seq = MAX(max_seq, zilog->zl_ss_seq);
		cv_broadcast(&zilog->zl_cv_seq);
	}
	mutex_exit(&zilog->zl_lock);
}
+
/*
 * Called in syncing context to free committed log blocks and update log header.
 * Also performs the deferred header wipe requested by zil_destroy().
 */
void
zil_sync(zilog_t *zilog, dmu_tx_t *tx)
{
	uint64_t txg = dmu_tx_get_txg(tx);
	spa_t *spa = zilog->zl_spa;
	lwb_t *lwb;

	ASSERT(zilog->zl_stop_sync == 0);

	zilog->zl_header->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK];

	if (zilog->zl_destroy_txg == txg) {
		/* zil_destroy() freed the chain in this txg; clear header */
		bzero(zilog->zl_header, sizeof (zil_header_t));
		bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq));
		zilog->zl_destroy_txg = 0;
	}

	mutex_enter(&zilog->zl_lock);
	for (;;) {
		lwb = list_head(&zilog->zl_lwb_list);
		if (lwb == NULL) {
			mutex_exit(&zilog->zl_lock);
			return;
		}
		/* stop at the first lwb still in flight or still needed */
		if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
			break;
		list_remove(&zilog->zl_lwb_list, lwb);
		zio_free_blk(spa, &lwb->lwb_blk, txg);
		kmem_cache_free(zil_lwb_cache, lwb);
	}
	/* the oldest surviving lwb becomes the new head of the chain */
	zilog->zl_header->zh_log = lwb->lwb_blk;
	mutex_exit(&zilog->zl_lock);
}
+
/*
 * One-time module initialization: create the lwb kmem cache.
 */
void
zil_init(void)
{
	zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
	    sizeof (struct lwb), NULL, NULL, NULL, NULL, NULL, NULL, 0);
}
+
/*
 * One-time module teardown: destroy the lwb kmem cache.
 */
void
zil_fini(void)
{
	kmem_cache_destroy(zil_lwb_cache);
}
+
+zilog_t *
+zil_alloc(objset_t *os, zil_header_t *zh_phys)
+{
+ zilog_t *zilog;
+
+ zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);
+
+ zilog->zl_header = zh_phys;
+ zilog->zl_os = os;
+ zilog->zl_spa = dmu_objset_spa(os);
+ zilog->zl_dmu_pool = dmu_objset_pool(os);
+ zilog->zl_prev_blk_sz = ZIL_MIN_BLKSZ;
+
+ list_create(&zilog->zl_itx_list, sizeof (itx_t),
+ offsetof(itx_t, itx_node));
+
+ list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
+ offsetof(lwb_t, lwb_node));
+
+ list_create(&zilog->zl_vdev_list, sizeof (zil_vdev_t),
+ offsetof(zil_vdev_t, vdev_seq_node));
+
+ return (zilog);
+}
+
/*
 * Tear down an in-core zilog allocated by zil_alloc(): release any
 * remaining log write buffers and vdev-flush records, destroy the
 * lists, and free the zilog itself.  The itx list must already be
 * empty (see the ASSERT below).
 */
void
zil_free(zilog_t *zilog)
{
	lwb_t *lwb;
	zil_vdev_t *zv;

	/* keep zil_sync() from touching state while we tear down */
	zilog->zl_stop_sync = 1;

	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
		list_remove(&zilog->zl_lwb_list, lwb);
		if (lwb->lwb_buf != NULL)
			zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
		kmem_cache_free(zil_lwb_cache, lwb);
	}
	list_destroy(&zilog->zl_lwb_list);

	while ((zv = list_head(&zilog->zl_vdev_list)) != NULL) {
		list_remove(&zilog->zl_vdev_list, zv);
		kmem_free(zv, sizeof (zil_vdev_t));
	}
	list_destroy(&zilog->zl_vdev_list);

	ASSERT(list_head(&zilog->zl_itx_list) == NULL);
	list_destroy(&zilog->zl_itx_list);

	kmem_free(zilog, sizeof (zilog_t));
}
+
/*
 * Open an intent log.
 *
 * Installs the caller's data-fetch callback and creates the
 * single-threaded cleanup taskq used by zil_clean().
 */
zilog_t *
zil_open(objset_t *os, zil_get_data_t *get_data)
{
	zilog_t *zilog = dmu_objset_zil(os);

	zilog->zl_get_data = get_data;
	zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri,
	    2, 2, TASKQ_PREPOPULATE);

	return (zilog);
}
+
/*
 * Close an intent log.
 *
 * Waits for everything to sync, destroys the cleanup taskq, and frees
 * the (now fully synced) in-memory itx list.
 */
void
zil_close(zilog_t *zilog)
{
	txg_wait_synced(zilog->zl_dmu_pool, 0);
	taskq_destroy(zilog->zl_clean_taskq);
	zilog->zl_clean_taskq = NULL;
	zilog->zl_get_data = NULL;

	/* everything synced above, so this frees the whole list */
	zil_itx_clean(zilog);
	ASSERT(list_head(&zilog->zl_itx_list) == NULL);
}
+
/*
 * Suspend an intent log.  While in suspended mode, we still honor
 * synchronous semantics, but we rely on txg_wait_synced() to do it.
 * We suspend the log briefly when taking a snapshot so that the snapshot
 * contains all the data it's supposed to, and has an empty intent log.
 *
 * Returns 0 on success, or EBUSY if the log still has unreplayed
 * claimed records.  Pair with zil_resume().
 */
int
zil_suspend(zilog_t *zilog)
{
	lwb_t *lwb;

	mutex_enter(&zilog->zl_lock);
	if (zilog->zl_header->zh_claim_txg != 0) {	/* unplayed log */
		mutex_exit(&zilog->zl_lock);
		return (EBUSY);
	}
	zilog->zl_suspend++;
	mutex_exit(&zilog->zl_lock);

	/* push everything queued so far out to the log chain */
	zil_commit(zilog, UINT64_MAX, FSYNC);

	mutex_enter(&zilog->zl_lock);
	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
		if (lwb->lwb_buf != NULL) {
			/*
			 * Wait for the buffer if it's in the process of
			 * being written.
			 */
			if ((lwb->lwb_seq != 0) &&
			    (lwb->lwb_state != SEQ_COMPLETE)) {
				cv_wait(&zilog->zl_cv_seq, &zilog->zl_lock);
				continue;	/* re-check from list head */
			}
			zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
		}
		list_remove(&zilog->zl_lwb_list, lwb);
		kmem_cache_free(zil_lwb_cache, lwb);
	}
	mutex_exit(&zilog->zl_lock);

	/* free the on-disk chain and clear the header */
	zil_destroy(zilog);

	return (0);
}
+
/*
 * Undo one zil_suspend(): drop the suspend count so new log writes
 * may proceed again.
 */
void
zil_resume(zilog_t *zilog)
{
	mutex_enter(&zilog->zl_lock);
	ASSERT(zilog->zl_suspend != 0);
	zilog->zl_suspend--;
	mutex_exit(&zilog->zl_lock);
}
+
/*
 * Argument bundle threaded through zil_parse() during log replay.
 */
typedef struct zil_replay_arg {
	objset_t *zr_os;		/* objset being replayed into */
	zil_replay_func_t **zr_replay;	/* per-txtype replay vectors */
	void *zr_arg;			/* opaque arg for replay vectors */
	void (*zr_rm_sync)(void *arg);	/* wait-for-deletes callback */
	uint64_t *zr_txgp;		/* txg the vectors must assign into */
	boolean_t zr_byteswap;		/* log written with other endianness */
	char *zr_lrbuf;			/* scratch: record copy + write data */
} zil_replay_arg_t;
+
/*
 * Replay a single log record via the registered replay vector, then
 * record the replayed sequence number so the header can be updated in
 * the same txg.  Skips records already committed or already replayed.
 * On unrecoverable error, logs a warning and stops further replay.
 */
static void
zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
{
	zil_replay_arg_t *zr = zra;
	zil_header_t *zh = zilog->zl_header;
	uint64_t reclen = lr->lrc_reclen;
	uint64_t txtype = lr->lrc_txtype;
	int pass, error;

	if (zilog->zl_stop_replay)
		return;

	if (lr->lrc_txg < claim_txg)		/* already committed */
		return;

	if (lr->lrc_seq <= zh->zh_replay_seq)	/* already replayed */
		return;

	/*
	 * Make a copy of the data so we can revise and extend it.
	 */
	bcopy(lr, zr->zr_lrbuf, reclen);

	/*
	 * The log block containing this lr may have been byteswapped
	 * so that we can easily examine common fields like lrc_txtype.
	 * However, the log is a mix of different data types, and only the
	 * replay vectors know how to byteswap their records.  Therefore, if
	 * the lr was byteswapped, undo it before invoking the replay vector.
	 */
	if (zr->zr_byteswap)
		byteswap_uint64_array(zr->zr_lrbuf, reclen);

	/*
	 * If this is a TX_WRITE with a blkptr, suck in the data.
	 */
	if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
		lr_write_t *lrw = (lr_write_t *)lr;
		blkptr_t *wbp = &lrw->lr_blkptr;
		uint64_t wlen = lrw->lr_length;
		char *wbuf = zr->zr_lrbuf + reclen;

		if (BP_IS_HOLE(wbp)) {	/* compressed to a hole */
			bzero(wbuf, wlen);
		} else {
			/*
			 * A subsequent write may have overwritten this block,
			 * in which case wbp may have been been freed and
			 * reallocated, and our read of wbp may fail with a
			 * checksum error.  We can safely ignore this because
			 * the later write will provide the correct data.
			 */
			(void) zio_wait(zio_read(NULL, zilog->zl_spa,
			    wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL,
			    ZIO_PRIORITY_SYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE));
			(void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen);
		}
	}

	/*
	 * We must now do two things atomically: replay this log record,
	 * and update the log header to reflect the fact that we did so.
	 * We use the DMU's ability to assign into a specific txg to do this.
	 */
	for (pass = 1; /* CONSTANTCONDITION */; pass++) {
		uint64_t replay_txg;
		dmu_tx_t *replay_tx;

		replay_tx = dmu_tx_create(zr->zr_os);
		error = dmu_tx_assign(replay_tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(replay_tx);
			break;
		}

		replay_txg = dmu_tx_get_txg(replay_tx);

		if (txtype == 0 || txtype >= TX_MAX_TYPE) {
			error = EINVAL;
		} else {
			/*
			 * On the first pass, arrange for the replay vector
			 * to fail its dmu_tx_assign().  That's the only way
			 * to ensure that those code paths remain well tested.
			 */
			*zr->zr_txgp = replay_txg - (pass == 1);
			error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
			    zr->zr_byteswap);
			*zr->zr_txgp = TXG_NOWAIT;
		}

		if (error == 0) {
			dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx);
			/* zil_sync() copies this into the on-disk header */
			zilog->zl_replay_seq[replay_txg & TXG_MASK] =
			    lr->lrc_seq;
		}

		dmu_tx_commit(replay_tx);

		if (error != ERESTART)
			break;

		if (pass != 1)
			txg_wait_open(spa_get_dsl(zilog->zl_spa),
			    replay_txg + 1);

		dprintf("pass %d, retrying\n", pass);
	}

	if (error) {
		char *name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
		dmu_objset_name(zr->zr_os, name);
		cmn_err(CE_WARN, "ZFS replay transaction error %d, "
		    "dataset %s, seq 0x%llx, txtype %llu\n",
		    error, name,
		    (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype);
		zilog->zl_stop_replay = 1;
		kmem_free(name, MAXNAMELEN);
	}

	/*
	 * The DMU's dnode layer doesn't see removes until the txg commits,
	 * so a subsequent claim can spuriously fail with EEXIST.
	 * To prevent this, if we might have removed an object,
	 * wait for the delete thread to delete it, and then
	 * wait for the transaction group to sync.
	 */
	if (txtype == TX_REMOVE || txtype == TX_RMDIR || txtype == TX_RENAME) {
		if (zr->zr_rm_sync != NULL)
			zr->zr_rm_sync(zr->zr_arg);
		txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
	}
}
+
/*
 * If this dataset has an intent log, replay it and destroy it.
 *
 * 'replay_func' is the per-txtype vector table; 'rm_sync' (optional)
 * waits for pending deletes so claims don't spuriously collide.
 */
void
zil_replay(objset_t *os, void *arg, uint64_t *txgp,
    zil_replay_func_t *replay_func[TX_MAX_TYPE], void (*rm_sync)(void *arg))
{
	zilog_t *zilog = dmu_objset_zil(os);
	zil_replay_arg_t zr;

	zr.zr_os = os;
	zr.zr_replay = replay_func;
	zr.zr_arg = arg;
	zr.zr_rm_sync = rm_sync;
	zr.zr_txgp = txgp;
	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zilog->zl_header->zh_log);
	/* 2x max block size: record copy plus pulled-in write data */
	zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);

	/*
	 * Wait for in-progress removes to sync before starting replay.
	 */
	if (rm_sync != NULL)
		rm_sync(arg);
	txg_wait_synced(zilog->zl_dmu_pool, 0);

	zilog->zl_stop_replay = 0;
	zil_parse(zilog, NULL, zil_replay_log_record, &zr,
	    zilog->zl_header->zh_claim_txg);
	kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE);

	/* replayed records are durable now; discard the log */
	zil_destroy(zilog);
}
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
new file mode 100644
index 0000000000..7323292859
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -0,0 +1,1698 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio_impl.h>
+#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
+
+static void zio_vdev_io_enter(zio_t *zio);
+static void zio_vdev_io_exit(zio_t *zio);
+
+/*
+ * ==========================================================================
+ * I/O priority table
+ * ==========================================================================
+ */
+uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
+ 0, /* ZIO_PRIORITY_NOW */
+ 0, /* ZIO_PRIORITY_SYNC_READ */
+ 0, /* ZIO_PRIORITY_SYNC_WRITE */
+ 6, /* ZIO_PRIORITY_ASYNC_READ */
+ 4, /* ZIO_PRIORITY_ASYNC_WRITE */
+ 4, /* ZIO_PRIORITY_FREE */
+ 0, /* ZIO_PRIORITY_CACHE_FILL */
+ 0, /* ZIO_PRIORITY_LOG_WRITE */
+ 10, /* ZIO_PRIORITY_RESILVER */
+ 20, /* ZIO_PRIORITY_SCRUB */
+};
+
+/*
+ * ==========================================================================
+ * I/O type descriptions
+ * ==========================================================================
+ */
+char *zio_type_name[ZIO_TYPES] = {
+ "null", "read", "write", "free", "claim", "ioctl" };
+
+/* At or above this size, force gang blocking - for testing */
+uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1;
+
+typedef struct zio_sync_pass {
+ int zp_defer_free; /* defer frees after this pass */
+ int zp_dontcompress; /* don't compress after this pass */
+ int zp_rewrite; /* rewrite new bps after this pass */
+} zio_sync_pass_t;
+
+zio_sync_pass_t zio_sync_pass = {
+ 1, /* zp_defer_free */
+ 4, /* zp_dontcompress */
+ 1, /* zp_rewrite */
+};
+
+/*
+ * ==========================================================================
+ * I/O kmem caches
+ * ==========================================================================
+ */
+kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+
+void
+zio_init(void)
+{
+ size_t c;
+
+ /*
+ * For small buffers, we want a cache for each multiple of
+ * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache
+ * for each quarter-power of 2. For large buffers, we want
+ * a cache for each multiple of PAGESIZE.
+ */
+ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
+ size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
+ size_t p2 = size;
+ size_t align = 0;
+
+ while (p2 & (p2 - 1))
+ p2 &= p2 - 1;
+
+ if (size <= 4 * SPA_MINBLOCKSIZE) {
+ align = SPA_MINBLOCKSIZE;
+ } else if (P2PHASE(size, PAGESIZE) == 0) {
+ align = PAGESIZE;
+ } else if (P2PHASE(size, p2 >> 2) == 0) {
+ align = p2 >> 2;
+ }
+
+ if (align != 0) {
+ char name[30];
+ (void) sprintf(name, "zio_buf_%lu", size);
+ zio_buf_cache[c] = kmem_cache_create(name, size,
+ align, NULL, NULL, NULL, NULL, NULL, 0);
+ dprintf("creating cache for size %5lx align %5lx\n",
+ size, align);
+ }
+ }
+
+ while (--c != 0) {
+ ASSERT(zio_buf_cache[c] != NULL);
+ if (zio_buf_cache[c - 1] == NULL)
+ zio_buf_cache[c - 1] = zio_buf_cache[c];
+ }
+}
+
+void
+zio_fini(void)
+{
+ size_t c;
+ kmem_cache_t *last_cache = NULL;
+
+ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
+ if (zio_buf_cache[c] != last_cache) {
+ last_cache = zio_buf_cache[c];
+ kmem_cache_destroy(zio_buf_cache[c]);
+ }
+ zio_buf_cache[c] = NULL;
+ }
+}
+
+/*
+ * ==========================================================================
+ * Allocate and free I/O buffers
+ * ==========================================================================
+ */
+void *
+zio_buf_alloc(size_t size)
+{
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+ return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP));
+}
+
+void
+zio_buf_free(void *buf, size_t size)
+{
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+ kmem_cache_free(zio_buf_cache[c], buf);
+}
+
+/*
+ * ==========================================================================
+ * Push and pop I/O transform buffers
+ * ==========================================================================
+ */
+static void
+zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize)
+{
+ zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
+
+ zt->zt_data = data;
+ zt->zt_size = size;
+ zt->zt_bufsize = bufsize;
+
+ zt->zt_next = zio->io_transform_stack;
+ zio->io_transform_stack = zt;
+
+ zio->io_data = data;
+ zio->io_size = size;
+}
+
+static void
+zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize)
+{
+ zio_transform_t *zt = zio->io_transform_stack;
+
+ *data = zt->zt_data;
+ *size = zt->zt_size;
+ *bufsize = zt->zt_bufsize;
+
+ zio->io_transform_stack = zt->zt_next;
+ kmem_free(zt, sizeof (zio_transform_t));
+
+ if ((zt = zio->io_transform_stack) != NULL) {
+ zio->io_data = zt->zt_data;
+ zio->io_size = zt->zt_size;
+ }
+}
+
+static void
+zio_clear_transform_stack(zio_t *zio)
+{
+ void *data;
+ uint64_t size, bufsize;
+
+ ASSERT(zio->io_transform_stack != NULL);
+
+ zio_pop_transform(zio, &data, &size, &bufsize);
+ while (zio->io_transform_stack != NULL) {
+ zio_buf_free(data, bufsize);
+ zio_pop_transform(zio, &data, &size, &bufsize);
+ }
+}
+
+/*
+ * ==========================================================================
+ * Create the various types of I/O (read, write, free)
+ * ==========================================================================
+ */
+static zio_t *
+zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ void *data, uint64_t size, zio_done_func_t *done, void *private,
+ zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline)
+{
+ zio_t *zio;
+
+ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
+
+ zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
+ zio->io_parent = pio;
+ zio->io_spa = spa;
+ zio->io_txg = txg;
+ if (bp != NULL) {
+ zio->io_bp = bp;
+ zio->io_bp_copy = *bp;
+ zio->io_bp_orig = *bp;
+ /* XXBP - Need to inherit this when it matters */
+ zio->io_dva_index = 0;
+ }
+ zio->io_done = done;
+ zio->io_private = private;
+ zio->io_type = type;
+ zio->io_priority = priority;
+ zio->io_stage = stage;
+ zio->io_pipeline = pipeline;
+ zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES;
+ zio->io_timestamp = lbolt64;
+ zio->io_flags = flags;
+ zio_push_transform(zio, data, size, size);
+
+ if (pio == NULL) {
+ if (!(flags & ZIO_FLAG_CONFIG_HELD))
+ spa_config_enter(zio->io_spa, RW_READER);
+ zio->io_root = zio;
+ } else {
+ zio->io_root = pio->io_root;
+
+ mutex_enter(&pio->io_lock);
+ if (stage < ZIO_STAGE_READY)
+ pio->io_children_notready++;
+ pio->io_children_notdone++;
+ zio->io_sibling_next = pio->io_child;
+ zio->io_sibling_prev = NULL;
+ if (pio->io_child != NULL)
+ pio->io_child->io_sibling_prev = zio;
+ pio->io_child = zio;
+ mutex_exit(&pio->io_lock);
+ }
+
+ return (zio);
+}
+
+zio_t *
+zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
+ int flags)
+{
+ zio_t *zio;
+
+ zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
+ ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN,
+ ZIO_WAIT_FOR_CHILDREN_PIPELINE);
+
+ return (zio);
+}
+
+zio_t *
+zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
+{
+ return (zio_null(NULL, spa, done, private, flags));
+}
+
+zio_t *
+zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
+ uint64_t size, zio_done_func_t *done, void *private,
+ int priority, int flags)
+{
+ zio_t *zio;
+ dva_t *dva;
+
+ ASSERT3U(size, ==, BP_GET_LSIZE(bp));
+
+ zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private,
+ ZIO_TYPE_READ, priority, flags, ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
+
+ /*
+ * Work off our copy of the bp so the caller can free it.
+ */
+ zio->io_bp = &zio->io_bp_copy;
+
+ bp = zio->io_bp;
+ dva = ZIO_GET_DVA(zio);
+
+ if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
+ uint64_t csize = BP_GET_PSIZE(bp);
+ void *cbuf = zio_buf_alloc(csize);
+
+ zio_push_transform(zio, cbuf, csize, csize);
+ zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
+ }
+
+ if (DVA_GET_GANG(dva)) {
+ uint64_t gsize = SPA_GANGBLOCKSIZE;
+ void *gbuf = zio_buf_alloc(gsize);
+
+ zio_push_transform(zio, gbuf, gsize, gsize);
+ zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
+ }
+
+ return (zio);
+}
+
+zio_t *
+zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+ uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+ zio_done_func_t *done, void *private, int priority, int flags)
+{
+ zio_t *zio;
+
+ ASSERT(checksum >= ZIO_CHECKSUM_OFF &&
+ checksum < ZIO_CHECKSUM_FUNCTIONS);
+
+ ASSERT(compress >= ZIO_COMPRESS_OFF &&
+ compress < ZIO_COMPRESS_FUNCTIONS);
+
+ zio = zio_create(pio, spa, txg, bp, data, size, done, private,
+ ZIO_TYPE_WRITE, priority, flags,
+ ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
+
+ zio->io_checksum = checksum;
+ zio->io_compress = compress;
+
+ if (compress != ZIO_COMPRESS_OFF)
+ zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS;
+
+ if (bp->blk_birth != txg) {
+ /* XXX the bp usually (always?) gets re-zeroed later */
+ BP_ZERO(bp);
+ BP_SET_LSIZE(bp, size);
+ BP_SET_PSIZE(bp, size);
+ }
+
+ return (zio);
+}
+
+zio_t *
+zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
+ uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+ zio_done_func_t *done, void *private, int priority, int flags)
+{
+ zio_t *zio;
+
+ /* XXBP - We need to re-evaluate when to insert pipeline stages */
+ zio = zio_create(pio, spa, txg, bp, data, size, done, private,
+ ZIO_TYPE_WRITE, priority, flags,
+ ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
+
+ zio->io_checksum = checksum;
+ zio->io_compress = ZIO_COMPRESS_OFF;
+
+ return (zio);
+}
+
+static zio_t *
+zio_write_allocate(zio_t *pio, spa_t *spa, int checksum,
+ uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+ zio_done_func_t *done, void *private, int priority, int flags)
+{
+ zio_t *zio;
+
+ BP_ZERO(bp);
+ BP_SET_LSIZE(bp, size);
+ BP_SET_PSIZE(bp, size);
+ BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+
+ zio = zio_create(pio, spa, txg, bp, data, size, done, private,
+ ZIO_TYPE_WRITE, priority, flags,
+ ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE);
+
+ zio->io_checksum = checksum;
+ zio->io_compress = ZIO_COMPRESS_OFF;
+
+ return (zio);
+}
+
+zio_t *
+zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ zio_done_func_t *done, void *private)
+{
+ zio_t *zio;
+
+ ASSERT(!BP_IS_HOLE(bp));
+
+ if (txg == spa->spa_syncing_txg &&
+ spa->spa_sync_pass > zio_sync_pass.zp_defer_free) {
+ bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
+ return (zio_null(pio, spa, NULL, NULL, 0));
+ }
+
+ /* XXBP - We need to re-evaluate when to insert pipeline stages */
+ zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
+ ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, 0,
+ ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
+
+ zio->io_bp = &zio->io_bp_copy;
+
+ return (zio);
+}
+
+zio_t *
+zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ zio_done_func_t *done, void *private)
+{
+ zio_t *zio;
+
+ /*
+ * A claim is an allocation of a specific block. Claims are needed
+ * to support immediate writes in the intent log. The issue is that
+ * immediate writes contain committed data, but in a txg that was
+ * *not* committed. Upon opening the pool after an unclean shutdown,
+ * the intent log claims all blocks that contain immediate write data
+ * so that the SPA knows they're in use.
+ *
+ * All claims *must* be resolved in the first txg -- before the SPA
+ * starts allocating blocks -- so that nothing is allocated twice.
+ */
+ ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
+ ASSERT3U(spa_first_txg(spa), <=, txg);
+
+ /* XXBP - We need to re-evaluate when to insert pipeline stages */
+ zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
+ ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
+ ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
+
+ zio->io_bp = &zio->io_bp_copy;
+
+ return (zio);
+}
+
+zio_t *
+zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
+ zio_done_func_t *done, void *private, int priority, int flags)
+{
+ zio_t *zio;
+ int c;
+
+ if (vd->vdev_children == 0) {
+ zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
+ ZIO_TYPE_IOCTL, priority, flags,
+ ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
+
+ zio->io_vd = vd;
+ zio->io_cmd = cmd;
+ } else {
+ zio = zio_null(pio, spa, NULL, NULL, flags);
+
+ for (c = 0; c < vd->vdev_children; c++)
+ zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
+ done, private, priority, flags));
+ }
+
+ return (zio);
+}
+
+static void
+zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
+ int checksum)
+{
+ ASSERT(vd->vdev_children == 0);
+
+ ASSERT(size <= SPA_MAXBLOCKSIZE);
+ ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
+ ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
+
+ ASSERT(offset + size <= VDEV_LABEL_START_SIZE ||
+ offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
+ ASSERT3U(offset + size, <=, vd->vdev_psize);
+
+ BP_ZERO(bp);
+
+ BP_SET_LSIZE(bp, size);
+ BP_SET_PSIZE(bp, size);
+
+ BP_SET_CHECKSUM(bp, checksum);
+ BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+
+ if (checksum != ZIO_CHECKSUM_OFF)
+ ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0);
+}
+
+zio_t *
+zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+ void *data, int checksum, zio_done_func_t *done, void *private,
+ int priority, int flags)
+{
+ zio_t *zio;
+ blkptr_t blk;
+
+ zio_phys_bp_init(vd, &blk, offset, size, checksum);
+
+ zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
+ ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL,
+ ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
+
+ zio->io_vd = vd;
+ zio->io_offset = offset;
+
+ /*
+ * Work off our copy of the bp so the caller can free it.
+ */
+ zio->io_bp = &zio->io_bp_copy;
+
+ return (zio);
+}
+
+zio_t *
+zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+ void *data, int checksum, zio_done_func_t *done, void *private,
+ int priority, int flags)
+{
+ zio_block_tail_t *zbt;
+ void *wbuf;
+ zio_t *zio;
+ blkptr_t blk;
+
+ zio_phys_bp_init(vd, &blk, offset, size, checksum);
+
+ zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
+ ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL,
+ ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
+
+ zio->io_vd = vd;
+ zio->io_offset = offset;
+
+ zio->io_bp = &zio->io_bp_copy;
+ zio->io_checksum = checksum;
+
+ if (zio_checksum_table[checksum].ci_zbt) {
+ /*
+ * zbt checksums are necessarily destructive -- they modify
+ * one word of the write buffer to hold the verifier/checksum.
+ * Therefore, we must make a local copy in case the data is
+ * being written to multiple places.
+ */
+ wbuf = zio_buf_alloc(size);
+ bcopy(data, wbuf, size);
+ zio_push_transform(zio, wbuf, size, size);
+
+ zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1;
+ zbt->zbt_cksum = blk.blk_cksum;
+ }
+
+ return (zio);
+}
+
+/*
+ * Create a child I/O to do some work for us. It has no associated bp.
+ */
+zio_t *
+zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
+ void *data, uint64_t size, int type, int priority, int flags,
+ zio_done_func_t *done, void *private)
+{
+ uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
+ zio_t *cio;
+
+ if (type == ZIO_TYPE_READ && bp != NULL) {
+ /*
+ * If we have the bp, then the child should perform the
+ * checksum and the parent need not. This pushes error
+ * detection as close to the leaves as possible and
+ * eliminates redundant checksums in the interior nodes.
+ */
+ pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
+ zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
+ }
+
+ cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
+ done, private, type, priority,
+ (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags,
+ ZIO_STAGE_VDEV_IO_SETUP - 1, pipeline);
+
+ cio->io_vd = vd;
+ cio->io_offset = offset;
+
+ return (cio);
+}
+
+/*
+ * ==========================================================================
+ * Initiate I/O, either sync or async
+ * ==========================================================================
+ */
+int
+zio_wait(zio_t *zio)
+{
+ int error;
+
+ ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
+
+ zio->io_waiter = curthread;
+
+ zio_next_stage_async(zio);
+
+ mutex_enter(&zio->io_lock);
+ while (zio->io_stalled != ZIO_STAGE_DONE)
+ cv_wait(&zio->io_cv, &zio->io_lock);
+ mutex_exit(&zio->io_lock);
+
+ error = zio->io_error;
+
+ kmem_free(zio, sizeof (zio_t));
+
+ return (error);
+}
+
+void
+zio_nowait(zio_t *zio)
+{
+ zio_next_stage_async(zio);
+}
+
+/*
+ * ==========================================================================
+ * I/O pipeline interlocks: parent/child dependency scoreboarding
+ * ==========================================================================
+ */
+static void
+zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
+{
+ mutex_enter(&zio->io_lock);
+ if (*countp == 0) {
+ ASSERT(zio->io_stalled == 0);
+ mutex_exit(&zio->io_lock);
+ zio_next_stage(zio);
+ } else {
+ if (zio->io_stage == ZIO_STAGE_VDEV_IO_START)
+ zio_vdev_io_exit(zio);
+ zio->io_stalled = stage;
+ mutex_exit(&zio->io_lock);
+ }
+}
+
+static void
+zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
+{
+ zio_t *pio = zio->io_parent;
+
+ mutex_enter(&pio->io_lock);
+ if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
+ pio->io_error = zio->io_error;
+ if (--*countp == 0 && pio->io_stalled == stage) {
+ if (pio->io_stage == ZIO_STAGE_VDEV_IO_START)
+ zio_vdev_io_enter(pio);
+ pio->io_stalled = 0;
+ mutex_exit(&pio->io_lock);
+ zio_next_stage_async(pio);
+ } else {
+ mutex_exit(&pio->io_lock);
+ }
+}
+
+static void
+zio_wait_children_ready(zio_t *zio)
+{
+ zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
+ &zio->io_children_notready);
+}
+
+void
+zio_wait_children_done(zio_t *zio)
+{
+ zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
+ &zio->io_children_notdone);
+}
+
+static void
+zio_ready(zio_t *zio)
+{
+ zio_t *pio = zio->io_parent;
+
+ if (pio != NULL)
+ zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
+ &pio->io_children_notready);
+
+ if (zio->io_bp)
+ zio->io_bp_copy = *zio->io_bp;
+
+ zio_next_stage(zio);
+}
+
+static void
+zio_done(zio_t *zio)
+{
+ zio_t *pio = zio->io_parent;
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ vdev_t *vd = zio->io_vd;
+ char blkbuf[300];
+
+ ASSERT(zio->io_children_notready == 0);
+ ASSERT(zio->io_children_notdone == 0);
+
+ if (bp != NULL) {
+ ASSERT(bp->blk_pad[0] == 0);
+ ASSERT(bp->blk_pad[1] == 0);
+ ASSERT(bp->blk_pad[2] == 0);
+ ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0);
+ if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
+ !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
+ ASSERT(!BP_SHOULD_BYTESWAP(bp));
+ }
+
+ if (vd != NULL)
+ vdev_stat_update(zio);
+
+ if (zio->io_error) {
+ sprintf_blkptr(blkbuf, bp ? bp : &zio->io_bp_copy);
+ dprintf("ZFS: %s (%s on %s off %llx: zio %p %s): error %d\n",
+ zio->io_error == ECKSUM ? "bad checksum" : "I/O failure",
+ zio_type_name[zio->io_type],
+ vdev_description(vd),
+ (u_longlong_t)zio->io_offset,
+ zio, blkbuf, zio->io_error);
+ }
+
+ if (zio->io_numerrors != 0 && zio->io_type == ZIO_TYPE_WRITE) {
+ sprintf_blkptr(blkbuf, bp ? bp : &zio->io_bp_copy);
+ dprintf("ZFS: %s (%s on %s off %llx: zio %p %s): %d errors\n",
+ "partial write",
+ zio_type_name[zio->io_type],
+ vdev_description(vd),
+ (u_longlong_t)zio->io_offset,
+ zio, blkbuf, zio->io_numerrors);
+ }
+
+ if (zio->io_error && !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
+ sprintf_blkptr(blkbuf, bp ? bp : &zio->io_bp_copy);
+ panic("ZFS: %s (%s on %s off %llx: zio %p %s): error %d",
+ zio->io_error == ECKSUM ? "bad checksum" : "I/O failure",
+ zio_type_name[zio->io_type],
+ vdev_description(vd),
+ (u_longlong_t)zio->io_offset,
+ zio, blkbuf, zio->io_error);
+ }
+
+ zio_clear_transform_stack(zio);
+
+ if (zio->io_done)
+ zio->io_done(zio);
+
+ ASSERT(zio->io_delegate_list == NULL);
+ ASSERT(zio->io_delegate_next == NULL);
+
+ if (pio != NULL) {
+ zio_t *next, *prev;
+
+ mutex_enter(&pio->io_lock);
+ next = zio->io_sibling_next;
+ prev = zio->io_sibling_prev;
+ if (next != NULL)
+ next->io_sibling_prev = prev;
+ if (prev != NULL)
+ prev->io_sibling_next = next;
+ if (pio->io_child == zio)
+ pio->io_child = next;
+ mutex_exit(&pio->io_lock);
+
+ zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
+ &pio->io_children_notdone);
+ }
+
+ if (pio == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_HELD))
+ spa_config_exit(spa);
+
+ if (zio->io_waiter != NULL) {
+ mutex_enter(&zio->io_lock);
+ ASSERT(zio->io_stage == ZIO_STAGE_DONE);
+ zio->io_stalled = zio->io_stage;
+ cv_broadcast(&zio->io_cv);
+ mutex_exit(&zio->io_lock);
+ } else {
+ kmem_free(zio, sizeof (zio_t));
+ }
+}
+
+/*
+ * ==========================================================================
+ * Compression support
+ * ==========================================================================
+ */
+static void
+zio_write_compress(zio_t *zio)
+{
+ int compress = zio->io_compress;
+ blkptr_t *bp = zio->io_bp;
+ void *cbuf;
+ uint64_t lsize = zio->io_size;
+ uint64_t csize = lsize;
+ uint64_t cbufsize = 0;
+ int pass;
+
+ if (bp->blk_birth == zio->io_txg) {
+ /*
+ * We're rewriting an existing block, which means we're
+ * working on behalf of spa_sync(). For spa_sync() to
+ * converge, it must eventually be the case that we don't
+ * have to allocate new blocks. But compression changes
+ * the blocksize, which forces a reallocate, and makes
+ * convergence take longer. Therefore, after the first
+ * few passes, stop compressing to ensure convergence.
+ */
+ pass = spa_sync_pass(zio->io_spa);
+ if (pass > zio_sync_pass.zp_dontcompress)
+ compress = ZIO_COMPRESS_OFF;
+ } else {
+ ASSERT(BP_IS_HOLE(bp));
+ pass = 1;
+ }
+
+ if (compress != ZIO_COMPRESS_OFF)
+ if (!zio_compress_data(compress, zio->io_data, zio->io_size,
+ &cbuf, &csize, &cbufsize))
+ compress = ZIO_COMPRESS_OFF;
+
+ if (compress != ZIO_COMPRESS_OFF && csize != 0)
+ zio_push_transform(zio, cbuf, csize, cbufsize);
+
+ /*
+ * The final pass of spa_sync() must be all rewrites, but the first
+ * few passes offer a trade-off: allocating blocks defers convergence,
+ * but newly allocated blocks are sequential, so they can be written
+ * to disk faster. Therefore, we allow the first few passes of
+ * spa_sync() to reallocate new blocks, but force rewrites after that.
+ * There should only be a handful of blocks after pass 1 in any case.
+ */
+ if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
+ pass > zio_sync_pass.zp_rewrite) {
+ ASSERT(csize != 0);
+ ASSERT3U(BP_GET_COMPRESS(bp), ==, compress);
+ ASSERT3U(BP_GET_LSIZE(bp), ==, lsize);
+
+ zio->io_pipeline = ZIO_REWRITE_PIPELINE;
+ } else {
+ if (bp->blk_birth == zio->io_txg) {
+ ASSERT3U(BP_GET_LSIZE(bp), ==, lsize);
+ bzero(bp, sizeof (blkptr_t));
+ }
+ if (csize == 0) {
+ BP_ZERO(bp);
+ zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
+ } else {
+ BP_SET_LSIZE(bp, lsize);
+ BP_SET_PSIZE(bp, csize);
+ BP_SET_COMPRESS(bp, compress);
+ zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE;
+ }
+ }
+
+ zio_next_stage(zio);
+}
+
+static void
+zio_read_decompress(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ void *data;
+ uint64_t size;
+ uint64_t bufsize;
+ int compress = BP_GET_COMPRESS(bp);
+
+ ASSERT(compress != ZIO_COMPRESS_OFF);
+
+ zio_pop_transform(zio, &data, &size, &bufsize);
+
+ if (zio_decompress_data(compress, data, size,
+ zio->io_data, zio->io_size))
+ zio->io_error = EIO;
+
+ zio_buf_free(data, bufsize);
+
+ zio_next_stage(zio);
+}
+
+/*
+ * ==========================================================================
+ * Gang block support
+ * ==========================================================================
+ */
+static void
+zio_gang_pipeline(zio_t *zio)
+{
+ /*
+ * By default, the pipeline assumes that we're dealing with a gang
+ * block. If we're not, strip out any gang-specific stages.
+ */
+ if (!DVA_GET_GANG(ZIO_GET_DVA(zio)))
+ zio->io_pipeline &= ~ZIO_GANG_STAGES;
+
+ zio_next_stage(zio);
+}
+
+static void
+zio_gang_byteswap(zio_t *zio)
+{
+ ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
+
+ if (BP_SHOULD_BYTESWAP(zio->io_bp))
+ byteswap_uint64_array(zio->io_data, zio->io_size);
+}
+
+static void
+zio_get_gang_header(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ uint64_t gsize = SPA_GANGBLOCKSIZE;
+ void *gbuf = zio_buf_alloc(gsize);
+
+ ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+
+ zio_push_transform(zio, gbuf, gsize, gsize);
+
+ zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize,
+ NULL, NULL, ZIO_TYPE_READ, zio->io_priority,
+ zio->io_flags & ZIO_FLAG_GANG_INHERIT,
+ ZIO_STAGE_OPEN, ZIO_READ_PIPELINE));
+
+ zio_wait_children_done(zio);
+}
+
+static void
+zio_read_gang_members(zio_t *zio)
+{
+ zio_gbh_phys_t *gbh;
+ uint64_t gsize, gbufsize, loff, lsize;
+ int i;
+
+ ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+
+ zio_gang_byteswap(zio);
+ zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
+
+ for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
+ blkptr_t *gbp = &gbh->zg_blkptr[i];
+ lsize = BP_GET_PSIZE(gbp);
+
+ ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
+ ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
+ ASSERT3U(loff + lsize, <=, zio->io_size);
+ ASSERT(i < SPA_GBH_NBLKPTRS);
+ ASSERT(!BP_IS_HOLE(gbp));
+
+ zio_nowait(zio_read(zio, zio->io_spa, gbp,
+ (char *)zio->io_data + loff, lsize, NULL, NULL,
+ zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT));
+ }
+
+ zio_buf_free(gbh, gbufsize);
+ zio_wait_children_done(zio);
+}
+
+static void
+zio_rewrite_gang_members(zio_t *zio)
+{
+ zio_gbh_phys_t *gbh;
+ uint64_t gsize, gbufsize, loff, lsize;
+ int i;
+
+ ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+ ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
+
+ zio_gang_byteswap(zio);
+ zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
+
+ ASSERT(gsize == gbufsize);
+
+ for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
+ blkptr_t *gbp = &gbh->zg_blkptr[i];
+ lsize = BP_GET_PSIZE(gbp);
+
+ ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
+ ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
+ ASSERT3U(loff + lsize, <=, zio->io_size);
+ ASSERT(i < SPA_GBH_NBLKPTRS);
+ ASSERT(!BP_IS_HOLE(gbp));
+
+ zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
+ zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
+ NULL, NULL, zio->io_priority, zio->io_flags));
+ }
+
+ zio_push_transform(zio, gbh, gsize, gbufsize);
+ zio_wait_children_ready(zio);
+}
+
+static void
+zio_free_gang_members(zio_t *zio)
+{
+ zio_gbh_phys_t *gbh;
+ uint64_t gsize, gbufsize;
+ int i;
+
+ ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+
+ zio_gang_byteswap(zio);
+ zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
+
+ for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
+ blkptr_t *gbp = &gbh->zg_blkptr[i];
+
+ if (BP_IS_HOLE(gbp))
+ continue;
+ zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
+ gbp, NULL, NULL));
+ }
+
+ zio_buf_free(gbh, gbufsize);
+ zio_next_stage(zio);
+}
+
+static void
+zio_claim_gang_members(zio_t *zio)
+{
+ zio_gbh_phys_t *gbh;
+ uint64_t gsize, gbufsize;
+ int i;
+
+ ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+
+ zio_gang_byteswap(zio);
+ zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
+
+ for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
+ blkptr_t *gbp = &gbh->zg_blkptr[i];
+ if (BP_IS_HOLE(gbp))
+ continue;
+ zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg,
+ gbp, NULL, NULL));
+ }
+
+ zio_buf_free(gbh, gbufsize);
+ zio_next_stage(zio);
+}
+
+static void
+zio_write_allocate_gang_member_done(zio_t *zio)
+{
+ zio_t *pio = zio->io_parent;
+ dva_t *cdva = ZIO_GET_DVA(zio);
+ dva_t *pdva = ZIO_GET_DVA(pio);
+ uint64_t asize;
+
+ ASSERT(DVA_GET_GANG(pdva));
+
+ /* XXBP - Need to be careful here with multiple DVAs */
+ mutex_enter(&pio->io_lock);
+ asize = DVA_GET_ASIZE(pdva);
+ asize += DVA_GET_ASIZE(cdva);
+ DVA_SET_ASIZE(pdva, asize);
+ mutex_exit(&pio->io_lock);
+}
+
+static void
+zio_write_allocate_gang_members(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ dva_t *dva = ZIO_GET_DVA(zio);
+ zio_gbh_phys_t *gbh;
+ uint64_t resid = zio->io_size;
+ uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
+ uint64_t gsize, loff, lsize;
+ uint32_t gbps_left;
+ int error;
+ int i;
+
+ gsize = SPA_GANGBLOCKSIZE;
+ gbps_left = SPA_GBH_NBLKPTRS;
+
+ error = metaslab_alloc(zio->io_spa, gsize, dva, zio->io_txg);
+ if (error == ENOSPC)
+ panic("can't allocate gang block header");
+ ASSERT(error == 0);
+
+ DVA_SET_GANG(dva, 1);
+
+ bp->blk_birth = zio->io_txg;
+
+ gbh = zio_buf_alloc(gsize);
+ bzero(gbh, gsize);
+
+ for (loff = 0, i = 0; loff != zio->io_size;
+ loff += lsize, resid -= lsize, gbps_left--, i++) {
+ blkptr_t *gbp = &gbh->zg_blkptr[i];
+ dva = &gbp->blk_dva[0];
+
+ ASSERT(gbps_left != 0);
+ maxalloc = MIN(maxalloc, resid);
+
+ while (resid <= maxalloc * gbps_left) {
+ error = metaslab_alloc(zio->io_spa, maxalloc, dva,
+ zio->io_txg);
+ if (error == 0)
+ break;
+ ASSERT3U(error, ==, ENOSPC);
+ if (maxalloc == SPA_MINBLOCKSIZE)
+ panic("really out of space");
+ maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
+ }
+
+ if (resid <= maxalloc * gbps_left) {
+ lsize = maxalloc;
+ BP_SET_LSIZE(gbp, lsize);
+ BP_SET_PSIZE(gbp, lsize);
+ BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
+ gbp->blk_birth = zio->io_txg;
+ zio_nowait(zio_rewrite(zio, zio->io_spa,
+ zio->io_checksum, zio->io_txg, gbp,
+ (char *)zio->io_data + loff, lsize,
+ zio_write_allocate_gang_member_done, NULL,
+ zio->io_priority, zio->io_flags));
+ } else {
+ lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
+ ASSERT(lsize != SPA_MINBLOCKSIZE);
+ zio_nowait(zio_write_allocate(zio, zio->io_spa,
+ zio->io_checksum, zio->io_txg, gbp,
+ (char *)zio->io_data + loff, lsize,
+ zio_write_allocate_gang_member_done, NULL,
+ zio->io_priority, zio->io_flags));
+ }
+ }
+
+ ASSERT(resid == 0 && loff == zio->io_size);
+
+ zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;
+
+ zio_push_transform(zio, gbh, gsize, gsize);
+ zio_wait_children_done(zio);
+}
+
+/*
+ * ==========================================================================
+ * Allocate and free blocks
+ * ==========================================================================
+ */
+static void
+zio_dva_allocate(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ dva_t *dva = ZIO_GET_DVA(zio);
+ int error;
+
+ ASSERT(BP_IS_HOLE(bp));
+
+ /* For testing, make some blocks above a certain size be gang blocks */
+ if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) {
+ zio_write_allocate_gang_members(zio);
+ return;
+ }
+
+ ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
+
+ error = metaslab_alloc(zio->io_spa, zio->io_size, dva, zio->io_txg);
+
+ if (error == 0) {
+ bp->blk_birth = zio->io_txg;
+ } else if (error == ENOSPC) {
+ if (zio->io_size == SPA_MINBLOCKSIZE)
+ panic("really, truly out of space");
+ zio_write_allocate_gang_members(zio);
+ return;
+ } else {
+ zio->io_error = error;
+ }
+ zio_next_stage(zio);
+}
+
+/*
+ * Return this I/O's DVA to the metaslab allocator and clear the block
+ * pointer so it reads as a hole again.
+ */
+static void
+zio_dva_free(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+	dva_t *dva = ZIO_GET_DVA(zio);
+
+	ASSERT(!BP_IS_HOLE(bp));
+
+	metaslab_free(zio->io_spa, dva, zio->io_txg);
+
+	BP_ZERO(bp);
+
+	zio_next_stage(zio);
+}
+
+/*
+ * Claim an already-allocated DVA (e.g. during intent-log replay) so the
+ * metaslab allocator knows the space is in use.  Any failure is recorded
+ * in io_error rather than handled here.
+ */
+static void
+zio_dva_claim(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+	dva_t *dva = ZIO_GET_DVA(zio);
+
+	ASSERT(!BP_IS_HOLE(bp));
+
+	zio->io_error = metaslab_claim(zio->io_spa, dva, zio->io_txg);
+
+	zio_next_stage(zio);
+}
+
+/*
+ * Translate the I/O's DVA into a (vdev, offset) pair for the device layer.
+ * Sets io_error to ENXIO for an unknown top-level vdev, or EOVERFLOW if
+ * the request extends past the vdev's allocatable size.
+ */
+static void
+zio_dva_translate(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	dva_t *dva = ZIO_GET_DVA(zio);
+	uint64_t vdev = DVA_GET_VDEV(dva);
+	uint64_t offset = DVA_GET_OFFSET(dva);
+
+	ASSERT3U(zio->io_size, ==, ZIO_GET_IOSIZE(zio));
+
+	zio->io_offset = offset;
+
+	if ((zio->io_vd = vdev_lookup_top(spa, vdev)) == NULL)
+		zio->io_error = ENXIO;
+	else if (offset + zio->io_size > zio->io_vd->vdev_asize)
+		zio->io_error = EOVERFLOW;
+
+	zio_next_stage(zio);
+}
+
+/*
+ * ==========================================================================
+ * Read and write to physical devices
+ * ==========================================================================
+ */
+/*
+ * Register this zio on its top-level vdev's pending-I/O list under
+ * vdev_io_lock.  The list_next == NULL assertion verifies the zio is
+ * not already queued.
+ */
+static void
+zio_vdev_io_enter(zio_t *zio)
+{
+	vdev_t *tvd = zio->io_vd->vdev_top;
+
+	mutex_enter(&tvd->vdev_io_lock);
+	ASSERT(zio->io_pending.list_next == NULL);
+	list_insert_tail(&tvd->vdev_io_pending, zio);
+	mutex_exit(&tvd->vdev_io_lock);
+}
+
+/*
+ * Remove this zio from its top-level vdev's pending-I/O list; when the
+ * list drains, wake anyone blocked in cv_wait on vdev_io_cv (e.g. code
+ * waiting for the vdev to quiesce).
+ */
+static void
+zio_vdev_io_exit(zio_t *zio)
+{
+	vdev_t *tvd = zio->io_vd->vdev_top;
+
+	mutex_enter(&tvd->vdev_io_lock);
+	ASSERT(zio->io_pending.list_next != NULL);
+	list_remove(&tvd->vdev_io_pending, zio);
+	if (list_head(&tvd->vdev_io_pending) == NULL)
+		cv_broadcast(&tvd->vdev_io_cv);
+	mutex_exit(&tvd->vdev_io_lock);
+}
+
+/*
+ * Taskq callback: after a short delay, reopen the top-level vdev and
+ * re-dispatch every zio that vdev_reopen() hands back on its retry
+ * queue.  The io_retry_next links are consumed (and cleared) here.
+ */
+static void
+zio_vdev_io_retry(void *vdarg)
+{
+	vdev_t *vd = vdarg;
+	zio_t *zio, *zq;
+
+	ASSERT(vd == vd->vdev_top);
+
+	/* XXPOLICY */
+	delay(hz);
+
+	vdev_reopen(vd, &zq);
+
+	while ((zio = zq) != NULL) {
+		zq = zio->io_retry_next;
+		zio->io_retry_next = NULL;
+		dprintf("async retry #%d for I/O to %s offset %llx\n",
+		    zio->io_retries, vdev_description(vd), zio->io_offset);
+		zio_next_stage_async(zio);
+	}
+}
+
+/*
+ * Prepare a zio for device I/O: mark first attempts failfast, convert a
+ * logical offset to a physical one on leaf vdevs (skipping the front
+ * label region), and register on the pending-I/O list.
+ */
+static void
+zio_vdev_io_setup(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+
+	/* XXPOLICY */
+	if (zio->io_retries == 0 && vd == vd->vdev_top)
+		zio->io_flags |= ZIO_FLAG_FAILFAST;
+
+	/* Leaf vdevs: shift past the label area exactly once. */
+	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) {
+		zio->io_flags |= ZIO_FLAG_PHYSICAL;
+		zio->io_offset += VDEV_LABEL_START_SIZE;
+	}
+
+	zio_vdev_io_enter(zio);
+
+	zio_next_stage(zio);
+}
+
+/*
+ * Issue the I/O to the vdev layer after sanity-checking alignment
+ * against the vdev's ashift.  This stage does not advance the pipeline
+ * itself; the completion interrupt does.
+ */
+static void
+zio_vdev_io_start(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+
+	ASSERT(P2PHASE(zio->io_offset, 1ULL << zio->io_vd->vdev_ashift) == 0);
+	ASSERT(P2PHASE(zio->io_size, 1ULL << zio->io_vd->vdev_ashift) == 0);
+	ASSERT(bp == NULL || ZIO_GET_IOSIZE(zio) == zio->io_size);
+	ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
+
+	vdev_io_start(zio);
+
+	/* zio_next_stage_async() gets called from io completion interrupt */
+}
+
+/* Pipeline stage: hand I/O completion processing to the vdev layer. */
+static void
+zio_vdev_io_done(zio_t *zio)
+{
+	vdev_io_done(zio);
+}
+
+/* XXPOLICY */
+/*
+ * Retry policy: decide whether a failed I/O should be re-driven.
+ * Only failed, non-delegated I/Os at a top-level vdev are eligible;
+ * speculative/can-fail I/Os give up after 300 retries, and checksum or
+ * ENXIO errors give up after the first retry.
+ */
+static boolean_t
+zio_should_retry(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+
+	if (zio->io_error == 0)
+		return (B_FALSE);
+	if (zio->io_delegate_list != NULL)
+		return (B_FALSE);
+	if (vd != vd->vdev_top)
+		return (B_FALSE);
+	if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
+		return (B_FALSE);
+	if (zio->io_retries > 300 &&
+	    (zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL)))
+		return (B_FALSE);
+	if (zio->io_retries > 1 &&
+	    (zio->io_error == ECKSUM || zio->io_error == ENXIO))
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+/*
+ * Post-I/O assessment: dequeue from the pending list, and either retry
+ * a failed I/O (immediately the first time; via the vdev's retry queue
+ * and a taskq-driven vdev_reopen() thereafter) or continue the pipeline.
+ */
+static void
+zio_vdev_io_assess(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_t *tvd = vd->vdev_top;
+
+	zio_vdev_io_exit(zio);
+
+	ASSERT(zio->io_vsd == NULL);
+
+	/*
+	 * If the I/O failed, determine whether we should attempt to retry it.
+	 */
+	/* XXPOLICY */
+	if (zio_should_retry(zio)) {
+		zio_t *zq;
+
+		ASSERT(tvd == vd);
+		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE));
+
+		zio->io_retries++;
+		zio->io_error = 0;
+		zio->io_flags &= ZIO_FLAG_VDEV_INHERIT;
+		/* XXPOLICY */
+		zio->io_flags &= ~ZIO_FLAG_FAILFAST;
+		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+		/* Rewind so the next stage advance re-enters VDEV_IO_SETUP. */
+		zio->io_stage = ZIO_STAGE_VDEV_IO_SETUP - 1;
+
+		dprintf("retry #%d for %s to %s offset %llx\n",
+		    zio->io_retries, zio_type_name[zio->io_type],
+		    vdev_description(vd), zio->io_offset);
+
+		/*
+		 * If this is the first retry, do it immediately.
+		 */
+		/* XXPOLICY */
+		if (zio->io_retries == 1) {
+			zio_next_stage_async(zio);
+			return;
+		}
+
+		/*
+		 * This was not the first retry, so go through the
+		 * longer enqueue/delay/vdev_reopen() process.
+		 */
+		mutex_enter(&tvd->vdev_io_lock);
+		ASSERT(zio->io_retry_next == NULL);
+		zio->io_retry_next = zq = tvd->vdev_io_retry;
+		tvd->vdev_io_retry = zio;
+		mutex_exit(&tvd->vdev_io_lock);
+		/* Only the thread that found the queue empty dispatches. */
+		if (zq == NULL)
+			(void) taskq_dispatch(
+			    tvd->vdev_spa->spa_vdev_retry_taskq,
+			    zio_vdev_io_retry, tvd, TQ_SLEEP);
+		return;
+	}
+
+	zio_next_stage(zio);
+}
+
+/*
+ * Rewind a successful-so-far zio so VDEV_IO_START runs again
+ * (caller arranges the re-dispatch).
+ */
+void
+zio_vdev_io_reissue(zio_t *zio)
+{
+	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
+	ASSERT(zio->io_error == 0);
+
+	zio->io_stage--;
+}
+
+/* Rewind a zio so the VDEV_IO_DONE stage will be executed again. */
+void
+zio_vdev_io_redone(zio_t *zio)
+{
+	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
+
+	zio->io_stage--;
+}
+
+/*
+ * Skip the device I/O entirely: mark the zio bypassed and jump the
+ * pipeline straight to the VDEV_IO_ASSESS stage.
+ */
+void
+zio_vdev_io_bypass(zio_t *zio)
+{
+	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
+	ASSERT(zio->io_error == 0);
+
+	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
+	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
+}
+
+/*
+ * ==========================================================================
+ * Generate and verify checksums
+ * ==========================================================================
+ */
+/*
+ * Record the checksum algorithm and host byte order in the bp, then
+ * compute the block's checksum into bp->blk_cksum.
+ */
+static void
+zio_checksum_generate(zio_t *zio)
+{
+	int checksum = zio->io_checksum;
+	blkptr_t *bp = zio->io_bp;
+
+	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
+
+	BP_SET_CHECKSUM(bp, checksum);
+	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+
+	zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);
+
+	zio_next_stage(zio);
+}
+
+/*
+ * Checksum a gang header.  The self-describing verifier (vdev, offset,
+ * birth txg) is seeded into the header's embedded block tail before the
+ * checksum is computed over the whole header.
+ */
+static void
+zio_gang_checksum_generate(zio_t *zio)
+{
+	zio_cksum_t zc;
+	zio_gbh_phys_t *gbh = zio->io_data;
+
+	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
+	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+
+	zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum);
+
+	zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);
+
+	zio_next_stage(zio);
+}
+
+/*
+ * Verify the data just read against the expected checksum; a mismatch
+ * lands in io_error (typically ECKSUM) for the assess stage to act on.
+ */
+static void
+zio_checksum_verify(zio_t *zio)
+{
+	if (zio->io_bp != NULL) {
+		zio->io_error = zio_checksum_error(zio);
+		if (zio->io_error) {
+			dprintf("bad checksum on vdev %s\n",
+			    vdev_description(zio->io_vd));
+		}
+	}
+
+	zio_next_stage(zio);
+}
+
+/*
+ * Called by RAID-Z to ensure we don't compute the checksum twice.
+ */
+/* Drop the CHECKSUM_VERIFY stage from this zio's pipeline mask. */
+void
+zio_checksum_verified(zio_t *zio)
+{
+	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
+}
+
+/*
+ * Set the external verifier for a gang block based on stuff in the bp
+ */
+void
+zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
+{
+	dva_t *dva = ZIO_GET_DVA(zio);
+
+	/* Verifier = (vdev, offset, birth txg, 0): unique per gang header. */
+	zcp->zc_word[0] = DVA_GET_VDEV(dva);
+	zcp->zc_word[1] = DVA_GET_OFFSET(dva);
+	zcp->zc_word[2] = zio->io_bp->blk_birth;
+	zcp->zc_word[3] = 0;
+}
+
+/*
+ * ==========================================================================
+ * Define the pipeline
+ * ==========================================================================
+ */
+typedef void zio_pipe_stage_t(zio_t *zio);
+
+/* Sentinel pipeline entry: reaching stage 0 or DONE+1 is a fatal bug. */
+static void
+zio_badop(zio_t *zio)
+{
+	panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio);
+}
+
+/*
+ * Pipeline dispatch table, indexed by io_stage.  Entry order must match
+ * the ZIO_STAGE_* enumeration exactly; zio_badop bounds both ends so a
+ * stray stage value panics instead of corrupting state.
+ */
+zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
+	zio_badop,
+	zio_wait_children_ready,
+	zio_write_compress,
+	zio_checksum_generate,
+	zio_gang_pipeline,
+	zio_get_gang_header,
+	zio_rewrite_gang_members,
+	zio_free_gang_members,
+	zio_claim_gang_members,
+	zio_dva_allocate,
+	zio_dva_free,
+	zio_dva_claim,
+	zio_gang_checksum_generate,
+	zio_ready,
+	zio_dva_translate,
+	zio_vdev_io_setup,
+	zio_vdev_io_start,
+	zio_vdev_io_done,
+	zio_vdev_io_assess,
+	zio_wait_children_done,
+	zio_checksum_verify,
+	zio_read_gang_members,
+	zio_read_decompress,
+	zio_done,
+	zio_badop
+};
+
+/*
+ * Move an I/O to the next stage of the pipeline and execute that stage.
+ * There's no locking on io_stage because there's no legitimate way for
+ * multiple threads to be attempting to process the same I/O.
+ */
+void
+zio_next_stage(zio_t *zio)
+{
+	uint32_t pipeline = zio->io_pipeline;
+
+	ASSERT(!MUTEX_HELD(&zio->io_lock));
+
+	if (zio->io_error) {
+		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
+		    zio, vdev_description(zio->io_vd),
+		    zio->io_offset, zio->io_stage, zio->io_error);
+		/*
+		 * Outside the vdev-I/O stages an error short-circuits the
+		 * pipeline down to the error-handling stages only.
+		 */
+		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
+			pipeline &= ZIO_ERROR_PIPELINE_MASK;
+	}
+
+	/* Advance to the next stage whose bit is set in the pipeline mask. */
+	while (((1U << ++zio->io_stage) & pipeline) == 0)
+		continue;
+
+	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
+	ASSERT(zio->io_stalled == 0);
+
+	zio_pipeline[zio->io_stage](zio);
+}
+
+/*
+ * Like zio_next_stage(), but stages marked in io_async_stages are handed
+ * to a taskq (issue taskqs before VDEV_IO_DONE, intr taskqs after)
+ * instead of being run inline.
+ */
+void
+zio_next_stage_async(zio_t *zio)
+{
+	taskq_t *tq;
+	uint32_t pipeline = zio->io_pipeline;
+
+	ASSERT(!MUTEX_HELD(&zio->io_lock));
+
+	if (zio->io_error) {
+		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
+		    zio, vdev_description(zio->io_vd),
+		    zio->io_offset, zio->io_stage, zio->io_error);
+		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
+			pipeline &= ZIO_ERROR_PIPELINE_MASK;
+	}
+
+	while (((1U << ++zio->io_stage) & pipeline) == 0)
+		continue;
+
+	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
+	ASSERT(zio->io_stalled == 0);
+
+	/*
+	 * For performance, we'll probably want two sets of task queues:
+	 * per-CPU issue taskqs and per-CPU completion taskqs.  The per-CPU
+	 * part is for read performance: since we have to make a pass over
+	 * the data to checksum it anyway, we want to do this on the same CPU
+	 * that issued the read, because (assuming CPU scheduling affinity)
+	 * that thread is probably still there.  Getting this optimization
+	 * right avoids performance-hostile cache-to-cache transfers.
+	 *
+	 * Note that having two sets of task queues is also necessary for
+	 * correctness: if all of the issue threads get bogged down waiting
+	 * for dependent reads (e.g. metaslab freelist) to complete, then
+	 * there won't be any threads available to service I/O completion
+	 * interrupts.
+	 */
+	if ((1U << zio->io_stage) & zio->io_async_stages) {
+		if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE)
+			tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
+		else
+			tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type];
+		(void) taskq_dispatch(tq,
+		    (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
+	} else {
+		zio_pipeline[zio->io_stage](zio);
+	}
+}
+
+/*
+ * Try to allocate an intent log block. Return 0 on success, errno on failure.
+ */
+int
+zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp,
+    uint64_t txg)
+{
+	int error;
+
+	/* Hold the config lock across the allocation. */
+	spa_config_enter(spa, RW_READER);
+
+	BP_ZERO(bp);
+
+	error = metaslab_alloc(spa, size, BP_IDENTITY(bp), txg);
+
+	if (error == 0) {
+		/* Fill in the bp for an uncompressed intent-log block. */
+		BP_SET_CHECKSUM(bp, checksum);
+		BP_SET_LSIZE(bp, size);
+		BP_SET_PSIZE(bp, size);
+		BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+		BP_SET_TYPE(bp, DMU_OT_INTENT_LOG);
+		BP_SET_LEVEL(bp, 0);
+		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+		bp->blk_birth = txg;
+	}
+
+	spa_config_exit(spa);
+
+	return (error);
+}
+
+/*
+ * Free an intent log block. We know it can't be a gang block, so there's
+ * nothing to do except metaslab_free() it.
+ */
+void
+zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
+{
+	/* Intent-log blocks are never gang blocks. */
+	ASSERT(DVA_GET_GANG(BP_IDENTITY(bp)) == 0);
+
+	dprintf_bp(bp, "txg %llu: ", txg);
+
+	spa_config_enter(spa, RW_READER);
+
+	metaslab_free(spa, BP_IDENTITY(bp), txg);
+
+	spa_config_exit(spa);
+}
diff --git a/usr/src/uts/common/fs/zfs/zio_checksum.c b/usr/src/uts/common/fs/zfs/zio_checksum.c
new file mode 100644
index 0000000000..dc31527ce8
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zio_checksum.c
@@ -0,0 +1,174 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+
+/*
+ * Checksum vectors.
+ *
+ * In the SPA, everything is checksummed. We support checksum vectors
+ * for three distinct reasons:
+ *
+ * 1. Different kinds of data need different levels of protection.
+ * For SPA metadata, we always want a very strong checksum.
+ * For user data, we let users make the trade-off between speed
+ * and checksum strength.
+ *
+ * 2. Cryptographic hash and MAC algorithms are an area of active research.
+ * It is likely that in future hash functions will be at least as strong
+ * as current best-of-breed, and may be substantially faster as well.
+ * We want the ability to take advantage of these new hashes as soon as
+ * they become available.
+ *
+ * 3. If someone develops hardware that can compute a strong hash quickly,
+ * we want the ability to take advantage of that hardware.
+ *
+ * Of course, we don't want a checksum upgrade to invalidate existing
+ * data, so we store the checksum *function* in five bits of the DVA.
+ * This gives us room for up to 32 different checksum functions.
+ *
+ * When writing a block, we always checksum it with the latest-and-greatest
+ * checksum function of the appropriate strength. When reading a block,
+ * we compare the expected checksum against the actual checksum, which we
+ * compute via the checksum function specified in the DVA encoding.
+ */
+
+/* "off" checksum vector: always produces an all-zero checksum. */
+/*ARGSUSED*/
+static void
+zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
+}
+
+/*
+ * Per-algorithm checksum vectors, indexed by ZIO_CHECKSUM_* value.
+ * Columns: native func, byteswap func, ci_correctable, ci_zbt
+ * (checksum stored in an embedded block tail), name.
+ */
+zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
+	NULL,			NULL,			0, 0,	"inherit",
+	NULL,			NULL,			0, 0,	"on",
+	zio_checksum_off,	zio_checksum_off,	0, 0,	"off",
+	zio_checksum_SHA256,	zio_checksum_SHA256,	1, 1,	"label",
+	zio_checksum_SHA256,	zio_checksum_SHA256,	1, 1,	"gang_header",
+	fletcher_2_native,	fletcher_2_byteswap,	0, 1,	"zilog",
+	fletcher_2_native,	fletcher_2_byteswap,	0, 0,	"fletcher2",
+	fletcher_4_native,	fletcher_4_byteswap,	1, 0,	"fletcher4",
+	zio_checksum_SHA256,	zio_checksum_SHA256,	1, 0,	"SHA256",
+};
+
+/*
+ * Resolve a dataset's checksum setting: "inherit" takes the parent's
+ * (already-resolved) value, "on" maps to the current default algorithm,
+ * and anything else stands as-is.
+ */
+uint8_t
+zio_checksum_select(uint8_t child, uint8_t parent)
+{
+	uint8_t result = child;
+
+	ASSERT(child < ZIO_CHECKSUM_FUNCTIONS);
+	ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS);
+	ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
+
+	if (result == ZIO_CHECKSUM_INHERIT)
+		result = parent;
+	else if (result == ZIO_CHECKSUM_ON)
+		result = ZIO_CHECKSUM_ON_VALUE;
+
+	return (result);
+}
+
+/*
+ * Generate the checksum.
+ */
+void
+zio_checksum(uint_t checksum, zio_cksum_t *zcp, void *data, uint64_t size)
+{
+	zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
+	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+	zio_cksum_t zbt_cksum;
+
+	ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
+	ASSERT(ci->ci_func[0] != NULL);
+
+	if (ci->ci_zbt) {
+		/*
+		 * Embedded-checksum layout: save whatever was in the block
+		 * tail into *zcp, stamp the magic, checksum the whole block
+		 * (tail included), then store the result back in the tail.
+		 */
+		*zcp = zbt->zbt_cksum;
+		zbt->zbt_magic = ZBT_MAGIC;
+		ci->ci_func[0](data, size, &zbt_cksum);
+		zbt->zbt_cksum = zbt_cksum;
+	} else {
+		/* Plain layout: checksum goes to the caller's zcp (the bp). */
+		ci->ci_func[0](data, size, zcp);
+	}
+}
+
+/*
+ * Verify a block's checksum.  Returns 0 on match, ECKSUM on mismatch,
+ * EINVAL for an unknown/unusable checksum function.  For embedded-tail
+ * (ci_zbt) checksums, the expected value lives in the block tail and the
+ * verifier (from the bp, or computed for gang headers) is swapped in
+ * before recomputing; the tail is restored afterwards.
+ */
+int
+zio_checksum_error(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+	dva_t *dva = ZIO_GET_DVA(zio);
+	zio_cksum_t zc = bp->blk_cksum;
+	uint_t checksum = DVA_GET_GANG(dva) ? ZIO_CHECKSUM_GANG_HEADER :
+	    BP_GET_CHECKSUM(bp);
+	int byteswap = BP_SHOULD_BYTESWAP(bp);
+	void *data = zio->io_data;
+	uint64_t size = zio->io_size;
+	zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
+	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+	zio_cksum_t actual_cksum, expected_cksum;
+
+	if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
+		return (EINVAL);
+
+	if (ci->ci_zbt) {
+		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
+			zio_set_gang_verifier(zio, &zc);
+
+		if (zbt->zbt_magic == BSWAP_64(ZBT_MAGIC)) {
+			/* Block was written on the other byte order. */
+			expected_cksum = zbt->zbt_cksum;
+			byteswap_uint64_array(&expected_cksum,
+			    sizeof (zio_cksum_t));
+			zbt->zbt_cksum = zc;
+			byteswap_uint64_array(&zbt->zbt_cksum,
+			    sizeof (zio_cksum_t));
+			ci->ci_func[1](data, size, &actual_cksum);
+			zbt->zbt_cksum = expected_cksum;
+			byteswap_uint64_array(&zbt->zbt_cksum,
+			    sizeof (zio_cksum_t));
+		} else {
+			expected_cksum = zbt->zbt_cksum;
+			zbt->zbt_cksum = zc;
+			ci->ci_func[0](data, size, &actual_cksum);
+			zbt->zbt_cksum = expected_cksum;
+		}
+		zc = expected_cksum;
+	} else {
+		ASSERT(!DVA_GET_GANG(dva));
+		ci->ci_func[byteswap](data, size, &actual_cksum);
+	}
+
+	/* Branch-free four-word comparison. */
+	if ((actual_cksum.zc_word[0] - zc.zc_word[0]) |
+	    (actual_cksum.zc_word[1] - zc.zc_word[1]) |
+	    (actual_cksum.zc_word[2] - zc.zc_word[2]) |
+	    (actual_cksum.zc_word[3] - zc.zc_word[3]))
+		return (ECKSUM);
+
+	return (0);
+}
diff --git a/usr/src/uts/common/fs/zfs/zio_compress.c b/usr/src/uts/common/fs/zfs/zio_compress.c
new file mode 100644
index 0000000000..51d85172bb
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zio_compress.c
@@ -0,0 +1,134 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/compress.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/zio_compress.h>
+
+/*
+ * Compression vectors.
+ */
+
+/*
+ * Per-algorithm compression vectors, indexed by ZIO_COMPRESS_* value.
+ * Columns: compress func, decompress func, name.
+ */
+zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
+	NULL,			NULL,			"inherit",
+	NULL,			NULL,			"on",
+	NULL,			NULL,			"uncompressed",
+	lzjb_compress,		lzjb_decompress,	"lzjb",
+};
+
+/*
+ * Resolve a dataset's compression setting: "inherit" takes the parent's
+ * (already-resolved) value, "on" maps to the current default algorithm,
+ * and anything else stands as-is.
+ */
+uint8_t
+zio_compress_select(uint8_t child, uint8_t parent)
+{
+	uint8_t result = child;
+
+	ASSERT(child < ZIO_COMPRESS_FUNCTIONS);
+	ASSERT(parent < ZIO_COMPRESS_FUNCTIONS);
+	ASSERT(parent != ZIO_COMPRESS_INHERIT && parent != ZIO_COMPRESS_ON);
+
+	if (result == ZIO_COMPRESS_INHERIT)
+		result = parent;
+	else if (result == ZIO_COMPRESS_ON)
+		result = ZIO_COMPRESS_ON_VALUE;
+
+	return (result);
+}
+
+/*
+ * Compress srcsize bytes at src.  Returns 1 with a newly allocated
+ * buffer in *destp (its payload size in *destsizep, allocation size in
+ * *destbufsizep) on success; all-zero input succeeds with *destsizep = 0
+ * and no buffer.  Returns 0 if the data doesn't compress by at least
+ * 12.5%.  On success the caller owns *destp (free with zio_buf_free).
+ */
+int
+zio_compress_data(int cpfunc, void *src, uint64_t srcsize, void **destp,
+    uint64_t *destsizep, uint64_t *destbufsizep)
+{
+	uint64_t *word, *word_end;
+	uint64_t ciosize, gapsize, destbufsize;
+	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+	char *dest;
+	uint_t allzero;
+
+	ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
+	ASSERT(ci->ci_compress != NULL);
+
+	/*
+	 * If the data is all zeroes, we don't even need to allocate
+	 * a block for it.  We indicate this by setting *destsizep = 0.
+	 */
+	allzero = 1;
+	word = src;
+	word_end = (uint64_t *)(uintptr_t)((uintptr_t)word + srcsize);
+	while (word < word_end) {
+		if (*word++ != 0) {
+			allzero = 0;
+			break;
+		}
+	}
+	if (allzero) {
+		*destp = NULL;
+		*destsizep = 0;
+		*destbufsizep = 0;
+		return (1);
+	}
+
+	/* Compress at least 12.5% */
+	destbufsize = P2ALIGN(srcsize - (srcsize >> 3), SPA_MINBLOCKSIZE);
+	if (destbufsize == 0)
+		return (0);
+	dest = zio_buf_alloc(destbufsize);
+
+	ciosize = ci->ci_compress(src, dest, (size_t)srcsize,
+	    (size_t)destbufsize);
+	if (ciosize > destbufsize) {
+		zio_buf_free(dest, destbufsize);
+		return (0);
+	}
+
+	/* Cool.  We compressed at least as much as we were hoping to. */
+
+	/* For security, make sure we don't write random heap crap to disk */
+	gapsize = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE) - ciosize;
+	if (gapsize != 0) {
+		bzero(dest + ciosize, gapsize);
+		ciosize += gapsize;
+	}
+
+	ASSERT3U(ciosize, <=, destbufsize);
+	ASSERT(P2PHASE(ciosize, SPA_MINBLOCKSIZE) == 0);
+	*destp = dest;
+	*destsizep = ciosize;
+	*destbufsizep = destbufsize;
+
+	return (1);
+}
+
+/*
+ * Decompress srcsize bytes at src into the caller-supplied dest buffer
+ * of destsize bytes, using the named compression vector.
+ */
+int
+zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
+    void *dest, uint64_t destsize)
+{
+	zio_compress_info_t *ci;
+
+	ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
+
+	ci = &zio_compress_table[cpfunc];
+	return (ci->ci_decompress(src, dest, srcsize, destsize));
+}
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
new file mode 100644
index 0000000000..ceb9e24d72
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -0,0 +1,793 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * ZFS volume emulation driver.
+ *
+ * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
+ * Volumes are accessed through the symbolic links named:
+ *
+ * /dev/zvol/dsk/<pool_name>/<dataset_name>
+ * /dev/zvol/rdsk/<pool_name>/<dataset_name>
+ *
+ * These links are created by the ZFS-specific devfsadm link generator.
+ * Volumes are persistent through reboot. No user command needs to be
+ * run before opening and using a device.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/aio_req.h>
+#include <sys/uio.h>
+#include <sys/buf.h>
+#include <sys/modctl.h>
+#include <sys/open.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dsl_prop.h>
+#include <sys/dkio.h>
+#include <sys/efi_partition.h>
+#include <sys/byteorder.h>
+#include <sys/pathname.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/crc32.h>
+#include <sys/dirent.h>
+#include <sys/policy.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/mkdev.h>
+
+#include "zfs_namecheck.h"
+
+/* Fixed object numbers within a zvol objset: data object and props ZAP. */
+#define	ZVOL_OBJ		1ULL
+#define	ZVOL_ZAP_OBJ		2ULL
+#define	ZVOL_MAX_MINOR	MAXMIN32
+
+/* DDI soft-state anchor, keyed by minor number. */
+static void *zvol_state;
+
+/*
+ * This lock protects the zvol_state structure from being modified
+ * while it's being used, e.g. an open that comes in before a create
+ * finishes.  It also protects temporary opens of the dataset so that,
+ * e.g., an open doesn't get a spurious EBUSY.
+ */
+static kmutex_t zvol_state_lock;
+static uint32_t zvol_minors;
+
+/*
+ * The in-core state of each volume.
+ */
+typedef struct zvol_state {
+	char		zv_name[MAXPATHLEN]; /* pool/dd name */
+	uint64_t	zv_volsize;	/* amount of space we advertise */
+	minor_t		zv_minor;	/* minor number */
+	uint8_t		zv_min_bs;	/* minimum addressable block shift */
+	uint8_t		zv_readonly;	/* hard readonly; like write-protect */
+	objset_t	*zv_objset;	/* objset handle */
+	uint32_t	zv_mode;	/* DS_MODE_* flags at open time */
+	uint32_t	zv_open_count[OTYPCNT];	/* open counts */
+	uint32_t	zv_total_opens;	/* total open count */
+} zvol_state_t;
+
+/*
+ * Publish the volume's current size as the "Size" and "Nblocks" device
+ * properties so consumers (e.g. prtvtoc, dkio) see the new capacity.
+ */
+static void
+zvol_size_changed(zvol_state_t *zv, dev_t dev)
+{
+	dev = makedevice(getmajor(dev), zv->zv_minor);
+
+	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
+	    "Size", zv->zv_volsize) == DDI_SUCCESS);
+	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
+	    "Nblocks", lbtodb(zv->zv_volsize)) == DDI_SUCCESS);
+}
+
+/*
+ * Validate (and normalize) a requested volume size: nonzero, rounded up
+ * to a SPA_MAXBLOCKSIZE multiple, and representable as a device offset
+ * on 32-bit kernels.  Note: mutates zc_volsize in place.
+ */
+int
+zvol_check_volsize(zfs_cmd_t *zc)
+{
+	if (zc->zc_volsize == 0)
+		return (EINVAL);
+
+	zc->zc_volsize = P2ROUNDUP(zc->zc_volsize, SPA_MAXBLOCKSIZE);
+#ifdef _ILP32
+	if (zc->zc_volsize - 1 > SPEC_MAXOFFSET_T)
+		return (EOVERFLOW);
+#endif
+	return (0);
+}
+
+/*
+ * Validate a requested volume block size: must be a power of two within
+ * [SPA_MINBLOCKSIZE, SPA_MAXBLOCKSIZE].  Returns 0 if valid, else EDOM.
+ */
+int
+zvol_check_volblocksize(zfs_cmd_t *zc)
+{
+	if (zc->zc_volblocksize >= SPA_MINBLOCKSIZE &&
+	    zc->zc_volblocksize <= SPA_MAXBLOCKSIZE &&
+	    ISP2(zc->zc_volblocksize))
+		return (0);
+
+	return (EDOM);
+}
+
+/* dsl_prop callback: mirror the dataset's "readonly" property in-core. */
+static void
+zvol_readonly_changed_cb(void *arg, uint64_t newval)
+{
+	zvol_state_t *zv = arg;
+
+	zv->zv_readonly = (uint8_t)newval;
+}
+
+/*
+ * Fill in the ioctl reply with the volume's size (from the props ZAP)
+ * and its block size (from the data object's DMU info).
+ */
+int
+zvol_get_stats(zfs_cmd_t *zc, objset_t *os)
+{
+	int error;
+	dmu_object_info_t doi;
+
+	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &zc->zc_volsize);
+
+	if (error)
+		return (error);
+
+	error = dmu_object_info(os, ZVOL_OBJ, &doi);
+
+	if (error == 0)
+		zc->zc_volblocksize = doi.doi_data_block_size;
+
+	return (error);
+}
+
+/*
+ * Find a free minor number.
+ */
+/*
+ * Find a free minor number by scanning the soft-state table.
+ * Returns 0 (never a valid zvol minor) when all slots are in use.
+ * Caller must hold zvol_state_lock.
+ */
+static minor_t
+zvol_minor_alloc(void)
+{
+	minor_t minor;
+
+	ASSERT(MUTEX_HELD(&zvol_state_lock));
+
+	for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++)
+		if (ddi_get_soft_state(zvol_state, minor) == NULL)
+			return (minor);
+
+	return (0);
+}
+
+/*
+ * Look up a volume's in-core state by dataset name.  Returns NULL when
+ * no minor has that name.  Caller must hold zvol_state_lock.
+ *
+ * Fix: the previous scan fell off the end of the loop with zv still
+ * holding whatever the final iteration fetched, so a populated but
+ * non-matching last slot could be returned as a false match.  Return
+ * the match from inside the loop and NULL otherwise.
+ */
+static zvol_state_t *
+zvol_minor_lookup(char *name)
+{
+	minor_t minor;
+	zvol_state_t *zv;
+
+	ASSERT(MUTEX_HELD(&zvol_state_lock));
+
+	for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) {
+		zv = ddi_get_soft_state(zvol_state, minor);
+		if (zv != NULL && strcmp(zv->zv_name, name) == 0)
+			return (zv);
+	}
+
+	return (NULL);
+}
+
+/*
+ * dmu_objset_create() callback: lay out a new zvol objset — the data
+ * object at ZVOL_OBJ, the property ZAP at ZVOL_ZAP_OBJ, and the initial
+ * "size" entry.  These claims cannot fail on a fresh objset, hence the
+ * ASSERTs rather than error returns.
+ */
+void
+zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx)
+{
+	zfs_cmd_t *zc = arg;
+	int error;
+
+	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, zc->zc_volblocksize,
+	    DMU_OT_NONE, 0, tx);
+	ASSERT(error == 0);
+
+	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
+	    DMU_OT_NONE, 0, tx);
+	ASSERT(error == 0);
+
+	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &zc->zc_volsize, tx);
+	ASSERT(error == 0);
+}
+
+/*
+ * Create a minor node for the specified volume.
+ */
+/*
+ * Create char ("minor,raw") and block ("minor") minor nodes for the
+ * named volume, preferring the minor number recorded in any existing
+ * /dev/zvol symlink so device paths stay stable across reboots.
+ * On any failure, everything acquired so far is unwound in reverse
+ * order before returning.
+ */
+int
+zvol_create_minor(zfs_cmd_t *zc)
+{
+	char *name = zc->zc_name;
+	dev_t dev = zc->zc_dev;
+	zvol_state_t *zv;
+	objset_t *os;
+	uint64_t volsize;
+	minor_t minor = 0;
+	struct pathname linkpath;
+	int ds_mode = DS_MODE_PRIMARY;
+	vnode_t *vp = NULL;
+	char *devpath;
+	size_t devpathlen = strlen(ZVOL_FULL_DEV_DIR) + 1 + strlen(name) + 1;
+	char chrbuf[30], blkbuf[30];
+	int error;
+
+	mutex_enter(&zvol_state_lock);
+
+	if ((zv = zvol_minor_lookup(name)) != NULL) {
+		mutex_exit(&zvol_state_lock);
+		return (EEXIST);
+	}
+
+	/* Snapshots ('@' in the name) are exposed read-only. */
+	if (strchr(name, '@') != 0)
+		ds_mode |= DS_MODE_READONLY;
+
+	error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os);
+
+	if (error) {
+		mutex_exit(&zvol_state_lock);
+		return (error);
+	}
+
+	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+
+	if (error) {
+		dmu_objset_close(os);
+		mutex_exit(&zvol_state_lock);
+		return (error);
+	}
+
+	/*
+	 * If there's an existing /dev/zvol symlink, try to use the
+	 * same minor number we used last time.
+	 */
+	devpath = kmem_alloc(devpathlen, KM_SLEEP);
+
+	(void) sprintf(devpath, "%s/%s", ZVOL_FULL_DEV_DIR, name);
+
+	error = lookupname(devpath, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp);
+
+	kmem_free(devpath, devpathlen);
+
+	if (error == 0 && vp->v_type != VLNK)
+		error = EINVAL;
+
+	if (error == 0) {
+		/* Parse the minor out of the ".../zfs@<minor>" link target. */
+		pn_alloc(&linkpath);
+		error = pn_getsymlink(vp, &linkpath, kcred);
+		if (error == 0) {
+			char *ms = strstr(linkpath.pn_path, ZVOL_PSEUDO_DEV);
+			if (ms != NULL) {
+				ms += strlen(ZVOL_PSEUDO_DEV);
+				minor = stoi(&ms);
+			}
+		}
+		pn_free(&linkpath);
+	}
+
+	if (vp != NULL)
+		VN_RELE(vp);
+
+	/*
+	 * If we found a minor but it's already in use, we must pick a new one.
+	 */
+	if (minor != 0 && ddi_get_soft_state(zvol_state, minor) != NULL)
+		minor = 0;
+
+	if (minor == 0)
+		minor = zvol_minor_alloc();
+
+	if (minor == 0) {
+		dmu_objset_close(os);
+		mutex_exit(&zvol_state_lock);
+		return (ENXIO);
+	}
+
+	if (ddi_soft_state_zalloc(zvol_state, minor) != DDI_SUCCESS) {
+		dmu_objset_close(os);
+		mutex_exit(&zvol_state_lock);
+		return (EAGAIN);
+	}
+
+	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME, name);
+
+	(void) sprintf(chrbuf, "%uc,raw", minor);
+
+	if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
+	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
+		ddi_soft_state_free(zvol_state, minor);
+		dmu_objset_close(os);
+		mutex_exit(&zvol_state_lock);
+		return (EAGAIN);
+	}
+
+	(void) sprintf(blkbuf, "%uc", minor);
+
+	if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
+	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
+		ddi_remove_minor_node(zfs_dip, chrbuf);
+		ddi_soft_state_free(zvol_state, minor);
+		dmu_objset_close(os);
+		mutex_exit(&zvol_state_lock);
+		return (EAGAIN);
+	}
+
+	zv = ddi_get_soft_state(zvol_state, minor);
+
+	(void) strcpy(zv->zv_name, name);
+	zv->zv_min_bs = DEV_BSHIFT;
+	zv->zv_minor = minor;
+	zv->zv_volsize = volsize;
+	zv->zv_objset = os;
+	zv->zv_mode = ds_mode;
+
+	zvol_size_changed(zv, dev);
+
+	/* Track future "readonly" property changes on this dataset. */
+	VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset),
+	    "readonly", zvol_readonly_changed_cb, zv) == 0);
+
+	zvol_minors++;
+
+	mutex_exit(&zvol_state_lock);
+
+	return (0);
+}
+
+/*
+ * Remove minor node for the specified volume.
+ */
+/*
+ * Tear down the named volume's minor nodes and in-core state.  Refused
+ * with EBUSY while the device is open; mirrors zvol_create_minor().
+ */
+int
+zvol_remove_minor(zfs_cmd_t *zc)
+{
+	zvol_state_t *zv;
+	char namebuf[30];
+
+	mutex_enter(&zvol_state_lock);
+
+	if ((zv = zvol_minor_lookup(zc->zc_name)) == NULL) {
+		mutex_exit(&zvol_state_lock);
+		return (ENXIO);
+	}
+
+	if (zv->zv_total_opens != 0) {
+		mutex_exit(&zvol_state_lock);
+		return (EBUSY);
+	}
+
+	(void) sprintf(namebuf, "%uc,raw", zv->zv_minor);
+	ddi_remove_minor_node(zfs_dip, namebuf);
+
+	(void) sprintf(namebuf, "%uc", zv->zv_minor);
+	ddi_remove_minor_node(zfs_dip, namebuf);
+
+	VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset),
+	    "readonly", zvol_readonly_changed_cb, zv) == 0);
+
+	dmu_objset_close(zv->zv_objset);
+
+	zv->zv_objset = NULL;
+
+	ddi_soft_state_free(zvol_state, zv->zv_minor);
+
+	zvol_minors--;
+
+	mutex_exit(&zvol_state_lock);
+
+	return (0);
+}
+
+/*
+ * Resize the volume: persist the new size in the props ZAP and free any
+ * data beyond it, all in one transaction; update in-core state and the
+ * device properties only once the transaction has succeeded.
+ */
+int
+zvol_set_volsize(zfs_cmd_t *zc)
+{
+	zvol_state_t *zv;
+	dev_t dev = zc->zc_dev;
+	dmu_tx_t *tx;
+	int error;
+
+	if ((error = zvol_check_volsize(zc)) != 0)
+		return (error);
+
+	mutex_enter(&zvol_state_lock);
+
+	if ((zv = zvol_minor_lookup(zc->zc_name)) == NULL) {
+		mutex_exit(&zvol_state_lock);
+		return (ENXIO);
+	}
+
+	if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) {
+		mutex_exit(&zvol_state_lock);
+		return (EROFS);
+	}
+
+	tx = dmu_tx_create(zv->zv_objset);
+	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, 1);
+	dmu_tx_hold_free(tx, ZVOL_OBJ, zc->zc_volsize, DMU_OBJECT_END);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		mutex_exit(&zvol_state_lock);
+		return (error);
+	}
+
+	error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1,
+	    &zc->zc_volsize, tx);
+	if (error == 0)
+		/* Shrinking: release blocks past the new end of volume. */
+		dmu_free_range(zv->zv_objset, ZVOL_OBJ, zc->zc_volsize,
+		    DMU_OBJECT_END, tx);
+
+	dmu_tx_commit(tx);
+
+	if (error == 0) {
+		zv->zv_volsize = zc->zc_volsize;
+		zvol_size_changed(zv, dev);
+	}
+
+	mutex_exit(&zvol_state_lock);
+
+	return (error);
+}
+
+/*
+ * Set the block size of the volume named in zc->zc_name to
+ * zc->zc_volblocksize.  Returns ENXIO if no such minor, EROFS if the
+ * volume is read-only, EBUSY if the blocksize can no longer be changed
+ * (see below), or the tx-assignment error.
+ */
+int
+zvol_set_volblocksize(zfs_cmd_t *zc)
+{
+	zvol_state_t *zv;
+	dmu_tx_t *tx;
+	int error;
+
+	mutex_enter(&zvol_state_lock);
+
+	if ((zv = zvol_minor_lookup(zc->zc_name)) == NULL) {
+		mutex_exit(&zvol_state_lock);
+		return (ENXIO);
+	}
+
+	if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) {
+		mutex_exit(&zvol_state_lock);
+		return (EROFS);
+	}
+
+	tx = dmu_tx_create(zv->zv_objset);
+	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+	} else {
+		error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
+		    zc->zc_volblocksize, 0, tx);
+		/*
+		 * Report ENOTSUP from the DMU as EBUSY to the caller —
+		 * presumably it means the object already has data laid
+		 * out at the old blocksize (NOTE(review): confirm against
+		 * dmu_object_set_blocksize()'s contract).
+		 */
+		if (error == ENOTSUP)
+			error = EBUSY;
+		dmu_tx_commit(tx);
+	}
+
+	mutex_exit(&zvol_state_lock);
+
+	return (error);
+}
+
+/*
+ * Device open entry point.  Minor 0 is the control device and always
+ * opens successfully; any other minor must have an attached zvol_state.
+ * FWRITE opens of a read-only volume fail with EROFS.
+ */
+/*ARGSUSED*/
+int
+zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
+{
+	minor_t minor = getminor(*devp);
+	zvol_state_t *zv;
+
+	if (minor == 0)			/* This is the control device */
+		return (0);
+
+	mutex_enter(&zvol_state_lock);
+
+	zv = ddi_get_soft_state(zvol_state, minor);
+	if (zv == NULL) {
+		mutex_exit(&zvol_state_lock);
+		return (ENXIO);
+	}
+
+	ASSERT(zv->zv_objset != NULL);
+
+	/* Writable open of a read-only volume is refused up front. */
+	if ((flag & FWRITE) &&
+	    (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY))) {
+		mutex_exit(&zvol_state_lock);
+		return (EROFS);
+	}
+
+	/*
+	 * Count the open: layered (OTYP_LYR) opens are counted every
+	 * time, other open types only on the first open of that type
+	 * (matching the one close the framework delivers per type).
+	 */
+	if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
+		zv->zv_open_count[otyp]++;
+		zv->zv_total_opens++;
+	}
+
+	mutex_exit(&zvol_state_lock);
+
+	return (0);
+}
+
+/*
+ * Device close entry point.  Decrements the per-type and total open
+ * counts recorded by zvol_open().  Minor 0 (the control device) is a
+ * no-op; an unknown minor returns ENXIO.
+ */
+/*ARGSUSED*/
+int
+zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
+{
+	minor_t minor = getminor(dev);
+	zvol_state_t *zv;
+
+	if (minor == 0)		/* This is the control device */
+		return (0);
+
+	mutex_enter(&zvol_state_lock);
+
+	zv = ddi_get_soft_state(zvol_state, minor);
+	if (zv == NULL) {
+		mutex_exit(&zvol_state_lock);
+		return (ENXIO);
+	}
+
+	/*
+	 * The next statement is a workaround for the following DDI bug:
+	 * 6343604 specfs race: multiple "last-close" of the same device
+	 */
+	if (zv->zv_total_opens == 0) {
+		mutex_exit(&zvol_state_lock);
+		return (0);
+	}
+
+	/*
+	 * If the open count is zero, this is a spurious close.
+	 * That indicates a bug in the kernel / DDI framework.
+	 */
+	ASSERT(zv->zv_open_count[otyp] != 0);
+	ASSERT(zv->zv_total_opens != 0);
+
+	/*
+	 * You may get multiple opens, but only one close.
+	 */
+	zv->zv_open_count[otyp]--;
+	zv->zv_total_opens--;
+
+	mutex_exit(&zvol_state_lock);
+
+	return (0);
+}
+
+/*
+ * Block-device strategy routine: service one buf(9S) against the DMU.
+ * The request is processed in chunks of at most 1MB; reads go through
+ * dmu_read_canfail(), writes each get their own transaction.  Errors
+ * are reported through bioerror()/biodone(); the function itself
+ * always returns 0, per the strategy(9E) convention of reporting
+ * failures on the buf.
+ */
+int
+zvol_strategy(buf_t *bp)
+{
+	zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(bp->b_edev));
+	uint64_t off, volsize;
+	size_t size, resid;
+	char *addr;
+	int error = 0;
+
+	if (zv == NULL) {
+		bioerror(bp, ENXIO);
+		biodone(bp);
+		return (0);
+	}
+
+	/* No I/O on the control device (minor 0). */
+	if (getminor(bp->b_edev) == 0) {
+		bioerror(bp, EINVAL);
+		biodone(bp);
+		return (0);
+	}
+
+	if (zv->zv_readonly && !(bp->b_flags & B_READ)) {
+		bioerror(bp, EROFS);
+		biodone(bp);
+		return (0);
+	}
+
+	/* Byte offset of the request; b_blkno is in 512-byte sectors. */
+	off = ldbtob(bp->b_blkno);
+	volsize = zv->zv_volsize;
+
+	ASSERT(zv->zv_objset != NULL);
+
+	/* Map the buffer into kernel address space for the DMU copies. */
+	bp_mapin(bp);
+	addr = bp->b_un.b_addr;
+	resid = bp->b_bcount;
+
+	while (resid != 0 && off < volsize) {
+
+		size = MIN(resid, 1UL << 20);	/* cap at 1MB per tx */
+
+		if (size > volsize - off)	/* don't write past the end */
+			size = volsize - off;
+
+		if (bp->b_flags & B_READ) {
+			error = dmu_read_canfail(zv->zv_objset, ZVOL_OBJ,
+			    off, size, addr);
+		} else {
+			/* Each write chunk is its own transaction. */
+			dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+			dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
+			error = dmu_tx_assign(tx, TXG_WAIT);
+			if (error) {
+				dmu_tx_abort(tx);
+			} else {
+				dmu_write(zv->zv_objset, ZVOL_OBJ,
+				    off, size, addr, tx);
+				dmu_tx_commit(tx);
+			}
+		}
+		if (error)
+			break;
+		off += size;
+		addr += size;
+		resid -= size;
+	}
+
+	/*
+	 * Only flag an error if nothing was transferred at all; a
+	 * request starting past the end of the volume yields EINVAL.
+	 * NOTE(review): the boundary test is 'off > volsize' — a request
+	 * starting exactly at volsize falls through to 'error' (0);
+	 * confirm that is the intended semantic for an at-EOF request.
+	 */
+	if ((bp->b_resid = resid) == bp->b_bcount)
+		bioerror(bp, off > volsize ? EINVAL : error);
+
+	biodone(bp);
+	return (0);
+}
+
+/*
+ * Character-device read: route the uio through physio(9F), which
+ * breaks it up and calls zvol_strategy() for the actual transfer.
+ */
+/*ARGSUSED*/
+int
+zvol_read(dev_t dev, uio_t *uiop, cred_t *cr)
+{
+	return (physio(zvol_strategy, NULL, dev, B_READ, minphys, uiop));
+}
+
+/*
+ * Character-device write: route the uio through physio(9F), which
+ * breaks it up and calls zvol_strategy() for the actual transfer.
+ */
+/*ARGSUSED*/
+int
+zvol_write(dev_t dev, uio_t *uiop, cred_t *cr)
+{
+	return (physio(zvol_strategy, NULL, dev, B_WRITE, minphys, uiop));
+}
+
+/*
+ * Async read entry point: aphysio(9F) drives zvol_strategy() for the
+ * aio request; anocancel means the request cannot be cancelled.
+ */
+/*ARGSUSED*/
+int
+zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr)
+{
+	return (aphysio(zvol_strategy, anocancel, dev, B_READ, minphys, aio));
+}
+
+/*
+ * Async write entry point: aphysio(9F) drives zvol_strategy() for the
+ * aio request; anocancel means the request cannot be cancelled.
+ */
+/*ARGSUSED*/
+int
+zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr)
+{
+	return (aphysio(zvol_strategy, anocancel, dev, B_WRITE, minphys, aio));
+}
+
+/*
+ * Dirtbag ioctls to support mkfs(1M) for UFS filesystems. See dkio(7I).
+ *
+ * Supported commands:
+ *   DKIOCINFO	    - fabricated controller info ("zvol", DKC_UNKNOWN).
+ *   DKIOCGMEDIAINFO - logical block size and capacity of the volume.
+ *   DKIOCGETEFI    - a minimal synthesized EFI label: one GPT header
+ *		      plus one reserved partition spanning the volume.
+ * Everything else returns ENOTSUP.
+ */
+/*ARGSUSED*/
+int
+zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
+{
+	zvol_state_t *zv;
+	struct dk_cinfo dkc;
+	struct dk_minfo dkm;
+	dk_efi_t efi;
+	efi_gpt_t gpt;
+	efi_gpe_t gpe;
+	struct uuid uuid = EFI_RESERVED;
+	uint32_t crc;
+	int error = 0;
+
+	mutex_enter(&zvol_state_lock);
+
+	zv = ddi_get_soft_state(zvol_state, getminor(dev));
+
+	if (zv == NULL) {
+		mutex_exit(&zvol_state_lock);
+		return (ENXIO);
+	}
+
+	switch (cmd) {
+
+	case DKIOCINFO:
+		/* Fabricated controller info; nothing here is per-volume. */
+		bzero(&dkc, sizeof (dkc));
+		(void) strcpy(dkc.dki_cname, "zvol");
+		(void) strcpy(dkc.dki_dname, "zvol");
+		dkc.dki_ctype = DKC_UNKNOWN;
+		dkc.dki_maxtransfer = 1 << 15;
+		/* Drop the lock before the (potentially faulting) copyout. */
+		mutex_exit(&zvol_state_lock);
+		if (ddi_copyout(&dkc, (void *)arg, sizeof (dkc), flag))
+			error = EFAULT;
+		return (error);
+
+	case DKIOCGMEDIAINFO:
+		/* Report size/capacity in units of the minimum block size. */
+		bzero(&dkm, sizeof (dkm));
+		dkm.dki_lbsize = 1U << zv->zv_min_bs;
+		dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
+		dkm.dki_media_type = DK_UNKNOWN;
+		mutex_exit(&zvol_state_lock);
+		if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
+			error = EFAULT;
+		return (error);
+
+	case DKIOCGETEFI:
+		if (ddi_copyin((void *)arg, &efi, sizeof (dk_efi_t), flag)) {
+			mutex_exit(&zvol_state_lock);
+			return (EFAULT);
+		}
+
+		bzero(&gpt, sizeof (gpt));
+		bzero(&gpe, sizeof (gpe));
+
+		/* Caller's buffer pointer travels in the 64-bit field. */
+		efi.dki_data = (void *)(uintptr_t)efi.dki_data_64;
+
+		/* Need room for exactly one GPT header + one entry. */
+		if (efi.dki_length < sizeof (gpt) + sizeof (gpe)) {
+			mutex_exit(&zvol_state_lock);
+			return (EINVAL);
+		}
+
+		efi.dki_length = sizeof (gpt) + sizeof (gpe);
+
+		/*
+		 * Synthesize a one-partition EFI label covering the whole
+		 * volume.  All on-disk EFI fields are little-endian.
+		 */
+		gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
+		gpt.efi_gpt_Revision = LE_32(EFI_VERSION102);
+		gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
+		gpt.efi_gpt_FirstUsableLBA = LE_64(0ULL);
+		gpt.efi_gpt_LastUsableLBA =
+		    LE_64((zv->zv_volsize >> zv->zv_min_bs) - 1);
+		gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
+		gpt.efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (gpe));
+
+		UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
+		gpe.efi_gpe_StartingLBA = gpt.efi_gpt_FirstUsableLBA;
+		gpe.efi_gpe_EndingLBA = gpt.efi_gpt_LastUsableLBA;
+
+		/* Entry-array CRC first, then the header CRC over the GPT. */
+		CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
+		gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
+
+		CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
+		gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
+
+		mutex_exit(&zvol_state_lock);
+		/* Header first, then the single entry immediately after it. */
+		if (ddi_copyout(&gpt, efi.dki_data, sizeof (gpt), flag) ||
+		    ddi_copyout(&gpe, efi.dki_data + 1, sizeof (gpe), flag))
+			error = EFAULT;
+		return (error);
+
+	default:
+		error = ENOTSUP;
+		break;
+
+	}
+	mutex_exit(&zvol_state_lock);
+	return (error);
+}
+
+/*
+ * Return nonzero if any zvol minors exist — used to decide whether
+ * the module/driver can be unloaded.
+ */
+int
+zvol_busy(void)
+{
+	return (zvol_minors != 0);
+}
+
+/*
+ * One-time module initialization: allocate the per-minor soft-state
+ * table and the global state lock.
+ */
+void
+zvol_init(void)
+{
+	VERIFY(ddi_soft_state_init(&zvol_state, sizeof (zvol_state_t), 1) == 0);
+	mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+/*
+ * Module teardown: release the global state lock and the soft-state
+ * table created by zvol_init().
+ */
+void
+zvol_fini(void)
+{
+	mutex_destroy(&zvol_state_lock);
+	ddi_soft_state_fini(&zvol_state);
+}