summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/fs
diff options
context:
space:
mode:
authorahrens <none@none>2005-10-31 11:33:35 -0800
committerahrens <none@none>2005-10-31 11:33:35 -0800
commitfa9e4066f08beec538e775443c5be79dd423fcab (patch)
tree576d99665e57bb7cb70584431adb08c14d47e3ce /usr/src/uts/common/fs
parentf1b64740276f67fc6914c1d855f2af601efe99ac (diff)
downloadillumos-gate-fa9e4066f08beec538e775443c5be79dd423fcab.tar.gz
PSARC 2002/240 ZFS
6338653 Integrate ZFS PSARC 2004/652 - DKIOCFLUSH 5096886 Write caching disks need mechanism to flush cache to physical media
Diffstat (limited to 'usr/src/uts/common/fs')
-rw-r--r--usr/src/uts/common/fs/ctfs/ctfs_all.c4
-rw-r--r--usr/src/uts/common/fs/ctfs/ctfs_ctl.c4
-rw-r--r--usr/src/uts/common/fs/ctfs/ctfs_event.c4
-rw-r--r--usr/src/uts/common/fs/ctfs/ctfs_tdir.c4
-rw-r--r--usr/src/uts/common/fs/devfs/devfs_subr.c32
-rw-r--r--usr/src/uts/common/fs/devfs/devfs_vnops.c10
-rw-r--r--usr/src/uts/common/fs/fs_subr.c84
-rw-r--r--usr/src/uts/common/fs/fs_subr.h3
-rw-r--r--usr/src/uts/common/fs/lookup.c41
-rw-r--r--usr/src/uts/common/fs/nfs/nfs4_acl.c203
-rw-r--r--usr/src/uts/common/fs/nfs/nfs4_srv_attr.c6
-rw-r--r--usr/src/uts/common/fs/nfs/nfs4_vnops.c4
-rw-r--r--usr/src/uts/common/fs/nfs/nfs_acl_srv.c42
-rw-r--r--usr/src/uts/common/fs/proc/prioctl.c18
-rw-r--r--usr/src/uts/common/fs/proc/prvnops.c22
-rw-r--r--usr/src/uts/common/fs/vnode.c45
-rw-r--r--usr/src/uts/common/fs/zfs/arc.c1998
-rw-r--r--usr/src/uts/common/fs/zfs/bplist.c239
-rw-r--r--usr/src/uts/common/fs/zfs/dbuf.c2022
-rw-r--r--usr/src/uts/common/fs/zfs/dmu.c1761
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_object.c149
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_objset.c727
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_traverse.c792
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_tx.c801
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_zfetch.c603
-rw-r--r--usr/src/uts/common/fs/zfs/dnode.c1304
-rw-r--r--usr/src/uts/common/fs/zfs/dnode_sync.c560
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_dataset.c1463
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_dir.c1217
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_pool.c233
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_prop.c367
-rw-r--r--usr/src/uts/common/fs/zfs/fletcher.c100
-rw-r--r--usr/src/uts/common/fs/zfs/lzjb.c125
-rw-r--r--usr/src/uts/common/fs/zfs/metaslab.c796
-rw-r--r--usr/src/uts/common/fs/zfs/refcount.c194
-rw-r--r--usr/src/uts/common/fs/zfs/sha256.c131
-rw-r--r--usr/src/uts/common/fs/zfs/spa.c1784
-rw-r--r--usr/src/uts/common/fs/zfs/spa_config.c308
-rw-r--r--usr/src/uts/common/fs/zfs/spa_misc.c848
-rw-r--r--usr/src/uts/common/fs/zfs/space_map.c406
-rw-r--r--usr/src/uts/common/fs/zfs/sys/arc.h90
-rw-r--r--usr/src/uts/common/fs/zfs/sys/bplist.h83
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dbuf.h302
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dmu.h635
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dmu_impl.h230
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dmu_objset.h122
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dmu_traverse.h125
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dmu_tx.h154
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h76
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dnode.h301
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dsl_dataset.h164
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dsl_dir.h143
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dsl_pool.h82
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dsl_prop.h75
-rw-r--r--usr/src/uts/common/fs/zfs/sys/metaslab.h73
-rw-r--r--usr/src/uts/common/fs/zfs/sys/metaslab_impl.h125
-rw-r--r--usr/src/uts/common/fs/zfs/sys/refcount.h105
-rw-r--r--usr/src/uts/common/fs/zfs/sys/spa.h406
-rw-r--r--usr/src/uts/common/fs/zfs/sys/spa_impl.h118
-rw-r--r--usr/src/uts/common/fs/zfs/sys/space_map.h144
-rw-r--r--usr/src/uts/common/fs/zfs/sys/txg.h120
-rw-r--r--usr/src/uts/common/fs/zfs/sys/txg_impl.h77
-rw-r--r--usr/src/uts/common/fs/zfs/sys/uberblock.h50
-rw-r--r--usr/src/uts/common/fs/zfs/sys/uberblock_impl.h76
-rw-r--r--usr/src/uts/common/fs/zfs/sys/unique.h56
-rw-r--r--usr/src/uts/common/fs/zfs/sys/vdev.h135
-rw-r--r--usr/src/uts/common/fs/zfs/sys/vdev_disk.h52
-rw-r--r--usr/src/uts/common/fs/zfs/sys/vdev_file.h46
-rw-r--r--usr/src/uts/common/fs/zfs/sys/vdev_impl.h287
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zap.h353
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zap_impl.h190
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zap_leaf.h204
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zfs_acl.h113
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zfs_context.h71
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zfs_ctldir.h74
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zfs_debug.h73
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zfs_dir.h70
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h187
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h116
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zfs_znode.h283
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zil.h242
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zil_impl.h111
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zio.h298
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zio_checksum.h74
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zio_compress.h75
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zio_impl.h208
-rw-r--r--usr/src/uts/common/fs/zfs/txg.c583
-rw-r--r--usr/src/uts/common/fs/zfs/uberblock.c67
-rw-r--r--usr/src/uts/common/fs/zfs/unique.c107
-rw-r--r--usr/src/uts/common/fs/zfs/vdev.c1738
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_cache.c374
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_disk.c307
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_file.c223
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_label.c848
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_mirror.c414
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_missing.c89
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_queue.c286
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_raidz.c599
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_root.c98
-rw-r--r--usr/src/uts/common/fs/zfs/zap.c1010
-rw-r--r--usr/src/uts/common/fs/zfs/zap_leaf.c883
-rw-r--r--usr/src/uts/common/fs/zfs/zap_micro.c823
-rw-r--r--usr/src/uts/common/fs/zfs/zfs.conf28
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_acl.c1537
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_byteswap.c99
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_ctldir.c936
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_dir.c853
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_ioctl.c1323
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_log.c337
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_replay.c337
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_vfsops.c1072
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_vnops.c3663
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_znode.c1286
-rw-r--r--usr/src/uts/common/fs/zfs/zil.c1242
-rw-r--r--usr/src/uts/common/fs/zfs/zio.c1698
-rw-r--r--usr/src/uts/common/fs/zfs/zio_checksum.c174
-rw-r--r--usr/src/uts/common/fs/zfs/zio_compress.c134
-rw-r--r--usr/src/uts/common/fs/zfs/zvol.c793
118 files changed, 50463 insertions, 146 deletions
diff --git a/usr/src/uts/common/fs/ctfs/ctfs_all.c b/usr/src/uts/common/fs/ctfs/ctfs_all.c
index dd3eeb15b6..4933edd960 100644
--- a/usr/src/uts/common/fs/ctfs/ctfs_all.c
+++ b/usr/src/uts/common/fs/ctfs/ctfs_all.c
@@ -99,7 +99,7 @@ ctfs_adir_do_lookup(vnode_t *vp, const char *nm, vnode_t **vpp, ino64_t *inop)
if (*nm != '\0')
return (ENOENT);
- ct = contract_ptr(i, VTOZ(vp)->zone_uniqid);
+ ct = contract_ptr(i, VTOZONE(vp)->zone_uniqid);
if (ct == NULL)
return (ENOENT);
@@ -118,7 +118,7 @@ ctfs_adir_do_readdir(vnode_t *vp, struct dirent64 *dp, int *eofp,
uint64_t zuniqid;
ctid_t next;
- zuniqid = VTOZ(vp)->zone_uniqid;
+ zuniqid = VTOZONE(vp)->zone_uniqid;
next = contract_lookup(zuniqid, *offp);
if (next == -1) {
diff --git a/usr/src/uts/common/fs/ctfs/ctfs_ctl.c b/usr/src/uts/common/fs/ctfs/ctfs_ctl.c
index a13091826c..f4980d4a97 100644
--- a/usr/src/uts/common/fs/ctfs/ctfs_ctl.c
+++ b/usr/src/uts/common/fs/ctfs/ctfs_ctl.c
@@ -249,11 +249,11 @@ ctfs_stat_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr,
detail = STRUCT_FGET(st, ctst_detail);
if (detail == CTD_COMMON) {
mutex_enter(&ct->ct_lock);
- contract_status_common(ct, VTOZ(vp), STRUCT_BUF(st), mdl);
+ contract_status_common(ct, VTOZONE(vp), STRUCT_BUF(st), mdl);
mutex_exit(&ct->ct_lock);
} else if (detail <= CTD_ALL) {
VERIFY(nvlist_alloc(&foo, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- type->ct_type_ops->contop_status(ct, VTOZ(vp), detail, foo,
+ type->ct_type_ops->contop_status(ct, VTOZONE(vp), detail, foo,
STRUCT_BUF(st), mdl);
VERIFY(nvlist_pack(foo, &bufp, &len, NV_ENCODE_NATIVE,
KM_SLEEP) == 0);
diff --git a/usr/src/uts/common/fs/ctfs/ctfs_event.c b/usr/src/uts/common/fs/ctfs/ctfs_event.c
index afb08a7cfc..7fa7cfb608 100644
--- a/usr/src/uts/common/fs/ctfs/ctfs_event.c
+++ b/usr/src/uts/common/fs/ctfs/ctfs_event.c
@@ -287,7 +287,7 @@ ctfs_ev_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr,
ctfs_evnode_t *evnode = vp->v_data;
return (ctfs_endpoint_ioctl(&evnode->ctfs_ev_listener, cmd, arg, cr,
- VTOZ(vp), 0));
+ VTOZONE(vp), 0));
}
/*
@@ -430,7 +430,7 @@ ctfs_bu_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr,
ctfs_bunode_t *bunode = vp->v_data;
return (ctfs_endpoint_ioctl(&bunode->ctfs_bu_listener, cmd, arg, cr,
- VTOZ(vp), bunode->ctfs_bu_queue->ctq_listno == CTEL_BUNDLE));
+ VTOZONE(vp), bunode->ctfs_bu_queue->ctq_listno == CTEL_BUNDLE));
}
/*
diff --git a/usr/src/uts/common/fs/ctfs/ctfs_tdir.c b/usr/src/uts/common/fs/ctfs/ctfs_tdir.c
index 479f64b064..1f5dd42370 100644
--- a/usr/src/uts/common/fs/ctfs/ctfs_tdir.c
+++ b/usr/src/uts/common/fs/ctfs/ctfs_tdir.c
@@ -108,7 +108,7 @@ ctfs_tdir_do_readdir(vnode_t *vp, struct dirent64 *dp, int *eofp,
ctid_t next;
ct_type_t *ty = ct_types[gfs_file_index(vp)];
- zuniqid = VTOZ(vp)->zone_uniqid;
+ zuniqid = VTOZONE(vp)->zone_uniqid;
next = contract_type_lookup(ty, zuniqid, *offp);
if (next == -1) {
@@ -135,7 +135,7 @@ ctfs_tdir_do_lookup(vnode_t *vp, const char *nm, vnode_t **vpp, ino64_t *inop)
return (ENOENT);
ct = contract_type_ptr(ct_types[gfs_file_index(vp)], i,
- VTOZ(vp)->zone_uniqid);
+ VTOZONE(vp)->zone_uniqid);
if (ct == NULL)
return (ENOENT);
diff --git a/usr/src/uts/common/fs/devfs/devfs_subr.c b/usr/src/uts/common/fs/devfs/devfs_subr.c
index 0f53a24ca0..864ed2ad60 100644
--- a/usr/src/uts/common/fs/devfs/devfs_subr.c
+++ b/usr/src/uts/common/fs/devfs/devfs_subr.c
@@ -569,20 +569,6 @@ dv_vattr_merge(struct dv_node *dv, struct vattr *vap)
}
/*
- * Free a vsecattr
- */
-static void
-dv_free_vsa(struct vsecattr *vsap)
-{
- if (vsap->vsa_aclcnt > 0 && vsap->vsa_aclentp)
- kmem_free(vsap->vsa_aclentp,
- vsap->vsa_aclcnt * sizeof (aclent_t));
- if (vsap->vsa_dfaclcnt > 0 && vsap->vsa_dfaclentp)
- kmem_free(vsap->vsa_dfaclentp,
- vsap->vsa_dfaclcnt * sizeof (aclent_t));
-}
-
-/*
* dv_shadow_node
*
* Given a VDIR dv_node, find/create the associated VDIR
@@ -623,7 +609,6 @@ dv_shadow_node(
int create_tried;
int error;
mperm_t mp;
- struct vsecattr vsa;
ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
dv = VTODV(vp);
@@ -678,19 +663,14 @@ lookup:
dv->dv_attrvp = rvp; /* with one hold */
/*
- * Determine if we have (non-trivial) ACLs on this node.
- * NB: This should be changed call fs_acl_nontrivial for
- * new ACE flavor ACLs.
+ * Determine if we have non-trivial ACLs on this node.
+ * It is not necessary to VOP_RWLOCK since fs_acl_nontrivial
+ * only does VOP_GETSECATTR.
*/
- vsa.vsa_mask = VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT;
- error = VOP_GETSECATTR(rvp, &vsa, 0, cred);
dv->dv_flags &= ~DV_ACL;
- if (error == 0) {
- if (vsa.vsa_aclcnt > MIN_ACL_ENTRIES) {
- dv->dv_flags |= DV_ACL; /* non-trivial ACL */
- }
- dv_free_vsa(&vsa);
- }
+
+ if (fs_acl_nontrivial(rvp, cred))
+ dv->dv_flags |= DV_ACL;
/*
* If we have synced out the memory attributes, free
diff --git a/usr/src/uts/common/fs/devfs/devfs_vnops.c b/usr/src/uts/common/fs/devfs/devfs_vnops.c
index 7a3d4c1c04..b8dfce5448 100644
--- a/usr/src/uts/common/fs/devfs/devfs_vnops.c
+++ b/usr/src/uts/common/fs/devfs/devfs_vnops.c
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -621,7 +621,6 @@ devfs_getsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
error = VOP_GETSECATTR(avp, vsap, flags, cr);
dsysdebug(error, ("vop_getsecattr %s %d\n", VTODV(vp)->dv_name, error));
-
rw_exit(&dv->dv_contents);
return (error);
}
@@ -678,10 +677,11 @@ devfs_setsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
VOP_RWUNLOCK(avp, V_WRITELOCK_TRUE, NULL);
/*
- * NB: This code should call fs_acl_nontrivial when available so that
- * DV_ACL is only set on nontrivial ACLs.
+ * Set DV_ACL if we have a non-trivial set of ACLs. It is not
+ * necessary to hold VOP_RWLOCK since fs_acl_nontrivial only does
+ * VOP_GETSECATTR calls.
*/
- if (error == 0)
+ if (fs_acl_nontrivial(avp, cr))
dv->dv_flags |= DV_ACL;
return (error);
}
diff --git a/usr/src/uts/common/fs/fs_subr.c b/usr/src/uts/common/fs/fs_subr.c
index 7fc9dc4277..3466db3832 100644
--- a/usr/src/uts/common/fs/fs_subr.c
+++ b/usr/src/uts/common/fs/fs_subr.c
@@ -24,7 +24,7 @@
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -57,6 +57,7 @@
#include <sys/kmem.h>
#include <sys/file.h>
#include <sys/nbmlock.h>
+#include <acl/acl_common.h>
static callb_cpr_t *frlock_serialize_blocked(flk_cb_when_t, void *);
@@ -632,3 +633,84 @@ fs_vnevent_support(vnode_t *vp, vnevent_t vnevent)
ASSERT(vp != NULL);
return (0);
}
+
+/*
+ * return 1 for non-trivial ACL.
+ *
+ * NB: It is not necessary for the caller to VOP_RWLOCK since
+ * we only issue VOP_GETSECATTR.
+ *
+ * Returns 0 == trivial
+ * 1 == NOT Trivial
+ * <0 could not determine.
+ */
+int
+fs_acl_nontrivial(vnode_t *vp, cred_t *cr)
+{
+ ulong_t acl_styles;
+ ulong_t acl_flavor;
+ vsecattr_t vsecattr;
+ int error;
+ int isnontrivial;
+
+ /* determine the forms of ACLs maintained */
+ error = VOP_PATHCONF(vp, _PC_ACL_ENABLED, &acl_styles, cr);
+
+ /* clear bits we don't understand and establish default acl_style */
+ acl_styles &= (_ACL_ACLENT_ENABLED | _ACL_ACE_ENABLED);
+ if (error || (acl_styles == 0))
+ acl_styles = _ACL_ACLENT_ENABLED;
+
+ vsecattr.vsa_aclentp = NULL;
+ vsecattr.vsa_dfaclentp = NULL;
+ vsecattr.vsa_aclcnt = 0;
+ vsecattr.vsa_dfaclcnt = 0;
+
+ while (acl_styles) {
+ /* select one of the styles as current flavor */
+ acl_flavor = 0;
+ if (acl_styles & _ACL_ACLENT_ENABLED) {
+ acl_flavor = _ACL_ACLENT_ENABLED;
+ vsecattr.vsa_mask = VSA_ACLCNT | VSA_DFACLCNT;
+ } else if (acl_styles & _ACL_ACE_ENABLED) {
+ acl_flavor = _ACL_ACE_ENABLED;
+ vsecattr.vsa_mask = VSA_ACECNT | VSA_ACE;
+ }
+
+ ASSERT(vsecattr.vsa_mask && acl_flavor);
+ error = VOP_GETSECATTR(vp, &vsecattr, 0, cr);
+ if (error == 0)
+ break;
+
+ /* that flavor failed */
+ acl_styles &= ~acl_flavor;
+ }
+
+ /* if all styles fail then assume trivial */
+ if (acl_styles == 0)
+ return (0);
+
+ /* process the flavor that worked */
+ isnontrivial = 0;
+ if (acl_flavor & _ACL_ACLENT_ENABLED) {
+ if (vsecattr.vsa_aclcnt > MIN_ACL_ENTRIES)
+ isnontrivial = 1;
+ if (vsecattr.vsa_aclcnt && vsecattr.vsa_aclentp != NULL)
+ kmem_free(vsecattr.vsa_aclentp,
+ vsecattr.vsa_aclcnt * sizeof (aclent_t));
+ if (vsecattr.vsa_dfaclcnt && vsecattr.vsa_dfaclentp != NULL)
+ kmem_free(vsecattr.vsa_dfaclentp,
+ vsecattr.vsa_dfaclcnt * sizeof (aclent_t));
+ }
+ if (acl_flavor & _ACL_ACE_ENABLED) {
+
+ isnontrivial = ace_trivial(vsecattr.vsa_aclentp,
+ vsecattr.vsa_aclcnt);
+
+ if (vsecattr.vsa_aclcnt && vsecattr.vsa_aclentp != NULL)
+ kmem_free(vsecattr.vsa_aclentp,
+ vsecattr.vsa_aclcnt * sizeof (ace_t));
+ /* ACE has no vsecattr.vsa_dfaclcnt */
+ }
+ return (isnontrivial);
+}
diff --git a/usr/src/uts/common/fs/fs_subr.h b/usr/src/uts/common/fs/fs_subr.h
index 27fc845718..8cd453edba 100644
--- a/usr/src/uts/common/fs/fs_subr.h
+++ b/usr/src/uts/common/fs/fs_subr.h
@@ -23,7 +23,7 @@
/* All Rights Reserved */
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -81,6 +81,7 @@ extern int fs_shrlock(struct vnode *, int, struct shrlock *, int,
cred_t *);
extern int fs_vnevent_nosupport(vnode_t *, vnevent_t);
extern int fs_vnevent_support(vnode_t *, vnevent_t);
+extern int fs_acl_nontrivial(struct vnode *vp, struct cred *cr);
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/fs/lookup.c b/usr/src/uts/common/fs/lookup.c
index 7fd7f66510..b7fdf996e2 100644
--- a/usr/src/uts/common/fs/lookup.c
+++ b/usr/src/uts/common/fs/lookup.c
@@ -789,7 +789,7 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf,
size_t dbuflen;
struct iovec iov;
struct uio uio;
- int err;
+ int error;
int eof;
vnode_t *cmpvp;
struct dirent64 *dp;
@@ -811,8 +811,8 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf,
uio.uio_extflg = UIO_COPY_CACHED;
uio.uio_loffset = 0;
- if ((err = VOP_ACCESS(dvp, VREAD, 0, cr)) != 0)
- return (err);
+ if ((error = VOP_ACCESS(dvp, VREAD, 0, cr)) != 0)
+ return (error);
while (!eof) {
uio.uio_resid = dlen;
@@ -820,12 +820,12 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf,
iov.iov_len = dlen;
(void) VOP_RWLOCK(dvp, V_WRITELOCK_FALSE, NULL);
- err = VOP_READDIR(dvp, &uio, cr, &eof);
+ error = VOP_READDIR(dvp, &uio, cr, &eof);
VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL);
dbuflen = dlen - uio.uio_resid;
- if (err || dbuflen == 0)
+ if (error || dbuflen == 0)
break;
dp = (dirent64_t *)dbuf;
@@ -840,7 +840,7 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf,
continue;
}
- err = VOP_LOOKUP(dvp, dp->d_name, &cmpvp, &pnp, 0,
+ error = VOP_LOOKUP(dvp, dp->d_name, &cmpvp, &pnp, 0,
vrootp, cr);
/*
@@ -849,7 +849,7 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf,
* just removed an entry since the readdir() call, and
* the entry we want is further on in the directory.
*/
- if (err == 0) {
+ if (error == 0) {
if (vnode_match(tvp, cmpvp, cr)) {
VN_RELE(cmpvp);
*rdp = dp;
@@ -857,8 +857,8 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf,
}
VN_RELE(cmpvp);
- } else if (err != ENOENT) {
- return (err);
+ } else if (error != ENOENT) {
+ return (error);
}
dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
@@ -868,13 +868,26 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf,
/*
* Something strange has happened, this directory does not contain the
* specified vnode. This should never happen in the normal case, since
- * we ensured that dvp is the parent of vp. This may be possible in
- * some race conditions, so fail gracefully.
+ * we ensured that dvp is the parent of vp. This is possible in some
+ * rare conditions (races and the special .zfs directory).
*/
- if (err == 0)
- err = ENOENT;
+ if (error == 0) {
+ error = VOP_LOOKUP(dvp, ".zfs", &cmpvp, &pnp, 0, vrootp, cr);
+ if (error == 0) {
+ if (vnode_match(tvp, cmpvp, cr)) {
+ (void) strcpy(dp->d_name, ".zfs");
+ dp->d_reclen = strlen(".zfs");
+ dp->d_off = 2;
+ dp->d_ino = 1;
+ *rdp = dp;
+ } else {
+ error = ENOENT;
+ }
+ VN_RELE(cmpvp);
+ }
+ }
- return (err);
+ return (error);
}
/*
diff --git a/usr/src/uts/common/fs/nfs/nfs4_acl.c b/usr/src/uts/common/fs/nfs/nfs4_acl.c
index 9b584f6256..96aa1756e9 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_acl.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_acl.c
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -80,10 +80,15 @@ static int ace4_list_to_aent(ace4_list_t *, aclent_t **, int *, uid_t, gid_t,
static int ln_ace4_to_aent(nfsace4 *ace4, int n, uid_t, gid_t,
aclent_t **, int *, aclent_t **, int *, int, int, int);
static int ace4_cmp(nfsace4 *, nfsace4 *);
-static int acet_to_ace4(ace_t *, nfsace4 *, int, int);
-static int ace4_to_acet(nfsace4 *, ace_t *, uid_t, gid_t, int, int, int);
+static int acet_to_ace4(ace_t *, nfsace4 *, int);
+static int ace4_to_acet(nfsace4 *, ace_t *, uid_t, gid_t, int, int);
static int validate_idmapping(utf8string *, uid_t, int, int, int);
static int u8s_mapped_to_nobody(utf8string *, uid_t, int);
+static void ace4_mask_to_acet_mask(acemask4, uint32_t *);
+static void acet_mask_to_ace4_mask(uint32_t, acemask4 *);
+static void ace4_flags_to_acet_flags(aceflag4, uint16_t *);
+static void acet_flags_to_ace4_flags(uint16_t, aceflag4 *);
+
/*
* The following two functions check and set ACE4_SYNCRONIZE, ACE4_WRITE_OWNER,
* ACE4_DELETE and ACE4_WRITE_ATTRIBUTES.
@@ -1651,7 +1656,7 @@ ln_ace4_cmp(nfsace4 *a, nfsace4* b, int n)
* strings versus integer uid/gids.
*/
static int
-acet_to_ace4(ace_t *ace, nfsace4 *nfsace4, int isdir, int isserver)
+acet_to_ace4(ace_t *ace, nfsace4 *nfsace4, int isserver)
{
int error = 0;
@@ -1669,44 +1674,45 @@ acet_to_ace4(ace_t *ace, nfsace4 *nfsace4, int isdir, int isserver)
}
switch (ace->a_type) {
- case ALLOW:
+ case ACE_ACCESS_ALLOWED_ACE_TYPE:
nfsace4->type = ACE4_ACCESS_ALLOWED_ACE_TYPE;
break;
- case DENY:
+ case ACE_ACCESS_DENIED_ACE_TYPE:
nfsace4->type = ACE4_ACCESS_DENIED_ACE_TYPE;
break;
default:
+ NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE,
+ "acet_to_ace4: unsupported type: %x", ace->a_type));
error = ENOTSUP;
break;
}
if (error != 0)
goto out;
- nfsace4->access_mask = mode_to_ace4_access(ace->a_access_mask,
- isdir, ace->a_flags & ACE_OWNER, ace->a_type == ALLOW, isserver);
+ acet_mask_to_ace4_mask(ace->a_access_mask, &nfsace4->access_mask);
+ acet_flags_to_ace4_flags(ace->a_flags, &nfsace4->flag);
- nfsace4->flag = (ace->a_flags & ACE_NFSV4_SUP_FLAGS);
- if (ace->a_flags & ACE_GROUPS) {
+ if (ace->a_flags & ACE_GROUP) {
+ nfsace4->flag |= ACE4_IDENTIFIER_GROUP;
+ (void) str_to_utf8(ACE4_WHO_GROUP, &nfsace4->who);
+ } else if (ace->a_flags & ACE_IDENTIFIER_GROUP) {
nfsace4->flag |= ACE4_IDENTIFIER_GROUP;
error = nfs_idmap_gid_str(ace->a_who, &nfsace4->who, isserver);
- } else if (ace->a_flags & ACE_USER) {
- error = nfs_idmap_uid_str(ace->a_who, &nfsace4->who, isserver);
+ if (error != 0)
+ NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE,
+ "acet_to_ace4: idmap failed with %d", error));
} else if (ace->a_flags & ACE_OWNER) {
(void) str_to_utf8(ACE4_WHO_OWNER, &nfsace4->who);
- } else if (ace->a_flags & ACE_GROUP) {
- nfsace4->flag |= ACE4_IDENTIFIER_GROUP;
- (void) str_to_utf8(ACE4_WHO_GROUP, &nfsace4->who);
- } else if (ace->a_flags & ACE_OTHER) {
+ } else if (ace->a_flags & ACE_EVERYONE) {
(void) str_to_utf8(ACE4_WHO_EVERYONE, &nfsace4->who);
+ } else {
+ error = nfs_idmap_uid_str(ace->a_who, &nfsace4->who, isserver);
+ if (error != 0)
+ NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE,
+ "acet_to_ace4: idmap failed with %d", error));
}
out:
-#ifdef DEBUG
- if (error != 0)
- NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE,
- "acet_to_ace4: idmap failed with %d", error));
-#endif
-
return (error);
}
@@ -1716,10 +1722,9 @@ out:
*/
static int
ace4_to_acet(nfsace4 *nfsace4, ace_t *ace, uid_t owner, gid_t group,
- int isdir, int isserver, int just_count)
+ int isserver, int just_count)
{
int error = 0;
- o_mode_t mode;
if (nfsace4 == NULL) {
NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE,
@@ -1734,12 +1739,14 @@ ace4_to_acet(nfsace4 *nfsace4, ace_t *ace, uid_t owner, gid_t group,
switch (nfsace4->type) {
case ACE4_ACCESS_ALLOWED_ACE_TYPE:
- ace->a_type = ALLOW;
+ ace->a_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
break;
case ACE4_ACCESS_DENIED_ACE_TYPE:
- ace->a_type = DENY;
+ ace->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
break;
default:
+ NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE,
+ "ace4_to_acet: unsupported type: %x", nfsace4->type));
error = ENOTSUP;
break;
}
@@ -1761,16 +1768,15 @@ ace4_to_acet(nfsace4 *nfsace4, ace_t *ace, uid_t owner, gid_t group,
goto out;
}
- ace->a_access_mask = nfsace4->access_mask;
- error = ace4_mask_to_mode(nfsace4->access_mask, &mode, isdir);
- if (error != 0)
- goto out;
- ace->a_access_mask = mode;
- if (nfsace4->flag & ~(ACE_NFSV4_SUP_FLAGS | ACE4_IDENTIFIER_GROUP)) {
+ ace4_mask_to_acet_mask(nfsace4->access_mask, &ace->a_access_mask);
+
+ if (nfsace4->flag & ~ACE_NFSV4_SUP_FLAGS) {
+ NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE,
+ "ace4_to_acet: unsupported flags: %x", nfsace4->flag));
error = ENOTSUP;
goto out;
}
- ace->a_flags = (nfsace4->flag & ACE_NFSV4_SUP_FLAGS);
+ ace4_flags_to_acet_flags(nfsace4->flag, &ace->a_flags);
if (nfsace4->flag & ACE4_IDENTIFIER_GROUP) {
if ((nfsace4->who.utf8string_len == 6) &&
@@ -1780,7 +1786,7 @@ ace4_to_acet(nfsace4 *nfsace4, ace_t *ace, uid_t owner, gid_t group,
ace->a_flags |= ACE_GROUP;
error = 0;
} else {
- ace->a_flags |= ACE_GROUPS;
+ ace->a_flags |= ACE_IDENTIFIER_GROUP;
error = nfs_idmap_str_gid(&nfsace4->who,
&ace->a_who, isserver);
if (error != 0) {
@@ -1807,10 +1813,9 @@ ace4_to_acet(nfsace4 *nfsace4, ace_t *ace, uid_t owner, gid_t group,
} else if ((nfsace4->who.utf8string_len == 9) &&
(bcmp(ACE4_WHO_EVERYONE,
nfsace4->who.utf8string_val, 9) == 0)) {
- ace->a_flags |= ACE_OTHER;
+ ace->a_flags |= ACE_EVERYONE;
ace->a_who = 0;
} else {
- ace->a_flags |= ACE_USER;
error = nfs_idmap_str_uid(&nfsace4->who,
&ace->a_who, isserver);
if (error != 0) {
@@ -1830,18 +1835,124 @@ ace4_to_acet(nfsace4 *nfsace4, ace_t *ace, uid_t owner, gid_t group,
}
out:
-#ifdef DEBUG
- if (error != 0)
- NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE,
- "ace4_to_acet: idmap failed with %d", error));
-#endif
-
return (error);
}
+static void
+ace4_mask_to_acet_mask(acemask4 ace4_mask, uint32_t *acet_mask)
+{
+ *acet_mask = 0;
+
+ if (ace4_mask & ACE4_READ_DATA)
+ *acet_mask |= ACE_READ_DATA;
+ if (ace4_mask & ACE4_WRITE_DATA)
+ *acet_mask |= ACE_WRITE_DATA;
+ if (ace4_mask & ACE4_APPEND_DATA)
+ *acet_mask |= ACE_APPEND_DATA;
+ if (ace4_mask & ACE4_READ_NAMED_ATTRS)
+ *acet_mask |= ACE_READ_NAMED_ATTRS;
+ if (ace4_mask & ACE4_WRITE_NAMED_ATTRS)
+ *acet_mask |= ACE_WRITE_NAMED_ATTRS;
+ if (ace4_mask & ACE4_EXECUTE)
+ *acet_mask |= ACE_EXECUTE;
+ if (ace4_mask & ACE4_DELETE_CHILD)
+ *acet_mask |= ACE_DELETE_CHILD;
+ if (ace4_mask & ACE4_READ_ATTRIBUTES)
+ *acet_mask |= ACE_READ_ATTRIBUTES;
+ if (ace4_mask & ACE4_WRITE_ATTRIBUTES)
+ *acet_mask |= ACE_WRITE_ATTRIBUTES;
+ if (ace4_mask & ACE4_DELETE)
+ *acet_mask |= ACE_DELETE;
+ if (ace4_mask & ACE4_READ_ACL)
+ *acet_mask |= ACE_READ_ACL;
+ if (ace4_mask & ACE4_WRITE_ACL)
+ *acet_mask |= ACE_WRITE_ACL;
+ if (ace4_mask & ACE4_WRITE_OWNER)
+ *acet_mask |= ACE_WRITE_OWNER;
+ if (ace4_mask & ACE4_SYNCHRONIZE)
+ *acet_mask |= ACE_SYNCHRONIZE;
+}
+
+static void
+acet_mask_to_ace4_mask(uint32_t acet_mask, acemask4 *ace4_mask)
+{
+ *ace4_mask = 0;
+
+ if (acet_mask & ACE_READ_DATA)
+ *ace4_mask |= ACE4_READ_DATA;
+ if (acet_mask & ACE_WRITE_DATA)
+ *ace4_mask |= ACE4_WRITE_DATA;
+ if (acet_mask & ACE_APPEND_DATA)
+ *ace4_mask |= ACE4_APPEND_DATA;
+ if (acet_mask & ACE_READ_NAMED_ATTRS)
+ *ace4_mask |= ACE4_READ_NAMED_ATTRS;
+ if (acet_mask & ACE_WRITE_NAMED_ATTRS)
+ *ace4_mask |= ACE4_WRITE_NAMED_ATTRS;
+ if (acet_mask & ACE_EXECUTE)
+ *ace4_mask |= ACE4_EXECUTE;
+ if (acet_mask & ACE_DELETE_CHILD)
+ *ace4_mask |= ACE4_DELETE_CHILD;
+ if (acet_mask & ACE_READ_ATTRIBUTES)
+ *ace4_mask |= ACE4_READ_ATTRIBUTES;
+ if (acet_mask & ACE_WRITE_ATTRIBUTES)
+ *ace4_mask |= ACE4_WRITE_ATTRIBUTES;
+ if (acet_mask & ACE_DELETE)
+ *ace4_mask |= ACE4_DELETE;
+ if (acet_mask & ACE_READ_ACL)
+ *ace4_mask |= ACE4_READ_ACL;
+ if (acet_mask & ACE_WRITE_ACL)
+ *ace4_mask |= ACE4_WRITE_ACL;
+ if (acet_mask & ACE_WRITE_OWNER)
+ *ace4_mask |= ACE4_WRITE_OWNER;
+ if (acet_mask & ACE_SYNCHRONIZE)
+ *ace4_mask |= ACE4_SYNCHRONIZE;
+}
+
+static void
+ace4_flags_to_acet_flags(aceflag4 ace4_flags, uint16_t *acet_flags)
+{
+ *acet_flags = 0;
+
+ if (ace4_flags & ACE4_FILE_INHERIT_ACE)
+ *acet_flags |= ACE_FILE_INHERIT_ACE;
+ if (ace4_flags & ACE4_DIRECTORY_INHERIT_ACE)
+ *acet_flags |= ACE_DIRECTORY_INHERIT_ACE;
+ if (ace4_flags & ACE4_NO_PROPAGATE_INHERIT_ACE)
+ *acet_flags |= ACE_NO_PROPAGATE_INHERIT_ACE;
+ if (ace4_flags & ACE4_INHERIT_ONLY_ACE)
+ *acet_flags |= ACE_INHERIT_ONLY_ACE;
+ if (ace4_flags & ACE4_SUCCESSFUL_ACCESS_ACE_FLAG)
+ *acet_flags |= ACE_SUCCESSFUL_ACCESS_ACE_FLAG;
+ if (ace4_flags & ACE4_FAILED_ACCESS_ACE_FLAG)
+ *acet_flags |= ACE_FAILED_ACCESS_ACE_FLAG;
+ if (ace4_flags & ACE4_IDENTIFIER_GROUP)
+ *acet_flags |= ACE_IDENTIFIER_GROUP;
+}
+
+static void
+acet_flags_to_ace4_flags(uint16_t acet_flags, aceflag4 *ace4_flags)
+{
+ *ace4_flags = 0;
+
+ if (acet_flags & ACE_FILE_INHERIT_ACE)
+ *ace4_flags |= ACE4_FILE_INHERIT_ACE;
+ if (acet_flags & ACE_DIRECTORY_INHERIT_ACE)
+ *ace4_flags |= ACE4_DIRECTORY_INHERIT_ACE;
+ if (acet_flags & ACE_NO_PROPAGATE_INHERIT_ACE)
+ *ace4_flags |= ACE4_NO_PROPAGATE_INHERIT_ACE;
+ if (acet_flags & ACE_INHERIT_ONLY_ACE)
+ *ace4_flags |= ACE4_INHERIT_ONLY_ACE;
+ if (acet_flags & ACE_SUCCESSFUL_ACCESS_ACE_FLAG)
+ *ace4_flags |= ACE4_SUCCESSFUL_ACCESS_ACE_FLAG;
+ if (acet_flags & ACE_FAILED_ACCESS_ACE_FLAG)
+ *ace4_flags |= ACE4_FAILED_ACCESS_ACE_FLAG;
+ if (acet_flags & ACE_IDENTIFIER_GROUP)
+ *ace4_flags |= ACE4_IDENTIFIER_GROUP;
+}
+
int
vs_ace4_to_acet(vsecattr_t *vs_ace4, vsecattr_t *vs_acet,
- uid_t owner, gid_t group, int isdir, int isserver, int just_count)
+ uid_t owner, gid_t group, int isserver, int just_count)
{
int error;
int i;
@@ -1865,7 +1976,7 @@ vs_ace4_to_acet(vsecattr_t *vs_ace4, vsecattr_t *vs_acet,
for (i = 0; i < vs_ace4->vsa_aclcnt; i++) {
error = ace4_to_acet((nfsace4 *)(vs_ace4->vsa_aclentp) + i,
(ace_t *)(vs_acet->vsa_aclentp) + i, owner, group,
- isdir, isserver, just_count);
+ isserver, just_count);
if (error != 0)
goto out;
}
@@ -1879,7 +1990,7 @@ out:
int
vs_acet_to_ace4(vsecattr_t *vs_acet, vsecattr_t *vs_ace4,
- int isdir, int isserver)
+ int isserver)
{
int error = 0;
int i;
@@ -1900,7 +2011,7 @@ vs_acet_to_ace4(vsecattr_t *vs_acet, vsecattr_t *vs_ace4,
for (i = 0; i < vs_acet->vsa_aclcnt; i++) {
error = acet_to_ace4((ace_t *)(vs_acet->vsa_aclentp) + i,
- (nfsace4 *)(vs_ace4->vsa_aclentp) + i, isdir, isserver);
+ (nfsace4 *)(vs_ace4->vsa_aclentp) + i, isserver);
if (error != 0)
goto out;
}
diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c b/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c
index 6ef0000ea3..6169621a73 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c
@@ -887,8 +887,7 @@ rfs4_fattr4_acl(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
if (error != 0)
break;
if (whichacl & _ACL_ACE_ENABLED) {
- error = vs_acet_to_ace4(&vs_native, &vs_ace4,
- vp->v_type == VDIR, TRUE);
+ error = vs_acet_to_ace4(&vs_native, &vs_ace4, TRUE);
vs_acet_destroy(&vs_native);
} else {
error = vs_aent_to_ace4(&vs_native, &vs_ace4,
@@ -968,8 +967,7 @@ rfs4_fattr4_acl(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
if (whichacl & _ACL_ACE_ENABLED) {
error = vs_ace4_to_acet(&vs_ace4, &vs_native,
- vap->va_uid, vap->va_gid, vp->v_type == VDIR, TRUE,
- FALSE);
+ vap->va_uid, vap->va_gid, TRUE, FALSE);
if (error != 0)
break;
(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
diff --git a/usr/src/uts/common/fs/nfs/nfs4_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_vnops.c
index d07cedb514..9ae1d0a56c 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c
@@ -11982,7 +11982,7 @@ nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr)
* These are ace_t type entries.
*/
error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap,
- vp->v_type == VDIR, FALSE);
+ FALSE);
if (error)
return (error);
}
@@ -12151,7 +12151,7 @@ nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap,
if (orig_mask & (VSA_ACE | VSA_ACECNT)) {
error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid,
- isdir, FALSE, ((orig_mask & VSA_ACE) ? FALSE : TRUE));
+ FALSE, ((orig_mask & VSA_ACE) ? FALSE : TRUE));
if (error)
return (error);
diff --git a/usr/src/uts/common/fs/nfs/nfs_acl_srv.c b/usr/src/uts/common/fs/nfs/nfs_acl_srv.c
index 836297350a..1242f94e10 100644
--- a/usr/src/uts/common/fs/nfs/nfs_acl_srv.c
+++ b/usr/src/uts/common/fs/nfs/nfs_acl_srv.c
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc.
+ * Copyright 2005 Sun Microsystems, Inc.
* All rights reserved.
* Use is subject to license terms.
*/
@@ -68,6 +68,8 @@
#include <nfs/nfs_clnt.h>
#include <nfs/nfs_acl.h>
+#include <fs/fs_subr.h>
+
/*
* These are the interface routines for the server side of the
* NFS ACL server. See the NFS ACL protocol specification
@@ -95,6 +97,25 @@ acl2_getacl(GETACL2args *args, GETACL2res *resp, struct exportinfo *exi,
error = VOP_GETSECATTR(vp, &resp->resok.acl, 0, cr);
+ if (error == ENOSYS) {
+ /*
+ * If the underlying file system doesn't support
+ * aclent_t type acls, fabricate an acl. This is
+	 * required in order to support existing clients
+ * that require the call to VOP_GETSECATTR to
+ * succeed while making the assumption that all
+ * file systems support aclent_t type acls. This
+ * causes problems for servers exporting ZFS file
+ * systems because ZFS supports ace_t type acls,
+ * and fails (with ENOSYS) when asked for aclent_t
+ * type acls.
+ *
+ * Note: if the fs_fab_acl() fails, we have other problems.
+ * This error should be returned to the caller.
+ */
+ error = fs_fab_acl(vp, &resp->resok.acl, 0, cr);
+ }
+
if (error) {
VN_RELE(vp);
resp->status = puterrno(error);
@@ -454,6 +475,25 @@ acl3_getacl(GETACL3args *args, GETACL3res *resp, struct exportinfo *exi,
error = VOP_GETSECATTR(vp, &resp->resok.acl, 0, cr);
+ if (error == ENOSYS) {
+ /*
+ * If the underlying file system doesn't support
+ * aclent_t type acls, fabricate an acl. This is
+	 * required in order to support existing clients
+ * that require the call to VOP_GETSECATTR to
+ * succeed while making the assumption that all
+ * file systems support aclent_t type acls. This
+ * causes problems for servers exporting ZFS file
+ * systems because ZFS supports ace_t type acls,
+ * and fails (with ENOSYS) when asked for aclent_t
+ * type acls.
+ *
+ * Note: if the fs_fab_acl() fails, we have other problems.
+ * This error should be returned to the caller.
+ */
+ error = fs_fab_acl(vp, &resp->resok.acl, 0, cr);
+ }
+
if (error)
goto out;
diff --git a/usr/src/uts/common/fs/proc/prioctl.c b/usr/src/uts/common/fs/proc/prioctl.c
index 79f486e9b1..844a3b7bb1 100644
--- a/usr/src/uts/common/fs/proc/prioctl.c
+++ b/usr/src/uts/common/fs/proc/prioctl.c
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -498,7 +498,7 @@ startover:
*/
t = pr_thread(pnp); /* returns locked thread */
thread_unlock(t);
- oprgetstatus(t, &un.prstat, VTOZ(vp));
+ oprgetstatus(t, &un.prstat, VTOZONE(vp));
prunlock(pnp);
if (copyout(&un.prstat, cmaddr, sizeof (un.prstat)))
error = EFAULT;
@@ -835,7 +835,7 @@ startover:
break;
case PIOCSTATUS: /* get process/lwp status */
- oprgetstatus(t, &un.prstat, VTOZ(vp));
+ oprgetstatus(t, &un.prstat, VTOZONE(vp));
prunlock(pnp);
if (copyout(&un.prstat, cmaddr, sizeof (un.prstat)))
error = EFAULT;
@@ -866,13 +866,13 @@ startover:
Bprsp = thing;
thing = NULL;
prsp = Bprsp;
- oprgetstatus(t, prsp, VTOZ(vp));
+ oprgetstatus(t, prsp, VTOZONE(vp));
t = p->p_tlist;
do {
ASSERT(!(t->t_proc_flag & TP_LWPEXIT));
ASSERT(nlwp > 0);
--nlwp;
- oprgetstatus(t, ++prsp, VTOZ(vp));
+ oprgetstatus(t, ++prsp, VTOZONE(vp));
} while ((t = t->t_forw) != p->p_tlist);
ASSERT(nlwp == 0);
prunlock(pnp);
@@ -2053,7 +2053,7 @@ startover:
*/
t = pr_thread(pnp); /* returns locked thread */
thread_unlock(t);
- oprgetstatus32(t, &un32.prstat, VTOZ(vp));
+ oprgetstatus32(t, &un32.prstat, VTOZONE(vp));
prunlock(pnp);
if (copyout(&un32.prstat, cmaddr, sizeof (un32.prstat)))
error = EFAULT;
@@ -2430,7 +2430,7 @@ startover:
error = EOVERFLOW;
break;
}
- oprgetstatus32(t, &un32.prstat, VTOZ(vp));
+ oprgetstatus32(t, &un32.prstat, VTOZONE(vp));
prunlock(pnp);
if (copyout(&un32.prstat, cmaddr, sizeof (un32.prstat)))
error = EFAULT;
@@ -2471,13 +2471,13 @@ startover:
Bprsp = (prstatus32_t *)thing;
thing = NULL;
prsp = Bprsp;
- oprgetstatus32(t, prsp, VTOZ(vp));
+ oprgetstatus32(t, prsp, VTOZONE(vp));
t = p->p_tlist;
do {
ASSERT(!(t->t_proc_flag & TP_LWPEXIT));
ASSERT(nlwp > 0);
--nlwp;
- oprgetstatus32(t, ++prsp, VTOZ(vp));
+ oprgetstatus32(t, ++prsp, VTOZONE(vp));
} while ((t = t->t_forw) != p->p_tlist);
ASSERT(nlwp == 0);
prunlock(pnp);
diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c
index dea54056c6..d12ee64e8c 100644
--- a/usr/src/uts/common/fs/proc/prvnops.c
+++ b/usr/src/uts/common/fs/proc/prvnops.c
@@ -709,7 +709,7 @@ pr_read_status(prnode_t *pnp, uio_t *uiop)
*/
sp = kmem_alloc(sizeof (*sp), KM_SLEEP);
if ((error = prlock(pnp, ZNO)) == 0) {
- prgetstatus(pnp->pr_common->prc_proc, sp, VTOZ(PTOV(pnp)));
+ prgetstatus(pnp->pr_common->prc_proc, sp, VTOZONE(PTOV(pnp)));
prunlock(pnp);
error = pr_uioread(sp, sizeof (*sp), uiop);
}
@@ -753,7 +753,7 @@ pr_read_lstatus(prnode_t *pnp, uio_t *uiop)
if (ldp->ld_entry == NULL ||
(t = ldp->ld_entry->le_thread) == NULL)
continue;
- prgetlwpstatus(t, sp, VTOZ(PTOV(pnp)));
+ prgetlwpstatus(t, sp, VTOZONE(PTOV(pnp)));
sp = (lwpstatus_t *)((caddr_t)sp + LSPAN(lwpstatus_t));
}
prunlock(pnp);
@@ -1426,7 +1426,7 @@ pr_read_lwpstatus(prnode_t *pnp, uio_t *uiop)
goto out;
}
- prgetlwpstatus(pnp->pr_common->prc_thread, sp, VTOZ(PTOV(pnp)));
+ prgetlwpstatus(pnp->pr_common->prc_thread, sp, VTOZONE(PTOV(pnp)));
prunlock(pnp);
error = pr_uioread(sp, sizeof (*sp), uiop);
@@ -1799,7 +1799,7 @@ pr_read_status_32(prnode_t *pnp, uio_t *uiop)
error = EOVERFLOW;
} else {
prgetstatus32(pnp->pr_common->prc_proc, sp,
- VTOZ(PTOV(pnp)));
+ VTOZONE(PTOV(pnp)));
prunlock(pnp);
error = pr_uioread(sp, sizeof (*sp), uiop);
}
@@ -1852,7 +1852,7 @@ pr_read_lstatus_32(prnode_t *pnp, uio_t *uiop)
if (ldp->ld_entry == NULL ||
(t = ldp->ld_entry->le_thread) == NULL)
continue;
- prgetlwpstatus32(t, sp, VTOZ(PTOV(pnp)));
+ prgetlwpstatus32(t, sp, VTOZONE(PTOV(pnp)));
sp = (lwpstatus32_t *)((caddr_t)sp + LSPAN32(lwpstatus32_t));
}
prunlock(pnp);
@@ -2471,7 +2471,7 @@ pr_read_lwpstatus_32(prnode_t *pnp, uio_t *uiop)
goto out;
}
- prgetlwpstatus32(pnp->pr_common->prc_thread, sp, VTOZ(PTOV(pnp)));
+ prgetlwpstatus32(pnp->pr_common->prc_thread, sp, VTOZONE(PTOV(pnp)));
prunlock(pnp);
error = pr_uioread(sp, sizeof (*sp), uiop);
@@ -4281,9 +4281,9 @@ pr_lookup_ctdir(vnode_t *dp, char *comp)
* outside the zone. (see logic in contract_status_common)
*/
if ((ct->ct_owner != p) &&
- !(p == VTOZ(dp)->zone_zsched && ct->ct_state < CTS_ORPHAN &&
- VTOZ(dp)->zone_uniqid == contract_getzuniqid(ct) &&
- VTOZ(dp)->zone_uniqid != GLOBAL_ZONEUNIQID &&
+ !(p == VTOZONE(dp)->zone_zsched && ct->ct_state < CTS_ORPHAN &&
+ VTOZONE(dp)->zone_uniqid == contract_getzuniqid(ct) &&
+ VTOZONE(dp)->zone_uniqid != GLOBAL_ZONEUNIQID &&
ct->ct_czuniqid == GLOBAL_ZONEUNIQID)) {
prunlock(dpnp);
prfreenode(pnp);
@@ -4668,7 +4668,7 @@ pr_readdir_procdir(prnode_t *pnp, uio_t *uiop, int *eofp)
ASSERT(pnp->pr_type == PR_PROCDIR);
- zoneid = VTOZ(PTOV(pnp))->zone_id;
+ zoneid = VTOZONE(PTOV(pnp))->zone_id;
if ((error = gfs_readdir_init(&gstate, PNSIZ, PRSDSIZE, uiop,
PRROOTINO, PRROOTINO)) != 0)
@@ -5453,7 +5453,7 @@ pr_readdir_ctdir(prnode_t *pnp, uio_t *uiop, int *eofp)
return (error);
}
- zid = VTOZ(pnp->pr_vnode)->zone_uniqid;
+ zid = VTOZONE(pnp->pr_vnode)->zone_uniqid;
while ((error = gfs_readdir_pred(&gstate, uiop, &n)) == 0) {
id_t next = contract_plookup(p, n, zid);
if (next == -1) {
diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c
index 1e7793ba39..4d562852af 100644
--- a/usr/src/uts/common/fs/vnode.c
+++ b/usr/src/uts/common/fs/vnode.c
@@ -720,28 +720,37 @@ top:
vsec.vsa_dfaclcnt = 0;
vsec.vsa_dfaclentp = NULL;
vsec.vsa_mask = VSA_DFACLCNT;
- if (error = VOP_GETSECATTR(dvp, &vsec, 0, CRED())) {
+ error = VOP_GETSECATTR(dvp, &vsec, 0, CRED());
+ /*
+ * If error is ENOSYS then treat it as no error
+ * Don't want to force all file systems to support
+ * aclent_t style of ACL's.
+ */
+ if (error == ENOSYS)
+ error = 0;
+ if (error) {
if (*vpp != NULL)
VN_RELE(*vpp);
goto out;
- }
-
- /*
- * Apply the umask if no default ACLs.
- */
- if (vsec.vsa_dfaclcnt == 0)
- vap->va_mode &= ~umask;
+ } else {
+ /*
+ * Apply the umask if no default ACLs.
+ */
+ if (vsec.vsa_dfaclcnt == 0)
+ vap->va_mode &= ~umask;
- /*
- * VOP_GETSECATTR() may have allocated memory for ACLs we
- * didn't request, so double-check and free it if necessary.
- */
- if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
- kmem_free((caddr_t)vsec.vsa_aclentp,
- vsec.vsa_aclcnt * sizeof (aclent_t));
- if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
- kmem_free((caddr_t)vsec.vsa_dfaclentp,
- vsec.vsa_dfaclcnt * sizeof (aclent_t));
+ /*
+ * VOP_GETSECATTR() may have allocated memory for
+ * ACLs we didn't request, so double-check and
+ * free it if necessary.
+ */
+ if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
+ kmem_free((caddr_t)vsec.vsa_aclentp,
+ vsec.vsa_aclcnt * sizeof (aclent_t));
+ if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
+ kmem_free((caddr_t)vsec.vsa_dfaclentp,
+ vsec.vsa_dfaclcnt * sizeof (aclent_t));
+ }
}
/*
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
new file mode 100644
index 0000000000..0a6cc7b658
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -0,0 +1,1998 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * DVA-based Adjustable Replacement Cache
+ *
+ * While much of the theory of operation and algorithms used here
+ * are based on the self-tuning, low overhead replacement cache
+ * presented by Megiddo and Modha at FAST 2003, there are some
+ * significant differences:
+ *
+ * 1. The Megiddo and Modha model assumes any page is evictable.
+ * Pages in its cache cannot be "locked" into memory. This makes
+ * the eviction algorithm simple: evict the last page in the list.
+ * This also make the performance characteristics easy to reason
+ * about. Our cache is not so simple. At any given moment, some
+ * subset of the blocks in the cache are un-evictable because we
+ * have handed out a reference to them. Blocks are only evictable
+ * when there are no external references active. This makes
+ * eviction far more problematic: we choose to evict the evictable
+ * blocks that are the "lowest" in the list.
+ *
+ * There are times when it is not possible to evict the requested
+ * space. In these circumstances we are unable to adjust the cache
+ * size. To prevent the cache growing unbounded at these times we
+ * implement a "cache throttle" that slows the flow of new data
+ * into the cache until we can make space available.
+ *
+ * 2. The Megiddo and Modha model assumes a fixed cache size.
+ * Pages are evicted when the cache is full and there is a cache
+ * miss. Our model has a variable sized cache. It grows with
+ * high use, but also tries to react to memory pressure from the
+ * operating system: decreasing its size when system memory is
+ * tight.
+ *
+ * 3. The Megiddo and Modha model assumes a fixed page size. All
+ * elements of the cache are therefore exactly the same size. So
+ * when adjusting the cache size following a cache miss, it's simply
+ * a matter of choosing a single page to evict. In our model, we
+ * have variable sized cache blocks (ranging from 512 bytes to
+ * 128K bytes). We therefore choose a set of blocks to evict to make
+ * space for a cache miss that approximates as closely as possible
+ * the space used by the new block.
+ *
+ * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
+ * by N. Megiddo & D. Modha, FAST 2003
+ */
+
+/*
+ * The locking model:
+ *
+ * A new reference to a cache buffer can be obtained in two
+ * ways: 1) via a hash table lookup using the DVA as a key,
+ * or 2) via one of the ARC lists. The arc_read() interface
+ * uses method 1, while the internal arc algorithms for
+ * adjusting the cache use method 2. We therefore provide two
+ * types of locks: 1) the hash table lock array, and 2) the
+ * arc list locks.
+ *
+ * Buffers do not have their own mutexes, rather they rely on the
+ * hash table mutexes for the bulk of their protection (i.e. most
+ * fields in the arc_buf_hdr_t are protected by these mutexes).
+ *
+ * buf_hash_find() returns the appropriate mutex (held) when it
+ * locates the requested buffer in the hash table. It returns
+ * NULL for the mutex if the buffer was not in the table.
+ *
+ * buf_hash_remove() expects the appropriate hash mutex to be
+ * already held before it is invoked.
+ *
+ * Each arc state also has a mutex which is used to protect the
+ * buffer list associated with the state. When attempting to
+ * obtain a hash table lock while holding an arc list lock you
+ * must use: mutex_tryenter() to avoid deadlock. Also note that
+ * the "top" state mutex must be held before the "bot" state mutex.
+ *
+ * Note that the majority of the performance stats are manipulated
+ * with atomic operations.
+ */
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/arc.h>
+#include <sys/refcount.h>
+#ifdef _KERNEL
+#include <sys/vmsystm.h>
+#include <vm/anon.h>
+#include <sys/fs/swapnode.h>
+#endif
+#include <sys/callb.h>
+
+static kmutex_t arc_reclaim_thr_lock;
+static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
+static uint8_t arc_thread_exit;
+
+typedef enum arc_reclaim_strategy {
+ ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
+ ARC_RECLAIM_CONS /* Conservative reclaim strategy */
+} arc_reclaim_strategy_t;
+
+/* number of seconds before growing cache again */
+static int arc_grow_retry = 60;
+
+static kmutex_t arc_reclaim_lock;
+static int arc_dead;
+
+/*
+ * Note that buffers can be on one of 5 states:
+ * ARC_anon - anonymous (discussed below)
+ * ARC_mru_top - recently used, currently cached
+ * ARC_mru_bot - recently used, no longer in cache
+ * ARC_mfu_top - frequently used, currently cached
+ * ARC_mfu_bot - frequently used, no longer in cache
+ * When there are no active references to the buffer, they
+ * are linked onto one of the lists in arc. These are the
+ * only buffers that can be evicted or deleted.
+ *
+ * Anonymous buffers are buffers that are not associated with
+ * a DVA. These are buffers that hold dirty block copies
+ * before they are written to stable storage. By definition,
+ * they are "ref'd" and are considered part of arc_mru_top
+ * that cannot be freed. Generally, they will acquire a DVA
+ * as they are written and migrate onto the arc_mru_top list.
+ */
+
+typedef struct arc_state {
+ list_t list; /* linked list of evictable buffer in state */
+ uint64_t lsize; /* total size of buffers in the linked list */
+ uint64_t size; /* total size of all buffers in this state */
+ uint64_t hits;
+ kmutex_t mtx;
+} arc_state_t;
+
+/* The 5 states: */
+static arc_state_t ARC_anon;
+static arc_state_t ARC_mru_top;
+static arc_state_t ARC_mru_bot;
+static arc_state_t ARC_mfu_top;
+static arc_state_t ARC_mfu_bot;
+
+static struct arc {
+ arc_state_t *anon;
+ arc_state_t *mru_top;
+ arc_state_t *mru_bot;
+ arc_state_t *mfu_top;
+ arc_state_t *mfu_bot;
+ uint64_t size; /* Actual total arc size */
+ uint64_t p; /* Target size (in bytes) of mru_top */
+ uint64_t c; /* Target size of cache (in bytes) */
+ uint64_t c_min; /* Minimum target cache size */
+ uint64_t c_max; /* Maximum target cache size */
+ uint64_t incr; /* Size by which to increment arc.c */
+ int64_t size_check;
+
+ /* performance stats */
+ uint64_t hits;
+ uint64_t misses;
+ uint64_t deleted;
+ uint64_t skipped;
+ uint64_t hash_elements;
+ uint64_t hash_elements_max;
+ uint64_t hash_collisions;
+ uint64_t hash_chains;
+ uint32_t hash_chain_max;
+
+ int no_grow; /* Don't try to grow cache size */
+} arc;
+
+/* Default amount to grow arc.incr */
+static int64_t arc_incr_size = 1024;
+
+/* > 0 ==> time to increment arc.c */
+static int64_t arc_size_check_default = -1000;
+
+static uint64_t arc_tempreserve;
+
+typedef struct arc_callback arc_callback_t;
+
+struct arc_callback {
+ arc_done_func_t *acb_done;
+ void *acb_private;
+ arc_byteswap_func_t *acb_byteswap;
+ arc_buf_t *acb_buf;
+ zio_t *acb_zio_dummy;
+ arc_callback_t *acb_next;
+};
+
+struct arc_buf_hdr {
+ /* immutable */
+ uint64_t b_size;
+ spa_t *b_spa;
+
+ /* protected by hash lock */
+ dva_t b_dva;
+ uint64_t b_birth;
+ uint64_t b_cksum0;
+
+ arc_buf_hdr_t *b_hash_next;
+ arc_buf_t *b_buf;
+ uint32_t b_flags;
+
+ kcondvar_t b_cv;
+ arc_callback_t *b_acb;
+
+ /* protected by arc state mutex */
+ arc_state_t *b_state;
+ list_node_t b_arc_node;
+
+ /* updated atomically */
+ clock_t b_arc_access;
+
+ /* self protecting */
+ refcount_t b_refcnt;
+};
+
+/*
+ * Private ARC flags. These flags are private ARC only flags that will show up
+ * in b_flags in the arc_buf_hdr_t. Some flags are publicly declared, and can
+ * be passed in as arc_flags in things like arc_read. However, these flags
+ * should never be passed and should only be set by ARC code. When adding new
+ * public flags, make sure not to smash the private ones.
+ */
+
+#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
+#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
+#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
+
+#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
+#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
+#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
+
+/*
+ * Hash table routines
+ */
+
+#define HT_LOCK_PAD 64
+
+struct ht_lock {
+ kmutex_t ht_lock;
+#ifdef _KERNEL
+ unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
+#endif
+};
+
+#define BUF_LOCKS 256
+typedef struct buf_hash_table {
+ uint64_t ht_mask;
+ arc_buf_hdr_t **ht_table;
+ struct ht_lock ht_locks[BUF_LOCKS];
+} buf_hash_table_t;
+
+static buf_hash_table_t buf_hash_table;
+
+#define BUF_HASH_INDEX(spa, dva, birth) \
+ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
+#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
+#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
+#define HDR_LOCK(buf) \
+ (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
+
+uint64_t zfs_crc64_table[256];
+
+static uint64_t
+buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
+{
+ uintptr_t spav = (uintptr_t)spa;
+ uint8_t *vdva = (uint8_t *)dva;
+ uint64_t crc = -1ULL;
+ int i;
+
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+
+ for (i = 0; i < sizeof (dva_t); i++)
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
+
+ crc ^= (spav>>8) ^ birth;
+
+ return (crc);
+}
+
+#define BUF_EMPTY(buf) \
+ ((buf)->b_dva.dva_word[0] == 0 && \
+ (buf)->b_dva.dva_word[1] == 0 && \
+ (buf)->b_birth == 0)
+
+#define BUF_EQUAL(spa, dva, birth, buf) \
+ ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
+ ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
+ ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
+
+static arc_buf_hdr_t *
+buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp)
+{
+ uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
+ kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
+ arc_buf_hdr_t *buf;
+
+ mutex_enter(hash_lock);
+ for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
+ buf = buf->b_hash_next) {
+ if (BUF_EQUAL(spa, dva, birth, buf)) {
+ *lockp = hash_lock;
+ return (buf);
+ }
+ }
+ mutex_exit(hash_lock);
+ *lockp = NULL;
+ return (NULL);
+}
+
+/*
+ * Insert an entry into the hash table. If there is already an element
+ * equal to elem in the hash table, then the already existing element
+ * will be returned and the new element will not be inserted.
+ * Otherwise returns NULL.
+ */
+static arc_buf_hdr_t *fbufs[4]; /* XXX to find 6341326 */
+static kthread_t *fbufs_lastthread;
+static arc_buf_hdr_t *
+buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
+{
+ uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
+ kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
+ arc_buf_hdr_t *fbuf;
+ uint32_t max, i;
+
+ fbufs_lastthread = curthread;
+ *lockp = hash_lock;
+ mutex_enter(hash_lock);
+ for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
+ fbuf = fbuf->b_hash_next, i++) {
+ if (i < sizeof (fbufs) / sizeof (fbufs[0]))
+ fbufs[i] = fbuf;
+ if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
+ return (fbuf);
+ }
+
+ buf->b_hash_next = buf_hash_table.ht_table[idx];
+ buf_hash_table.ht_table[idx] = buf;
+
+ /* collect some hash table performance data */
+ if (i > 0) {
+ atomic_add_64(&arc.hash_collisions, 1);
+ if (i == 1)
+ atomic_add_64(&arc.hash_chains, 1);
+ }
+ while (i > (max = arc.hash_chain_max) &&
+ max != atomic_cas_32(&arc.hash_chain_max, max, i)) {
+ continue;
+ }
+ atomic_add_64(&arc.hash_elements, 1);
+ if (arc.hash_elements > arc.hash_elements_max)
+ atomic_add_64(&arc.hash_elements_max, 1);
+
+ return (NULL);
+}
+
+static void
+buf_hash_remove(arc_buf_hdr_t *buf)
+{
+ arc_buf_hdr_t *fbuf, **bufp;
+ uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
+
+ ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
+
+ bufp = &buf_hash_table.ht_table[idx];
+ while ((fbuf = *bufp) != buf) {
+ ASSERT(fbuf != NULL);
+ bufp = &fbuf->b_hash_next;
+ }
+ *bufp = buf->b_hash_next;
+ buf->b_hash_next = NULL;
+
+ /* collect some hash table performance data */
+ atomic_add_64(&arc.hash_elements, -1);
+ if (buf_hash_table.ht_table[idx] &&
+ buf_hash_table.ht_table[idx]->b_hash_next == NULL)
+ atomic_add_64(&arc.hash_chains, -1);
+}
+
+/*
+ * Global data structures and functions for the buf kmem cache.
+ */
+static kmem_cache_t *hdr_cache;
+static kmem_cache_t *buf_cache;
+
+static void
+buf_fini(void)
+{
+ int i;
+
+ kmem_free(buf_hash_table.ht_table,
+ (buf_hash_table.ht_mask + 1) * sizeof (void *));
+ for (i = 0; i < BUF_LOCKS; i++)
+ mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
+ kmem_cache_destroy(hdr_cache);
+ kmem_cache_destroy(buf_cache);
+}
+
+/*
+ * Constructor callback - called when the cache is empty
+ * and a new buf is requested.
+ */
+/* ARGSUSED */
+static int
+hdr_cons(void *vbuf, void *unused, int kmflag)
+{
+ arc_buf_hdr_t *buf = vbuf;
+
+ bzero(buf, sizeof (arc_buf_hdr_t));
+ refcount_create(&buf->b_refcnt);
+ cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
+ return (0);
+}
+
+/*
+ * Destructor callback - called when a cached buf is
+ * no longer required.
+ */
+/* ARGSUSED */
+static void
+hdr_dest(void *vbuf, void *unused)
+{
+ arc_buf_hdr_t *buf = vbuf;
+
+ refcount_destroy(&buf->b_refcnt);
+ cv_destroy(&buf->b_cv);
+}
+
+void arc_kmem_reclaim(void);
+
+/*
+ * Reclaim callback -- invoked when memory is low.
+ */
+/* ARGSUSED */
+static void
+hdr_recl(void *unused)
+{
+ dprintf("hdr_recl called\n");
+ arc_kmem_reclaim();
+}
+
+static void
+buf_init(void)
+{
+ uint64_t *ct;
+ uint64_t hsize = 1ULL << 10;
+ int i, j;
+
+ /*
+ * The hash table is big enough to fill all of physical memory
+ * with an average 4k block size. The table will take up
+ * totalmem*sizeof(void*)/4k bytes (eg. 2MB/GB with 8-byte
+ * pointers).
+ */
+ while (hsize * 4096 < physmem * PAGESIZE)
+ hsize <<= 1;
+
+ buf_hash_table.ht_mask = hsize - 1;
+ buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
+
+ hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
+ 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
+ buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ for (i = 0; i < 256; i++)
+ for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
+ *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
+
+ for (i = 0; i < BUF_LOCKS; i++) {
+ mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+ }
+}
+
+#define ARC_MINTIME (hz>>4) /* 62 ms */
+
+#define ARC_TAG (void *)0x05201962
+
+static void
+add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
+{
+ ASSERT(MUTEX_HELD(hash_lock));
+
+ if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
+ (ab->b_state != arc.anon)) {
+
+ ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
+ mutex_enter(&ab->b_state->mtx);
+ ASSERT(!refcount_is_zero(&ab->b_refcnt));
+ ASSERT(list_link_active(&ab->b_arc_node));
+ list_remove(&ab->b_state->list, ab);
+ ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
+ ab->b_state->lsize -= ab->b_size;
+ mutex_exit(&ab->b_state->mtx);
+ }
+}
+
+static int
+remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
+{
+ int cnt;
+
+ ASSERT(MUTEX_HELD(hash_lock));
+
+ if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
+ (ab->b_state != arc.anon)) {
+
+ ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
+ mutex_enter(&ab->b_state->mtx);
+ ASSERT(!list_link_active(&ab->b_arc_node));
+ list_insert_head(&ab->b_state->list, ab);
+ ASSERT(ab->b_buf != NULL);
+ ab->b_state->lsize += ab->b_size;
+ mutex_exit(&ab->b_state->mtx);
+ }
+ return (cnt);
+}
+
+/*
+ * Move the supplied buffer to the indicated state. The mutex
+ * for the buffer must be held by the caller.
+ */
+static void
+arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab,
+ kmutex_t *hash_lock)
+{
+ arc_buf_t *buf;
+
+ ASSERT(MUTEX_HELD(hash_lock));
+
+ /*
+ * If this buffer is evictable, transfer it from the
+ * old state list to the new state list.
+ */
+ if (refcount_is_zero(&ab->b_refcnt)) {
+ if (ab->b_state != arc.anon) {
+ int drop_mutex = FALSE;
+
+ if (!MUTEX_HELD(&ab->b_state->mtx)) {
+ mutex_enter(&ab->b_state->mtx);
+ drop_mutex = TRUE;
+ }
+ ASSERT(list_link_active(&ab->b_arc_node));
+ list_remove(&ab->b_state->list, ab);
+ ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
+ ab->b_state->lsize -= ab->b_size;
+ if (drop_mutex)
+ mutex_exit(&ab->b_state->mtx);
+ }
+ if (new_state != arc.anon) {
+ int drop_mutex = FALSE;
+
+ if (!MUTEX_HELD(&new_state->mtx)) {
+ mutex_enter(&new_state->mtx);
+ drop_mutex = TRUE;
+ }
+ list_insert_head(&new_state->list, ab);
+ ASSERT(ab->b_buf != NULL);
+ new_state->lsize += ab->b_size;
+ if (drop_mutex)
+ mutex_exit(&new_state->mtx);
+ }
+ }
+
+ ASSERT(!BUF_EMPTY(ab));
+ if (new_state == arc.anon && ab->b_state != arc.anon) {
+ buf_hash_remove(ab);
+ }
+
+ /*
+ * If this buffer isn't being transferred to the MRU-top
+ * state, it's safe to clear its prefetch flag
+ */
+ if ((new_state != arc.mru_top) && (new_state != arc.mru_bot)) {
+ ab->b_flags &= ~ARC_PREFETCH;
+ }
+
+ buf = ab->b_buf;
+ if (buf == NULL) {
+ ASSERT3U(ab->b_state->size, >=, ab->b_size);
+ atomic_add_64(&ab->b_state->size, -ab->b_size);
+ /* we should only be here if we are deleting state */
+ ASSERT(new_state == arc.anon &&
+ (ab->b_state == arc.mru_bot || ab->b_state == arc.mfu_bot));
+ } else while (buf) {
+ ASSERT3U(ab->b_state->size, >=, ab->b_size);
+ atomic_add_64(&ab->b_state->size, -ab->b_size);
+ atomic_add_64(&new_state->size, ab->b_size);
+ buf = buf->b_next;
+ }
+ ab->b_state = new_state;
+}
+
+arc_buf_t *
+arc_buf_alloc(spa_t *spa, int size, void *tag)
+{
+ arc_buf_hdr_t *hdr;
+ arc_buf_t *buf;
+
+ ASSERT3U(size, >, 0);
+ hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
+ ASSERT(BUF_EMPTY(hdr));
+ hdr->b_size = size;
+ hdr->b_spa = spa;
+ hdr->b_state = arc.anon;
+ hdr->b_arc_access = 0;
+ buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+ buf->b_hdr = hdr;
+ buf->b_next = NULL;
+ buf->b_data = zio_buf_alloc(size);
+ hdr->b_buf = buf;
+ hdr->b_flags = 0;
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+ (void) refcount_add(&hdr->b_refcnt, tag);
+
+ atomic_add_64(&arc.size, size);
+ atomic_add_64(&arc.anon->size, size);
+
+ return (buf);
+}
+
+static void
+arc_hdr_free(arc_buf_hdr_t *hdr)
+{
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+ ASSERT3P(hdr->b_state, ==, arc.anon);
+
+ if (!BUF_EMPTY(hdr)) {
+ /*
+ * We can be called with an arc state lock held,
+ * so we can't hold a hash lock here.
+ * ASSERT(not in hash table)
+ */
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ bzero(&hdr->b_dva, sizeof (dva_t));
+ hdr->b_birth = 0;
+ hdr->b_cksum0 = 0;
+ }
+ if (hdr->b_buf) {
+ arc_buf_t *buf = hdr->b_buf;
+
+ ASSERT3U(hdr->b_size, >, 0);
+ zio_buf_free(buf->b_data, hdr->b_size);
+ atomic_add_64(&arc.size, -hdr->b_size);
+ ASSERT3U(arc.anon->size, >=, hdr->b_size);
+ atomic_add_64(&arc.anon->size, -hdr->b_size);
+ ASSERT3P(buf->b_next, ==, NULL);
+ kmem_cache_free(buf_cache, buf);
+ hdr->b_buf = NULL;
+ }
+ ASSERT(!list_link_active(&hdr->b_arc_node));
+ ASSERT3P(hdr->b_hash_next, ==, NULL);
+ ASSERT3P(hdr->b_acb, ==, NULL);
+ kmem_cache_free(hdr_cache, hdr);
+}
+
+void
+arc_buf_free(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+ int freeable;
+
+ mutex_enter(hash_lock);
+ if (remove_reference(hdr, hash_lock, tag) > 0) {
+ arc_buf_t **bufp = &hdr->b_buf;
+ arc_state_t *state = hdr->b_state;
+ uint64_t size = hdr->b_size;
+
+ ASSERT(hdr->b_state != arc.anon || HDR_IO_ERROR(hdr));
+ while (*bufp != buf) {
+ ASSERT(*bufp);
+ bufp = &(*bufp)->b_next;
+ }
+ *bufp = buf->b_next;
+ mutex_exit(hash_lock);
+ zio_buf_free(buf->b_data, size);
+ atomic_add_64(&arc.size, -size);
+ kmem_cache_free(buf_cache, buf);
+ ASSERT3U(state->size, >=, size);
+ atomic_add_64(&state->size, -size);
+ return;
+ }
+
+ /* don't free buffers that are in the middle of an async write */
+ freeable = (hdr->b_state == arc.anon && hdr->b_acb == NULL);
+ mutex_exit(hash_lock);
+
+ if (freeable)
+ arc_hdr_free(hdr);
+}
+
+int
+arc_buf_size(arc_buf_t *buf)
+{
+ return (buf->b_hdr->b_size);
+}
+
+/*
+ * Evict buffers from list until we've removed the specified number of
+ * bytes. Move the removed buffers to the appropriate evict state.
+ */
+static uint64_t
+arc_evict_state(arc_state_t *state, int64_t bytes)
+{
+ arc_state_t *evicted_state;
+ uint64_t bytes_evicted = 0;
+ arc_buf_hdr_t *ab, *ab_prev;
+ kmutex_t *hash_lock;
+
+ ASSERT(state == arc.mru_top || state == arc.mfu_top);
+
+ if (state == arc.mru_top)
+ evicted_state = arc.mru_bot;
+ else
+ evicted_state = arc.mfu_bot;
+
+ mutex_enter(&state->mtx);
+ mutex_enter(&evicted_state->mtx);
+
+ for (ab = list_tail(&state->list); ab; ab = ab_prev) {
+ ab_prev = list_prev(&state->list, ab);
+ hash_lock = HDR_LOCK(ab);
+ if (mutex_tryenter(hash_lock)) {
+ ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
+ arc_change_state(evicted_state, ab, hash_lock);
+ zio_buf_free(ab->b_buf->b_data, ab->b_size);
+ atomic_add_64(&arc.size, -ab->b_size);
+ ASSERT3P(ab->b_buf->b_next, ==, NULL);
+ kmem_cache_free(buf_cache, ab->b_buf);
+ ab->b_buf = NULL;
+ DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
+ bytes_evicted += ab->b_size;
+ mutex_exit(hash_lock);
+ if (bytes_evicted >= bytes)
+ break;
+ } else {
+ atomic_add_64(&arc.skipped, 1);
+ }
+ }
+ mutex_exit(&evicted_state->mtx);
+ mutex_exit(&state->mtx);
+
+ if (bytes_evicted < bytes)
+ dprintf("only evicted %lld bytes from %x",
+ (longlong_t)bytes_evicted, state);
+
+ return (bytes_evicted);
+}
+
+/*
+ * Remove buffers from list until we've removed the specified number of
+ * bytes.  Destroy the buffers that are removed.
+ *
+ * A negative 'bytes' (e.g. -1 from arc_flush()) means "drain the list
+ * completely": instead of skipping a buffer whose hash lock is busy, we
+ * drop the list lock, block until that hash lock is released, and then
+ * restart the scan from the tail.
+ */
+static void
+arc_delete_state(arc_state_t *state, int64_t bytes)
+{
+ uint_t bufs_skipped = 0;
+ uint64_t bytes_deleted = 0;
+ arc_buf_hdr_t *ab, *ab_prev;
+ kmutex_t *hash_lock;
+
+top:
+ mutex_enter(&state->mtx);
+ for (ab = list_tail(&state->list); ab; ab = ab_prev) {
+ ab_prev = list_prev(&state->list, ab);
+ hash_lock = HDR_LOCK(ab);
+ if (mutex_tryenter(hash_lock)) {
+ /* take it off all lists, then destroy the header */
+ arc_change_state(arc.anon, ab, hash_lock);
+ mutex_exit(hash_lock);
+ atomic_add_64(&arc.deleted, 1);
+ DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
+ bytes_deleted += ab->b_size;
+ arc_hdr_free(ab);
+ if (bytes >= 0 && bytes_deleted >= bytes)
+ break;
+ } else {
+ if (bytes < 0) {
+ /*
+ * Full drain: wait for whoever holds the hash
+ * lock to finish, then rescan the whole list.
+ */
+ mutex_exit(&state->mtx);
+ mutex_enter(hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+ bufs_skipped += 1;
+ }
+ }
+ mutex_exit(&state->mtx);
+
+ if (bufs_skipped) {
+ atomic_add_64(&arc.skipped, bufs_skipped);
+ ASSERT(bytes >= 0);
+ }
+
+ /*
+ * NOTE(review): when 'bytes' is negative it is promoted to a huge
+ * unsigned value in this comparison, so the dprintf also fires
+ * after a full drain -- presumably harmless debug output; confirm.
+ */
+ if (bytes_deleted < bytes)
+ dprintf("only deleted %lld bytes from %p",
+ (longlong_t)bytes_deleted, state);
+}
+
+/*
+ * Bring the ARC lists back within their target sizes:
+ *  - evict from mru_top until anon + mru_top fits within target p,
+ *  - trim the mru ghost list so anon + mru_top + mru_bot fits within c,
+ *  - evict from mfu_top while total arc.size exceeds c,
+ *  - trim the mfu ghost list so the whole directory (resident data
+ *    plus both ghost lists) fits within 2c.
+ */
+static void
+arc_adjust(void)
+{
+ int64_t top_sz, mru_over, arc_over;
+
+ top_sz = arc.anon->size + arc.mru_top->size;
+
+ if (top_sz > arc.p && arc.mru_top->lsize > 0) {
+ /* the MRU side is over its target; evict the excess */
+ int64_t toevict = MIN(arc.mru_top->lsize, top_sz-arc.p);
+ (void) arc_evict_state(arc.mru_top, toevict);
+ top_sz = arc.anon->size + arc.mru_top->size;
+ }
+
+ mru_over = top_sz + arc.mru_bot->size - arc.c;
+
+ if (mru_over > 0) {
+ if (arc.mru_bot->lsize > 0) {
+ /* shrink the MRU ghost list */
+ int64_t todelete = MIN(arc.mru_bot->lsize, mru_over);
+ arc_delete_state(arc.mru_bot, todelete);
+ }
+ }
+
+ if ((arc_over = arc.size - arc.c) > 0) {
+ int64_t table_over;
+
+ if (arc.mfu_top->lsize > 0) {
+ /* total cache is over target; evict from the MFU side */
+ int64_t toevict = MIN(arc.mfu_top->lsize, arc_over);
+ (void) arc_evict_state(arc.mfu_top, toevict);
+ }
+
+ table_over = arc.size + arc.mru_bot->lsize + arc.mfu_bot->lsize
+ - arc.c*2;
+
+ if (table_over > 0 && arc.mfu_bot->lsize > 0) {
+ /* keep the directory (data + ghosts) within 2c */
+ int64_t todelete = MIN(arc.mfu_bot->lsize, table_over);
+ arc_delete_state(arc.mfu_bot, todelete);
+ }
+ }
+}
+
+/*
+ * Flush all *evictable* data from the cache.
+ * NOTE: this will not touch "active" (i.e. referenced) data.
+ *
+ * Each call drains the given list completely (bytes == -1), including
+ * the ghost lists, which hold only headers.
+ */
+void
+arc_flush(void)
+{
+ arc_delete_state(arc.mru_top, -1);
+ arc_delete_state(arc.mfu_top, -1);
+
+ arc_delete_state(arc.mru_bot, -1);
+ arc_delete_state(arc.mfu_bot, -1);
+}
+
+/*
+ * Shrink the cache in response to memory pressure: reduce the target
+ * cache size (arc.c) and target MRU size (arc.p) by 1/16th (6.25%)
+ * each, evict down to the new targets, and suppress growth for a
+ * while.  Called from the kmem reclaim callback.
+ */
+void
+arc_kmem_reclaim(void)
+{
+ /*
+ * We need arc_reclaim_lock because we don't want multiple
+ * threads trying to reclaim concurrently.
+ */
+
+ /*
+ * umem calls the reclaim func when we destroy the buf cache,
+ * which is after we do arc_fini(). So we set a flag to prevent
+ * accessing the destroyed mutexes and lists.
+ */
+ if (arc_dead)
+ return;
+
+ mutex_enter(&arc_reclaim_lock);
+
+ /* remove 6.25% from each target, clamping c at c_min */
+ atomic_add_64(&arc.c, -(arc.c >> 4));
+ if (arc.c < arc.c_min)
+ arc.c = arc.c_min;
+ atomic_add_64(&arc.p, -(arc.p >> 4));
+
+ arc_adjust();
+
+ /* Cool it for a while */
+ arc.incr = 0;
+ arc.size_check = arc_size_check_default << 3;
+
+ mutex_exit(&arc_reclaim_lock);
+}
+
+/*
+ * Decide whether the system is under enough memory pressure that the
+ * ARC should shrink.  In userland builds (no _KERNEL) this instead
+ * returns true randomly ~1% of the time, for test coverage.
+ */
+static int
+arc_reclaim_needed(void)
+{
+ uint64_t extra;
+
+#ifdef _KERNEL
+ /*
+ * take 'desfree' extra pages, so we reclaim sooner, rather than later
+ */
+ extra = desfree;
+
+ /*
+ * check that we're out of range of the pageout scanner. It starts to
+ * schedule paging if freemem is less than lotsfree and needfree.
+ * lotsfree is the high-water mark for pageout, and needfree is the
+ * number of needed free pages. We add extra pages here to make sure
+ * the scanner doesn't start up while we're freeing memory.
+ */
+ if (freemem < lotsfree + needfree + extra)
+ return (1);
+
+ /*
+ * check to make sure that swapfs has enough space so that anon
+ * reservations can still succeed. anon_resvmem() checks that the
+ * availrmem is greater than swapfs_minfree, and the number of reserved
+ * swap pages. We also add a bit of extra here just to prevent
+ * circumstances from getting really dire.
+ */
+ if (availrmem < swapfs_minfree + swapfs_reserve + extra)
+ return (1);
+
+ /*
+ * If we're on an i386 platform, it's possible that we'll exhaust the
+ * kernel heap space before we ever run out of available physical
+ * memory. Most checks of the size of the heap_area compare against
+ * tune.t_minarmem, which is the minimum available real memory that we
+ * can have in the system. However, this is generally fixed at 25 pages
+ * which is so low that it's useless. In this comparison, we seek to
+ * calculate the total heap-size, and reclaim if more than 3/4ths of the
+ * heap is allocated. (Or, in the calculation, if less than 1/4th is
+ * free)
+ */
+#if defined(__i386)
+ if (btop(vmem_size(heap_arena, VMEM_FREE)) <
+ (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
+ return (1);
+#endif
+
+#else
+ if (spa_get_random(100) == 0)
+ return (1);
+#endif
+ return (0);
+}
+
+/*
+ * Reap free buffers from the kmem caches backing the ARC.  An
+ * aggressive strategy also reaps the header cache, whose reclaim
+ * callback (arc_kmem_reclaim) shrinks the cache targets as well.
+ */
+static void
+arc_kmem_reap_now(arc_reclaim_strategy_t strat)
+{
+ size_t i;
+ kmem_cache_t *prev_cache = NULL;
+ extern kmem_cache_t *zio_buf_cache[];
+
+ /*
+ * an aggressive reclamation will shrink the cache size as well as reap
+ * free kmem buffers. The arc_kmem_reclaim function is called when the
+ * header-cache is reaped, so we only reap the header cache if we're
+ * performing an aggressive reclaim. If we're not, just clean the kmem
+ * buffer caches.
+ */
+ if (strat == ARC_RECLAIM_AGGR)
+ kmem_cache_reap_now(hdr_cache);
+
+ kmem_cache_reap_now(buf_cache);
+
+ /* adjacent sizes may share a cache; skip duplicates */
+ for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
+ if (zio_buf_cache[i] != prev_cache) {
+ prev_cache = zio_buf_cache[i];
+ kmem_cache_reap_now(zio_buf_cache[i]);
+ }
+ }
+}
+
+/*
+ * Reclaim daemon.  Wakes at least once a second (or when signalled on
+ * arc_reclaim_thr_cv), reaps the kmem caches while memory is tight --
+ * alternating conservative and aggressive passes -- and re-enables
+ * cache growth once arc_grow_retry seconds have passed without
+ * pressure.  Exits when arc_fini() sets arc_thread_exit.
+ */
+static void
+arc_reclaim_thread(void)
+{
+ clock_t growtime = 0;
+ arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
+ callb_cpr_t cpr;
+
+ CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
+
+ mutex_enter(&arc_reclaim_thr_lock);
+ while (arc_thread_exit == 0) {
+ if (arc_reclaim_needed()) {
+
+ if (arc.no_grow) {
+ /* already throttled: alternate strategies */
+ if (last_reclaim == ARC_RECLAIM_CONS) {
+ last_reclaim = ARC_RECLAIM_AGGR;
+ } else {
+ last_reclaim = ARC_RECLAIM_CONS;
+ }
+ } else {
+ /* first sign of pressure: stop growing */
+ arc.no_grow = TRUE;
+ last_reclaim = ARC_RECLAIM_AGGR;
+ membar_producer();
+ }
+
+ /* reset the growth delay for every reclaim */
+ growtime = lbolt + (arc_grow_retry * hz);
+
+ arc_kmem_reap_now(last_reclaim);
+
+ } else if ((growtime > 0) && ((growtime - lbolt) <= 0)) {
+ /* pressure has subsided long enough; allow growth */
+ arc.no_grow = FALSE;
+ }
+
+ /* block until needed, or one second, whichever is shorter */
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_timedwait(&arc_reclaim_thr_cv,
+ &arc_reclaim_thr_lock, (lbolt + hz));
+ CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
+ }
+
+ /* let arc_fini() know we're gone */
+ arc_thread_exit = 0;
+ cv_broadcast(&arc_reclaim_thr_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
+ thread_exit();
+}
+
+/*
+ * Possibly grow the target cache size (arc.c) in response to a new
+ * allocation of 'bytes' bytes.  We grow if we're within
+ * (2 * maxblocksize) of the target, or already over it; we don't grow
+ * if the system needs memory (in which case the reclaim thread is
+ * poked instead) or growth has been disabled by the reclaim thread.
+ */
+static void
+arc_try_grow(int64_t bytes)
+{
+	/* count this allocation attempt */
+	atomic_add_64((uint64_t *)&arc.size_check, 1);
+
+	if (arc_reclaim_needed()) {
+		/* memory is tight: reclaim instead of growing */
+		cv_signal(&arc_reclaim_thr_cv);
+		return;
+	}
+
+	if (arc.no_grow)
+		return;
+
+	/*
+	 * Grow when we're close to (or past) the target cache size.
+	 * The two conditions formerly had duplicated, byte-identical
+	 * branch bodies; they are merged here.  Note that if
+	 * arc.size > arc.c the unsigned subtraction wraps and the
+	 * first test is false, so the second test is what fires.
+	 */
+	if ((arc.c - arc.size) <= (2ULL << SPA_MAXBLOCKSHIFT) ||
+	    arc.size > arc.c) {
+		if (arc.size_check > 0) {
+			arc.size_check = arc_size_check_default;
+			atomic_add_64(&arc.incr, arc_incr_size);
+		}
+		/* bump c (clamped at c_max); track p along with it */
+		atomic_add_64(&arc.c, MIN(bytes, arc.incr));
+		if (arc.c > arc.c_max)
+			arc.c = arc.c_max;
+		else
+			atomic_add_64(&arc.p, MIN(bytes, arc.incr));
+	}
+}
+
+/*
+ * check if the cache has reached its limits and eviction is required
+ * prior to insert.  In this situation, we want to evict if no_grow is
+ * set.  Otherwise, the cache is either big enough that we can insert,
+ * or an arc_try_grow will result in more space being made available.
+ */
+
+static int
+arc_evict_needed()
+{
+	if (arc_reclaim_needed())
+		return (1);
+
+	/* evict when growth is disabled or we're already over target */
+	return (arc.no_grow || (arc.c > arc.c_max) || (arc.size > arc.c));
+}
+
+/*
+ * The state, supplied as the first argument, is going to have something
+ * inserted on its behalf. So, determine which cache must be victimized to
+ * satisfy an insertion for this state. We have the following cases:
+ *
+ * 1. Insert for MRU, p > sizeof(arc.anon + arc.mru_top) ->
+ * In this situation if we're out of space, but the resident size of the MFU is
+ * under the limit, victimize the MFU cache to satisfy this insertion request.
+ *
+ * 2. Insert for MRU, p <= sizeof(arc.anon + arc.mru_top) ->
+ * Here, we've used up all of the available space for the MRU, so we need to
+ * evict from our own cache instead.  Evict from the set of resident MRU
+ * entries.
+ *
+ * 3. Insert for MFU (c - p) > sizeof(arc.mfu_top) ->
+ * c minus p represents the MFU space in the cache, since p is the size of the
+ * cache that is dedicated to the MRU.  In this situation there's still space on
+ * the MFU side, so the MRU side needs to be victimized.
+ *
+ * 4. Insert for MFU (c - p) < sizeof(arc.mfu_top) ->
+ * MFU's resident set is consuming more space than it has been allotted.  In
+ * this situation, we must victimize our own cache, the MFU, for this insertion.
+ */
+static void
+arc_evict_for_state(arc_state_t *state, uint64_t bytes)
+{
+	arc_state_t *victim;
+	uint64_t evicted;
+
+	ASSERT(state == arc.mru_top || state == arc.mfu_top);
+
+	/*
+	 * The four cases above reduce to choosing a victim state; the
+	 * eviction itself (and the fallback arc_adjust() when the
+	 * victim couldn't supply enough bytes) is identical for all.
+	 */
+	if (state == arc.mru_top) {
+		uint64_t mru_used = arc.anon->size + arc.mru_top->size;
+
+		/* case 1 if the MRU still fits in p, else case 2 */
+		victim = (arc.p > mru_used) ? arc.mfu_top : arc.mru_top;
+	} else {
+		uint64_t mfu_space = arc.c - arc.p;
+
+		/* case 3 if the MFU still fits in c - p, else case 4 */
+		victim = (mfu_space > arc.mfu_top->size) ?
+		    arc.mru_top : arc.mfu_top;
+	}
+
+	evicted = arc_evict_state(victim, bytes);
+	if (evicted < bytes) {
+		arc_adjust();
+	}
+}
+
+/*
+ * This routine is called whenever a buffer is accessed.
+ *
+ * It implements the ARC state transitions: anon -> mru_top on first
+ * insert, mru_top -> mfu_top on re-access, and resurrection from the
+ * ghost lists (mru_bot/mfu_bot) back to a resident state, adapting
+ * the MRU target size p as ghost hits occur.  Called with the
+ * buffer's hash lock held.
+ */
+static void
+arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
+{
+ int blksz, mult;
+
+ ASSERT(MUTEX_HELD(hash_lock));
+
+ blksz = buf->b_size;
+
+ if (buf->b_state == arc.anon) {
+ /*
+ * This buffer is not in the cache, and does not
+ * appear in our "ghost" list. Add the new buffer
+ * to the MRU state.
+ */
+
+ arc_try_grow(blksz);
+ if (arc_evict_needed()) {
+ arc_evict_for_state(arc.mru_top, blksz);
+ }
+
+ ASSERT(buf->b_arc_access == 0);
+ buf->b_arc_access = lbolt;
+ DTRACE_PROBE1(new_state__mru_top, arc_buf_hdr_t *,
+ buf);
+ arc_change_state(arc.mru_top, buf, hash_lock);
+
+ /*
+ * If we are using less than 2/3 of our total target
+ * cache size, bump up the target size for the MRU
+ * list.
+ */
+ if (arc.size < arc.c*2/3) {
+ arc.p = arc.anon->size + arc.mru_top->size + arc.c/6;
+ }
+
+ } else if (buf->b_state == arc.mru_top) {
+ /*
+ * If this buffer is in the MRU-top state and has the prefetch
+ * flag, the first read was actually part of a prefetch. In
+ * this situation, we simply want to clear the flag and return.
+ * A subsequent access should bump this into the MFU state.
+ */
+ if ((buf->b_flags & ARC_PREFETCH) != 0) {
+ buf->b_flags &= ~ARC_PREFETCH;
+ atomic_add_64(&arc.mru_top->hits, 1);
+ return;
+ }
+
+ /*
+ * This buffer has been "accessed" only once so far,
+ * but it is still in the cache. Move it to the MFU
+ * state.
+ */
+ if (lbolt > buf->b_arc_access + ARC_MINTIME) {
+ /*
+ * More than 125ms have passed since we
+ * instantiated this buffer. Move it to the
+ * most frequently used state.
+ */
+ buf->b_arc_access = lbolt;
+ DTRACE_PROBE1(new_state__mfu_top,
+ arc_buf_hdr_t *, buf);
+ arc_change_state(arc.mfu_top, buf, hash_lock);
+ }
+ atomic_add_64(&arc.mru_top->hits, 1);
+ } else if (buf->b_state == arc.mru_bot) {
+ arc_state_t *new_state;
+ /*
+ * This buffer has been "accessed" recently, but
+ * was evicted from the cache. Move it to the
+ * MFU state.
+ */
+
+ if (buf->b_flags & ARC_PREFETCH) {
+ new_state = arc.mru_top;
+ DTRACE_PROBE1(new_state__mru_top,
+ arc_buf_hdr_t *, buf);
+ } else {
+ new_state = arc.mfu_top;
+ DTRACE_PROBE1(new_state__mfu_top,
+ arc_buf_hdr_t *, buf);
+ }
+
+ arc_try_grow(blksz);
+ if (arc_evict_needed()) {
+ arc_evict_for_state(new_state, blksz);
+ }
+
+ /*
+ * Bump up the target size of the MRU list.
+ * The divisor is safe here: this buffer resides on mru_bot,
+ * so mru_bot->size should be nonzero -- TODO confirm the
+ * size accounting guarantees this.
+ */
+ mult = ((arc.mru_bot->size >= arc.mfu_bot->size) ?
+ 1 : (arc.mfu_bot->size/arc.mru_bot->size));
+ arc.p = MIN(arc.c, arc.p + blksz * mult);
+
+ buf->b_arc_access = lbolt;
+ arc_change_state(new_state, buf, hash_lock);
+
+ atomic_add_64(&arc.mru_bot->hits, 1);
+ } else if (buf->b_state == arc.mfu_top) {
+ /*
+ * This buffer has been accessed more than once and is
+ * still in the cache. Keep it in the MFU state.
+ *
+ * NOTE: the add_reference() that occurred when we did
+ * the arc_read() should have kicked this off the list,
+ * so even if it was a prefetch, it will be put back at
+ * the head of the list when we remove_reference().
+ */
+ atomic_add_64(&arc.mfu_top->hits, 1);
+ } else if (buf->b_state == arc.mfu_bot) {
+ /*
+ * This buffer has been accessed more than once but has
+ * been evicted from the cache. Move it back to the
+ * MFU state.
+ */
+
+ arc_try_grow(blksz);
+ if (arc_evict_needed()) {
+ arc_evict_for_state(arc.mfu_top, blksz);
+ }
+
+ /* Bump up the target size for the MFU list (shrink p) */
+ mult = ((arc.mfu_bot->size >= arc.mru_bot->size) ?
+ 1 : (arc.mru_bot->size/arc.mfu_bot->size));
+ arc.p = MAX(0, (int64_t)arc.p - blksz * mult);
+
+ buf->b_arc_access = lbolt;
+ DTRACE_PROBE1(new_state__mfu_top,
+ arc_buf_hdr_t *, buf);
+ arc_change_state(arc.mfu_top, buf, hash_lock);
+
+ atomic_add_64(&arc.mfu_bot->hits, 1);
+ } else {
+ ASSERT(!"invalid arc state");
+ }
+
+}
+
+/* a generic arc_done_func_t which you can use */
+/* ARGSUSED */
+void
+arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+	uint64_t len = buf->b_hdr->b_size;
+
+	/* copy the data into the caller's buffer, then drop our hold */
+	bcopy(buf->b_data, arg, len);
+	arc_buf_free(buf, arg);
+}
+
+/* a generic arc_done_func_t which you can use */
+void
+arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+	arc_buf_t **bufp = arg;
+
+	if (zio == NULL || zio->io_error == 0) {
+		/* success (or a cache hit, where zio is NULL) */
+		*bufp = buf;
+	} else {
+		/* the read failed: release the buf, hand back NULL */
+		arc_buf_free(buf, arg);
+		*bufp = NULL;
+	}
+}
+
+/*
+ * zio completion callback for arc_read().  Byteswaps the data if
+ * necessary, hands a buffer to each callback queued on the header
+ * (copying the data for all but the first), moves the header to its
+ * new state, and frees the header if the block was arc_free()d while
+ * the read was in flight.
+ */
+static void
+arc_read_done(zio_t *zio)
+{
+ arc_buf_hdr_t *hdr;
+ arc_buf_t *buf;
+ arc_buf_t *abuf; /* buffer we're assigning to callback */
+ kmutex_t *hash_lock;
+ arc_callback_t *callback_list, *acb;
+ int freeable = FALSE;
+
+ buf = zio->io_private;
+ hdr = buf->b_hdr;
+
+ if (!HDR_FREED_IN_READ(hdr)) {
+ arc_buf_hdr_t *found;
+
+ found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
+ &hash_lock);
+
+ /*
+ * Buffer was inserted into hash-table and removed from lists
+ * prior to starting I/O. We should find this header, since
+ * it's in the hash table, and it should be legit since it's
+ * not possible to evict it during the I/O.
+ */
+
+ ASSERT(found);
+ ASSERT(DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)));
+ }
+
+ /* byteswap if necessary */
+ callback_list = hdr->b_acb;
+ ASSERT(callback_list != NULL);
+ if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
+ callback_list->acb_byteswap(buf->b_data, hdr->b_size);
+
+ /* create copies of the data buffer for the callers */
+ abuf = buf;
+ for (acb = callback_list; acb; acb = acb->acb_next) {
+ if (acb->acb_done) {
+ /* the first caller gets 'buf'; the rest get copies */
+ if (abuf == NULL) {
+ abuf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+ abuf->b_data = zio_buf_alloc(hdr->b_size);
+ atomic_add_64(&arc.size, hdr->b_size);
+ bcopy(buf->b_data, abuf->b_data, hdr->b_size);
+ abuf->b_hdr = hdr;
+ abuf->b_next = hdr->b_buf;
+ hdr->b_buf = abuf;
+ atomic_add_64(&hdr->b_state->size, hdr->b_size);
+ }
+ acb->acb_buf = abuf;
+ abuf = NULL;
+ } else {
+ /*
+ * The caller did not provide a callback function.
+ * In this case, we should just remove the reference.
+ */
+ if (HDR_FREED_IN_READ(hdr)) {
+ ASSERT3P(hdr->b_state, ==, arc.anon);
+ (void) refcount_remove(&hdr->b_refcnt,
+ acb->acb_private);
+ } else {
+ (void) remove_reference(hdr, hash_lock,
+ acb->acb_private);
+ }
+ }
+ }
+ hdr->b_acb = NULL;
+ hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+
+ ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
+
+ if (zio->io_error != 0) {
+ /* failed read: pull the header out of the cache */
+ hdr->b_flags |= ARC_IO_ERROR;
+ if (hdr->b_state != arc.anon)
+ arc_change_state(arc.anon, hdr, hash_lock);
+ freeable = refcount_is_zero(&hdr->b_refcnt);
+ }
+
+ if (!HDR_FREED_IN_READ(hdr)) {
+ /*
+ * Only call arc_access on anonymous buffers. This is because
+ * if we've issued an I/O for an evicted buffer, we've already
+ * called arc_access (to prevent any simultaneous readers from
+ * getting confused).
+ */
+ if (zio->io_error == 0 && hdr->b_state == arc.anon)
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+ } else {
+ /*
+ * This block was freed while we waited for the read to
+ * complete. It has been removed from the hash table and
+ * moved to the anonymous state (so that it won't show up
+ * in the cache).
+ */
+ ASSERT3P(hdr->b_state, ==, arc.anon);
+ freeable = refcount_is_zero(&hdr->b_refcnt);
+ }
+
+ /* wake up any ARC_WAIT readers blocked in arc_read() */
+ cv_broadcast(&hdr->b_cv);
+
+ /* execute each callback and free its structure */
+ while ((acb = callback_list) != NULL) {
+ if (acb->acb_done)
+ acb->acb_done(zio, acb->acb_buf, acb->acb_private);
+
+ if (acb->acb_zio_dummy != NULL) {
+ /* propagate our error to the waiting dummy zio */
+ acb->acb_zio_dummy->io_error = zio->io_error;
+ zio_nowait(acb->acb_zio_dummy);
+ }
+
+ callback_list = acb->acb_next;
+ kmem_free(acb, sizeof (arc_callback_t));
+ }
+
+ if (freeable)
+ arc_hdr_free(hdr);
+}
+
+/*
+ * "Read" the block at the specified DVA (in bp) via the
+ * cache. If the block is found in the cache, invoke the provided
+ * callback immediately and return. Note that the `zio' parameter
+ * in the callback will be NULL in this case, since no IO was
+ * required. If the block is not in the cache pass the read request
+ * on to the spa with a substitute callback function, so that the
+ * requested block will be added to the cache.
+ *
+ * If a read request arrives for a block that has a read in-progress,
+ * either wait for the in-progress read to complete (and return the
+ * results); or, if this is a read with a "done" func, add a record
+ * to the read to invoke the "done" func when the read completes,
+ * and return; or just return.
+ *
+ * arc_read_done() will invoke all the requested "done" functions
+ * for readers of this block.
+ */
+int
+arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
+ arc_done_func_t *done, void *private, int priority, int flags,
+ uint32_t arc_flags)
+{
+ arc_buf_hdr_t *hdr;
+ arc_buf_t *buf;
+ kmutex_t *hash_lock;
+ zio_t *rzio;
+
+top:
+ hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+ if (hdr && hdr->b_buf) {
+
+ /* cache hit (possibly with the read still in flight) */
+ ASSERT((hdr->b_state == arc.mru_top) ||
+ (hdr->b_state == arc.mfu_top) ||
+ ((hdr->b_state == arc.anon) &&
+ (HDR_IO_IN_PROGRESS(hdr))));
+
+ if (HDR_IO_IN_PROGRESS(hdr)) {
+
+ if ((arc_flags & ARC_NOWAIT) && done) {
+ /* queue our callback on the in-flight read */
+ arc_callback_t *acb = NULL;
+
+ acb = kmem_zalloc(sizeof (arc_callback_t),
+ KM_SLEEP);
+ acb->acb_done = done;
+ acb->acb_private = private;
+ acb->acb_byteswap = swap;
+ if (pio != NULL)
+ acb->acb_zio_dummy = zio_null(pio,
+ spa, NULL, NULL, flags);
+
+ ASSERT(acb->acb_done != NULL);
+ acb->acb_next = hdr->b_acb;
+ hdr->b_acb = acb;
+ add_reference(hdr, hash_lock, private);
+ mutex_exit(hash_lock);
+ return (0);
+ } else if (arc_flags & ARC_WAIT) {
+ /* block until the read completes, then retry */
+ cv_wait(&hdr->b_cv, hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+
+ mutex_exit(hash_lock);
+ return (0);
+ }
+
+ /*
+ * If there is already a reference on this block, create
+ * a new copy of the data so that we will be guaranteed
+ * that arc_release() will always succeed.
+ */
+
+ if (done)
+ add_reference(hdr, hash_lock, private);
+ if (done && refcount_count(&hdr->b_refcnt) > 1) {
+ buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+ buf->b_data = zio_buf_alloc(hdr->b_size);
+ ASSERT3U(refcount_count(&hdr->b_refcnt), >, 1);
+ atomic_add_64(&arc.size, hdr->b_size);
+ bcopy(hdr->b_buf->b_data, buf->b_data, hdr->b_size);
+ buf->b_hdr = hdr;
+ buf->b_next = hdr->b_buf;
+ hdr->b_buf = buf;
+ atomic_add_64(&hdr->b_state->size, hdr->b_size);
+ } else {
+ buf = hdr->b_buf;
+ }
+ DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+ atomic_add_64(&arc.hits, 1);
+ if (done)
+ done(NULL, buf, private);
+ } else {
+ /* cache miss (or ghost hit: header only, no data) */
+ uint64_t size = BP_GET_LSIZE(bp);
+ arc_callback_t *acb;
+
+ if (hdr == NULL) {
+ /* this block is not in the cache */
+ arc_buf_hdr_t *exists;
+
+ buf = arc_buf_alloc(spa, size, private);
+ hdr = buf->b_hdr;
+ hdr->b_dva = *BP_IDENTITY(bp);
+ hdr->b_birth = bp->blk_birth;
+ hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
+ exists = buf_hash_insert(hdr, &hash_lock);
+ if (exists) {
+ /* somebody beat us to the hash insert */
+ mutex_exit(hash_lock);
+ bzero(&hdr->b_dva, sizeof (dva_t));
+ hdr->b_birth = 0;
+ hdr->b_cksum0 = 0;
+ arc_buf_free(buf, private);
+ goto top; /* restart the IO request */
+ }
+
+ } else {
+ /* this block is in the ghost cache */
+ ASSERT((hdr->b_state == arc.mru_bot) ||
+ (hdr->b_state == arc.mfu_bot));
+ add_reference(hdr, hash_lock, private);
+
+ /* reattach a data buffer to the ghost header */
+ buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+ buf->b_data = zio_buf_alloc(hdr->b_size);
+ atomic_add_64(&arc.size, hdr->b_size);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
+ buf->b_hdr = hdr;
+ buf->b_next = NULL;
+ hdr->b_buf = buf;
+ }
+
+ acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
+ acb->acb_done = done;
+ acb->acb_private = private;
+ acb->acb_byteswap = swap;
+
+ ASSERT(hdr->b_acb == NULL);
+ hdr->b_acb = acb;
+
+ /*
+ * If this DVA is part of a prefetch, mark the buf
+ * header with the prefetch flag
+ */
+ if (arc_flags & ARC_PREFETCH)
+ hdr->b_flags |= ARC_PREFETCH;
+ hdr->b_flags |= ARC_IO_IN_PROGRESS;
+
+ /*
+ * If the buffer has been evicted, migrate it to a present state
+ * before issuing the I/O. Once we drop the hash-table lock,
+ * the header will be marked as I/O in progress and have an
+ * attached buffer. At this point, anybody who finds this
+ * buffer ought to notice that it's legit but has a pending I/O.
+ */
+
+ if ((hdr->b_state == arc.mru_bot) ||
+ (hdr->b_state == arc.mfu_bot))
+ arc_access(hdr, hash_lock);
+
+ mutex_exit(hash_lock);
+
+ ASSERT3U(hdr->b_size, ==, size);
+ DTRACE_PROBE2(arc__miss, blkptr_t *, bp,
+ uint64_t, size);
+ atomic_add_64(&arc.misses, 1);
+ rzio = zio_read(pio, spa, bp, buf->b_data, size,
+ arc_read_done, buf, priority, flags);
+
+ if (arc_flags & ARC_WAIT)
+ return (zio_wait(rzio));
+
+ ASSERT(arc_flags & ARC_NOWAIT);
+ zio_nowait(rzio);
+ }
+ return (0);
+}
+
+/*
+ * arc_read() variant to support pool traversal.  If the block is already
+ * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
+ * The idea is that we don't want pool traversal filling up memory, but
+ * if the ARC already has the data anyway, we shouldn't pay for the I/O.
+ */
+int
+arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
+{
+	arc_buf_hdr_t *hdr;
+	kmutex_t *hash_mtx;
+	int error = ENOENT;
+
+	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
+
+	/* only serve the data if it's fully present in the cache */
+	if (hdr != NULL && hdr->b_buf != NULL && !HDR_IO_IN_PROGRESS(hdr)) {
+		bcopy(hdr->b_buf->b_data, data, hdr->b_size);
+		error = 0;
+	}
+
+	if (hash_mtx != NULL)
+		mutex_exit(hash_mtx);
+
+	return (error);
+}
+
+/*
+ * Release this buffer from the cache. This must be done
+ * after a read and prior to modifying the buffer contents.
+ * If the buffer has more than one reference, we must
+ * make a new hdr for the buffer.
+ */
+void
+arc_release(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+
+ /* this buffer is not on any list */
+ ASSERT(refcount_count(&hdr->b_refcnt) > 0);
+
+ if (hdr->b_state == arc.anon) {
+ /* this buffer is already released */
+ ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
+ ASSERT(BUF_EMPTY(hdr));
+ return;
+ }
+
+ mutex_enter(hash_lock);
+
+ if (refcount_count(&hdr->b_refcnt) > 1) {
+ arc_buf_hdr_t *nhdr;
+ arc_buf_t **bufp;
+ uint64_t blksz = hdr->b_size;
+ spa_t *spa = hdr->b_spa;
+
+ /*
+ * Pull the data off of this buf and attach it to
+ * a new anonymous buf.
+ */
+ bufp = &hdr->b_buf;
+ while (*bufp != buf) {
+ ASSERT(*bufp);
+ bufp = &(*bufp)->b_next;
+ }
+ /* unlink 'buf' from the old header's buffer list */
+ *bufp = (*bufp)->b_next;
+ (void) refcount_remove(&hdr->b_refcnt, tag);
+ ASSERT3U(hdr->b_state->size, >=, hdr->b_size);
+ atomic_add_64(&hdr->b_state->size, -hdr->b_size);
+ mutex_exit(hash_lock);
+
+ /* build a fresh anonymous header for this buf */
+ nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
+ nhdr->b_size = blksz;
+ nhdr->b_spa = spa;
+ nhdr->b_buf = buf;
+ nhdr->b_state = arc.anon;
+ nhdr->b_arc_access = 0;
+ nhdr->b_flags = 0;
+ buf->b_hdr = nhdr;
+ buf->b_next = NULL;
+ (void) refcount_add(&nhdr->b_refcnt, tag);
+ atomic_add_64(&arc.anon->size, blksz);
+
+ hdr = nhdr;
+ } else {
+ /* sole reference: just move the header to anon */
+ ASSERT(!list_link_active(&hdr->b_arc_node));
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ arc_change_state(arc.anon, hdr, hash_lock);
+ hdr->b_arc_access = 0;
+ mutex_exit(hash_lock);
+ /* clear the identity; anon buffers aren't in the hash table */
+ bzero(&hdr->b_dva, sizeof (dva_t));
+ hdr->b_birth = 0;
+ hdr->b_cksum0 = 0;
+ }
+}
+
+/*
+ * Return nonzero if the buffer has been released (is anonymous).
+ */
+int
+arc_released(arc_buf_t *buf)
+{
+	arc_state_t *state = buf->b_hdr->b_state;
+
+	return (state == arc.anon);
+}
+
+/*
+ * zio completion callback for arc_write().  Fills in the header's
+ * identity from the resulting block pointer and inserts it into the
+ * hash table (destroying any existing identical header, which can
+ * only exist due to sync-to-convergence overwrite), then invokes the
+ * caller's done callback.
+ */
+static void
+arc_write_done(zio_t *zio)
+{
+ arc_buf_t *buf;
+ arc_buf_hdr_t *hdr;
+ arc_callback_t *acb;
+
+ buf = zio->io_private;
+ hdr = buf->b_hdr;
+ acb = hdr->b_acb;
+ hdr->b_acb = NULL;
+
+ /* this buffer is on no lists and is not in the hash table */
+ ASSERT3P(hdr->b_state, ==, arc.anon);
+
+ hdr->b_dva = *BP_IDENTITY(zio->io_bp);
+ hdr->b_birth = zio->io_bp->blk_birth;
+ hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
+ /* clear the "in-write" flag */
+ hdr->b_hash_next = NULL;
+ /* This write may be all-zero */
+ if (!BUF_EMPTY(hdr)) {
+ arc_buf_hdr_t *exists;
+ kmutex_t *hash_lock;
+
+ exists = buf_hash_insert(hdr, &hash_lock);
+ if (exists) {
+ /*
+ * This can only happen if we overwrite for
+ * sync-to-convergence, because we remove
+ * buffers from the hash table when we arc_free().
+ */
+ ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
+ BP_IDENTITY(zio->io_bp)));
+ ASSERT3U(zio->io_bp_orig.blk_birth, ==,
+ zio->io_bp->blk_birth);
+
+ /* destroy the stale header and insert ours */
+ ASSERT(refcount_is_zero(&exists->b_refcnt));
+ arc_change_state(arc.anon, exists, hash_lock);
+ mutex_exit(hash_lock);
+ arc_hdr_free(exists);
+ exists = buf_hash_insert(hdr, &hash_lock);
+ ASSERT3P(exists, ==, NULL);
+ }
+ /* move the freshly-written buffer into the cache proper */
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+ }
+ if (acb && acb->acb_done) {
+ ASSERT(!refcount_is_zero(&hdr->b_refcnt));
+ acb->acb_done(zio, buf, acb->acb_private);
+ }
+
+ if (acb)
+ kmem_free(acb, sizeof (arc_callback_t));
+}
+
+/*
+ * Write the given (released/anonymous) buffer out via the zio
+ * pipeline.  arc_write_done() will insert the result into the cache
+ * and invoke 'done'.  Honors ARC_WAIT/ARC_NOWAIT.
+ */
+int
+arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+    uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
+    arc_done_func_t *done, void *private, int priority, int flags,
+    uint32_t arc_flags)
+{
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+	arc_callback_t *acb;
+	zio_t *wzio;
+
+	/* this is a private buffer - no locking required */
+	ASSERT3P(hdr->b_state, ==, arc.anon);
+	ASSERT(BUF_EMPTY(hdr));
+	ASSERT(!HDR_IO_ERROR(hdr));
+
+	/* record the caller's callback for arc_write_done() */
+	acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
+	acb->acb_byteswap = (arc_byteswap_func_t *)-1;
+	acb->acb_private = private;
+	acb->acb_done = done;
+	hdr->b_acb = acb;
+
+	wzio = zio_write(pio, spa, checksum, compress, txg, bp,
+	    buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags);
+
+	if (arc_flags & ARC_WAIT)
+		return (zio_wait(wzio));
+
+	ASSERT(arc_flags & ARC_NOWAIT);
+	zio_nowait(wzio);
+
+	return (0);
+}
+
+/*
+ * Free the block described by bp.  If the block is cached, detach it
+ * from the cache: destroy the header if it is unreferenced, or mark it
+ * anonymous (and FREED_IN_READ if a read is in flight) otherwise.
+ * Then issue the zio free, honoring ARC_WAIT/ARC_NOWAIT.
+ */
+int
+arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ zio_done_func_t *done, void *private, uint32_t arc_flags)
+{
+ arc_buf_hdr_t *ab;
+ kmutex_t *hash_lock;
+ zio_t *zio;
+
+ /*
+ * If this buffer is in the cache, release it, so it
+ * can be re-used.
+ */
+ ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+ if (ab != NULL) {
+ /*
+ * The checksum of blocks to free is not always
+ * preserved (eg. on the deadlist). However, if it is
+ * nonzero, it should match what we have in the cache.
+ */
+ ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
+ ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
+ arc_change_state(arc.anon, ab, hash_lock);
+ if (refcount_is_zero(&ab->b_refcnt)) {
+ /* no holders: destroy the header now */
+ mutex_exit(hash_lock);
+ arc_hdr_free(ab);
+ atomic_add_64(&arc.deleted, 1);
+ } else {
+ /* still referenced: strip its identity instead */
+ ASSERT3U(refcount_count(&ab->b_refcnt), ==, 1);
+ if (HDR_IO_IN_PROGRESS(ab))
+ ab->b_flags |= ARC_FREED_IN_READ;
+ ab->b_arc_access = 0;
+ bzero(&ab->b_dva, sizeof (dva_t));
+ ab->b_birth = 0;
+ ab->b_cksum0 = 0;
+ mutex_exit(hash_lock);
+ }
+ }
+
+ zio = zio_free(pio, spa, txg, bp, done, private);
+
+ if (arc_flags & ARC_WAIT)
+ return (zio_wait(zio));
+
+ ASSERT(arc_flags & ARC_NOWAIT);
+ zio_nowait(zio);
+
+ return (0);
+}
+
+/*
+ * Release a reservation previously taken with arc_tempreserve_space().
+ */
+void
+arc_tempreserve_clear(uint64_t tempreserve)
+{
+ atomic_add_64(&arc_tempreserve, -tempreserve);
+ ASSERT((int64_t)arc_tempreserve >= 0);
+}
+
+/*
+ * Reserve 'tempreserve' bytes of anonymous (dirty) buffer space,
+ * failing with ERESTART if the reservation would push anonymous data
+ * past 1/4 of the target cache size.  Pair with
+ * arc_tempreserve_clear().
+ */
+int
+arc_tempreserve_space(uint64_t tempreserve)
+{
+#ifdef ZFS_DEBUG
+ /*
+ * Once in a while, fail for no reason. Everything should cope.
+ */
+ if (spa_get_random(10000) == 0) {
+ dprintf("forcing random failure\n");
+ return (ERESTART);
+ }
+#endif
+ /*
+ * XXX This is kind of hacky. The limit should be adjusted
+ * dynamically to keep the time to sync a dataset fixed (around
+ * 1-5 seconds?).
+ * Maybe should have some sort of locking? If two requests come
+ * in concurrently, we might let them both succeed, when one of
+ * them should fail. Not a huge deal.
+ */
+
+ ASSERT3U(tempreserve, <, arc.c/4); /* otherwise we'll loop forever */
+
+ /*
+ * NOTE(review): the message below reports arc.anon->lsize while
+ * the check itself uses arc.anon->size -- confirm which is
+ * intended.
+ */
+ if (arc_tempreserve + tempreserve + arc.anon->size > arc.c / 4) {
+ dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
+ "tempreserve=%lluK arc.c=%lluK\n",
+ arc_tempreserve>>10, arc.anon->lsize>>10,
+ tempreserve>>10, arc.c>>10);
+ return (ERESTART);
+ }
+ atomic_add_64(&arc_tempreserve, tempreserve);
+ return (0);
+}
+
+/*
+ * Initialize the ARC: compute the cache size targets from physical
+ * memory, set up the state lists and kmem caches, and start the
+ * reclaim thread.
+ */
+void
+arc_init(void)
+{
+ mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
+
+ /* Start out with 1/8 of all memory */
+ arc.c = physmem * PAGESIZE / 8;
+
+#ifdef _KERNEL
+ /*
+ * On architectures where the physical memory can be larger
+ * than the addressable space (intel in 32-bit mode), we may
+ * need to limit the cache to 1/8 of VM size.
+ */
+ arc.c = MIN(arc.c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
+#endif
+
+ /* use at least 1/32 of all memory, or 64MB, whichever is more */
+ arc.c_min = MAX(arc.c / 4, 64<<20);
+ /* use at most 3/4 of all memory, or all but 1GB, whichever is more */
+ if (arc.c * 8 >= 1<<30)
+ arc.c_max = (arc.c * 8) - (1<<30);
+ else
+ arc.c_max = arc.c_min;
+ arc.c_max = MAX(arc.c * 6, arc.c_max);
+ arc.c = arc.c_max;
+ /* p (the MRU target) starts at half the cache */
+ arc.p = (arc.c >> 1);
+
+ /* if kmem_flags are set, lets try to use less memory */
+ if (kmem_debugging())
+ arc.c = arc.c / 2;
+ if (arc.c < arc.c_min)
+ arc.c = arc.c_min;
+
+ arc.anon = &ARC_anon;
+ arc.mru_top = &ARC_mru_top;
+ arc.mru_bot = &ARC_mru_bot;
+ arc.mfu_top = &ARC_mfu_top;
+ arc.mfu_bot = &ARC_mfu_bot;
+
+ /* anon buffers live on no list; the other four states each get one */
+ list_create(&arc.mru_top->list, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc.mru_bot->list, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc.mfu_top->list, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc.mfu_bot->list, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_arc_node));
+
+ buf_init();
+
+ arc_thread_exit = 0;
+
+ (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
+ TS_RUN, minclsyspri);
+}
+
+/*
+ * Tear down the ARC: stop the reclaim thread, flush all evictable
+ * buffers, and destroy the locks, lists, and buf/hdr caches.
+ */
+void
+arc_fini(void)
+{
+ /* ask the reclaim thread to exit and wait until it has */
+ mutex_enter(&arc_reclaim_thr_lock);
+ arc_thread_exit = 1;
+ while (arc_thread_exit != 0)
+ cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
+ mutex_exit(&arc_reclaim_thr_lock);
+
+ arc_flush();
+
+ /* tell any late reclaim callbacks that we're gone */
+ arc_dead = TRUE;
+
+ mutex_destroy(&arc_reclaim_lock);
+ mutex_destroy(&arc_reclaim_thr_lock);
+ cv_destroy(&arc_reclaim_thr_cv);
+
+ list_destroy(&arc.mru_top->list);
+ list_destroy(&arc.mru_bot->list);
+ list_destroy(&arc.mfu_top->list);
+ list_destroy(&arc.mfu_bot->list);
+
+ buf_fini();
+}
diff --git a/usr/src/uts/common/fs/zfs/bplist.c b/usr/src/uts/common/fs/zfs/bplist.c
new file mode 100644
index 0000000000..68f79ac5a2
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/bplist.c
@@ -0,0 +1,239 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/bplist.h>
+#include <sys/zfs_context.h>
+
+/*
+ * Ensure bpl_dbuf/bpl_phys are valid: on first use, take a tagged hold
+ * on the list object's bonus buffer and read it in.  Caller must hold
+ * bpl_lock.
+ */
+static void
+bplist_hold(bplist_t *bpl)
+{
+	ASSERT(MUTEX_HELD(&bpl->bpl_lock));
+	if (bpl->bpl_dbuf == NULL) {
+		bpl->bpl_dbuf = dmu_bonus_hold_tag(bpl->bpl_mos,
+		    bpl->bpl_object, bpl);
+		dmu_buf_read(bpl->bpl_dbuf);
+		bpl->bpl_phys = bpl->bpl_dbuf->db_data;
+	}
+}
+
+/*
+ * Allocate a new on-disk bplist object in the given meta-objset and
+ * return its object number.  The header is stored in the bonus buffer.
+ */
+uint64_t
+bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx)
+{
+	return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize,
+	    DMU_OT_BPLIST_HDR, sizeof (bplist_phys_t), tx));
+}
+
+/*
+ * Free the on-disk object backing a bplist.  The list must be closed.
+ */
+void
+bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx)
+{
+	VERIFY(dmu_object_free(mos, object, tx) == 0);
+}
+
+/*
+ * Bind an in-core bplist to an existing on-disk object, computing the
+ * shifts used to turn an entry index into a (block, offset) pair.  The
+ * dbuf and phys pointers are filled in lazily by bplist_hold().
+ */
+void
+bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
+{
+	dmu_object_info_t doi;
+
+	VERIFY(dmu_object_info(mos, object, &doi) == 0);
+
+	mutex_enter(&bpl->bpl_lock);
+
+	ASSERT(bpl->bpl_dbuf == NULL);
+	ASSERT(bpl->bpl_phys == NULL);
+	ASSERT(bpl->bpl_cached_dbuf == NULL);
+	ASSERT(bpl->bpl_queue == NULL);
+	ASSERT(object != 0);
+
+	bpl->bpl_mos = mos;
+	bpl->bpl_object = object;
+	/* log2 of the data block size (assumes it is a power of 2) */
+	bpl->bpl_blockshift = highbit(doi.doi_data_block_size - 1);
+	/* log2 of the number of blkptrs per data block */
+	bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT;
+
+	mutex_exit(&bpl->bpl_lock);
+}
+
+/*
+ * Release the holds taken on the list's bonus buffer and cached data
+ * block.  Any deferred entries must already have been written out by
+ * bplist_sync().
+ */
+void
+bplist_close(bplist_t *bpl)
+{
+	mutex_enter(&bpl->bpl_lock);
+
+	ASSERT(bpl->bpl_queue == NULL);
+
+	if (bpl->bpl_cached_dbuf) {
+		dmu_buf_rele(bpl->bpl_cached_dbuf);
+		bpl->bpl_cached_dbuf = NULL;
+	}
+	if (bpl->bpl_dbuf) {
+		/* matches the tagged hold taken in bplist_hold() */
+		dmu_buf_rele_tag(bpl->bpl_dbuf, bpl);
+		bpl->bpl_dbuf = NULL;
+		bpl->bpl_phys = NULL;
+	}
+
+	mutex_exit(&bpl->bpl_lock);
+}
+
+/*
+ * Return B_TRUE if the list has no entries.  A list that was never
+ * bound to an object (bpl_object == 0) is trivially empty.
+ */
+boolean_t
+bplist_empty(bplist_t *bpl)
+{
+	boolean_t rv;
+
+	if (bpl->bpl_object == 0)
+		return (B_TRUE);
+
+	mutex_enter(&bpl->bpl_lock);
+	bplist_hold(bpl);
+	rv = (bpl->bpl_phys->bpl_entries == 0);
+	mutex_exit(&bpl->bpl_lock);
+
+	return (rv);
+}
+
+/*
+ * Copy entry *itorp into *bp and advance the iterator.  Returns ENOENT
+ * once the iterator is past the last entry.  A hold on the most recently
+ * used data block is cached in bpl_cached_dbuf (released in
+ * bplist_close()) so sequential iteration re-reads each block only once.
+ */
+int
+bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
+{
+	uint64_t blk, off;
+	blkptr_t *bparray;
+	dmu_buf_t *db;
+
+	mutex_enter(&bpl->bpl_lock);
+	bplist_hold(bpl);
+
+	if (*itorp >= bpl->bpl_phys->bpl_entries) {
+		mutex_exit(&bpl->bpl_lock);
+		return (ENOENT);
+	}
+
+	/* split the entry index into a block number and an intra-block index */
+	blk = *itorp >> bpl->bpl_bpshift;
+	off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
+	db = bpl->bpl_cached_dbuf;
+
+	/* swap the cached dbuf if the entry lives in a different block */
+	if (db == NULL || db->db_offset != (blk << bpl->bpl_blockshift)) {
+		if (db != NULL)
+			dmu_buf_rele(db);
+		bpl->bpl_cached_dbuf = db = dmu_buf_hold(bpl->bpl_mos,
+		    bpl->bpl_object, blk << bpl->bpl_blockshift);
+	}
+
+	ASSERT3U(db->db_size, ==, 1ULL << bpl->bpl_blockshift);
+
+	dmu_buf_read(db);
+	bparray = db->db_data;
+	*bp = bparray[off];
+	(*itorp)++;
+	mutex_exit(&bpl->bpl_lock);
+	return (0);
+}
+
+/*
+ * Append *bp to the on-disk list, dirtying both the data block that
+ * receives the entry and the header (entry/byte counts) in tx.
+ */
+void
+bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx)
+{
+	uint64_t blk, off;
+	blkptr_t *bparray;
+	dmu_buf_t *db;
+
+	ASSERT(!BP_IS_HOLE(bp));
+	mutex_enter(&bpl->bpl_lock);
+	bplist_hold(bpl);
+
+	/* the next free slot is entry number bpl_entries */
+	blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
+	off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);
+	db = bpl->bpl_cached_dbuf;
+
+	/* swap the cached dbuf if the slot lives in a different block */
+	if (db == NULL || db->db_offset != (blk << bpl->bpl_blockshift)) {
+		if (db != NULL)
+			dmu_buf_rele(db);
+		bpl->bpl_cached_dbuf = db = dmu_buf_hold(bpl->bpl_mos,
+		    bpl->bpl_object, blk << bpl->bpl_blockshift);
+	}
+
+	ASSERT3U(db->db_size, ==, 1ULL << bpl->bpl_blockshift);
+
+	dmu_buf_will_dirty(db, tx);
+	bparray = db->db_data;
+	bparray[off] = *bp;
+
+	/* We never need the fill count. */
+	bparray[off].blk_fill = 0;
+
+	/* The bplist will compress better if we can leave off the checksum */
+	bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum));
+
+	dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
+	bpl->bpl_phys->bpl_entries++;
+	bpl->bpl_phys->bpl_bytes += BP_GET_ASIZE(bp);
+	mutex_exit(&bpl->bpl_lock);
+}
+
+/*
+ * Record a block pointer to be appended to the on-disk list later by
+ * bplist_sync(); entries are stacked on an in-core singly linked queue.
+ */
+void
+bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp)
+{
+	bplist_q_t *entry = kmem_alloc(sizeof (*entry), KM_SLEEP);
+
+	ASSERT(!BP_IS_HOLE(bp));
+	/* entry is still private to us, so it can be filled in unlocked */
+	entry->bpq_blk = *bp;
+	mutex_enter(&bpl->bpl_lock);
+	entry->bpq_next = bpl->bpl_queue;
+	bpl->bpl_queue = entry;
+	mutex_exit(&bpl->bpl_lock);
+}
+
+/*
+ * Flush the deferred queue built by bplist_enqueue_deferred() to disk.
+ * The lock is dropped around each bplist_enqueue() call, which retakes
+ * it internally.
+ */
+void
+bplist_sync(bplist_t *bpl, dmu_tx_t *tx)
+{
+	bplist_q_t *bpq;
+
+	mutex_enter(&bpl->bpl_lock);
+	while ((bpq = bpl->bpl_queue) != NULL) {
+		bpl->bpl_queue = bpq->bpq_next;
+		mutex_exit(&bpl->bpl_lock);
+		bplist_enqueue(bpl, &bpq->bpq_blk, tx);
+		kmem_free(bpq, sizeof (*bpq));
+		mutex_enter(&bpl->bpl_lock);
+	}
+	mutex_exit(&bpl->bpl_lock);
+}
+
+/*
+ * Discard every entry: free all of the list's data blocks and zero the
+ * header counts.  The deferred queue must already be empty.
+ */
+void
+bplist_vacate(bplist_t *bpl, dmu_tx_t *tx)
+{
+	mutex_enter(&bpl->bpl_lock);
+	ASSERT3P(bpl->bpl_queue, ==, NULL);
+	bplist_hold(bpl);
+	dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
+	/* free the entire range of the object's data blocks */
+	dmu_free_range(bpl->bpl_mos, bpl->bpl_object, 0, -1ULL, tx);
+	bpl->bpl_phys->bpl_entries = 0;
+	bpl->bpl_phys->bpl_bytes = 0;
+	mutex_exit(&bpl->bpl_lock);
+}
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
new file mode 100644
index 0000000000..e4b2d7f9e6
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -0,0 +1,2022 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_zfetch.h>
+
+static void dbuf_destroy(dmu_buf_impl_t *db);
+static void dbuf_verify(dmu_buf_impl_t *db);
+static void dbuf_evict_user(dmu_buf_impl_t *db);
+static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+static arc_done_func_t dbuf_read_done;
+static arc_done_func_t dbuf_write_done;
+
+/*
+ * Global data structures and functions for the dbuf cache.
+ */
+taskq_t *dbuf_tq;
+static kmem_cache_t *dbuf_cache;
+
+/* ARGSUSED */
+/*
+ * kmem cache constructor: zero the dbuf and initialize its mutex,
+ * condition variable, and hold refcount.
+ */
+static int
+dbuf_cons(void *vdb, void *unused, int kmflag)
+{
+	dmu_buf_impl_t *db = vdb;
+	bzero(db, sizeof (dmu_buf_impl_t));
+
+	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
+	refcount_create(&db->db_holds);
+	return (0);
+}
+
+/* ARGSUSED */
+/*
+ * kmem cache destructor: undo dbuf_cons().
+ */
+static void
+dbuf_dest(void *vdb, void *unused)
+{
+	dmu_buf_impl_t *db = vdb;
+	mutex_destroy(&db->db_mtx);
+	cv_destroy(&db->db_changed);
+	refcount_destroy(&db->db_holds);
+}
+
+/*
+ * dbuf hash table routines
+ */
+static dbuf_hash_table_t dbuf_hash_table;
+
+static uint64_t dbuf_hash_count;
+
+/*
+ * Hash an (objset, object, level, blkid) tuple to a 64-bit value using
+ * the ZFS CRC-64 table.  Only the low-order bytes of each field are fed
+ * through the CRC; higher-order bits are folded in with the final xor.
+ */
+static uint64_t
+dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
+{
+	uintptr_t osv = (uintptr_t)os;
+	uint64_t crc = -1ULL;
+
+	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
+
+	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
+
+	return (crc);
+}
+
+/*
+ * NB: no trailing semicolon in the DBUF_HASH expansion.  The macro is
+ * used in expression context ("uint64_t hv = DBUF_HASH(...);"); a
+ * semicolon inside the expansion only compiled by accident of being in
+ * initializer position, and would break any other expression use.
+ */
+#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
+
+/* true iff the dbuf identifies the same (objset, object, level, blkid) */
+#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
+	((dbuf)->db.db_object == (obj) &&		\
+	(dbuf)->db_objset == (os) &&			\
+	(dbuf)->db_level == (level) &&			\
+	(dbuf)->db_blkid == (blkid))
+
+/*
+ * Look up a dbuf in the global hash table.  If found with active holds,
+ * it is returned with its db_mtx HELD; otherwise NULL is returned.
+ * Entries with a zero hold count are skipped (they are being evicted).
+ */
+dmu_buf_impl_t *
+dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
+{
+	dbuf_hash_table_t *h = &dbuf_hash_table;
+	objset_impl_t *os = dn->dn_objset;
+	uint64_t obj = dn->dn_object;
+	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+	uint64_t idx = hv & h->hash_table_mask;
+	dmu_buf_impl_t *db;
+
+	mutex_enter(DBUF_HASH_MUTEX(h, idx));
+	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
+		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
+			mutex_enter(&db->db_mtx);
+			if (!refcount_is_zero(&db->db_holds)) {
+				mutex_exit(DBUF_HASH_MUTEX(h, idx));
+				return (db);
+			}
+			mutex_exit(&db->db_mtx);
+		}
+	}
+	mutex_exit(DBUF_HASH_MUTEX(h, idx));
+	return (NULL);
+}
+
+/*
+ * Insert an entry into the hash table.  If there is already an element
+ * equal to elem in the hash table, then the already existing element
+ * will be returned and the new element will not be inserted.
+ * Otherwise returns NULL.
+ *
+ * In either case the returned/inserted dbuf's db_mtx is HELD on return:
+ * the existing element's on a collision, db's on a successful insert.
+ */
+static dmu_buf_impl_t *
+dbuf_hash_insert(dmu_buf_impl_t *db)
+{
+	dbuf_hash_table_t *h = &dbuf_hash_table;
+	objset_impl_t *os = db->db_objset;
+	uint64_t obj = db->db.db_object;
+	int level = db->db_level;
+	uint64_t blkid = db->db_blkid;
+	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+	uint64_t idx = hv & h->hash_table_mask;
+	dmu_buf_impl_t *dbf;
+
+	mutex_enter(DBUF_HASH_MUTEX(h, idx));
+	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
+		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
+			mutex_enter(&dbf->db_mtx);
+			/* zero-hold entries are dying; ignore them */
+			if (!refcount_is_zero(&dbf->db_holds)) {
+				mutex_exit(DBUF_HASH_MUTEX(h, idx));
+				return (dbf);
+			}
+			mutex_exit(&dbf->db_mtx);
+		}
+	}
+
+	mutex_enter(&db->db_mtx);
+	db->db_hash_next = h->hash_table[idx];
+	h->hash_table[idx] = db;
+	mutex_exit(DBUF_HASH_MUTEX(h, idx));
+	atomic_add_64(&dbuf_hash_count, 1);
+
+	return (NULL);
+}
+
+/*
+ * Remove an entry from the hash table.  This operation will
+ * fail if there are any existing holds on the db.
+ */
+static void
+dbuf_hash_remove(dmu_buf_impl_t *db)
+{
+	dbuf_hash_table_t *h = &dbuf_hash_table;
+	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
+	    db->db_level, db->db_blkid);
+	uint64_t idx = hv & h->hash_table_mask;
+	dmu_buf_impl_t *dbf, **dbp;
+
+	/*
+	 * We mustn't hold db_mtx, to maintain lock ordering:
+	 * DBUF_HASH_MUTEX > db_mtx.
+	 */
+	ASSERT(refcount_is_zero(&db->db_holds));
+	ASSERT(db->db_dnode != NULL);
+	ASSERT(!MUTEX_HELD(&db->db_mtx));
+
+	/* unlink db from its chain; it must be present */
+	mutex_enter(DBUF_HASH_MUTEX(h, idx));
+	dbp = &h->hash_table[idx];
+	while ((dbf = *dbp) != db) {
+		dbp = &dbf->db_hash_next;
+		ASSERT(dbf != NULL);
+	}
+	*dbp = db->db_hash_next;
+	db->db_hash_next = NULL;
+	mutex_exit(DBUF_HASH_MUTEX(h, idx));
+	atomic_add_64(&dbuf_hash_count, -1);
+}
+
+static int dbuf_evictable(dmu_buf_impl_t *db);
+static void dbuf_clear(dmu_buf_impl_t *db);
+
+/*
+ * Evict a dbuf that the caller has verified holds no references.
+ * Caller must hold db_mtx; the dbuf is clear()ed and then destroyed.
+ */
+void
+dbuf_evict(dmu_buf_impl_t *db)
+{
+	int err;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	err = dbuf_evictable(db);
+	ASSERT(err == TRUE);
+	dbuf_clear(db);
+	dbuf_destroy(db);
+}
+
+/*
+ * Invoke the user's eviction callback on a level-0 dbuf, first giving
+ * the user's data pointer a final consistent view of db_data, then
+ * clear the callback state so the callback fires at most once.
+ */
+static void
+dbuf_evict_user(dmu_buf_impl_t *db)
+{
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+
+	/* only level-0 buffers can carry user eviction state */
+	if (db->db_level != 0 || db->db_d.db_evict_func == NULL)
+		return;
+
+	if (db->db_d.db_user_data_ptr_ptr)
+		*db->db_d.db_user_data_ptr_ptr = db->db.db_data;
+	db->db_d.db_evict_func(&db->db, db->db_d.db_user_ptr);
+	db->db_d.db_user_ptr = NULL;
+	db->db_d.db_user_data_ptr_ptr = NULL;
+	db->db_d.db_evict_func = NULL;
+}
+
+/*
+ * Set up the global dbuf state: the hash table, the dmu_buf_impl_t kmem
+ * cache, the dbuf taskq, and the hash chain mutexes.
+ */
+void
+dbuf_init(void)
+{
+	uint64_t hsize = 1;
+	dbuf_hash_table_t *h = &dbuf_hash_table;
+	int i;
+
+	/*
+	 * The hash table is big enough to fill all of physical memory
+	 * with an average 64k block size.  The table will take up
+	 * totalmem*sizeof(void*)/64k bytes (i.e. 128KB/GB with 8-byte
+	 * pointers).
+	 */
+	while (hsize * 65536 < physmem * PAGESIZE)
+		hsize <<= 1;
+
+	/* hsize is a power of 2, so (hsize - 1) is a valid index mask */
+	h->hash_table_mask = hsize - 1;
+	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
+
+	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
+	    sizeof (dmu_buf_impl_t),
+	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
+	dbuf_tq = taskq_create("dbuf_tq", 8, maxclsyspri, 50, INT_MAX,
+	    TASKQ_PREPOPULATE);
+
+	for (i = 0; i < DBUF_MUTEXES; i++)
+		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
+}
+
+/*
+ * Tear down the global dbuf state created by dbuf_init().
+ */
+void
+dbuf_fini(void)
+{
+	dbuf_hash_table_t *h = &dbuf_hash_table;
+	int i;
+
+	taskq_destroy(dbuf_tq);
+	dbuf_tq = NULL;
+
+	for (i = 0; i < DBUF_MUTEXES; i++)
+		mutex_destroy(&h->hash_mutexes[i]);
+	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
+	kmem_cache_destroy(dbuf_cache);
+}
+
+/*
+ * Other stuff.
+ */
+
+/*
+ * Debug-build consistency checks on a dbuf, gated by the
+ * ZFS_DEBUG_DBUF_VERIFY flag.  Caller must hold db_mtx.  A compiled-out
+ * no-op unless ZFS_DEBUG is defined.
+ */
+static void
+dbuf_verify(dmu_buf_impl_t *db)
+{
+#ifdef ZFS_DEBUG
+	int i;
+	dnode_t *dn = db->db_dnode;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+
+	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
+		return;
+
+	ASSERT(db->db_objset != NULL);
+	if (dn == NULL) {
+		/* detached from its dnode: no parent or blkptr either */
+		ASSERT(db->db_parent == NULL);
+		ASSERT(db->db_blkptr == NULL);
+	} else {
+		ASSERT3U(db->db.db_object, ==, dn->dn_object);
+		ASSERT3P(db->db_objset, ==, dn->dn_objset);
+		ASSERT(list_head(&dn->dn_dbufs));
+		ASSERT3U(db->db_level, <, dn->dn_nlevels);
+	}
+	if (db->db_blkid == DB_BONUS_BLKID) {
+		ASSERT(dn != NULL);
+		ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
+		ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
+	} else {
+		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
+	}
+
+	if (db->db_level == 0) {
+		void **udpp = db->db_d.db_user_data_ptr_ptr;
+		/* we can be momentarily larger in dnode_set_blksz() */
+		if (db->db_blkid != DB_BONUS_BLKID && dn) {
+			ASSERT3U(db->db.db_size, >=, dn->dn_datablksz);
+		}
+		/* the user's data pointer must track db_data while held */
+		if (udpp) {
+			ASSERT((refcount_is_zero(&db->db_holds) &&
+			    *udpp == NULL) ||
+			    (!refcount_is_zero(&db->db_holds) &&
+			    *udpp == db->db.db_data));
+		}
+
+		if (IS_DNODE_DNODE(db->db.db_object)) {
+			for (i = 0; i < TXG_SIZE; i++) {
+				/*
+				 * it should only be modified in syncing
+				 * context, so make sure we only have
+				 * one copy of the data.
+				 */
+				ASSERT(db->db_d.db_data_old[i] == NULL ||
+				    db->db_d.db_data_old[i] == db->db_buf);
+			}
+		}
+	}
+
+	/* verify db->db_blkptr */
+	if (db->db_blkptr) {
+		if (db->db_parent == dn->dn_dbuf) {
+			/* db is pointed to by the dnode */
+			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
+			if (IS_DNODE_DNODE(db->db.db_object))
+				ASSERT(db->db_parent == NULL);
+			else
+				ASSERT(db->db_parent != NULL);
+			ASSERT3P(db->db_blkptr, ==,
+			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
+		} else {
+			/* db is pointed to by an indirect block */
+			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
+			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
+			ASSERT3U(db->db_parent->db.db_object, ==,
+			    db->db.db_object);
+			/*
+			 * dnode_grow_indblksz() can make this fail if we don't
+			 * have the struct_rwlock.  XXX indblksz no longer
+			 * grows.  safe to do this now?
+			 */
+			if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
+				ASSERT3P(db->db_blkptr, ==,
+				    ((blkptr_t *)db->db_parent->db.db_data +
+				    db->db_blkid % epb));
+			}
+		}
+	}
+	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
+	    db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
+	    db->db_state != DB_FILL && !dn->dn_free_txg) {
+		/*
+		 * If the blkptr isn't set but they have nonzero data,
+		 * it had better be dirty, otherwise we'll lose that
+		 * data when we evict this buffer.
+		 */
+		if (db->db_dirtycnt == 0) {
+			uint64_t *buf = db->db.db_data;
+			int i;
+
+			for (i = 0; i < db->db.db_size >> 3; i++) {
+				ASSERT(buf[i] == 0);
+			}
+		}
+	}
+#endif
+}
+
+/*
+ * Keep the user's data pointer (if registered) in sync with db_data.
+ * Caller must hold db_mtx.
+ */
+static void
+dbuf_update_data(dmu_buf_impl_t *db)
+{
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	if (db->db_level == 0 && db->db_d.db_user_data_ptr_ptr) {
+		ASSERT(!refcount_is_zero(&db->db_holds));
+		*db->db_d.db_user_data_ptr_ptr = db->db.db_data;
+	}
+}
+
+/*
+ * Attach an arc buffer to the dbuf as its current data and propagate
+ * the new data pointer to the user.  Caller must hold db_mtx.
+ */
+static void
+dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
+{
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	ASSERT(buf->b_data != NULL);
+	db->db_buf = buf;
+	db->db.db_data = buf->b_data;
+	dbuf_update_data(db);
+}
+
+/*
+ * Return the block number containing byte `offset` of the dnode's data.
+ * A zero datablkshift means the object has a single (possibly odd-sized)
+ * block, so any in-range offset maps to block 0.
+ */
+uint64_t
+dbuf_whichblock(dnode_t *dn, uint64_t offset)
+{
+	if (dn->dn_datablkshift == 0) {
+		ASSERT3U(offset, <, dn->dn_datablksz);
+		return (0);
+	}
+	return (offset >> dn->dn_datablkshift);
+}
+
+/*
+ * arc_read() completion callback: on success attach the arc buffer and
+ * mark the dbuf cached; on I/O error free the buffer and return to
+ * DB_UNCACHED; if the block was freed while the read was in flight,
+ * substitute zeroed contents.  Always wakes any waiters on db_changed.
+ */
+static void
+dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+	dmu_buf_impl_t *db = vdb;
+
+	mutex_enter(&db->db_mtx);
+	ASSERT3U(db->db_state, ==, DB_READ);
+	/*
+	 * All reads are synchronous, so we must have a hold on the dbuf
+	 */
+	ASSERT(refcount_count(&db->db_holds) > 0);
+	ASSERT(db->db.db_data == NULL);
+	if (db->db_level == 0 && db->db_d.db_freed_in_flight) {
+		/* we were freed in flight; disregard any error */
+		arc_release(buf, db);
+		bzero(buf->b_data, db->db.db_size);
+		db->db_d.db_freed_in_flight = FALSE;
+		dbuf_set_data(db, buf);
+		db->db_state = DB_CACHED;
+	} else if (zio == NULL || zio->io_error == 0) {
+		dbuf_set_data(db, buf);
+		db->db_state = DB_CACHED;
+	} else {
+		ASSERT(db->db_blkid != DB_BONUS_BLKID);
+		arc_buf_free(buf, db);
+		db->db_state = DB_UNCACHED;
+		ASSERT3P(db->db_buf, ==, NULL);
+	}
+	cv_broadcast(&db->db_changed);
+	mutex_exit(&db->db_mtx);
+}
+
+/*
+ * Start filling an uncached dbuf: satisfy bonus buffers from the dnode
+ * phys, holes and freed blocks with zeroed data, and everything else via
+ * an asynchronous arc_read() (completed by dbuf_read_done()).  The
+ * caller must hold dn_struct_rwlock and a hold on the dbuf.
+ */
+void
+dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
+{
+	arc_buf_t *buf;
+	blkptr_t *bp;
+
+	ASSERT(!refcount_is_zero(&db->db_holds));
+	/* We need the struct_rwlock to prevent db_blkptr from changing. */
+	ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
+
+	/*
+	 * prefetch only data blocks (level 0) -- don't prefetch indirect
+	 * blocks
+	 */
+	if ((db->db_level > 0) || (db->db_blkid == DB_BONUS_BLKID)) {
+		flags |= DB_RF_NOPREFETCH;
+	}
+
+	if (((flags & DB_RF_NOPREFETCH) == 0) && (db->db_dnode != NULL)) {
+		dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+		    db->db.db_size);
+	}
+
+	/* unlocked check is OK: a cached buffer stays cached while held */
+	if (db->db_state == DB_CACHED) {
+		ASSERT(db->db.db_data != NULL);
+		return;
+	}
+
+	mutex_enter(&db->db_mtx);
+
+	/* someone else already started (or finished) the read */
+	if (db->db_state != DB_UNCACHED) {
+		mutex_exit(&db->db_mtx);
+		return;
+	}
+
+	ASSERT3U(db->db_state, ==, DB_UNCACHED);
+
+	if (db->db_blkid == DB_BONUS_BLKID) {
+		/* bonus data is copied out of the dnode, not read from disk */
+		ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size);
+		buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+		    DN_MAX_BONUSLEN, db);
+		if (db->db.db_size < DN_MAX_BONUSLEN)
+			bzero(buf->b_data, DN_MAX_BONUSLEN);
+		bcopy(DN_BONUS(db->db_dnode->dn_phys), buf->b_data,
+		    db->db.db_size);
+		dbuf_set_data(db, buf);
+		db->db_state = DB_CACHED;
+		mutex_exit(&db->db_mtx);
+		return;
+	}
+
+	/* a block freed in this txg reads back as a hole */
+	if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid))
+		bp = NULL;
+	else
+		bp = db->db_blkptr;
+
+	if (bp == NULL)
+		dprintf_dbuf(db, "blkptr: %s\n", "NULL");
+	else
+		dprintf_dbuf_bp(db, bp, "%s", "blkptr:");
+
+	if (bp == NULL || BP_IS_HOLE(bp)) {
+		ASSERT(bp == NULL || BP_IS_HOLE(bp));
+		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+		    db->db.db_size, db));
+		bzero(db->db.db_data, db->db.db_size);
+		db->db_state = DB_CACHED;
+		mutex_exit(&db->db_mtx);
+		return;
+	}
+
+	db->db_state = DB_READ;
+	mutex_exit(&db->db_mtx);
+
+	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
+	(void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp,
+	    db->db_level > 0 ? byteswap_uint64_array :
+	    dmu_ot[db->db_dnode->dn_type].ot_byteswap,
+	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
+	    (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
+	    ARC_NOWAIT);
+}
+
+/*
+ * Common synchronous read path: issue a read if the buffer is uncached,
+ * then wait until it reaches DB_CACHED.  Returns the zio error when
+ * DB_RF_CANFAIL is set, 0 otherwise.
+ */
+static int
+dbuf_read_generic(dmu_buf_impl_t *db, uint32_t flags)
+{
+	zio_t *zio;
+	int err;
+
+	/*
+	 * We don't have to hold the mutex to check db_state because it
+	 * can't be freed while we have a hold on the buffer.
+	 */
+	ASSERT(!refcount_is_zero(&db->db_holds));
+	if (db->db_state == DB_CACHED)
+		return (0);
+
+	if (db->db_state == DB_UNCACHED) {
+		/* a root zio collects the error from the child arc_read */
+		zio = zio_root(db->db_dnode->dn_objset->os_spa, NULL, NULL,
+		    ZIO_FLAG_CANFAIL);
+		if ((flags & DB_RF_HAVESTRUCT) == 0)
+			rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+		dbuf_read_impl(db, zio, flags);
+		if ((flags & DB_RF_HAVESTRUCT) == 0)
+			rw_exit(&db->db_dnode->dn_struct_rwlock);
+		err = zio_wait(zio);
+		if (err)
+			return (err);
+	}
+
+	/* wait out any read or fill in progress on another thread */
+	mutex_enter(&db->db_mtx);
+	while (db->db_state == DB_READ || db->db_state == DB_FILL) {
+		ASSERT(db->db_state == DB_READ ||
+		    (flags & DB_RF_HAVESTRUCT) == 0);
+		cv_wait(&db->db_changed, &db->db_mtx);
+	}
+	ASSERT3U(db->db_state, ==, DB_CACHED);
+	mutex_exit(&db->db_mtx);
+
+	return (0);
+}
+
+#pragma weak dmu_buf_read = dbuf_read
+/*
+ * Read a dbuf that must succeed; I/O failure trips the assert.
+ */
+void
+dbuf_read(dmu_buf_impl_t *db)
+{
+	int err;
+
+	err = dbuf_read_generic(db, DB_RF_MUST_SUCCEED);
+	ASSERT(err == 0);
+}
+
+#pragma weak dmu_buf_read_canfail = dbuf_read_canfail
+/*
+ * Read a dbuf, returning any I/O error to the caller.
+ */
+int
+dbuf_read_canfail(dmu_buf_impl_t *db)
+{
+	return (dbuf_read_generic(db, DB_RF_CANFAIL));
+}
+
+/*
+ * Read a dbuf while the caller already holds dn_struct_rwlock; no
+ * prefetch is issued and the read must succeed.
+ */
+void
+dbuf_read_havestruct(dmu_buf_impl_t *db)
+{
+	int err;
+
+	ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
+	err = dbuf_read_generic(db, (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH));
+	ASSERT(err == 0);
+}
+
+/*
+ * Prepare a dbuf for being completely overwritten without reading its
+ * old contents from disk: wait out any in-flight read/fill, then if it
+ * is uncached allocate a fresh arc buffer and enter DB_FILL.
+ */
+static void
+dbuf_noread(dmu_buf_impl_t *db)
+{
+	ASSERT(!refcount_is_zero(&db->db_holds));
+	mutex_enter(&db->db_mtx);
+	while (db->db_state == DB_READ || db->db_state == DB_FILL)
+		cv_wait(&db->db_changed, &db->db_mtx);
+	if (db->db_state == DB_UNCACHED) {
+		/* bonus buffers are always allocated at full bonus size */
+		int blksz = (db->db_blkid == DB_BONUS_BLKID) ?
+		    DN_MAX_BONUSLEN : db->db.db_size;
+		ASSERT(db->db.db_data == NULL);
+		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+		    blksz, db));
+		db->db_state = DB_FILL;
+	} else {
+		ASSERT3U(db->db_state, ==, DB_CACHED);
+	}
+	mutex_exit(&db->db_mtx);
+}
+
+/*
+ * This is our just-in-time copy function.  It makes a copy of
+ * buffers, that have been modified in a previous transaction
+ * group, before we modify them in the current active group.
+ *
+ * This function is used in two places: when we are dirtying a
+ * buffer for the first time in a txg, and when we are freeing
+ * a range in a dnode that includes this buffer.
+ *
+ * Note that when we are called from dbuf_free_range() we do
+ * not put a hold on the buffer, we just traverse the active
+ * dbuf list for the dnode.
+ */
+static void
+dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
+{
+	arc_buf_t **quiescing, **syncing;
+	int size = (db->db_blkid == DB_BONUS_BLKID) ?
+	    DN_MAX_BONUSLEN : db->db.db_size;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	ASSERT(db->db.db_data != NULL);
+
+	/* txg-1 is quiescing and txg-2 is syncing relative to the open txg */
+	quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK];
+	syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK];
+
+	/*
+	 * If this buffer is referenced from the current quiescing
+	 * transaction group: either make a copy and reset the reference
+	 * to point to the copy, or (if there a no active holders) just
+	 * null out the current db_data pointer.
+	 */
+	if (*quiescing == db->db_buf) {
+		/*
+		 * If the quiescing txg is "dirty", then we better not
+		 * be referencing the same buffer from the syncing txg.
+		 */
+		ASSERT(*syncing != db->db_buf);
+		if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+			*quiescing = arc_buf_alloc(
+			    db->db_dnode->dn_objset->os_spa, size, db);
+			bcopy(db->db.db_data, (*quiescing)->b_data, size);
+		} else {
+			db->db.db_data = NULL;
+			db->db_buf = NULL;
+			db->db_state = DB_UNCACHED;
+		}
+		return;
+	}
+
+	/*
+	 * If this buffer is referenced from the current syncing
+	 * transaction group: either
+	 *	1 - make a copy and reset the reference, or
+	 *	2 - if there are no holders, just null the current db_data.
+	 */
+	if (*syncing == db->db_buf) {
+		ASSERT3P(*quiescing, ==, NULL);
+		ASSERT3U(db->db_dirtycnt, ==, 1);
+		if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+			/* we can't copy if we have already started a write */
+			ASSERT(*syncing != db->db_data_pending);
+			*syncing = arc_buf_alloc(
+			    db->db_dnode->dn_objset->os_spa, size, db);
+			bcopy(db->db.db_data, (*syncing)->b_data, size);
+		} else {
+			db->db.db_data = NULL;
+			db->db_buf = NULL;
+			db->db_state = DB_UNCACHED;
+		}
+	}
+}
+
+/*
+ * Undo a dmu_sync()-style block override for the given txg: either clear
+ * the in-progress marker (IN_DMU_SYNC), or free the already-written
+ * block, discard the override blkptr, and release the buffer so it will
+ * be written out through the normal path.  Caller must hold db_mtx.
+ */
+void
+dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg)
+{
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	if (db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC) {
+		db->db_d.db_overridden_by[txg&TXG_MASK] = NULL;
+	} else if (db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
+		/* free this block */
+		ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK]) ||
+		    db->db_dnode->dn_free_txg == txg);
+		if (!BP_IS_HOLE(db->db_d.db_overridden_by[txg&TXG_MASK])) {
+			/* XXX can get silent EIO here */
+			(void) arc_free(NULL, db->db_dnode->dn_objset->os_spa,
+			    txg, db->db_d.db_overridden_by[txg&TXG_MASK],
+			    NULL, NULL, ARC_WAIT);
+		}
+		kmem_free(db->db_d.db_overridden_by[txg&TXG_MASK],
+		    sizeof (blkptr_t));
+		db->db_d.db_overridden_by[txg&TXG_MASK] = NULL;
+		/* release the already-written buffer */
+		arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
+	}
+}
+
+/*
+ * Handle a free of blocks [blkid, blkid+nblks) for this dnode: for every
+ * cached level-0 dbuf in the range, undirty it, preserve any old-txg
+ * copies via dbuf_fix_old_data(), and zero the current contents.  Reads
+ * or fills in flight are flagged (db_freed_in_flight) and zeroed when
+ * they complete.
+ */
+void
+dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db, *db_next;
+	uint64_t txg = tx->tx_txg;
+
+	dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks);
+	mutex_enter(&dn->dn_dbufs_mtx);
+	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
+		/* grab next before we potentially change this entry's state */
+		db_next = list_next(&dn->dn_dbufs, db);
+		if ((db->db_level != 0) || (db->db_blkid == DB_BONUS_BLKID))
+			continue;
+		dprintf_dbuf(db, "found buf %s\n", "");
+		if (db->db_blkid < blkid ||
+		    db->db_blkid >= blkid+nblks)
+			continue;
+
+		/* found a level 0 buffer in the range */
+		if (dbuf_undirty(db, tx))
+			continue;
+
+		mutex_enter(&db->db_mtx);
+		if (db->db_state == DB_UNCACHED) {
+			ASSERT(db->db.db_data == NULL);
+			mutex_exit(&db->db_mtx);
+			continue;
+		}
+		if (db->db_state == DB_READ) {
+			/* this will be handled in dbuf_read_done() */
+			db->db_d.db_freed_in_flight = TRUE;
+			mutex_exit(&db->db_mtx);
+			continue;
+		}
+		if (db->db_state == DB_FILL) {
+			/* this will be handled in dbuf_rele() */
+			db->db_d.db_freed_in_flight = TRUE;
+			mutex_exit(&db->db_mtx);
+			continue;
+		}
+
+		/* make a copy of the data if necessary */
+		dbuf_fix_old_data(db, txg);
+
+		if (db->db.db_data) {
+			/* fill in with appropriate data */
+			arc_release(db->db_buf, db);
+			bzero(db->db.db_data, db->db.db_size);
+		}
+		mutex_exit(&db->db_mtx);
+	}
+	mutex_exit(&dn->dn_dbufs_mtx);
+}
+
+/*
+ * Return TRUE if writing this buffer will allocate a new block on disk,
+ * i.e. its current block (if any) is not freeable in this tx's context.
+ * Buffers belonging to the meta-objset (no dsl_dataset) return FALSE.
+ */
+static int
+dbuf_new_block(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
+	uint64_t birth_txg = 0;
+
+	/* Don't count meta-objects */
+	if (ds == NULL)
+		return (FALSE);
+
+	/*
+	 * We don't need any locking to protect db_blkptr:
+	 * If it's syncing, then db_dirtied will be set so we'll
+	 * ignore db_blkptr.
+	 */
+	ASSERT(MUTEX_HELD(&db->db_mtx)); /* XXX strictly necessary? */
+	/* If we have been dirtied since the last snapshot, its not new */
+	if (db->db_dirtied)
+		birth_txg = db->db_dirtied;
+	else if (db->db_blkptr)
+		birth_txg = db->db_blkptr->blk_birth;
+
+	if (birth_txg)
+		return (!dsl_dataset_block_freeable(ds, birth_txg, tx));
+	else
+		return (TRUE);
+}
+
+/*
+ * Grow the dbuf to `size` bytes (sizes only increase here): dirty it,
+ * allocate a larger arc buffer, copy the old contents, zero-fill the
+ * remainder, and charge the growth against the transaction.  Caller
+ * must hold dn_struct_rwlock as writer.
+ */
+void
+dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
+{
+	arc_buf_t *buf, *obuf;
+	int osize = db->db.db_size;
+
+	/* XXX does *this* func really need the lock? */
+	ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
+
+	ASSERT3U(osize, <=, size);
+	if (osize == size)
+		return;
+
+	/*
+	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
+	 * is OK, because there can be no other references to the db
+	 * when we are changing its size, so no concurrent DB_FILL can
+	 * be happening.
+	 */
+	/* Make a copy of the data if necessary */
+	dbuf_will_dirty(db, tx);
+
+	/* create the data buffer for the new block */
+	buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db);
+
+	/* copy old block data to the new block */
+	obuf = db->db_buf;
+	bcopy(obuf->b_data, buf->b_data, osize);
+	/* zero the remainder */
+	bzero((uint8_t *)buf->b_data + osize, size - osize);
+
+	mutex_enter(&db->db_mtx);
+	/* ASSERT3U(refcount_count(&db->db_holds), ==, 1); */
+	dbuf_set_data(db, buf);
+	arc_buf_free(obuf, db);
+	db->db.db_size = size;
+
+	/* fix up the dirty info */
+	if (db->db_level == 0)
+		db->db_d.db_data_old[tx->tx_txg&TXG_MASK] = buf;
+	mutex_exit(&db->db_mtx);
+
+	/* account for the additional space this buffer will consume */
+	dnode_willuse_space(db->db_dnode, size-osize, tx);
+}
+
+void
+dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ dnode_t *dn = db->db_dnode;
+ objset_impl_t *os = dn->dn_objset;
+ int drop_struct_lock = FALSE;
+ int txgoff = tx->tx_txg & TXG_MASK;
+
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(!refcount_is_zero(&db->db_holds));
+ dmu_tx_dirty_buf(tx, db);
+
+ /*
+ * Shouldn't dirty a regular buffer in syncing context. Private
+ * objects may be dirtied in syncing context, but only if they
+ * were already pre-dirtied in open context.
+ * XXX We may want to prohibit dirtying in syncing context even
+ * if they did pre-dirty.
+ */
+ ASSERT(!(dmu_tx_is_syncing(tx) &&
+ !BP_IS_HOLE(&dn->dn_objset->os_rootbp) &&
+ !(dn->dn_object & DMU_PRIVATE_OBJECT) &&
+ dn->dn_objset->os_dsl_dataset != NULL &&
+ !dsl_dir_is_private(
+ dn->dn_objset->os_dsl_dataset->ds_dir)));
+
+ /*
+ * We make this assert for private objects as well, but after we
+ * check if we're already dirty. They are allowed to re-dirty
+ * in syncing context.
+ */
+ ASSERT(dn->dn_object & DMU_PRIVATE_OBJECT ||
+ dn->dn_dirtyctx == DN_UNDIRTIED ||
+ dn->dn_dirtyctx ==
+ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
+
+ mutex_enter(&db->db_mtx);
+ /* XXX make this true for indirects too? */
+ ASSERT(db->db_level != 0 || db->db_state == DB_CACHED ||
+ db->db_state == DB_FILL);
+
+ /*
+ * If this buffer is currently part of an "overridden" region,
+ * we now need to remove it from that region.
+ */
+ if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
+ db->db_d.db_overridden_by[txgoff] != NULL) {
+ dbuf_unoverride(db, tx->tx_txg);
+ }
+
+ mutex_enter(&dn->dn_mtx);
+ /*
+ * Don't set dirtyctx to SYNC if we're just modifying this as we
+ * initialize the objset.
+ */
+ if (dn->dn_dirtyctx == DN_UNDIRTIED &&
+ !BP_IS_HOLE(&dn->dn_objset->os_rootbp)) {
+ dn->dn_dirtyctx =
+ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
+ ASSERT(dn->dn_dirtyctx_firstset == NULL);
+ dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
+ }
+ mutex_exit(&dn->dn_mtx);
+
+ /*
+ * If this buffer is already dirty, we're done.
+ */
+ if (list_link_active(&db->db_dirty_node[txgoff])) {
+ mutex_exit(&db->db_mtx);
+ return;
+ }
+
+ /*
+ * Only valid if not already dirty.
+ */
+ ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
+ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
+
+ ASSERT3U(dn->dn_nlevels, >, db->db_level);
+ ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
+ dn->dn_phys->dn_nlevels > db->db_level ||
+ dn->dn_next_nlevels[txgoff] > db->db_level ||
+ dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
+ dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
+
+ /*
+ * We should only be dirtying in syncing context if it's the
+ * mos, a spa os, or we're initializing the os. However, we are
+ * allowed to dirty in syncing context provided we already
+ * dirtied it in open context. Hence we must make this
+ * assertion only if we're not already dirty.
+ */
+ ASSERT(!dmu_tx_is_syncing(tx) ||
+ os->os_dsl_dataset == NULL ||
+ !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) ||
+ !BP_IS_HOLE(&os->os_rootbp));
+ ASSERT(db->db.db_size != 0);
+
+ dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+
+ if (db->db_level == 0) {
+ /*
+ * Release the data buffer from the cache so that we
+ * can modify it without impacting possible other users
+ * of this cached data block. Note that indirect blocks
+ * and private objects are not released until the syncing
+ * state (since they are only modified then).
+ *
+ * If this buffer is dirty in an old transaction group we need
+ * to make a copy of it so that the changes we make in this
+ * transaction group won't leak out when we sync the older txg.
+ */
+ ASSERT(db->db_buf != NULL);
+ ASSERT(db->db.db_data != NULL);
+ ASSERT(db->db_d.db_data_old[txgoff] == NULL);
+ if (!(db->db.db_object & DMU_PRIVATE_OBJECT)) {
+ arc_release(db->db_buf, db);
+ dbuf_fix_old_data(db, tx->tx_txg);
+ ASSERT(db->db_buf != NULL);
+ }
+ db->db_d.db_data_old[txgoff] = db->db_buf;
+ }
+
+ mutex_enter(&dn->dn_mtx);
+ /*
+ * We could have been freed_in_flight between the dbuf_noread
+ * and dbuf_dirty. We win, as though the dbuf_noread() had
+ * happened after the free.
+ */
+ if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
+ dnode_clear_range(dn, db->db_blkid, 1, tx);
+ db->db_d.db_freed_in_flight = FALSE;
+ }
+
+ db->db_dirtied = tx->tx_txg;
+ list_insert_tail(&dn->dn_dirty_dbufs[txgoff], db);
+ mutex_exit(&dn->dn_mtx);
+
+ /*
+ * If writting this buffer will consume a new block on disk,
+ * then update the accounting.
+ */
+ if (db->db_blkid != DB_BONUS_BLKID) {
+ if (!dbuf_new_block(db, tx) && db->db_blkptr) {
+ /*
+ * This is only a guess -- if the dbuf is dirty
+ * in a previous txg, we don't know how much
+ * space it will use on disk yet. We should
+ * really have the struct_rwlock to access
+ * db_blkptr, but since this is just a guess,
+ * it's OK if we get an odd answer.
+ */
+ dnode_willuse_space(dn,
+ -BP_GET_ASIZE(db->db_blkptr), tx);
+ }
+ dnode_willuse_space(dn, db->db.db_size, tx);
+ }
+
+ /*
+ * This buffer is now part of this txg
+ */
+ dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
+ db->db_dirtycnt += 1;
+ ASSERT3U(db->db_dirtycnt, <=, 3);
+
+ mutex_exit(&db->db_mtx);
+
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ dnode_setdirty(dn, tx);
+ return;
+ }
+
+ if (db->db_level == 0)
+ dnode_new_blkid(dn, db->db_blkid, tx);
+
+ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ drop_struct_lock = TRUE;
+ }
+
+ if (db->db_level < dn->dn_nlevels-1) {
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ dmu_buf_impl_t *parent;
+ parent = dbuf_hold_level(dn, db->db_level+1,
+ db->db_blkid >> epbs, FTAG);
+ if (drop_struct_lock)
+ rw_exit(&dn->dn_struct_rwlock);
+ dbuf_dirty(parent, tx);
+ dbuf_remove_ref(parent, FTAG);
+ } else {
+ if (drop_struct_lock)
+ rw_exit(&dn->dn_struct_rwlock);
+ }
+
+ dnode_setdirty(dn, tx);
+}
+
+/*
+ * Remove this dbuf's dirty record for the given open transaction, if it
+ * is safe to do so (i.e. no one else currently holds the dbuf).  Returns
+ * 1 if dropping the dirty hold evicted the dbuf, 0 otherwise.
+ */
+static int
+dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	dnode_t *dn = db->db_dnode;
+	int txgoff = tx->tx_txg & TXG_MASK;
+
+	ASSERT(tx->tx_txg != 0);
+
+	mutex_enter(&db->db_mtx);
+
+	/*
+	 * If this buffer is not dirty, we're done.
+	 */
+	if (!list_link_active(&db->db_dirty_node[txgoff])) {
+		mutex_exit(&db->db_mtx);
+		return (0);
+	}
+
+	/*
+	 * If this buffer is currently held, we cannot undirty
+	 * it, since one of the current holders may be in the
+	 * middle of an update.  Note that users of dbuf_undirty()
+	 * should not place a hold on the dbuf before the call.
+	 * XXX - this check assumes we are being called from
+	 * dbuf_free_range(), perhaps we should move it there?
+	 */
+	if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+		mutex_exit(&db->db_mtx);
+		/* Record the freed range so the free happens in syncing. */
+		mutex_enter(&dn->dn_mtx);
+		dnode_clear_range(dn, db->db_blkid, 1, tx);
+		mutex_exit(&dn->dn_mtx);
+		return (0);
+	}
+
+	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+
+	dbuf_unoverride(db, tx->tx_txg);
+
+	ASSERT(db->db.db_size != 0);
+	if (db->db_level == 0) {
+		ASSERT(db->db_buf != NULL);
+		ASSERT(db->db_d.db_data_old[txgoff] != NULL);
+		/* Free the private copy made when the dbuf was dirtied. */
+		if (db->db_d.db_data_old[txgoff] != db->db_buf)
+			arc_buf_free(db->db_d.db_data_old[txgoff], db);
+		db->db_d.db_data_old[txgoff] = NULL;
+	}
+
+	/* XXX would be nice to fix up dn_towrite_space[] */
+	/* XXX undo db_dirtied? but how? */
+	/* db->db_dirtied = tx->tx_txg; */
+
+	mutex_enter(&dn->dn_mtx);
+	list_remove(&dn->dn_dirty_dbufs[txgoff], db);
+	mutex_exit(&dn->dn_mtx);
+
+	ASSERT(db->db_dirtycnt > 0);
+	db->db_dirtycnt -= 1;
+
+	/* Drop the hold that dbuf_dirty() placed for this txg. */
+	if (refcount_remove(&db->db_holds,
+	    (void *)(uintptr_t)tx->tx_txg) == 0) {
+		/* make dbuf_verify() happy */
+		if (db->db.db_data)
+			bzero(db->db.db_data, db->db.db_size);
+
+		/* dbuf_evict() consumes (and exits) db_mtx for us. */
+		dbuf_evict(db);
+		return (1);
+	}
+
+	mutex_exit(&db->db_mtx);
+	return (0);
+}
+
+#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
+/*
+ * Announce an intent to modify this dbuf: read its current contents
+ * in (the read must succeed) and mark it dirty in the given tx.
+ */
+void
+dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	int rf = DB_RF_MUST_SUCCEED;
+
+	ASSERT(tx->tx_txg != 0);
+	ASSERT(!refcount_is_zero(&db->db_holds));
+
+	/* Tell the read path if we already hold dn_struct_rwlock as writer. */
+	if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
+		rf |= DB_RF_HAVESTRUCT;
+	(void) dbuf_read_generic(db, rf);
+	dbuf_dirty(db, tx);
+}
+
+#pragma weak dmu_buf_will_fill = dbuf_will_fill
+/*
+ * Announce an intent to completely overwrite this level-0 dbuf:
+ * skip reading the old contents (dbuf_noread) and mark it dirty.
+ * The caller must finish with dbuf_fill_done().
+ */
+void
+dbuf_will_fill(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	ASSERT(tx->tx_txg != 0);
+	ASSERT(db->db_level == 0);
+	ASSERT(!refcount_is_zero(&db->db_holds));
+
+	/* Private objects may only be modified from syncing context. */
+	ASSERT(!(db->db.db_object & DMU_PRIVATE_OBJECT) ||
+	    dmu_tx_private_ok(tx));
+
+	dbuf_noread(db);
+	dbuf_dirty(db, tx);
+}
+
+#pragma weak dmu_buf_fill_done = dbuf_fill_done
+/*
+ * Complete a fill started by dbuf_will_fill(): move the dbuf from
+ * DB_FILL to DB_CACHED and wake any waiters.  If the block was freed
+ * while we were filling it, zero the contents instead.
+ */
+/* ARGSUSED */
+void
+dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	mutex_enter(&db->db_mtx);
+	dbuf_verify(db);
+
+	if (db->db_state == DB_FILL) {
+		if (db->db_level == 0 && db->db_d.db_freed_in_flight) {
+			/* we were freed while filling */
+			/* XXX dbuf_undirty? */
+			bzero(db->db.db_data, db->db.db_size);
+			db->db_d.db_freed_in_flight = FALSE;
+		}
+		db->db_state = DB_CACHED;
+		cv_broadcast(&db->db_changed);
+	}
+	mutex_exit(&db->db_mtx);
+}
+
+
+/*
+ * Tear down an unreferenced dbuf: free its cached data, detach it from
+ * its parent, the dnode's dbuf list, and the hash table.  Caller holds
+ * dn_dbufs_mtx and db_mtx; db_mtx is dropped here.
+ */
+static void
+dbuf_clear(dmu_buf_impl_t *db)
+{
+	dnode_t *dn = db->db_dnode;
+
+	ASSERT(MUTEX_HELD(&dn->dn_dbufs_mtx));
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	ASSERT(refcount_is_zero(&db->db_holds));
+
+	if (db->db_state == DB_CACHED) {
+		ASSERT(db->db_buf != NULL);
+		arc_buf_free(db->db_buf, db);
+		db->db.db_data = NULL;
+		db->db_buf = NULL;
+		db->db_state = DB_UNCACHED;
+	}
+
+	ASSERT3U(db->db_state, ==, DB_UNCACHED);
+	ASSERT(db->db_buf == NULL);
+	ASSERT(db->db_data_pending == NULL);
+
+	mutex_exit(&db->db_mtx);
+
+	/*
+	 * If this dbuf is referenced from an indirect dbuf,
+	 * decrement the ref count on the indirect dbuf.
+	 */
+	if (db->db_parent && db->db_parent != dn->dn_dbuf)
+		dbuf_remove_ref(db->db_parent, db);
+
+	/* remove from dn_dbufs */
+	list_remove(&dn->dn_dbufs, db);
+
+	/* Drop the dnode hold taken in dbuf_create(). */
+	dnode_rele(dn, db);
+
+	dbuf_hash_remove(db);
+
+	db->db_dnode = NULL;
+	db->db_parent = NULL;
+	db->db_blkptr = NULL;
+}
+
+/*
+ * Locate the parent dbuf and block pointer for (level, blkid) of dnode dn.
+ * On success *parentp is the parent dbuf (held; NULL if the parent is the
+ * dnode itself or this is the bonus buffer) and *bpp points at the blkptr.
+ * Returns ENOENT if the block has no parent yet (a hole beyond maxblkid).
+ * Caller holds dn_struct_rwlock.
+ */
+static int
+dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
+    dmu_buf_impl_t **parentp, blkptr_t **bpp)
+{
+	int nlevels, epbs;
+
+	/* Treat an unallocated dnode as having a single level. */
+	if (dn->dn_phys->dn_nlevels == 0)
+		nlevels = 1;
+	else
+		nlevels = dn->dn_phys->dn_nlevels;
+
+	/* epbs: log2 of the number of blkptrs per indirect block. */
+	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+	ASSERT3U(level * epbs, <, 64);
+	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+	if (blkid == DB_BONUS_BLKID) {
+		/* this is the bonus buffer */
+		*parentp = NULL;
+		*bpp = NULL;
+		return (0);
+	} else if (level >= nlevels ||
+	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
+		/* the buffer has no parent yet */
+		*parentp = NULL;
+		*bpp = NULL;
+		return (ENOENT);
+	} else if (level < nlevels-1) {
+		/* this block is referenced from an indirect block */
+		int err = dbuf_hold_impl(dn, level+1,
+		    blkid >> epbs, fail_sparse, NULL, parentp);
+		if (err)
+			return (err);
+		dbuf_read_havestruct(*parentp);
+		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
+		    (blkid & ((1ULL << epbs) - 1));
+		return (0);
+	} else {
+		/* the block is referenced from the dnode */
+		ASSERT3U(level, ==, nlevels-1);
+		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
+		    blkid < dn->dn_phys->dn_nblkptr);
+		*parentp = dn->dn_dbuf;
+		*bpp = &dn->dn_phys->dn_blkptr[blkid];
+		return (0);
+	}
+}
+
+/*
+ * Allocate and initialize a new dbuf for (level, blkid) of dnode dn,
+ * insert it into the dbuf hash table and the dnode's dbuf list, and
+ * return it with db_mtx held (via dbuf_hash_insert).  If another thread
+ * raced us and inserted one first, free ours and return theirs.
+ */
+static dmu_buf_impl_t *
+dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
+    dmu_buf_impl_t *parent, blkptr_t *blkptr)
+{
+	objset_impl_t *os = dn->dn_objset;
+	dmu_buf_impl_t *db, *odb;
+
+	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+	ASSERT(dn->dn_type != DMU_OT_NONE);
+
+	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
+
+	db->db_objset = os;
+	db->db.db_object = dn->dn_object;
+	db->db_level = level;
+	db->db_blkid = blkid;
+	db->db_state = DB_UNCACHED;
+
+	if (db->db_blkid == DB_BONUS_BLKID) {
+		db->db.db_size = dn->dn_bonuslen;
+		db->db.db_offset = DB_BONUS_BLKID;
+	} else {
+		/* Indirects are a full indirect block; level 0 uses datablksz. */
+		int blocksize =
+		    db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
+		db->db.db_size = blocksize;
+		db->db.db_offset = db->db_blkid * blocksize;
+	}
+
+	db->db_dirtied = 0;
+	db->db_dirtycnt = 0;
+
+	bzero(&db->db_d, sizeof (db->db_d));
+
+	/*
+	 * Hold the dn_dbufs_mtx while we get the new dbuf
+	 * in the hash table *and* added to the dbufs list.
+	 * This prevents a possible deadlock with someone
+	 * trying to look up this dbuf before it's added to the
+	 * dn_dbufs list.
+	 */
+	mutex_enter(&dn->dn_dbufs_mtx);
+	if ((odb = dbuf_hash_insert(db)) != NULL) {
+		/* someone else inserted it first */
+		kmem_cache_free(dbuf_cache, db);
+		mutex_exit(&dn->dn_dbufs_mtx);
+		return (odb);
+	}
+	list_insert_head(&dn->dn_dbufs, db);
+	mutex_exit(&dn->dn_dbufs_mtx);
+
+	/* Hold the parent indirect (the dnode's own dbuf needs no hold). */
+	if (parent && parent != dn->dn_dbuf)
+		dbuf_add_ref(parent, db);
+
+	(void) refcount_add(&dn->dn_holds, db);
+
+	db->db_dnode = dn;
+	db->db_parent = parent;
+	db->db_blkptr = blkptr;
+
+	dprintf_dbuf(db, "db=%p\n", db);
+
+	return (db);
+}
+
+/*
+ * Decide whether this dbuf may be evicted: it must be UNCACHED or
+ * CACHED and have no holds.  If evictable, fire the user eviction
+ * callback as a side effect.  Caller holds db_mtx.
+ */
+static int
+dbuf_evictable(dmu_buf_impl_t *db)
+{
+	int i;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	dbuf_verify(db);
+
+	if (db->db_state != DB_UNCACHED && db->db_state != DB_CACHED)
+		return (FALSE);
+
+	if (!refcount_is_zero(&db->db_holds))
+		return (FALSE);
+
+#ifdef ZFS_DEBUG
+	/* An evictable dbuf must not be dirty in any txg. */
+	for (i = 0; i < TXG_SIZE; i++) {
+		ASSERT(!list_link_active(&db->db_dirty_node[i]));
+		ASSERT(db->db_level != 0 || db->db_d.db_data_old[i] == NULL);
+	}
+#endif
+
+	/*
+	 * Now we know we want to free it.
+	 * This call must be done last, since it has side effects -
+	 * calling the db_evict_func().
+	 */
+	dbuf_evict_user(db);
+	return (TRUE);
+}
+
+/*
+ * Return a fully torn-down dbuf (see dbuf_clear()) to the kmem cache.
+ * All links and data must already have been cleared.
+ */
+static void
+dbuf_destroy(dmu_buf_impl_t *db)
+{
+	ASSERT(refcount_is_zero(&db->db_holds));
+
+	ASSERT(db->db.db_data == NULL);
+	ASSERT(db->db_dnode == NULL);
+	ASSERT(db->db_parent == NULL);
+	ASSERT(db->db_hash_next == NULL);
+	ASSERT(db->db_blkptr == NULL);
+	ASSERT(db->db_data_pending == NULL);
+
+	kmem_cache_free(dbuf_cache, db);
+}
+
+/*
+ * Issue a speculative (async, may-fail) ARC read for level-0 block
+ * blkid of dnode dn, unless it is already cached or has been freed.
+ * Caller holds dn_struct_rwlock.
+ */
+void
+dbuf_prefetch(dnode_t *dn, uint64_t blkid)
+{
+	dmu_buf_impl_t *db, *parent = NULL;
+	blkptr_t *bp = NULL;
+
+	ASSERT(blkid != DB_BONUS_BLKID);
+	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+
+	/* No point prefetching a block that is pending free. */
+	if (dnode_block_freed(dn, blkid))
+		return;
+
+	/* dbuf_find() returns with db_mtx held */
+	if (db = dbuf_find(dn, 0, blkid)) {
+		/*
+		 * This dbuf is already in the cache.  We assume that
+		 * it is already CACHED, or else about to be either
+		 * read or filled.
+		 */
+		mutex_exit(&db->db_mtx);
+		return;
+	}
+
+	if (dbuf_findbp(dn, 0, blkid, TRUE, &parent, &bp) == 0) {
+		if (bp && !BP_IS_HOLE(bp)) {
+			/* Fire-and-forget read; no callback, no waiting. */
+			(void) arc_read(NULL, dn->dn_objset->os_spa, bp,
+			    dmu_ot[dn->dn_type].ot_byteswap,
+			    NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+			    (ARC_NOWAIT | ARC_PREFETCH));
+		}
+		/* Drop the parent hold taken by dbuf_findbp(). */
+		if (parent && parent != dn->dn_dbuf)
+			dbuf_rele(parent);
+	}
+}
+
+/*
+ * Look up (or create) and hold the dbuf for (level, blkid) of dnode dn,
+ * returning it in *dbp.  If fail_sparse, return ENOENT when the block
+ * is a hole.  Returns with db_holds incremented, and db_mtx not held.
+ * Note: dn_struct_rwlock must be held.
+ */
+int
+dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+    void *tag, dmu_buf_impl_t **dbp)
+{
+	dmu_buf_impl_t *db, *parent = NULL;
+
+	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+	ASSERT3U(dn->dn_nlevels, >, level);
+
+	*dbp = NULL;
+
+	/* dbuf_find() returns with db_mtx held */
+	db = dbuf_find(dn, level, blkid);
+
+	if (db == NULL) {
+		blkptr_t *bp = NULL;
+		int err;
+
+		/* Not cached: locate the parent/blkptr and create the dbuf. */
+		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
+		if (fail_sparse) {
+			if (err == 0 && bp && BP_IS_HOLE(bp))
+				err = ENOENT;
+			if (err) {
+				if (parent && parent != dn->dn_dbuf)
+					dbuf_rele(parent);
+				return (err);
+			}
+		}
+		db = dbuf_create(dn, level, blkid, parent, bp);
+	}
+
+	/*
+	 * If this buffer is currently syncing out, and we are
+	 * still referencing it from db_data, we need to make
+	 * a copy of it in case we decide we want to dirty it
+	 * again in this txg.
+	 */
+	if (db->db_level == 0 && db->db_state == DB_CACHED &&
+	    !(dn->dn_object & DMU_PRIVATE_OBJECT) &&
+	    db->db_data_pending == db->db_buf) {
+		int size = (db->db_blkid == DB_BONUS_BLKID) ?
+		    DN_MAX_BONUSLEN : db->db.db_size;
+
+		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+		    size, db));
+		bcopy(db->db_data_pending->b_data, db->db.db_data,
+		    db->db.db_size);
+	}
+
+	dbuf_add_ref(db, tag);
+	dbuf_update_data(db);
+	dbuf_verify(db);
+	mutex_exit(&db->db_mtx);
+
+	/* NOTE: we can't rele the parent until after we drop the db_mtx */
+	if (parent && parent != dn->dn_dbuf)
+		dbuf_rele(parent);
+
+	ASSERT3P(db->db_dnode, ==, dn);
+	ASSERT3U(db->db_blkid, ==, blkid);
+	ASSERT3U(db->db_level, ==, level);
+	*dbp = db;
+
+	return (0);
+}
+
+/*
+ * Convenience wrapper: hold the level-0 dbuf for blkid with no tag,
+ * not failing on sparse blocks.  Caller holds dn_struct_rwlock.
+ */
+dmu_buf_impl_t *
+dbuf_hold(dnode_t *dn, uint64_t blkid)
+{
+	dmu_buf_impl_t *db;
+	(void) dbuf_hold_impl(dn, 0, blkid, FALSE, NULL, &db);
+	return (db);
+}
+
+/*
+ * Convenience wrapper: hold the dbuf at the given indirection level,
+ * tagged with the caller's tag.  Caller holds dn_struct_rwlock.
+ */
+dmu_buf_impl_t *
+dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
+{
+	dmu_buf_impl_t *db;
+	(void) dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
+	return (db);
+}
+
+/*
+ * Hold the dnode's bonus buffer dbuf, acquiring dn_struct_rwlock
+ * as reader for the duration of the lookup.
+ */
+dmu_buf_impl_t *
+dbuf_hold_bonus(dnode_t *dn, void *tag)
+{
+	dmu_buf_impl_t *db;
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	(void) dbuf_hold_impl(dn, 0, DB_BONUS_BLKID, FALSE, tag, &db);
+	rw_exit(&dn->dn_struct_rwlock);
+	return (db);
+}
+
+/*
+ * Add a tagged hold on the dbuf.
+ */
+void
+dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
+{
+	(void) refcount_add(&db->db_holds, tag);
+	/* dprintf_dbuf(db, "adding ref %p; holds up to %lld\n", tag, holds); */
+}
+
+/*
+ * Drop a tagged hold on the dbuf.  When the last hold is dropped, the
+ * dbuf is evicted; when only dirty-holds remain and immediate-evict is
+ * set, the user eviction callback fires.  Takes dn_dbufs_mtx (with a
+ * temporary dnode hold) unless the caller already holds it.
+ */
+void
+dbuf_remove_ref(dmu_buf_impl_t *db, void *tag)
+{
+	int64_t holds;
+	dnode_t *dn = db->db_dnode;
+	int need_mutex;
+
+	ASSERT(dn != NULL);
+	need_mutex = !MUTEX_HELD(&dn->dn_dbufs_mtx);
+
+	if (need_mutex) {
+		/* Keep the dnode alive while we hold its dbufs mutex. */
+		dnode_add_ref(dn, FTAG);
+		mutex_enter(&dn->dn_dbufs_mtx);
+	}
+
+	mutex_enter(&db->db_mtx);
+	dbuf_verify(db);
+
+	holds = refcount_remove(&db->db_holds, tag);
+
+	if (holds == 0) {
+		ASSERT3U(db->db_state, !=, DB_FILL);
+		/* Null out the user's cached data pointer before evicting. */
+		if (db->db_level == 0 &&
+		    db->db_d.db_user_data_ptr_ptr != NULL)
+			*db->db_d.db_user_data_ptr_ptr = NULL;
+		dbuf_evict(db);
+	} else {
+		if (holds == db->db_dirtycnt &&
+		    db->db_level == 0 && db->db_d.db_immediate_evict)
+			dbuf_evict_user(db);
+		mutex_exit(&db->db_mtx);
+	}
+
+	if (need_mutex) {
+		mutex_exit(&dn->dn_dbufs_mtx);
+		dnode_rele(dn, FTAG);
+	}
+}
+
+/*
+ * Drop an untagged hold (the counterpart of dbuf_hold()).
+ */
+void
+dbuf_rele(dmu_buf_impl_t *db)
+{
+	dbuf_remove_ref(db, NULL);
+}
+
+#pragma weak dmu_buf_refcount = dbuf_refcount
+/*
+ * Return the current number of holds on the dbuf.
+ */
+uint64_t
+dbuf_refcount(dmu_buf_impl_t *db)
+{
+	return (refcount_count(&db->db_holds));
+}
+
+/*
+ * Attach user data and an eviction callback to the dbuf, but only if
+ * no user data is currently set (old_user_ptr == NULL).  Returns the
+ * previous user pointer (NULL on success).
+ */
+void *
+dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
+    dmu_buf_evict_func_t *evict_func)
+{
+	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
+	    user_data_ptr_ptr, evict_func));
+}
+
+/*
+ * Like dmu_buf_set_user(), but also request "immediate eviction":
+ * the eviction callback fires as soon as only dirty-holds remain.
+ */
+void *
+dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
+    dmu_buf_evict_func_t *evict_func)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+	db->db_d.db_immediate_evict = TRUE;
+	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
+	    user_data_ptr_ptr, evict_func));
+}
+
+/*
+ * Atomically replace the dbuf's user data, provided the current user
+ * pointer matches old_user_ptr (compare-and-swap semantics).  Returns
+ * old_user_ptr on success, or the actual current user pointer if the
+ * comparison failed.  Level-0 dbufs only.
+ */
+void *
+dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
+    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	ASSERT(db->db_level == 0);
+
+	/* A user pointer and eviction callback must be set together. */
+	ASSERT((user_ptr == NULL) == (evict_func == NULL));
+
+	mutex_enter(&db->db_mtx);
+
+	if (db->db_d.db_user_ptr == old_user_ptr) {
+		db->db_d.db_user_ptr = user_ptr;
+		db->db_d.db_user_data_ptr_ptr = user_data_ptr_ptr;
+		db->db_d.db_evict_func = evict_func;
+
+		dbuf_update_data(db);
+	} else {
+		old_user_ptr = db->db_d.db_user_ptr;
+	}
+
+	mutex_exit(&db->db_mtx);
+	return (old_user_ptr);
+}
+
+/*
+ * Return the user data pointer previously attached with
+ * dmu_buf_set_user() (NULL if none).  The dbuf must be held.
+ */
+void *
+dmu_buf_get_user(dmu_buf_t *db_fake)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	ASSERT(!refcount_is_zero(&db->db_holds));
+
+	return (db->db_d.db_user_ptr);
+}
+
+/*
+ * Write out this dbuf's dirty data for the syncing transaction group.
+ * Handles the special cases (freed-after-dirty, bonus buffers, unmodified
+ * indirects, overridden blocks) inline; otherwise hooks the dbuf up to
+ * its parent's block pointer and issues the ARC write, with
+ * dbuf_write_done() as the completion callback.
+ */
+void
+dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
+{
+	arc_buf_t **data;
+	uint64_t txg = tx->tx_txg;
+	dnode_t *dn = db->db_dnode;
+	objset_impl_t *os = dn->dn_objset;
+	int blksz;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
+
+	mutex_enter(&db->db_mtx);
+	/*
+	 * To be synced, we must be dirtied.  But we
+	 * might have been freed after the dirty.
+	 */
+	if (db->db_state == DB_UNCACHED) {
+		/* This buffer has been freed since it was dirtied */
+		ASSERT(db->db.db_data == NULL);
+	} else if (db->db_state == DB_FILL) {
+		/* This buffer was freed and is now being re-filled */
+		ASSERT(db->db.db_data != db->db_d.db_data_old[txg&TXG_MASK]);
+	} else {
+		ASSERT3U(db->db_state, ==, DB_CACHED);
+	}
+	dbuf_verify(db);
+
+	/*
+	 * Don't need a lock on db_dirty (dn_mtx), because it can't
+	 * be modified yet.
+	 */
+
+	if (db->db_level == 0) {
+		/* Level 0: write the snapshot taken at dirty time. */
+		data = &db->db_d.db_data_old[txg&TXG_MASK];
+		blksz = arc_buf_size(*data);
+		/*
+		 * If this buffer is currently "in use" (i.e., there are
+		 * active holds and db_data still references it), then make
+		 * a copy before we start the write so that any modifications
+		 * from the open txg will not leak into this write.
+		 *
+		 * NOTE: this copy does not need to be made for objects only
+		 * modified in the syncing context (e.g. DNONE_DNODE blocks)
+		 * or if there is no actual write involved (bonus blocks).
+		 */
+		if (!(dn->dn_object & DMU_PRIVATE_OBJECT) &&
+		    db->db_d.db_overridden_by[txg&TXG_MASK] == NULL &&
+		    db->db_blkid != DB_BONUS_BLKID) {
+			if (refcount_count(&db->db_holds) > 1 &&
+			    *data == db->db_buf) {
+				*data = arc_buf_alloc(
+				    db->db_dnode->dn_objset->os_spa, blksz, db);
+				bcopy(db->db.db_data, (*data)->b_data, blksz);
+			}
+			db->db_data_pending = *data;
+		} else if (dn->dn_object & DMU_PRIVATE_OBJECT) {
+			/*
+			 * Private object buffers are released here rather
+			 * than in dbuf_dirty() since they are only modified
+			 * in the syncing context and we don't want the
+			 * overhead of making multiple copies of the data.
+			 */
+			arc_release(db->db_buf, db);
+		}
+	} else {
+		/* Indirect: write the live buffer. */
+		data = &db->db_buf;
+		if (*data == NULL) {
+			/*
+			 * This can happen if we dirty and then free
+			 * the level-0 data blocks in the same txg.  So
+			 * this indirect remains unchanged.
+			 */
+			if (db->db_dirtied == txg)
+				db->db_dirtied = 0;
+			ASSERT(db->db_dirtycnt > 0);
+			db->db_dirtycnt -= 1;
+			mutex_exit(&db->db_mtx);
+			dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+			return;
+		}
+		blksz = db->db.db_size;
+		ASSERT3U(blksz, ==, 1<<dn->dn_phys->dn_indblkshift);
+	}
+
+	ASSERT(*data != NULL);
+
+	if (db->db_blkid == DB_BONUS_BLKID) {
+		/*
+		 * Simply copy the bonus data into the dnode.  It will
+		 * be written out when the dnode is synced (and it will
+		 * be synced, since it must have been dirty for dbuf_sync
+		 * to be called).  The bonus data will be byte swapped
+		 * in dnode_byteswap.
+		 */
+		/*
+		 * Use dn_phys->dn_bonuslen since db.db_size is the length
+		 * of the bonus buffer in the open transaction rather than
+		 * the syncing transaction.
+		 */
+		ASSERT3U(db->db_level, ==, 0);
+		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, blksz);
+		bcopy((*data)->b_data, DN_BONUS(dn->dn_phys),
+		    dn->dn_phys->dn_bonuslen);
+		if (*data != db->db_buf)
+			arc_buf_free(*data, db);
+		db->db_d.db_data_old[txg&TXG_MASK] = NULL;
+		db->db_data_pending = NULL;
+		if (db->db_dirtied == txg)
+			db->db_dirtied = 0;
+		ASSERT(db->db_dirtycnt > 0);
+		db->db_dirtycnt -= 1;
+		mutex_exit(&db->db_mtx);
+		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+		return;
+	} else if (db->db_level > 0 && !arc_released(db->db_buf)) {
+		/*
+		 * This indirect buffer was marked dirty, but
+		 * never modified (if it had been modified, then
+		 * we would have released the buffer).  There is
+		 * no reason to write anything.
+		 */
+		db->db_data_pending = NULL;
+		if (db->db_dirtied == txg)
+			db->db_dirtied = 0;
+		ASSERT(db->db_dirtycnt > 0);
+		db->db_dirtycnt -= 1;
+		mutex_exit(&db->db_mtx);
+		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+		return;
+	} else if (db->db_blkptr == NULL &&
+	    db->db_level == dn->dn_phys->dn_nlevels-1 &&
+	    db->db_blkid < dn->dn_phys->dn_nblkptr) {
+		/*
+		 * This buffer was allocated at a time when there was
+		 * no available blkptrs from the dnode, or it was
+		 * inappropriate to hook it in (i.e., nlevels mis-match).
+		 */
+		ASSERT(db->db_blkptr == NULL);
+		ASSERT(db->db_parent == NULL);
+		db->db_parent = dn->dn_dbuf;
+		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
+		dbuf_verify(db);
+		mutex_exit(&db->db_mtx);
+	} else if (db->db_blkptr == NULL) {
+		/* Hook up to (and if needed, look up) the parent indirect. */
+		dmu_buf_impl_t *parent = db->db_parent;
+		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+		mutex_exit(&db->db_mtx);
+		ASSERT(dn->dn_phys->dn_nlevels > 1);
+		if (parent == NULL) {
+			rw_enter(&dn->dn_struct_rwlock, RW_READER);
+			(void) dbuf_hold_impl(dn, db->db_level+1,
+			    db->db_blkid >> epbs, FALSE, NULL, &parent);
+			rw_exit(&dn->dn_struct_rwlock);
+			dbuf_add_ref(parent, db);
+			db->db_parent = parent;
+			dbuf_rele(parent);
+		}
+		dbuf_read(parent);
+	} else {
+		mutex_exit(&db->db_mtx);
+	}
+
+	ASSERT(IS_DNODE_DNODE(dn->dn_object) || db->db_parent != NULL);
+
+	if (db->db_parent != dn->dn_dbuf) {
+		dmu_buf_impl_t *parent = db->db_parent;
+		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+		mutex_enter(&db->db_mtx);
+		ASSERT(db->db_level == parent->db_level-1);
+		ASSERT(list_link_active(&parent->db_dirty_node[txg&TXG_MASK]));
+		/*
+		 * We may have read this block after we dirtied it,
+		 * so never released it from the cache.
+		 */
+		arc_release(parent->db_buf, parent);
+
+		/* Our blkptr lives inside the parent's data buffer. */
+		db->db_blkptr = (blkptr_t *)parent->db.db_data +
+		    (db->db_blkid & ((1ULL << epbs) - 1));
+		dbuf_verify(db);
+		mutex_exit(&db->db_mtx);
+	}
+	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
+
+#ifdef ZFS_DEBUG
+	if (db->db_parent == dn->dn_dbuf) {
+		/*
+		 * We don't need to dnode_setdirty(dn) because if we got
+		 * here then the parent is already dirty.
+		 */
+		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
+		ASSERT3P(db->db_blkptr, ==,
+		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
+	}
+#endif
+	if (db->db_level == 0 &&
+	    db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
+		/* Overridden block: install the override bp, no write needed. */
+		arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];
+		blkptr_t **bpp = &db->db_d.db_overridden_by[txg&TXG_MASK];
+		int old_size = BP_GET_ASIZE(db->db_blkptr);
+		int new_size = BP_GET_ASIZE(*bpp);
+
+		ASSERT(db->db_blkid != DB_BONUS_BLKID);
+
+		dnode_diduse_space(dn, new_size-old_size);
+		mutex_enter(&dn->dn_mtx);
+		if (db->db_blkid > dn->dn_phys->dn_maxblkid)
+			dn->dn_phys->dn_maxblkid = db->db_blkid;
+		mutex_exit(&dn->dn_mtx);
+
+		dsl_dataset_block_born(os->os_dsl_dataset, *bpp, tx);
+		if (!BP_IS_HOLE(db->db_blkptr))
+			dsl_dataset_block_kill(os->os_dsl_dataset,
+			    db->db_blkptr, os->os_synctx);
+
+		mutex_enter(&db->db_mtx);
+		*db->db_blkptr = **bpp;
+		kmem_free(*bpp, sizeof (blkptr_t));
+		*bpp = NULL;
+
+		if (*old != db->db_buf)
+			arc_buf_free(*old, db);
+		*old = NULL;
+		db->db_data_pending = NULL;
+
+		cv_broadcast(&db->db_changed);
+
+		ASSERT(db->db_dirtycnt > 0);
+		db->db_dirtycnt -= 1;
+		mutex_exit(&db->db_mtx);
+		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+	} else {
+		int checksum, compress;
+
+		if (db->db_level > 0) {
+			/*
+			 * XXX -- we should design a compression algorithm
+			 * that specializes in arrays of bps.
+			 */
+			checksum = ZIO_CHECKSUM_FLETCHER_4;
+			compress = ZIO_COMPRESS_LZJB;
+		} else {
+			/*
+			 * Allow dnode settings to override objset settings,
+			 * except for metadata checksums.
+			 */
+			if (dmu_ot[dn->dn_type].ot_metadata) {
+				checksum = os->os_md_checksum;
+				compress = zio_compress_select(dn->dn_compress,
+				    os->os_md_compress);
+			} else {
+				checksum = zio_checksum_select(dn->dn_checksum,
+				    os->os_checksum);
+				compress = zio_compress_select(dn->dn_compress,
+				    os->os_compress);
+			}
+		}
+#ifdef ZFS_DEBUG
+		if (db->db_parent) {
+			ASSERT(list_link_active(
+			    &db->db_parent->db_dirty_node[txg&TXG_MASK]));
+			ASSERT(db->db_parent == dn->dn_dbuf ||
+			    db->db_parent->db_level > 0);
+			if (dn->dn_object & DMU_PRIVATE_OBJECT ||
+			    db->db_level > 0)
+				ASSERT(*data == db->db_buf);
+		}
+#endif
+		ASSERT3U(db->db_blkptr->blk_birth, <=, tx->tx_txg);
+		(void) arc_write(zio, os->os_spa, checksum, compress, txg,
+		    db->db_blkptr, *data, dbuf_write_done, db,
+		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT);
+		/*
+		 * We can't access db after arc_write, since it could finish
+		 * and be freed, and we have no locks on it.
+		 */
+	}
+}
+
+/*
+ * Argument bundle for the deferred block-born/block-kill taskq callbacks
+ * dispatched from dbuf_write_done().
+ */
+struct dbuf_arg {
+	objset_impl_t *os;
+	blkptr_t bp;
+};
+
+/*
+ * Taskq callback: record a newly-born block in the dataset's
+ * accounting, then free the argument bundle.
+ */
+static void
+dbuf_do_born(void *arg)
+{
+	struct dbuf_arg *da = arg;
+	dsl_dataset_block_born(da->os->os_dsl_dataset,
+	    &da->bp, da->os->os_synctx);
+	kmem_free(da, sizeof (struct dbuf_arg));
+}
+
+/*
+ * Taskq callback: record a killed (replaced) block in the dataset's
+ * accounting, then free the argument bundle.
+ */
+static void
+dbuf_do_kill(void *arg)
+{
+	struct dbuf_arg *da = arg;
+	dsl_dataset_block_kill(da->os->os_dsl_dataset,
+	    &da->bp, da->os->os_synctx);
+	kmem_free(da, sizeof (struct dbuf_arg));
+}
+
+/*
+ * ARC write completion callback for dbuf_sync(): update space accounting
+ * and dn_maxblkid, compute and store the block pointer's fill count,
+ * type, and level, dispatch deferred born/kill accounting if the block
+ * moved, and drop the txg's dirty hold on the dbuf.
+ */
+/* ARGSUSED */
+static void
+dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+	dmu_buf_impl_t *db = vdb;
+	dnode_t *dn = db->db_dnode;
+	objset_impl_t *os = dn->dn_objset;
+	uint64_t txg = zio->io_txg;
+	uint64_t fill = 0;
+	int i;
+	int old_size, new_size;
+
+	ASSERT3U(zio->io_error, ==, 0);
+
+	dprintf_dbuf_bp(db, &zio->io_bp_orig, "bp_orig: %s", "");
+
+	old_size = BP_GET_ASIZE(&zio->io_bp_orig);
+	new_size = BP_GET_ASIZE(zio->io_bp);
+
+	dnode_diduse_space(dn, new_size-old_size);
+
+	mutex_enter(&db->db_mtx);
+
+	if (db->db_dirtied == txg)
+		db->db_dirtied = 0;
+
+	if (db->db_level == 0) {
+		arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];
+
+		ASSERT(db->db_blkid != DB_BONUS_BLKID);
+
+		/* Free the write-time copy if one was made in dbuf_sync(). */
+		if (*old != db->db_buf)
+			arc_buf_free(*old, db);
+		*old = NULL;
+		db->db_data_pending = NULL;
+
+		mutex_enter(&dn->dn_mtx);
+		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
+		    !BP_IS_HOLE(db->db_blkptr))
+			dn->dn_phys->dn_maxblkid = db->db_blkid;
+		mutex_exit(&dn->dn_mtx);
+
+		if (dn->dn_type == DMU_OT_DNODE) {
+			/* Dnode blocks: fill = number of allocated dnodes. */
+			dnode_phys_t *dnp = db->db.db_data;
+			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
+			    i--, dnp++) {
+				if (dnp->dn_type != DMU_OT_NONE)
+					fill++;
+			}
+		} else {
+			/* Data blocks: fill = 1 unless the block is a hole. */
+			if (!BP_IS_HOLE(db->db_blkptr))
+				fill = 1;
+		}
+	} else {
+		/* Indirect blocks: fill = sum of the children's fill counts. */
+		blkptr_t *bp = db->db.db_data;
+		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+		if (!BP_IS_HOLE(db->db_blkptr)) {
+			ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, db->db.db_size);
+			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
+			    db->db.db_size);
+		}
+		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
+			if (BP_IS_HOLE(bp))
+				continue;
+			ASSERT3U(BP_GET_LSIZE(bp), ==,
+			    db->db_level == 1 ? dn->dn_datablksz :
+			    (1<<dn->dn_phys->dn_indblkshift));
+			fill += bp->blk_fill;
+		}
+	}
+
+	if (!BP_IS_HOLE(db->db_blkptr)) {
+		db->db_blkptr->blk_fill = fill;
+		BP_SET_TYPE(db->db_blkptr, dn->dn_type);
+		BP_SET_LEVEL(db->db_blkptr, db->db_level);
+	} else {
+		ASSERT3U(fill, ==, 0);
+		ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
+	}
+
+	dprintf_dbuf_bp(db, db->db_blkptr,
+	    "wrote %llu bytes to blkptr:", zio->io_size);
+
+	ASSERT(db->db_parent == NULL ||
+	    list_link_active(&db->db_parent->db_dirty_node[txg&TXG_MASK]));
+	cv_broadcast(&db->db_changed);
+	ASSERT(db->db_dirtycnt > 0);
+	db->db_dirtycnt -= 1;
+	mutex_exit(&db->db_mtx);
+
+	/* We must do this after we've set the bp's type and level */
+	if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp),
+	    BP_IDENTITY(&zio->io_bp_orig))) {
+		/* Block moved: defer born/kill accounting to the taskq. */
+		struct dbuf_arg *da;
+		da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP);
+		da->os = os;
+		da->bp = *zio->io_bp;
+		(void) taskq_dispatch(dbuf_tq, dbuf_do_born, da, 0);
+		if (!BP_IS_HOLE(&zio->io_bp_orig)) {
+			da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP);
+			da->os = os;
+			da->bp = zio->io_bp_orig;
+			(void) taskq_dispatch(dbuf_tq, dbuf_do_kill, da, 0);
+		}
+	}
+
+	dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+}
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
new file mode 100644
index 0000000000..14fab6d420
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -0,0 +1,1761 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+
+/*
+ * Table of per-object-type information, indexed by dmu_object_type_t.
+ * Each entry supplies the byteswap routine used when reading the type on
+ * an opposite-endian machine, a boolean flag (TRUE for all internal DMU/
+ * DSL/ZAP types, FALSE for raw user data such as plain files and zvol
+ * blocks -- presumably "is metadata"; confirm against the
+ * dmu_object_type_info_t definition), and a human-readable name.
+ * The order here must match the DMU_OT_* enum exactly.
+ */
+const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
+	{	byteswap_uint8_array,	TRUE,	"unallocated"		},
+	{	zap_byteswap,		TRUE,	"object directory"	},
+	{	byteswap_uint64_array,	TRUE,	"object array"		},
+	{	byteswap_uint8_array,	TRUE,	"packed nvlist"		},
+	{	byteswap_uint64_array,	TRUE,	"packed nvlist size"	},
+	{	byteswap_uint64_array,	TRUE,	"bplist"		},
+	{	byteswap_uint64_array,	TRUE,	"bplist header"		},
+	{	byteswap_uint64_array,	TRUE,	"SPA space map header"	},
+	{	byteswap_uint64_array,	TRUE,	"SPA space map"		},
+	{	byteswap_uint64_array,	TRUE,	"ZIL intent log"	},
+	{	dnode_buf_byteswap,	TRUE,	"DMU dnode"		},
+	{	dmu_objset_byteswap,	TRUE,	"DMU objset"		},
+	{	byteswap_uint64_array,	TRUE,	"DSL directory"		},
+	{	zap_byteswap,		TRUE,	"DSL directory child map"},
+	{	zap_byteswap,		TRUE,	"DSL dataset snap map"	},
+	{	zap_byteswap,		TRUE,	"DSL props"		},
+	{	byteswap_uint64_array,	TRUE,	"DSL dataset"		},
+	{	zfs_znode_byteswap,	TRUE,	"ZFS znode"		},
+	{	zfs_acl_byteswap,	TRUE,	"ZFS ACL"		},
+	{	byteswap_uint8_array,	FALSE,	"ZFS plain file"	},
+	{	zap_byteswap,		TRUE,	"ZFS directory"		},
+	{	zap_byteswap,		TRUE,	"ZFS master node"	},
+	{	zap_byteswap,		TRUE,	"ZFS delete queue"	},
+	{	byteswap_uint8_array,	FALSE,	"zvol object"		},
+	{	zap_byteswap,		TRUE,	"zvol prop"		},
+	{	byteswap_uint8_array,	FALSE,	"other uint8[]"		},
+	{	byteswap_uint64_array,	FALSE,	"other uint64[]"	},
+	{	zap_byteswap,		TRUE,	"other ZAP"		},
+};
+
+/*
+ * Read a held array of dbufs: issue the uncached reads asynchronously
+ * under one root zio, wait for that zio, then wait for any reads or
+ * fills already in flight on the remaining buffers.  Returns 0 or, when
+ * DB_RF_CANFAIL is set, the error from zio_wait().
+ */
+static int
+dmu_buf_read_array_impl(dmu_buf_impl_t **dbp, int numbufs, uint32_t flags)
+{
+	dnode_t *dn;
+	zio_t *zio;
+	uint64_t total;
+	int i, err;
+
+	if (numbufs == 0)
+		return (0);
+
+	total = numbufs * dbp[0]->db.db_size;
+	ASSERT(total <= DMU_MAX_ACCESS);
+
+	dn = dbp[0]->db_dnode;
+	/* the root zio may fail only if the caller can tolerate it */
+	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL,
+	    (flags & DB_RF_CANFAIL) ? 1 : 0);
+
+	/* don't prefetch if the read is large */
+	if (total >= zfetch_array_rd_sz)
+		flags |= DB_RF_NOPREFETCH;
+
+	/* initiate async reads */
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	for (i = 0; i < numbufs; i++) {
+		if (dbp[i]->db_state == DB_UNCACHED)
+			dbuf_read_impl(dbp[i], zio, flags);
+	}
+	rw_exit(&dn->dn_struct_rwlock);
+
+	err = zio_wait(zio);
+	if (err)
+		return (err);
+
+	/* wait for other io to complete */
+	for (i = 0; i < numbufs; i++) {
+		mutex_enter(&dbp[i]->db_mtx);
+		while (dbp[i]->db_state == DB_READ ||
+		    dbp[i]->db_state == DB_FILL)
+			cv_wait(&dbp[i]->db_changed, &dbp[i]->db_mtx);
+		ASSERT(dbp[i]->db_state == DB_CACHED);
+		mutex_exit(&dbp[i]->db_mtx);
+	}
+
+	return (0);
+}
+
+/*
+ * Read a held array of dbufs; the read must succeed (errors assert).
+ */
+void
+dmu_buf_read_array(dmu_buf_t **dbp_fake, int numbufs)
+{
+	int err = dmu_buf_read_array_impl((dmu_buf_impl_t **)dbp_fake,
+	    numbufs, DB_RF_MUST_SUCCEED);
+
+	ASSERT3U(err, ==, 0);
+}
+
+/*
+ * As dmu_buf_read_array(), but propagates I/O errors to the caller.
+ */
+int
+dmu_buf_read_array_canfail(dmu_buf_t **dbp_fake, int numbufs)
+{
+	return (dmu_buf_read_array_impl((dmu_buf_impl_t **)dbp_fake,
+	    numbufs, DB_RF_CANFAIL));
+}
+
+/*
+ * Hold the dbuf containing byte `offset' of the given object and return
+ * its public dmu_buf_t.  The dnode hold is released before returning;
+ * the caller must release the dbuf when done.
+ * NOTE(review): the dnode_hold() result is not NULL-checked here,
+ * unlike dmu_bonus_hold_tag()/dmu_prefetch() -- confirm callers only
+ * pass objects known to exist.
+ */
+dmu_buf_t *
+dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset)
+{
+	dnode_t *dn;
+	uint64_t blkid;
+	dmu_buf_impl_t *db;
+
+	/* dataset_verify(dd); */
+
+	dn = dnode_hold(os->os, object, FTAG);
+	blkid = dbuf_whichblock(dn, offset);
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	db = dbuf_hold(dn, blkid);
+	rw_exit(&dn->dn_struct_rwlock);
+	dnode_rele(dn, FTAG);
+	return (&db->db);
+}
+
+/*
+ * Untagged convenience wrapper around dmu_bonus_hold_tag().
+ */
+dmu_buf_t *
+dmu_bonus_hold(objset_t *os, uint64_t object)
+{
+	return (dmu_bonus_hold_tag(os, object, NULL));
+}
+
+/*
+ * Report the largest bonus buffer the DMU supports.
+ */
+int
+dmu_bonus_max(void)
+{
+	return (DN_MAX_BONUSLEN);
+}
+
+/*
+ * Returns held bonus buffer if the object exists, NULL if it doesn't.
+ *
+ * For ZAP objects an extra hold is taken on block 0 and stashed in
+ * dn_db0; dmu_buf_rele()/dmu_buf_rele_tag() release that extra hold,
+ * so bonus buffers obtained here must be released through one of those.
+ */
+dmu_buf_t *
+dmu_bonus_hold_tag(objset_t *os, uint64_t object, void *tag)
+{
+	dnode_t *dn = dnode_hold(os->os, object, FTAG);
+	dmu_buf_impl_t *db;
+
+	if (dn == NULL)
+		return (NULL);
+
+	db = dbuf_hold_bonus(dn, tag);
+	/* XXX - hack: hold the first block if this is a ZAP object */
+	if (dmu_ot[dn->dn_type].ot_byteswap == zap_byteswap) {
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		dn->dn_db0 = dbuf_hold(dn, 0);
+		rw_exit(&dn->dn_struct_rwlock);
+	}
+	dnode_rele(dn, FTAG);
+	return (&db->db);
+}
+
+/*
+ * Hold every dbuf overlapping [offset, offset+length) of dn and return
+ * a kmem-allocated array of their public handles; *numbufsp is set to
+ * the count.  Release with dmu_buf_rele_array().  Returns NULL (count 0)
+ * for a zero-length request.
+ */
+static dmu_buf_t **
+dbuf_hold_array(dnode_t *dn,
+    uint64_t offset, uint64_t length, int *numbufsp)
+{
+	dmu_buf_t **dbp;
+	uint64_t blkid, nblks, i;
+
+	if (length == 0) {
+		if (numbufsp)
+			*numbufsp = 0;
+		return (NULL);
+	}
+
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	if (dn->dn_datablkshift) {
+		int blkshift = dn->dn_datablkshift;
+		/* count whole blocks covered by the aligned range */
+		nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
+		    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
+	} else {
+		/* non-power-of-two blocksize: object is a single block */
+		ASSERT3U(offset + length, <=, dn->dn_datablksz);
+		nblks = 1;
+	}
+	dbp = kmem_alloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
+
+	blkid = dbuf_whichblock(dn, offset);
+	for (i = 0; i < nblks; i++) {
+		dmu_buf_impl_t *dbuf;
+		dbuf = dbuf_hold(dn, blkid+i);
+		dbp[i] = &dbuf->db;
+	}
+	rw_exit(&dn->dn_struct_rwlock);
+
+	if (numbufsp)
+		*numbufsp = nblks;
+	return (dbp);
+}
+
+/*
+ * Hold all dbufs overlapping [offset, offset+length) of the given
+ * object.  Thin wrapper around dbuf_hold_array() that manages the
+ * dnode hold.  Release the result with dmu_buf_rele_array().
+ */
+dmu_buf_t **
+dmu_buf_hold_array(objset_t *os, uint64_t object,
+    uint64_t offset, uint64_t length, int *numbufsp)
+{
+	dnode_t *dn;
+	dmu_buf_t **dbp;
+
+	ASSERT(length <= DMU_MAX_ACCESS);
+
+	if (length == 0) {
+		if (numbufsp != NULL)
+			*numbufsp = 0;
+		return (NULL);
+	}
+
+	dn = dnode_hold(os->os, object, FTAG);
+	dbp = dbuf_hold_array(dn, offset, length, numbufsp);
+	dnode_rele(dn, FTAG);
+	return (dbp);
+}
+
+/*
+ * Add a tagged reference to a held buffer.
+ */
+void
+dmu_buf_add_ref(dmu_buf_t *dbuf, void *tag)
+{
+	dbuf_add_ref((dmu_buf_impl_t *)dbuf, tag);
+}
+
+/*
+ * Drop a tagged reference previously taken with dmu_buf_add_ref().
+ */
+void
+dmu_buf_remove_ref(dmu_buf_t *dbuf, void *tag)
+{
+	dbuf_remove_ref((dmu_buf_impl_t *)dbuf, tag);
+}
+
+/*
+ * Release a held buffer.  For a bonus buffer of a ZAP object this also
+ * drops the extra dn_db0 hold taken in dmu_bonus_hold_tag() -- the two
+ * functions must stay in sync.
+ */
+void
+dmu_buf_rele(dmu_buf_t *dbuf_fake)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf_fake;
+
+	/* XXX - hack: hold the first block if this is a ZAP object */
+	if (db->db_blkid == DB_BONUS_BLKID &&
+	    dmu_ot[db->db_dnode->dn_type].ot_byteswap == zap_byteswap)
+		dbuf_rele(db->db_dnode->dn_db0);
+	dbuf_rele(db);
+}
+
+/*
+ * Tagged variant of dmu_buf_rele(): drops the tagged reference instead
+ * of an anonymous one, and likewise undoes the ZAP dn_db0 hack from
+ * dmu_bonus_hold_tag().
+ */
+void
+dmu_buf_rele_tag(dmu_buf_t *dbuf_fake, void *tag)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf_fake;
+
+	/* XXX - hack: hold the first block if this is a ZAP object */
+	if (db->db_blkid == DB_BONUS_BLKID &&
+	    dmu_ot[db->db_dnode->dn_type].ot_byteswap == zap_byteswap)
+		dbuf_rele(db->db_dnode->dn_db0);
+	dbuf_remove_ref(db, tag);
+}
+
+/*
+ * Release every buffer held by dmu_buf_hold_array() and free the
+ * handle array itself.
+ */
+void
+dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs)
+{
+	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
+	int i;
+
+	if (numbufs == 0)
+		return;
+
+	ASSERT((numbufs * dbp[0]->db.db_size) <= DMU_MAX_ACCESS);
+
+	for (i = 0; i < numbufs; i++)
+		dbuf_rele(dbp[i]);
+
+	kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
+}
+
+/*
+ * Issue prefetch I/O for the given range of an object.  len == 0 means
+ * "prefetch the dnode block itself" (useful before touching the bonus
+ * buffer).  Best-effort: a nonexistent object is silently ignored.
+ *
+ * Fix: use 1ULL when shifting by dn_datablkshift, matching
+ * dbuf_hold_array(); the previous `1<<blkshift' performed the shift in
+ * signed int, which is inconsistent and would overflow for shifts >= 31.
+ */
+void
+dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
+{
+	dnode_t *dn;
+	uint64_t blkid;
+	int nblks, i;
+
+	if (len == 0) { /* they're interested in the bonus buffer */
+		dn = os->os->os_meta_dnode;
+
+		if (object == 0 || object >= DN_MAX_OBJECT)
+			return;
+
+		/* prefetch the meta-dnode block holding this dnode */
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
+		dbuf_prefetch(dn, blkid);
+		rw_exit(&dn->dn_struct_rwlock);
+		return;
+	}
+
+	/*
+	 * XXX - Note, if the dnode for the requested object is not
+	 * already cached, we will do a *synchronous* read in the
+	 * dnode_hold() call.  The same is true for any indirects.
+	 */
+	dn = dnode_hold(os->os, object, FTAG);
+	if (dn == NULL)
+		return;
+
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	if (dn->dn_datablkshift) {
+		int blkshift = dn->dn_datablkshift;
+		nblks = (P2ROUNDUP(offset+len, 1ULL<<blkshift) -
+		    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
+	} else {
+		/* single-block object: prefetch it iff offset is inside */
+		nblks = (offset < dn->dn_datablksz);
+	}
+
+	if (nblks != 0) {
+		blkid = dbuf_whichblock(dn, offset);
+		for (i = 0; i < nblks; i++)
+			dbuf_prefetch(dn, blkid+i);
+	}
+
+	rw_exit(&dn->dn_struct_rwlock);
+
+	dnode_rele(dn, FTAG);
+}
+
+/*
+ * Free the byte range [offset, offset+size) of the given object within
+ * transaction tx.  size == -1ULL means "to the end of the object".
+ * NOTE(review): dnode_hold() is not NULL-checked here, unlike
+ * dmu_prefetch() -- confirm callers only pass existing objects.
+ */
+void
+dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
+    uint64_t size, dmu_tx_t *tx)
+{
+	dnode_t *dn = dnode_hold(os->os, object, FTAG);
+	ASSERT(offset < UINT64_MAX);
+	ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
+	dnode_free_range(dn, offset, size, tx);
+	dnode_rele(dn, FTAG);
+}
+
+/*
+ * Copy `size' bytes starting at `offset' of the object into `buf'.
+ * The range is processed in chunks of at most DMU_MAX_ACCESS/2 so the
+ * per-chunk dbuf array stays bounded while still reading in parallel.
+ * With DB_RF_CANFAIL, I/O errors are returned; otherwise they assert.
+ */
+static int
+dmu_read_impl(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+    void *buf, uint32_t flags)
+{
+	dnode_t *dn;
+	dmu_buf_t **dbp;
+	int numbufs, i;
+
+	dn = dnode_hold(os->os, object, FTAG);
+
+	/*
+	 * Single-block object (no datablkshift): clamp the request to the
+	 * block and zero the part of buf beyond the object's end.
+	 */
+	if (dn->dn_datablkshift == 0) {
+		int newsz = offset > dn->dn_datablksz ? 0 :
+		    MIN(size, dn->dn_datablksz - offset);
+		bzero((char *)buf + newsz, size - newsz);
+		size = newsz;
+	}
+
+	dnode_rele(dn, FTAG);
+
+	if (size == 0)
+		return (0);
+
+	while (size > 0) {
+		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
+		int err;
+
+		/*
+		 * NB: we could do this block-at-a-time, but it's nice
+		 * to be reading in parallel.
+		 */
+		dbp = dmu_buf_hold_array(os, object, offset, mylen, &numbufs);
+		err = dmu_buf_read_array_impl((dmu_buf_impl_t **)dbp, numbufs,
+		    flags);
+		if (err) {
+			dmu_buf_rele_array(dbp, numbufs);
+			return (err);
+		}
+
+		for (i = 0; i < numbufs; i++) {
+			int tocpy;
+			int bufoff;
+			dmu_buf_t *db = dbp[i];
+
+			ASSERT(size > 0);
+
+			/* copy the part of this block inside the request */
+			bufoff = offset - db->db_offset;
+			tocpy = (int)MIN(db->db_size - bufoff, size);
+
+			bcopy((char *)db->db_data + bufoff, buf, tocpy);
+
+			offset += tocpy;
+			size -= tocpy;
+			buf = (char *)buf + tocpy;
+		}
+		dmu_buf_rele_array(dbp, numbufs);
+	}
+	return (0);
+}
+
+/*
+ * Read that must succeed; I/O failure asserts in debug builds.
+ */
+void
+dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+    void *buf)
+{
+	int err = dmu_read_impl(os, object, offset, size, buf,
+	    DB_RF_MUST_SUCCEED);
+
+	ASSERT3U(err, ==, 0);
+}
+
+/*
+ * Read that propagates I/O errors to the caller.
+ */
+int
+dmu_read_canfail(objset_t *os, uint64_t object, uint64_t offset,
+    uint64_t size, void *buf)
+{
+	return (dmu_read_impl(os, object, offset, size, buf, DB_RF_CANFAIL));
+}
+
+/*
+ * Write `size' bytes from `buf' at `offset' of the object within tx.
+ * Blocks that are completely overwritten use the will_fill/fill_done
+ * protocol (no need to read the old contents); partially overwritten
+ * blocks are dirtied, which may read them in first.
+ */
+void
+dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+    const void *buf, dmu_tx_t *tx)
+{
+	dmu_buf_t **dbp;
+	int numbufs, i;
+
+	dbp = dmu_buf_hold_array(os, object, offset, size, &numbufs);
+
+	for (i = 0; i < numbufs; i++) {
+		int tocpy;
+		int bufoff;
+		dmu_buf_t *db = dbp[i];
+
+		ASSERT(size > 0);
+
+		bufoff = offset - db->db_offset;
+		tocpy = (int)MIN(db->db_size - bufoff, size);
+
+		/* only the first and last blocks can be partial */
+		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+		if (tocpy == db->db_size)
+			dmu_buf_will_fill(db, tx);
+		else
+			dmu_buf_will_dirty(db, tx);
+
+		bcopy(buf, (char *)db->db_data + bufoff, tocpy);
+
+		if (tocpy == db->db_size)
+			dmu_buf_fill_done(db, tx);
+
+		offset += tocpy;
+		size -= tocpy;
+		buf = (char *)buf + tocpy;
+	}
+	dmu_buf_rele_array(dbp, numbufs);
+}
+
+#ifdef _KERNEL
+/*
+ * Like dmu_write(), but the data comes from a uio.  Uses the same
+ * will_fill/will_dirty split per block.  On a uiomove() error, the
+ * fill is still completed for the current block before bailing out,
+ * and the error is returned.
+ */
+int
+dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+    uio_t *uio, dmu_tx_t *tx)
+{
+	dmu_buf_t **dbp;
+	int numbufs, i;
+	int err = 0;
+
+	dbp = dmu_buf_hold_array(os, object, offset, size, &numbufs);
+
+	for (i = 0; i < numbufs; i++) {
+		int tocpy;
+		int bufoff;
+		dmu_buf_t *db = dbp[i];
+
+		ASSERT(size > 0);
+
+		bufoff = offset - db->db_offset;
+		tocpy = (int)MIN(db->db_size - bufoff, size);
+
+		/* only the first and last blocks can be partial */
+		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+		if (tocpy == db->db_size)
+			dmu_buf_will_fill(db, tx);
+		else
+			dmu_buf_will_dirty(db, tx);
+
+		/*
+		 * XXX uiomove could block forever (eg. nfs-backed
+		 * pages).  There needs to be a uiolockdown() function
+		 * to lock the pages in memory, so that uiomove won't
+		 * block.
+		 */
+		err = uiomove((char *)db->db_data + bufoff, tocpy,
+		    UIO_WRITE, uio);
+
+		if (tocpy == db->db_size)
+			dmu_buf_fill_done(db, tx);
+
+		if (err)
+			break;
+
+		offset += tocpy;
+		size -= tocpy;
+	}
+	dmu_buf_rele_array(dbp, numbufs);
+	return (err);
+}
+#endif
+
+/* State threaded through the backup (send) traversal callbacks. */
+struct backuparg {
+	dmu_replay_record_t *drr;	/* scratch record, reused per dump */
+	vnode_t *vp;			/* destination of the stream */
+	objset_t *os;			/* objset being sent */
+	int err;			/* first vn_rdwr() error, sticky */
+};
+
+/*
+ * Append `len' bytes (must be a multiple of 8) to the backup stream.
+ * Records the vn_rdwr() error in ba->err and returns it.
+ */
+static int
+dump_bytes(struct backuparg *ba, void *buf, int len)
+{
+	ssize_t resid; /* have to get resid to get detailed errno */
+	/* Need to compute checksum here */
+	ASSERT3U(len % 8, ==, 0);
+	ba->err = vn_rdwr(UIO_WRITE, ba->vp,
+	    (caddr_t)buf, len,
+	    0, UIO_SYSSPACE, FAPPEND, RLIM_INFINITY, CRED(), &resid);
+	return (ba->err);
+}
+
+/*
+ * Emit a DRR_FREE record describing a freed region of an object.
+ * Returns EINTR on stream-write failure (details in ba->err).
+ */
+static int
+dump_free(struct backuparg *ba, uint64_t object, uint64_t offset,
+    uint64_t length)
+{
+	dmu_replay_record_t *drr = ba->drr;
+
+	bzero(drr, sizeof (dmu_replay_record_t));
+	drr->drr_type = DRR_FREE;
+	drr->drr_u.drr_free.drr_object = object;
+	drr->drr_u.drr_free.drr_offset = offset;
+	drr->drr_u.drr_free.drr_length = length;
+
+	return (dump_bytes(ba, drr, sizeof (dmu_replay_record_t)) ?
+	    EINTR : 0);
+}
+
+/*
+ * Emit a DRR_WRITE record followed by the block's payload.
+ * Returns EINTR on stream-write failure (details in ba->err).
+ */
+static int
+dump_data(struct backuparg *ba, dmu_object_type_t type,
+    uint64_t object, uint64_t offset, int blksz, void *data)
+{
+	dmu_replay_record_t *drr = ba->drr;
+
+	bzero(drr, sizeof (dmu_replay_record_t));
+	drr->drr_type = DRR_WRITE;
+	drr->drr_u.drr_write.drr_object = object;
+	drr->drr_u.drr_write.drr_type = type;
+	drr->drr_u.drr_write.drr_offset = offset;
+	drr->drr_u.drr_write.drr_length = blksz;
+
+	if (dump_bytes(ba, drr, sizeof (dmu_replay_record_t)) != 0)
+		return (EINTR);
+	return (dump_bytes(ba, data, blksz) ? EINTR : 0);
+}
+
+/*
+ * Emit a DRR_FREEOBJECTS record covering `numobjs' objects starting at
+ * `firstobj'.  Returns EINTR on stream-write failure.
+ */
+static int
+dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs)
+{
+	dmu_replay_record_t *drr = ba->drr;
+
+	bzero(drr, sizeof (dmu_replay_record_t));
+	drr->drr_type = DRR_FREEOBJECTS;
+	drr->drr_u.drr_freeobjects.drr_firstobj = firstobj;
+	drr->drr_u.drr_freeobjects.drr_numobjs = numobjs;
+
+	return (dump_bytes(ba, drr, sizeof (dmu_replay_record_t)) ?
+	    EINTR : 0);
+}
+
+/*
+ * Emit the records describing one dnode: a DRR_OBJECT record plus its
+ * bonus data, then a DRR_FREE for everything past the last block.  An
+ * unallocated dnode is emitted as a one-object DRR_FREEOBJECTS instead.
+ * Returns EINTR on stream-write failure.
+ */
+static int
+dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
+{
+	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
+		return (dump_freeobjects(ba, object, 1));
+
+	/* write an OBJECT record */
+	bzero(ba->drr, sizeof (dmu_replay_record_t));
+	ba->drr->drr_type = DRR_OBJECT;
+	ba->drr->drr_u.drr_object.drr_object = object;
+	ba->drr->drr_u.drr_object.drr_type = dnp->dn_type;
+	ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype;
+	ba->drr->drr_u.drr_object.drr_blksz =
+	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+	ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen;
+	ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum;
+	ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress;
+
+	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
+		return (EINTR);
+
+	/* bonus payload is padded to 8 bytes, matching restore_read() */
+	if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)))
+		return (EINTR);
+
+	/* free anything past the end of the file */
+	if (dump_free(ba, object, (dnp->dn_maxblkid + 1) *
+	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
+		return (EINTR);
+	if (ba->err)
+		return (EINTR);
+	return (0);
+}
+
+/*
+ * Bytes of object data spanned by one block pointer at the given
+ * indirection level of dnode dnp (level 0 == one data block).
+ */
+#define	BP_SPAN(dnp, level) \
+	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
+	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
+
+/*
+ * Traversal callback for dmu_sendbackup(): translate each visited
+ * block into stream records.  Branches, in order:
+ *   - hole in the meta-dnode (object 0)  -> DRR_FREEOBJECTS
+ *   - hole in any other object           -> DRR_FREE
+ *   - level-0 dnode block                -> one dump_dnode() per dnode
+ *   - level-0 data block                 -> DRR_WRITE (reading via the
+ *     ARC if the traverser didn't supply the data)
+ * Indirect blocks and the objset block fall through with no record.
+ * Returns 0 or EINTR (signal, or write failure recorded in ba->err).
+ */
+static int
+backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
+{
+	struct backuparg *ba = arg;
+	uint64_t object = bc->bc_bookmark.zb_object;
+	int level = bc->bc_bookmark.zb_level;
+	uint64_t blkid = bc->bc_bookmark.zb_blkid;
+	blkptr_t *bp = bc->bc_blkptr.blk_birth ? &bc->bc_blkptr : NULL;
+	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
+	void *data = bc->bc_data;
+	int err = 0;
+
+	if (issig(JUSTLOOKING))
+		return (EINTR);
+
+	ASSERT(data || bp == NULL);
+
+	if (bp == NULL && object == 0) {
+		uint64_t span = BP_SPAN(bc->bc_dnode, level);
+		uint64_t dnobj = (blkid * span) >> DNODE_SHIFT;
+		err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
+	} else if (bp == NULL) {
+		uint64_t span = BP_SPAN(bc->bc_dnode, level);
+		err = dump_free(ba, object, blkid * span, span);
+	} else if (data && level == 0 && type == DMU_OT_DNODE) {
+		dnode_phys_t *blk = data;
+		int i;
+		int blksz = BP_GET_LSIZE(bp);
+
+		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
+			uint64_t dnobj =
+			    (blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
+			err = dump_dnode(ba, dnobj, blk+i);
+			if (err)
+				break;
+		}
+	} else if (level == 0 &&
+	    type != DMU_OT_DNODE && type != DMU_OT_OBJSET) {
+		int blksz = BP_GET_LSIZE(bp);
+		if (data == NULL) {
+			arc_buf_t *abuf;
+
+			/* synchronous ARC read of the block contents */
+			(void) arc_read(NULL, spa, bp,
+			    dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf,
+			    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED,
+			    ARC_WAIT);
+
+			if (abuf) {
+				err = dump_data(ba, type, object, blkid * blksz,
+				    blksz, abuf->b_data);
+				arc_buf_free(abuf, &abuf);
+			}
+		} else {
+			err = dump_data(ba, type, object, blkid * blksz,
+			    blksz, data);
+		}
+	}
+
+	ASSERT(err == 0 || err == EINTR);
+	return (err);
+}
+
+/*
+ * Generate a backup (send) stream for snapshot `tosnap' onto vnode vp,
+ * incrementally against `fromsnap' if non-NULL.  Validates that tosnap
+ * is a snapshot and that fromsnap is an earlier snapshot of the same
+ * filesystem, writes the BEGIN record, walks the dataset emitting
+ * records via backup_cb(), then writes the END record.
+ *
+ * Fix: the scratch dmu_replay_record_t was leaked on the traversal
+ * error path and on a failed END-record write; free it on all exits.
+ */
+int
+dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, vnode_t *vp)
+{
+	dsl_dataset_t *ds = tosnap->os->os_dsl_dataset;
+	dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL;
+	dmu_replay_record_t *drr;
+	struct backuparg ba;
+	int err;
+
+	/* tosnap must be a snapshot */
+	if (ds->ds_phys->ds_next_snap_obj == 0)
+		return (EINVAL);
+
+	/* fromsnap must be an earlier snapshot from the same fs as tosnap */
+	if (fromds && (ds->ds_dir != fromds->ds_dir ||
+	    fromds->ds_phys->ds_creation_txg >=
+	    ds->ds_phys->ds_creation_txg))
+		return (EXDEV);
+
+	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
+	drr->drr_type = DRR_BEGIN;
+	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
+	drr->drr_u.drr_begin.drr_version = DMU_BACKUP_VERSION;
+	drr->drr_u.drr_begin.drr_creation_time =
+	    ds->ds_phys->ds_creation_time;
+	drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type;
+	drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
+	if (fromds)
+		drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
+	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
+
+	ba.drr = drr;
+	ba.vp = vp;
+	ba.os = tosnap;
+
+	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
+		kmem_free(drr, sizeof (dmu_replay_record_t));
+		return (ba.err);
+	}
+
+	err = traverse_dsl_dataset(ds,
+	    fromds ? fromds->ds_phys->ds_creation_txg : 0,
+	    ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK,
+	    backup_cb, &ba);
+
+	if (err) {
+		if (err == EINTR && ba.err)
+			err = ba.err;
+		kmem_free(drr, sizeof (dmu_replay_record_t));
+		return (err);
+	}
+
+	bzero(drr, sizeof (dmu_replay_record_t));
+	drr->drr_type = DRR_END;
+
+	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
+		kmem_free(drr, sizeof (dmu_replay_record_t));
+		return (ba.err);
+	}
+
+	kmem_free(drr, sizeof (dmu_replay_record_t));
+
+	return (0);
+}
+
+/* State for restoring (receiving) a backup stream. */
+struct restorearg {
+	int err;		/* first read/parse error, sticky */
+	int byteswap;		/* stream is opposite-endian */
+	vnode_t *vp;		/* source of the stream */
+	char *buf;		/* staging buffer for restore_read() */
+	uint64_t voff;		/* current read offset into vp */
+	int buflen; /* number of valid bytes in buf */
+	int bufoff; /* next offset to read */
+	int bufsize; /* amount of memory allocated for buf */
+};
+
+/*
+ * Syncing-context task for the BEGIN record of an incremental stream.
+ * Validates that dd's head dataset has a most-recent snapshot matching
+ * drr_fromguid, has no changes since that snapshot, and does not
+ * already have the target snapshot name; then marks the dataset as
+ * restoring.  Returns 0 on success or an errno describing the check
+ * that failed.
+ */
+static int
+replay_incremental_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+	struct drr_begin *drrb = arg;
+	dsl_dataset_t *ds = NULL;
+	dsl_dataset_t *ds_prev = NULL;
+	const char *snapname;
+	int err = EINVAL;
+	uint64_t val;
+
+	/* this must be a filesytem */
+	if (dd->dd_phys->dd_head_dataset_obj == 0)
+		goto die;
+
+	ds = dsl_dataset_open_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj,
+	    NULL, DS_MODE_EXCLUSIVE, FTAG);
+
+	if (ds == NULL) {
+		err = EBUSY;
+		goto die;
+	}
+
+	/* must already be a snapshot of this fs */
+	if (ds->ds_phys->ds_prev_snap_obj == 0) {
+		err = ENODEV;
+		goto die;
+	}
+
+	/* most recent snapshot must match fromguid */
+	/*
+	 * NOTE(review): ds_prev is dereferenced without a NULL check,
+	 * unlike ds above -- confirm dsl_dataset_open_obj() cannot fail
+	 * for an existing snapshot object.
+	 */
+	ds_prev = dsl_dataset_open_obj(dd->dd_pool,
+	    ds->ds_phys->ds_prev_snap_obj, NULL,
+	    DS_MODE_STANDARD | DS_MODE_READONLY, FTAG);
+	if (ds_prev->ds_phys->ds_guid != drrb->drr_fromguid) {
+		err = ENODEV;
+		goto die;
+	}
+
+	/* must not have any changes since most recent snapshot */
+	if (ds->ds_phys->ds_bp.blk_birth >
+	    ds_prev->ds_phys->ds_creation_txg) {
+		err = ETXTBSY;
+		goto die;
+	}
+
+	/* new snapshot name must not exist */
+	snapname = strrchr(drrb->drr_toname, '@');
+	if (snapname == NULL) {
+		err = EEXIST;
+		goto die;
+	}
+	snapname++;
+	err = zap_lookup(dd->dd_pool->dp_meta_objset,
+	    ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &val);
+	if (err != ENOENT) {
+		if (err == 0)
+			err = EEXIST;
+		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+		dsl_dataset_close(ds_prev, DS_MODE_STANDARD, FTAG);
+		return (err);
+	}
+
+	dsl_dataset_close(ds_prev, DS_MODE_STANDARD, FTAG);
+
+	/* The point of no (unsuccessful) return. */
+
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	ds->ds_phys->ds_restoring = TRUE;
+
+	dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+	return (0);
+
+die:
+	if (ds_prev)
+		dsl_dataset_close(ds_prev, DS_MODE_STANDARD, FTAG);
+	if (ds)
+		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+	return (err);
+}
+
+/*
+ * Syncing-context task for the BEGIN record of a full stream: create
+ * the target filesystem named by drr_toname (minus the @snap suffix),
+ * create its objset, and mark it as restoring.  Returns 0 or the
+ * errno from dataset creation / name validation.
+ */
+static int
+replay_full_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+	struct drr_begin *drrb = arg;
+	int err;
+	char *fsfullname, *fslastname, *cp;
+	dsl_dataset_t *ds;
+
+	fsfullname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+	(void) strncpy(fsfullname, drrb->drr_toname, MAXNAMELEN);
+	/* strip the "@snapname" suffix to get the filesystem name */
+	cp = strchr(fsfullname, '@');
+	if (cp == NULL) {
+		kmem_free(fsfullname, MAXNAMELEN);
+		return (EINVAL);
+	}
+	*cp = '\0';
+	/* last path component is the new dataset's own name */
+	fslastname = strrchr(fsfullname, '/');
+	if (fslastname == NULL) {
+		kmem_free(fsfullname, MAXNAMELEN);
+		return (EINVAL);
+	}
+	fslastname++;
+
+	err = dsl_dataset_create_sync(dd, fsfullname, fslastname, NULL, tx);
+	if (err) {
+		kmem_free(fsfullname, MAXNAMELEN);
+		return (err);
+	}
+
+	/* the point of no (unsuccessful) return */
+
+	err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, fsfullname,
+	    DS_MODE_EXCLUSIVE, FTAG, &ds);
+	ASSERT3U(err, ==, 0);
+	kmem_free(fsfullname, MAXNAMELEN);
+
+	(void) dmu_objset_create_impl(dsl_dataset_get_spa(ds),
+	    ds, drrb->drr_type, tx);
+
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	ds->ds_phys->ds_restoring = TRUE;
+
+	dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+	return (0);
+}
+
+/*
+ * Syncing-context task for the END record: snapshot the restored
+ * dataset under the name from drr_toname, stamp the snapshot with the
+ * stream's creation time and guid, and clear the restoring flag on
+ * both the snapshot and the head dataset.
+ */
+static int
+replay_end_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+	struct drr_begin *drrb = arg;
+	int err;
+	char *snapname;
+	dsl_dataset_t *ds;
+
+	/* XXX verify that drr_toname is in dd */
+
+	snapname = strchr(drrb->drr_toname, '@');
+	if (snapname == NULL)
+		return (EINVAL);
+	snapname++;
+
+	/* create snapshot */
+	err = dsl_dataset_snapshot_sync(dd, snapname, tx);
+	if (err)
+		return (err);
+
+	/* set snapshot's creation time and guid */
+	err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, drrb->drr_toname,
+	    DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_RESTORE, FTAG, &ds);
+	ASSERT3U(err, ==, 0);
+
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	ds->ds_phys->ds_creation_time = drrb->drr_creation_time;
+	ds->ds_phys->ds_guid = drrb->drr_toguid;
+	ds->ds_phys->ds_restoring = FALSE;
+
+	dsl_dataset_close(ds, DS_MODE_PRIMARY, FTAG);
+
+	/* clear the restoring flag on the head dataset as well */
+	ds = dsl_dataset_open_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj,
+	    NULL, DS_MODE_STANDARD | DS_MODE_RESTORE, FTAG);
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	ds->ds_phys->ds_restoring = FALSE;
+	dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
+
+	return (0);
+}
+
+/*
+ * Return a pointer to the next `len' bytes (len must be a multiple of
+ * 8) of the stream, refilling the staging buffer from ra->vp as needed.
+ * The pointer is valid only until the next restore_read() call.  On
+ * error (including short read / EOF, reported as EINVAL) sets ra->err
+ * and returns NULL.
+ * NOTE(review): this is the only restore helper that isn't static --
+ * confirm whether external linkage is intended.
+ */
+void *
+restore_read(struct restorearg *ra, int len)
+{
+	void *rv;
+
+	/* some things will require 8-byte alignment, so everything must */
+	ASSERT3U(len % 8, ==, 0);
+
+	while (ra->buflen - ra->bufoff < len) {
+		ssize_t resid;
+		int leftover = ra->buflen - ra->bufoff;
+
+		/* slide the unconsumed tail to the front, then refill */
+		(void) memmove(ra->buf, ra->buf + ra->bufoff, leftover);
+		ra->err = vn_rdwr(UIO_READ, ra->vp,
+		    (caddr_t)ra->buf + leftover, ra->bufsize - leftover,
+		    ra->voff, UIO_SYSSPACE, FAPPEND,
+		    RLIM_INFINITY, CRED(), &resid);
+
+		/* Need to compute checksum */
+
+		ra->voff += ra->bufsize - leftover - resid;
+		ra->buflen = ra->bufsize - resid;
+		ra->bufoff = 0;
+		/* nothing read at all: the stream ended prematurely */
+		if (resid == ra->bufsize - leftover)
+			ra->err = EINVAL;
+		if (ra->err)
+			return (NULL);
+	}
+
+	ASSERT3U(ra->bufoff % 8, ==, 0);
+	ASSERT3U(ra->buflen - ra->bufoff, >=, len);
+	rv = ra->buf + ra->bufoff;
+	ra->bufoff += len;
+	return (rv);
+}
+
+/*
+ * Byteswap one replay record in place (stream written on an
+ * opposite-endian machine).  Only the fields of the variant selected
+ * by drr_type are swapped; any payload following the record is
+ * byteswapped separately by the restore_* handlers.
+ *
+ * Fix: add a default case so unknown record types are explicitly left
+ * untouched (they are rejected later by dmu_recvbackup's switch).
+ */
+static void
+backup_byteswap(dmu_replay_record_t *drr)
+{
+#define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
+#define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
+	drr->drr_type = BSWAP_32(drr->drr_type);
+	switch (drr->drr_type) {
+	case DRR_BEGIN:
+		DO64(drr_begin.drr_magic);
+		DO64(drr_begin.drr_version);
+		DO64(drr_begin.drr_creation_time);
+		DO32(drr_begin.drr_type);
+		DO64(drr_begin.drr_toguid);
+		DO64(drr_begin.drr_fromguid);
+		break;
+	case DRR_OBJECT:
+		DO64(drr_object.drr_object);
+		/* DO64(drr_object.drr_allocation_txg); */
+		DO32(drr_object.drr_type);
+		DO32(drr_object.drr_bonustype);
+		DO32(drr_object.drr_blksz);
+		DO32(drr_object.drr_bonuslen);
+		break;
+	case DRR_FREEOBJECTS:
+		DO64(drr_freeobjects.drr_firstobj);
+		DO64(drr_freeobjects.drr_numobjs);
+		break;
+	case DRR_WRITE:
+		DO64(drr_write.drr_object);
+		DO32(drr_write.drr_type);
+		DO64(drr_write.drr_offset);
+		DO64(drr_write.drr_length);
+		break;
+	case DRR_FREE:
+		DO64(drr_free.drr_object);
+		DO64(drr_free.drr_offset);
+		DO64(drr_free.drr_length);
+		break;
+	case DRR_END:
+		DO64(drr_end.drr_checksum);
+		break;
+	default:
+		break;
+	}
+#undef DO64
+#undef DO32
+}
+
+/*
+ * Apply a DRR_OBJECT record: validate the fields, then claim (if the
+ * object is currently free) or reclaim (if allocated) the object with
+ * the recorded type/blocksize/bonus settings, set its checksum and
+ * compression, and copy in the bonus payload from the stream.
+ * Returns 0, an errno from tx assignment / stream read, or EINVAL for
+ * malformed records.
+ */
+static int
+restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
+{
+	int err;
+	dmu_tx_t *tx;
+
+	err = dmu_object_info(os, drro->drr_object, NULL);
+
+	if (err != 0 && err != ENOENT)
+		return (EINVAL);
+
+	/* sanity-check every field before trusting the record */
+	if (drro->drr_type == DMU_OT_NONE ||
+	    drro->drr_type >= DMU_OT_NUMTYPES ||
+	    drro->drr_bonustype >= DMU_OT_NUMTYPES ||
+	    drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS ||
+	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
+	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
+	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
+	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
+	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
+		return (EINVAL);
+	}
+
+	tx = dmu_tx_create(os);
+
+	if (err == ENOENT) {
+		/* currently free, want to be allocated */
+		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1);
+		err = dmu_tx_assign(tx, TXG_WAIT);
+		if (err) {
+			dmu_tx_abort(tx);
+			return (err);
+		}
+		err = dmu_object_claim(os, drro->drr_object,
+		    drro->drr_type, drro->drr_blksz,
+		    drro->drr_bonustype, drro->drr_bonuslen, tx);
+	} else {
+		/* currently allocated, want to be allocated */
+		dmu_tx_hold_bonus(tx, drro->drr_object);
+		/*
+		 * We may change blocksize, so need to
+		 * hold_write
+		 */
+		dmu_tx_hold_write(tx, drro->drr_object, 0, 1);
+		err = dmu_tx_assign(tx, TXG_WAIT);
+		if (err) {
+			dmu_tx_abort(tx);
+			return (err);
+		}
+
+		err = dmu_object_reclaim(os, drro->drr_object,
+		    drro->drr_type, drro->drr_blksz,
+		    drro->drr_bonustype, drro->drr_bonuslen, tx);
+	}
+	if (err) {
+		dmu_tx_commit(tx);
+		return (EINVAL);
+	}
+
+	dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx);
+	dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
+
+	if (drro->drr_bonuslen) {
+		dmu_buf_t *db;
+		void *data;
+		db = dmu_bonus_hold(os, drro->drr_object);
+		dmu_buf_will_dirty(db, tx);
+
+		/* bonus payload was padded to 8 bytes by dump_dnode() */
+		ASSERT3U(db->db_size, ==, drro->drr_bonuslen);
+		data = restore_read(ra, P2ROUNDUP(db->db_size, 8));
+		if (data == NULL) {
+			dmu_tx_commit(tx);
+			return (ra->err);
+		}
+		bcopy(data, db->db_data, db->db_size);
+		if (ra->byteswap) {
+			dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
+			    drro->drr_bonuslen);
+		}
+		dmu_buf_rele(db);
+	}
+	dmu_tx_commit(tx);
+	return (0);
+}
+
+/*
+ * Apply a DRR_FREEOBJECTS record: free each existing object in
+ * [drr_firstobj, drr_firstobj + drr_numobjs), one tx per object.
+ * Rejects ranges that wrap around.  `ra' is unused (ARGSUSED).
+ */
+/* ARGSUSED */
+static int
+restore_freeobjects(struct restorearg *ra, objset_t *os,
+    struct drr_freeobjects *drrfo)
+{
+	uint64_t obj;
+
+	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
+		return (EINVAL);
+
+	for (obj = drrfo->drr_firstobj;
+	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs; obj++) {
+		dmu_tx_t *tx;
+		int err;
+
+		/* skip objects that don't exist */
+		if (dmu_object_info(os, obj, NULL) != 0)
+			continue;
+
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_bonus(tx, obj);
+		err = dmu_tx_assign(tx, TXG_WAIT);
+		if (err) {
+			dmu_tx_abort(tx);
+			return (err);
+		}
+		err = dmu_object_free(os, obj, tx);
+		dmu_tx_commit(tx);
+		if (err && err != ENOENT)
+			return (EINVAL);
+	}
+	return (0);
+}
+
+/*
+ * Apply a DRR_WRITE record: read the payload from the stream,
+ * byteswap it if needed, and write it to the recorded object range in
+ * its own transaction.  The target object must already exist (created
+ * by an earlier DRR_OBJECT record).
+ */
+static int
+restore_write(struct restorearg *ra, objset_t *os,
+    struct drr_write *drrw)
+{
+	dmu_tx_t *tx;
+	void *data;
+	int err;
+
+	/* reject wrapped ranges and unknown payload types */
+	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
+	    drrw->drr_type >= DMU_OT_NUMTYPES)
+		return (EINVAL);
+
+	data = restore_read(ra, drrw->drr_length);
+	if (data == NULL)
+		return (ra->err);
+
+	if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
+		return (EINVAL);
+
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_write(tx, drrw->drr_object,
+	    drrw->drr_offset, drrw->drr_length);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err) {
+		dmu_tx_abort(tx);
+		return (err);
+	}
+	if (ra->byteswap)
+		dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length);
+	dmu_write(os, drrw->drr_object,
+	    drrw->drr_offset, drrw->drr_length, data, tx);
+	dmu_tx_commit(tx);
+	return (0);
+}
+
+/*
+ * Apply a DRR_FREE record: free the recorded range of an existing
+ * object in its own transaction.  drr_length == -1ULL means "to the
+ * end of the object".  `ra' is unused (ARGSUSED).
+ */
+/* ARGSUSED */
+static int
+restore_free(struct restorearg *ra, objset_t *os,
+    struct drr_free *drrf)
+{
+	dmu_tx_t *tx;
+	int err;
+
+	/* reject wrapped ranges (except the -1ULL "to end" sentinel) */
+	if (drrf->drr_length != -1ULL &&
+	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
+		return (EINVAL);
+
+	if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
+		return (EINVAL);
+
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_free(tx, drrf->drr_object,
+	    drrf->drr_offset, drrf->drr_length);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err) {
+		dmu_tx_abort(tx);
+		return (err);
+	}
+	dmu_free_range(os, drrf->drr_object,
+	    drrf->drr_offset, drrf->drr_length, tx);
+	dmu_tx_commit(tx);
+	return (0);
+}
+
+/*
+ * Restore a backup stream ("zfs recv"): read dmu_replay_record_t's from
+ * vp starting at voffset and apply them to the dataset named by
+ * drrb->drr_toname.  The begin record (drrb) has already been read by
+ * the caller.  If we fail after the begin record was processed, the
+ * partially restored dataset is rolled back (incremental stream) or
+ * destroyed (full stream).  If sizep is non-NULL it is set to the
+ * stream offset consumed.  Returns 0 or an errno value.
+ */
+int
+dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
+ vnode_t *vp, uint64_t voffset)
+{
+ struct restorearg ra;
+ dmu_replay_record_t *drr;
+ char *cp, *tosnap;
+ dsl_dir_t *dd = NULL;
+ objset_t *os = NULL;
+
+ bzero(&ra, sizeof (ra));
+ ra.vp = vp;
+ ra.voff = voffset;
+ ra.bufsize = 1<<20;
+ ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
+
+ /* Determine the stream's byte order from the magic number. */
+ if (drrb->drr_magic == DMU_BACKUP_MAGIC) {
+ ra.byteswap = FALSE;
+ } else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
+ ra.byteswap = TRUE;
+ } else {
+ ra.err = EINVAL;
+ goto out;
+ }
+
+ /* Byteswap the begin record in place so we can use it natively. */
+ if (ra.byteswap) {
+ drrb->drr_magic = BSWAP_64(drrb->drr_magic);
+ drrb->drr_version = BSWAP_64(drrb->drr_version);
+ drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
+ drrb->drr_type = BSWAP_32(drrb->drr_type);
+ drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
+ drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
+ }
+
+ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+
+ /* The target name must be a snapshot ("fs@snap"). */
+ tosnap = drrb->drr_toname;
+ if (drrb->drr_version != DMU_BACKUP_VERSION ||
+ drrb->drr_type >= DMU_OST_NUMTYPES ||
+ strchr(drrb->drr_toname, '@') == NULL) {
+ ra.err = EINVAL;
+ goto out;
+ }
+
+ /*
+ * Process the begin in syncing context.
+ */
+ if (drrb->drr_fromguid) {
+ /* incremental backup */
+
+ /* Temporarily terminate the name at the '@' to open the fs. */
+ cp = strchr(tosnap, '@');
+ *cp = '\0';
+ dd = dsl_dir_open(tosnap, FTAG, NULL);
+ *cp = '@';
+ if (dd == NULL) {
+ ra.err = ENOENT;
+ goto out;
+ }
+
+ ra.err = dsl_dir_sync_task(dd, replay_incremental_sync,
+ drrb, 1<<20);
+ } else {
+ /* full backup */
+ const char *tail;
+
+ cp = strchr(tosnap, '@');
+ *cp = '\0';
+ dd = dsl_dir_open(tosnap, FTAG, &tail);
+ *cp = '@';
+ if (dd == NULL) {
+ ra.err = ENOENT;
+ goto out;
+ }
+ /* tail == NULL means the fs itself already exists. */
+ if (tail == NULL) {
+ ra.err = EEXIST;
+ goto out;
+ }
+
+ ra.err = dsl_dir_sync_task(dd, replay_full_sync,
+ drrb, 1<<20);
+ }
+ if (ra.err)
+ goto out;
+
+ /*
+ * Open the objset we are modifying.
+ */
+
+ cp = strchr(tosnap, '@');
+ *cp = '\0';
+ ra.err = dmu_objset_open(tosnap, DMU_OST_ANY,
+ DS_MODE_PRIMARY | DS_MODE_RESTORE, &os);
+ *cp = '@';
+ /* Open should not fail: the sync task above just set the fs up. */
+ ASSERT3U(ra.err, ==, 0);
+
+ /*
+ * Read records and process them.
+ */
+ while (ra.err == 0 &&
+ NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
+ if (issig(JUSTLOOKING)) {
+ ra.err = EINTR;
+ goto out;
+ }
+
+ if (ra.byteswap)
+ backup_byteswap(drr);
+
+ switch (drr->drr_type) {
+ case DRR_OBJECT:
+ {
+ /*
+ * We need to make a copy of the record header,
+ * because restore_{object,write} may need to
+ * restore_read(), which will invalidate drr.
+ */
+ struct drr_object drro = drr->drr_u.drr_object;
+ ra.err = restore_object(&ra, os, &drro);
+ break;
+ }
+ case DRR_FREEOBJECTS:
+ {
+ struct drr_freeobjects drrfo =
+ drr->drr_u.drr_freeobjects;
+ ra.err = restore_freeobjects(&ra, os, &drrfo);
+ break;
+ }
+ case DRR_WRITE:
+ {
+ struct drr_write drrw = drr->drr_u.drr_write;
+ ra.err = restore_write(&ra, os, &drrw);
+ break;
+ }
+ case DRR_FREE:
+ {
+ struct drr_free drrf = drr->drr_u.drr_free;
+ ra.err = restore_free(&ra, os, &drrf);
+ break;
+ }
+ case DRR_END:
+ /* Need to verify checksum. */
+ /*
+ * dd may be the parent of the dd we are
+ * restoring into (eg. if it's a full backup).
+ */
+ ra.err = dsl_dir_sync_task(dmu_objset_ds(os)->
+ ds_dir, replay_end_sync, drrb, 1<<20);
+ goto out;
+ default:
+ ra.err = EINVAL;
+ goto out;
+ }
+ }
+
+out:
+ if (os)
+ dmu_objset_close(os);
+
+ /*
+ * Make sure we don't rollback/destroy unless we actually
+ * processed the begin properly. 'os' will only be set if this
+ * is the case.
+ */
+ if (ra.err && os && dd && tosnap && strchr(tosnap, '@')) {
+ /*
+ * rollback or destroy what we created, so we don't
+ * leave it in the restoring state.
+ */
+ txg_wait_synced(dd->dd_pool, 0);
+ if (drrb->drr_fromguid) {
+ /* incremental: rollback to most recent snapshot */
+ (void) dsl_dir_sync_task(dd,
+ dsl_dataset_rollback_sync, NULL, 0);
+ } else {
+ /* full: destroy whole fs */
+ cp = strchr(tosnap, '@');
+ *cp = '\0';
+ cp = strchr(tosnap, '/');
+ if (cp) {
+ (void) dsl_dir_sync_task(dd,
+ dsl_dir_destroy_sync, cp+1, 0);
+ }
+ /* Restore the '@' we NUL'd out above. */
+ cp = strchr(tosnap, '\0');
+ *cp = '@';
+ }
+
+ }
+
+ if (dd)
+ dsl_dir_close(dd, FTAG);
+ kmem_free(ra.buf, ra.bufsize);
+ if (sizep)
+ *sizep = ra.voff;
+ return (ra.err);
+}
+
+/*
+ * Intent log support: sync the block at <os, object, offset> to disk.
+ * N.B. and XXX: the caller is responsible for serializing dmu_sync()s
+ * of the same block, and for making sure that the data isn't changing
+ * while dmu_sync() is writing it.
+ *
+ * Return values:
+ *
+ * EALREADY: this txg has already been synced, so there's nothing to do.
+ * The caller should not log the write.
+ *
+ * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
+ * The caller should not log the write.
+ *
+ * EINPROGRESS: the block is in the process of being synced by the
+ * usual mechanism (spa_sync()), so we can't sync it here.
+ * The caller should txg_wait_synced() and not log the write.
+ *
+ * EBUSY: another thread is trying to dmu_sync() the same dbuf.
+ * (This case cannot arise under the current locking rules.)
+ * The caller should txg_wait_synced() and not log the write.
+ *
+ * ESTALE: the block was dirtied or freed while we were writing it,
+ * so the data is no longer valid.
+ * The caller should txg_wait_synced() and not log the write.
+ *
+ * 0: success. Sets *bp to the blkptr just written, and sets
+ * *blkoff to the data's offset within that block.
+ * The caller should log this blkptr/blkoff in its lr_write_t.
+ */
+int
+dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
+ blkptr_t *bp, uint64_t txg)
+{
+ dsl_pool_t *dp = os->os->os_dsl_dataset->ds_dir->dd_pool;
+ tx_state_t *tx = &dp->dp_tx;
+ dmu_buf_impl_t *db;
+ blkptr_t *blk;
+ int err;
+
+ ASSERT(RW_LOCK_HELD(&tx->tx_suspend));
+ ASSERT(BP_IS_HOLE(bp));
+ ASSERT(txg != 0);
+
+ dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n",
+ txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);
+
+ /*
+ * If this txg already synced, there's nothing to do.
+ */
+ if (txg <= tx->tx_synced_txg) {
+ /*
+ * If we're running ziltest, we need the blkptr regardless.
+ */
+ if (txg > spa_freeze_txg(dp->dp_spa)) {
+ db = (dmu_buf_impl_t *)dmu_buf_hold(os, object, offset);
+ /* if db_blkptr == NULL, this was an empty write */
+ if (db->db_blkptr)
+ *bp = *db->db_blkptr; /* structure assignment */
+ else
+ bzero(bp, sizeof (blkptr_t));
+ *blkoff = offset - db->db.db_offset;
+ ASSERT3U(*blkoff, <, db->db.db_size);
+ dmu_buf_rele((dmu_buf_t *)db);
+ return (0);
+ }
+ return (EALREADY);
+ }
+
+ /*
+ * If this txg is in the middle of syncing, just wait for it.
+ */
+ if (txg == tx->tx_syncing_txg) {
+ ASSERT(txg != tx->tx_open_txg);
+ return (EINPROGRESS);
+ }
+
+ db = (dmu_buf_impl_t *)dmu_buf_hold(os, object, offset);
+
+ mutex_enter(&db->db_mtx);
+
+ /*
+ * If this dbuf isn't dirty, must have been free_range'd.
+ * There's no need to log writes to freed blocks, so we're done.
+ */
+ if (!list_link_active(&db->db_dirty_node[txg&TXG_MASK])) {
+ mutex_exit(&db->db_mtx);
+ dmu_buf_rele((dmu_buf_t *)db);
+ return (ENOENT);
+ }
+
+ blk = db->db_d.db_overridden_by[txg&TXG_MASK];
+
+ /*
+ * If we already did a dmu_sync() of this dbuf in this txg,
+ * free the old block before writing the new one.
+ */
+ if (blk != NULL) {
+ /*
+ * NOTE(review): the ASSERT below makes the EBUSY branch
+ * unreachable in debug builds; it appears to be a
+ * non-debug safety net for the "cannot arise" case
+ * documented above -- confirm that intent.
+ */
+ ASSERT(blk != IN_DMU_SYNC);
+ if (blk == IN_DMU_SYNC) {
+ mutex_exit(&db->db_mtx);
+ dmu_buf_rele((dmu_buf_t *)db);
+ return (EBUSY);
+ }
+ arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
+ if (!BP_IS_HOLE(blk)) {
+ (void) arc_free(NULL, os->os->os_spa, txg, blk,
+ NULL, NULL, ARC_WAIT);
+ }
+ kmem_free(blk, sizeof (blkptr_t));
+ }
+
+ /* Claim this dbuf for dmu_sync() before dropping the mutex. */
+ db->db_d.db_overridden_by[txg&TXG_MASK] = IN_DMU_SYNC;
+ mutex_exit(&db->db_mtx);
+
+ blk = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
+ blk->blk_birth = 0; /* mark as invalid */
+
+ /* Write the dirty data synchronously via the ARC. */
+ err = arc_write(NULL, os->os->os_spa,
+ zio_checksum_select(db->db_dnode->dn_checksum, os->os->os_checksum),
+ zio_compress_select(db->db_dnode->dn_compress, os->os->os_compress),
+ txg, blk, db->db_d.db_data_old[txg&TXG_MASK], NULL, NULL,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT);
+ ASSERT(err == 0);
+
+ if (!BP_IS_HOLE(blk)) {
+ blk->blk_fill = 1;
+ BP_SET_TYPE(blk, db->db_dnode->dn_type);
+ BP_SET_LEVEL(blk, 0);
+ }
+
+ /* copy the block pointer back to caller */
+ *bp = *blk; /* structure assignment */
+ *blkoff = offset - db->db.db_offset;
+ ASSERT3U(*blkoff, <, db->db.db_size);
+
+ mutex_enter(&db->db_mtx);
+ if (db->db_d.db_overridden_by[txg&TXG_MASK] != IN_DMU_SYNC) {
+ /* we were dirtied/freed during the sync */
+ ASSERT3P(db->db_d.db_overridden_by[txg&TXG_MASK], ==, NULL);
+ arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
+ mutex_exit(&db->db_mtx);
+ dmu_buf_rele((dmu_buf_t *)db);
+ /* Note that this block does not free on disk until txg syncs */
+
+ /*
+ * XXX can we use ARC_NOWAIT here?
+ * XXX should we be ignoring the return code?
+ */
+ if (!BP_IS_HOLE(blk)) {
+ (void) arc_free(NULL, os->os->os_spa, txg, blk,
+ NULL, NULL, ARC_WAIT);
+ }
+ kmem_free(blk, sizeof (blkptr_t));
+ return (ESTALE);
+ }
+
+ /* Record the override so spa_sync() knows this block is covered. */
+ db->db_d.db_overridden_by[txg&TXG_MASK] = blk;
+ mutex_exit(&db->db_mtx);
+ dmu_buf_rele((dmu_buf_t *)db);
+ ASSERT3U(txg, >, tx->tx_syncing_txg);
+ return (0);
+}
+
+/*
+ * Return the object's highest non-zero byte offset, as reported by
+ * dnode_max_nonzero_offset().
+ * NOTE(review): unlike dmu_object_info(), the dnode_hold() result is
+ * not checked for NULL here -- confirm callers only pass allocated
+ * objects.
+ */
+uint64_t
+dmu_object_max_nonzero_offset(objset_t *os, uint64_t object)
+{
+ dnode_t *dn = dnode_hold(os->os, object, FTAG);
+ uint64_t rv = dnode_max_nonzero_offset(dn);
+ dnode_rele(dn, FTAG);
+ return (rv);
+}
+
+/*
+ * Set the object's data block size and indirect block shift.
+ * Returns the result of dnode_set_blksz().
+ */
+int
+dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn = dnode_hold(os->os, object, FTAG);
+ int err = dnode_set_blksz(dn, size, ibs, tx);
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+/*
+ * Set the object's checksum function and dirty the dnode in tx.
+ */
+void
+dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn = dnode_hold(os->os, object, FTAG);
+ ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
+ dn->dn_checksum = checksum;
+ dnode_setdirty(dn, tx);
+ dnode_rele(dn, FTAG);
+}
+
+/*
+ * Set the object's compression function and dirty the dnode in tx.
+ */
+void
+dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn = dnode_hold(os->os, object, FTAG);
+ ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
+ dn->dn_compress = compress;
+ dnode_setdirty(dn, tx);
+ dnode_rele(dn, FTAG);
+}
+
+/*
+ * Find the next hole (hole == B_TRUE) or data region starting at *off
+ * in the given object, via dnode_next_offset().  If the dnode has any
+ * dirty state, we first wait for the pool to sync so the on-disk block
+ * pointers we are about to walk are current.
+ */
+int
+dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
+{
+ dnode_t *dn;
+ int i, err;
+
+ dn = dnode_hold(os->os, object, FTAG);
+ /*
+ * Sync any current changes before
+ * we go trundling through the block pointers.
+ */
+ for (i = 0; i < TXG_SIZE; i++) {
+ if (dn->dn_dirtyblksz[i])
+ break;
+ }
+ if (i != TXG_SIZE) {
+ /* Drop the hold across txg_wait_synced(), then re-acquire. */
+ dnode_rele(dn, FTAG);
+ txg_wait_synced(dmu_objset_pool(os), 0);
+ dn = dnode_hold(os->os, object, FTAG);
+ }
+
+ err = dnode_next_offset(dn, hole, off, 1, 1);
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
+
+/*
+ * Fill in *doi with the dnode's vital statistics.  Both
+ * dn_struct_rwlock (reader) and dn_mtx are taken so we get a
+ * consistent snapshot.
+ */
+void
+dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
+{
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ mutex_enter(&dn->dn_mtx);
+
+ doi->doi_data_block_size = dn->dn_datablksz;
+ /* 0 indblkshift means no indirect blocks yet. */
+ doi->doi_metadata_block_size = dn->dn_indblkshift ?
+ 1ULL << dn->dn_indblkshift : 0;
+ doi->doi_indirection = dn->dn_nlevels;
+ doi->doi_checksum = dn->dn_checksum;
+ doi->doi_compress = dn->dn_compress;
+ doi->doi_physical_blks = dn->dn_phys->dn_secphys;
+ doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid;
+ doi->doi_type = dn->dn_type;
+ doi->doi_bonus_size = dn->dn_bonuslen;
+ doi->doi_bonus_type = dn->dn_bonustype;
+
+ mutex_exit(&dn->dn_mtx);
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+/*
+ * Get information on a DMU object.
+ * If doi is NULL, just indicates whether the object exists.
+ * Returns ENOENT if the object's dnode cannot be held.
+ */
+int
+dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
+{
+ dnode_t *dn = dnode_hold(os->os, object, FTAG);
+
+ if (dn == NULL)
+ return (ENOENT);
+
+ if (doi != NULL)
+ dmu_object_info_from_dnode(dn, doi);
+
+ dnode_rele(dn, FTAG);
+ return (0);
+}
+
+/*
+ * As above, but faster; can be used when you have a held dbuf in hand.
+ */
+void
+dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi)
+{
+ dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi);
+}
+
+/*
+ * Faster still when you only care about the size.
+ * This is specifically optimized for zfs_getattr().
+ */
+void
+dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512)
+{
+ dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+
+ *blksize = dn->dn_datablksz;
+ *nblk512 = dn->dn_phys->dn_secphys + 1; /* add 1 for dnode space */
+}
+
+/*
+ * In-place byteswap routines for arrays of fixed-width integers.
+ * 'size' is in bytes and must be a multiple of the element width
+ * (asserted below).
+ *
+ * Fix: the loop indices were 'int', which is compared against the
+ * size_t element count (signed/unsigned comparison) and would wrap
+ * for buffers of 2^31 or more elements; use size_t throughout.
+ */
+void
+byteswap_uint64_array(void *vbuf, size_t size)
+{
+ uint64_t *buf = vbuf;
+ size_t count = size >> 3;
+ size_t i;
+
+ ASSERT((size & 7) == 0);
+
+ for (i = 0; i < count; i++)
+ buf[i] = BSWAP_64(buf[i]);
+}
+
+void
+byteswap_uint32_array(void *vbuf, size_t size)
+{
+ uint32_t *buf = vbuf;
+ size_t count = size >> 2;
+ size_t i;
+
+ ASSERT((size & 3) == 0);
+
+ for (i = 0; i < count; i++)
+ buf[i] = BSWAP_32(buf[i]);
+}
+
+void
+byteswap_uint16_array(void *vbuf, size_t size)
+{
+ uint16_t *buf = vbuf;
+ size_t count = size >> 1;
+ size_t i;
+
+ ASSERT((size & 1) == 0);
+
+ for (i = 0; i < count; i++)
+ buf[i] = BSWAP_16(buf[i]);
+}
+
+/* ARGSUSED */
+void
+byteswap_uint8_array(void *vbuf, size_t size)
+{
+ /* Single bytes have no byte order; nothing to do. */
+}
+
+/*
+ * Module initialization: bring up the DMU subsystems.
+ * Note the ordering: dbuf, dnode, then arc.
+ */
+void
+dmu_init(void)
+{
+ dbuf_init();
+ dnode_init();
+ arc_init();
+}
+
+/*
+ * Module teardown: exact reverse order of dmu_init().
+ */
+void
+dmu_fini(void)
+{
+ arc_fini();
+ dnode_fini();
+ dbuf_fini();
+}
diff --git a/usr/src/uts/common/fs/zfs/dmu_object.c b/usr/src/uts/common/fs/zfs/dmu_object.c
new file mode 100644
index 0000000000..d150d6c400
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dmu_object.c
@@ -0,0 +1,149 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dnode.h>
+
+/*
+ * Allocate a new object number in os and initialize its dnode with the
+ * given type, block size, bonus type and bonus length.  The search for
+ * a free object number starts at osi->os_obj_next, serialized by
+ * osi->os_obj_lock.  Returns the newly allocated object number.
+ */
+uint64_t
+dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ objset_impl_t *osi = os->os;
+ uint64_t object;
+ /* Number of dnodes covered by one L2 indirect block pointer. */
+ uint64_t L2_dnode_count = DNODES_PER_BLOCK <<
+ (osi->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT);
+ dnode_t *dn;
+ int restarted = B_FALSE;
+
+ mutex_enter(&osi->os_obj_lock);
+ for (;;) {
+ object = osi->os_obj_next;
+ /*
+ * Each time we polish off an L2 bp worth of dnodes
+ * (2^13 objects), move to another L2 bp that's still
+ * reasonably sparse (at most 1/4 full). Look from the
+ * beginning once, but after that keep looking from here.
+ * If we can't find one, just keep going from here.
+ */
+ if (P2PHASE(object, L2_dnode_count) == 0) {
+ uint64_t offset = restarted ? object << DNODE_SHIFT : 0;
+ int error = dnode_next_offset(osi->os_meta_dnode,
+ B_TRUE, &offset, 2, DNODES_PER_BLOCK >> 2);
+ restarted = B_TRUE;
+ if (error == 0)
+ object = offset >> DNODE_SHIFT;
+ }
+ osi->os_obj_next = ++object;
+
+ /* A non-NULL hold means the object number is free; claim it. */
+ dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG);
+ if (dn)
+ break;
+
+ /* Object was in use; skip ahead to the next hole. */
+ if (dmu_object_next(os, &object, B_TRUE) == 0)
+ osi->os_obj_next = object - 1;
+ }
+
+ dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
+ dnode_rele(dn, FTAG);
+
+ mutex_exit(&osi->os_obj_lock);
+
+ dmu_tx_add_new_object(tx, os, object);
+ return (object);
+}
+
+/*
+ * Allocate a specific (caller-chosen) object number.  Returns EBADF if
+ * the private-object bit is set and the tx isn't allowed to use it,
+ * EEXIST if the object number is already in use.
+ */
+int
+dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ if ((object & DMU_PRIVATE_OBJECT) && !dmu_tx_private_ok(tx))
+ return (EBADF);
+
+ dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG);
+ if (dn == NULL)
+ return (EEXIST);
+ dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
+ dnode_rele(dn, FTAG);
+
+ dmu_tx_add_new_object(tx, os, object);
+ return (0);
+}
+
+/*
+ * Reallocate an existing object with a new type/blocksize/bonus.
+ * Returns EBADF if the object is not allocated (or is a private object
+ * the tx may not touch).
+ */
+int
+dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ if ((object & DMU_PRIVATE_OBJECT) && !dmu_tx_private_ok(tx))
+ return (EBADF);
+
+ dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, FTAG);
+ if (dn == NULL)
+ return (EBADF);
+ dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx);
+ dnode_rele(dn, FTAG);
+
+ return (0);
+}
+
+/*
+ * Free an allocated object.  Returns ENOENT if the object is not
+ * currently allocated.
+ */
+int
+dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ ASSERT(!(object & DMU_PRIVATE_OBJECT) || dmu_tx_private_ok(tx));
+
+ dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, FTAG);
+ if (dn == NULL)
+ return (ENOENT);
+
+ ASSERT(dn->dn_type != DMU_OT_NONE);
+ dnode_free(dn, tx);
+ dnode_rele(dn, FTAG);
+
+ return (0);
+}
+
+/*
+ * Advance *objectp to the next allocated (hole == B_FALSE) or free
+ * (hole == B_TRUE) object after it, by scanning the meta-dnode.
+ * Returns the dnode_next_offset() error, if any.
+ */
+int
+dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole)
+{
+ uint64_t offset = (*objectp + 1) << DNODE_SHIFT;
+ int error;
+
+ error = dnode_next_offset(os->os->os_meta_dnode,
+ hole, &offset, 0, DNODES_PER_BLOCK);
+
+ *objectp = offset >> DNODE_SHIFT;
+
+ return (error);
+}
diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c
new file mode 100644
index 0000000000..9bb621b9a1
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c
@@ -0,0 +1,727 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_pool.h>
+#include <sys/dnode.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_tx.h>
+#include <sys/zio_checksum.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/dmu_impl.h>
+
+
+/*
+ * Trivial accessors into the objset implementation (os->os).
+ */
+spa_t *
+dmu_objset_spa(objset_t *os)
+{
+ return (os->os->os_spa);
+}
+
+zilog_t *
+dmu_objset_zil(objset_t *os)
+{
+ return (os->os->os_zil);
+}
+
+/*
+ * Return the pool via the dataset's dir when available, otherwise
+ * (e.g. the meta-objset) fall back to the spa's DSL pool.
+ */
+dsl_pool_t *
+dmu_objset_pool(objset_t *os)
+{
+ dsl_dataset_t *ds;
+
+ if ((ds = os->os->os_dsl_dataset) != NULL && ds->ds_dir)
+ return (ds->ds_dir->dd_pool);
+ else
+ return (spa_get_dsl(os->os->os_spa));
+}
+
+dsl_dataset_t *
+dmu_objset_ds(objset_t *os)
+{
+ return (os->os->os_dsl_dataset);
+}
+
+dmu_objset_type_t
+dmu_objset_type(objset_t *os)
+{
+ return (os->os->os_phys->os_type);
+}
+
+void
+dmu_objset_name(objset_t *os, char *buf)
+{
+ dsl_dataset_name(os->os->os_dsl_dataset, buf);
+}
+
+/*
+ * Returns the dataset object number, or 0 if this objset has no
+ * dataset (e.g. the meta-objset).
+ */
+uint64_t
+dmu_objset_id(objset_t *os)
+{
+ dsl_dataset_t *ds = os->os->os_dsl_dataset;
+
+ return (ds ? ds->ds_object : 0);
+}
+
+/*
+ * dsl_prop callback: the "checksum" property changed; update the
+ * objset's effective checksum function.
+ */
+static void
+checksum_changed_cb(void *arg, uint64_t newval)
+{
+ objset_impl_t *osi = arg;
+
+ /*
+ * Inheritance should have been done by now.
+ */
+ ASSERT(newval != ZIO_CHECKSUM_INHERIT);
+
+ osi->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
+}
+
+/*
+ * dsl_prop callback: the "compression" property changed; update the
+ * objset's effective compression function.
+ */
+static void
+compression_changed_cb(void *arg, uint64_t newval)
+{
+ objset_impl_t *osi = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval != ZIO_COMPRESS_INHERIT);
+
+ osi->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
+}
+
+/*
+ * Byteswap an objset_phys_t in place: the embedded meta-dnode, the
+ * ZIL header, and the type field.
+ */
+void
+dmu_objset_byteswap(void *buf, size_t size)
+{
+ objset_phys_t *osp = buf;
+
+ ASSERT(size == sizeof (objset_phys_t));
+ dnode_byteswap(&osp->os_meta_dnode);
+ byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
+ osp->os_type = BSWAP_64(osp->os_type);
+}
+
+/*
+ * Construct the in-core objset for dataset ds (ds == NULL means the
+ * meta-objset).  Reads the objset_phys from bp if it's not a hole,
+ * registers the checksum/compression property callbacks, and opens the
+ * meta-dnode.  If another thread raced us and installed its own
+ * objset_impl on the dataset first, ours is discarded and the winner
+ * is returned instead.
+ */
+objset_impl_t *
+dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp)
+{
+ objset_impl_t *winner, *osi;
+ int i, err, checksum;
+
+ osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP);
+ osi->os.os = osi;
+ osi->os_dsl_dataset = ds;
+ osi->os_spa = spa;
+ if (bp)
+ osi->os_rootbp = *bp;
+ osi->os_phys = zio_buf_alloc(sizeof (objset_phys_t));
+ if (!BP_IS_HOLE(&osi->os_rootbp)) {
+ dprintf_bp(&osi->os_rootbp, "reading %s", "");
+ (void) arc_read(NULL, spa, &osi->os_rootbp,
+ dmu_ot[DMU_OT_OBJSET].ot_byteswap,
+ arc_bcopy_func, osi->os_phys,
+ ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT);
+ } else {
+ /* Brand-new objset: start from a zeroed phys. */
+ bzero(osi->os_phys, sizeof (objset_phys_t));
+ }
+ osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header);
+
+ /*
+ * Note: the changed_cb will be called once before the register
+ * func returns, thus changing the checksum/compression from the
+ * default (fletcher2/off).
+ */
+ if (ds) {
+ err = dsl_prop_register(ds, "checksum",
+ checksum_changed_cb, osi);
+ ASSERT(err == 0);
+
+ err = dsl_prop_register(ds, "compression",
+ compression_changed_cb, osi);
+ ASSERT(err == 0);
+ } else {
+ /* It's the meta-objset. */
+ osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
+ osi->os_compress = ZIO_COMPRESS_LZJB;
+ }
+
+ /*
+ * Metadata always gets compressed and checksummed.
+ * If the data checksum is multi-bit correctable, and it's not
+ * a ZBT-style checksum, then it's suitable for metadata as well.
+ * Otherwise, the metadata checksum defaults to fletcher4.
+ */
+ checksum = osi->os_checksum;
+
+ if (zio_checksum_table[checksum].ci_correctable &&
+ !zio_checksum_table[checksum].ci_zbt)
+ osi->os_md_checksum = checksum;
+ else
+ osi->os_md_checksum = ZIO_CHECKSUM_FLETCHER_4;
+
+ osi->os_md_compress = ZIO_COMPRESS_LZJB;
+
+ /* Per-txg dirty and free dnode lists. */
+ for (i = 0; i < TXG_SIZE; i++) {
+ list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t),
+ offsetof(dnode_t, dn_dirty_link[i]));
+ list_create(&osi->os_free_dnodes[i], sizeof (dnode_t),
+ offsetof(dnode_t, dn_dirty_link[i]));
+ }
+ list_create(&osi->os_dnodes, sizeof (dnode_t),
+ offsetof(dnode_t, dn_link));
+ list_create(&osi->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_link));
+
+ osi->os_meta_dnode = dnode_special_open(osi,
+ &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
+
+ /* Install ourselves on the dataset; defer to any racing winner. */
+ if (ds != NULL) {
+ winner = dsl_dataset_set_user_ptr(ds, osi, dmu_objset_evict);
+ if (winner) {
+ dmu_objset_evict(ds, osi);
+ osi = winner;
+ }
+ }
+
+ return (osi);
+}
+
+/* called from zpl */
+/*
+ * Open the objset named 'name' with the given dataset mode.  If 'type'
+ * is not DMU_OST_ANY the objset must be of that type or EINVAL is
+ * returned.  On success *osp holds the new objset handle, which must
+ * be released with dmu_objset_close().
+ */
+int
+dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
+ objset_t **osp)
+{
+ dsl_dataset_t *ds;
+ int err;
+ objset_t *os;
+ objset_impl_t *osi;
+
+ os = kmem_alloc(sizeof (objset_t), KM_SLEEP);
+ err = dsl_dataset_open(name, mode, os, &ds);
+ if (err) {
+ kmem_free(os, sizeof (objset_t));
+ return (err);
+ }
+
+ /* Reuse the cached objset_impl if one is already on the dataset. */
+ osi = dsl_dataset_get_user_ptr(ds);
+ if (osi == NULL) {
+ blkptr_t bp;
+
+ dsl_dataset_get_blkptr(ds, &bp);
+ osi = dmu_objset_open_impl(dsl_dataset_get_spa(ds), ds, &bp);
+ }
+
+ os->os = osi;
+ os->os_mode = mode;
+
+ if (type != DMU_OST_ANY && type != os->os->os_phys->os_type) {
+ dmu_objset_close(os);
+ return (EINVAL);
+ }
+ *osp = os;
+ return (0);
+}
+
+/*
+ * Release an objset handle opened with dmu_objset_open().
+ */
+void
+dmu_objset_close(objset_t *os)
+{
+ dsl_dataset_close(os->os->os_dsl_dataset, os->os_mode, os);
+ kmem_free(os, sizeof (objset_t));
+}
+
+/*
+ * Tear down an objset_impl: unregister the property callbacks, close
+ * the meta-dnode, free the ZIL and phys buffer.  Installed as the
+ * dataset user-ptr evict callback; all dirty/free dnode lists must be
+ * empty by now.
+ */
+void
+dmu_objset_evict(dsl_dataset_t *ds, void *arg)
+{
+ objset_impl_t *osi = arg;
+ int err, i;
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT(list_head(&osi->os_dirty_dnodes[i]) == NULL);
+ ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL);
+ }
+
+ if (ds) {
+ err = dsl_prop_unregister(ds, "checksum",
+ checksum_changed_cb, osi);
+ ASSERT(err == 0);
+
+ err = dsl_prop_unregister(ds, "compression",
+ compression_changed_cb, osi);
+ ASSERT(err == 0);
+ }
+
+ /* Only the meta-dnode should remain, and it should be bufless. */
+ ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode);
+ ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode);
+ ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL);
+
+ dnode_special_close(osi->os_meta_dnode);
+ zil_free(osi->os_zil);
+
+ zio_buf_free(osi->os_phys, sizeof (objset_phys_t));
+ kmem_free(osi, sizeof (objset_impl_t));
+}
+
+/* called from dsl for meta-objset */
+/*
+ * Create a brand-new objset in syncing context: open an empty
+ * objset_impl, allocate its meta-dnode, and stamp the objset type.
+ */
+objset_impl_t *
+dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, dmu_objset_type_t type,
+ dmu_tx_t *tx)
+{
+ objset_impl_t *osi;
+ dnode_t *mdn;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ osi = dmu_objset_open_impl(spa, ds, NULL);
+ mdn = osi->os_meta_dnode;
+
+ dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
+ DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
+
+ /*
+ * We don't want to have to increase the meta-dnode's nlevels
+ * later, because then we could do it in quiescing context while
+ * we are also accessing it in open context.
+ *
+ * This precaution is not necessary for the MOS (ds == NULL),
+ * because the MOS is only updated in syncing context.
+ * This is most fortunate: the MOS is the only objset that
+ * needs to be synced multiple times as spa_sync() iterates
+ * to convergence, so minimizing its dn_nlevels matters.
+ */
+ if (ds != NULL)
+ mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
+ mdn->dn_nlevels = DN_META_DNODE_LEVELS;
+
+ ASSERT(type != DMU_OST_NONE);
+ ASSERT(type != DMU_OST_ANY);
+ ASSERT(type < DMU_OST_NUMTYPES);
+ osi->os_phys->os_type = type;
+
+ /*
+ * NOTE(review): the comment above allows ds == NULL (the MOS),
+ * yet dsl_dataset_dirty(ds, tx) is called unconditionally --
+ * confirm dsl_dataset_dirty() tolerates a NULL dataset.
+ */
+ dsl_dataset_dirty(ds, tx);
+
+ return (osi);
+}
+
+/* Arguments passed through dsl_dir_sync_task to dmu_objset_create_sync(). */
+struct oscarg {
+ void (*userfunc)(objset_t *os, void *arg, dmu_tx_t *tx);
+ void *userarg;
+ dsl_dataset_t *clone_parent;
+ const char *fullname;
+ const char *lastname;
+ dmu_objset_type_t type;
+};
+
+/*
+ * Syncing-context half of dmu_objset_create(): create the dataset,
+ * then, if it's empty (not a clone), create the objset inside it and
+ * run the caller's init function.
+ */
+static int
+dmu_objset_create_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+ struct oscarg *oa = arg;
+ dsl_dataset_t *ds;
+ int err;
+ blkptr_t bp;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ err = dsl_dataset_create_sync(dd, oa->fullname, oa->lastname,
+ oa->clone_parent, tx);
+ dprintf_dd(dd, "fn=%s ln=%s err=%d\n",
+ oa->fullname, oa->lastname, err);
+ if (err)
+ return (err);
+
+ err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, oa->fullname,
+ DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds);
+ ASSERT3U(err, ==, 0);
+ dsl_dataset_get_blkptr(ds, &bp);
+ if (BP_IS_HOLE(&bp)) {
+ objset_impl_t *osi;
+
+ /* This is an empty dmu_objset; not a clone. */
+ osi = dmu_objset_create_impl(dsl_dataset_get_spa(ds),
+ ds, oa->type, tx);
+
+ if (oa->userfunc)
+ oa->userfunc(&osi->os, oa->userarg, tx);
+ }
+ dsl_dataset_close(ds, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG);
+
+ return (0);
+}
+
+/*
+ * Create a new objset (filesystem/volume) or snapshot named 'name'.
+ * A trailing "@snap" component selects the snapshot path; otherwise a
+ * new dataset of the given type is created (optionally cloned from
+ * clone_parent), and 'func' is invoked in syncing context to
+ * initialize its contents.  Returns 0 or an errno value.
+ */
+int
+dmu_objset_create(const char *name, dmu_objset_type_t type,
+ objset_t *clone_parent,
+ void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg)
+{
+ dsl_dir_t *pds;
+ const char *tail;
+ int err = 0;
+
+ pds = dsl_dir_open(name, FTAG, &tail);
+ if (pds == NULL)
+ return (ENOENT);
+ /* tail == NULL means 'name' already exists as a dir. */
+ if (tail == NULL) {
+ dsl_dir_close(pds, FTAG);
+ return (EEXIST);
+ }
+
+ dprintf("name=%s\n", name);
+
+ if (tail[0] == '@') {
+ /*
+ * If we're creating a snapshot, make sure everything
+ * they might want is on disk. XXX Sketchy to know
+ * about snapshots here, better to put in DSL.
+ */
+ objset_t *os;
+ size_t plen = strchr(name, '@') - name + 1;
+ char *pbuf = kmem_alloc(plen, KM_SLEEP);
+ bcopy(name, pbuf, plen - 1);
+ pbuf[plen - 1] = '\0';
+
+ err = dmu_objset_open(pbuf, DMU_OST_ANY, DS_MODE_STANDARD, &os);
+ if (err == 0) {
+ /* Quiesce the ZIL across the snapshot sync task. */
+ err = zil_suspend(dmu_objset_zil(os));
+ if (err == 0) {
+ err = dsl_dir_sync_task(pds,
+ dsl_dataset_snapshot_sync,
+ (void*)(tail+1), 16*1024);
+ zil_resume(dmu_objset_zil(os));
+ }
+ dmu_objset_close(os);
+ }
+ kmem_free(pbuf, plen);
+ } else {
+ struct oscarg oa = { 0 };
+ oa.userfunc = func;
+ oa.userarg = arg;
+ oa.fullname = name;
+ oa.lastname = tail;
+ oa.type = type;
+ if (clone_parent != NULL) {
+ /*
+ * You can't clone to a different type.
+ */
+ if (clone_parent->os->os_phys->os_type != type) {
+ dsl_dir_close(pds, FTAG);
+ return (EINVAL);
+ }
+ oa.clone_parent = clone_parent->os->os_dsl_dataset;
+ }
+ err = dsl_dir_sync_task(pds, dmu_objset_create_sync, &oa,
+ 256*1024);
+ }
+ dsl_dir_close(pds, FTAG);
+ return (err);
+}
+
+/*
+ * Destroy the objset named 'name'.  Best-effort destroys any unplayed
+ * intent log first (see comment below), then defers to
+ * dsl_dataset_destroy().
+ */
+int
+dmu_objset_destroy(const char *name)
+{
+ objset_t *os;
+ int error;
+
+ /*
+ * If it looks like we'll be able to destroy it, and there's
+ * an unplayed replay log sitting around, destroy the log.
+ * It would be nicer to do this in dsl_dataset_destroy_sync(),
+ * but the replay log objset is modified in open context.
+ */
+ error = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os);
+ if (error == 0) {
+ zil_destroy(dmu_objset_zil(os));
+ dmu_objset_close(os);
+ }
+
+ /* XXX uncache everything? */
+ return (dsl_dataset_destroy(name));
+}
+
+/*
+ * Roll the objset named 'name' back to its most recent snapshot.  The
+ * ZIL is suspended/resumed first to quiesce any unplayed log records.
+ */
+int
+dmu_objset_rollback(const char *name)
+{
+ int err;
+ objset_t *os;
+
+ err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os);
+ if (err == 0) {
+ err = zil_suspend(dmu_objset_zil(os));
+ if (err == 0)
+ zil_resume(dmu_objset_zil(os));
+ dmu_objset_close(os);
+ if (err == 0) {
+ /* XXX uncache everything? */
+ err = dsl_dataset_rollback(name);
+ }
+ }
+ return (err);
+}
+
+/*
+ * Sync every dnode on 'list', one indirection level at a time.
+ * dnode_sync() returning 0 means the dnode still needs syncing at a
+ * higher level, so it is re-queued (in place) for the next pass; the
+ * list drains completely once all dnodes are fully synced.
+ * NOTE(review): the initializer of 'dn' is redundant -- the for loop
+ * below reassigns it before first use.
+ */
+static void
+dmu_objset_sync_dnodes(objset_impl_t *os, list_t *list, dmu_tx_t *tx)
+{
+ dnode_t *dn = list_head(list);
+ int level, err;
+
+ for (level = 0; dn = list_head(list); level++) {
+ zio_t *zio;
+ zio = zio_root(os->os_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+
+ ASSERT3U(level, <=, DN_MAX_LEVELS);
+
+ while (dn) {
+ dnode_t *next = list_next(list, dn);
+
+ list_remove(list, dn);
+ if (dnode_sync(dn, level, zio, tx) == 0) {
+ /*
+ * This dnode requires syncing at higher
+ * levels; put it back onto the list.
+ */
+ if (next)
+ list_insert_before(list, next, dn);
+ else
+ list_insert_tail(list, dn);
+ }
+ dn = next;
+ }
+ /* Wait for all of this level's writes to complete. */
+ err = zio_wait(zio);
+ ASSERT(err == 0);
+ }
+}
+
+/* ARGSUSED */
+/*
+ * arc_write() done callback for the objset root block: update the
+ * rootbp's fill count from the meta-dnode, stamp the bp's type/level,
+ * and, if the block moved, account the old block as killed and the new
+ * one as born against the dataset.
+ */
+static void
+killer(zio_t *zio, arc_buf_t *abuf, void *arg)
+{
+ objset_impl_t *os = arg;
+ objset_phys_t *osphys = zio->io_data;
+ dnode_phys_t *dnp = &osphys->os_meta_dnode;
+ int i;
+
+ ASSERT3U(zio->io_error, ==, 0);
+
+ /*
+ * Update rootbp fill count.
+ */
+ os->os_rootbp.blk_fill = 1; /* count the meta-dnode */
+ for (i = 0; i < dnp->dn_nblkptr; i++)
+ os->os_rootbp.blk_fill += dnp->dn_blkptr[i].blk_fill;
+
+ BP_SET_TYPE(zio->io_bp, DMU_OT_OBJSET);
+ BP_SET_LEVEL(zio->io_bp, 0);
+
+ /* Only account if the block actually moved. */
+ if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp),
+ BP_IDENTITY(&zio->io_bp_orig))) {
+ dsl_dataset_block_kill(os->os_dsl_dataset, &zio->io_bp_orig,
+ os->os_synctx);
+ dsl_dataset_block_born(os->os_dsl_dataset, zio->io_bp,
+ os->os_synctx);
+ }
+}
+
+
+/* called from dsl */
+/*
+ * Sync this objset for one txg: sync the freed and dirty dnodes, the
+ * ZIL, the meta-dnode, and finally write the root block (with killer()
+ * as the completion callback) and record its blkptr on the dataset.
+ */
+void
+dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx)
+{
+ extern taskq_t *dbuf_tq;
+ int txgoff;
+ list_t *dirty_list;
+ int err;
+ arc_buf_t *abuf =
+ arc_buf_alloc(os->os_spa, sizeof (objset_phys_t), FTAG);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(os->os_synctx == NULL);
+ /* XXX the write_done callback should really give us the tx... */
+ os->os_synctx = tx;
+
+ dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
+
+ txgoff = tx->tx_txg & TXG_MASK;
+
+ dmu_objset_sync_dnodes(os, &os->os_free_dnodes[txgoff], tx);
+ dmu_objset_sync_dnodes(os, &os->os_dirty_dnodes[txgoff], tx);
+
+ /*
+ * Free intent log blocks up to this tx.
+ */
+ zil_sync(os->os_zil, tx);
+
+ /*
+ * Sync meta-dnode
+ */
+ dirty_list = &os->os_dirty_dnodes[txgoff];
+ ASSERT(list_head(dirty_list) == NULL);
+ list_insert_tail(dirty_list, os->os_meta_dnode);
+ dmu_objset_sync_dnodes(os, dirty_list, tx);
+
+ /*
+ * Sync the root block.
+ */
+ bcopy(os->os_phys, abuf->b_data, sizeof (objset_phys_t));
+ err = arc_write(NULL, os->os_spa, os->os_md_checksum,
+ os->os_md_compress, tx->tx_txg, &os->os_rootbp, abuf, killer, os,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT);
+ ASSERT(err == 0);
+ arc_buf_free(abuf, FTAG);
+
+ dsl_dataset_set_blkptr(os->os_dsl_dataset, &os->os_rootbp, tx);
+
+ ASSERT3P(os->os_synctx, ==, tx);
+ /* Drain any outstanding dbuf-teardown work before clearing synctx. */
+ taskq_wait(dbuf_tq);
+ os->os_synctx = NULL;
+}
+
+/*
+ * Fill in *dds from the dataset's stats; for the meta-objset (no
+ * dataset), return zeroed stats with only the type filled in.
+ */
+void
+dmu_objset_stats(objset_t *os, dmu_objset_stats_t *dds)
+{
+ if (os->os->os_dsl_dataset != NULL) {
+ dsl_dataset_stats(os->os->os_dsl_dataset, dds);
+ } else {
+ ASSERT(os->os->os_phys->os_type == DMU_OST_META);
+ bzero(dds, sizeof (*dds));
+ }
+ dds->dds_type = os->os->os_phys->os_type;
+}
+
+/*
+ * Returns nonzero iff this objset's dataset is a snapshot.
+ */
+int
+dmu_objset_is_snapshot(objset_t *os)
+{
+ if (os->os->os_dsl_dataset != NULL)
+ return (dsl_dataset_is_snapshot(os->os->os_dsl_dataset));
+ else
+ return (B_FALSE);
+}
+
+/*
+ * Iterate over this dataset's snapshot names.  *offp is an opaque
+ * resume cookie (a serialized zap cursor): pass 0 to start, and pass
+ * back the value returned here to continue.  On success, copies the
+ * next snapshot name into name[namelen] and its object id into *id.
+ * Returns ENOENT when exhausted, ENAMETOOLONG if the name won't fit.
+ */
+int
+dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
+ uint64_t *id, uint64_t *offp)
+{
+ dsl_dataset_t *ds = os->os->os_dsl_dataset;
+ zap_cursor_t cursor;
+ zap_attribute_t attr;
+
+ if (ds->ds_phys->ds_snapnames_zapobj == 0)
+ return (ENOENT);
+
+ zap_cursor_init_serialized(&cursor,
+ ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_phys->ds_snapnames_zapobj, *offp);
+
+ if (zap_cursor_retrieve(&cursor, &attr) != 0)
+ return (ENOENT);
+
+ if (strlen(attr.za_name) + 1 > namelen)
+ return (ENAMETOOLONG);
+
+ (void) strcpy(name, attr.za_name);
+ *id = attr.za_first_integer;
+ /* Hand back a cookie positioned just past this entry. */
+ zap_cursor_advance(&cursor);
+ *offp = zap_cursor_serialize(&cursor);
+
+ return (0);
+}
+
+/*
+ * Find all objsets under name, and for each, call 'func(child_name, arg)'.
+ * Recurses into child dirs; with DS_FIND_SNAPSHOTS in flags, also
+ * visits each snapshot ("name@snap").  The dataset itself is visited
+ * last (children first, depth-first).
+ */
+void
+dmu_objset_find(char *name, void func(char *, void *), void *arg, int flags)
+{
+ dsl_dir_t *dd;
+ objset_t *os;
+ uint64_t snapobj;
+ zap_cursor_t zc;
+ zap_attribute_t attr;
+ char *child;
+ int do_self;
+
+ dd = dsl_dir_open(name, FTAG, NULL);
+ if (dd == NULL)
+ return;
+
+ /* Only dirs with a head dataset are themselves visited. */
+ do_self = (dd->dd_phys->dd_head_dataset_obj != 0);
+
+ /*
+ * Iterate over all children.
+ */
+ if (dd->dd_phys->dd_child_dir_zapobj != 0) {
+ for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset,
+ dd->dd_phys->dd_child_dir_zapobj);
+ zap_cursor_retrieve(&zc, &attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT(attr.za_integer_length == sizeof (uint64_t));
+ ASSERT(attr.za_num_integers == 1);
+
+ /*
+ * No separating '/' because parent's name ends in /.
+ */
+ child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ /* XXX could probably just use name here */
+ dsl_dir_name(dd, child);
+ (void) strcat(child, "/");
+ (void) strcat(child, attr.za_name);
+ dmu_objset_find(child, func, arg, flags);
+ kmem_free(child, MAXPATHLEN);
+ }
+ }
+
+ /*
+ * Iterate over all snapshots.
+ */
+ if ((flags & DS_FIND_SNAPSHOTS) &&
+ dmu_objset_open(name, DMU_OST_ANY,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &os) == 0) {
+
+ snapobj = os->os->os_dsl_dataset->ds_phys->ds_snapnames_zapobj;
+ dmu_objset_close(os);
+
+ for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset, snapobj);
+ zap_cursor_retrieve(&zc, &attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT(attr.za_integer_length == sizeof (uint64_t));
+ ASSERT(attr.za_num_integers == 1);
+
+ child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ /* XXX could probably just use name here */
+ dsl_dir_name(dd, child);
+ (void) strcat(child, "@");
+ (void) strcat(child, attr.za_name);
+ func(child, arg);
+ kmem_free(child, MAXPATHLEN);
+ }
+ }
+
+ dsl_dir_close(dd, FTAG);
+
+ /*
+ * Apply to self if appropriate.
+ */
+ if (do_self)
+ func(name, arg);
+}
diff --git a/usr/src/uts/common/fs/zfs/dmu_traverse.c b/usr/src/uts/common/fs/zfs/dmu_traverse.c
new file mode 100644
index 0000000000..036e3965cf
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c
@@ -0,0 +1,792 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dnode.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_impl.h>
+
+#define BP_SPAN_SHIFT(level, width) ((level) * (width))
+
+#define BP_EQUAL(b1, b2) \
+ (DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) && \
+ (b1)->blk_birth == (b2)->blk_birth)
+
+/*
+ * Compare two bookmarks.
+ *
+ * For ADVANCE_PRE, the visitation order is:
+ *
+ * objset 0, 1, 2, ..., ZB_MAXOBJSET.
+ * object 0, 1, 2, ..., ZB_MAXOBJECT.
+ * blkoff 0, 1, 2, ...
+ * level ZB_MAXLEVEL, ..., 2, 1, 0.
+ *
+ * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid
+ * ordering vector is:
+ *
+ * < objset, object, blkoff, -level >
+ *
+ * For ADVANCE_POST, the starting offsets aren't sequential but ending
+ * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are.
+ * The visitation order is:
+ *
+ * objset 1, 2, ..., ZB_MAXOBJSET, 0.
+ * object 1, 2, ..., ZB_MAXOBJECT, 0.
+ * blkoff 1, 2, ...
+ * level 0, 1, 2, ..., ZB_MAXLEVEL.
+ *
+ * and thus a valid ordering vector is:
+ *
+ * < objset - 1, object - 1, blkoff, level >
+ *
+ * Both orderings can be expressed as:
+ *
+ * < objset + bias, object + bias, blkoff, level ^ bias >
+ *
+ * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST)
+ * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift).
+ *
+ * Special case: an objset's osphys is represented as level -1 of object 0.
+ * It is always either the very first or very last block we visit in an objset.
+ * Therefore, if either bookmark's level is -1, level alone determines order.
+ */
+static int
+compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp,
+ int advance)
+{
+ int bias = (advance & ADVANCE_PRE) ? 0 : -1;
+ uint64_t sblkoff, eblkoff;
+ int slevel, elevel, wshift;
+
+ if (szb->zb_objset + bias < ezb->zb_objset + bias)
+ return (-1);
+
+ if (szb->zb_objset + bias > ezb->zb_objset + bias)
+ return (1);
+
+ slevel = szb->zb_level;
+ elevel = ezb->zb_level;
+
+ if ((slevel | elevel) < 0)
+ return ((slevel ^ bias) - (elevel ^ bias));
+
+ if (szb->zb_object + bias < ezb->zb_object + bias)
+ return (-1);
+
+ if (szb->zb_object + bias > ezb->zb_object + bias)
+ return (1);
+
+ if (dnp == NULL)
+ return (0);
+
+ wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift);
+ eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift);
+
+ if (sblkoff < eblkoff)
+ return (-1);
+
+ if (sblkoff > eblkoff)
+ return (1);
+
+ return ((elevel ^ bias) - (slevel ^ bias));
+}
+
+#define SET_BOOKMARK(zb, objset, object, level, blkid) \
+{ \
+ (zb)->zb_objset = objset; \
+ (zb)->zb_object = object; \
+ (zb)->zb_level = level; \
+ (zb)->zb_blkid = blkid; \
+}
+
+#define SET_BOOKMARK_LB(zb, level, blkid) \
+{ \
+ (zb)->zb_level = level; \
+ (zb)->zb_blkid = blkid; \
+}
+
+static int
+advance_objset(zseg_t *zseg, uint64_t objset, int advance)
+{
+ zbookmark_t *zb = &zseg->seg_start;
+
+ if (advance & ADVANCE_PRE) {
+ if (objset >= ZB_MAXOBJSET)
+ return (ERANGE);
+ SET_BOOKMARK(zb, objset, 0, -1, 0);
+ } else {
+ if (objset >= ZB_MAXOBJSET)
+ objset = 0;
+ SET_BOOKMARK(zb, objset, 1, 0, 0);
+ }
+
+ if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
+ return (ERANGE);
+
+ return (EAGAIN);
+}
+
+static int
+advance_object(zseg_t *zseg, uint64_t object, int advance)
+{
+ zbookmark_t *zb = &zseg->seg_start;
+
+ if (advance & ADVANCE_PRE) {
+ if (object >= ZB_MAXOBJECT) {
+ SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0);
+ } else {
+ SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0);
+ }
+ } else {
+ if (zb->zb_object == 0) {
+ SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0);
+ } else {
+ if (object >= ZB_MAXOBJECT)
+ object = 0;
+ SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0);
+ }
+ }
+
+ if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
+ return (ERANGE);
+
+ return (EAGAIN);
+}
+
+static int
+advance_from_osphys(zseg_t *zseg, int advance)
+{
+ zbookmark_t *zb = &zseg->seg_start;
+
+ ASSERT(zb->zb_object == 0);
+ ASSERT(zb->zb_level == -1);
+ ASSERT(zb->zb_blkid == 0);
+
+ if (advance & ADVANCE_PRE) {
+ SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0);
+ } else {
+ if (zb->zb_objset == 0)
+ return (ERANGE);
+ SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0);
+ }
+
+ if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
+ return (ERANGE);
+
+ return (EAGAIN);
+}
+
+static int
+advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance)
+{
+ zbookmark_t *zb = &zseg->seg_start;
+ int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int maxlevel = dnp->dn_nlevels - 1;
+ int level = zb->zb_level;
+ uint64_t blkid = zb->zb_blkid;
+
+ if (advance & ADVANCE_PRE) {
+ if (level > 0 && rc == 0) {
+ level--;
+ blkid <<= wshift;
+ } else {
+ blkid++;
+
+ if ((blkid << BP_SPAN_SHIFT(level, wshift)) >
+ dnp->dn_maxblkid)
+ return (ERANGE);
+
+ while (level < maxlevel) {
+ if (P2PHASE(blkid, 1ULL << wshift))
+ break;
+ blkid >>= wshift;
+ level++;
+ }
+ }
+ } else {
+ if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) {
+ blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift);
+ level = 0;
+ } else {
+ blkid >>= wshift;
+ level++;
+ }
+
+ while ((blkid << BP_SPAN_SHIFT(level, wshift)) >
+ dnp->dn_maxblkid) {
+ if (level == maxlevel)
+ return (ERANGE);
+ blkid >>= wshift;
+ level++;
+ }
+ }
+ SET_BOOKMARK_LB(zb, level, blkid);
+
+ if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0)
+ return (ERANGE);
+
+ return (EAGAIN);
+}
+
+static int
+traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc)
+{
+ /*
+ * Before we issue the callback, prune against maxtxg.
+ *
+ * We prune against mintxg before we get here because it's a big win.
+ * If a given block was born in txg 37, then we know that the entire
+ * subtree below that block must have been born in txg 37 or earlier.
+ * We can therefore lop off huge branches of the tree as we go.
+ *
+ * There's no corresponding optimization for maxtxg because knowing
+ * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's
+ * children. In fact, the copy-on-write design of ZFS ensures that
+ * top-level blocks will pretty much always be new.
+ *
+ * Therefore, in the name of simplicity we don't prune against
+ * maxtxg until the last possible moment -- that being right now.
+ */
+ if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg)
+ return (0);
+
+ if (bc->bc_errno == 0) {
+ zbookmark_t *zb = &bc->bc_bookmark;
+ zbookmark_t *szb = &zseg->seg_start;
+ zbookmark_t *ezb = &zseg->seg_end;
+ zbookmark_t *lzb = &th->th_lastcb;
+ dnode_phys_t *dnp = bc->bc_dnode;
+
+ /*
+ * Debugging: verify that the order we visit things
+ * agrees with the order defined by compare_bookmark().
+ */
+ ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0);
+ ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0);
+ ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 ||
+ lzb->zb_level == ZB_NO_LEVEL);
+ *lzb = *zb;
+ }
+
+ th->th_callbacks++;
+ return (th->th_func(bc, th->th_spa, th->th_arg));
+}
+
+static int
+traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp,
+ dnode_phys_t *dnp)
+{
+ zbookmark_t *zb = &bc->bc_bookmark;
+ int error;
+
+ th->th_hits++;
+
+ bc->bc_dnode = dnp;
+ bc->bc_errno = 0;
+
+ if (BP_EQUAL(&bc->bc_blkptr, bp))
+ return (0);
+
+ bc->bc_blkptr = *bp;
+
+ if (bc->bc_data == NULL)
+ return (0);
+
+ if (BP_IS_HOLE(bp)) {
+ ASSERT(th->th_advance & ADVANCE_HOLES);
+ return (0);
+ }
+
+ if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) {
+ error = EIO;
+ } else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) {
+ error = 0;
+ th->th_arc_hits++;
+ } else {
+ error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data,
+ BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ,
+ th->th_zio_flags | ZIO_FLAG_DONT_CACHE));
+
+ if (BP_SHOULD_BYTESWAP(bp) && error == 0)
+ (zb->zb_level > 0 ? byteswap_uint64_array :
+ dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data,
+ BP_GET_LSIZE(bp));
+ th->th_reads++;
+ }
+
+ if (error) {
+ bc->bc_errno = error;
+ error = traverse_callback(th, NULL, bc);
+ ASSERT(error == EAGAIN || error == EINTR || error == ERESTART);
+ bc->bc_blkptr.blk_birth = -1ULL;
+ }
+
+ dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n",
+ bc - &th->th_cache[0][0], error,
+ zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
+
+ return (error);
+}
+
+static int
+find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth)
+{
+ zbookmark_t *zb = &zseg->seg_start;
+ traverse_blk_cache_t *bc;
+ blkptr_t *bp = dnp->dn_blkptr;
+ int i, first, level;
+ int nbp = dnp->dn_nblkptr;
+ int minlevel = zb->zb_level;
+ int maxlevel = dnp->dn_nlevels - 1;
+ int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift);
+ uint64_t blkid = zb->zb_blkid >> bp_shift;
+ int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE;
+ int rc;
+
+ if (minlevel > maxlevel || blkid >= nbp)
+ return (ERANGE);
+
+ for (level = maxlevel; level >= minlevel; level--) {
+ first = P2PHASE(blkid, 1ULL << wshift);
+
+ for (i = first; i < nbp; i++)
+ if (bp[i].blk_birth > zseg->seg_mintxg ||
+ BP_IS_HOLE(&bp[i]) && do_holes)
+ break;
+
+ if (i != first) {
+ i--;
+ SET_BOOKMARK_LB(zb, level, blkid + (i - first));
+ return (ENOTBLK);
+ }
+
+ bc = &th->th_cache[depth][level];
+
+ SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object,
+ level, blkid);
+
+ if (rc = traverse_read(th, bc, bp + i, dnp)) {
+ if (rc != EAGAIN) {
+ SET_BOOKMARK_LB(zb, level, blkid);
+ }
+ return (rc);
+ }
+
+ if (BP_IS_HOLE(&bp[i])) {
+ SET_BOOKMARK_LB(zb, level, blkid);
+ th->th_lastcb.zb_level = ZB_NO_LEVEL;
+ return (0);
+ }
+
+ nbp = 1 << wshift;
+ bp = bc->bc_data;
+ bp_shift -= wshift;
+ blkid = zb->zb_blkid >> bp_shift;
+ }
+
+ return (0);
+}
+
+static int
+get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn,
+ uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth)
+{
+ zseg_t zseg;
+ zbookmark_t *zb = &zseg.seg_start;
+ uint64_t object = *objectp;
+ int i, rc;
+
+ SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK);
+ SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID);
+
+ zseg.seg_mintxg = txg;
+ zseg.seg_maxtxg = -1ULL;
+
+ for (;;) {
+ rc = find_block(th, &zseg, mdn, depth);
+
+ if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
+ break;
+
+ if (rc == 0 && zb->zb_level == 0) {
+ dnode_phys_t *dnp = th->th_cache[depth][0].bc_data;
+ for (i = 0; i < DNODES_PER_BLOCK; i++) {
+ object = (zb->zb_blkid * DNODES_PER_BLOCK) + i;
+ if (object >= *objectp &&
+ dnp[i].dn_type != DMU_OT_NONE &&
+ (type == -1 || dnp[i].dn_type == type)) {
+ *objectp = object;
+ *dnpp = &dnp[i];
+ return (0);
+ }
+ }
+ }
+
+ rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE);
+
+ if (rc == ERANGE)
+ break;
+ }
+
+ if (rc == ERANGE)
+ *objectp = ZB_MAXOBJECT;
+
+ return (rc);
+}
+
+static int
+traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
+{
+ zbookmark_t *zb = &zseg->seg_start;
+ traverse_blk_cache_t *bc;
+ dnode_phys_t *dn, *dn_tmp;
+ int worklimit = 1000;
+ int rc;
+
+ dprintf("<%llu, %llu, %d, %llx>\n",
+ zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
+
+ bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1];
+ dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
+
+ SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0);
+
+ rc = traverse_read(th, bc, mosbp, dn);
+
+ if (rc) /* If we get ERESTART, we've got nowhere left to go */
+ return (rc == ERESTART ? EINTR : rc);
+
+ ASSERT(dn->dn_nlevels < ZB_MAXLEVEL);
+
+ if (zb->zb_objset != 0) {
+ uint64_t objset = zb->zb_objset;
+ dsl_dataset_phys_t *dsp;
+
+ rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0,
+ DMU_OT_DSL_OBJSET, ZB_MOS_CACHE);
+
+ if (objset != zb->zb_objset)
+ rc = advance_objset(zseg, objset, th->th_advance);
+
+ if (rc != 0)
+ return (rc);
+
+ dsp = DN_BONUS(dn_tmp);
+
+ bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1];
+ dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
+
+ SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0);
+
+ rc = traverse_read(th, bc, &dsp->ds_bp, dn);
+
+ if (rc != 0) {
+ if (rc == ERESTART)
+ rc = advance_objset(zseg, zb->zb_objset + 1,
+ th->th_advance);
+ return (rc);
+ }
+
+ if (th->th_advance & ADVANCE_PRUNE)
+ zseg->seg_mintxg =
+ MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg);
+ }
+
+ if (zb->zb_level == -1) {
+ ASSERT(zb->zb_object == 0);
+
+ if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) {
+ rc = traverse_callback(th, zseg, bc);
+ if (rc) {
+ ASSERT(rc == EINTR);
+ return (rc);
+ }
+ }
+
+ return (advance_from_osphys(zseg, th->th_advance));
+ }
+
+ if (zb->zb_object != 0) {
+ uint64_t object = zb->zb_object;
+
+ rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp,
+ zseg->seg_mintxg, -1, ZB_MDN_CACHE);
+
+ if (object != zb->zb_object)
+ rc = advance_object(zseg, object, th->th_advance);
+
+ if (rc != 0)
+ return (rc);
+
+ dn = dn_tmp;
+ }
+
+ if (zb->zb_level == ZB_MAXLEVEL)
+ zb->zb_level = dn->dn_nlevels - 1;
+
+ for (;;) {
+ rc = find_block(th, zseg, dn, ZB_DN_CACHE);
+
+ if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
+ break;
+
+ if (rc == 0) {
+ bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level];
+ ASSERT(bc->bc_dnode == dn);
+ ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth);
+ rc = traverse_callback(th, zseg, bc);
+ if (rc) {
+ ASSERT(rc == EINTR);
+ return (rc);
+ }
+ if (BP_IS_HOLE(&bc->bc_blkptr)) {
+ ASSERT(th->th_advance & ADVANCE_HOLES);
+ rc = ENOTBLK;
+ }
+ }
+
+ rc = advance_block(zseg, dn, rc, th->th_advance);
+
+ if (rc == ERANGE)
+ break;
+
+ /*
+ * Give spa_sync() a chance to run.
+ */
+ if (spa_traverse_wanted(th->th_spa)) {
+ th->th_syncs++;
+ return (EAGAIN);
+ }
+
+ if (--worklimit == 0)
+ return (EAGAIN);
+ }
+
+ if (rc == ERANGE)
+ rc = advance_object(zseg, zb->zb_object + 1, th->th_advance);
+
+ return (rc);
+}
+
+/*
+ * It is the caller's responsibility to ensure that the dsl_dataset_t
+ * doesn't go away during traversal.
+ */
+int
+traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance,
+ blkptr_cb_t func, void *arg)
+{
+ spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+ traverse_handle_t *th;
+ int err;
+
+ th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED);
+
+ traverse_add_objset(th, txg_start, -1ULL, ds->ds_object);
+
+ while ((err = traverse_more(th)) == EAGAIN)
+ continue;
+
+ traverse_fini(th);
+ return (err);
+}
+
+int
+traverse_more(traverse_handle_t *th)
+{
+ zseg_t *zseg = list_head(&th->th_seglist);
+ uint64_t save_txg; /* XXX won't be necessary with real itinerary */
+ krwlock_t *rw = spa_traverse_rwlock(th->th_spa);
+ blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa);
+ int rc;
+
+ if (zseg == NULL)
+ return (0);
+
+ th->th_restarts++;
+
+ save_txg = zseg->seg_mintxg;
+
+ if (!(th->th_advance & ADVANCE_NOLOCK))
+ rw_enter(rw, RW_READER);
+
+ rc = traverse_segment(th, zseg, mosbp);
+ ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR);
+
+ if (!(th->th_advance & ADVANCE_NOLOCK))
+ rw_exit(rw);
+
+ zseg->seg_mintxg = save_txg;
+
+ if (rc == ERANGE) {
+ list_remove(&th->th_seglist, zseg);
+ kmem_free(zseg, sizeof (*zseg));
+ return (EAGAIN);
+ }
+
+ return (rc);
+}
+
+/*
+ * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves
+ * are not included. The blocks covered by this segment will all have
+ * mintxg < birth < maxtxg.
+ */
+static void
+traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid,
+ uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid)
+{
+ zseg_t *zseg;
+
+ zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP);
+
+ zseg->seg_mintxg = mintxg;
+ zseg->seg_maxtxg = maxtxg;
+
+ zseg->seg_start.zb_objset = sobjset;
+ zseg->seg_start.zb_object = sobject;
+ zseg->seg_start.zb_level = slevel;
+ zseg->seg_start.zb_blkid = sblkid;
+
+ zseg->seg_end.zb_objset = eobjset;
+ zseg->seg_end.zb_object = eobject;
+ zseg->seg_end.zb_level = elevel;
+ zseg->seg_end.zb_blkid = eblkid;
+
+ list_insert_tail(&th->th_seglist, zseg);
+}
+
+void
+traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t objset, uint64_t object)
+{
+ if (th->th_advance & ADVANCE_PRE)
+ traverse_add_segment(th, mintxg, maxtxg,
+ objset, object, ZB_MAXLEVEL, 0,
+ objset, object, 0, ZB_MAXBLKID);
+ else
+ traverse_add_segment(th, mintxg, maxtxg,
+ objset, object, 0, 0,
+ objset, object, 0, ZB_MAXBLKID);
+}
+
+void
+traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t objset)
+{
+ if (th->th_advance & ADVANCE_PRE)
+ traverse_add_segment(th, mintxg, maxtxg,
+ objset, 0, -1, 0,
+ objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
+ else
+ traverse_add_segment(th, mintxg, maxtxg,
+ objset, 1, 0, 0,
+ objset, 0, -1, 0);
+}
+
+void
+traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg)
+{
+ if (th->th_advance & ADVANCE_PRE)
+ traverse_add_segment(th, mintxg, maxtxg,
+ 0, 0, -1, 0,
+ ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
+ else
+ traverse_add_segment(th, mintxg, maxtxg,
+ 1, 1, 0, 0,
+ 0, 0, -1, 0);
+}
+
+traverse_handle_t *
+traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance,
+ int zio_flags)
+{
+ traverse_handle_t *th;
+ int d, l;
+
+ th = kmem_zalloc(sizeof (*th), KM_SLEEP);
+
+ th->th_spa = spa;
+ th->th_func = func;
+ th->th_arg = arg;
+ th->th_advance = advance;
+ th->th_lastcb.zb_level = ZB_NO_LEVEL;
+ th->th_noread.zb_level = ZB_NO_LEVEL;
+ th->th_zio_flags = zio_flags;
+
+ list_create(&th->th_seglist, sizeof (zseg_t),
+ offsetof(zseg_t, seg_node));
+
+ for (d = 0; d < ZB_DEPTH; d++) {
+ for (l = 0; l < ZB_MAXLEVEL; l++) {
+ if ((advance & ADVANCE_DATA) ||
+ l != 0 || d != ZB_DN_CACHE)
+ th->th_cache[d][l].bc_data =
+ zio_buf_alloc(SPA_MAXBLOCKSIZE);
+ }
+ }
+
+ return (th);
+}
+
+void
+traverse_fini(traverse_handle_t *th)
+{
+ int d, l;
+ zseg_t *zseg;
+
+ for (d = 0; d < ZB_DEPTH; d++)
+ for (l = 0; l < ZB_MAXLEVEL; l++)
+ if (th->th_cache[d][l].bc_data != NULL)
+ zio_buf_free(th->th_cache[d][l].bc_data,
+ SPA_MAXBLOCKSIZE);
+
+ while ((zseg = list_head(&th->th_seglist)) != NULL) {
+ list_remove(&th->th_seglist, zseg);
+ kmem_free(zseg, sizeof (*zseg));
+ }
+
+ list_destroy(&th->th_seglist);
+
+ dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n",
+ th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks,
+ th->th_syncs, th->th_restarts);
+
+ kmem_free(th, sizeof (*th));
+}
diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c
new file mode 100644
index 0000000000..5dd827e946
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c
@@ -0,0 +1,801 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
+#include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
+#include <sys/dsl_pool.h>
+#include <sys/zap_impl.h> /* for ZAP_BLOCK_SHIFT */
+#include <sys/spa.h>
+#include <sys/zfs_context.h>
+
+#ifdef ZFS_DEBUG
+int dmu_use_tx_debug_bufs = 1;
+#endif
+
+dmu_tx_t *
+dmu_tx_create_ds(dsl_dir_t *dd)
+{
+ dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
+ tx->tx_dir = dd;
+ if (dd)
+ tx->tx_pool = dd->dd_pool;
+ list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
+ offsetof(dmu_tx_hold_t, dth_node));
+ refcount_create(&tx->tx_space_written);
+ refcount_create(&tx->tx_space_freed);
+ return (tx);
+}
+
+dmu_tx_t *
+dmu_tx_create(objset_t *os)
+{
+ dmu_tx_t *tx = dmu_tx_create_ds(os->os->os_dsl_dataset->ds_dir);
+ tx->tx_objset = os;
+ return (tx);
+}
+
+dmu_tx_t *
+dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
+{
+ dmu_tx_t *tx = dmu_tx_create_ds(NULL);
+
+ ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
+ tx->tx_pool = dp;
+ tx->tx_txg = txg;
+ tx->tx_anyobj = TRUE;
+
+ return (tx);
+}
+
+int
+dmu_tx_is_syncing(dmu_tx_t *tx)
+{
+ return (tx->tx_anyobj);
+}
+
+int
+dmu_tx_private_ok(dmu_tx_t *tx)
+{
+ return (tx->tx_anyobj || tx->tx_privateobj);
+}
+
+static void
+dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
+ enum dmu_tx_hold_type type, dmu_tx_hold_func_t func,
+ uint64_t arg1, uint64_t arg2)
+{
+ dmu_tx_hold_t *dth;
+ dnode_t *dn = NULL;
+
+ if (object != DMU_NEW_OBJECT) {
+ dn = dnode_hold(os->os, object, tx);
+
+ if (tx->tx_txg != 0) {
+ mutex_enter(&dn->dn_mtx);
+ /*
+ * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
+ * problem, but there's no way for it to happen (for
+ * now, at least).
+ */
+ ASSERT(dn->dn_assigned_txg == 0);
+ ASSERT(dn->dn_assigned_tx == NULL);
+ dn->dn_assigned_txg = tx->tx_txg;
+ dn->dn_assigned_tx = tx;
+ (void) refcount_add(&dn->dn_tx_holds, tx);
+ mutex_exit(&dn->dn_mtx);
+ }
+ }
+
+ dth = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
+ dth->dth_dnode = dn;
+ dth->dth_type = type;
+ dth->dth_func = func;
+ dth->dth_arg1 = arg1;
+ dth->dth_arg2 = arg2;
+ /*
+ * XXX Investigate using a different data structure to keep
+ * track of dnodes in a tx. Maybe array, since there will
+ * generally not be many entries?
+ */
+ list_insert_tail(&tx->tx_holds, dth);
+}
+
+void
+dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
+{
+ /*
+ * If we're syncing, they can manipulate any object anyhow, and
+ * the hold on the dnode_t can cause problems.
+ */
+ if (!dmu_tx_is_syncing(tx)) {
+ dmu_tx_hold_object_impl(tx, os, object, THT_NEWOBJECT,
+ NULL, 0, 0);
+ }
+}
+
+/* ARGSUSED */
+static void
+dmu_tx_count_write(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
+{
+ uint64_t start, end, space;
+ int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
+
+ if (len == 0)
+ return;
+
+ min_bs = SPA_MINBLOCKSHIFT;
+ max_bs = SPA_MAXBLOCKSHIFT;
+ min_ibs = DN_MIN_INDBLKSHIFT;
+ max_ibs = DN_MAX_INDBLKSHIFT;
+
+ /*
+ * If there's more than one block, the blocksize can't change,
+ * so we can make a more precise estimate. Alternatively,
+ * if the dnode's ibs is larger than max_ibs, always use that.
+ * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
+ * the code will still work correctly on existing pools.
+ */
+ if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) {
+ min_ibs = max_ibs = dn->dn_indblkshift;
+ if (dn->dn_datablkshift != 0)
+ min_bs = max_bs = dn->dn_datablkshift;
+ }
+
+ /*
+ * 'end' is the last thing we will access, not one past.
+ * This way we won't overflow when accessing the last byte.
+ */
+ start = P2ALIGN(off, 1ULL << max_bs);
+ end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
+ space = end - start + 1;
+
+ start >>= min_bs;
+ end >>= min_bs;
+
+ epbs = min_ibs - SPA_BLKPTRSHIFT;
+
+ /*
+ * The object contains at most 2^(64 - min_bs) blocks,
+ * and each indirect level maps 2^epbs.
+ */
+ for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
+ start >>= epbs;
+ end >>= epbs;
+ /*
+ * If we increase the number of levels of indirection,
+ * we'll need new blkid=0 indirect blocks. If start == 0,
+	 * we're already accounting for that block; and if end == 0,
+ * we can't increase the number of levels beyond that.
+ */
+ if (start != 0 && end != 0)
+ space += 1ULL << max_ibs;
+ space += (end - start + 1) << max_ibs;
+ }
+
+ ASSERT(space < 2 * DMU_MAX_ACCESS);
+
+ tx->tx_space_towrite += space;
+}
+
+static void
+dmu_tx_count_dnode(dmu_tx_t *tx, dnode_t *dn)
+{
+ dnode_t *mdn = tx->tx_objset->os->os_meta_dnode;
+ uint64_t object = dn ? dn->dn_object : DN_MAX_OBJECT - 1;
+ uint64_t pre_write_space;
+
+ ASSERT(object < DN_MAX_OBJECT);
+ pre_write_space = tx->tx_space_towrite;
+ dmu_tx_count_write(tx, mdn, object << DNODE_SHIFT, 1 << DNODE_SHIFT);
+ if (dn && dn->dn_dbuf->db_blkptr &&
+ dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
+ dn->dn_dbuf->db_blkptr->blk_birth, tx)) {
+ tx->tx_space_tooverwrite +=
+ tx->tx_space_towrite - pre_write_space;
+ tx->tx_space_towrite = pre_write_space;
+ }
+}
+
+/* ARGSUSED */
+static void
+dmu_tx_hold_write_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
+{
+ dmu_tx_count_write(tx, dn, off, len);
+ dmu_tx_count_dnode(tx, dn);
+}
+
+void
+dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
+{
+ ASSERT(tx->tx_txg == 0);
+ ASSERT(len > 0 && len < DMU_MAX_ACCESS);
+ ASSERT(UINT64_MAX - off >= len - 1);
+
+ dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_WRITE,
+ dmu_tx_hold_write_impl, off, len);
+}
+
+static void
+dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
+{
+ uint64_t blkid, nblks;
+ uint64_t space = 0;
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+
+ ASSERT(dn->dn_assigned_tx == tx || dn->dn_assigned_tx == NULL);
+
+ if (dn->dn_datablkshift == 0)
+ return;
+ /*
+	 * Not that the dnode can change (it isn't dirty), but
+	 * dbuf_hold_impl() wants us to have the struct_rwlock.
+	 * We also need it to protect dn_maxblkid.
+ */
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ blkid = off >> dn->dn_datablkshift;
+ nblks = (off + len) >> dn->dn_datablkshift;
+
+ if (blkid >= dn->dn_maxblkid)
+ goto out;
+ if (blkid + nblks > dn->dn_maxblkid)
+ nblks = dn->dn_maxblkid - blkid;
+
+	/* don't bother past the first 128K (128*1024) blocks */
+ nblks = MIN(nblks, 128*1024);
+
+ if (dn->dn_phys->dn_nlevels == 1) {
+ int i;
+ for (i = 0; i < nblks; i++) {
+ blkptr_t *bp = dn->dn_phys->dn_blkptr;
+ ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr);
+ bp += blkid + i;
+ if (dsl_dataset_block_freeable(ds, bp->blk_birth, tx)) {
+ dprintf_bp(bp, "can free old%s", "");
+ space += BP_GET_ASIZE(bp);
+ }
+ }
+ goto out;
+ }
+
+ while (nblks) {
+ dmu_buf_impl_t *dbuf;
+ int err, epbs, blkoff, tochk;
+
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ blkoff = P2PHASE(blkid, 1<<epbs);
+ tochk = MIN((1<<epbs) - blkoff, nblks);
+
+ err = dbuf_hold_impl(dn, 1, blkid >> epbs, TRUE, FTAG, &dbuf);
+ if (err == 0) {
+ int i;
+ blkptr_t *bp;
+
+ dbuf_read_havestruct(dbuf);
+
+ bp = dbuf->db.db_data;
+ bp += blkoff;
+
+ for (i = 0; i < tochk; i++) {
+ if (dsl_dataset_block_freeable(ds,
+ bp[i].blk_birth, tx)) {
+ dprintf_bp(&bp[i],
+ "can free old%s", "");
+ space += BP_GET_ASIZE(&bp[i]);
+ }
+ }
+ dbuf_remove_ref(dbuf, FTAG);
+ } else {
+ /* the indirect block is sparse */
+ ASSERT(err == ENOENT);
+ }
+
+ blkid += tochk;
+ nblks -= tochk;
+ }
+out:
+ rw_exit(&dn->dn_struct_rwlock);
+
+ tx->tx_space_tofree += space;
+}
+
+static void
+dmu_tx_hold_free_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
+{
+ int dirty;
+
+ /* first block */
+ if (off != 0 /* || dn->dn_maxblkid == 0 */)
+ dmu_tx_count_write(tx, dn, off, 1);
+ /* last block */
+ if (len != DMU_OBJECT_END)
+ dmu_tx_count_write(tx, dn, off+len, 1);
+
+ dmu_tx_count_dnode(tx, dn);
+
+ if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
+ return;
+ if (len == DMU_OBJECT_END)
+ len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
+
+ /* XXX locking */
+ dirty = dn->dn_dirtyblksz[0] | dn->dn_dirtyblksz[1] |
+ dn->dn_dirtyblksz[2] | dn->dn_dirtyblksz[3];
+ if (dn->dn_assigned_tx != NULL && !dirty)
+ dmu_tx_count_free(tx, dn, off, len);
+}
+
+void
+dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
+{
+ ASSERT(tx->tx_txg == 0);
+
+ dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_FREE,
+ dmu_tx_hold_free_impl, off, len);
+}
+
+/* ARGSUSED */
+static void
+dmu_tx_hold_zap_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t nops, uint64_t cops)
+{
+ uint64_t nblocks;
+ int epbs;
+
+ dmu_tx_count_dnode(tx, dn);
+
+ if (dn == NULL) {
+ /*
+ * Assuming that nops+cops is not super huge, we will be
+ * able to fit a new object's entries into one leaf
+ * block. So there will be at most 2 blocks total,
+ * including the header block.
+ */
+ dmu_tx_count_write(tx, dn, 0, 2 << ZAP_BLOCK_SHIFT);
+ return;
+ }
+
+ ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap);
+
+ if (dn->dn_maxblkid == 0 && nops == 0) {
+ /*
+ * If there is only one block (i.e. this is a micro-zap)
+ * and we are only doing updates, the accounting is simple.
+ */
+ if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
+ dn->dn_phys->dn_blkptr[0].blk_birth, tx))
+ tx->tx_space_tooverwrite += dn->dn_datablksz;
+ else
+ tx->tx_space_towrite += dn->dn_datablksz;
+ return;
+ }
+
+ /*
+ * 3 blocks overwritten per op: target leaf, ptrtbl block, header block
+ * 3 new blocks written per op: new split leaf, 2 grown ptrtbl blocks
+ */
+ dmu_tx_count_write(tx, dn, dn->dn_maxblkid * dn->dn_datablksz,
+ (nops * 6ULL + cops * 3ULL) << ZAP_BLOCK_SHIFT);
+
+ /*
+ * If the modified blocks are scattered to the four winds,
+ * we'll have to modify an indirect twig for each.
+ */
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
+ tx->tx_space_towrite +=
+ ((nops + cops) * 3ULL) << dn->dn_indblkshift;
+}
+
+void
+dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int ops)
+{
+ ASSERT(tx->tx_txg == 0);
+
+ dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_ZAP,
+ dmu_tx_hold_zap_impl, (ops > 0?ops:0), (ops < 0?-ops:0));
+}
+
+void
+dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
+{
+ ASSERT(tx->tx_txg == 0);
+
+ dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_BONUS,
+ dmu_tx_hold_write_impl, 0, 0);
+}
+
+
+/* ARGSUSED */
+static void
+dmu_tx_hold_space_impl(dmu_tx_t *tx, dnode_t *dn,
+ uint64_t space, uint64_t unused)
+{
+ tx->tx_space_towrite += space;
+}
+
+void
+dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
+{
+ ASSERT(tx->tx_txg == 0);
+
+ dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, THT_SPACE,
+ dmu_tx_hold_space_impl, space, 0);
+}
+
+/*
+ * Count how many holds this (already-assigned) tx has on the given
+ * object, by walking the tx's hold list.
+ */
+int
+dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
+{
+ dmu_tx_hold_t *dth;
+ int holds = 0;
+
+ /*
+ * By asserting that the tx is assigned, we're counting the
+ * number of dn_tx_holds, which is the same as the number of
+ * dn_holds. Otherwise, we'd be counting dn_holds, but
+ * dn_tx_holds could be 0.
+ */
+ ASSERT(tx->tx_txg != 0);
+
+ /* if (tx->tx_anyobj == TRUE) */
+ /* return (0); */
+
+ for (dth = list_head(&tx->tx_holds); dth;
+ dth = list_next(&tx->tx_holds, dth)) {
+ if (dth->dth_dnode && dth->dth_dnode->dn_object == object)
+ holds++;
+ }
+
+ return (holds);
+}
+
+/*
+ * Debug-only sanity check (compiled under ZFS_DEBUG): verify that the
+ * dbuf being dirtied is covered by some hold on this tx -- matching
+ * both the object and, per hold type, the block/offset range.  Panics
+ * if no hold covers the dbuf.  No-op in non-debug builds.
+ */
+void
+dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
+{
+#ifdef ZFS_DEBUG
+ dmu_tx_hold_t *dth;
+ int match_object = FALSE, match_offset = FALSE;
+ dnode_t *dn = db->db_dnode;
+
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os);
+ ASSERT3U(dn->dn_object, ==, db->db.db_object);
+
+ if (tx->tx_anyobj)
+ return;
+
+ /* XXX No checking on the meta dnode for now */
+ if (db->db.db_object & DMU_PRIVATE_OBJECT)
+ return;
+
+ for (dth = list_head(&tx->tx_holds); dth;
+ dth = list_next(&tx->tx_holds, dth)) {
+ ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
+ if (dth->dth_dnode == dn && dth->dth_type != THT_NEWOBJECT)
+ match_object = TRUE;
+ if (dth->dth_dnode == NULL || dth->dth_dnode == dn) {
+ int datablkshift = dn->dn_datablkshift ?
+ dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int shift = datablkshift + epbs * db->db_level;
+ uint64_t beginblk = shift >= 64 ? 0 :
+ (dth->dth_arg1 >> shift);
+ uint64_t endblk = shift >= 64 ? 0 :
+ ((dth->dth_arg1 + dth->dth_arg2 - 1) >> shift);
+ uint64_t blkid = db->db_blkid;
+
+ /* XXX dth_arg2 better not be zero... */
+
+ dprintf("found dth type %x beginblk=%llx endblk=%llx\n",
+ dth->dth_type, beginblk, endblk);
+
+ switch (dth->dth_type) {
+ case THT_WRITE:
+ if (blkid >= beginblk && blkid <= endblk)
+ match_offset = TRUE;
+ /*
+ * We will let this hold work for the bonus
+ * buffer so that we don't need to hold it
+ * when creating a new object.
+ */
+ if (blkid == DB_BONUS_BLKID)
+ match_offset = TRUE;
+ /*
+ * They might have to increase nlevels,
+ * thus dirtying the new TLIBs. Or they
+ * might have to change the block size,
+ * thus dirtying the new lvl=0 blk=0.
+ */
+ if (blkid == 0)
+ match_offset = TRUE;
+ break;
+ case THT_FREE:
+ if (blkid == beginblk &&
+ (dth->dth_arg1 != 0 ||
+ dn->dn_maxblkid == 0))
+ match_offset = TRUE;
+ if (blkid == endblk &&
+ dth->dth_arg2 != DMU_OBJECT_END)
+ match_offset = TRUE;
+ break;
+ case THT_BONUS:
+ if (blkid == DB_BONUS_BLKID)
+ match_offset = TRUE;
+ break;
+ case THT_ZAP:
+ match_offset = TRUE;
+ break;
+ case THT_NEWOBJECT:
+ match_object = TRUE;
+ break;
+ default:
+ ASSERT(!"bad dth_type");
+ }
+ }
+ if (match_object && match_offset)
+ return;
+ }
+ panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
+ (u_longlong_t)db->db.db_object, db->db_level,
+ (u_longlong_t)db->db_blkid);
+#endif
+}
+
+/*
+ * Attempt to assign the tx to the currently open txg: take a tx hold
+ * on each dnode in the hold list, run each hold's accounting callback,
+ * then temp-reserve worst-case allocated space against tx_dir.
+ * Returns ERESTART (or the tempreserve error) on failure; *last_dth
+ * records the last hold processed so dmu_tx_unassign() can unwind.
+ */
+static int
+dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how, dmu_tx_hold_t **last_dth)
+{
+ dmu_tx_hold_t *dth;
+ uint64_t lsize, asize, fsize;
+
+ *last_dth = NULL;
+
+ tx->tx_space_towrite = 0;
+ tx->tx_space_tofree = 0;
+ tx->tx_space_tooverwrite = 0;
+ tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
+
+ /* caller asked for a specific txg; fail if the open txg differs */
+ if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg)
+ return (ERESTART);
+
+ for (dth = list_head(&tx->tx_holds); dth;
+ *last_dth = dth, dth = list_next(&tx->tx_holds, dth)) {
+ dnode_t *dn = dth->dth_dnode;
+ if (dn != NULL) {
+ mutex_enter(&dn->dn_mtx);
+ /*
+ * If the dnode is still assigned to the previous
+ * txg, wait for its holders to finish -- or fail
+ * immediately if we're not allowed to block.
+ */
+ while (dn->dn_assigned_txg == tx->tx_txg - 1) {
+ if (txg_how != TXG_WAIT) {
+ mutex_exit(&dn->dn_mtx);
+ return (ERESTART);
+ }
+ cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
+ }
+ if (dn->dn_assigned_txg == 0) {
+ ASSERT(dn->dn_assigned_tx == NULL);
+ dn->dn_assigned_txg = tx->tx_txg;
+ dn->dn_assigned_tx = tx;
+ } else {
+ ASSERT(dn->dn_assigned_txg == tx->tx_txg);
+ if (dn->dn_assigned_tx != tx)
+ dn->dn_assigned_tx = NULL;
+ }
+ (void) refcount_add(&dn->dn_tx_holds, tx);
+ mutex_exit(&dn->dn_mtx);
+ }
+ if (dth->dth_func)
+ dth->dth_func(tx, dn, dth->dth_arg1, dth->dth_arg2);
+ }
+
+ /*
+ * Convert logical size to worst-case allocated size.
+ */
+ fsize = spa_get_asize(tx->tx_pool->dp_spa, tx->tx_space_tooverwrite) +
+ tx->tx_space_tofree;
+ lsize = tx->tx_space_towrite + tx->tx_space_tooverwrite;
+ asize = spa_get_asize(tx->tx_pool->dp_spa, lsize);
+ tx->tx_space_towrite = asize;
+
+ if (tx->tx_dir && asize != 0) {
+ int err = dsl_dir_tempreserve_space(tx->tx_dir,
+ lsize, asize, fsize, &tx->tx_tempreserve_cookie, tx);
+ if (err)
+ return (err);
+ }
+
+ return (0);
+}
+
+/*
+ * Undo a (possibly partial) dmu_tx_try_assign(): walk the hold list
+ * backwards from last_dth, dropping each dnode's tx hold and waking
+ * any waiters when the last hold goes away.  Returns the txg that was
+ * being assigned, and leaves tx_txg cleared.
+ */
+static uint64_t
+dmu_tx_unassign(dmu_tx_t *tx, dmu_tx_hold_t *last_dth)
+{
+ uint64_t txg = tx->tx_txg;
+ dmu_tx_hold_t *dth;
+
+ ASSERT(txg != 0);
+
+ txg_rele_to_quiesce(&tx->tx_txgh);
+
+ for (dth = last_dth; dth; dth = list_prev(&tx->tx_holds, dth)) {
+ dnode_t *dn = dth->dth_dnode;
+
+ if (dn == NULL)
+ continue;
+ mutex_enter(&dn->dn_mtx);
+ ASSERT3U(dn->dn_assigned_txg, ==, txg);
+
+ if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
+ dn->dn_assigned_txg = 0;
+ dn->dn_assigned_tx = NULL;
+ cv_broadcast(&dn->dn_notxholds);
+ }
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ txg_rele_to_sync(&tx->tx_txgh);
+
+ tx->tx_txg = 0;
+ return (txg);
+}
+
+/*
+ * Assign tx to a transaction group. txg_how can be one of:
+ *
+ * (1) TXG_WAIT. If the current open txg is full, waits until there's
+ * a new one. This should be used when you're not holding locks.
+ * It will only fail if we're truly out of space (or over quota).
+ *
+ * (2) TXG_NOWAIT. If we can't assign into the current open txg without
+ * blocking, returns immediately with ERESTART. This should be used
+ * whenever you're holding locks. On an ERESTART error, the caller
+ * should drop locks, do a txg_wait_open(dp, 0), and try again.
+ *
+ * (3) A specific txg. Use this if you need to ensure that multiple
+ * transactions all sync in the same txg. Like TXG_NOWAIT, it
+ * returns ERESTART if it can't assign you into the requested txg.
+ */
+int
+dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
+{
+ dmu_tx_hold_t *last_dth;
+ int err;
+
+ ASSERT(tx->tx_txg == 0);
+ ASSERT(txg_how != 0);
+ ASSERT(!dsl_pool_sync_context(tx->tx_pool));
+ ASSERT3U(tx->tx_space_towrite, ==, 0);
+ ASSERT3U(tx->tx_space_tofree, ==, 0);
+
+ while ((err = dmu_tx_try_assign(tx, txg_how, &last_dth)) != 0) {
+ uint64_t txg = dmu_tx_unassign(tx, last_dth);
+
+ if (err != ERESTART || txg_how != TXG_WAIT)
+ return (err);
+
+ /* TXG_WAIT: wait for the next txg to open, then retry */
+ txg_wait_open(tx->tx_pool, txg + 1);
+ }
+
+ /* keep only the sync hold; dmu_tx_commit() will release it */
+ txg_rele_to_quiesce(&tx->tx_txgh);
+
+ return (0);
+}
+
+/*
+ * Record space actually consumed (delta > 0) or freed (delta < 0) by
+ * this tx, for cross-checking against the reserved estimates at commit
+ * time.  No-op for txs without a dsl_dir or a zero delta.
+ */
+void
+dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
+{
+ if (tx->tx_dir == NULL || delta == 0)
+ return;
+
+ if (delta > 0) {
+ /* actual usage must never exceed what was reserved */
+ ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
+ tx->tx_space_towrite);
+ (void) refcount_add_many(&tx->tx_space_written, delta, NULL);
+ } else {
+ (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
+ }
+}
+
+/*
+ * Commit an assigned tx: release every dnode hold (waking waiters on
+ * the last one), clear the temporary space reservation, drop the txg
+ * sync hold (unless tx_anyobj), and free the tx and its accounting
+ * refcounts.
+ */
+void
+dmu_tx_commit(dmu_tx_t *tx)
+{
+ dmu_tx_hold_t *dth;
+
+ ASSERT(tx->tx_txg != 0);
+
+ while (dth = list_head(&tx->tx_holds)) {
+ dnode_t *dn = dth->dth_dnode;
+
+ list_remove(&tx->tx_holds, dth);
+ kmem_free(dth, sizeof (dmu_tx_hold_t));
+ if (dn == NULL)
+ continue;
+ mutex_enter(&dn->dn_mtx);
+ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+
+ if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
+ dn->dn_assigned_txg = 0;
+ dn->dn_assigned_tx = NULL;
+ cv_broadcast(&dn->dn_notxholds);
+ }
+ mutex_exit(&dn->dn_mtx);
+ dnode_rele(dn, tx);
+ }
+
+ if (tx->tx_dir && tx->tx_space_towrite > 0) {
+ dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
+ }
+
+ if (tx->tx_anyobj == FALSE)
+ txg_rele_to_sync(&tx->tx_txgh);
+ dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
+ tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
+ tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
+ refcount_destroy_many(&tx->tx_space_written,
+ refcount_count(&tx->tx_space_written));
+ refcount_destroy_many(&tx->tx_space_freed,
+ refcount_count(&tx->tx_space_freed));
+#ifdef ZFS_DEBUG
+ if (tx->tx_debug_buf)
+ kmem_free(tx->tx_debug_buf, 4096);
+#endif
+ kmem_free(tx, sizeof (dmu_tx_t));
+}
+
+/*
+ * Abort an unassigned tx (tx_txg must be 0): release dnode holds and
+ * free the tx without touching any txg state.
+ */
+void
+dmu_tx_abort(dmu_tx_t *tx)
+{
+ dmu_tx_hold_t *dth;
+
+ ASSERT(tx->tx_txg == 0);
+
+ while (dth = list_head(&tx->tx_holds)) {
+ dnode_t *dn = dth->dth_dnode;
+
+ list_remove(&tx->tx_holds, dth);
+ kmem_free(dth, sizeof (dmu_tx_hold_t));
+ if (dn != NULL)
+ dnode_rele(dn, tx);
+ }
+ refcount_destroy_many(&tx->tx_space_written,
+ refcount_count(&tx->tx_space_written));
+ refcount_destroy_many(&tx->tx_space_freed,
+ refcount_count(&tx->tx_space_freed));
+#ifdef ZFS_DEBUG
+ if (tx->tx_debug_buf)
+ kmem_free(tx->tx_debug_buf, 4096);
+#endif
+ kmem_free(tx, sizeof (dmu_tx_t));
+}
+
+/* Return the txg this tx has been assigned to; the tx must be assigned. */
+uint64_t
+dmu_tx_get_txg(dmu_tx_t *tx)
+{
+ ASSERT(tx->tx_txg != 0);
+ return (tx->tx_txg);
+}
diff --git a/usr/src/uts/common/fs/zfs/dmu_zfetch.c b/usr/src/uts/common/fs/zfs/dmu_zfetch.c
new file mode 100644
index 0000000000..cfaeaf0674
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dmu_zfetch.c
@@ -0,0 +1,603 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dnode.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/dmu.h>
+#include <sys/dbuf.h>
+
+/*
+ * I'm against tune-ables, but these should probably exist as tweakable globals
+ * until we can get this working the way we want it to.
+ */
+
+/* max # of streams per zfetch */
+uint32_t zfetch_max_streams = 8;
+/* min time before stream reclaim */
+uint32_t zfetch_min_sec_reap = 2;
+/* max number of blocks to fetch at a time */
+uint32_t zfetch_block_cap = 32;
+/* number of bytes in an array_read at which we stop prefetching (1MB) */
+uint64_t zfetch_array_rd_sz = 1024 * 1024;
+
+/* forward decls for static routines */
+static int dmu_zfetch_colinear(zfetch_t *, zstream_t *);
+static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
+static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
+static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
+static int dmu_zfetch_find(zfetch_t *, zstream_t *);
+static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
+static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *);
+static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
+static void dmu_zfetch_stream_update(zfetch_t *, zstream_t *);
+static int dmu_zfetch_streams_equal(zstream_t *, zstream_t *);
+
+
+/*
+ * Given a zfetch structure and a zstream structure, determine whether the
+ * blocks to be read are part of a co-linear to a pair of existing prefetch
+ * streams. If a set is found, coalesce the streams, removing one, and
+ * configure the prefetch so it looks for a strided access pattern.
+ *
+ * Returns 1 (and kicks off a prefetch) if a co-linear pair was found
+ * and coalesced; returns 0 otherwise.
+ */
+static int
+dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
+{
+ zstream_t *z_walk;
+ zstream_t *z_comp;
+
+ rw_enter(&zf->zf_rwlock, RW_WRITER);
+
+ if (zh == NULL) {
+ rw_exit(&zf->zf_rwlock);
+ return (0);
+ }
+
+ /* check every ordered pair of streams (both stride directions) */
+ for (z_walk = list_head(&zf->zf_stream); z_walk;
+ z_walk = list_next(&zf->zf_stream, z_walk)) {
+ for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp;
+ z_comp = list_next(&zf->zf_stream, z_comp)) {
+ int64_t diff;
+
+ /* only consider streams not already strided */
+ if (z_walk->zst_len != z_walk->zst_stride ||
+ z_comp->zst_len != z_comp->zst_stride) {
+ continue;
+ }
+
+ /* does zh extend the walk->comp progression? */
+ diff = z_comp->zst_offset - z_walk->zst_offset;
+ if (z_comp->zst_offset + diff == zh->zst_offset) {
+ z_walk->zst_offset = zh->zst_offset;
+ z_walk->zst_direction = diff < 0 ? -1 : 1;
+ z_walk->zst_stride =
+ diff * z_walk->zst_direction;
+ z_walk->zst_ph_offset =
+ zh->zst_offset + z_walk->zst_stride;
+ dmu_zfetch_stream_remove(zf, z_comp);
+ mutex_destroy(&z_comp->zst_lock);
+ kmem_free(z_comp, sizeof (zstream_t));
+
+ dmu_zfetch_dofetch(zf, z_walk);
+
+ rw_exit(&zf->zf_rwlock);
+ return (1);
+ }
+
+ /* ... or the comp->walk progression? */
+ diff = z_walk->zst_offset - z_comp->zst_offset;
+ if (z_walk->zst_offset + diff == zh->zst_offset) {
+ z_walk->zst_offset = zh->zst_offset;
+ z_walk->zst_direction = diff < 0 ? -1 : 1;
+ z_walk->zst_stride =
+ diff * z_walk->zst_direction;
+ z_walk->zst_ph_offset =
+ zh->zst_offset + z_walk->zst_stride;
+ dmu_zfetch_stream_remove(zf, z_comp);
+ mutex_destroy(&z_comp->zst_lock);
+ kmem_free(z_comp, sizeof (zstream_t));
+
+ dmu_zfetch_dofetch(zf, z_walk);
+
+ rw_exit(&zf->zf_rwlock);
+ return (1);
+ }
+ }
+ }
+
+ rw_exit(&zf->zf_rwlock);
+ return (0);
+}
+
+/*
+ * Given a zstream_t, determine the bounds of the prefetch. Then call the
+ * routine that actually prefetches the individual blocks.  Each call also
+ * doubles the stream's block cap (up to zfetch_block_cap), so streams
+ * ramp up as they keep getting hits.
+ */
+static void
+dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
+{
+ uint64_t prefetch_tail;
+ uint64_t prefetch_limit;
+ uint64_t prefetch_ofst;
+ uint64_t prefetch_len;
+ uint64_t blocks_fetched;
+
+ zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len);
+ zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap);
+
+ /* resume at the previous prefetch point, or just past the access */
+ prefetch_tail = MAX((int64_t)zs->zst_ph_offset,
+ (int64_t)(zs->zst_offset + zs->zst_stride));
+ /*
+ * XXX: use a faster division method?
+ */
+ prefetch_limit = zs->zst_offset + zs->zst_len +
+ (zs->zst_cap * zs->zst_stride) / zs->zst_len;
+
+ while (prefetch_tail < prefetch_limit) {
+ prefetch_ofst = zs->zst_offset + zs->zst_direction *
+ (prefetch_tail - zs->zst_offset);
+
+ prefetch_len = zs->zst_len;
+
+ /*
+ * Don't prefetch beyond the end of the file, if working
+ * backwards.
+ */
+ if ((zs->zst_direction == ZFETCH_BACKWARD) &&
+ (prefetch_ofst > prefetch_tail)) {
+ prefetch_len += prefetch_ofst;
+ prefetch_ofst = 0;
+ }
+
+ /* don't prefetch more than we're supposed to */
+ if (prefetch_len > zs->zst_len)
+ break;
+
+ blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode,
+ prefetch_ofst, zs->zst_len);
+
+ prefetch_tail += zs->zst_stride;
+ /* stop if we've run out of stuff to prefetch */
+ if (blocks_fetched < zs->zst_len)
+ break;
+ }
+ /* remember where we stopped, and when, for reclaim/resume */
+ zs->zst_ph_offset = prefetch_tail;
+ zs->zst_last = lbolt;
+}
+
+/*
+ * This takes a pointer to a zfetch structure and a dnode. It performs the
+ * necessary setup for the zfetch structure, grokking data from the
+ * associated dnode: zeroed counters, an empty stream list, and a fresh
+ * rwlock.  Safe to call with zf == NULL (no-op).
+ */
+void
+dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
+{
+ if (zf == NULL) {
+ return;
+ }
+
+ zf->zf_dnode = dno;
+ zf->zf_stream_cnt = 0;
+ zf->zf_alloc_fail = 0;
+
+ list_create(&zf->zf_stream, sizeof (zstream_t),
+ offsetof(zstream_t, zst_node));
+
+ rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
+}
+
+/*
+ * This function computes the actual size, in blocks, that can be prefetched,
+ * and fetches it (one dbuf_prefetch() call per block).  Returns the number
+ * of blocks actually issued.
+ */
+static uint64_t
+dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
+{
+ uint64_t fetchsz;
+ uint64_t i;
+
+ fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);
+
+ for (i = 0; i < fetchsz; i++) {
+ dbuf_prefetch(dn, blkid + i);
+ }
+
+ return (fetchsz);
+}
+
+/*
+ * this function returns the number of blocks that would be prefetched, based
+ * upon the supplied dnode, blockid, and nblks. This is used so that we can
+ * update streams in place, and then prefetch with their old value after the
+ * fact. This way, we can delay the prefetch, but subsequent accesses to the
+ * stream won't result in the same data being prefetched multiple times.
+ * Requests starting beyond dn_maxblkid yield zero; requests running past
+ * it are clamped to the remaining blocks.
+ */
+static uint64_t
+dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
+{
+ uint64_t fetchsz;
+
+ if (blkid > dn->dn_maxblkid) {
+ return (0);
+ }
+
+ /* compute fetch size */
+ if (blkid + nblks > dn->dn_maxblkid) {
+ fetchsz = dn->dn_maxblkid - blkid;
+ ASSERT(blkid + fetchsz <= dn->dn_maxblkid);
+ } else {
+ fetchsz = nblks;
+ }
+
+
+ return (fetchsz);
+}
+
+/*
+ * given a zfetch and a zsearch structure, see if there is an associated zstream
+ * for this block read. If so, it starts a prefetch for the stream it
+ * located and returns true, otherwise it returns false.
+ *
+ * Locking: each match path takes the stream's zst_lock, then re-checks
+ * the match condition (it may have changed while acquiring the lock)
+ * and restarts the scan from "top" if it no longer holds.  When the
+ * loop breaks with zs != NULL, zst_lock is still held and is released
+ * after dmu_zfetch_dofetch() below.
+ */
+static int
+dmu_zfetch_find(zfetch_t *zf, zstream_t *zh)
+{
+ zstream_t *zs;
+ int64_t diff;
+ int rc = 0;
+
+ if (zh == NULL)
+ return (0);
+
+ /*
+ * XXX: This locking strategy is a bit coarse; however, it's impact has
+ * yet to be tested. If this turns out to be an issue, it can be
+ * modified in a number of different ways.
+ */
+
+ rw_enter(&zf->zf_rwlock, RW_READER);
+top:
+
+ for (zs = list_head(&zf->zf_stream); zs;
+ zs = list_next(&zf->zf_stream, zs)) {
+
+
+ if (zs->zst_len == 0) {
+ /* bogus stream */
+ continue;
+ }
+
+ /*
+ * NOTE(review): this relies on unsigned arithmetic --
+ * offsets below zst_offset wrap to a huge value and
+ * correctly fail the "< zst_len" test.
+ */
+ if (zh->zst_offset - zs->zst_offset < zs->zst_len) {
+ /* already fetched */
+ rw_exit(&zf->zf_rwlock);
+ return (1);
+ }
+
+ if (zh->zst_offset == zs->zst_offset + zs->zst_len) {
+ /* forward sequential access */
+
+ mutex_enter(&zs->zst_lock);
+
+ if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
+ mutex_exit(&zs->zst_lock);
+ goto top;
+ }
+
+ zs->zst_len += zh->zst_len;
+ diff = zs->zst_len - zfetch_block_cap;
+ if (diff > 0) {
+ /* cap the stream length: slide the window forward */
+ zs->zst_offset += diff;
+ zs->zst_len = zs->zst_len > diff ?
+ zs->zst_len - diff : 0;
+ }
+ zs->zst_direction = ZFETCH_FORWARD;
+
+ break;
+
+ } else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
+ /* backwards sequential access */
+
+ mutex_enter(&zs->zst_lock);
+
+ if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
+ mutex_exit(&zs->zst_lock);
+ goto top;
+ }
+
+ zs->zst_offset = zs->zst_offset > zh->zst_len ?
+ zs->zst_offset - zh->zst_len : 0;
+ zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
+ zs->zst_ph_offset - zh->zst_len : 0;
+ zs->zst_len += zh->zst_len;
+
+ diff = zs->zst_len - zfetch_block_cap;
+ if (diff > 0) {
+ zs->zst_ph_offset = zs->zst_ph_offset > diff ?
+ zs->zst_ph_offset - diff : 0;
+ zs->zst_len = zs->zst_len > diff ?
+ zs->zst_len - diff : zs->zst_len;
+ }
+ zs->zst_direction = ZFETCH_BACKWARD;
+
+ break;
+
+ } else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
+ zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
+ /* strided forward access */
+
+ mutex_enter(&zs->zst_lock);
+
+ if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
+ zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
+ mutex_exit(&zs->zst_lock);
+ goto top;
+ }
+
+ zs->zst_offset += zs->zst_stride;
+ zs->zst_direction = ZFETCH_FORWARD;
+
+ break;
+
+ } else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
+ zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
+ /* strided reverse access */
+
+ mutex_enter(&zs->zst_lock);
+
+ if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
+ zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
+ mutex_exit(&zs->zst_lock);
+ goto top;
+ }
+
+ zs->zst_offset = zs->zst_offset > zs->zst_stride ?
+ zs->zst_offset - zs->zst_stride : 0;
+ zs->zst_ph_offset = (zs->zst_ph_offset >
+ (2 * zs->zst_stride)) ?
+ (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
+ zs->zst_direction = ZFETCH_BACKWARD;
+
+ break;
+ }
+ }
+
+ /* matched a stream: zst_lock is still held from the loop above */
+ if (zs) {
+ rc = 1;
+ dmu_zfetch_dofetch(zf, zs);
+ mutex_exit(&zs->zst_lock);
+ }
+
+ rw_exit(&zf->zf_rwlock);
+ return (rc);
+}
+
+/*
+ * Clean-up state associated with a zfetch structure. This frees allocated
+ * structure members, empties the zf_stream tree, and generally makes things
+ * nice. This doesn't free the zfetch_t itself, that's left to the caller.
+ * The rwlock must not be held -- it is destroyed here.
+ */
+void
+dmu_zfetch_rele(zfetch_t *zf)
+{
+ zstream_t *zs;
+ zstream_t *zs_next;
+
+ ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
+
+ for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) {
+ zs_next = list_next(&zf->zf_stream, zs);
+
+ list_remove(&zf->zf_stream, zs);
+ mutex_destroy(&zs->zst_lock);
+ kmem_free(zs, sizeof (zstream_t));
+ }
+ list_destroy(&zf->zf_stream);
+ rw_destroy(&zf->zf_rwlock);
+
+ zf->zf_dnode = NULL;
+}
+
+/*
+ * Given a zfetch and zstream structure, insert the zstream structure into the
+ * stream list contained within the zfetch structure. Perform the appropriate
+ * book-keeping. It is possible that another thread has inserted a stream which
+ * matches one that we are about to insert, so we must be sure to check for this
+ * case. If one is found, return failure, and let the caller cleanup the
+ * duplicates.
+ */
+static int
+dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
+{
+ zstream_t *zs_walk;
+ zstream_t *zs_next;
+
+ ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
+
+ /* refuse to insert a duplicate of an existing stream */
+ for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) {
+ zs_next = list_next(&zf->zf_stream, zs_walk);
+
+ if (dmu_zfetch_streams_equal(zs_walk, zs)) {
+ return (0);
+ }
+ }
+
+ list_insert_head(&zf->zf_stream, zs);
+ zf->zf_stream_cnt++;
+
+ return (1);
+}
+
+
+/*
+ * Walk the list of zstreams in the given zfetch, find an old one (by time), and
+ * reclaim it for use by the caller.  The reclaimed stream is returned
+ * zeroed with its zst_lock destroyed; the caller (see dmu_zfetch())
+ * re-initializes the lock before reuse.  Returns NULL (and bumps
+ * zf_alloc_fail) if no stream is old enough.
+ */
+static zstream_t *
+dmu_zfetch_stream_reclaim(zfetch_t *zf)
+{
+ zstream_t *zs;
+
+ rw_enter(&zf->zf_rwlock, RW_WRITER);
+
+ for (zs = list_head(&zf->zf_stream); zs;
+ zs = list_next(&zf->zf_stream, zs)) {
+
+ if (((lbolt - zs->zst_last) / hz) > zfetch_min_sec_reap)
+ break;
+ }
+
+ if (zs) {
+ dmu_zfetch_stream_remove(zf, zs);
+ mutex_destroy(&zs->zst_lock);
+ bzero(zs, sizeof (zstream_t));
+ } else {
+ zf->zf_alloc_fail++;
+ }
+ rw_exit(&zf->zf_rwlock);
+
+ return (zs);
+}
+
+/*
+ * Given a zfetch and zstream structure, remove the zstream structure from its
+ * container in the zfetch structure. Perform the appropriate book-keeping.
+ */
+static void
+dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
+{
+ ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
+
+ list_remove(&zf->zf_stream, zs);
+ zf->zf_stream_cnt--;
+}
+
+/*
+ * Field-by-field equality test for two zstreams; used by
+ * dmu_zfetch_stream_insert() to reject duplicate streams.
+ * Returns 1 if equal, 0 otherwise.
+ */
+static int
+dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
+{
+ if (zs1->zst_offset != zs2->zst_offset)
+ return (0);
+
+ if (zs1->zst_len != zs2->zst_len)
+ return (0);
+
+ if (zs1->zst_stride != zs2->zst_stride)
+ return (0);
+
+ if (zs1->zst_ph_offset != zs2->zst_ph_offset)
+ return (0);
+
+ if (zs1->zst_cap != zs2->zst_cap)
+ return (0);
+
+ if (zs1->zst_direction != zs2->zst_direction)
+ return (0);
+
+ return (1);
+}
+
+/*
+ * This is the prefetch entry point. It calls all of the other dmu_zfetch
+ * routines to create, delete, find, or operate upon prefetch streams.
+ * "offset" and "size" are in bytes; they are converted to block numbers
+ * using the dnode's data block shift.
+ */
+void
+dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size)
+{
+ zstream_t zst;
+ zstream_t *newstream;
+ int fetched;
+ int inserted;
+ unsigned int blkshft;
+ uint64_t blksz;
+
+ /* files that aren't ln2 blocksz are only one block -- nothing to do */
+ if (!zf->zf_dnode->dn_datablkshift) {
+ return;
+ }
+
+ /* convert offset and size, into blockid and nblocks */
+ blkshft = zf->zf_dnode->dn_datablkshift;
+ blksz = (1 << blkshft);
+
+ bzero(&zst, sizeof (zstream_t));
+ zst.zst_offset = offset >> blkshft;
+ zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
+ P2ALIGN(offset, blksz)) >> blkshft;
+
+ /* try an existing stream first, then a co-linear pair */
+ fetched = dmu_zfetch_find(zf, &zst);
+ if (!fetched) {
+ fetched = dmu_zfetch_colinear(zf, &zst);
+ }
+
+ if (!fetched) {
+ newstream = dmu_zfetch_stream_reclaim(zf);
+
+ /*
+ * No stream could be reclaimed (the reclaim path has
+ * already dropped its lock); allocate one if we're
+ * under the per-file stream limit. Otherwise, give up
+ * and go home.
+ */
+ if (newstream == NULL) {
+ uint64_t maxblocks;
+ uint32_t max_streams;
+ uint32_t cur_streams;
+
+ cur_streams = zf->zf_stream_cnt;
+ maxblocks = zf->zf_dnode->dn_maxblkid;
+
+ max_streams = MIN(zfetch_max_streams,
+ (maxblocks / zfetch_block_cap));
+ if (max_streams == 0) {
+ max_streams++;
+ }
+
+ if (cur_streams >= max_streams) {
+ return;
+ }
+
+ newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
+ }
+
+ /* initialize the new (or reclaimed-and-zeroed) stream */
+ newstream->zst_offset = zst.zst_offset;
+ newstream->zst_len = zst.zst_len;
+ newstream->zst_stride = zst.zst_len;
+ newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
+ newstream->zst_cap = zst.zst_len;
+ newstream->zst_direction = ZFETCH_FORWARD;
+ newstream->zst_last = lbolt;
+
+ mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ rw_enter(&zf->zf_rwlock, RW_WRITER);
+ inserted = dmu_zfetch_stream_insert(zf, newstream);
+ rw_exit(&zf->zf_rwlock);
+
+ /* a racing thread inserted an identical stream; discard ours */
+ if (!inserted) {
+ mutex_destroy(&newstream->zst_lock);
+ kmem_free(newstream, sizeof (zstream_t));
+ }
+ }
+}
diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c
new file mode 100644
index 0000000000..6b25b35ab1
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dnode.c
@@ -0,0 +1,1304 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_zfetch.h>
+
+static int free_range_compar(const void *node1, const void *node2);
+
+static kmem_cache_t *dnode_cache;
+
+static dnode_phys_t dnode_phys_zero;
+
+int zfs_default_bs = SPA_MINBLOCKSHIFT;
+int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
+
+/*
+ * kmem cache constructor for dnode_t: zero the object and initialize
+ * its locks, refcounts, per-txg free-range AVL trees and dirty-dbuf
+ * lists, and the dbuf list.
+ */
+/* ARGSUSED */
+static int
+dnode_cons(void *arg, void *unused, int kmflag)
+{
+ int i;
+ dnode_t *dn = arg;
+ bzero(dn, sizeof (dnode_t));
+
+ rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ refcount_create(&dn->dn_holds);
+ refcount_create(&dn->dn_tx_holds);
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ avl_create(&dn->dn_ranges[i], free_range_compar,
+ sizeof (free_range_t),
+ offsetof(struct free_range, fr_node));
+ list_create(&dn->dn_dirty_dbufs[i],
+ sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_dirty_node[i]));
+ }
+
+ list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_link));
+
+ return (0);
+}
+
+/*
+ * kmem cache destructor for dnode_t: tear down everything dnode_cons
+ * created, in matching order.
+ */
+/* ARGSUSED */
+static void
+dnode_dest(void *arg, void *unused)
+{
+ int i;
+ dnode_t *dn = arg;
+
+ rw_destroy(&dn->dn_struct_rwlock);
+ mutex_destroy(&dn->dn_mtx);
+ mutex_destroy(&dn->dn_dbufs_mtx);
+ refcount_destroy(&dn->dn_holds);
+ refcount_destroy(&dn->dn_tx_holds);
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ avl_destroy(&dn->dn_ranges[i]);
+ list_destroy(&dn->dn_dirty_dbufs[i]);
+ }
+
+ list_destroy(&dn->dn_dbufs);
+}
+
+/* Create the global dnode_t kmem cache (module initialization). */
+void
+dnode_init(void)
+{
+ dnode_cache = kmem_cache_create("dnode_t",
+ sizeof (dnode_t),
+ 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
+}
+
+/* Destroy the global dnode_t kmem cache (module teardown). */
+void
+dnode_fini(void)
+{
+ kmem_cache_destroy(dnode_cache);
+}
+
+
+/*
+ * Debug-only consistency checks on an in-core dnode against its phys
+ * counterpart.  Most checks are gated by the ZFS_DEBUG_DNODE_VERIFY
+ * flag; takes dn_struct_rwlock as reader if not already write-held.
+ * No-op in non-debug builds.
+ */
+void
+dnode_verify(dnode_t *dn)
+{
+#ifdef ZFS_DEBUG
+ int drop_struct_lock = FALSE;
+
+ ASSERT(dn->dn_phys);
+ ASSERT(dn->dn_objset);
+
+ ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
+
+ if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
+ return;
+
+ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ drop_struct_lock = TRUE;
+ }
+ if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
+ int i;
+ ASSERT3U(dn->dn_indblkshift, >=, 0);
+ ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
+ if (dn->dn_datablkshift) {
+ ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
+ ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
+ ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
+ }
+ ASSERT3U(dn->dn_nlevels, <=, 30);
+ ASSERT3U(dn->dn_type, <=, DMU_OT_NUMTYPES);
+ ASSERT3U(dn->dn_nblkptr, >=, 1);
+ ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
+ ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ ASSERT3U(dn->dn_datablksz, ==,
+ dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
+ ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
+ dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
+ }
+ }
+ if (dn->dn_phys->dn_type != DMU_OT_NONE)
+ ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
+ ASSERT(IS_DNODE_DNODE(dn->dn_object) || dn->dn_dbuf);
+ if (dn->dn_dbuf != NULL) {
+ /* dn_phys must point into its containing dbuf's data */
+ ASSERT3P(dn->dn_phys, ==,
+ (dnode_phys_t *)dn->dn_dbuf->db.db_data +
+ (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
+ }
+ if (drop_struct_lock)
+ rw_exit(&dn->dn_struct_rwlock);
+#endif
+}
+
+/*
+ * Byteswap a single on-disk dnode in place: the fixed header fields,
+ * then the block pointers, then (if present) the typed bonus buffer.
+ * A DMU_OT_NONE dnode is simply zeroed.
+ */
+void
+dnode_byteswap(dnode_phys_t *dnp)
+{
+ uint64_t *buf64 = (void*)&dnp->dn_blkptr;
+ int i;
+
+ if (dnp->dn_type == DMU_OT_NONE) {
+ bzero(dnp, sizeof (dnode_phys_t));
+ return;
+ }
+
+ dnp->dn_type = BSWAP_8(dnp->dn_type);
+ dnp->dn_indblkshift = BSWAP_8(dnp->dn_indblkshift);
+ dnp->dn_nlevels = BSWAP_8(dnp->dn_nlevels);
+ dnp->dn_nblkptr = BSWAP_8(dnp->dn_nblkptr);
+ dnp->dn_bonustype = BSWAP_8(dnp->dn_bonustype);
+ dnp->dn_checksum = BSWAP_8(dnp->dn_checksum);
+ dnp->dn_compress = BSWAP_8(dnp->dn_compress);
+ dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
+ dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
+ dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
+ dnp->dn_secphys = BSWAP_64(dnp->dn_secphys);
+
+ /*
+ * dn_nblkptr is only one byte, so it's OK to read it in either
+ * byte order. We can't read dn_bonuslen.
+ */
+ ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
+ ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
+ for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
+ buf64[i] = BSWAP_64(buf64[i]);
+
+ /*
+ * OK to check dn_bonuslen for zero, because it won't matter if
+ * we have the wrong byte order. This is necessary because the
+ * dnode dnode is smaller than a regular dnode.
+ */
+ if (dnp->dn_bonuslen != 0) {
+ /*
+ * Note that the bonus length calculated here may be
+ * longer than the actual bonus buffer. This is because
+ * we always put the bonus buffer after the last block
+ * pointer (instead of packing it against the end of the
+ * dnode buffer).
+ */
+ int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
+ size_t len = DN_MAX_BONUSLEN - off;
+ dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len);
+ }
+}
+
+/*
+ * Byteswap a buffer full of on-disk dnodes ("size" bytes, which must be
+ * a whole number of dnode_phys_t entries), one dnode at a time.
+ */
+void
+dnode_buf_byteswap(void *vbuf, size_t size)
+{
+ dnode_phys_t *buf = vbuf;
+ int i;
+
+ ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
+ ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
+
+ size >>= DNODE_SHIFT;
+ for (i = 0; i < size; i++) {
+ dnode_byteswap(buf);
+ buf++;
+ }
+}
+
+/*
+ * AVL comparator for the per-txg free-range trees: orders ranges by
+ * starting block id.
+ */
+static int
+free_range_compar(const void *node1, const void *node2)
+{
+ const free_range_t *rp1 = node1;
+ const free_range_t *rp2 = node2;
+
+ if (rp1->fr_blkid < rp2->fr_blkid)
+ return (-1);
+ else if (rp1->fr_blkid > rp2->fr_blkid)
+ return (1);
+ else return (0);
+}
+
+/*
+ * Set the dnode's data block size and derived fields.  dn_datablkshift
+ * is only meaningful for power-of-2 sizes and is left 0 otherwise.
+ */
+static void
+dnode_setdblksz(dnode_t *dn, int size)
+{
+ ASSERT3U(P2PHASE(size, SPA_MINBLOCKSIZE), ==, 0);
+ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
+ ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
+ 1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
+ dn->dn_datablksz = size;
+ dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
+ dn->dn_datablkshift = ISP2(size) ? highbit(size - 1) : 0;
+}
+
+/*
+ * Construct an in-core dnode_t for the on-disk dnode 'dnp'.  'db' is
+ * the meta-dnode dbuf containing dnp (NULL for "special" dnodes such
+ * as the meta-dnode itself, see dnode_special_open()).  The new dnode
+ * is linked onto the objset's os_dnodes list; holds are the caller's
+ * responsibility.
+ */
+static dnode_t *
+dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
+    uint64_t object)
+{
+	dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
+	(void) dnode_cons(dn, NULL, 0); /* XXX manually re-run constructor */
+
+	dn->dn_objset = os;
+	dn->dn_object = object;
+	dn->dn_dbuf = db;
+	dn->dn_phys = dnp;
+
+	/* cache the on-disk fields that are read frequently */
+	if (dnp->dn_datablkszsec)
+		dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+	dn->dn_indblkshift = dnp->dn_indblkshift;
+	dn->dn_nlevels = dnp->dn_nlevels;
+	dn->dn_type = dnp->dn_type;
+	dn->dn_nblkptr = dnp->dn_nblkptr;
+	dn->dn_checksum = dnp->dn_checksum;
+	dn->dn_compress = dnp->dn_compress;
+	dn->dn_bonustype = dnp->dn_bonustype;
+	dn->dn_bonuslen = dnp->dn_bonuslen;
+	dn->dn_maxblkid = dnp->dn_maxblkid;
+
+	dmu_zfetch_init(&dn->dn_zfetch, dn);
+
+	ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
+	mutex_enter(&os->os_lock);
+	list_insert_head(&os->os_dnodes, dn);
+	mutex_exit(&os->os_lock);
+
+	return (dn);
+}
+
+/*
+ * Tear down an in-core dnode: unlink it from the objset's dnode list,
+ * release auxiliary state, and return it to the kmem cache.
+ * NOTE(review): no check of dn_holds is done here -- callers are
+ * assumed to guarantee the dnode is no longer referenced.
+ */
+static void
+dnode_destroy(dnode_t *dn)
+{
+	objset_impl_t *os = dn->dn_objset;
+
+	mutex_enter(&os->os_lock);
+	list_remove(&os->os_dnodes, dn);
+	mutex_exit(&os->os_lock);
+
+	/* dn_dirtyctx_firstset is a 1-byte debug tag; see dnode_allocate() */
+	if (dn->dn_dirtyctx_firstset) {
+		kmem_free(dn->dn_dirtyctx_firstset, 1);
+		dn->dn_dirtyctx_firstset = NULL;
+	}
+	dmu_zfetch_rele(&dn->dn_zfetch);
+	kmem_cache_free(dnode_cache, dn);
+}
+
+/*
+ * Initialize a freshly allocated (previously DMU_OT_NONE) dnode with
+ * type 'ot', data block size 'blocksize' (0 selects the default,
+ * 1 << zfs_default_bs), indirect block shift 'ibs' (0 selects
+ * zfs_default_ibs), and an optional bonus buffer of 'bonuslen' bytes
+ * of type 'bonustype'.  Marks the dnode dirty in 'tx'.
+ */
+void
+dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	int i;
+
+	if (blocksize == 0)
+		blocksize = 1 << zfs_default_bs;
+
+	/* clamp both tunables into their supported ranges */
+	blocksize = MIN(MAX(blocksize, SPA_MINBLOCKSIZE), SPA_MAXBLOCKSIZE);
+
+	if (ibs == 0)
+		ibs = zfs_default_ibs;
+
+	ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
+
+	dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset,
+	    dn->dn_object, tx->tx_txg, blocksize, ibs);
+
+	/* the dnode must be completely virgin: unallocated and clean */
+	ASSERT(dn->dn_type == DMU_OT_NONE);
+	ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
+	ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
+	ASSERT(ot != DMU_OT_NONE);
+	ASSERT3U(ot, <, DMU_OT_NUMTYPES);
+	/* bonus type and length must be both set or both clear */
+	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+	    (bonustype != DMU_OT_NONE && bonuslen != 0));
+	ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
+	ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+	ASSERT(dn->dn_type == DMU_OT_NONE); /* (duplicate of the check above) */
+	ASSERT3U(dn->dn_maxblkid, ==, 0);
+	ASSERT3U(dn->dn_allocated_txg, ==, 0);
+	ASSERT3U(dn->dn_assigned_txg, ==, 0);
+	ASSERT(refcount_is_zero(&dn->dn_tx_holds));
+	ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
+	ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+
+	for (i = 0; i < TXG_SIZE; i++) {
+		ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
+		ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
+		ASSERT3U(dn->dn_dirtyblksz[i], ==, 0);
+		ASSERT3P(list_head(&dn->dn_dirty_dbufs[i]), ==, NULL);
+		ASSERT3U(avl_numnodes(&dn->dn_ranges[i]), ==, 0);
+	}
+
+	dn->dn_type = ot;
+	dnode_setdblksz(dn, blocksize);
+	dn->dn_indblkshift = ibs;
+	dn->dn_nlevels = 1;
+	/* block pointers fill the space the bonus buffer doesn't use */
+	dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+	dn->dn_bonustype = bonustype;
+	dn->dn_bonuslen = bonuslen;
+	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
+	dn->dn_compress = ZIO_COMPRESS_INHERIT;
+	dn->dn_dirtyctx = 0;
+
+	dn->dn_free_txg = 0;
+	if (dn->dn_dirtyctx_firstset) {
+		kmem_free(dn->dn_dirtyctx_firstset, 1);
+		dn->dn_dirtyctx_firstset = NULL;
+	}
+
+	dn->dn_allocated_txg = tx->tx_txg;
+	dnode_setdirty(dn, tx);
+}
+
+/*
+ * Re-initialize an existing (allocated, clean) dnode with a new type,
+ * block size, and bonus configuration, freeing its old contents if the
+ * shape changes.  Unlike dnode_allocate(), the dnode keeps its object
+ * number and holds.
+ */
+void
+dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = NULL;
+
+	ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
+	ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
+	ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0);
+	ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+	ASSERT(!(dn->dn_object & DMU_PRIVATE_OBJECT) || dmu_tx_private_ok(tx));
+	ASSERT(tx->tx_txg != 0);
+	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+	    (bonustype != DMU_OT_NONE && bonuslen != 0));
+	ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
+	ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+	/* the dnode must not be dirty in any txg */
+	ASSERT(dn->dn_dirtyblksz[0] == 0);
+	ASSERT(dn->dn_dirtyblksz[1] == 0);
+	ASSERT(dn->dn_dirtyblksz[2] == 0);
+	ASSERT(dn->dn_dirtyblksz[3] == 0);
+
+	/*
+	 * XXX I should really have a generation number to tell if we
+	 * need to do this...
+	 */
+	if (blocksize != dn->dn_datablksz ||
+	    dn->dn_bonustype != bonustype || dn->dn_bonuslen != bonuslen) {
+		/* free all old data */
+		dnode_free_range(dn, 0, -1ULL, tx);
+	}
+
+	/* change blocksize */
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	dnode_setdblksz(dn, blocksize);
+	dnode_setdirty(dn, tx);
+	/* don't need dd_dirty_mtx, dnode is already dirty */
+	ASSERT(dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] != 0);
+	dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] = blocksize;
+	rw_exit(&dn->dn_struct_rwlock);
+
+	/* change type */
+	dn->dn_type = ot;
+
+	if (dn->dn_bonuslen != bonuslen) {
+		/* change bonus size: resize the bonus dbuf to match */
+		if (bonuslen == 0)
+			bonuslen = 1; /* XXX can't have a zero-length bonus */
+		db = dbuf_hold_bonus(dn, FTAG);
+		dbuf_read(db);
+		mutex_enter(&db->db_mtx);
+		ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
+		ASSERT(db->db.db_data != NULL);
+		db->db.db_size = bonuslen;
+		mutex_exit(&db->db_mtx);
+		dbuf_dirty(db, tx);
+	}
+
+	/* change bonus size and type */
+	mutex_enter(&dn->dn_mtx);
+	dn->dn_bonustype = bonustype;
+	dn->dn_bonuslen = bonuslen;
+	dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
+	dn->dn_compress = ZIO_COMPRESS_INHERIT;
+	ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
+
+	dn->dn_allocated_txg = tx->tx_txg;
+	mutex_exit(&dn->dn_mtx);
+
+	/* drop the bonus-dbuf hold taken above, if any */
+	if (db)
+		dbuf_remove_ref(db, FTAG);
+}
+
+/*
+ * Close a "special" dnode opened with dnode_special_open().
+ * NOTE(review): assumes no outstanding holds remain; dnode_destroy()
+ * does not check.
+ */
+void
+dnode_special_close(dnode_t *dn)
+{
+	dnode_destroy(dn);
+}
+
+/*
+ * Open a "special" dnode -- one whose dnode_phys_t is not stored in
+ * the meta-dnode (so it has no containing dbuf), e.g. the meta-dnode
+ * itself.  Must be paired with dnode_special_close().
+ */
+dnode_t *
+dnode_special_open(objset_impl_t *os, dnode_phys_t *dnp, uint64_t object)
+{
+	dnode_t *dn = dnode_create(os, dnp, NULL, object);
+	dnode_verify(dn);
+	return (dn);
+}
+
+/*
+ * dbuf evict callback for a block of the meta-dnode: destroy the
+ * in-core child dnodes instantiated from this block (see
+ * dnode_hold_impl()) and free the children array itself.
+ */
+static void
+dnode_buf_pageout(dmu_buf_t *db, void *arg)
+{
+	dnode_t **children_dnodes = arg;
+	int i;
+	int epb = db->db_size >> DNODE_SHIFT;
+
+	for (i = 0; i < epb; i++) {
+		dnode_t *dn = children_dnodes[i];
+
+		if (dn == NULL)
+			continue;
+#ifdef ZFS_DEBUG
+		{
+			/*
+			 * 'n' is only needed for the debug checks; keep it
+			 * inside the #ifdef so non-debug builds don't get
+			 * an unused-variable warning.
+			 */
+			int n;
+
+			/*
+			 * If there are holds on this dnode, then there should
+			 * be holds on the dnode's containing dbuf as well; thus
+			 * it wouldn't be eligible for eviction and this
+			 * function would not have been called.
+			 */
+			ASSERT(refcount_is_zero(&dn->dn_holds));
+			ASSERT(list_head(&dn->dn_dbufs) == NULL);
+			ASSERT(refcount_is_zero(&dn->dn_tx_holds));
+
+			for (n = 0; n < TXG_SIZE; n++)
+				ASSERT(dn->dn_dirtyblksz[n] == 0);
+		}
+#endif
+		children_dnodes[i] = NULL;
+		dnode_destroy(dn);
+	}
+	kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+}
+
+/*
+ * Returns held dnode if the object number is valid, NULL if not.
+ * Note that this will succeed even for free dnodes, unless 'flag'
+ * contains DNODE_MUST_BE_ALLOCATED / DNODE_MUST_BE_FREE.  Pairs with
+ * dnode_rele(ref).
+ */
+dnode_t *
+dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, void *ref)
+{
+	int epb, idx;
+	int drop_struct_lock = FALSE;
+	uint64_t blk;
+	dnode_t *mdn, *dn;
+	dmu_buf_impl_t *db;
+	dnode_t **children_dnodes;
+
+	if (object == 0 || object >= DN_MAX_OBJECT)
+		return (NULL);
+
+	mdn = os->os_meta_dnode;
+
+	dnode_verify(mdn);
+
+	/* dbuf_hold() needs the meta-dnode's struct lock held */
+	if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
+		rw_enter(&mdn->dn_struct_rwlock, RW_READER);
+		drop_struct_lock = TRUE;
+	}
+
+	/* find and read the meta-dnode block containing this object */
+	blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
+
+	db = dbuf_hold(mdn, blk);
+	if (drop_struct_lock)
+		rw_exit(&mdn->dn_struct_rwlock);
+	dbuf_read(db);
+
+	ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
+	epb = db->db.db_size >> DNODE_SHIFT;
+
+	idx = object & (epb-1);
+
+	/*
+	 * The dbuf's user data is an array of in-core dnodes, one slot
+	 * per dnode in the block.  Create it on first use; if another
+	 * thread raced us and installed one first, use theirs.
+	 */
+	children_dnodes = dmu_buf_get_user(&db->db);
+	if (children_dnodes == NULL) {
+		dnode_t **winner;
+		children_dnodes = kmem_zalloc(epb * sizeof (dnode_t *),
+		    KM_SLEEP);
+		if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
+		    dnode_buf_pageout)) {
+			kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+			children_dnodes = winner;
+		}
+	}
+
+	/* same race-to-install pattern for the individual dnode slot */
+	if ((dn = children_dnodes[idx]) == NULL) {
+		dnode_t *winner;
+		dn = dnode_create(os, (dnode_phys_t *)db->db.db_data+idx,
+		    db, object);
+		winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn);
+		if (winner != NULL) {
+			dnode_destroy(dn);
+			dn = winner;
+		}
+	}
+
+	/* reject dnodes that don't match the caller's requirements */
+	mutex_enter(&dn->dn_mtx);
+	if (dn->dn_free_txg ||
+	    ((flag & DNODE_MUST_BE_ALLOCATED) && dn->dn_type == DMU_OT_NONE) ||
+	    ((flag & DNODE_MUST_BE_FREE) && dn->dn_type != DMU_OT_NONE)) {
+		mutex_exit(&dn->dn_mtx);
+		dbuf_rele(db);
+		return (NULL);
+	}
+	mutex_exit(&dn->dn_mtx);
+
+	/* the first hold on the dnode also holds its containing dbuf */
+	if (refcount_add(&dn->dn_holds, ref) == 1)
+		dbuf_add_ref(db, dn);
+
+	dnode_verify(dn);
+	ASSERT3P(dn->dn_dbuf, ==, db);
+	ASSERT3U(dn->dn_object, ==, object);
+	dbuf_rele(db);
+
+	return (dn);
+}
+
+/*
+ * Return held dnode if the object is allocated, NULL if not.
+ * Convenience wrapper around dnode_hold_impl() with
+ * DNODE_MUST_BE_ALLOCATED; release with dnode_rele(ref).
+ */
+dnode_t *
+dnode_hold(objset_impl_t *os, uint64_t object, void *ref)
+{
+	return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, ref));
+}
+
+/*
+ * Add an additional hold ('ref' is an opaque tag) to an already-held
+ * dnode.  The dnode must have at least one hold; use dnode_hold() to
+ * acquire the first one.
+ */
+void
+dnode_add_ref(dnode_t *dn, void *ref)
+{
+	ASSERT(refcount_count(&dn->dn_holds) > 0);
+	(void) refcount_add(&dn->dn_holds, ref);
+}
+
+/*
+ * Drop the hold tagged 'ref'.  When the last hold goes away, release
+ * the dnode's hold on its containing meta-dnode dbuf (taken in
+ * dnode_hold_impl()).
+ */
+void
+dnode_rele(dnode_t *dn, void *ref)
+{
+	uint64_t refs;
+
+	refs = refcount_remove(&dn->dn_holds, ref);
+	/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
+	if (refs == 0 && dn->dn_dbuf)
+		dbuf_remove_ref(dn->dn_dbuf, dn);
+}
+
+/*
+ * Mark the dnode dirty in this transaction group: record its block
+ * size in dn_dirtyblksz[], put it on the objset's per-txg dirty (or
+ * free) dnode list, take a "dirty hold", and dirty the containing
+ * meta-dnode dbuf and the dataset.  No-op for the special dnodes and
+ * when already dirty in this txg.
+ */
+void
+dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
+{
+	objset_impl_t *os = dn->dn_objset;
+	uint64_t txg = tx->tx_txg;
+
+	if (IS_DNODE_DNODE(dn->dn_object))
+		return;
+
+	dnode_verify(dn);
+
+#ifdef ZFS_DEBUG
+	mutex_enter(&dn->dn_mtx);
+	ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
+	/* ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); */
+	mutex_exit(&dn->dn_mtx);
+#endif
+
+	mutex_enter(&os->os_lock);
+
+	/*
+	 * If we are already marked dirty, we're done.
+	 */
+	if (dn->dn_dirtyblksz[txg&TXG_MASK] > 0) {
+		mutex_exit(&os->os_lock);
+		return;
+	}
+
+	ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
+	ASSERT(dn->dn_datablksz != 0);
+	dn->dn_dirtyblksz[txg&TXG_MASK] = dn->dn_datablksz;
+
+	dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
+	    dn->dn_object, txg);
+
+	/* a dnode being freed in this txg syncs from the free list instead */
+	if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) {
+		list_insert_tail(&os->os_free_dnodes[txg&TXG_MASK], dn);
+	} else {
+		list_insert_tail(&os->os_dirty_dnodes[txg&TXG_MASK], dn);
+	}
+
+	mutex_exit(&os->os_lock);
+
+	/*
+	 * The dnode maintains a hold on its containing dbuf as
+	 * long as there are holds on it.  Each instantiated child
+	 * dbuf maintains a hold on the dnode.  When the last child
+	 * drops its hold, the dnode will drop its hold on the
+	 * containing dbuf.  We add a "dirty hold" here so that the
+	 * dnode will hang around after we finish processing its
+	 * children.
+	 */
+	(void) refcount_add(&dn->dn_holds, (void *)(uintptr_t)tx->tx_txg);
+
+	dbuf_dirty(dn->dn_dbuf, tx);
+
+	dsl_dataset_dirty(os->os_dsl_dataset, tx);
+}
+
+/*
+ * Schedule this dnode to be freed in 'tx': record dn_free_txg and make
+ * sure the dnode ends up on the objset's per-txg free list (moving it
+ * off the dirty list if it was already dirty).  The actual freeing
+ * happens in syncing context.  No-op if already free or being freed.
+ */
+void
+dnode_free(dnode_t *dn, dmu_tx_t *tx)
+{
+	dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg);
+
+	/* we should be the only holder... hopefully */
+	/* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */
+
+	mutex_enter(&dn->dn_mtx);
+	if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
+		mutex_exit(&dn->dn_mtx);
+		return;
+	}
+	dn->dn_free_txg = tx->tx_txg;
+	mutex_exit(&dn->dn_mtx);
+
+	/*
+	 * If the dnode is already dirty, it needs to be moved from
+	 * the dirty list to the free list.
+	 */
+	mutex_enter(&dn->dn_objset->os_lock);
+	if (dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] > 0) {
+		list_remove(
+		    &dn->dn_objset->os_dirty_dnodes[tx->tx_txg&TXG_MASK], dn);
+		list_insert_tail(
+		    &dn->dn_objset->os_free_dnodes[tx->tx_txg&TXG_MASK], dn);
+		mutex_exit(&dn->dn_objset->os_lock);
+	} else {
+		/* not yet dirty: dnode_setdirty() puts it on the free list */
+		mutex_exit(&dn->dn_objset->os_lock);
+		dnode_setdirty(dn, tx);
+	}
+}
+
+/*
+ * Try to change the block size for the indicated dnode.  This can only
+ * succeed if there are no blocks allocated or dirty beyond the first
+ * block.  'size' is rounded/clamped to a legal SPA block size (0 means
+ * SPA_MINBLOCKSIZE) and 'ibs' of 0 keeps the current indirect shift.
+ * Returns 0 on success, ENOTSUP if the change cannot be made.
+ */
+int
+dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db, *db_next;
+	int have_db0 = FALSE;
+	int err = ENOTSUP;
+
+	if (size == 0)
+		size = SPA_MINBLOCKSIZE;
+	if (size > SPA_MAXBLOCKSIZE)
+		size = SPA_MAXBLOCKSIZE;
+	else
+		size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
+
+	if (ibs == 0)
+		ibs = dn->dn_indblkshift;
+
+	/* nothing to do if both values are already in effect */
+	if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec &&
+	    ibs == dn->dn_indblkshift)
+		return (0);
+
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+
+	/* Check for any allocated blocks beyond the first */
+	if (dn->dn_phys->dn_maxblkid != 0)
+		goto end;
+
+	/*
+	 * Any buffers allocated for blocks beyond the first
+	 * must be evictable/evicted, because they're the wrong size.
+	 */
+	mutex_enter(&dn->dn_dbufs_mtx);
+	/*
+	 * Since we have the dn_dbufs_mtx, nothing can be
+	 * removed from dn_dbufs.  Since we have dn_struct_rwlock/w,
+	 * nothing can be added to dn_dbufs.
+	 */
+	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
+		db_next = list_next(&dn->dn_dbufs, db);
+
+		if (db->db_blkid == 0) {
+			have_db0 = TRUE;
+		} else if (db->db_blkid != DB_BONUS_BLKID) {
+			/* a dbuf beyond block 0: can't resize */
+			mutex_exit(&dn->dn_dbufs_mtx);
+			goto end;
+		}
+	}
+	mutex_exit(&dn->dn_dbufs_mtx);
+
+	/* Fast-track if there is no data in the file */
+	if (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) && !have_db0) {
+		dnode_setdblksz(dn, size);
+		dn->dn_indblkshift = ibs;
+		dnode_setdirty(dn, tx);
+		/* don't need dd_dirty_mtx, dnode is already dirty */
+		dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] = size;
+		dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
+		rw_exit(&dn->dn_struct_rwlock);
+		return (0);
+	}
+
+	/* obtain the old block */
+	db = dbuf_hold(dn, 0);
+
+	/* Not allowed to decrease the size if there is data present */
+	if (size < db->db.db_size) {
+		dbuf_rele(db);
+		goto end;
+	}
+
+	dbuf_new_size(db, size, tx);
+
+	dnode_setdblksz(dn, size);
+	dn->dn_indblkshift = ibs;
+	/* don't need dd_dirty_mtx, dnode is already dirty */
+	dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] = size;
+	dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
+	dbuf_rele(db);
+
+	err = 0;
+end:
+	rw_exit(&dn->dn_struct_rwlock);
+	return (err);
+}
+
+/*
+ * Return an upper bound on the offset of the last nonzero byte of this
+ * object: the end of the last allocated block, or 0 when the object
+ * has no data at all (maxblkid 0 and a hole in blkptr[0]).
+ */
+uint64_t
+dnode_max_nonzero_offset(dnode_t *dn)
+{
+	dnode_phys_t *dnp = dn->dn_phys;
+
+	if (dnp->dn_maxblkid == 0 && BP_IS_HOLE(&dnp->dn_blkptr[0]))
+		return (0);
+
+	return ((dnp->dn_maxblkid + 1) * dn->dn_datablksz);
+}
+
+/*
+ * Called when block 'blkid' is first used: grow dn_maxblkid and, if
+ * the new block id no longer fits under the current indirection tree,
+ * raise dn_nlevels (recording it in dn_next_nlevels[] for syncing
+ * context) and dirty the leftmost indirect at the old top level so
+ * the existing block pointers get migrated under the new root.
+ */
+void
+dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
+{
+	uint64_t txgoff = tx->tx_txg & TXG_MASK;
+	int drop_struct_lock = FALSE;
+	int epbs, old_nlevels, new_nlevels;
+	uint64_t sz;
+
+	/* the bonus buffer is not part of the block tree */
+	if (blkid == DB_BONUS_BLKID)
+		return;
+
+	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+		drop_struct_lock = TRUE;
+	}
+
+	if (blkid > dn->dn_maxblkid)
+		dn->dn_maxblkid = blkid;
+
+	/*
+	 * Compute the number of levels necessary to support the
+	 * new blkid.  The "sz >= dn->dn_nblkptr" clause terminates
+	 * the loop if the shift ever overflows sz to a smaller value.
+	 */
+	new_nlevels = 1;
+	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+	for (sz = dn->dn_nblkptr; sz <= blkid && sz >= dn->dn_nblkptr;
+	    sz <<= epbs)
+		new_nlevels++;
+	old_nlevels = dn->dn_nlevels;
+
+	if (new_nlevels > dn->dn_next_nlevels[txgoff])
+		dn->dn_next_nlevels[txgoff] = new_nlevels;
+
+	if (new_nlevels > old_nlevels) {
+		dprintf("dn %p increasing nlevels from %u to %u\n",
+		    dn, dn->dn_nlevels, new_nlevels);
+		dn->dn_nlevels = new_nlevels;
+	}
+
+	/*
+	 * Dirty the left indirects.
+	 * Note: the caller should have just dnode_use_space()'d one
+	 * data block's worth, so we could subtract that out of
+	 * dn_inflight_data to determine if there is any dirty data
+	 * besides this block.
+	 * We don't strictly need to dirty them unless there's
+	 * *something* in the object (eg. on disk or dirty)...
+	 */
+	if (new_nlevels > old_nlevels) {
+		dmu_buf_impl_t *db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
+		dprintf("dn %p dirtying left indirects\n", dn);
+		dbuf_dirty(db, tx);
+		dbuf_remove_ref(db, FTAG);
+	}
+#ifdef ZFS_DEBUG
+	else if (old_nlevels > 1 && new_nlevels > old_nlevels) {
+		/*
+		 * NOTE(review): this branch is unreachable -- its
+		 * condition contradicts the "if" above.  Presumably it
+		 * was meant to verify the nlevels-unchanged case;
+		 * confirm the intent before changing the condition.
+		 */
+		dmu_buf_impl_t *db;
+		int i;
+
+		for (i = 0; i < dn->dn_nblkptr; i++) {
+			db = dbuf_hold_level(dn, old_nlevels-1, i, FTAG);
+			ASSERT(!
+			    list_link_active(&db->db_dirty_node[txgoff]));
+			dbuf_remove_ref(db, FTAG);
+		}
+	}
+#endif
+
+	dprintf("dn %p done\n", dn);
+
+	/* (removed unused "out:" label; nothing in this function jumps to it) */
+	if (drop_struct_lock)
+		rw_exit(&dn->dn_struct_rwlock);
+}
+
+/*
+ * Remove blocks [blkid, blkid+nblks) from any free ranges already
+ * recorded for this txg in dn_ranges, trimming, shrinking, or
+ * splitting each overlapping entry as needed.  Caller must hold
+ * dn_mtx.
+ */
+void
+dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+{
+	avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
+	avl_index_t where;
+	free_range_t *rp;
+	free_range_t rp_tofind;
+	uint64_t endblk = blkid + nblks;
+
+	ASSERT(MUTEX_HELD(&dn->dn_mtx));
+	ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */
+
+	dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
+	    blkid, nblks, tx->tx_txg);
+	/* start from the nearest range at or before blkid, if any */
+	rp_tofind.fr_blkid = blkid;
+	rp = avl_find(tree, &rp_tofind, &where);
+	if (rp == NULL)
+		rp = avl_nearest(tree, where, AVL_BEFORE);
+	if (rp == NULL)
+		rp = avl_nearest(tree, where, AVL_AFTER);
+
+	/* walk forward over every range that can overlap [blkid, endblk) */
+	while (rp && (rp->fr_blkid <= blkid + nblks)) {
+		uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks;
+		free_range_t *nrp = AVL_NEXT(tree, rp);
+
+		if (blkid <= rp->fr_blkid && endblk >= fr_endblk) {
+			/* clear this entire range */
+			avl_remove(tree, rp);
+			kmem_free(rp, sizeof (free_range_t));
+		} else if (blkid <= rp->fr_blkid &&
+		    endblk > rp->fr_blkid && endblk < fr_endblk) {
+			/* clear the beginning of this range */
+			rp->fr_blkid = endblk;
+			rp->fr_nblks = fr_endblk - endblk;
+		} else if (blkid > rp->fr_blkid && blkid < fr_endblk &&
+		    endblk >= fr_endblk) {
+			/* clear the end of this range */
+			rp->fr_nblks = blkid - rp->fr_blkid;
+		} else if (blkid > rp->fr_blkid && endblk < fr_endblk) {
+			/* clear a chunk out of this range */
+			free_range_t *new_rp =
+			    kmem_alloc(sizeof (free_range_t), KM_SLEEP);
+
+			new_rp->fr_blkid = endblk;
+			new_rp->fr_nblks = fr_endblk - endblk;
+			avl_insert_here(tree, new_rp, rp, AVL_AFTER);
+			rp->fr_nblks = blkid - rp->fr_blkid;
+		}
+		/* there may be no overlap */
+		rp = nrp;
+	}
+}
+
+/*
+ * Free the byte range [off, off+len) of this object; len == -1ULL
+ * means "through the end of the object" (truncate).  Partial blocks at
+ * the edges of the range are zeroed in place; whole blocks are added
+ * to the per-txg dn_ranges tree and actually freed in syncing context
+ * (see dbuf_free_range()).
+ */
+void
+dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db;
+	uint64_t start, objsize, blkid, nblks;
+	int blkshift, blksz, tail, head, epbs;
+	int trunc = FALSE;
+
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	blksz = dn->dn_datablksz;
+	blkshift = dn->dn_datablkshift;
+	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+	/* If the range is past the end of the file, this is a no-op */
+	objsize = blksz * (dn->dn_maxblkid+1);
+	if (off >= objsize)
+		goto out;
+	if (len == -1ULL) {
+		len = UINT64_MAX - off;
+		trunc = TRUE;
+	}
+
+	/*
+	 * First, block align the region to free:
+	 */
+	if (dn->dn_maxblkid == 0) {
+		/* single (possibly non-power-of-2) block object */
+		if (off == 0) {
+			head = 0;
+		} else {
+			head = blksz - off;
+			ASSERT3U(head, >, 0);
+		}
+		start = off;
+	} else {
+		ASSERT(ISP2(blksz));
+		head = P2NPHASE(off, blksz);
+		start = P2PHASE(off, blksz);
+	}
+	/* zero out any partial block data at the start of the range */
+	if (head) {
+		ASSERT3U(start + head, ==, blksz);
+		if (len < head)
+			head = len;
+		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
+		    FTAG, &db) == 0) {
+			caddr_t data;
+
+			/* don't dirty if it isn't on disk and isn't dirty */
+			if (db->db_dirtied ||
+			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
+				/* drop struct lock while dirtying the dbuf */
+				rw_exit(&dn->dn_struct_rwlock);
+				dbuf_will_dirty(db, tx);
+				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+				data = db->db.db_data;
+				bzero(data + start, head);
+			}
+			dbuf_remove_ref(db, FTAG);
+		}
+		off += head;
+		len -= head;
+	}
+	/* If the range was less than one block, we are done */
+	if (len == 0)
+		goto out;
+
+	/* If the remaining range is past the end of the file, we are done */
+	if (off > dn->dn_maxblkid << blkshift)
+		goto out;
+
+	/* avoid overflow when computing the tail of a truncate-to-end */
+	if (off + len == UINT64_MAX)
+		tail = 0;
+	else
+		tail = P2PHASE(len, blksz);
+
+	ASSERT3U(P2PHASE(off, blksz), ==, 0);
+	/* zero out any partial block data at the end of the range */
+	if (tail) {
+		if (len < tail)
+			tail = len;
+		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
+		    TRUE, FTAG, &db) == 0) {
+			/* don't dirty if it isn't on disk and isn't dirty */
+			if (db->db_dirtied ||
+			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
+				rw_exit(&dn->dn_struct_rwlock);
+				dbuf_will_dirty(db, tx);
+				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+				bzero(db->db.db_data, tail);
+			}
+			dbuf_remove_ref(db, FTAG);
+		}
+		len -= tail;
+	}
+	/* If the range did not include a full block, we are done */
+	if (len == 0)
+		goto out;
+
+	/* dirty the left indirects */
+	if (dn->dn_nlevels > 1 && off != 0) {
+		db = dbuf_hold_level(dn, 1,
+		    (off - head) >> (blkshift + epbs), FTAG);
+		dbuf_will_dirty(db, tx);
+		dbuf_remove_ref(db, FTAG);
+	}
+
+	/* dirty the right indirects */
+	if (dn->dn_nlevels > 1 && !trunc) {
+		db = dbuf_hold_level(dn, 1,
+		    (off + len + tail - 1) >> (blkshift + epbs), FTAG);
+		dbuf_will_dirty(db, tx);
+		dbuf_remove_ref(db, FTAG);
+	}
+
+	/*
+	 * Finally, add this range to the dnode range list, we
+	 * will finish up this free operation in the syncing phase.
+	 */
+	ASSERT(IS_P2ALIGNED(off, 1<<blkshift));
+	ASSERT(off + len == UINT64_MAX || IS_P2ALIGNED(len, 1<<blkshift));
+	blkid = off >> blkshift;
+	nblks = len >> blkshift;
+
+	if (trunc)
+		dn->dn_maxblkid = (blkid ? blkid - 1 : 0);
+
+	mutex_enter(&dn->dn_mtx);
+	/* a re-free of these blocks supersedes any earlier record */
+	dnode_clear_range(dn, blkid, nblks, tx);
+	{
+		free_range_t *rp, *found;
+		avl_index_t where;
+		avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
+
+		/* Add new range to dn_ranges */
+		rp = kmem_alloc(sizeof (free_range_t), KM_SLEEP);
+		rp->fr_blkid = blkid;
+		rp->fr_nblks = nblks;
+		found = avl_find(tree, rp, &where);
+		ASSERT(found == NULL);
+		avl_insert(tree, rp, where);
+		dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
+		    blkid, nblks, tx->tx_txg);
+	}
+	mutex_exit(&dn->dn_mtx);
+
+	dbuf_free_range(dn, blkid, nblks, tx);
+	dnode_setdirty(dn, tx);
+out:
+	rw_exit(&dn->dn_struct_rwlock);
+}
+
+/*
+ * Return TRUE if this blkid was freed in a recent txg (i.e. appears in
+ * one of the per-txg dn_ranges trees, or the whole dnode is being
+ * freed), FALSE if it wasn't.
+ */
+uint64_t
+dnode_block_freed(dnode_t *dn, uint64_t blkid)
+{
+	free_range_t range_tofind;
+	void *dp = spa_get_dsl(dn->dn_objset->os_spa);
+	int i;
+
+	/* the bonus buffer is never covered by a free range */
+	if (blkid == DB_BONUS_BLKID)
+		return (FALSE);
+
+	/*
+	 * If we're in the process of opening the pool, dp will not be
+	 * set yet, but there shouldn't be anything dirty.
+	 */
+	if (dp == NULL)
+		return (FALSE);
+
+	if (dn->dn_free_txg)
+		return (TRUE);
+
+	/*
+	 * If dn_datablkshift is not set, then there's only a single
+	 * block, in which case there will never be a free range so it
+	 * won't matter.
+	 */
+	range_tofind.fr_blkid = blkid;
+	mutex_enter(&dn->dn_mtx);
+	for (i = 0; i < TXG_SIZE; i++) {
+		free_range_t *range_found;
+		avl_index_t idx;
+
+		/* exact-start match, or a preceding range that covers blkid */
+		range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx);
+		if (range_found) {
+			ASSERT(range_found->fr_nblks > 0);
+			break;
+		}
+		range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE);
+		if (range_found &&
+		    range_found->fr_blkid + range_found->fr_nblks > blkid)
+			break;
+	}
+	mutex_exit(&dn->dn_mtx);
+	/* i < TXG_SIZE iff one of the loops above broke out on a match */
+	return (i < TXG_SIZE);
+}
+
+/*
+ * Call from syncing context when we actually write/free space for this
+ * dnode: adjust the on-disk dn_secphys accounting by 'space' bytes
+ * (positive = used, negative = freed).  'space' must be a whole number
+ * of DEV_BSIZE sectors.
+ */
+void
+dnode_diduse_space(dnode_t *dn, int64_t space)
+{
+	uint64_t sectors;
+
+	dprintf_dnode(dn, "dn=%p dnp=%p secphys=%llu space=%lld\n",
+	    dn, dn->dn_phys,
+	    (u_longlong_t)dn->dn_phys->dn_secphys,
+	    (longlong_t)space);
+
+	ASSERT(P2PHASE(space, 1<<DEV_BSHIFT) == 0);
+
+	mutex_enter(&dn->dn_mtx);
+	if (space > 0) {
+		sectors = space >> DEV_BSHIFT;
+		/* overflow check: the new total must not wrap */
+		ASSERT3U(dn->dn_phys->dn_secphys + sectors, >=,
+		    dn->dn_phys->dn_secphys);
+		dn->dn_phys->dn_secphys += sectors;
+	} else {
+		sectors = -space >> DEV_BSHIFT;
+		/* underflow check: can't free more than is accounted */
+		ASSERT3U(dn->dn_phys->dn_secphys, >=, sectors);
+		dn->dn_phys->dn_secphys -= sectors;
+	}
+	mutex_exit(&dn->dn_mtx);
+}
+
+/*
+ * Call when we think we're going to write/free space in open context.
+ * Be conservative (ie. OK to write less than this or free more than
+ * this, but don't write more or free less).  Positive 'space' is
+ * inflated by spa_get_asize() to account for raidz/copies overhead
+ * before being charged to the dsl dir and the tx.
+ */
+void
+dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
+{
+	objset_impl_t *os = dn->dn_objset;
+	dsl_dataset_t *ds = os->os_dsl_dataset;
+
+	if (space > 0)
+		space = spa_get_asize(os->os_spa, space);
+
+	/* ds may be NULL, e.g. for objsets not owned by a dataset */
+	if (ds)
+		dsl_dir_willuse_space(ds->ds_dir, space, tx);
+
+	dmu_tx_willuse_space(tx, space);
+}
+
+/*
+ * Scan one level of the block tree for the next hole (hole == TRUE) or
+ * data (hole == FALSE) at or after *offset, advancing *offset as we
+ * go.  Returns 0 on a match at this level, ESRCH when the level is
+ * exhausted, or another errno from dbuf_hold_impl().  Helper for
+ * dnode_next_offset().
+ */
+static int
+dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
+    int lvl, uint64_t blkfill)
+{
+	dmu_buf_impl_t *db = NULL;
+	void *data = NULL;
+	uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	uint64_t epb = 1ULL << epbs;
+	uint64_t minfill, maxfill;
+	int i, error, span;
+
+	dprintf("probing object %llu offset %llx level %d of %u\n",
+	    dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
+
+	if (lvl == dn->dn_phys->dn_nlevels) {
+		/* top level: the block pointers embedded in the dnode */
+		error = 0;
+		epb = dn->dn_phys->dn_nblkptr;
+		data = dn->dn_phys->dn_blkptr;
+	} else {
+		uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
+		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
+		if (error) {
+			/* a missing block is all holes */
+			if (error == ENOENT)
+				return (hole ? 0 : ESRCH);
+			return (error);
+		}
+		dbuf_read_havestruct(db);
+		data = db->db.db_data;
+	}
+
+	if (lvl == 0) {
+		/* L0 of the meta-dnode: scan individual dnode slots */
+		dnode_phys_t *dnp = data;
+		span = DNODE_SHIFT;
+		ASSERT(dn->dn_type == DMU_OT_DNODE);
+
+		for (i = (*offset >> span) & (blkfill - 1); i < blkfill; i++) {
+			/*
+			 * (!dn_type) is 1 for a free slot, 0 for an
+			 * allocated one, so it equals 'hole' exactly
+			 * when this slot is the kind we're seeking.
+			 */
+			if (!dnp[i].dn_type == hole)
+				break;
+			*offset += 1ULL << span;
+		}
+		if (i == blkfill)
+			error = ESRCH;
+	} else {
+		/* indirect level: use blk_fill to find a matching subtree */
+		blkptr_t *bp = data;
+		span = (lvl - 1) * epbs + dn->dn_datablkshift;
+		minfill = 0;
+		maxfill = blkfill << ((lvl - 1) * epbs);
+
+		/* a hole exists below unless the subtree is full, and v.v. */
+		if (hole)
+			maxfill--;
+		else
+			minfill++;
+
+		for (i = (*offset >> span) & ((1ULL << epbs) - 1);
+		    i < epb; i++) {
+			if (bp[i].blk_fill >= minfill &&
+			    bp[i].blk_fill <= maxfill)
+				break;
+			*offset += 1ULL << span;
+		}
+		if (i >= epb)
+			error = ESRCH;
+	}
+
+	if (db)
+		dbuf_remove_ref(db, FTAG);
+
+	return (error);
+}
+
+/*
+ * Find the next hole, data, or sparse region at or after *offset.
+ * The value 'blkfill' tells us how many items we expect to find
+ * in an L0 data block; this value is 1 for normal objects,
+ * DNODES_PER_BLOCK for the meta dnode, and some fraction of
+ * DNODES_PER_BLOCK when searching for sparse regions thereof.
+ * Examples:
+ *
+ * dnode_next_offset(dn, hole, offset, 1, 1);
+ *	Finds the next hole/data in a file.
+ *	Used in dmu_offset_next().
+ *
+ * dnode_next_offset(mdn, hole, offset, 0, DNODES_PER_BLOCK);
+ *	Finds the next free/allocated dnode in an objset's meta-dnode.
+ *	Used in dmu_object_next().
+ *
+ * dnode_next_offset(mdn, TRUE, offset, 2, DNODES_PER_BLOCK >> 2);
+ *	Finds the next L2 meta-dnode bp that's at most 1/4 full.
+ *	Used in dmu_object_alloc().
+ *
+ * Returns 0 and updates *offset on success, ESRCH if there is no such
+ * region, or another errno from the level scan.
+ */
+int
+dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *offset,
+    int minlvl, uint64_t blkfill)
+{
+	int lvl, maxlvl;
+	int error = 0;
+	uint64_t initial_offset = *offset;
+
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+	/* no block tree at all: nothing to find */
+	if (dn->dn_phys->dn_nlevels == 0) {
+		rw_exit(&dn->dn_struct_rwlock);
+		return (ESRCH);
+	}
+
+	/*
+	 * A zero datablkshift means a single (possibly odd-sized)
+	 * block: the whole object is data, and everything past it is
+	 * a hole.
+	 */
+	if (dn->dn_datablkshift == 0) {
+		if (*offset < dn->dn_datablksz) {
+			if (hole)
+				*offset = dn->dn_datablksz;
+		} else {
+			error = ESRCH;
+		}
+		rw_exit(&dn->dn_struct_rwlock);
+		return (error);
+	}
+
+	maxlvl = dn->dn_phys->dn_nlevels;
+
+	/* ascend until some level has a match at or after *offset... */
+	for (lvl = minlvl; lvl <= maxlvl; lvl++) {
+		error = dnode_next_offset_level(dn, hole, offset, lvl, blkfill);
+		if (error == 0)
+			break;
+	}
+
+	/* ...then descend back down to pinpoint the offset at minlvl */
+	while (--lvl >= minlvl && error == 0)
+		error = dnode_next_offset_level(dn, hole, offset, lvl, blkfill);
+
+	rw_exit(&dn->dn_struct_rwlock);
+
+	/* *offset wrapping backwards means the search ran off the end */
+	if (initial_offset > *offset)
+		return (ESRCH);
+
+	return (error);
+}
diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c
new file mode 100644
index 0000000000..56fc3e19ae
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c
@@ -0,0 +1,560 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+
+
+/*
+ * Add one level of indirection to the dnode: copy the dnode's current
+ * top-level block pointers (dn_blkptr[]) into a newly created indirect
+ * block, re-parent any cached child dbufs onto that new indirect, bump
+ * dn_phys->dn_nlevels, and zero out dn_blkptr[].
+ *
+ * Caller must hold dn_struct_rwlock as writer (asserted below), and the
+ * dnode must be dirty in this txg, which keeps it from being paged out.
+ */
+static void
+dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db;
+	int i;
+	uint64_t txg = tx->tx_txg;
+
+	ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
+	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+	/* this dnode can't be paged out because it's dirty */
+
+	/* Create (or find) the new top-level indirect block. */
+	db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
+	/* Does the dnode currently have any allocated block pointers? */
+	for (i = 0; i < dn->dn_phys->dn_nblkptr; i++)
+		if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i]))
+			break;
+	if (i != dn->dn_phys->dn_nblkptr) {
+		ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK]));
+
+		dbuf_read_havestruct(db);
+		arc_release(db->db_buf, db);
+		/* copy dnode's block pointers to new indirect block */
+		ASSERT3U(sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr, <=,
+		    db->db.db_size);
+		bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
+		    sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr);
+	}
+
+	dn->dn_phys->dn_nlevels += 1;
+	dprintf("os=%p obj=%llu, increase to %d\n",
+	    dn->dn_objset, dn->dn_object,
+	    dn->dn_phys->dn_nlevels);
+
+	/* set dbuf's parent pointers to new indirect buf */
+	for (i = 0; i < dn->dn_phys->dn_nblkptr; i++) {
+		/* NOTE: dbuf_find() returns with the child's db_mtx held */
+		dmu_buf_impl_t *child =
+		    dbuf_find(dn, dn->dn_phys->dn_nlevels-2, i);
+		if (child == NULL)
+			continue;
+		if (child->db_dnode == NULL) {
+			mutex_exit(&child->db_mtx);
+			continue;
+		}
+
+		if (child->db_parent == NULL ||
+		    child->db_parent == dn->dn_dbuf) {
+			dprintf_dbuf_bp(child, child->db_blkptr,
+			    "changing db_blkptr to new indirect %s", "");
+			child->db_parent = db;
+			dbuf_add_ref(db, child);
+			if (db->db.db_data) {
+				child->db_blkptr =
+				    (blkptr_t *)db->db.db_data + i;
+			} else {
+				child->db_blkptr = NULL;
+			}
+			dprintf_dbuf_bp(child, child->db_blkptr,
+			    "changed db_blkptr to new indirect %s", "");
+		}
+		ASSERT3P(child->db_parent, ==, db);
+
+		mutex_exit(&child->db_mtx);
+	}
+
+	/* The old top-level bps now live in the indirect; clear them here. */
+	bzero(dn->dn_phys->dn_blkptr,
+	    sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr);
+
+	dbuf_remove_ref(db, FTAG);
+}
+
<parameter name="new_str">+/*
+ * Free an array of 'num' block pointers belonging to 'dn': each non-hole
+ * bp is killed via dsl_dataset_block_kill(), and the total allocated
+ * space released is credited back to the dnode's space accounting.
+ */
+static void
+free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
+{
+	objset_impl_t *os = dn->dn_objset;
+	uint64_t bytesfreed = 0;
+	int i;
+
+	dprintf("os=%p obj=%llx num=%d\n", os, dn->dn_object, num);
+
+	for (i = 0; i < num; i++, bp++) {
+		if (BP_IS_HOLE(bp))
+			continue;
+
+		bytesfreed += BP_GET_ASIZE(bp);
+		/* can't free more than the dnode has accounted for */
+		ASSERT3U(bytesfreed >> DEV_BSHIFT, <=, dn->dn_phys->dn_secphys);
+		dsl_dataset_block_kill(os->os_dsl_dataset, bp, tx);
+	}
+	dnode_diduse_space(dn, -bytesfreed);
+}
</parameter>
+
+/*
+ * Debug-only (ZFS_DEBUG) sanity check used when freeing a range:
+ * for each L0 child of level-1 indirect 'db' whose blkid falls in
+ * [start, end], verify that both the pending syncing copy
+ * (db_data_old[txg]) and the in-memory data (unless it is being
+ * filled or is dirty in a future txg) are entirely zero.
+ * Panics if any freed data is found to be nonzero.
+ */
+static void
+free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
+{
+#ifdef ZFS_DEBUG
+	int off, num;
+	int i, err, epbs;
+	uint64_t txg = tx->tx_txg;
+
+	/* epbs = log2(block pointers per indirect block) */
+	epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	off = start - (db->db_blkid * 1<<epbs);
+	num = end - start + 1;
+
+	ASSERT3U(off, >=, 0);
+	ASSERT3U(num, >=, 0);
+	ASSERT3U(db->db_level, >, 0);
+	ASSERT3U(db->db.db_size, ==, 1<<db->db_dnode->dn_phys->dn_indblkshift);
+	ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
+	ASSERT(db->db_blkptr != NULL);
+
+	for (i = off; i < off+num; i++) {
+		uint64_t *buf;
+		int j;
+		dmu_buf_impl_t *child;
+
+		ASSERT(db->db_level == 1);
+
+		rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+		err = dbuf_hold_impl(db->db_dnode, db->db_level-1,
+		    (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
+		rw_exit(&db->db_dnode->dn_struct_rwlock);
+		if (err == ENOENT)
+			continue;
+		ASSERT(err == 0);
+		ASSERT(child->db_level == 0);
+		ASSERT(!list_link_active(&child->db_dirty_node[txg&TXG_MASK]));
+
+		/* db_data_old better be zeroed */
+		if (child->db_d.db_data_old[txg & TXG_MASK]) {
+			buf = (child->db_d.db_data_old[txg & TXG_MASK])->b_data;
+			/* scan 8 bytes at a time */
+			for (j = 0; j < child->db.db_size >> 3; j++) {
+				if (buf[j] != 0) {
+					panic("freed data not zero: "
+					    "child=%p i=%d off=%d num=%d\n",
+					    child, i, off, num);
+				}
+			}
+		}
+
+		/*
+		 * db_data better be zeroed unless it's dirty in a
+		 * future txg.
+		 */
+		mutex_enter(&child->db_mtx);
+		buf = child->db.db_data;
+		if (buf != NULL && child->db_state != DB_FILL &&
+		    !list_link_active(&child->db_dirty_node
+			[(txg+1) & TXG_MASK]) &&
+		    !list_link_active(&child->db_dirty_node
+			[(txg+2) & TXG_MASK])) {
+			for (j = 0; j < child->db.db_size >> 3; j++) {
+				if (buf[j] != 0) {
+					panic("freed data not zero: "
+					    "child=%p i=%d off=%d num=%d\n",
+					    child, i, off, num);
+				}
+			}
+		}
+		mutex_exit(&child->db_mtx);
+
+		dbuf_remove_ref(child, FTAG);
+	}
+#endif
+}
+
+/*
+ * Recursively free the blocks below indirect dbuf 'db' that fall within
+ * the object range [blkid, blkid + nblks).  'trunc' says the range runs
+ * to the end of the object.  Returns TRUE when every block pointer in
+ * 'db' was freed, meaning the caller may free 'db's own bp as well.
+ */
+static int
+free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
+    dmu_tx_t *tx)
+{
+	dnode_t *dn = db->db_dnode;
+	blkptr_t *bp;
+	dmu_buf_impl_t *subdb;
+	uint64_t start, end, dbstart, dbend, i;
+	int epbs, shift, err;
+	int txg_index = tx->tx_txg&TXG_MASK;
+	int all = TRUE;
+
+	dbuf_read(db);
+	arc_release(db->db_buf, db);
+	bp = (blkptr_t *)db->db.db_data;
+
+	/*
+	 * Clamp [start, end] (in units of this level's children) to the
+	 * portion of the requested range that this indirect block covers.
+	 * 'all' stays TRUE only if the entire block is being freed.
+	 */
+	epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	shift = (db->db_level - 1) * epbs;
+	dbstart = db->db_blkid << epbs;
+	start = blkid >> shift;
+	if (dbstart < start) {
+		bp += start - dbstart;
+		all = FALSE;
+	} else {
+		start = dbstart;
+	}
+	dbend = ((db->db_blkid + 1) << epbs) - 1;
+	end = (blkid + nblks - 1) >> shift;
+	if (dbend <= end)
+		end = dbend;
+	else if (all)
+		all = trunc;
+	ASSERT3U(start, <=, end);
+
+	/* Level 1: children are data blocks; free them directly. */
+	if (db->db_level == 1) {
+		free_verify(db, start, end, tx);
+		free_blocks(dn, bp, end-start+1, tx);
+		ASSERT(all || list_link_active(&db->db_dirty_node[txg_index]));
+		return (all);
+	}
+
+	/* Higher levels: recurse into each child indirect block. */
+	for (i = start; i <= end; i++, bp++) {
+		if (BP_IS_HOLE(bp))
+			continue;
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb);
+		ASSERT3U(err, ==, 0);
+		rw_exit(&dn->dn_struct_rwlock);
+
+		if (free_children(subdb, blkid, nblks, trunc, tx)) {
+			ASSERT3P(subdb->db_blkptr, ==, bp);
+			free_blocks(dn, bp, 1, tx);
+		}
+		dbuf_remove_ref(subdb, FTAG);
+	}
+#ifdef ZFS_DEBUG
+	/* verify that all fully-covered bps were zeroed by the recursion */
+	bp -= (end-start)+1;
+	for (i = start; i <= end; i++, bp++) {
+		if (i == start && blkid != 0)
+			continue;
+		else if (i == end && !trunc)
+			continue;
+		ASSERT3U(bp->blk_birth, ==, 0);
+	}
+#endif
+	ASSERT(all || list_link_active(&db->db_dirty_node[txg_index]));
+	return (all);
+}
+
+/*
+ * free_range: Traverse the indicated range of the provided file
+ * and "free" all the blocks contained there.  'blkid'/'nblks' are in
+ * units of the object's data blocks; a range that extends past
+ * dn_maxblkid is treated as a truncation, which also pulls
+ * dn_maxblkid back.
+ */
+static void
+dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+{
+	blkptr_t *bp = dn->dn_phys->dn_blkptr;
+	dmu_buf_impl_t *db;
+	int trunc, start, end, shift, i, err;
+	int dnlevel = dn->dn_phys->dn_nlevels;
+
+	/* range starts entirely beyond the object's last block: no-op */
+	if (blkid > dn->dn_phys->dn_maxblkid)
+		return;
+
+	ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
+	trunc = blkid + nblks > dn->dn_phys->dn_maxblkid;
+	if (trunc)
+		nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
+
+	/* There are no indirect blocks in the object */
+	if (dnlevel == 1) {
+		if (blkid >= dn->dn_phys->dn_nblkptr) {
+			/* this range was never made persistent */
+			return;
+		}
+		ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
+		free_blocks(dn, bp + blkid, nblks, tx);
+		if (trunc) {
+			/* shrink maxblkid; verify nothing remains past EOF */
+			uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
+			    (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+			dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
+			ASSERT(off < dn->dn_phys->dn_maxblkid ||
+			    dn->dn_phys->dn_maxblkid == 0 ||
+			    dnode_next_offset(dn, FALSE, &off, 1, 1) == ESRCH);
+		}
+		return;
+	}
+
+	/* recurse through the top-level indirects that cover the range */
+	shift = (dnlevel - 1) * (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
+	start = blkid >> shift;
+	ASSERT(start < dn->dn_phys->dn_nblkptr);
+	end = (blkid + nblks - 1) >> shift;
+	bp += start;
+	for (i = start; i <= end; i++, bp++) {
+		if (BP_IS_HOLE(bp))
+			continue;
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db);
+		ASSERT3U(err, ==, 0);
+		rw_exit(&dn->dn_struct_rwlock);
+
+		if (free_children(db, blkid, nblks, trunc, tx)) {
+			ASSERT3P(db->db_blkptr, ==, bp);
+			free_blocks(dn, bp, 1, tx);
+		}
+		dbuf_remove_ref(db, FTAG);
+	}
+	if (trunc) {
+		/* shrink maxblkid; verify nothing remains past EOF */
+		uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
+		    (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+		dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
+		ASSERT(off < dn->dn_phys->dn_maxblkid ||
+		    dn->dn_phys->dn_maxblkid == 0 ||
+		    dnode_next_offset(dn, FALSE, &off, 1, 1) == ESRCH);
+	}
+}
+
+/*
+ * Sync-context teardown of a dnode that is being freed in this txg:
+ * undirty any remaining dirty dbufs, free every block in the object,
+ * zero the on-disk dnode_phys, reset the in-core state to DMU_OT_NONE,
+ * and release the dirty hold.  Returns 1 (level done); after the final
+ * dnode_rele() the dnode may be evicted, so it must not be touched.
+ */
+static int
+dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db;
+	int txgoff = tx->tx_txg & TXG_MASK;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	/* Undirty all buffers */
+	while (db = list_head(&dn->dn_dirty_dbufs[txgoff])) {
+		mutex_enter(&db->db_mtx);
+		/* XXX - use dbuf_undirty()? */
+		list_remove(&dn->dn_dirty_dbufs[txgoff], db);
+		if (db->db_level == 0) {
+			ASSERT3P(db->db_d.db_data_old[txgoff], ==, db->db_buf);
+			if (db->db_d.db_overridden_by[txgoff])
+				dbuf_unoverride(db, tx->tx_txg);
+			db->db_d.db_data_old[txgoff] = NULL;
+		}
+		db->db_dirtycnt -= 1;
+		mutex_exit(&db->db_mtx);
+		/* drop the hold that dirtying placed for this txg */
+		dbuf_remove_ref(db, (void *)(uintptr_t)tx->tx_txg);
+	}
+
+	/* only the dirty hold itself should remain */
+	ASSERT3U(refcount_count(&dn->dn_holds), ==, 1);
+
+	/* Undirty next bits */
+	dn->dn_next_nlevels[txgoff] = 0;
+	dn->dn_next_indblkshift[txgoff] = 0;
+
+	/* free up all the blocks in the file. */
+	dbuf_free_range(dn, 0, -1, tx);
+	dnode_sync_free_range(dn, 0, dn->dn_phys->dn_maxblkid+1, tx);
+	ASSERT3U(dn->dn_phys->dn_secphys, ==, 0);
+
+	/*
+	 * All dbufs should be gone, since all holds are gone...
+	 */
+	ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+
+	/* ASSERT(blkptrs are zero); */
+	ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
+	ASSERT(dn->dn_type != DMU_OT_NONE);
+
+	ASSERT(dn->dn_free_txg > 0);
+	if (dn->dn_allocated_txg != dn->dn_free_txg)
+		dbuf_will_dirty(dn->dn_dbuf, tx);
+	bzero(dn->dn_phys, sizeof (dnode_phys_t));
+
+	mutex_enter(&dn->dn_mtx);
+	dn->dn_type = DMU_OT_NONE;
+	dn->dn_dirtyblksz[txgoff] = 0;
+	dn->dn_maxblkid = 0;
+	dn->dn_allocated_txg = 0;
+	mutex_exit(&dn->dn_mtx);
+
+	ASSERT(!IS_DNODE_DNODE(dn->dn_object));
+
+	dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
+	/*
+	 * Now that we've released our hold, the dnode may
+	 * be evicted, so we mustn't access it.
+	 */
+	return (1);
+}
+
+/*
+ * Write out the dnode's dirty buffers at the specified level.
+ * This may create more dirty buffers at the next level up.
+ *
+ * Returns 1 when the dnode is fully synced for this txg (or freed),
+ * 0 when more levels remain to be synced.
+ *
+ * NOTE: The dnode is kept in memory by being dirty.  Once the
+ * dirty bit is cleared, it may be evicted.  Beware of this!
+ */
+int
+dnode_sync(dnode_t *dn, int level, zio_t *zio, dmu_tx_t *tx)
+{
+	free_range_t *rp;
+	int txgoff = tx->tx_txg & TXG_MASK;
+	dnode_phys_t *dnp = dn->dn_phys;
+
+	/* ASSERT(dn->dn_objset->dd_snapshot == NULL); */
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(IS_DNODE_DNODE(dn->dn_object) ||
+	    dn->dn_dirtyblksz[txgoff] > 0);
+
+	ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
+	dnode_verify(dn);
+	/*
+	 * Make sure the dbuf for the dn_phys is released before we modify it.
+	 */
+	if (dn->dn_dbuf)
+		arc_release(dn->dn_dbuf->db_buf, dn->dn_dbuf);
+
+	mutex_enter(&dn->dn_mtx);
+	if (dn->dn_allocated_txg == tx->tx_txg) {
+		/* The dnode is newly allocated or reallocated */
+		if (dnp->dn_type == DMU_OT_NONE) {
+			/* this is a first alloc, not a realloc */
+			/* XXX shouldn't the phys already be zeroed? */
+			bzero(dnp, DNODE_CORE_SIZE);
+			dnp->dn_datablkszsec = dn->dn_datablkszsec;
+			dnp->dn_indblkshift = dn->dn_indblkshift;
+			dnp->dn_nlevels = 1;
+		}
+
+		if (dn->dn_nblkptr > dnp->dn_nblkptr) {
+			/* zero the new blkptrs we are gaining */
+			bzero(dnp->dn_blkptr + dnp->dn_nblkptr,
+			    sizeof (blkptr_t) *
+			    (dn->dn_nblkptr - dnp->dn_nblkptr));
+		}
+		/* push the open-context identity down to the phys */
+		dnp->dn_type = dn->dn_type;
+		dnp->dn_bonustype = dn->dn_bonustype;
+		dnp->dn_bonuslen = dn->dn_bonuslen;
+		dnp->dn_nblkptr = dn->dn_nblkptr;
+	}
+
+	/* record a pending block-size change (stored in sectors on disk) */
+	if (dn->dn_dirtyblksz[txgoff]) {
+		ASSERT(P2PHASE(dn->dn_dirtyblksz[txgoff],
+		    SPA_MINBLOCKSIZE) == 0);
+		dnp->dn_datablkszsec =
+		    dn->dn_dirtyblksz[txgoff] >> SPA_MINBLOCKSHIFT;
+	}
+
+	/* record a pending indirect-block-size change */
+	if (dn->dn_next_indblkshift[txgoff]) {
+		ASSERT(dnp->dn_nlevels == 1);
+		dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
+		dn->dn_next_indblkshift[txgoff] = 0;
+	}
+
+	/*
+	 * Just take the live (open-context) values for checksum and compress.
+	 * Strictly speaking it's a future leak, but nothing bad happens if we
+	 * start using the new checksum or compress algorithm a little early.
+	 */
+	dnp->dn_checksum = dn->dn_checksum;
+	dnp->dn_compress = dn->dn_compress;
+
+	mutex_exit(&dn->dn_mtx);
+
+	/* process all the "freed" ranges in the file */
+	if (dn->dn_free_txg == 0 || dn->dn_free_txg > tx->tx_txg) {
+		for (rp = avl_first(&dn->dn_ranges[txgoff]); rp != NULL;
+		    rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp))
+			dnode_sync_free_range(dn,
+			    rp->fr_blkid, rp->fr_nblks, tx);
+	}
+	/* then discard the (now-processed) range records */
+	mutex_enter(&dn->dn_mtx);
+	for (rp = avl_first(&dn->dn_ranges[txgoff]); rp; ) {
+		free_range_t *last = rp;
+		rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp);
+		avl_remove(&dn->dn_ranges[txgoff], last);
+		kmem_free(last, sizeof (free_range_t));
+	}
+	mutex_exit(&dn->dn_mtx);
+
+	/* the whole object is being freed in this txg */
+	if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) {
+		ASSERT3U(level, ==, 0);
+		return (dnode_sync_free(dn, tx));
+	}
+
+	/* apply a pending increase in indirection depth */
+	if (dn->dn_next_nlevels[txgoff]) {
+		int new_lvl = dn->dn_next_nlevels[txgoff];
+
+		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+		while (new_lvl > dnp->dn_nlevels)
+			dnode_increase_indirection(dn, tx);
+		rw_exit(&dn->dn_struct_rwlock);
+		dn->dn_next_nlevels[txgoff] = 0;
+	}
+
+	if (level == dnp->dn_nlevels) {
+		uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
+		    (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+
+		/* we've already synced out all data and indirect blocks */
+		/* there are no more dirty dbufs under this dnode */
+		ASSERT3P(list_head(&dn->dn_dirty_dbufs[txgoff]), ==, NULL);
+		ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= tx->tx_txg);
+
+		/* XXX this is expensive. remove once 6343073 is closed. */
+		/* NB: the "off < maxblkid" is to catch overflow */
+		/*
+		 * NB: if blocksize is changing, we could get confused,
+		 * so only bother if there are multiple blocks and thus
+		 * it can't be changing.
+		 */
+		ASSERT(off < dn->dn_phys->dn_maxblkid ||
+		    dn->dn_phys->dn_maxblkid == 0 ||
+		    dnode_next_offset(dn, FALSE, &off, 1, 1) == ESRCH);
+
+		dn->dn_dirtyblksz[txgoff] = 0;
+
+
+		if (!IS_DNODE_DNODE(dn->dn_object)) {
+			dbuf_will_dirty(dn->dn_dbuf, tx);
+			dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
+		}
+
+		/*
+		 * Now that we've dropped the reference, the dnode may
+		 * be evicted, so we mustn't access it.
+		 */
+		return (1);
+	} else {
+		dmu_buf_impl_t *db, *db_next;
+		list_t *list = &dn->dn_dirty_dbufs[txgoff];
+		/*
+		 * Iterate over the list, removing and sync'ing dbufs
+		 * which are on the level we want, and leaving others.
+		 */
+		for (db = list_head(list); db; db = db_next) {
+			db_next = list_next(list, db);
+			if (db->db_level == level) {
+				list_remove(list, db);
+				dbuf_sync(db, zio, tx);
+			}
+		}
+		return (0);
+	}
+}
diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c
new file mode 100644
index 0000000000..ab8dcfc3e3
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c
@@ -0,0 +1,1463 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/zio.h>
+#include <sys/zap.h>
+#include <sys/unique.h>
+#include <sys/zfs_context.h>
+
+#define DOS_REF_MAX (1ULL << 62)
+
+#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE
+
+#define BP_GET_UCSIZE(bp) \
+ ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \
+ BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp));
+
+/*
+ * We use weighted reference counts to express the various forms of exclusion
+ * between different open modes. A STANDARD open is 1 point, an EXCLUSIVE open
+ * is DOS_REF_MAX, and a PRIMARY open is little more than half of an EXCLUSIVE.
+ * This makes the exclusion logic simple: the total refcnt for all opens cannot
+ * exceed DOS_REF_MAX. For example, EXCLUSIVE opens are exclusive because their
+ * weight (DOS_REF_MAX) consumes the entire refcnt space. PRIMARY opens consume
+ * just over half of the refcnt space, so there can't be more than one, but it
+ * can peacefully coexist with any number of STANDARD opens.
+ */
+static uint64_t ds_refcnt_weight[DS_MODE_LEVELS] = {
+ 0, /* DOS_MODE_NONE - invalid */
+ 1, /* DOS_MODE_STANDARD - unlimited number */
+ (DOS_REF_MAX >> 1) + 1, /* DOS_MODE_PRIMARY - only one of these */
+ DOS_REF_MAX /* DOS_MODE_EXCLUSIVE - no other opens */
+};
+
+
+/*
+ * Account for a newly born block 'bp' in dataset 'ds': bump the
+ * dataset's used/compressed/uncompressed/unique byte counts and charge
+ * the space to its dsl_dir.  A NULL 'ds' means the block belongs to
+ * the meta-objset, whose space is tracked in the pool's mos dir.
+ * Must be called from syncing context.
+ */
+void
+dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+{
+	int used = BP_GET_ASIZE(bp);
+	int compressed = BP_GET_PSIZE(bp);
+	int uncompressed = BP_GET_UCSIZE(bp);
+
+	dprintf_bp(bp, "born, ds=%p\n", ds);
+
+	ASSERT(dmu_tx_is_syncing(tx));
+	/* It could have been compressed away to nothing */
+	if (BP_IS_HOLE(bp))
+		return;
+	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
+	ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
+	if (ds == NULL) {
+		/*
+		 * Account for the meta-objset space in its placeholder
+		 * dsl_dir.
+		 */
+		ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
+		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
+		    used, compressed, uncompressed, tx);
+		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
+		return;
+	}
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	mutex_enter(&ds->ds_lock);
+	ds->ds_phys->ds_used_bytes += used;
+	ds->ds_phys->ds_compressed_bytes += compressed;
+	ds->ds_phys->ds_uncompressed_bytes += uncompressed;
+	/* a newly written block is unique to this dataset by definition */
+	ds->ds_phys->ds_unique_bytes += used;
+	mutex_exit(&ds->ds_lock);
+	dsl_dir_diduse_space(ds->ds_dir,
+	    used, compressed, uncompressed, tx);
+}
+
+/*
+ * Account for (and dispose of) a freed block 'bp' in dataset 'ds'.
+ * If the block was born after the most recent snapshot it can be freed
+ * outright via arc_free(); otherwise a snapshot still references it,
+ * so it is placed on the dataset's deadlist instead.  In either case
+ * the dataset and dsl_dir space accounting is updated and 'bp' is
+ * zeroed.  A NULL 'ds' means the block belongs to the meta-objset.
+ * Must be called from syncing context.
+ */
+void
+dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+{
+	int used = BP_GET_ASIZE(bp);
+	int compressed = BP_GET_PSIZE(bp);
+	int uncompressed = BP_GET_UCSIZE(bp);
+
+	ASSERT(dmu_tx_is_syncing(tx));
+	if (BP_IS_HOLE(bp))
+		return;
+
+	ASSERT(used > 0);
+	if (ds == NULL) {
+		/*
+		 * Account for the meta-objset space in its placeholder
+		 * dataset.
+		 */
+		/* XXX this can fail, what do we do when it does? */
+		(void) arc_free(NULL, tx->tx_pool->dp_spa,
+		    tx->tx_txg, bp, NULL, NULL, ARC_WAIT);
+		bzero(bp, sizeof (blkptr_t));
+
+		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
+		    -used, -compressed, -uncompressed, tx);
+		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
+		return;
+	}
+	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
+
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+
+	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
+		/* born after the last snapshot: safe to free right now */
+		dprintf_bp(bp, "freeing: %s", "");
+		/* XXX check return code? */
+		(void) arc_free(NULL, tx->tx_pool->dp_spa,
+		    tx->tx_txg, bp, NULL, NULL, ARC_WAIT);
+
+		mutex_enter(&ds->ds_lock);
+		/* XXX unique_bytes is not accurate for head datasets */
+		/* ASSERT3U(ds->ds_phys->ds_unique_bytes, >=, used); */
+		ds->ds_phys->ds_unique_bytes -= used;
+		mutex_exit(&ds->ds_lock);
+		dsl_dir_diduse_space(ds->ds_dir,
+		    -used, -compressed, -uncompressed, tx);
+	} else {
+		dprintf_bp(bp, "putting on dead list: %s", "");
+		bplist_enqueue(&ds->ds_deadlist, bp, tx);
+		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
+		if (ds->ds_phys->ds_prev_snap_obj != 0) {
+			ASSERT3U(ds->ds_prev->ds_object, ==,
+			    ds->ds_phys->ds_prev_snap_obj);
+			ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
+			if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
+			    ds->ds_object &&
+			    bp->blk_birth >
+			    ds->ds_prev->ds_phys->ds_prev_snap_txg) {
+				dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+				mutex_enter(&ds->ds_prev->ds_lock);
+				ds->ds_prev->ds_phys->ds_unique_bytes +=
+				    used;
+				mutex_exit(&ds->ds_prev->ds_lock);
+			}
+		}
+	}
+	bzero(bp, sizeof (blkptr_t));
+	mutex_enter(&ds->ds_lock);
+	ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
+	ds->ds_phys->ds_used_bytes -= used;
+	ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
+	ds->ds_phys->ds_compressed_bytes -= compressed;
+	ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
+	ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
+	mutex_exit(&ds->ds_lock);
+}
+
+/*
+ * Best-effort guess at whether a block born at 'blk_birth' could be
+ * freed right away (TRUE) or would have to go on a deadlist because a
+ * snapshot still references it (FALSE).  See caveats below.
+ */
+int
+dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth, dmu_tx_t *tx)
+{
+	uint64_t prev_snap_txg;
+	dsl_dir_t *dd;
+	/* ASSERT that it is not a snapshot */
+	if (ds == NULL)
+		return (TRUE);
+	/*
+	 * The snapshot creation could fail, but that would cause an
+	 * incorrect FALSE return, which would only result in an
+	 * overestimation of the amount of space that an operation would
+	 * consume, which is OK.
+	 *
+	 * There's also a small window where we could miss a pending
+	 * snapshot, because we could set the sync task in the quiescing
+	 * phase.  So this should only be used as a guess.
+	 */
+	dd = ds->ds_dir;
+	mutex_enter(&dd->dd_lock);
+	/* a snapshot scheduled for an earlier txg counts as "taken" */
+	if (dd->dd_sync_func == dsl_dataset_snapshot_sync &&
+	    dd->dd_sync_txg < tx->tx_txg)
+		prev_snap_txg = dd->dd_sync_txg;
+	else
+		prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
+	mutex_exit(&dd->dd_lock);
+	return (blk_birth > prev_snap_txg);
+}
+
+/*
+ * dmu_buf user-eviction callback for a dsl_dataset_t: tear down all
+ * in-core state hanging off 'ds' (unique fsid entry, user data, prev
+ * snapshot hold, deadlist, dsl_dir hold) and free the structure.
+ * Runs when the last hold on the dataset's bonus dbuf is released.
+ */
+/* ARGSUSED */
+static void
+dsl_dataset_evict(dmu_buf_t *db, void *dsv)
+{
+	dsl_dataset_t *ds = dsv;
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+	/* open_refcount == DOS_REF_MAX when deleting */
+	ASSERT(ds->ds_open_refcount == 0 ||
+	    ds->ds_open_refcount == DOS_REF_MAX);
+
+	dprintf_ds(ds, "evicting %s\n", "");
+
+	unique_remove(ds->ds_phys->ds_fsid_guid);
+
+	/* notify the dataset's user (if any) before tearing down */
+	if (ds->ds_user_ptr != NULL)
+		ds->ds_user_evict_func(ds, ds->ds_user_ptr);
+
+	if (ds->ds_prev) {
+		dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
+		ds->ds_prev = NULL;
+	}
+
+	bplist_close(&ds->ds_deadlist);
+	dsl_dir_close(ds->ds_dir, ds);
+
+	if (list_link_active(&ds->ds_synced_link))
+		list_remove(&dp->dp_synced_objsets, ds);
+
+	kmem_free(ds, sizeof (dsl_dataset_t));
+}
+
+/*
+ * Fill in ds->ds_snapname by reverse-searching the head dataset's
+ * snapnames zap for this dataset's object number.  No-op if the name
+ * is already cached or if 'ds' is not a snapshot (no next snapshot).
+ */
+static void
+dsl_dataset_get_snapname(dsl_dataset_t *ds)
+{
+	dsl_dataset_phys_t *headphys;
+	int err;
+	dmu_buf_t *headdbuf;
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	objset_t *mos = dp->dp_meta_objset;
+
+	if (ds->ds_snapname[0])
+		return;
+	if (ds->ds_phys->ds_next_snap_obj == 0)
+		return;
+
+	headdbuf = dmu_bonus_hold_tag(mos,
+	    ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG);
+	dmu_buf_read(headdbuf);
+	headphys = headdbuf->db_data;
+	/* find the zap entry whose value is our object number */
+	err = zap_value_search(dp->dp_meta_objset,
+	    headphys->ds_snapnames_zapobj, ds->ds_object, ds->ds_snapname);
+	ASSERT(err == 0);
+	dmu_buf_rele_tag(headdbuf, FTAG);
+}
+
+/*
+ * Open the dataset with object number 'dsobj', constructing the in-core
+ * dsl_dataset_t (attached as the bonus dbuf's user data) on first open.
+ * 'snapname' (optional) seeds ds_snapname for snapshots.  'mode' selects
+ * a DS_MODE_* level; its weight is added to ds_open_refcount so that
+ * incompatible opens exclude each other (see ds_refcnt_weight above).
+ * Returns NULL if the requested mode cannot be granted (already open in
+ * a conflicting mode, or restoring and not opened for restore).
+ * Caller must hold dp_config_rwlock or be in syncing context.
+ */
+dsl_dataset_t *
+dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
+    int mode, void *tag)
+{
+	uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
+	objset_t *mos = dp->dp_meta_objset;
+	dmu_buf_t *dbuf;
+	dsl_dataset_t *ds;
+
+	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
+	    dsl_pool_sync_context(dp));
+
+	dbuf = dmu_bonus_hold_tag(mos, dsobj, tag);
+	dmu_buf_read(dbuf);
+	ds = dmu_buf_get_user(dbuf);
+	if (ds == NULL) {
+		/* first open: build the in-core dataset */
+		dsl_dataset_t *winner;
+
+		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
+		ds->ds_dbuf = dbuf;
+		ds->ds_object = dsobj;
+		ds->ds_phys = dbuf->db_data;
+		ds->ds_dir = dsl_dir_open_obj(dp,
+		    ds->ds_phys->ds_dir_obj, NULL, ds);
+
+		bplist_open(&ds->ds_deadlist,
+		    mos, ds->ds_phys->ds_deadlist_obj);
+
+		if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) {
+			/* head dataset: no snapname, hold the prev snap */
+			ds->ds_snapname[0] = '\0';
+			if (ds->ds_phys->ds_prev_snap_obj) {
+				ds->ds_prev =
+				    dsl_dataset_open_obj(dp,
+				    ds->ds_phys->ds_prev_snap_obj, NULL,
+				    DS_MODE_NONE, ds);
+			}
+		} else {
+			if (snapname) {
+#ifdef ZFS_DEBUG
+				/* verify snapname really maps to dsobj */
+				dsl_dataset_phys_t *headphys;
+				int err;
+				dmu_buf_t *headdbuf = dmu_bonus_hold_tag(mos,
+				    ds->ds_dir->dd_phys->
+				    dd_head_dataset_obj, FTAG);
+				dmu_buf_read(headdbuf);
+				headphys = headdbuf->db_data;
+				uint64_t foundobj;
+				err = zap_lookup(dp->dp_meta_objset,
+				    headphys->ds_snapnames_zapobj,
+				    snapname, sizeof (foundobj), 1, &foundobj);
+				ASSERT3U(err, ==, 0);
+				ASSERT3U(foundobj, ==, dsobj);
+				dmu_buf_rele_tag(headdbuf, FTAG);
+#endif
+				(void) strcat(ds->ds_snapname, snapname);
+			} else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
+				dsl_dataset_get_snapname(ds);
+			}
+		}
+
+		/* attach; lose gracefully if another thread raced us here */
+		winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
+		    dsl_dataset_evict);
+		if (winner) {
+			bplist_close(&ds->ds_deadlist);
+			if (ds->ds_prev) {
+				dsl_dataset_close(ds->ds_prev,
+				    DS_MODE_NONE, ds);
+			}
+			dsl_dir_close(ds->ds_dir, ds);
+			kmem_free(ds, sizeof (dsl_dataset_t));
+			ds = winner;
+		} else {
+			uint64_t new =
+			    unique_insert(ds->ds_phys->ds_fsid_guid);
+			if (new != ds->ds_phys->ds_fsid_guid) {
+				/* XXX it won't necessarily be synced... */
+				ds->ds_phys->ds_fsid_guid = new;
+			}
+		}
+	}
+	ASSERT3P(ds->ds_dbuf, ==, dbuf);
+	ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
+
+	/* grant (or refuse) the requested open mode */
+	mutex_enter(&ds->ds_lock);
+	if ((DS_MODE_LEVEL(mode) == DS_MODE_PRIMARY &&
+	    ds->ds_phys->ds_restoring && !DS_MODE_IS_RESTORE(mode)) ||
+	    (ds->ds_open_refcount + weight > DOS_REF_MAX)) {
+		mutex_exit(&ds->ds_lock);
+		dsl_dataset_close(ds, DS_MODE_NONE, tag);
+		return (NULL);
+	}
+	ds->ds_open_refcount += weight;
+	mutex_exit(&ds->ds_lock);
+
+	return (ds);
+}
+
+/*
+ * Open a dataset by name within pool 'spa' (NULL means look the pool up
+ * from the name).  Names of the form "fs@snap" open the snapshot, which
+ * requires a read-only mode.  On success *dsp holds the opened dataset;
+ * returns ENOENT (no such dir/dataset/snapshot), EROFS (writable open
+ * of a snapshot), or EBUSY (conflicting open mode).
+ */
+int
+dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
+    void *tag, dsl_dataset_t **dsp)
+{
+	dsl_dir_t *dd;
+	dsl_pool_t *dp;
+	const char *tail;
+	uint64_t obj;
+	dsl_dataset_t *ds = NULL;
+	int err = 0;
+
+	dd = dsl_dir_open_spa(spa, name, FTAG, &tail);
+	if (dd == NULL)
+		return (ENOENT);
+
+	dp = dd->dd_pool;
+	obj = dd->dd_phys->dd_head_dataset_obj;
+	rw_enter(&dp->dp_config_rwlock, RW_READER);
+	if (obj == 0) {
+		/* A dataset with no associated objset */
+		err = ENOENT;
+		goto out;
+	}
+
+	if (tail != NULL) {
+		/* the leftover name component must be "@<snapname>" */
+		objset_t *mos = dp->dp_meta_objset;
+
+		/* briefly open the head just to find its snapnames zap */
+		ds = dsl_dataset_open_obj(dp, obj, NULL, DS_MODE_NONE, tag);
+		obj = ds->ds_phys->ds_snapnames_zapobj;
+		dsl_dataset_close(ds, DS_MODE_NONE, tag);
+		ds = NULL;
+
+		if (tail[0] != '@') {
+			err = ENOENT;
+			goto out;
+		}
+		tail++;
+
+		/* Look for a snapshot */
+		if (!DS_MODE_IS_READONLY(mode)) {
+			err = EROFS;
+			goto out;
+		}
+		dprintf("looking for snapshot '%s'\n", tail);
+		err = zap_lookup(mos, obj, tail, 8, 1, &obj);
+		if (err)
+			goto out;
+	}
+	ds = dsl_dataset_open_obj(dp, obj, tail, mode, tag);
+	if (ds == NULL)
+		err = EBUSY;
+
+out:
+	rw_exit(&dp->dp_config_rwlock);
+	dsl_dir_close(dd, FTAG);
+
+	ASSERT3U((err == 0), ==, (ds != NULL));
+	/* ASSERT(ds == NULL || strcmp(name, ds->ds_name) == 0); */
+
+	*dsp = ds;
+	return (err);
+}
+
+/*
+ * Convenience wrapper: open a dataset by name, resolving the pool from
+ * the name itself (spa == NULL).  See dsl_dataset_open_spa().
+ */
+int
+dsl_dataset_open(const char *name, int mode, void *tag, dsl_dataset_t **dsp)
+{
+	return (dsl_dataset_open_spa(NULL, name, mode, tag, dsp));
+}
+
+/*
+ * Write the dataset's full name ("dir/path[@snap]") into 'name'; a
+ * NULL dataset is rendered as "mos".  Caller supplies a buffer large
+ * enough for the name (the strcat()s are unbounded).
+ */
+void
+dsl_dataset_name(dsl_dataset_t *ds, char *name)
+{
+	if (ds == NULL) {
+		(void) strcpy(name, "mos");
+	} else {
+		dsl_dir_name(ds->ds_dir, name);
+		dsl_dataset_get_snapname(ds);
+		if (ds->ds_snapname[0]) {
+			(void) strcat(name, "@");
+			if (!MUTEX_HELD(&ds->ds_lock)) {
+				/*
+				 * We use a "recursive" mutex so that we
+				 * can call dprintf_ds() with ds_lock held.
+				 */
+				mutex_enter(&ds->ds_lock);
+				(void) strcat(name, ds->ds_snapname);
+				mutex_exit(&ds->ds_lock);
+			} else {
+				(void) strcat(name, ds->ds_snapname);
+			}
+		}
+	}
+}
+
+/*
+ * Close a dataset opened with 'mode': subtract that mode's weight from
+ * ds_open_refcount and release the caller's hold on the bonus dbuf.
+ * Dropping the final hold triggers dsl_dataset_evict().
+ */
+void
+dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag)
+{
+	uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
+	mutex_enter(&ds->ds_lock);
+	ASSERT3U(ds->ds_open_refcount, >=, weight);
+	ds->ds_open_refcount -= weight;
+	dprintf_ds(ds, "closing mode %u refcount now 0x%llx\n",
+	    mode, ds->ds_open_refcount);
+	mutex_exit(&ds->ds_lock);
+
+	dmu_buf_rele_tag(ds->ds_dbuf, tag);
+}
+
+/*
+ * Create the pool's root dsl_dir and its head dataset (including the
+ * dataset's snapnames zap and deadlist), then create the root objset
+ * (DMU_OST_ZFS) inside it.  *ddobjp receives the root dir's object
+ * number.  Called at pool-creation time from syncing context.
+ */
+void
+dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx)
+{
+	objset_t *mos = dp->dp_meta_objset;
+	dmu_buf_t *dbuf;
+	dsl_dataset_phys_t *dsphys;
+	dsl_dataset_t *ds;
+	uint64_t dsobj;
+	dsl_dir_t *dd;
+
+	dsl_dir_create_root(mos, ddobjp, tx);
+	dd = dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG);
+	ASSERT(dd != NULL);
+
+	/* allocate the dataset object and fill in its bonus buffer */
+	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_OBJSET, 0,
+	    DMU_OT_DSL_OBJSET, sizeof (dsl_dataset_phys_t), tx);
+	dbuf = dmu_bonus_hold(mos, dsobj);
+	dmu_buf_will_dirty(dbuf, tx);
+	dsphys = dbuf->db_data;
+	dsphys->ds_dir_obj = dd->dd_object;
+	dsphys->ds_fsid_guid = unique_create();
+	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
+	    sizeof (dsphys->ds_guid));
+	dsphys->ds_snapnames_zapobj =
+	    zap_create(mos, DMU_OT_DSL_OBJSET_SNAP_MAP, DMU_OT_NONE, 0, tx);
+	dsphys->ds_creation_time = gethrestime_sec();
+	dsphys->ds_creation_txg = tx->tx_txg;
+	dsphys->ds_deadlist_obj =
+	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
+	dmu_buf_rele(dbuf);
+
+	/* link the new dataset in as the dir's head */
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+	dd->dd_phys->dd_head_dataset_obj = dsobj;
+	dsl_dir_close(dd, FTAG);
+
+	ds = dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG);
+	(void) dmu_objset_create_impl(dp->dp_spa, ds, DMU_OST_ZFS, tx);
+	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+}
+
+/*
+ * Sync-context creation of a new dataset named 'lastname' under parent
+ * dir 'pds' ('fullname' is the complete path).  If 'clone_parent' is
+ * non-NULL the new dataset is a clone of that snapshot: it inherits the
+ * snapshot's bp and space totals and bumps its child count.  Returns 0,
+ * EXDEV (clone origin in another pool), EINVAL (clone origin is not a
+ * snapshot), or an error from dsl_dir_create_sync().
+ */
+int
+dsl_dataset_create_sync(dsl_dir_t *pds, const char *fullname,
+    const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx)
+{
+	int err;
+	dsl_pool_t *dp = pds->dd_pool;
+	dmu_buf_t *dbuf;
+	dsl_dataset_phys_t *dsphys;
+	uint64_t dsobj;
+	objset_t *mos = dp->dp_meta_objset;
+	dsl_dir_t *dd;
+
+	if (clone_parent != NULL) {
+		/*
+		 * You can't clone across pools.
+		 */
+		if (clone_parent->ds_dir->dd_pool != dp)
+			return (EXDEV);
+
+		/*
+		 * You can only clone snapshots, not the head datasets.
+		 */
+		if (clone_parent->ds_phys->ds_num_children == 0)
+			return (EINVAL);
+	}
+
+	ASSERT(lastname[0] != '@');
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	err = dsl_dir_create_sync(pds, lastname, tx);
+	if (err)
+		return (err);
+	dd = dsl_dir_open_spa(dp->dp_spa, fullname, FTAG, NULL);
+	ASSERT(dd != NULL);
+
+	/* This is the point of no (unsuccessful) return */
+
+	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_OBJSET, 0,
+	    DMU_OT_DSL_OBJSET, sizeof (dsl_dataset_phys_t), tx);
+	dbuf = dmu_bonus_hold(mos, dsobj);
+	dmu_buf_will_dirty(dbuf, tx);
+	dsphys = dbuf->db_data;
+	dsphys->ds_dir_obj = dd->dd_object;
+	dsphys->ds_fsid_guid = unique_create();
+	unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
+	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
+	    sizeof (dsphys->ds_guid));
+	dsphys->ds_snapnames_zapobj =
+	    zap_create(mos, DMU_OT_DSL_OBJSET_SNAP_MAP, DMU_OT_NONE, 0, tx);
+	dsphys->ds_creation_time = gethrestime_sec();
+	dsphys->ds_creation_txg = tx->tx_txg;
+	dsphys->ds_deadlist_obj =
+	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
+	if (clone_parent) {
+		/* clone: start from the origin snapshot's contents */
+		dsphys->ds_prev_snap_obj = clone_parent->ds_object;
+		dsphys->ds_prev_snap_txg =
+		    clone_parent->ds_phys->ds_creation_txg;
+		dsphys->ds_used_bytes =
+		    clone_parent->ds_phys->ds_used_bytes;
+		dsphys->ds_compressed_bytes =
+		    clone_parent->ds_phys->ds_compressed_bytes;
+		dsphys->ds_uncompressed_bytes =
+		    clone_parent->ds_phys->ds_uncompressed_bytes;
+		dsphys->ds_bp = clone_parent->ds_phys->ds_bp;
+
+		dmu_buf_will_dirty(clone_parent->ds_dbuf, tx);
+		clone_parent->ds_phys->ds_num_children++;
+
+		dmu_buf_will_dirty(dd->dd_dbuf, tx);
+		dd->dd_phys->dd_clone_parent_obj = clone_parent->ds_object;
+	}
+	dmu_buf_rele(dbuf);
+
+	/* link the new dataset in as the new dir's head */
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+	dd->dd_phys->dd_head_dataset_obj = dsobj;
+	dsl_dir_close(dd, FTAG);
+
+	return (0);
+}
+
+
/*
 * Destroy the dataset or snapshot named by "name" ("pool/fs" or
 * "pool/fs@snap").  Waits for pending txgs to sync and retries the
 * sync task while it reports EAGAIN.  Returns 0 or an errno.
 */
int
dsl_dataset_destroy(const char *name)
{
	int err;
	dsl_pool_t *dp;
	dsl_dir_t *dd;
	const char *tail;

	dd = dsl_dir_open(name, FTAG, &tail);
	if (dd == NULL)
		return (ENOENT);

	dp = dd->dd_pool;
	if (tail != NULL) {
		/* A leftover component that isn't a snapshot is bogus. */
		if (tail[0] != '@') {
			dsl_dir_close(dd, FTAG);
			return (ENOENT);
		}
		tail++;
		/* Just blow away the snapshot */
		do {
			txg_wait_synced(dp, 0);
			err = dsl_dir_sync_task(dd,
			    dsl_dataset_destroy_sync, (void*)tail, 0);
		} while (err == EAGAIN);
		dsl_dir_close(dd, FTAG);
	} else {
		char buf[MAXNAMELEN];
		char *cp;

		dsl_dir_t *pds;
		/* The pool's root dir cannot be destroyed this way. */
		if (dd->dd_phys->dd_parent_obj == 0) {
			dsl_dir_close(dd, FTAG);
			return (EINVAL);
		}
		/*
		 * Make sure it's not dirty before we destroy it.
		 */
		txg_wait_synced(dd->dd_pool, 0);
		/*
		 * Blow away the dsl_dir + head dataset.
		 * dsl_dir_destroy_sync() will call
		 * dsl_dataset_destroy_sync() to destroy the head dataset.
		 */
		rw_enter(&dp->dp_config_rwlock, RW_READER);
		pds = dsl_dir_open_obj(dd->dd_pool,
		    dd->dd_phys->dd_parent_obj, NULL, FTAG);
		dsl_dir_close(dd, FTAG);
		rw_exit(&dp->dp_config_rwlock);

		(void) strcpy(buf, name);
		/*
		 * dd has a parent (checked above), so "name" should
		 * contain a '/'.  NOTE(review): strrchr() returning
		 * NULL here would be fatal -- confirm callers always
		 * pass a multi-component name.
		 */
		cp = strrchr(buf, '/') + 1;
		ASSERT(cp[0] != '\0');
		do {
			txg_wait_synced(dp, 0);
			err = dsl_dir_sync_task(pds,
			    dsl_dir_destroy_sync, cp, 0);
		} while (err == EAGAIN);
		dsl_dir_close(pds, FTAG);
	}

	return (err);
}
+
+int
+dsl_dataset_rollback(const char *name)
+{
+ int err;
+ dsl_dir_t *dd;
+ const char *tail;
+
+ dd = dsl_dir_open(name, FTAG, &tail);
+ if (dd == NULL)
+ return (ENOENT);
+
+ if (tail != NULL) {
+ dsl_dir_close(dd, FTAG);
+ return (EINVAL);
+ }
+ do {
+ txg_wait_synced(dd->dd_pool, 0);
+ err = dsl_dir_sync_task(dd,
+ dsl_dataset_rollback_sync, NULL, 0);
+ } while (err == EAGAIN);
+ dsl_dir_close(dd, FTAG);
+
+ return (err);
+}
+
+void *
+dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
+ void *p, dsl_dataset_evict_func_t func)
+{
+ void *old;
+
+ mutex_enter(&ds->ds_lock);
+ old = ds->ds_user_ptr;
+ if (old == NULL) {
+ ds->ds_user_ptr = p;
+ ds->ds_user_evict_func = func;
+ }
+ mutex_exit(&ds->ds_lock);
+ return (old);
+}
+
+void *
+dsl_dataset_get_user_ptr(dsl_dataset_t *ds)
+{
+ return (ds->ds_user_ptr);
+}
+
+
+void
+dsl_dataset_get_blkptr(dsl_dataset_t *ds, blkptr_t *bp)
+{
+ *bp = ds->ds_phys->ds_bp;
+}
+
+void
+dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+ /* If it's the meta-objset, set dp_meta_rootbp */
+ if (ds == NULL) {
+ tx->tx_pool->dp_meta_rootbp = *bp;
+ } else {
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_phys->ds_bp = *bp;
+ }
+}
+
+spa_t *
+dsl_dataset_get_spa(dsl_dataset_t *ds)
+{
+ return (ds->ds_dir->dd_pool->dp_spa);
+}
+
+void
+dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp;
+
+ if (ds == NULL) /* this is the meta-objset */
+ return;
+
+ ASSERT(ds->ds_user_ptr != NULL);
+ ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
+
+ dp = ds->ds_dir->dd_pool;
+
+ if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
+ /* up the hold count until we can be written out */
+ dmu_buf_add_ref(ds->ds_dbuf, ds);
+ }
+}
+
/*
 * Context handed through traverse_dsl_dataset() to kill_blkptr():
 * accumulators for the space being freed, plus the zio and tx used
 * to issue the frees.
 */
struct killarg {
	uint64_t *usedp;		/* accumulates BP_GET_ASIZE */
	uint64_t *compressedp;		/* accumulates BP_GET_PSIZE */
	uint64_t *uncompressedp;	/* accumulates BP_GET_UCSIZE */
	zio_t *zio;			/* parent zio for arc_free() calls */
	dmu_tx_t *tx;			/* txg in which blocks are freed */
};
+
+static int
+kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
+{
+ struct killarg *ka = arg;
+ blkptr_t *bp = &bc->bc_blkptr;
+
+ ASSERT3U(bc->bc_errno, ==, 0);
+
+ /*
+ * Since this callback is not called concurrently, no lock is
+ * needed on the accounting values.
+ */
+ *ka->usedp += BP_GET_ASIZE(bp);
+ *ka->compressedp += BP_GET_PSIZE(bp);
+ *ka->uncompressedp += BP_GET_UCSIZE(bp);
+ /* XXX check for EIO? */
+ (void) arc_free(ka->zio, spa, ka->tx->tx_txg, bp, NULL, NULL,
+ ARC_NOWAIT);
+ return (0);
+}
+
/*
 * Sync task: roll dd's head dataset back to its most recent snapshot.
 * Frees all blocks born since that snapshot and resets the dataset's
 * contents and stats to the snapshot's.  Returns 0, EINVAL (no head
 * dataset or no snapshot to roll back to), EBUSY (dataset is open),
 * or EAGAIN (dataset changed this txg; caller should retry).
 */
/* ARGSUSED */
int
dsl_dataset_rollback_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
{
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	dsl_dataset_t *ds;

	if (dd->dd_phys->dd_head_dataset_obj == 0)
		return (EINVAL);
	ds = dsl_dataset_open_obj(dd->dd_pool,
	    dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG);

	if (ds->ds_phys->ds_prev_snap_txg == 0) {
		/*
		 * There's no previous snapshot.  I suppose we could
		 * roll it back to being empty (and re-initialize the
		 * upper (ZPL) layer).  But for now there's no way to do
		 * this via the user interface.
		 */
		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
		return (EINVAL);
	}

	mutex_enter(&ds->ds_lock);
	if (ds->ds_open_refcount > 0) {
		mutex_exit(&ds->ds_lock);
		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
		return (EBUSY);
	}

	/*
	 * If we made changes this txg, traverse_dsl_dataset won't find
	 * them.  Try again.
	 */
	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) {
		mutex_exit(&ds->ds_lock);
		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
		return (EAGAIN);
	}

	/* THE POINT OF NO (unsuccessful) RETURN */
	/* Pin the refcount at max so nobody else can open the dataset. */
	ds->ds_open_refcount = DOS_REF_MAX;
	mutex_exit(&ds->ds_lock);

	dmu_buf_will_dirty(ds->ds_dbuf, tx);

	/* Zero out the deadlist. */
	dprintf("old deadlist obj = %llx\n", ds->ds_phys->ds_deadlist_obj);
	bplist_close(&ds->ds_deadlist);
	bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
	ds->ds_phys->ds_deadlist_obj =
	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
	bplist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
	dprintf("new deadlist obj = %llx\n", ds->ds_phys->ds_deadlist_obj);

	{
		/* Free blkptrs that we gave birth to */
		zio_t *zio;
		uint64_t used = 0, compressed = 0, uncompressed = 0;
		struct killarg ka;

		zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
		    ZIO_FLAG_MUSTSUCCEED);
		ka.usedp = &used;
		ka.compressedp = &compressed;
		ka.uncompressedp = &uncompressed;
		ka.zio = zio;
		ka.tx = tx;
		(void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
		    ADVANCE_POST, kill_blkptr, &ka);
		(void) zio_wait(zio);

		/* Credit the freed space back to the dsl_dir. */
		dsl_dir_diduse_space(dd,
		    -used, -compressed, -uncompressed, tx);
	}

	/* Change our contents to that of the prev snapshot (finally!) */
	ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj);
	ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
	ds->ds_phys->ds_used_bytes = ds->ds_prev->ds_phys->ds_used_bytes;
	ds->ds_phys->ds_compressed_bytes =
	    ds->ds_prev->ds_phys->ds_compressed_bytes;
	ds->ds_phys->ds_uncompressed_bytes =
	    ds->ds_prev->ds_phys->ds_uncompressed_bytes;
	ds->ds_phys->ds_restoring = ds->ds_prev->ds_phys->ds_restoring;
	ds->ds_phys->ds_unique_bytes = 0;

	/* The head and the snapshot now share everything. */
	dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
	ds->ds_prev->ds_phys->ds_unique_bytes = 0;

	dprintf("new deadlist obj = %llx\n", ds->ds_phys->ds_deadlist_obj);
	/* Drop the pin; the dataset may be opened again. */
	ds->ds_open_refcount = 0;
	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);

	return (0);
}
+
+int
+dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+ const char *snapname = arg;
+ uint64_t used = 0, compressed = 0, uncompressed = 0;
+ blkptr_t bp;
+ zio_t *zio;
+ int err;
+ int after_branch_point = FALSE;
+ int drop_lock = FALSE;
+ dsl_pool_t *dp = dd->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ dsl_dataset_t *ds, *ds_prev = NULL;
+ uint64_t obj;
+
+ if (dd->dd_phys->dd_head_dataset_obj == 0)
+ return (EINVAL);
+
+ if (!RW_WRITE_HELD(&dp->dp_config_rwlock)) {
+ rw_enter(&dp->dp_config_rwlock, RW_WRITER);
+ drop_lock = TRUE;
+ }
+
+ ds = dsl_dataset_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj, NULL,
+ snapname ? DS_MODE_NONE : DS_MODE_EXCLUSIVE, FTAG);
+
+ if (snapname) {
+ err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj,
+ snapname, 8, 1, &obj);
+ dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+ if (err) {
+ if (drop_lock)
+ rw_exit(&dp->dp_config_rwlock);
+ return (err);
+ }
+
+ ds = dsl_dataset_open_obj(dd->dd_pool, obj, NULL,
+ DS_MODE_EXCLUSIVE, FTAG);
+ }
+ if (ds == NULL) {
+ if (drop_lock)
+ rw_exit(&dp->dp_config_rwlock);
+ return (EBUSY);
+ }
+
+ obj = ds->ds_object;
+
+ /* Can't delete a branch point. */
+ if (ds->ds_phys->ds_num_children > 1) {
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ if (drop_lock)
+ rw_exit(&dp->dp_config_rwlock);
+ return (EINVAL);
+ }
+
+ /*
+ * Can't delete a head dataset if there are snapshots of it.
+ * (Except if the only snapshots are from the branch we cloned
+ * from.)
+ */
+ if (ds->ds_prev != NULL &&
+ ds->ds_prev->ds_phys->ds_next_snap_obj == obj) {
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ if (drop_lock)
+ rw_exit(&dp->dp_config_rwlock);
+ return (EINVAL);
+ }
+
+ /*
+ * If we made changes this txg, traverse_dsl_dataset won't find
+ * them. Try again.
+ */
+ if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) {
+ mutex_exit(&ds->ds_lock);
+ dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+ return (EAGAIN);
+ }
+
+ /* THE POINT OF NO (unsuccessful) RETURN */
+
+ if (ds->ds_phys->ds_prev_snap_obj != 0) {
+ if (ds->ds_prev) {
+ ds_prev = ds->ds_prev;
+ } else {
+ ds_prev = dsl_dataset_open_obj(dd->dd_pool,
+ ds->ds_phys->ds_prev_snap_obj, NULL,
+ DS_MODE_NONE, FTAG);
+ }
+ after_branch_point =
+ (ds_prev->ds_phys->ds_next_snap_obj != obj);
+
+ dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
+ if (after_branch_point &&
+ ds->ds_phys->ds_next_snap_obj == 0) {
+ /* This clone is toast. */
+ ASSERT(ds_prev->ds_phys->ds_num_children > 1);
+ ds_prev->ds_phys->ds_num_children--;
+ } else if (!after_branch_point) {
+ ds_prev->ds_phys->ds_next_snap_obj =
+ ds->ds_phys->ds_next_snap_obj;
+ }
+ }
+
+ ASSERT3P(tx->tx_pool, ==, dd->dd_pool);
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+
+ if (ds->ds_phys->ds_next_snap_obj != 0) {
+ dsl_dataset_t *ds_next;
+ uint64_t itor = 0;
+
+ spa_scrub_restart(dp->dp_spa, tx->tx_txg);
+
+ ds_next = dsl_dataset_open_obj(dd->dd_pool,
+ ds->ds_phys->ds_next_snap_obj, NULL, DS_MODE_NONE, FTAG);
+ ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
+
+ dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
+ ds_next->ds_phys->ds_prev_snap_obj =
+ ds->ds_phys->ds_prev_snap_obj;
+ ds_next->ds_phys->ds_prev_snap_txg =
+ ds->ds_phys->ds_prev_snap_txg;
+ ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
+ ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
+
+ /*
+ * Transfer to our deadlist (which will become next's
+ * new deadlist) any entries from next's current
+ * deadlist which were born before prev, and free the
+ * other entries.
+ *
+ * XXX we're doing this long task with the config lock held
+ */
+ while (bplist_iterate(&ds_next->ds_deadlist, &itor,
+ &bp) == 0) {
+ if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
+ bplist_enqueue(&ds->ds_deadlist, &bp, tx);
+ if (ds_prev && !after_branch_point &&
+ bp.blk_birth >
+ ds_prev->ds_phys->ds_prev_snap_txg) {
+ ds_prev->ds_phys->ds_unique_bytes +=
+ BP_GET_ASIZE(&bp);
+ }
+ } else {
+ used += BP_GET_ASIZE(&bp);
+ compressed += BP_GET_PSIZE(&bp);
+ uncompressed += BP_GET_UCSIZE(&bp);
+ /* XXX check return value? */
+ (void) arc_free(zio, dp->dp_spa, tx->tx_txg,
+ &bp, NULL, NULL, ARC_NOWAIT);
+ }
+ }
+
+ /* free next's deadlist */
+ bplist_close(&ds_next->ds_deadlist);
+ bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);
+
+ /* set next's deadlist to our deadlist */
+ ds_next->ds_phys->ds_deadlist_obj =
+ ds->ds_phys->ds_deadlist_obj;
+ bplist_open(&ds_next->ds_deadlist, mos,
+ ds_next->ds_phys->ds_deadlist_obj);
+ ds->ds_phys->ds_deadlist_obj = 0;
+
+ if (ds_next->ds_phys->ds_next_snap_obj != 0) {
+ /*
+ * Update next's unique to include blocks which
+ * were previously shared by only this snapshot
+ * and it. Those blocks will be born after the
+ * prev snap and before this snap, and will have
+ * died after the next snap and before the one
+ * after that (ie. be on the snap after next's
+ * deadlist).
+ *
+ * XXX we're doing this long task with the
+ * config lock held
+ */
+ dsl_dataset_t *ds_after_next;
+
+ ds_after_next = dsl_dataset_open_obj(dd->dd_pool,
+ ds_next->ds_phys->ds_next_snap_obj, NULL,
+ DS_MODE_NONE, FTAG);
+ itor = 0;
+ while (bplist_iterate(&ds_after_next->ds_deadlist,
+ &itor, &bp) == 0) {
+ if (bp.blk_birth >
+ ds->ds_phys->ds_prev_snap_txg &&
+ bp.blk_birth <=
+ ds->ds_phys->ds_creation_txg) {
+ ds_next->ds_phys->ds_unique_bytes +=
+ BP_GET_ASIZE(&bp);
+ }
+ }
+
+ dsl_dataset_close(ds_after_next, DS_MODE_NONE, FTAG);
+ ASSERT3P(ds_next->ds_prev, ==, NULL);
+ } else {
+ /*
+ * It would be nice to update the head dataset's
+ * unique. To do so we would have to traverse
+ * it for blocks born after ds_prev, which is
+ * pretty expensive just to maintain something
+ * for debugging purposes.
+ */
+ ASSERT3P(ds_next->ds_prev, ==, ds);
+ dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE,
+ ds_next);
+ if (ds_prev) {
+ ds_next->ds_prev = dsl_dataset_open_obj(
+ dd->dd_pool, ds->ds_phys->ds_prev_snap_obj,
+ NULL, DS_MODE_NONE, ds_next);
+ } else {
+ ds_next->ds_prev = NULL;
+ }
+ }
+ dsl_dataset_close(ds_next, DS_MODE_NONE, FTAG);
+
+ /*
+ * NB: unique_bytes is not accurate for head objsets
+ * because we don't update it when we delete the most
+ * recent snapshot -- see above comment.
+ */
+ ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
+ } else {
+ /*
+ * There's no next snapshot, so this is a head dataset.
+ * Destroy the deadlist. Unless it's a clone, the
+ * deadlist should be empty. (If it's a clone, it's
+ * safe to ignore the deadlist contents.)
+ */
+ struct killarg ka;
+
+ ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist));
+ bplist_close(&ds->ds_deadlist);
+ bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
+ ds->ds_phys->ds_deadlist_obj = 0;
+
+ /*
+ * Free everything that we point to (that's born after
+ * the previous snapshot, if we are a clone)
+ *
+ * XXX we're doing this long task with the config lock held
+ */
+ ka.usedp = &used;
+ ka.compressedp = &compressed;
+ ka.uncompressedp = &uncompressed;
+ ka.zio = zio;
+ ka.tx = tx;
+ err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
+ ADVANCE_POST, kill_blkptr, &ka);
+ ASSERT3U(err, ==, 0);
+ }
+
+ err = zio_wait(zio);
+ ASSERT3U(err, ==, 0);
+
+ dsl_dir_diduse_space(dd, -used, -compressed, -uncompressed, tx);
+
+ if (ds->ds_phys->ds_snapnames_zapobj) {
+ err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
+ ASSERT(err == 0);
+ }
+
+ if (dd->dd_phys->dd_head_dataset_obj == ds->ds_object) {
+ /* Erase the link in the dataset */
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dd->dd_phys->dd_head_dataset_obj = 0;
+ /*
+ * dsl_dir_sync_destroy() called us, they'll destroy
+ * the dataset.
+ */
+ } else {
+ /* remove from snapshot namespace */
+ dsl_dataset_t *ds_head;
+ ds_head = dsl_dataset_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG);
+#ifdef ZFS_DEBUG
+ {
+ uint64_t val;
+ err = zap_lookup(mos,
+ ds_head->ds_phys->ds_snapnames_zapobj,
+ snapname, 8, 1, &val);
+ ASSERT3U(err, ==, 0);
+ ASSERT3U(val, ==, obj);
+ }
+#endif
+ err = zap_remove(mos, ds_head->ds_phys->ds_snapnames_zapobj,
+ snapname, tx);
+ ASSERT(err == 0);
+ dsl_dataset_close(ds_head, DS_MODE_NONE, FTAG);
+ }
+
+ if (ds_prev && ds->ds_prev != ds_prev)
+ dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG);
+
+ err = dmu_object_free(mos, obj, tx);
+ ASSERT(err == 0);
+
+ /*
+ * Close the objset with mode NONE, thus leaving it with
+ * DOS_REF_MAX set, so that noone can access it.
+ */
+ dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+
+ if (drop_lock)
+ rw_exit(&dp->dp_config_rwlock);
+ return (0);
+}
+
/*
 * Sync task: take a snapshot named "arg" of dd's head dataset.
 * Allocates a new dataset object capturing the head's current bp and
 * stats, links it into the snapshot chain, and gives the head a fresh
 * (empty) deadlist.  Returns 0, EINVAL (no head dataset), or EEXIST
 * (snapshot name already in use).
 */
int
dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
{
	const char *snapname = arg;
	dsl_pool_t *dp = dd->dd_pool;
	dmu_buf_t *dbuf;
	dsl_dataset_phys_t *dsphys;
	uint64_t dsobj, value;
	objset_t *mos = dp->dp_meta_objset;
	dsl_dataset_t *ds;
	int err;

	ASSERT(dmu_tx_is_syncing(tx));

	if (dd->dd_phys->dd_head_dataset_obj == 0)
		return (EINVAL);
	ds = dsl_dataset_open_obj(dp, dd->dd_phys->dd_head_dataset_obj, NULL,
	    DS_MODE_NONE, FTAG);

	/* Make sure the requested snapshot name is not already taken. */
	err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj,
	    snapname, 8, 1, &value);
	if (err == 0) {
		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
		return (EEXIST);
	}
	ASSERT(err == ENOENT);

	/* The point of no (unsuccessful) return */

	dprintf_dd(dd, "taking snapshot %s in txg %llu\n",
	    snapname, tx->tx_txg);

	spa_scrub_restart(dp->dp_spa, tx->tx_txg);

	rw_enter(&dp->dp_config_rwlock, RW_WRITER);

	/* The snapshot object is a copy of the head's current state. */
	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_OBJSET, 0,
	    DMU_OT_DSL_OBJSET, sizeof (dsl_dataset_phys_t), tx);
	dbuf = dmu_bonus_hold(mos, dsobj);
	dmu_buf_will_dirty(dbuf, tx);
	dsphys = dbuf->db_data;
	dsphys->ds_dir_obj = dd->dd_object;
	dsphys->ds_fsid_guid = unique_create();
	unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
	    sizeof (dsphys->ds_guid));
	dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
	dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
	dsphys->ds_next_snap_obj = ds->ds_object;
	dsphys->ds_num_children = 1;
	dsphys->ds_creation_time = gethrestime_sec();
	dsphys->ds_creation_txg = tx->tx_txg;
	/* The snapshot inherits the head's deadlist; head gets a new one. */
	dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
	dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
	dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
	dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
	dsphys->ds_restoring = ds->ds_phys->ds_restoring;
	dsphys->ds_bp = ds->ds_phys->ds_bp;
	dmu_buf_rele(dbuf);

	if (ds->ds_phys->ds_prev_snap_obj != 0) {
		/* Splice the new snapshot in after the previous one. */
		dsl_dataset_t *ds_prev;

		ds_prev = dsl_dataset_open_obj(dp,
		    ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_NONE, FTAG);
		ASSERT(ds_prev->ds_phys->ds_next_snap_obj ==
		    ds->ds_object ||
		    ds_prev->ds_phys->ds_num_children > 1);
		if (ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
			dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
			ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
			    ds_prev->ds_phys->ds_creation_txg);
			ds_prev->ds_phys->ds_next_snap_obj = dsobj;
		}
		dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG);
	} else {
		ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, 0);
	}

	bplist_close(&ds->ds_deadlist);
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, dsphys->ds_creation_txg);
	ds->ds_phys->ds_prev_snap_obj = dsobj;
	ds->ds_phys->ds_prev_snap_txg = dsphys->ds_creation_txg;
	ds->ds_phys->ds_unique_bytes = 0;
	ds->ds_phys->ds_deadlist_obj =
	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
	bplist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);

	dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
	err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
	    snapname, 8, 1, &dsobj, tx);
	ASSERT(err == 0);

	/* Re-point ds_prev at the newly created snapshot. */
	if (ds->ds_prev)
		dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
	ds->ds_prev = dsl_dataset_open_obj(dp,
	    ds->ds_phys->ds_prev_snap_obj, snapname, DS_MODE_NONE, ds);

	rw_exit(&dp->dp_config_rwlock);
	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);

	return (0);
}
+
/*
 * Write out this (head) dataset's dirty state for the txg.  Called
 * from syncing context for each dataset on the pool's dirty list.
 */
void
dsl_dataset_sync(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(ds->ds_user_ptr != NULL);
	/* Only heads (no next snapshot) are ever dirtied. */
	ASSERT(ds->ds_phys->ds_next_snap_obj == 0);

	dmu_objset_sync(ds->ds_user_ptr, tx);
	dsl_dir_dirty(ds->ds_dir, tx);
	bplist_close(&ds->ds_deadlist);

	/* Release the hold taken by dsl_dataset_dirty(). */
	dmu_buf_remove_ref(ds->ds_dbuf, ds);
}
+
+void
+dsl_dataset_stats(dsl_dataset_t *ds, dmu_objset_stats_t *dds)
+{
+ /* fill in properties crap */
+ dsl_dir_stats(ds->ds_dir, dds);
+
+ if (ds->ds_phys->ds_num_children != 0) {
+ dds->dds_is_snapshot = TRUE;
+ dds->dds_num_clones = ds->ds_phys->ds_num_children - 1;
+ }
+
+ dds->dds_last_txg = ds->ds_phys->ds_bp.blk_birth;
+
+ dds->dds_objects_used = ds->ds_phys->ds_bp.blk_fill;
+ dds->dds_objects_avail = DN_MAX_OBJECT - dds->dds_objects_used;
+
+ /* We override the dataset's creation time... they should be the same */
+ dds->dds_creation_time = ds->ds_phys->ds_creation_time;
+ dds->dds_creation_txg = ds->ds_phys->ds_creation_txg;
+ dds->dds_space_refd = ds->ds_phys->ds_used_bytes;
+ dds->dds_fsid_guid = ds->ds_phys->ds_fsid_guid;
+ dds->dds_guid = ds->ds_phys->ds_guid;
+
+ if (ds->ds_phys->ds_next_snap_obj) {
+ /*
+ * This is a snapshot; override the dd's space used with
+ * our unique space
+ */
+ dds->dds_space_used = ds->ds_phys->ds_unique_bytes;
+ dds->dds_compressed_bytes =
+ ds->ds_phys->ds_compressed_bytes;
+ dds->dds_uncompressed_bytes =
+ ds->ds_phys->ds_uncompressed_bytes;
+ }
+
+ dds->dds_objset_obj = ds->ds_object;
+}
+
+dsl_pool_t *
+dsl_dataset_pool(dsl_dataset_t *ds)
+{
+ return (ds->ds_dir->dd_pool);
+}
+
/*
 * Argument bundle for dsl_dataset_snapshot_rename_sync(): the full
 * old and new names of the snapshot being renamed.
 */
struct osrenamearg {
	const char *oldname;	/* e.g. "pool/fs@old" */
	const char *newname;	/* e.g. "pool/fs@new" */
};
+
+static int
+dsl_dataset_snapshot_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+ struct osrenamearg *ora = arg;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ dsl_dir_t *nds;
+ const char *tail;
+ int err;
+ dsl_dataset_t *snds, *fsds;
+ uint64_t val;
+
+ err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, ora->oldname,
+ DS_MODE_READONLY | DS_MODE_STANDARD, FTAG, &snds);
+ if (err)
+ return (err);
+
+ if (snds->ds_dir != dd) {
+ dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
+ return (EINVAL);
+ }
+
+ /* better be changing a snapshot */
+ if (snds->ds_phys->ds_next_snap_obj == 0) {
+ dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
+ return (EINVAL);
+ }
+
+ /* new fs better exist */
+ nds = dsl_dir_open_spa(dd->dd_pool->dp_spa, ora->newname, FTAG, &tail);
+ if (nds == NULL) {
+ dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
+ return (ENOENT);
+ }
+
+ dsl_dir_close(nds, FTAG);
+
+ /* new name better be in same fs */
+ if (nds != dd) {
+ dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
+ return (EINVAL);
+ }
+
+ /* new name better be a snapshot */
+ if (tail == NULL || tail[0] != '@') {
+ dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
+ return (EINVAL);
+ }
+
+ tail++;
+
+ fsds = dsl_dataset_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG);
+
+ /* new name better not be in use */
+ err = zap_lookup(mos, fsds->ds_phys->ds_snapnames_zapobj,
+ tail, 8, 1, &val);
+ if (err != ENOENT) {
+ if (err == 0)
+ err = EEXIST;
+ dsl_dataset_close(fsds, DS_MODE_NONE, FTAG);
+ dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
+ return (EEXIST);
+ }
+
+ /* The point of no (unsuccessful) return */
+
+ rw_enter(&dd->dd_pool->dp_config_rwlock, RW_WRITER);
+ dsl_dataset_get_snapname(snds);
+ err = zap_remove(mos, fsds->ds_phys->ds_snapnames_zapobj,
+ snds->ds_snapname, tx);
+ ASSERT3U(err, ==, 0);
+ mutex_enter(&snds->ds_lock);
+ (void) strcpy(snds->ds_snapname, tail);
+ mutex_exit(&snds->ds_lock);
+ err = zap_add(mos, fsds->ds_phys->ds_snapnames_zapobj,
+ snds->ds_snapname, 8, 1, &snds->ds_object, tx);
+ ASSERT3U(err, ==, 0);
+ rw_exit(&dd->dd_pool->dp_config_rwlock);
+
+ dsl_dataset_close(fsds, DS_MODE_NONE, FTAG);
+ dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
+ return (0);
+}
+
+#pragma weak dmu_objset_rename = dsl_dataset_rename
+int
+dsl_dataset_rename(const char *osname, const char *newname)
+{
+ dsl_dir_t *dd;
+ const char *tail;
+ struct osrenamearg ora;
+ int err;
+
+ dd = dsl_dir_open(osname, FTAG, &tail);
+ if (dd == NULL)
+ return (ENOENT);
+ if (tail == NULL) {
+ err = dsl_dir_sync_task(dd,
+ dsl_dir_rename_sync, (void*)newname, 1<<12);
+ dsl_dir_close(dd, FTAG);
+ return (err);
+ }
+ if (tail[0] != '@') {
+ /* the name ended in a nonexistant component */
+ dsl_dir_close(dd, FTAG);
+ return (ENOENT);
+ }
+
+ ora.oldname = osname;
+ ora.newname = newname;
+
+ err = dsl_dir_sync_task(dd,
+ dsl_dataset_snapshot_rename_sync, &ora, 1<<12);
+ dsl_dir_close(dd, FTAG);
+ return (err);
+}
diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c
new file mode 100644
index 0000000000..3b0d32de70
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c
@@ -0,0 +1,1217 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/arc.h>
+#include "zfs_namecheck.h"
+
+static uint64_t dsl_dir_space_accounted(dsl_dir_t *dd);
+static uint64_t dsl_dir_estimated_space(dsl_dir_t *dd);
+static int dsl_dir_set_reservation_sync(dsl_dir_t *dd,
+ void *arg, dmu_tx_t *tx);
+static uint64_t dsl_dir_space_available(dsl_dir_t *dd,
+ dsl_dir_t *ancestor, int64_t delta, int ondiskonly);
+
+
+/* ARGSUSED */
+static void
+dsl_dir_evict(dmu_buf_t *db, void *arg)
+{
+ dsl_dir_t *dd = arg;
+ dsl_pool_t *dp = dd->dd_pool;
+ int t;
+
+ for (t = 0; t < TXG_SIZE; t++) {
+ ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
+ ASSERT(dd->dd_tempreserved[t] == 0);
+ ASSERT(dd->dd_space_towrite[t] == 0);
+ }
+
+ ASSERT3U(dd->dd_used_bytes, ==, dd->dd_phys->dd_used_bytes);
+
+ ASSERT(dd->dd_sync_txg == 0);
+
+ if (dd->dd_parent)
+ dsl_dir_close(dd->dd_parent, dd);
+
+ spa_close(dd->dd_pool->dp_spa, dd);
+
+ /*
+ * The props callback list should be empty since they hold the
+ * dir open.
+ */
+ list_destroy(&dd->dd_prop_cbs);
+ kmem_free(dd, sizeof (dsl_dir_t));
+}
+
/*
 * Find or instantiate the dsl_dir_t for object "ddobj", adding a hold
 * for "tag" (release with dsl_dir_close()).  "tail", if non-NULL, is
 * the dir's name in its parent's child zap, saving a reverse lookup.
 * Caller must hold dp_config_rwlock or be in syncing context.
 */
dsl_dir_t *
dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
    const char *tail, void *tag)
{
	dmu_buf_t *dbuf;
	dsl_dir_t *dd;

	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
	    dsl_pool_sync_context(dp));

	dbuf = dmu_bonus_hold_tag(dp->dp_meta_objset, ddobj, tag);
	dmu_buf_read(dbuf);
	dd = dmu_buf_get_user(dbuf);
#ifdef ZFS_DEBUG
	{
		dmu_object_info_t doi;
		dmu_object_info_from_db(dbuf, &doi);
		/*
		 * NOTE(review): dir objects apparently share the
		 * DMU_OT_DSL_DATASET type -- confirm against the
		 * allocation in dsl_dir_create_sync().
		 */
		ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DATASET);
	}
#endif
	/* XXX assert bonus buffer size is correct */
	if (dd == NULL) {
		/* First opener: build the in-core structure. */
		dsl_dir_t *winner;
		int err;

		dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
		dd->dd_object = ddobj;
		dd->dd_dbuf = dbuf;
		dd->dd_pool = dp;
		dd->dd_phys = dbuf->db_data;
		dd->dd_used_bytes = dd->dd_phys->dd_used_bytes;

		list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
		    offsetof(dsl_prop_cb_record_t, cbr_node));

		if (dd->dd_phys->dd_parent_obj) {
			dd->dd_parent = dsl_dir_open_obj(dp,
			    dd->dd_phys->dd_parent_obj, NULL, dd);
			if (tail) {
#ifdef ZFS_DEBUG
				uint64_t foundobj;

				err = zap_lookup(dp->dp_meta_objset,
				    dd->dd_parent->dd_phys->
				    dd_child_dir_zapobj,
				    tail, sizeof (foundobj), 1, &foundobj);
				ASSERT3U(err, ==, 0);
				ASSERT3U(foundobj, ==, ddobj);
#endif
				(void) strcpy(dd->dd_myname, tail);
			} else {
				/* Reverse-lookup our name in the parent. */
				err = zap_value_search(dp->dp_meta_objset,
				    dd->dd_parent->dd_phys->
				    dd_child_dir_zapobj,
				    ddobj, dd->dd_myname);
				/*
				 * The caller should be protecting this ddobj
				 * from being deleted concurrently
				 */
				ASSERT(err == 0);
			}
		} else {
			(void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
		}

		/* Another thread may have raced us; defer to the winner. */
		winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
		    dsl_dir_evict);
		if (winner) {
			if (dd->dd_parent)
				dsl_dir_close(dd->dd_parent, dd);
			kmem_free(dd, sizeof (dsl_dir_t));
			dd = winner;
		} else {
			spa_open_ref(dp->dp_spa, dd);
		}
	}

	/*
	 * The dsl_dir_t has both open-to-close and instantiate-to-evict
	 * holds on the spa.  We need the open-to-close holds because
	 * otherwise the spa_refcnt wouldn't change when we open a
	 * dir which the spa also has open, so we could incorrectly
	 * think it was OK to unload/export/destroy the pool.  We need
	 * the instantiate-to-evict hold because the dsl_dir_t has a
	 * pointer to the dd_pool, which has a pointer to the spa_t.
	 */
	spa_open_ref(dp->dp_spa, tag);
	ASSERT3P(dd->dd_pool, ==, dp);
	ASSERT3U(dd->dd_object, ==, ddobj);
	ASSERT3P(dd->dd_dbuf, ==, dbuf);
	return (dd);
}
+
+void
+dsl_dir_close(dsl_dir_t *dd, void *tag)
+{
+ dprintf_dd(dd, "%s\n", "");
+ spa_close(dd->dd_pool->dp_spa, tag);
+ dmu_buf_rele_tag(dd->dd_dbuf, tag);
+}
+
+/* buf must be long enough (MAXNAMELEN should do) */
+void
+dsl_dir_name(dsl_dir_t *dd, char *buf)
+{
+ if (dd->dd_parent) {
+ dsl_dir_name(dd->dd_parent, buf);
+ (void) strcat(buf, "/");
+ } else {
+ buf[0] = '\0';
+ }
+ if (!MUTEX_HELD(&dd->dd_lock)) {
+ /*
+ * recursive mutex so that we can use
+ * dprintf_dd() with dd_lock held
+ */
+ mutex_enter(&dd->dd_lock);
+ (void) strcat(buf, dd->dd_myname);
+ mutex_exit(&dd->dd_lock);
+ } else {
+ (void) strcat(buf, dd->dd_myname);
+ }
+}
+
+int
+dsl_dir_is_private(dsl_dir_t *dd)
+{
+ int rv = FALSE;
+
+ if (dd->dd_parent && dsl_dir_is_private(dd->dd_parent))
+ rv = TRUE;
+ if (dataset_name_hidden(dd->dd_myname))
+ rv = TRUE;
+ return (rv);
+}
+
+
+static int
+getcomponent(const char *path, char *component, const char **nextp)
+{
+ char *p;
+ if (path == NULL)
+ return (NULL);
+ /* This would be a good place to reserve some namespace... */
+ p = strpbrk(path, "/@");
+ if (p && (p[1] == '/' || p[1] == '@')) {
+ /* two separators in a row */
+ return (EINVAL);
+ }
+ if (p == NULL || p == path) {
+ /*
+ * if the first thing is an @ or /, it had better be an
+ * @ and it had better not have any more ats or slashes,
+ * and it had better have something after the @.
+ */
+ if (p != NULL &&
+ (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
+ return (EINVAL);
+ if (strlen(path) >= MAXNAMELEN)
+ return (ENAMETOOLONG);
+ (void) strcpy(component, path);
+ p = NULL;
+ } else if (p[0] == '/') {
+ if (p-path >= MAXNAMELEN)
+ return (ENAMETOOLONG);
+ (void) strncpy(component, path, p - path);
+ component[p-path] = '\0';
+ p++;
+ } else if (p[0] == '@') {
+ /*
+ * if the next separator is an @, there better not be
+ * any more slashes.
+ */
+ if (strchr(path, '/'))
+ return (EINVAL);
+ if (p-path >= MAXNAMELEN)
+ return (ENAMETOOLONG);
+ (void) strncpy(component, path, p - path);
+ component[p-path] = '\0';
+ } else {
+ ASSERT(!"invalid p");
+ }
+ *nextp = p;
+ return (0);
+}
+
/*
 * Same as dsl_dir_open(), but ignore the pool component of "name" and
 * use the given spa instead (opening the pool named by the first
 * component if spa == NULL).
 */
dsl_dir_t *
dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp)
{
	char buf[MAXNAMELEN];
	const char *next, *nextnext = NULL;
	int err;
	dsl_dir_t *dd;
	dsl_pool_t *dp;
	uint64_t ddobj;
	int openedspa = FALSE;

	dprintf("%s\n", name);

	if (name == NULL)
		return (NULL);
	/* Strip off the pool name; "next" points past it. */
	err = getcomponent(name, buf, &next);
	if (err)
		return (NULL);
	if (spa == NULL) {
		err = spa_open(buf, &spa, FTAG);
		if (err) {
			dprintf("spa_open(%s) failed\n", buf);
			return (NULL);
		}
		openedspa = TRUE;

		/* XXX this assertion belongs in spa_open */
		ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa)));
	}

	dp = spa_get_dsl(spa);

	/* Walk down from the root dir one component at a time. */
	rw_enter(&dp->dp_config_rwlock, RW_READER);
	dd = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag);
	while (next != NULL) {
		dsl_dir_t *child_ds;
		err = getcomponent(next, buf, &nextnext);
		if (err) {
			dsl_dir_close(dd, tag);
			if (openedspa)
				spa_close(spa, FTAG);
			return (NULL);
		}
		ASSERT(next[0] != '\0');
		/* Stop at a snapshot component or a childless dir. */
		if (next[0] == '@')
			break;
		if (dd->dd_phys->dd_child_dir_zapobj == 0)
			break;
		dprintf("looking up %s in obj%lld\n",
		    buf, dd->dd_phys->dd_child_dir_zapobj);

		err = zap_lookup(dp->dp_meta_objset,
		    dd->dd_phys->dd_child_dir_zapobj,
		    buf, sizeof (ddobj), 1, &ddobj);
		if (err == ENOENT) {
			break;
		}
		ASSERT(err == 0);

		child_ds = dsl_dir_open_obj(dp, ddobj, buf, tag);
		dsl_dir_close(dd, tag);
		dd = child_ds;
		next = nextnext;
	}
	rw_exit(&dp->dp_config_rwlock);

	/*
	 * It's an error if there's more than one component left, or
	 * tailp==NULL and there's any component left.
	 */
	if (next != NULL &&
	    (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
		/* bad path name */
		dsl_dir_close(dd, tag);
		dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
		next = NULL;
		dd = NULL;
	}
	if (tailp)
		*tailp = next;
	if (openedspa)
		spa_close(spa, FTAG);
	return (dd);
}
+
+/*
+ * Return the dsl_dir_t, and possibly the last component which couldn't
+ * be found in *tail. Return NULL if the path is bogus, or if
+ * tail==NULL and we couldn't parse the whole name. (*tail)[0] == '@'
+ * means that the last component is a snapshot.
+ */
+dsl_dir_t *
+dsl_dir_open(const char *name, void *tag, const char **tailp)
+{
+	/* spa==NULL: the pool is named by the first component of 'name' */
+	return (dsl_dir_open_spa(NULL, name, tag, tailp));
+}
+
+/*
+ * Create a child dsl_dir named 'name' under 'pds', in syncing context.
+ * Allocates the directory object, links it into the parent's child-dir
+ * zap, and initializes its phys block (creation time, parent, props and
+ * child zaps).  Returns 0, EEXIST if the name already exists, or the
+ * zap_lookup error.
+ */
+int
+dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx)
+{
+	objset_t *mos = pds->dd_pool->dp_meta_objset;
+	uint64_t ddobj;
+	dsl_dir_phys_t *dsphys;
+	dmu_buf_t *dbuf;
+	int err;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	/* lazily create the parent's child-dir zap on first child */
+	if (pds->dd_phys->dd_child_dir_zapobj == 0) {
+		dmu_buf_will_dirty(pds->dd_dbuf, tx);
+		pds->dd_phys->dd_child_dir_zapobj = zap_create(mos,
+		    DMU_OT_DSL_DATASET_CHILD_MAP, DMU_OT_NONE, 0, tx);
+	}
+
+	rw_enter(&pds->dd_pool->dp_config_rwlock, RW_WRITER);
+	err = zap_lookup(mos, pds->dd_phys->dd_child_dir_zapobj,
+	    name, sizeof (uint64_t), 1, &ddobj);
+	if (err != ENOENT) {
+		rw_exit(&pds->dd_pool->dp_config_rwlock);
+		return (err ? err : EEXIST);
+	}
+
+	ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
+	    DMU_OT_DSL_DATASET, sizeof (dsl_dir_phys_t), tx);
+	err = zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
+	    name, sizeof (uint64_t), 1, &ddobj, tx);
+	ASSERT3U(err, ==, 0);
+	dprintf("dataset_create: zap_add %s->%lld to %lld returned %d\n",
+	    name, ddobj, pds->dd_phys->dd_child_dir_zapobj, err);
+
+	dbuf = dmu_bonus_hold(mos, ddobj);
+	dmu_buf_will_dirty(dbuf, tx);
+	dsphys = dbuf->db_data;
+
+	dsphys->dd_creation_time = gethrestime_sec();
+	dsphys->dd_parent_obj = pds->dd_object;
+	dsphys->dd_props_zapobj = zap_create(mos,
+	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
+	dsphys->dd_child_dir_zapobj = zap_create(mos,
+	    DMU_OT_DSL_DATASET_CHILD_MAP, DMU_OT_NONE, 0, tx);
+	dmu_buf_rele(dbuf);
+
+	rw_exit(&pds->dd_pool->dp_config_rwlock);
+
+	return (0);
+}
+
+/*
+ * Destroy the child dir named by 'arg' under 'pds', in syncing context.
+ * Fails with EBUSY if the dir is held elsewhere and EEXIST if it still
+ * has children.  Destroys the head dataset (if any), zeroes the
+ * reservation so the parent's accounting is updated, then frees the
+ * dir object and its zaps and unlinks it from the parent.
+ */
+int
+dsl_dir_destroy_sync(dsl_dir_t *pds, void *arg, dmu_tx_t *tx)
+{
+	const char *name = arg;
+	dsl_dir_t *dd = NULL;
+	dsl_pool_t *dp = pds->dd_pool;
+	objset_t *mos = dp->dp_meta_objset;
+	uint64_t val, obj, child_zapobj, props_zapobj;
+	int t, err;
+
+	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
+
+	err = zap_lookup(mos, pds->dd_phys->dd_child_dir_zapobj, name,
+	    8, 1, &obj);
+	if (err)
+		goto out;
+
+	dd = dsl_dir_open_obj(dp, obj, name, FTAG);
+	ASSERT3U(dd->dd_phys->dd_parent_obj, ==, pds->dd_object);
+
+	if (dmu_buf_refcount(dd->dd_dbuf) > 1) {
+		err = EBUSY;
+		goto out;
+	}
+
+	for (t = 0; t < TXG_SIZE; t++) {
+		/*
+		 * if they were dirty, they'd also be open.
+		 * dp_config_rwlock ensures that it stays that way.
+		 */
+		ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
+	}
+
+	child_zapobj = dd->dd_phys->dd_child_dir_zapobj;
+	props_zapobj = dd->dd_phys->dd_props_zapobj;
+
+	if (child_zapobj != 0) {
+		uint64_t count;
+		err = EEXIST;
+		(void) zap_count(mos, child_zapobj, &count);
+		if (count != 0)
+			goto out;
+	}
+
+	if (dd->dd_phys->dd_head_dataset_obj != 0) {
+		err = dsl_dataset_destroy_sync(dd, NULL, tx);
+		if (err)
+			goto out;
+	}
+	ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
+
+	/* The point of no (unsuccessful) return */
+
+	/* Make sure parent's used gets updated */
+	val = 0;
+	err = dsl_dir_set_reservation_sync(dd, &val, tx);
+	ASSERT(err == 0);
+	ASSERT3U(dd->dd_used_bytes, ==, 0);
+	ASSERT3U(dd->dd_phys->dd_reserved, ==, 0);
+	dsl_dir_close(dd, FTAG);
+	dd = NULL;
+
+	err = dmu_object_free(mos, obj, tx);
+	ASSERT(err == 0);
+
+	/*
+	 * NOTE(review): the ASSERTs below are not inside the if bodies;
+	 * when a zapobj is 0 they re-check the previous err (still 0 here).
+	 */
+	if (child_zapobj)
+		err = zap_destroy(mos, child_zapobj, tx);
+	ASSERT(err == 0);
+
+	if (props_zapobj)
+		err = zap_destroy(mos, props_zapobj, tx);
+	ASSERT(err == 0);
+
+	err = zap_remove(mos, pds->dd_phys->dd_child_dir_zapobj, name, tx);
+	ASSERT(err == 0);
+
+out:
+	rw_exit(&dp->dp_config_rwlock);
+	if (dd)
+		dsl_dir_close(dd, FTAG);
+
+	return (err);
+}
+
+/*
+ * Create the root dsl_dir object for a new pool and record it under
+ * DMU_POOL_ROOT_DATASET in the pool directory.  *ddobjp receives the
+ * new object number.
+ */
+void
+dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx)
+{
+	dsl_dir_phys_t *dsp;
+	dmu_buf_t *dbuf;
+	int error;
+
+	*ddobjp = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
+	    DMU_OT_DSL_DATASET, sizeof (dsl_dir_phys_t), tx);
+
+	error = zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET,
+	    sizeof (uint64_t), 1, ddobjp, tx);
+	ASSERT3U(error, ==, 0);
+
+	dbuf = dmu_bonus_hold(mos, *ddobjp);
+	dmu_buf_will_dirty(dbuf, tx);
+	dsp = dbuf->db_data;
+
+	/* the root has no parent obj; everything else starts zeroed */
+	dsp->dd_creation_time = gethrestime_sec();
+	dsp->dd_props_zapobj = zap_create(mos,
+	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
+	dsp->dd_child_dir_zapobj = zap_create(mos,
+	    DMU_OT_DSL_DATASET_CHILD_MAP, DMU_OT_NONE, 0, tx);
+
+	dmu_buf_rele(dbuf);
+}
+
+/*
+ * Fill in *dds with this dir's statistics.  Space/quota fields are read
+ * under dd_lock; the clone-parent name is resolved under the pool
+ * config lock (reader).  dd_creation_time is read without dd_lock --
+ * presumably immutable after creation; confirm before relying on it.
+ */
+void
+dsl_dir_stats(dsl_dir_t *dd, dmu_objset_stats_t *dds)
+{
+	bzero(dds, sizeof (dmu_objset_stats_t));
+
+	dds->dds_dir_obj = dd->dd_object;
+	dds->dds_available = dsl_dir_space_available(dd, NULL, 0, TRUE);
+
+	mutex_enter(&dd->dd_lock);
+	dds->dds_space_used = dd->dd_used_bytes;
+	dds->dds_compressed_bytes = dd->dd_phys->dd_compressed_bytes;
+	dds->dds_uncompressed_bytes = dd->dd_phys->dd_uncompressed_bytes;
+	dds->dds_quota = dd->dd_phys->dd_quota;
+	dds->dds_reserved = dd->dd_phys->dd_reserved;
+	mutex_exit(&dd->dd_lock);
+
+	dds->dds_creation_time = dd->dd_phys->dd_creation_time;
+
+	/* a dir with no head dataset only exists to hold children */
+	dds->dds_is_placeholder = (dd->dd_phys->dd_head_dataset_obj == 0);
+
+	if (dd->dd_phys->dd_clone_parent_obj) {
+		dsl_dataset_t *ds;
+
+		rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+		ds = dsl_dataset_open_obj(dd->dd_pool,
+		    dd->dd_phys->dd_clone_parent_obj, NULL, DS_MODE_NONE, FTAG);
+		dsl_dataset_name(ds, dds->dds_clone_of);
+		dds->dds_clone_of_obj = dd->dd_phys->dd_clone_parent_obj;
+		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+		rw_exit(&dd->dd_pool->dp_config_rwlock);
+	}
+
+	VERIFY(dsl_prop_get_ds_integer(dd, "checksum",
+	    &dds->dds_checksum, dds->dds_checksum_setpoint) == 0);
+
+	VERIFY(dsl_prop_get_ds_integer(dd, "compression",
+	    &dds->dds_compression, dds->dds_compression_setpoint) == 0);
+
+	VERIFY(dsl_prop_get_ds_integer(dd, "zoned",
+	    &dds->dds_zoned, dds->dds_zoned_setpoint) == 0);
+
+	spa_altroot(dd->dd_pool->dp_spa, dds->dds_altroot,
+	    sizeof (dds->dds_altroot));
+}
+
+/*
+ * Schedule func(dd, arg, tx) to run in syncing context, holding a
+ * 'space'-byte transaction reservation.  On ENOSPC/EDQUOT the
+ * reservation is retried against the root dir; if another sync task is
+ * already pending on dd, wait for it to sync and start over.  Blocks
+ * until the task's txg has synced and returns the task's error.
+ */
+int
+dsl_dir_sync_task(dsl_dir_t *dd,
+    int (*func)(dsl_dir_t *, void*, dmu_tx_t *), void *arg, uint64_t space)
+{
+	dmu_tx_t *tx;
+	dsl_pool_t *dp = dd->dd_pool;
+	int err = 0;
+	uint64_t txg;
+
+	dprintf_dd(dd, "func=%p space=%llu\n", func, space);
+
+again:
+	tx = dmu_tx_create_ds(dd);
+	dmu_tx_hold_space(tx, space);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err == ENOSPC || err == EDQUOT) {
+		dsl_dir_t *rds;
+		/*
+		 * They can get their space from either this dd, or the
+		 * root dd.
+		 */
+		for (rds = dd; rds->dd_parent; rds = rds->dd_parent)
+			continue;
+		dmu_tx_abort(tx);
+		tx = dmu_tx_create_ds(rds);
+		dmu_tx_hold_space(tx, space);
+		err = dmu_tx_assign(tx, TXG_WAIT);
+	}
+	if (err) {
+		dmu_tx_abort(tx);
+		return (err);
+	}
+
+	txg = dmu_tx_get_txg(tx);
+	mutex_enter(&dd->dd_lock);
+	if (dd->dd_sync_txg != 0) {
+		/* another sync task is pending; wait and retry from scratch */
+		mutex_exit(&dd->dd_lock);
+		dmu_tx_commit(tx);
+		txg_wait_synced(dp, 0);
+		goto again;
+	}
+
+	/* We're good to go */
+
+	dd->dd_sync_txg = txg;
+	dd->dd_sync_func = func;
+	dd->dd_sync_arg = arg;
+
+	mutex_exit(&dd->dd_lock);
+
+	/* make sure dsl_dir_sync() runs for this dd in 'txg' */
+	dsl_dir_dirty(dd, tx);
+	dmu_tx_commit(tx);
+
+	txg_wait_synced(dp, txg);
+
+	mutex_enter(&dd->dd_lock);
+	ASSERT(dd->dd_sync_txg == txg);
+	ASSERT(dd->dd_sync_func == NULL);
+	err = dd->dd_sync_err;
+	dd->dd_sync_txg = 0;
+	mutex_exit(&dd->dd_lock);
+
+	return (err);
+}
+
+/*
+ * Add dd to this txg's dirty-dir list.  The first add in a txg takes an
+ * extra hold on dd_dbuf, which dsl_dir_sync() releases.
+ */
+void
+dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = dd->dd_pool;
+
+	ASSERT(dd->dd_phys);
+
+	if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) {
+		/* up the hold count until we can be written out */
+		dmu_buf_add_ref(dd->dd_dbuf, dd);
+	}
+}
+
+/*
+ * Given a change of 'delta' bytes in this dir's usage (currently
+ * 'used'), return how much of that change the parent actually sees:
+ * usage below dd_reserved is already charged to the parent.
+ */
+static int64_t
+parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
+{
+	uint64_t reserved = dd->dd_phys->dd_reserved;
+	uint64_t before = MAX(used, reserved);
+	uint64_t after = MAX(used + delta, reserved);
+
+	return (after - before);
+}
+
+/*
+ * Syncing context: run any pending sync task for this txg, push the
+ * in-core used-bytes count into the phys block, clear this txg's write
+ * estimate, and drop the hold taken by dsl_dir_dirty().
+ */
+void
+dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+	if (dd->dd_sync_txg == tx->tx_txg && dd->dd_sync_func) {
+		dd->dd_sync_err = dd->dd_sync_func(dd, dd->dd_sync_arg, tx);
+		/* NULL func signals dsl_dir_sync_task() that we ran */
+		dd->dd_sync_func = NULL;
+	}
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+	mutex_enter(&dd->dd_lock);
+	ASSERT3U(dd->dd_tempreserved[tx->tx_txg&TXG_MASK], ==, 0);
+	dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
+	    dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
+	dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
+	dd->dd_phys->dd_used_bytes = dd->dd_used_bytes;
+	mutex_exit(&dd->dd_lock);
+
+	/* release the hold from dsl_dir_dirty */
+	dmu_buf_remove_ref(dd->dd_dbuf, dd);
+}
+
+/*
+ * Estimate this dir's in-flight usage: on-disk used bytes plus every
+ * txg's pending write estimate.  Caller must hold dd_lock.
+ */
+static uint64_t
+dsl_dir_estimated_space(dsl_dir_t *dd)
+{
+	int64_t space;
+	int i;
+
+	ASSERT(MUTEX_HELD(&dd->dd_lock));
+
+	space = dd->dd_used_bytes;
+	ASSERT(space >= 0);
+	/* i already ranges over [0, TXG_SIZE), so the mask is a no-op */
+	for (i = 0; i < TXG_SIZE; i++) {
+		space += dd->dd_space_towrite[i&TXG_MASK];
+		ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0);
+	}
+	return (space);
+}
+
+/*
+ * How much space would dd have available if ancestor had delta applied
+ * to it? If ondiskonly is set, we're only interested in what's
+ * on-disk, not estimated pending changes.  Recurses up the parent
+ * chain; the result is the minimum of each level's remaining quota and
+ * the parent's availability (plus any unused reservation).
+ */
+static uint64_t
+dsl_dir_space_available(dsl_dir_t *dd,
+    dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
+{
+	uint64_t parentspace, myspace, quota, used;
+
+	/*
+	 * If there are no restrictions otherwise, assume we have
+	 * unlimited space available.
+	 */
+	quota = UINT64_MAX;
+	parentspace = UINT64_MAX;
+
+	if (dd->dd_parent != NULL) {
+		parentspace = dsl_dir_space_available(dd->dd_parent,
+		    ancestor, delta, ondiskonly);
+	}
+
+	mutex_enter(&dd->dd_lock);
+	if (dd->dd_phys->dd_quota != 0)
+		quota = dd->dd_phys->dd_quota;
+	if (ondiskonly) {
+		used = dd->dd_used_bytes;
+	} else {
+		used = dsl_dir_estimated_space(dd);
+	}
+	if (dd == ancestor)
+		used += delta;
+
+	if (dd->dd_parent == NULL) {
+		/* at the root, the quota is capped by the pool's size */
+		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE);
+		quota = MIN(quota, poolsize);
+	}
+
+	if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) {
+		/*
+		 * We have some space reserved, in addition to what our
+		 * parent gave us.
+		 */
+		parentspace += dd->dd_phys->dd_reserved - used;
+	}
+
+	if (used > quota) {
+		/* over quota */
+		myspace = 0;
+#ifdef ZFS_DEBUG
+		{
+			/*
+			 * While it's OK to be a little over quota, if
+			 * we think we are using more space than there
+			 * is in the pool (which is already 6% more than
+			 * dsl_pool_adjustedsize()), something is very
+			 * wrong.
+			 */
+			uint64_t space = spa_get_space(dd->dd_pool->dp_spa);
+			ASSERT3U(used, <=, space);
+		}
+#endif
+	} else {
+		/*
+		 * the lesser of parent's space and the space
+		 * left in our quota
+		 */
+		myspace = MIN(parentspace, quota - used);
+	}
+
+	mutex_exit(&dd->dd_lock);
+
+	return (myspace);
+}
+
+/* one node of a temporary-reservation list (see dsl_dir_tempreserve_space) */
+struct tempreserve {
+	list_node_t tr_node;
+	dsl_dir_t *tr_ds;	/* dir charged; NULL means the ARC was charged */
+	uint64_t tr_size;	/* bytes reserved */
+};
+
+/*
+ * Check dd's quota and charge 'asize' bytes against its per-txg
+ * temporary reservation, then recurse upward so each ancestor is
+ * charged the delta it observes.  Every charge taken is appended to
+ * tr_list so the caller can undo the whole chain on failure.
+ */
+static int
+dsl_dir_tempreserve_impl(dsl_dir_t *dd,
+    uint64_t asize, boolean_t netfree, list_t *tr_list, dmu_tx_t *tx)
+{
+	uint64_t txg = tx->tx_txg;
+	uint64_t est_used, quota, parent_rsrv;
+	int edquot = EDQUOT;
+	int txgidx = txg & TXG_MASK;
+	int i;
+	struct tempreserve *tr;
+
+	ASSERT3U(txg, !=, 0);
+
+	mutex_enter(&dd->dd_lock);
+	/*
+	 * Check against the dsl_dir's quota. We don't add in the delta
+	 * when checking for over-quota because they get one free hit.
+	 */
+	est_used = dsl_dir_estimated_space(dd);
+	for (i = 0; i < TXG_SIZE; i++)
+		est_used += dd->dd_tempreserved[i];
+
+	quota = UINT64_MAX;
+
+	if (dd->dd_phys->dd_quota)
+		quota = dd->dd_phys->dd_quota;
+
+	/*
+	 * If this transaction will result in a net free of space, we want
+	 * to let it through, but we have to be careful: the space that it
+	 * frees won't become available until *after* this txg syncs.
+	 * Therefore, to ensure that it's possible to remove files from
+	 * a full pool without inducing transient overcommits, we throttle
+	 * netfree transactions against a quota that is slightly larger,
+	 * but still within the pool's allocation slop. In cases where
+	 * we're very close to full, this will allow a steady trickle of
+	 * removes to get through.
+	 */
+	if (dd->dd_parent == NULL) {
+		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
+		if (poolsize < quota) {
+			quota = poolsize;
+			/* pool, not user quota, is the limiting factor */
+			edquot = ENOSPC;
+		}
+	} else if (netfree) {
+		quota = UINT64_MAX;
+	}
+
+	/*
+	 * If they are requesting more space, and our current estimate
+	 * is over quota. They get to try again unless the actual
+	 * on-disk is over quota.
+	 */
+	if (asize > 0 && est_used > quota) {
+		if (dd->dd_used_bytes < quota)
+			edquot = ERESTART;	/* only the estimate is over */
+		dprintf_dd(dd, "failing: used=%lluK est_used = %lluK "
+		    "quota=%lluK tr=%lluK err=%d\n",
+		    dd->dd_used_bytes>>10, est_used>>10,
+		    quota>>10, asize>>10, edquot);
+		mutex_exit(&dd->dd_lock);
+		return (edquot);
+	}
+
+	/* We need to up our estimated delta before dropping dd_lock */
+	dd->dd_tempreserved[txgidx] += asize;
+
+	parent_rsrv = parent_delta(dd, est_used, asize);
+	mutex_exit(&dd->dd_lock);
+
+	/* record the charge so the caller can undo it on failure */
+	tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP);
+	tr->tr_ds = dd;
+	tr->tr_size = asize;
+	list_insert_tail(tr_list, tr);
+
+	/* see if it's OK with our parent */
+	if (dd->dd_parent && parent_rsrv) {
+		return (dsl_dir_tempreserve_impl(dd->dd_parent,
+		    parent_rsrv, netfree, tr_list, tx));
+	} else {
+		return (0);
+	}
+}
+
+/*
+ * Reserve space in this dsl_dir, to be used in this tx's txg.
+ * After the space has been dirtied (and thus
+ * dsl_dir_willuse_space() has been called), the reservation should
+ * be canceled, using dsl_dir_tempreserve_clear().
+ *
+ * On success *tr_cookiep receives the reservation list; it (and its
+ * memory) is released by dsl_dir_tempreserve_clear().  On failure any
+ * partial reservations are rolled back here.
+ */
+int
+dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize,
+    uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx)
+{
+	int err = 0;
+	list_t *tr_list;
+
+	tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
+	list_create(tr_list, sizeof (struct tempreserve),
+	    offsetof(struct tempreserve, tr_node));
+
+	/* fsize >= asize means the tx frees at least as much as it writes */
+	err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
+	    tr_list, tx);
+
+	if (err == 0) {
+		struct tempreserve *tr;
+
+		/* also reserve the logical size from the ARC */
+		err = arc_tempreserve_space(lsize);
+		if (err == 0) {
+			tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP);
+			tr->tr_ds = NULL;
+			tr->tr_size = lsize;
+			list_insert_tail(tr_list, tr);
+		}
+	}
+
+	if (err)
+		dsl_dir_tempreserve_clear(tr_list, tx);
+	else
+		*tr_cookiep = tr_list;
+	return (err);
+}
+
+/*
+ * Clear a temporary reservation that we previously made with
+ * dsl_dir_tempreserve_space().  Releases each entry back to its dir
+ * (or to the ARC, for the tr_ds == NULL entry) and frees the list.
+ */
+void
+dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
+{
+	int txgidx = tx->tx_txg & TXG_MASK;
+	list_t *tr_list = tr_cookie;
+	struct tempreserve *tr;
+
+	ASSERT3U(tx->tx_txg, !=, 0);
+
+	while (tr = list_head(tr_list)) {
+		if (tr->tr_ds == NULL) {
+			arc_tempreserve_clear(tr->tr_size);
+		} else {
+			mutex_enter(&tr->tr_ds->dd_lock);
+			ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
+			    tr->tr_size);
+			tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
+			mutex_exit(&tr->tr_ds->dd_lock);
+		}
+		list_remove(tr_list, tr);
+		kmem_free(tr, sizeof (struct tempreserve));
+	}
+
+	kmem_free(tr_list, sizeof (list_t));
+}
+
+/*
+ * Call in open context when we think we're going to write/free space,
+ * eg. when dirtying data. Be conservative (ie. OK to write less than
+ * this or free more than this, but don't write more or free less).
+ * 'space' may be negative for frees; only positive amounts are added
+ * to the per-txg write estimate.
+ */
+void
+dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
+{
+	int64_t parent_space;
+	uint64_t est_used;
+
+	mutex_enter(&dd->dd_lock);
+	if (space > 0)
+		dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
+
+	est_used = dsl_dir_estimated_space(dd);
+	parent_space = parent_delta(dd, est_used, space);
+	mutex_exit(&dd->dd_lock);
+
+	/* Make sure that we clean up dd_space_to* */
+	dsl_dir_dirty(dd, tx);
+
+	/* XXX this is potentially expensive and unnecessary... */
+	if (parent_space && dd->dd_parent)
+		dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
+}
+
+/*
+ * Call from syncing context when we actually write/free space for this
+ * dd.  Updates the in-core and on-disk usage counters and rolls the
+ * accounted delta (see parent_delta()) up into the parent chain.
+ */
+void
+dsl_dir_diduse_space(dsl_dir_t *dd,
+    int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
+{
+	int64_t accounted_delta;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	dsl_dir_dirty(dd, tx);
+
+	mutex_enter(&dd->dd_lock);
+	accounted_delta = parent_delta(dd, dd->dd_used_bytes, used);
+	ASSERT(used >= 0 || dd->dd_used_bytes >= -used);
+	ASSERT(compressed >= 0 ||
+	    dd->dd_phys->dd_compressed_bytes >= -compressed);
+	ASSERT(uncompressed >= 0 ||
+	    dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
+	dd->dd_used_bytes += used;
+	/* the write we estimated in willuse_space has now happened */
+	if (used > 0)
+		dd->dd_space_towrite[tx->tx_txg & TXG_MASK] -= used;
+	dd->dd_phys->dd_uncompressed_bytes += uncompressed;
+	dd->dd_phys->dd_compressed_bytes += compressed;
+	mutex_exit(&dd->dd_lock);
+
+	if (dd->dd_parent != NULL) {
+		dsl_dir_diduse_space(dd->dd_parent,
+		    accounted_delta, compressed, uncompressed, tx);
+	}
+}
+
+/*
+ * Sync task: set dd_quota (0 = no quota).  Rejects with ENOSPC any
+ * nonzero value below the reservation or the current estimated usage.
+ */
+static int
+dsl_dir_set_quota_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+	uint64_t *quotap = arg;
+	uint64_t new_quota = *quotap;
+	int err = 0;
+
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+	mutex_enter(&dd->dd_lock);
+	if (new_quota != 0 && (new_quota < dd->dd_phys->dd_reserved ||
+	    new_quota < dsl_dir_estimated_space(dd))) {
+		err = ENOSPC;
+	} else {
+		dd->dd_phys->dd_quota = new_quota;
+	}
+	mutex_exit(&dd->dd_lock);
+	return (err);
+}
+
+/*
+ * Open the dir named 'ddname' and apply dsl_dir_set_quota_sync() as a
+ * sync task.  Returns ENOENT if the dir doesn't exist.
+ */
+int
+dsl_dir_set_quota(const char *ddname, uint64_t quota)
+{
+	dsl_dir_t *dd;
+	int err;
+
+	dd = dsl_dir_open(ddname, FTAG, NULL);
+	if (dd == NULL)
+		return (ENOENT);
+	/*
+	 * If someone removes a file, then tries to set the quota, we
+	 * want to make sure the file freeing takes effect.
+	 */
+	txg_wait_open(dd->dd_pool, 0);
+
+	err = dsl_dir_sync_task(dd, dsl_dir_set_quota_sync, &quota, 0);
+	dsl_dir_close(dd, FTAG);
+	return (err);
+}
+
+/*
+ * Sync task: set dd_reserved.  Fails with EOVERFLOW for values over
+ * INT64_MAX, and with ENOSPC if the parent can't cover the increase or
+ * the new reservation exceeds our quota.  The accounting delta is
+ * rolled up into the ancestors via dsl_dir_diduse_space().
+ */
+static int
+dsl_dir_set_reservation_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+	uint64_t *reservationp = arg;
+	uint64_t new_reservation = *reservationp;
+	uint64_t used, avail;
+	int64_t delta;
+
+	if (new_reservation > INT64_MAX)
+		return (EOVERFLOW);
+
+	mutex_enter(&dd->dd_lock);
+	/* delta is the change in space charged to our parent */
+	used = dd->dd_used_bytes;
+	delta = MAX(used, new_reservation) -
+	    MAX(used, dd->dd_phys->dd_reserved);
+	mutex_exit(&dd->dd_lock);
+
+	if (dd->dd_parent) {
+		avail = dsl_dir_space_available(dd->dd_parent,
+		    NULL, 0, FALSE);
+	} else {
+		avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
+	}
+
+	if (delta > 0 && delta > avail)
+		return (ENOSPC);
+	if (delta > 0 && dd->dd_phys->dd_quota > 0 &&
+	    new_reservation > dd->dd_phys->dd_quota)
+		return (ENOSPC);
+
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+	dd->dd_phys->dd_reserved = new_reservation;
+
+	if (dd->dd_parent != NULL) {
+		/* Roll up this additional usage into our ancestors */
+		dsl_dir_diduse_space(dd->dd_parent, delta, 0, 0, tx);
+	}
+	return (0);
+}
+
+/*
+ * Open the dir named 'ddname' and apply dsl_dir_set_reservation_sync()
+ * as a sync task.  Returns ENOENT if the dir doesn't exist.
+ */
+int
+dsl_dir_set_reservation(const char *ddname, uint64_t reservation)
+{
+	dsl_dir_t *dd;
+	int err;
+
+	dd = dsl_dir_open(ddname, FTAG, NULL);
+	if (dd == NULL)
+		return (ENOENT);
+	err = dsl_dir_sync_task(dd,
+	    dsl_dir_set_reservation_sync, &reservation, 0);
+	dsl_dir_close(dd, FTAG);
+	return (err);
+}
+
+/*
+ * Return the nearest dir that is an ancestor of (or equal to) both
+ * ds1 and ds2, or NULL if the two share no ancestor.
+ */
+static dsl_dir_t *
+closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
+{
+	dsl_dir_t *a1, *a2;
+
+	/* test each ancestor of ds1, nearest first, against ds2's chain */
+	for (a1 = ds1; a1 != NULL; a1 = a1->dd_parent)
+		for (a2 = ds2; a2 != NULL; a2 = a2->dd_parent)
+			if (a1 == a2)
+				return (a2);
+
+	return (NULL);
+}
+
+/*
+ * If delta is applied to dd, how much of that delta would be applied to
+ * ancestor? Syncing context only.
+ *
+ * NOTE(review): assumes 'ancestor' is on dd's parent chain so the
+ * recursion terminates at dd == ancestor -- confirm callers guarantee
+ * this, otherwise dd->dd_parent would eventually go NULL.
+ */
+static int64_t
+would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
+{
+	if (dd == ancestor)
+		return (delta);
+
+	mutex_enter(&dd->dd_lock);
+	delta = parent_delta(dd, dd->dd_used_bytes, delta);
+	mutex_exit(&dd->dd_lock);
+	return (would_change(dd->dd_parent, delta, ancestor));
+}
+
+/*
+ * Sync task: rename/move dd to 'newname'.  The new parent must exist,
+ * the new leaf name must not, the move must stay within this pool and
+ * must not place dd under its own descendent, and the new parent chain
+ * must have room for dd's space.  Transfers the space accounting from
+ * the old parent chain to the new one.
+ */
+int
+dsl_dir_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+	const char *newname = arg;
+	dsl_pool_t *dp = dd->dd_pool;
+	objset_t *mos = dp->dp_meta_objset;
+	dsl_dir_t *newpds;
+	const char *tail;
+	int err, len;
+
+	/*
+	 * can't rename to different pool
+	 * (fix: the ')' was misplaced -- "strncmp(..., len != 0)" compared
+	 * only the first byte of the pool name)
+	 */
+	len = strlen(dp->dp_root_dir->dd_myname);
+	if (strncmp(dp->dp_root_dir->dd_myname, newname, len) != 0 ||
+	    newname[len] != '/') {
+		return (ENXIO);
+	}
+
+	newpds = dsl_dir_open_spa(dp->dp_spa, newname, FTAG, &tail);
+
+	/* new parent should exist */
+	if (newpds == NULL)
+		return (ENOENT);
+
+	/* new name should not already exist */
+	if (tail == NULL) {
+		dsl_dir_close(newpds, FTAG);
+		return (EEXIST);
+	}
+
+	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
+
+	/* There should be 2 references: the open and the dirty */
+	if (dmu_buf_refcount(dd->dd_dbuf) > 2) {
+		rw_exit(&dp->dp_config_rwlock);
+		dsl_dir_close(newpds, FTAG);
+		return (EBUSY);
+	}
+
+	if (newpds != dd->dd_parent) {
+		dsl_dir_t *ancestor;
+		int64_t adelta;
+		uint64_t myspace, avail;
+
+		ancestor = closest_common_ancestor(dd, newpds);
+
+		/* no rename into our descendent */
+		if (ancestor == dd) {
+			dsl_dir_close(newpds, FTAG);
+			rw_exit(&dp->dp_config_rwlock);
+			return (EINVAL);
+		}
+
+		/* can the new parent chain absorb dd's space? */
+		myspace = MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved);
+		adelta = would_change(dd->dd_parent, -myspace, ancestor);
+		avail = dsl_dir_space_available(newpds,
+		    ancestor, adelta, FALSE);
+		if (avail < myspace) {
+			dsl_dir_close(newpds, FTAG);
+			rw_exit(&dp->dp_config_rwlock);
+			return (ENOSPC);
+		}
+
+		/* The point of no (unsuccessful) return */
+
+		dsl_dir_diduse_space(dd->dd_parent, -myspace,
+		    -dd->dd_phys->dd_compressed_bytes,
+		    -dd->dd_phys->dd_uncompressed_bytes, tx);
+		dsl_dir_diduse_space(newpds, myspace,
+		    dd->dd_phys->dd_compressed_bytes,
+		    dd->dd_phys->dd_uncompressed_bytes, tx);
+	}
+
+	/* The point of no (unsuccessful) return */
+
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+	/* remove from old parent zapobj */
+	err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj,
+	    dd->dd_myname, tx);
+	ASSERT3U(err, ==, 0);
+
+	(void) strcpy(dd->dd_myname, tail);
+	dsl_dir_close(dd->dd_parent, dd);
+	dd->dd_phys->dd_parent_obj = newpds->dd_object;
+	dd->dd_parent = dsl_dir_open_obj(dd->dd_pool,
+	    newpds->dd_object, NULL, dd);
+
+	/* add to new parent zapobj */
+	err = zap_add(mos, newpds->dd_phys->dd_child_dir_zapobj,
+	    dd->dd_myname, 8, 1, &dd->dd_object, tx);
+	ASSERT3U(err, ==, 0);
+
+	dsl_dir_close(newpds, FTAG);
+	rw_exit(&dp->dp_config_rwlock);
+	return (0);
+}
diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c
new file mode 100644
index 0000000000..5b71ccfaa9
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c
@@ -0,0 +1,233 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+
+/* internal reserved dir name */
+#define MOS_DIR_NAME "$MOS"
+
+/* Look up and open the internal $MOS dir under the pool's root dir. */
+static dsl_dir_t *
+dsl_pool_open_mos_dir(dsl_pool_t *dp)
+{
+	uint64_t obj;
+	int err;
+
+	err = zap_lookup(dp->dp_meta_objset,
+	    dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
+	    MOS_DIR_NAME, sizeof (obj), 1, &obj);
+	ASSERT3U(err, ==, 0);
+
+	return (dsl_dir_open_obj(dp, obj, MOS_DIR_NAME, dp));
+}
+
+/*
+ * Allocate and minimally initialize a dsl_pool_t (root bp, txg
+ * machinery, and dirty/synced lists); common to open and create.
+ */
+static dsl_pool_t *
+dsl_pool_open_impl(spa_t *spa, uint64_t txg)
+{
+	dsl_pool_t *dp;
+	blkptr_t *bp = spa_get_rootblkptr(spa);
+
+	dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
+	dp->dp_spa = spa;
+	dp->dp_meta_rootbp = *bp;
+	txg_init(dp, txg);
+
+	txg_list_create(&dp->dp_dirty_datasets,
+	    offsetof(dsl_dataset_t, ds_dirty_link));
+	txg_list_create(&dp->dp_dirty_dirs,
+	    offsetof(dsl_dir_t, dd_dirty_link));
+	list_create(&dp->dp_synced_objsets, sizeof (dsl_dataset_t),
+	    offsetof(dsl_dataset_t, ds_synced_link));
+
+	return (dp);
+}
+
+/*
+ * Open an existing pool's DSL state: open the meta-objset from the
+ * root bp, then look up and open the root dir and $MOS dir.
+ */
+dsl_pool_t *
+dsl_pool_open(spa_t *spa, uint64_t txg)
+{
+	int err;
+	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
+
+	dp->dp_meta_objset =
+	    &dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp)->os;
+
+	rw_enter(&dp->dp_config_rwlock, RW_READER);
+	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
+	    &dp->dp_root_dir_obj);
+	ASSERT3U(err, ==, 0);
+
+	dp->dp_root_dir = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
+	    NULL, dp);
+	dp->dp_mos_dir = dsl_pool_open_mos_dir(dp);
+	rw_exit(&dp->dp_config_rwlock);
+
+	return (dp);
+}
+
+/*
+ * Tear down a dsl_pool_t opened by dsl_pool_open()/dsl_pool_create():
+ * drop dir holds, evict the meta-objset, destroy the lists, flush the
+ * ARC, and free the structure.
+ */
+void
+dsl_pool_close(dsl_pool_t *dp)
+{
+	/* drop our reference from dsl_pool_open() */
+	dsl_dir_close(dp->dp_mos_dir, dp);
+	dsl_dir_close(dp->dp_root_dir, dp);
+
+	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
+	dmu_objset_evict(NULL, dp->dp_meta_objset->os);
+
+	txg_list_destroy(&dp->dp_dirty_datasets);
+	txg_list_destroy(&dp->dp_dirty_dirs);
+	list_destroy(&dp->dp_synced_objsets);
+
+	arc_flush();
+	txg_fini(dp);
+	kmem_free(dp, sizeof (dsl_pool_t));
+}
+
+/*
+ * Create a brand-new pool's DSL state in txg: create the meta-objset,
+ * the pool directory, the root dir, and the $MOS dir, all within one
+ * assigned tx.
+ */
+dsl_pool_t *
+dsl_pool_create(spa_t *spa, uint64_t txg)
+{
+	int err;
+	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
+	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
+	dp->dp_meta_objset = &dmu_objset_create_impl(spa,
+	    NULL, DMU_OST_META, tx)->os;
+
+	/* create the pool directory */
+	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
+	ASSERT3U(err, ==, 0);
+
+	/* create and open the root dir */
+	dsl_dataset_create_root(dp, &dp->dp_root_dir_obj, tx);
+	dp->dp_root_dir = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
+	    NULL, dp);
+
+	/* create and open the meta-objset dir */
+	err = dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME,
+	    tx);
+	ASSERT3U(err, ==, 0);
+	dp->dp_mos_dir = dsl_pool_open_mos_dir(dp);
+
+	dmu_tx_commit(tx);
+
+	return (dp);
+}
+
+/*
+ * Sync all dirty datasets and dirs for 'txg', looping until no new
+ * dirty datasets appear; then, if the MOS itself is dirty, sync it and
+ * publish the new meta-objset root bp to the spa.
+ */
+void
+dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
+{
+	dmu_tx_t *tx;
+	objset_impl_t *mosi = dp->dp_meta_objset->os;
+
+	tx = dmu_tx_create_assigned(dp, txg);
+
+	do {
+		dsl_dir_t *dd;
+		dsl_dataset_t *ds;
+
+		while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
+			/* remember it for dsl_pool_zil_clean() */
+			if (!list_link_active(&ds->ds_synced_link))
+				list_insert_tail(&dp->dp_synced_objsets, ds);
+			dsl_dataset_sync(ds, tx);
+		}
+		while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
+			dsl_dir_sync(dd, tx);
+		/*
+		 * We need to loop since dsl_dir_sync() could create a
+		 * new (dirty) objset.
+		 * XXX - isn't this taken care of by the spa's sync to
+		 * convergence loop?
+		 */
+	} while (!txg_list_empty(&dp->dp_dirty_datasets, txg));
+
+	if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
+	    list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) {
+		dmu_objset_sync(mosi, tx);
+		dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
+		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+	}
+
+	dmu_tx_commit(tx);
+}
+
+/*
+ * Clean the ZIL of every objset synced this txg (the list is populated
+ * by dsl_pool_sync()); asserts each entry has its objset_impl attached.
+ */
+void
+dsl_pool_zil_clean(dsl_pool_t *dp)
+{
+	dsl_dataset_t *ds;
+
+	while (ds = list_head(&dp->dp_synced_objsets)) {
+		list_remove(&dp->dp_synced_objsets, ds);
+		ASSERT(ds->ds_user_ptr != NULL);
+		zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil);
+	}
+}
+
+/* Return nonzero if the current thread is operating in syncing context. */
+int
+dsl_pool_sync_context(dsl_pool_t *dp)
+{
+	/*
+	 * Yeah, this is cheesy. But the SPA needs some way to let
+	 * the sync threads invoke spa_open() and spa_close() while
+	 * it holds the namespace lock. I'm certainly open to better
+	 * ideas for how to determine whether the current thread is
+	 * operating on behalf of spa_sync(). This works for now.
+	 */
+	return (curthread == dp->dp_tx.tx_sync_thread ||
+	    BP_IS_HOLE(&dp->dp_meta_rootbp));
+}
+
+/* Pool size usable for allocation: total space minus a slop reserve. */
+uint64_t
+dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
+{
+	uint64_t total, slop;
+
+	/*
+	 * Reserve about 1% (1/128), or at least 16MB, for allocation
+	 * efficiency.
+	 * XXX The intent log is not accounted for, so it must fit
+	 * within this slop.
+	 *
+	 * If we're trying to assess whether it's OK to do a free,
+	 * cut the reservation in half to allow forward progress
+	 * (e.g. make it possible to rm(1) files from a full pool).
+	 */
+	total = spa_get_space(dp->dp_spa);
+	slop = MAX(total >> 7, SPA_MINDEVSIZE >> 2);
+	if (netfree)
+		slop >>= 1;
+
+	return (total - slop);
+}
diff --git a/usr/src/uts/common/fs/zfs/dsl_prop.c b/usr/src/uts/common/fs/zfs/dsl_prop.c
new file mode 100644
index 0000000000..bd54263507
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dsl_prop.c
@@ -0,0 +1,367 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/spa.h>
+#include <sys/zio_checksum.h> /* for the default checksum value */
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+
+#include "zfs_prop.h"
+
+static int
+dodefault(const char *propname, int intsz, int numint, void *buf)
+{
+ zfs_prop_t prop;
+
+ if ((prop = zfs_name_to_prop(propname)) == ZFS_PROP_INVAL ||
+ zfs_prop_readonly(prop))
+ return (ENOENT);
+
+ if (zfs_prop_get_type(prop) == prop_type_string) {
+ if (intsz != 1)
+ return (EOVERFLOW);
+ zfs_prop_default_string(prop, buf, numint);
+ } else {
+ if (intsz != 8 || numint < 1)
+ return (EOVERFLOW);
+
+ *(uint64_t *)buf = zfs_prop_default_numeric(prop);
+ }
+
+ return (0);
+}
+
/*
 * Look up the named property for the dsl_dir identified by ddobj,
 * walking up the parent chain toward the pool root until an explicit
 * value is found; fall back to the property's default otherwise.
 * If setpoint is non-NULL it receives the name of the dir where the
 * value was found ('\0' if the default was used).
 * Caller must hold dp_config_rwlock (reader or writer).
 */
static int
dsl_prop_get_impl(dsl_pool_t *dp, uint64_t ddobj, const char *propname,
    int intsz, int numint, void *buf, char *setpoint)
{
	int err = 0;
	objset_t *mos = dp->dp_meta_objset;

	if (setpoint)
		setpoint[0] = '\0';

	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock));

	/* Walk from ddobj up to the root dir (parent obj 0). */
	while (ddobj != 0) {
		dsl_dir_t *dd = dsl_dir_open_obj(dp, ddobj, NULL, FTAG);
		err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj,
		    propname, intsz, numint, buf);
		if (err != ENOENT) {
			/* Found here (or hit a real error): stop the walk. */
			if (setpoint)
				dsl_dir_name(dd, setpoint);
			dsl_dir_close(dd, FTAG);
			break;
		}
		ASSERT3U(err, ==, ENOENT);
		/* Read the parent obj before dropping our hold on dd. */
		ddobj = dd->dd_phys->dd_parent_obj;
		dsl_dir_close(dd, FTAG);
	}
	/* Nothing set anywhere in the chain: use the default value. */
	if (err == ENOENT)
		err = dodefault(propname, intsz, numint, buf);

	return (err);
}
+
/*
 * Register interest in the named property.  We'll call the callback
 * once to notify it of the current property value, and again each time
 * the property changes, until this callback is unregistered.
 *
 * Return 0 on success, errno if the prop is not an integer value.
 */
int
dsl_prop_register(dsl_dataset_t *ds, const char *propname,
    dsl_prop_changed_cb_t *callback, void *cbarg)
{
	dsl_dir_t *dd;
	uint64_t value;
	dsl_prop_cb_record_t *cbr;
	int err;

	dd = ds->ds_dir;

	/* The config lock keeps the value stable across lookup + notify. */
	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);

	err = dsl_prop_get_impl(dd->dd_pool, dd->dd_object, propname,
	    8, 1, &value, NULL);
	if (err == ENOENT) {
		/* Not set and no default: report "undefined", not an error. */
		err = 0;
		value = DSL_PROP_VALUE_UNDEFINED;
	}
	if (err != 0) {
		rw_exit(&dd->dd_pool->dp_config_rwlock);
		return (err);
	}

	/*
	 * Record the callback on the dir's list; dsl_prop_changed_notify()
	 * walks this list under dd_lock when the property changes.
	 */
	cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP);
	cbr->cbr_propname = kmem_alloc(strlen(propname)+1, KM_SLEEP);
	(void) strcpy((char *)cbr->cbr_propname, propname);
	cbr->cbr_func = callback;
	cbr->cbr_arg = cbarg;
	mutex_enter(&dd->dd_lock);
	list_insert_head(&dd->dd_prop_cbs, cbr);
	mutex_exit(&dd->dd_lock);

	/* Initial notification with the current (or undefined) value. */
	cbr->cbr_func(cbr->cbr_arg, value);

	/*
	 * Take an extra hold on the dir, tagged with cbr; it is dropped
	 * by dsl_prop_unregister() with the matching tag.
	 */
	(void) dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, cbr);
	rw_exit(&dd->dd_pool->dp_config_rwlock);
	/* Leave dataset open until this callback is unregistered */
	return (0);
}
+
+int
+dsl_prop_get_ds(dsl_dir_t *dd, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint)
+{
+ int err;
+
+ rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+ err = dsl_prop_get_impl(dd->dd_pool, dd->dd_object,
+ propname, intsz, numints, buf, setpoint);
+ rw_exit(&dd->dd_pool->dp_config_rwlock);
+
+ return (err);
+}
+
+int
+dsl_prop_get(const char *ddname, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint)
+{
+ dsl_dir_t *dd;
+ const char *tail;
+ int err;
+
+ dd = dsl_dir_open(ddname, FTAG, &tail);
+ if (dd == NULL)
+ return (ENOENT);
+ if (tail && tail[0] != '@') {
+ dsl_dir_close(dd, FTAG);
+ return (ENOENT);
+ }
+
+ err = dsl_prop_get_ds(dd, propname, intsz, numints, buf, setpoint);
+
+ dsl_dir_close(dd, FTAG);
+ return (err);
+}
+
/*
 * Return 0 on success, ENOENT if ddname is invalid, EOVERFLOW if
 * valuelen not big enough.
 */
int
dsl_prop_get_string(const char *ddname, const char *propname,
    char *value, int valuelen, char *setpoint)
{
	int err;

	/* String props are byte arrays: intsz 1, numints = buffer length. */
	err = dsl_prop_get(ddname, propname, 1, valuelen, value, setpoint);
	return (err);
}
+
+/*
+ * Get the current property value. It may have changed by the time this
+ * function returns, so it is NOT safe to follow up with
+ * dsl_prop_register() and assume that the value has not changed in
+ * between.
+ *
+ * Return 0 on success, ENOENT if ddname is invalid.
+ */
+int
+dsl_prop_get_integer(const char *ddname, const char *propname,
+ uint64_t *valuep, char *setpoint)
+{
+ return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint));
+}
+
+int
+dsl_prop_get_ds_integer(dsl_dir_t *dd, const char *propname,
+ uint64_t *valuep, char *setpoint)
+{
+ return (dsl_prop_get_ds(dd, propname, 8, 1, valuep, setpoint));
+}
+
+/*
+ * Unregister this callback. Return 0 on success, ENOENT if ddname is
+ * invalid, ENOMSG if no matching callback registered.
+ */
+int
+dsl_prop_unregister(dsl_dataset_t *ds, const char *propname,
+ dsl_prop_changed_cb_t *callback, void *cbarg)
+{
+ dsl_dir_t *dd;
+ dsl_prop_cb_record_t *cbr;
+
+ dd = ds->ds_dir;
+
+ mutex_enter(&dd->dd_lock);
+ for (cbr = list_head(&dd->dd_prop_cbs);
+ cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
+ if (strcmp(cbr->cbr_propname, propname) == 0 &&
+ cbr->cbr_func == callback &&
+ cbr->cbr_arg == cbarg)
+ break;
+ }
+
+ if (cbr == NULL) {
+ mutex_exit(&dd->dd_lock);
+ return (ENOMSG);
+ }
+
+ list_remove(&dd->dd_prop_cbs, cbr);
+ mutex_exit(&dd->dd_lock);
+ kmem_free((void*)cbr->cbr_propname, strlen(cbr->cbr_propname)+1);
+ kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
+
+ /* Clean up from dsl_prop_register */
+ dsl_dir_close(dd, cbr);
+ return (0);
+}
+
/*
 * Notify all callbacks registered at or below ddobj that the named
 * property now has the given (inherited) value.  Recurses into child
 * dirs; the recursion stops at any dir that sets the property locally,
 * since its subtree does not inherit the new value.
 * Caller must hold dp_config_rwlock as writer.
 */
static void
dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
    const char *propname, uint64_t value, int first)
{
	dsl_dir_t *dd;
	dsl_prop_cb_record_t *cbr;
	objset_t *mos = dp->dp_meta_objset;
	int err;

	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
	dd = dsl_dir_open_obj(dp, ddobj, NULL, FTAG);

	if (!first) {
		/*
		 * If the prop is set here, then this change is not
		 * being inherited here or below; stop the recursion.
		 */
		err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname,
		    8, 1, &value);
		if (err == 0) {
			dsl_dir_close(dd, FTAG);
			return;
		}
		ASSERT3U(err, ==, ENOENT);
	}

	/* Invoke every matching callback; note: called with dd_lock held. */
	mutex_enter(&dd->dd_lock);
	for (cbr = list_head(&dd->dd_prop_cbs);
	    cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
		if (strcmp(cbr->cbr_propname, propname) == 0) {
			cbr->cbr_func(cbr->cbr_arg, value);
		}
	}
	mutex_exit(&dd->dd_lock);

	/* Recurse into each child dir listed in the child ZAP. */
	if (dd->dd_phys->dd_child_dir_zapobj) {
		zap_cursor_t zc;
		zap_attribute_t za;

		for (zap_cursor_init(&zc, mos,
		    dd->dd_phys->dd_child_dir_zapobj);
		    zap_cursor_retrieve(&zc, &za) == 0;
		    zap_cursor_advance(&zc)) {
			/* XXX recursion could blow stack; esp. za! */
			dsl_prop_changed_notify(dp, za.za_first_integer,
			    propname, value, FALSE);
		}
	}
	dsl_dir_close(dd, FTAG);
}
+
/*
 * Argument bundle passed from dsl_prop_set() to dsl_prop_set_sync()
 * through dsl_dir_sync_task().
 */
struct prop_set_arg {
	const char *name;	/* property name */
	int intsz;		/* size of each value element, in bytes */
	int numints;		/* element count; 0 means remove the prop */
	const void *buf;	/* new value (caller-owned, read-only) */
};
+
/*
 * Sync-task half of dsl_prop_set(): update (or, when numints == 0,
 * remove) the property in the dir's props ZAP, then notify registered
 * callbacks of the new effective value for integer properties.
 * Runs in syncing context via dsl_dir_sync_task().
 */
static int
dsl_prop_set_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
{
	struct prop_set_arg *psa = arg;
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	uint64_t zapobj = dd->dd_phys->dd_props_zapobj;
	uint64_t intval;
	int err, isint;

	/* Writer lock: we may walk ancestors and notify the subtree. */
	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_WRITER);

	/* A prop with a numeric default is an integer property. */
	isint = (dodefault(psa->name, 8, 1, &intval) == 0);

	if (psa->numints == 0) {
		/* Removal: fall back to the inherited/default value. */
		err = zap_remove(mos, zapobj, psa->name, tx);
		if (err == ENOENT) /* that's fine. */
			err = 0;
		if (err == 0 && isint) {
			/* Recompute the now-effective value from ancestors. */
			err = dsl_prop_get_impl(dd->dd_pool,
			    dd->dd_phys->dd_parent_obj, psa->name,
			    8, 1, &intval, NULL);
		}
	} else {
		err = zap_update(mos, zapobj, psa->name,
		    psa->intsz, psa->numints, psa->buf, tx);
		if (isint)
			intval = *(uint64_t *)psa->buf;
	}

	/* Tell every callback at or below this dir about the new value. */
	if (err == 0 && isint) {
		dsl_prop_changed_notify(dd->dd_pool,
		    dd->dd_object, psa->name, intval, TRUE);
	}
	rw_exit(&dd->dd_pool->dp_config_rwlock);

	return (err);
}
+
+int
+dsl_prop_set(const char *ddname, const char *propname,
+ int intsz, int numints, const void *buf)
+{
+ dsl_dir_t *dd;
+ int err;
+ struct prop_set_arg psa;
+
+ dd = dsl_dir_open(ddname, FTAG, NULL);
+ if (dd == NULL)
+ return (ENOENT);
+
+ psa.name = propname;
+ psa.intsz = intsz;
+ psa.numints = numints;
+ psa.buf = buf;
+ err = dsl_dir_sync_task(dd, dsl_prop_set_sync, &psa, 0);
+
+ dsl_dir_close(dd, FTAG);
+
+ return (err);
+}
diff --git a/usr/src/uts/common/fs/zfs/fletcher.c b/usr/src/uts/common/fs/zfs/fletcher.c
new file mode 100644
index 0000000000..03186d1387
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/fletcher.c
@@ -0,0 +1,100 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/byteorder.h>
+#include <sys/spa.h>
+
+void
+fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = ip + (size / sizeof (uint64_t));
+ uint64_t a0, b0, a1, b1;
+
+ for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
+ a0 += ip[0];
+ a1 += ip[1];
+ b0 += a0;
+ b1 += a1;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+}
+
+void
+fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = ip + (size / sizeof (uint64_t));
+ uint64_t a0, b0, a1, b1;
+
+ for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
+ a0 += BSWAP_64(ip[0]);
+ a1 += BSWAP_64(ip[1]);
+ b0 += a0;
+ b1 += a1;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+}
+
+void
+fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+
+ for (a = b = c = d = 0; ip < ipend; ip++) {
+ a += ip[0];
+ b += a;
+ c += b;
+ d += c;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+}
+
+void
+fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+
+ for (a = b = c = d = 0; ip < ipend; ip++) {
+ a += BSWAP_32(ip[0]);
+ b += a;
+ c += b;
+ d += c;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+}
diff --git a/usr/src/uts/common/fs/zfs/lzjb.c b/usr/src/uts/common/fs/zfs/lzjb.c
new file mode 100644
index 0000000000..5979a55ef7
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/lzjb.c
@@ -0,0 +1,125 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * This is stolen from common/os/compress.c and will be removed once
+ * our changes have made it into the on10 source base.
+ *
+ * In particular, we are adding the "feature" that compress() can
+ * take a destination buffer size and return -1 if the data will not
+ * compress to d_len or less.
+ */
+
+#include <sys/types.h>
+
+#define MATCH_BITS 6
+#define MATCH_MIN 3
+#define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1))
+#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1)
+#define LEMPEL_SIZE 256
+
/*
 * LZJB-compress s_len bytes from s_start into d_start.  Returns the
 * compressed length, or s_len if the data would not fit in d_len bytes
 * (in which case the destination may be a verbatim copy when
 * d_len == s_len, or is abandoned otherwise -- caller must check the
 * return value against s_len).
 */
size_t
lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len)
{
	uchar_t *src = s_start;
	uchar_t *dst = d_start;
	uchar_t *cpy, *copymap;
	int copymask = 1 << (NBBY - 1);	/* forces a new copymap byte first */
	int mlen, offset;
	uint16_t *hp;
	uint16_t lempel[LEMPEL_SIZE];	/* uninitialized; see above */

	while (src < (uchar_t *)s_start + s_len) {
		/* Every NBBY items, emit a fresh copy-map byte. */
		if ((copymask <<= 1) == (1 << NBBY)) {
			/*
			 * Worst case for the next NBBY items is 2 bytes
			 * each plus the map byte; bail if they may not fit.
			 */
			if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) {
				if (d_len != s_len)
					return (s_len);
				/* Same-size buffers: store verbatim. */
				mlen = s_len;
				for (src = s_start, dst = d_start; mlen; mlen--)
					*dst++ = *src++;
				return (s_len);
			}
			copymask = 1;
			copymap = dst;
			*dst++ = 0;
		}
		/* Too close to the end to form a match: copy literally. */
		if (src > (uchar_t *)s_start + s_len - MATCH_MAX) {
			*dst++ = *src++;
			continue;
		}
		/* Hash the next 3 bytes into the Lempel history table. */
		hp = &lempel[((src[0] + 13) ^ (src[1] - 13) ^ src[2]) &
		    (LEMPEL_SIZE - 1)];
		offset = (intptr_t)(src - *hp) & OFFSET_MASK;
		*hp = (uint16_t)(uintptr_t)src;
		cpy = src - offset;
		/* Valid candidate match must really share the 3 bytes. */
		if (cpy >= (uchar_t *)s_start && cpy != src &&
		    src[0] == cpy[0] && src[1] == cpy[1] && src[2] == cpy[2]) {
			*copymap |= copymask;
			/* Extend the match up to MATCH_MAX bytes. */
			for (mlen = MATCH_MIN; mlen < MATCH_MAX; mlen++)
				if (src[mlen] != cpy[mlen])
					break;
			/* Encode (length, offset) in two bytes. */
			*dst++ = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) |
			    (offset >> NBBY);
			*dst++ = (uchar_t)offset;
			src += mlen;
		} else {
			*dst++ = *src++;
		}
	}
	return (dst - (uchar_t *)d_start);
}
+
/*
 * Decompress an LZJB stream from s_start into d_start, stopping when
 * d_len output bytes have been produced.  Returns 0 on success, -1 if
 * a back-reference points before the start of the output buffer
 * (corrupt input).  s_len is unused: the output length bounds the work.
 */
/*ARGSUSED*/
int
lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len)
{
	uchar_t *src = s_start;
	uchar_t *dst = d_start;
	uchar_t *d_end = (uchar_t *)d_start + d_len;
	uchar_t *cpy, copymap;
	int copymask = 1 << (NBBY - 1);	/* forces a copymap read first */

	while (dst < d_end) {
		/* Every NBBY items, read the next copy-map byte. */
		if ((copymask <<= 1) == (1 << NBBY)) {
			copymask = 1;
			copymap = *src++;
		}
		if (copymap & copymask) {
			/* Two-byte (length, offset) back-reference. */
			int mlen = (src[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN;
			int offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK;
			src += 2;
			if ((cpy = dst - offset) < (uchar_t *)d_start)
				return (-1);
			/* Byte-at-a-time copy: source may overlap dest. */
			while (--mlen >= 0 && dst < d_end)
				*dst++ = *cpy++;
		} else {
			/* Literal byte. */
			*dst++ = *src++;
		}
	}
	return (0);
}
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
new file mode 100644
index 0000000000..9d682e4990
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -0,0 +1,796 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/space_map.h>
+#include <sys/metaslab_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+
+/*
+ * ==========================================================================
+ * Metaslab classes
+ * ==========================================================================
+ */
+metaslab_class_t *
+metaslab_class_create(void)
+{
+ metaslab_class_t *mc;
+
+ mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
+
+ mc->mc_rotor = NULL;
+
+ return (mc);
+}
+
+void
+metaslab_class_destroy(metaslab_class_t *mc)
+{
+ metaslab_group_t *mg;
+
+ while ((mg = mc->mc_rotor) != NULL) {
+ metaslab_class_remove(mc, mg);
+ metaslab_group_destroy(mg);
+ }
+
+ kmem_free(mc, sizeof (metaslab_class_t));
+}
+
+void
+metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg)
+{
+ metaslab_group_t *mgprev, *mgnext;
+
+ ASSERT(mg->mg_class == NULL);
+
+ if ((mgprev = mc->mc_rotor) == NULL) {
+ mg->mg_prev = mg;
+ mg->mg_next = mg;
+ } else {
+ mgnext = mgprev->mg_next;
+ mg->mg_prev = mgprev;
+ mg->mg_next = mgnext;
+ mgprev->mg_next = mg;
+ mgnext->mg_prev = mg;
+ }
+ mc->mc_rotor = mg;
+ mg->mg_class = mc;
+}
+
+void
+metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
+{
+ metaslab_group_t *mgprev, *mgnext;
+
+ ASSERT(mg->mg_class == mc);
+
+ mgprev = mg->mg_prev;
+ mgnext = mg->mg_next;
+
+ if (mg == mgnext) {
+ mc->mc_rotor = NULL;
+ } else {
+ mc->mc_rotor = mgnext;
+ mgprev->mg_next = mgnext;
+ mgnext->mg_prev = mgprev;
+ }
+
+ mg->mg_prev = NULL;
+ mg->mg_next = NULL;
+ mg->mg_class = NULL;
+}
+
+/*
+ * ==========================================================================
+ * Metaslab groups
+ * ==========================================================================
+ */
+static int
+metaslab_compare(const void *x1, const void *x2)
+{
+ const metaslab_t *m1 = x1;
+ const metaslab_t *m2 = x2;
+
+ if (m1->ms_weight < m2->ms_weight)
+ return (1);
+ if (m1->ms_weight > m2->ms_weight)
+ return (-1);
+
+ /*
+ * If the weights are identical, use the offset to force uniqueness.
+ */
+ if (m1->ms_map.sm_start < m2->ms_map.sm_start)
+ return (-1);
+ if (m1->ms_map.sm_start > m2->ms_map.sm_start)
+ return (1);
+
+ ASSERT3P(m1, ==, m2);
+
+ return (0);
+}
+
+metaslab_group_t *
+metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
+{
+ metaslab_group_t *mg;
+
+ mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
+ mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&mg->mg_metaslab_tree, metaslab_compare,
+ sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
+ mg->mg_aliquot = 2ULL << 20; /* XXX -- tweak me */
+ mg->mg_vd = vd;
+ metaslab_class_add(mc, mg);
+
+ return (mg);
+}
+
+void
+metaslab_group_destroy(metaslab_group_t *mg)
+{
+ avl_destroy(&mg->mg_metaslab_tree);
+ mutex_destroy(&mg->mg_lock);
+ kmem_free(mg, sizeof (metaslab_group_t));
+}
+
/*
 * Insert a metaslab into the group's weight-sorted tree with the given
 * initial weight.  The metaslab must not already belong to a group.
 */
void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == NULL);
	msp->ms_group = mg;
	msp->ms_weight = weight;	/* avl_add() sorts on this key */
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}
+
/*
 * Remove a metaslab from its group's tree and clear its back-pointer.
 */
void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}
+
/*
 * Re-key a metaslab's sort position: since ms_weight is the AVL sort
 * key, it must be removed, updated, and re-inserted under mg_lock.
 */
void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}
+
+/*
+ * ==========================================================================
+ * Metaslabs
+ * ==========================================================================
+ */
/*
 * Create a metaslab covering [start, start + size) on mg's vdev, backed
 * by the on-disk space map object smo, and return it through mspp.
 * txg == 0 means the pool is being opened; txg == TXG_INITIAL means it
 * is being created; any later txg means space is being added to a live
 * pool and must not be used until that txg syncs.
 */
void
metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, metaslab_t **mspp,
    uint64_t start, uint64_t size, uint64_t txg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_t *msp;
	int fm;

	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);

	msp->ms_smo = smo;

	/* In-core map of free space, plus per-txg alloc/free deltas. */
	space_map_create(&msp->ms_map, start, size, vd->vdev_ashift,
	    &msp->ms_lock);

	for (fm = 0; fm < TXG_SIZE; fm++) {
		space_map_create(&msp->ms_allocmap[fm], start, size,
		    vd->vdev_ashift, &msp->ms_lock);
		space_map_create(&msp->ms_freemap[fm], start, size,
		    vd->vdev_ashift, &msp->ms_lock);
	}

	/*
	 * If we're opening an existing pool (txg == 0) or creating
	 * a new one (txg == TXG_INITIAL), all space is available now.
	 * If we're adding space to an existing pool, the new space
	 * does not become available until after this txg has synced.
	 * We enforce this by assigning an initial weight of 0 to new space.
	 *
	 * (Transactional allocations for this txg would actually be OK;
	 * it's intent log allocations that cause trouble.  If we wrote
	 * a log block in this txg and lost power, the log replay would be
	 * based on the DVA translations that had been synced in txg - 1.
	 * Those translations would not include this metaslab's vdev.)
	 */
	metaslab_group_add(mg, msp, txg > TXG_INITIAL ? 0 : size);

	if (txg == 0) {
		/*
		 * We're opening the pool.  Make the metaslab's
		 * free space available immediately.
		 */
		vdev_space_update(vd, size, smo->smo_alloc);
		metaslab_sync_done(msp, 0);
	} else {
		/*
		 * We're adding a new metaslab to an already-open pool.
		 * Declare all of the metaslab's space to be free.
		 *
		 * Note that older transaction groups cannot allocate
		 * from this metaslab until its existence is committed,
		 * because we set ms_last_alloc to the current txg.
		 */
		smo->smo_alloc = 0;
		msp->ms_usable_space = size;
		mutex_enter(&msp->ms_lock);
		space_map_add(&msp->ms_map, start, size);
		msp->ms_map_incore = 1;
		mutex_exit(&msp->ms_lock);

		/* XXX -- we'll need a call to picker_init here */
		msp->ms_dirty[txg & TXG_MASK] |= MSD_ADD;
		msp->ms_last_alloc = txg;
		vdev_dirty(vd, VDD_ADD, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, txg);
	}

	*mspp = msp;
}
+
/*
 * Tear down a metaslab: give back its accounted space, detach it from
 * its group, and destroy all of its space maps.
 */
void
metaslab_fini(metaslab_t *msp)
{
	int fm;
	metaslab_group_t *mg = msp->ms_group;

	/* Subtract this metaslab's space from the vdev's totals. */
	vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
	    -msp->ms_smo->smo_alloc);

	metaslab_group_remove(mg, msp);

	/* XXX -- we'll need a call to picker_fini here */

	mutex_enter(&msp->ms_lock);

	/* Empty the in-core map before destroying it. */
	space_map_vacate(&msp->ms_map, NULL, NULL);
	msp->ms_map_incore = 0;
	space_map_destroy(&msp->ms_map);

	for (fm = 0; fm < TXG_SIZE; fm++) {
		space_map_destroy(&msp->ms_allocmap[fm]);
		space_map_destroy(&msp->ms_freemap[fm]);
	}

	mutex_exit(&msp->ms_lock);

	kmem_free(msp, sizeof (metaslab_t));
}
+
/*
 * Write a metaslab to disk in the context of the specified transaction group.
 * Pushes this txg's allocation and free records into the on-disk space
 * map (condensing to a full rewrite when profitable), updates the space
 * accounting, and re-queues the metaslab for metaslab_sync_done().
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *os = spa->spa_meta_objset;
	/* This txg's allocation and free deltas. */
	space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
	space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
	/* Frees become reusable only after this txg commits. */
	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_obj_t *smo = msp->ms_smo;
	uint8_t *dirty = &msp->ms_dirty[txg & TXG_MASK];
	uint64_t alloc_delta;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	dprintf("%s offset %llx\n", vdev_description(vd), msp->ms_map.sm_start);

	mutex_enter(&msp->ms_lock);

	if (*dirty & MSD_ADD)
		vdev_space_update(vd, msp->ms_map.sm_size, 0);

	if (*dirty & (MSD_ALLOC | MSD_FREE)) {
		/* Already in syncing context: use an assigned tx. */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

		/* First sync ever: create the on-disk space map object. */
		if (smo->smo_object == 0) {
			ASSERT(smo->smo_objsize == 0);
			ASSERT(smo->smo_alloc == 0);
			smo->smo_object = dmu_object_alloc(os,
			    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
			    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
			ASSERT(smo->smo_object != 0);
			dmu_write(os, vd->vdev_ms_array, sizeof (uint64_t) *
			    (msp->ms_map.sm_start >> vd->vdev_ms_shift),
			    sizeof (uint64_t), &smo->smo_object, tx);
		}

		alloc_delta = allocmap->sm_space - freemap->sm_space;
		vdev_space_update(vd, 0, alloc_delta);
		smo->smo_alloc += alloc_delta;

		/*
		 * Condense: if the in-core map is authoritative (fully
		 * allocated this txg from an in-core map) rewrite the
		 * space map from scratch instead of appending deltas.
		 */
		if (msp->ms_last_alloc == txg && msp->ms_map.sm_space == 0 &&
		    (*dirty & MSD_CONDENSE) == 0) {
			space_map_t *sm = &msp->ms_map;
			space_map_t *tsm;
			int i;

			ASSERT(msp->ms_map_incore);

			space_map_merge(freemap, freed_map);
			space_map_vacate(allocmap, NULL, NULL);

			/*
			 * Write out the current state of the allocation
			 * world.  The current metaslab is full, minus
			 * stuff that's been freed this txg (freed_map),
			 * minus allocations from txgs in the future.
			 */
			space_map_add(sm, sm->sm_start, sm->sm_size);
			for (i = 1; i < TXG_CONCURRENT_STATES; i++) {
				tsm = &msp->ms_allocmap[(txg + i) & TXG_MASK];
				space_map_iterate(tsm, space_map_remove, sm);
			}
			space_map_iterate(freed_map, space_map_remove, sm);

			space_map_write(sm, smo, os, tx);

			ASSERT(sm->sm_space == 0);
			ASSERT(freemap->sm_space == 0);
			ASSERT(allocmap->sm_space == 0);

			*dirty |= MSD_CONDENSE;
		} else {
			/* Normal path: append this txg's deltas. */
			space_map_sync(allocmap, NULL, smo, SM_ALLOC, os, tx);
			space_map_sync(freemap, freed_map, smo, SM_FREE,
			    os, tx);
		}

		/* Persist the updated space_map_obj header in the bonus. */
		db = dmu_bonus_hold(os, smo->smo_object);
		dmu_buf_will_dirty(db, tx);
		ASSERT3U(db->db_size, ==, sizeof (*smo));
		bcopy(smo, db->db_data, db->db_size);
		dmu_buf_rele(db);

		dmu_tx_commit(tx);
	}

	*dirty &= ~(MSD_ALLOC | MSD_FREE | MSD_ADD);

	mutex_exit(&msp->ms_lock);

	/* Revisit in metaslab_sync_done() once this txg commits. */
	(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
}
+
/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.  Recomputes usable
 * space, folds this txg's freed blocks back into the in-core map,
 * and re-sorts the metaslab by its new weight.
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
	uint64_t weight;
	uint8_t *dirty = &msp->ms_dirty[txg & TXG_MASK];
	space_map_obj_t *smo = msp->ms_smo;

	dprintf("%s offset %llx txg %llu\n",
	    vdev_description(msp->ms_group->mg_vd), msp->ms_map.sm_start, txg);

	mutex_enter(&msp->ms_lock);

	/* metaslab_sync() must have cleared these for this txg. */
	ASSERT3U((*dirty & (MSD_ALLOC | MSD_FREE | MSD_ADD)), ==, 0);

	msp->ms_usable_space = msp->ms_map.sm_size - smo->smo_alloc;
	msp->ms_usable_end = smo->smo_objsize;

	weight = msp->ms_usable_space;

	if (txg != 0) {
		space_map_t *freed_map =
		    &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];

		/* XXX -- we'll need a call to picker_fini here */

		/* If we're empty, don't bother sticking around */
		if (msp->ms_usable_space == 0) {
			space_map_vacate(&msp->ms_map, NULL, NULL);
			msp->ms_map_incore = 0;
			ASSERT3U(freed_map->sm_space, ==, 0);
			weight = 0;
		} else {
			/* Add the freed blocks to the available space map */
			if (msp->ms_map_incore)
				space_map_merge(freed_map, &msp->ms_map);
			else
				space_map_vacate(freed_map, NULL, NULL);
			/* Boost weight: in-core maps are cheaper to use. */
			weight += msp->ms_map.sm_size;
		}

		if (msp->ms_last_alloc == txg)
			/* Safe to use for allocation now */
			msp->ms_last_alloc = 0;

		*dirty = 0;
	}

	mutex_exit(&msp->ms_lock);

	metaslab_group_sort(msp->ms_group, msp, weight);
}
+
+/*
+ * The first-fit block picker. No picker_init or picker_fini,
+ * this is just an experiment to see how it feels to separate out
+ * the block selection policy from the map updates.
+ * Note: the 'cursor' argument is a form of PPD.
+ */
+static uint64_t
+metaslab_pick_block(space_map_t *sm, uint64_t size, uint64_t *cursor)
+{
+ avl_tree_t *t = &sm->sm_root;
+ uint64_t align = size & -size;
+ space_seg_t *ss, ssearch;
+ avl_index_t where;
+ int tried_once = 0;
+
+again:
+ ssearch.ss_start = *cursor;
+ ssearch.ss_end = *cursor + size;
+
+ ss = avl_find(t, &ssearch, &where);
+ if (ss == NULL)
+ ss = avl_nearest(t, where, AVL_AFTER);
+
+ while (ss != NULL) {
+ uint64_t offset = P2ROUNDUP(ss->ss_start, align);
+
+ if (offset + size <= ss->ss_end) {
+ *cursor = offset + size;
+ return (offset);
+ }
+ ss = AVL_NEXT(t, ss);
+ }
+
+ /* If we couldn't find a block after cursor, search again */
+ if (tried_once == 0) {
+ tried_once = 1;
+ *cursor = 0;
+ goto again;
+ }
+
+ return (-1ULL);
+}
+
/*
 * Allocate one block of the given size from this metaslab's in-core
 * free map, recording the allocation in this txg's allocmap.
 * Returns the chosen offset, or -1ULL if nothing fits.
 * Caller must hold ms_lock with the map loaded in core.
 */
static uint64_t
metaslab_getblock(metaslab_t *msp, uint64_t size, uint64_t txg)
{
	space_map_t *sm = &msp->ms_map;
	vdev_t *vd = msp->ms_group->mg_vd;
	uint64_t offset;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(msp->ms_map_incore);
	ASSERT(sm->sm_space != 0);
	ASSERT(P2PHASE(size, 1ULL << vd->vdev_ashift) == 0);

	/* One first-fit cursor per power-of-two size class. */
	offset = metaslab_pick_block(sm, size,
	    &msp->ms_map_cursor[highbit(size & -size) - vd->vdev_ashift - 1]);
	if (offset != -1ULL) {
		/* Move the range from the free map to this txg's allocmap. */
		space_map_remove(sm, offset, size);
		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
	}
	return (offset);
}
+
/*
 * Intent log support: upon opening the pool after a crash, notify the SPA
 * of blocks that the intent log has allocated for immediate write, but
 * which are still considered free by the SPA because the last transaction
 * group didn't commit yet.
 */
int
metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	objset_t *os = spa->spa_meta_objset;
	vdev_t *vd;
	metaslab_t *msp;
	space_map_t *sm;
	space_map_obj_t *smo;
	int error;

	/* Validate the DVA before dereferencing anything. */
	if ((vd = vdev_lookup_top(spa, vdev)) == NULL)
		return (ENXIO);

	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
		return (ENXIO);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
	sm = &msp->ms_map;
	smo = msp->ms_smo;

	/* Gang headers occupy a fixed allocated size on the vdev. */
	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	/* Fault the free map into core before claiming from it. */
	if (msp->ms_map_incore == 0) {
		error = space_map_load(sm, smo, SM_FREE, os,
		    msp->ms_usable_end, sm->sm_size - msp->ms_usable_space);
		ASSERT(error == 0);
		if (error) {
			mutex_exit(&msp->ms_lock);
			return (error);
		}
		msp->ms_map_incore = 1;
		/* XXX -- we'll need a call to picker_init here */
		bzero(msp->ms_map_cursor, sizeof (msp->ms_map_cursor));
	}

	/* Mark the claimed range allocated in this txg. */
	space_map_remove(sm, offset, size);
	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);

	/* First allocation this txg: dirty the metaslab for sync. */
	if ((msp->ms_dirty[txg & TXG_MASK] & MSD_ALLOC) == 0) {
		msp->ms_dirty[txg & TXG_MASK] |= MSD_ALLOC;
		msp->ms_last_alloc = txg;
		vdev_dirty(vd, VDD_ALLOC, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, txg);
	}

	mutex_exit(&msp->ms_lock);

	return (0);
}
+
+static int
+metaslab_usable(metaslab_t *msp, uint64_t size, uint64_t txg)
+{
+ /*
+ * Enforce segregation across transaction groups.
+ */
+ /* XXX -- We should probably not assume we know what ms_weight means */
+ if (msp->ms_last_alloc == txg)
+ return (msp->ms_map.sm_space >= size && msp->ms_weight >= size);
+
+ if (msp->ms_last_alloc != 0)
+ return (0);
+
+ if (msp->ms_map.sm_space >= size && msp->ms_weight >= size)
+ return (1);
+
+ /* XXX -- the weight test should be in terms of MINFREE */
+ return (msp->ms_usable_space >= size && msp->ms_weight >= size);
+}
+
+static metaslab_t *
+metaslab_pick(metaslab_group_t *mg, uint64_t size, uint64_t txg)
+{
+ metaslab_t *msp;
+ avl_tree_t *t = &mg->mg_metaslab_tree;
+
+ mutex_enter(&mg->mg_lock);
+ for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp))
+ if (metaslab_usable(msp, size, txg))
+ break;
+ mutex_exit(&mg->mg_lock);
+
+ return (msp);
+}
+
+static metaslab_t *
+metaslab_group_alloc(spa_t *spa, metaslab_group_t *mg, uint64_t size,
+ uint64_t *offp, uint64_t txg)
+{
+ metaslab_t *msp;
+ int error;
+
+ while ((msp = metaslab_pick(mg, size, txg)) != NULL) {
+ space_map_obj_t *smo = msp->ms_smo;
+ mutex_enter(&msp->ms_lock);
+ if (!metaslab_usable(msp, size, txg)) {
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+ if (msp->ms_map_incore == 0) {
+ error = space_map_load(&msp->ms_map, smo, SM_FREE,
+ spa->spa_meta_objset, msp->ms_usable_end,
+ msp->ms_map.sm_size - msp->ms_usable_space);
+ ASSERT(error == 0);
+ if (error) {
+ mutex_exit(&msp->ms_lock);
+ metaslab_group_sort(mg, msp, 0);
+ continue;
+ }
+ msp->ms_map_incore = 1;
+ /* XXX -- we'll need a call to picker_init here */
+ bzero(msp->ms_map_cursor, sizeof (msp->ms_map_cursor));
+ }
+ *offp = metaslab_getblock(msp, size, txg);
+ if (*offp != -1ULL) {
+ if ((msp->ms_dirty[txg & TXG_MASK] & MSD_ALLOC) == 0) {
+ vdev_t *vd = mg->mg_vd;
+ msp->ms_dirty[txg & TXG_MASK] |= MSD_ALLOC;
+ msp->ms_last_alloc = txg;
+ vdev_dirty(vd, VDD_ALLOC, txg);
+ (void) txg_list_add(&vd->vdev_ms_list,
+ msp, txg);
+ }
+ mutex_exit(&msp->ms_lock);
+ return (msp);
+ }
+ mutex_exit(&msp->ms_lock);
+ metaslab_group_sort(msp->ms_group, msp, size - 1);
+ }
+
+ return (NULL);
+}
+
+/*
+ * Allocate a block for the specified i/o.
+ */
+int
+metaslab_alloc(spa_t *spa, uint64_t psize, dva_t *dva, uint64_t txg)
+{
+ metaslab_t *msp;
+ metaslab_group_t *mg, *rotor;
+ metaslab_class_t *mc;
+ vdev_t *vd;
+ uint64_t offset = -1ULL;
+ uint64_t asize;
+
+ mc = spa_metaslab_class_select(spa);
+
+ /*
+ * Start at the rotor and loop through all mgs until we find something.
+ * Note that there's no locking on mc_rotor or mc_allocated because
+ * nothing actually breaks if we miss a few updates -- we just won't
+ * allocate quite as evenly. It all balances out over time.
+ */
+ mg = rotor = mc->mc_rotor;
+ do {
+ vd = mg->mg_vd;
+ asize = vdev_psize_to_asize(vd, psize);
+ ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
+
+ msp = metaslab_group_alloc(spa, mg, asize, &offset, txg);
+ if (msp != NULL) {
+ ASSERT(offset != -1ULL);
+
+ /*
+ * If we've just selected this metaslab group,
+ * figure out whether the corresponding vdev is
+ * over- or under-used relative to the pool,
+ * and set an allocation bias to even it out.
+ */
+ if (mc->mc_allocated == 0) {
+ vdev_stat_t *vs = &vd->vdev_stat;
+ uint64_t alloc, space;
+ int64_t vu, su;
+
+ alloc = spa_get_alloc(spa);
+ space = spa_get_space(spa);
+
+ /*
+ * Determine percent used in units of 0..1024.
+ * (This is just to avoid floating point.)
+ */
+ vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
+ su = (alloc << 10) / (space + 1);
+
+ /*
+ * Bias by at most +/- 25% of the aliquot.
+ */
+ mg->mg_bias = ((su - vu) *
+ (int64_t)mg->mg_aliquot) / (1024 * 4);
+
+ dprintf("bias = %lld\n", mg->mg_bias);
+ }
+
+ if (atomic_add_64_nv(&mc->mc_allocated, asize) >=
+ mg->mg_aliquot + mg->mg_bias) {
+ mc->mc_rotor = mg->mg_next;
+ mc->mc_allocated = 0;
+ }
+
+ DVA_SET_VDEV(dva, vd->vdev_id);
+ DVA_SET_OFFSET(dva, offset);
+ DVA_SET_GANG(dva, 0);
+ DVA_SET_ASIZE(dva, asize);
+
+ return (0);
+ }
+ mc->mc_rotor = mg->mg_next;
+ mc->mc_allocated = 0;
+ } while ((mg = mg->mg_next) != rotor);
+
+ dprintf("spa=%p, psize=%llu, txg=%llu: no\n", spa, psize, txg);
+
+ DVA_SET_VDEV(dva, 0);
+ DVA_SET_OFFSET(dva, 0);
+ DVA_SET_GANG(dva, 0);
+
+ return (ENOSPC);
+}
+
+/*
+ * Free the block represented by DVA in the context of the specified
+ * transaction group.
+ */
+void
+metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg)
+{
+ uint64_t vdev = DVA_GET_VDEV(dva);
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t size = DVA_GET_ASIZE(dva);
+ vdev_t *vd;
+ metaslab_t *msp;
+
+ if (txg > spa_freeze_txg(spa))
+ return;
+
+ if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
+ cmn_err(CE_WARN, "metaslab_free(): bad vdev %llu",
+ (u_longlong_t)vdev);
+ ASSERT(0);
+ return;
+ }
+
+ if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
+ cmn_err(CE_WARN, "metaslab_free(): bad offset %llu",
+ (u_longlong_t)offset);
+ ASSERT(0);
+ return;
+ }
+
+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+ if (DVA_GET_GANG(dva))
+ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+
+ mutex_enter(&msp->ms_lock);
+
+ if ((msp->ms_dirty[txg & TXG_MASK] & MSD_FREE) == 0) {
+ msp->ms_dirty[txg & TXG_MASK] |= MSD_FREE;
+ vdev_dirty(vd, VDD_FREE, txg);
+ (void) txg_list_add(&vd->vdev_ms_list, msp, txg);
+ }
+
+ space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
+
+ mutex_exit(&msp->ms_lock);
+}
diff --git a/usr/src/uts/common/fs/zfs/refcount.c b/usr/src/uts/common/fs/zfs/refcount.c
new file mode 100644
index 0000000000..411ed46e13
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/refcount.c
@@ -0,0 +1,194 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+
+#if defined(DEBUG) || !defined(_KERNEL)
+
+#ifdef _KERNEL
+int reference_tracking_enable = FALSE; /* runs out of memory too easily */
+#else
+int reference_tracking_enable = TRUE;
+#endif
+int reference_history = 4; /* tunable */
+
+static kmem_cache_t *reference_cache;
+static kmem_cache_t *reference_history_cache;
+
+void
+refcount_init(void)
+{
+ reference_cache = kmem_cache_create("reference_cache",
+ sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ reference_history_cache = kmem_cache_create("reference_history_cache",
+ sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+refcount_fini(void)
+{
+ kmem_cache_destroy(reference_cache);
+ kmem_cache_destroy(reference_history_cache);
+}
+
+void
+refcount_create(refcount_t *rc)
+{
+ list_create(&rc->rc_list, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+ list_create(&rc->rc_removed, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+ mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+refcount_destroy_many(refcount_t *rc, uint64_t number)
+{
+ reference_t *ref;
+
+ ASSERT(rc->rc_count == number);
+ while (ref = list_head(&rc->rc_list)) {
+ list_remove(&rc->rc_list, ref);
+ kmem_cache_free(reference_cache, ref);
+ }
+ list_destroy(&rc->rc_list);
+
+ while (ref = list_head(&rc->rc_removed)) {
+ list_remove(&rc->rc_removed, ref);
+ kmem_cache_free(reference_history_cache, ref->ref_removed);
+ kmem_cache_free(reference_cache, ref);
+ }
+ list_destroy(&rc->rc_removed);
+ mutex_destroy(&rc->rc_mtx);
+}
+
+void
+refcount_destroy(refcount_t *rc)
+{
+ refcount_destroy_many(rc, 0);
+}
+
+int
+refcount_is_zero(refcount_t *rc)
+{
+ ASSERT(rc->rc_count >= 0);
+ return (rc->rc_count == 0);
+}
+
+int64_t
+refcount_count(refcount_t *rc)
+{
+ ASSERT(rc->rc_count >= 0);
+ return (rc->rc_count);
+}
+
+int64_t
+refcount_add_many(refcount_t *rc, uint64_t number, void *holder)
+{
+ reference_t *ref;
+ int64_t count;
+
+ if (reference_tracking_enable) {
+ ref = kmem_cache_alloc(reference_cache, KM_SLEEP);
+ ref->ref_holder = holder;
+ ref->ref_number = number;
+ }
+ mutex_enter(&rc->rc_mtx);
+ ASSERT(rc->rc_count >= 0);
+ if (reference_tracking_enable)
+ list_insert_head(&rc->rc_list, ref);
+ rc->rc_count += number;
+ count = rc->rc_count;
+ mutex_exit(&rc->rc_mtx);
+
+ return (count);
+}
+
+int64_t
+refcount_add(refcount_t *rc, void *holder)
+{
+ return (refcount_add_many(rc, 1, holder));
+}
+
+int64_t
+refcount_remove_many(refcount_t *rc, uint64_t number, void *holder)
+{
+ reference_t *ref;
+ int64_t count;
+
+ mutex_enter(&rc->rc_mtx);
+ ASSERT(rc->rc_count >= number);
+
+ if (!reference_tracking_enable) {
+ rc->rc_count -= number;
+ count = rc->rc_count;
+ mutex_exit(&rc->rc_mtx);
+ return (count);
+ }
+
+ for (ref = list_head(&rc->rc_list); ref;
+ ref = list_next(&rc->rc_list, ref)) {
+ if (ref->ref_holder == holder && ref->ref_number == number) {
+ list_remove(&rc->rc_list, ref);
+ if (reference_history > 0) {
+ ref->ref_removed =
+ kmem_cache_alloc(reference_history_cache,
+ KM_SLEEP);
+ list_insert_head(&rc->rc_removed, ref);
+ rc->rc_removed_count++;
+ if (rc->rc_removed_count >= reference_history) {
+ ref = list_tail(&rc->rc_removed);
+ list_remove(&rc->rc_removed, ref);
+ kmem_cache_free(reference_history_cache,
+ ref->ref_removed);
+ kmem_cache_free(reference_cache, ref);
+ rc->rc_removed_count--;
+ }
+ } else {
+ kmem_cache_free(reference_cache, ref);
+ }
+ rc->rc_count -= number;
+ count = rc->rc_count;
+ mutex_exit(&rc->rc_mtx);
+ return (count);
+ }
+ }
+ panic("No such hold %p on refcount %llx", holder,
+ (u_longlong_t)(uintptr_t)rc);
+ return (-1);
+}
+
+int64_t
+refcount_remove(refcount_t *rc, void *holder)
+{
+ return (refcount_remove_many(rc, 1, holder));
+}
+
+#endif
diff --git a/usr/src/uts/common/fs/zfs/sha256.c b/usr/src/uts/common/fs/zfs/sha256.c
new file mode 100644
index 0000000000..ce5c26131a
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sha256.c
@@ -0,0 +1,131 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+
+/*
+ * SHA-256 checksum, as specified in FIPS 180-2, available at:
+ * http://csrc.nist.gov/cryptval
+ *
+ * This is a very compact implementation of SHA-256.
+ * It is designed to be simple and portable, not to be fast.
+ */
+
+/*
+ * The literal definitions according to FIPS180-2 would be:
+ *
+ * Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z)))
+ * Maj(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
+ *
+ * We use logical equivalents which require one less op.
+ */
+#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
+#define Maj(x, y, z) (((x) & (y)) ^ ((z) & ((x) ^ (y))))
+#define Rot32(x, s) (((x) >> s) | ((x) << (32 - s)))
+#define SIGMA0(x) (Rot32(x, 2) ^ Rot32(x, 13) ^ Rot32(x, 22))
+#define SIGMA1(x) (Rot32(x, 6) ^ Rot32(x, 11) ^ Rot32(x, 25))
+#define sigma0(x) (Rot32(x, 7) ^ Rot32(x, 18) ^ ((x) >> 3))
+#define sigma1(x) (Rot32(x, 17) ^ Rot32(x, 19) ^ ((x) >> 10))
+
+static const uint32_t SHA256_K[64] = {
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+ 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+ 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+ 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+ 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+ 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+static void
+SHA256Transform(uint32_t *H, const uint8_t *cp)
+{
+ uint32_t a, b, c, d, e, f, g, h, t, T1, T2, W[64];
+
+ for (t = 0; t < 16; t++, cp += 4)
+ W[t] = (cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | cp[3];
+
+ for (t = 16; t < 64; t++)
+ W[t] = sigma1(W[t - 2]) + W[t - 7] +
+ sigma0(W[t - 15]) + W[t - 16];
+
+ a = H[0]; b = H[1]; c = H[2]; d = H[3];
+ e = H[4]; f = H[5]; g = H[6]; h = H[7];
+
+ for (t = 0; t < 64; t++) {
+ T1 = h + SIGMA1(e) + Ch(e, f, g) + SHA256_K[t] + W[t];
+ T2 = SIGMA0(a) + Maj(a, b, c);
+ h = g; g = f; f = e; e = d + T1;
+ d = c; c = b; b = a; a = T1 + T2;
+ }
+
+ H[0] += a; H[1] += b; H[2] += c; H[3] += d;
+ H[4] += e; H[5] += f; H[6] += g; H[7] += h;
+}
+
+void
+zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
+ 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
+ uint8_t pad[128];
+ int padsize = size & 63;
+ int i;
+
+ for (i = 0; i < size - padsize; i += 64)
+ SHA256Transform(H, (uint8_t *)buf + i);
+
+ for (i = 0; i < padsize; i++)
+ pad[i] = ((uint8_t *)buf)[i];
+
+ for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++)
+ pad[padsize] = 0;
+
+ for (i = 0; i < 8; i++)
+ pad[padsize++] = (size << 3) >> (56 - 8 * i);
+
+ for (i = 0; i < padsize; i += 64)
+ SHA256Transform(H, pad + i);
+
+ ZIO_SET_CHECKSUM(zcp,
+ (uint64_t)H[0] << 32 | H[1],
+ (uint64_t)H[2] << 32 | H[3],
+ (uint64_t)H[4] << 32 | H[5],
+ (uint64_t)H[6] << 32 | H[7]);
+}
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
new file mode 100644
index 0000000000..43112d9319
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -0,0 +1,1784 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * This file contains all the routines used when modifying on-disk SPA state.
+ * This includes opening, importing, destroying, exporting a pool, and syncing a
+ * pool.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/dmu_traverse.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/fs/zfs.h>
+#include <sys/callb.h>
+
+static uint32_t spa_active_count;
+
+/*
+ * ==========================================================================
+ * SPA state manipulation (open/create/destroy/import/export)
+ * ==========================================================================
+ */
+
+/*
+ * Activate an uninitialized pool.
+ */
+static void
+spa_activate(spa_t *spa)
+{
+ int t;
+
+ ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+
+ spa->spa_state = POOL_STATE_ACTIVE;
+
+ spa->spa_normal_class = metaslab_class_create();
+
+ spa->spa_vdev_retry_taskq = taskq_create("spa_vdev_retry",
+ 4, maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
+
+ for (t = 0; t < ZIO_TYPES; t++) {
+ spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
+ 8, maxclsyspri, 50, INT_MAX,
+ TASKQ_PREPOPULATE);
+ spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
+ 8, maxclsyspri, 50, INT_MAX,
+ TASKQ_PREPOPULATE);
+ }
+
+ rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);
+
+ list_create(&spa->spa_dirty_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_dirty_node));
+
+ txg_list_create(&spa->spa_vdev_txg_list,
+ offsetof(struct vdev, vdev_txg_node));
+}
+
+/*
+ * Opposite of spa_activate().
+ */
+static void
+spa_deactivate(spa_t *spa)
+{
+ int t;
+
+ ASSERT(spa->spa_sync_on == B_FALSE);
+ ASSERT(spa->spa_dsl_pool == NULL);
+ ASSERT(spa->spa_root_vdev == NULL);
+
+ ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
+
+ txg_list_destroy(&spa->spa_vdev_txg_list);
+
+ list_destroy(&spa->spa_dirty_list);
+
+ rw_destroy(&spa->spa_traverse_lock);
+
+ for (t = 0; t < ZIO_TYPES; t++) {
+ taskq_destroy(spa->spa_zio_issue_taskq[t]);
+ taskq_destroy(spa->spa_zio_intr_taskq[t]);
+ spa->spa_zio_issue_taskq[t] = NULL;
+ spa->spa_zio_intr_taskq[t] = NULL;
+ }
+
+ taskq_destroy(spa->spa_vdev_retry_taskq);
+ spa->spa_vdev_retry_taskq = NULL;
+
+ metaslab_class_destroy(spa->spa_normal_class);
+ spa->spa_normal_class = NULL;
+
+ spa->spa_state = POOL_STATE_UNINITIALIZED;
+}
+
+/*
+ * Verify a pool configuration, and construct the vdev tree appropriately. This
+ * will create all the necessary vdevs in the appropriate layout, with each vdev
+ * in the CLOSED state. This will prep the pool before open/creation/import.
+ * All vdev validation is done by the vdev_alloc() routine.
+ */
+static vdev_t *
+spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype)
+{
+ nvlist_t **child;
+ uint_t c, children;
+ vdev_t *vd;
+
+ if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL)
+ return (NULL);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ return (vd);
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0) {
+ vdev_free(vd);
+ return (NULL);
+ }
+
+ for (c = 0; c < children; c++) {
+ if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) {
+ vdev_free(vd);
+ return (NULL);
+ }
+ }
+
+ return (vd);
+}
+
+/*
+ * Opposite of spa_load().
+ */
+static void
+spa_unload(spa_t *spa)
+{
+ /*
+ * Stop syncing.
+ */
+ if (spa->spa_sync_on) {
+ txg_sync_stop(spa->spa_dsl_pool);
+ spa->spa_sync_on = B_FALSE;
+ }
+
+ /*
+ * Wait for any outstanding prefetch I/O to complete.
+ */
+ spa_config_enter(spa, RW_WRITER);
+ spa_config_exit(spa);
+
+ /*
+ * Close the dsl pool.
+ */
+ if (spa->spa_dsl_pool) {
+ dsl_pool_close(spa->spa_dsl_pool);
+ spa->spa_dsl_pool = NULL;
+ }
+
+ /*
+ * Close all vdevs.
+ */
+ if (spa->spa_root_vdev) {
+ vdev_free(spa->spa_root_vdev);
+ spa->spa_root_vdev = NULL;
+ }
+}
+
+/*
+ * Load an existing storage pool, using the pool's builtin spa_config as a
+ * source of configuration information. The 'readonly' flag will prevent us
+ * from writing any updated state to disk, and can be use when testing a pool
+ * for import.
+ */
+static int
+spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
+{
+ int error = 0;
+ nvlist_t *nvroot = NULL;
+ vdev_t *rvd;
+ uberblock_t *ub = &spa->spa_uberblock;
+ uint64_t pool_guid;
+ zio_t *zio;
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
+ nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
+ return (EINVAL);
+
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ &spa->spa_config_txg);
+
+ if (import && spa_guid_exists(pool_guid, 0))
+ return (EEXIST);
+
+ /*
+ * Parse the configuration into a vdev tree.
+ */
+ spa_config_enter(spa, RW_WRITER);
+ rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
+ spa_config_exit(spa);
+
+ if (rvd == NULL)
+ return (EINVAL);
+
+ spa->spa_root_vdev = rvd;
+ ASSERT(spa_guid(spa) == pool_guid);
+
+ /*
+ * Try to open all vdevs, loading each label in the process.
+ */
+ if (vdev_open(rvd) != 0)
+ return (ENXIO);
+
+ /*
+ * Find the best uberblock.
+ */
+ bzero(ub, sizeof (uberblock_t));
+
+ zio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+ vdev_uberblock_load(zio, rvd, ub);
+ error = zio_wait(zio);
+
+ /*
+ * If we weren't able to find a single valid uberblock, return failure.
+ */
+ if (ub->ub_txg == 0) {
+ dprintf("ub_txg is zero\n");
+ return (ENXIO);
+ }
+
+ /*
+ * If the vdev guid sum doesn't match the uberblock, we have an
+ * incomplete configuration.
+ */
+ if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
+ rvd->vdev_state = VDEV_STATE_CANT_OPEN;
+ rvd->vdev_stat.vs_aux = VDEV_AUX_BAD_GUID_SUM;
+ dprintf("vdev_guid_sum %llx != ub_guid_sum %llx\n",
+ rvd->vdev_guid_sum, ub->ub_guid_sum);
+ return (ENXIO);
+ }
+
+ /*
+ * Initialize internal SPA structures.
+ */
+ spa->spa_state = POOL_STATE_ACTIVE;
+ spa->spa_ubsync = spa->spa_uberblock;
+ spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
+ spa->spa_dsl_pool = dsl_pool_open(spa, spa->spa_first_txg);
+ spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
+
+ VERIFY(zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
+ sizeof (uint64_t), 1, &spa->spa_config_object) == 0);
+
+ if (!mosconfig) {
+ dmu_buf_t *db;
+ char *packed = NULL;
+ size_t nvsize = 0;
+ nvlist_t *newconfig = NULL;
+
+ db = dmu_bonus_hold(spa->spa_meta_objset,
+ spa->spa_config_object);
+ dmu_buf_read(db);
+ nvsize = *(uint64_t *)db->db_data;
+ dmu_buf_rele(db);
+
+ packed = kmem_alloc(nvsize, KM_SLEEP);
+ error = dmu_read_canfail(spa->spa_meta_objset,
+ spa->spa_config_object, 0, nvsize, packed);
+ if (error == 0)
+ error = nvlist_unpack(packed, nvsize, &newconfig, 0);
+ kmem_free(packed, nvsize);
+
+ if (error)
+ return (ENXIO);
+
+ spa_config_set(spa, newconfig);
+
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_activate(spa);
+
+ return (spa_load(spa, newconfig, readonly, import, B_TRUE));
+ }
+
+ VERIFY(zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
+ sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) == 0);
+
+ /*
+ * Load the vdev state for all top level vdevs.
+ */
+ if ((error = vdev_load(rvd, import)) != 0)
+ return (error);
+
+ /*
+ * Propagate the leaf DTLs we just loaded all the way up the tree.
+ */
+ spa_config_enter(spa, RW_WRITER);
+ vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
+ spa_config_exit(spa);
+
+ /*
+ * Check the state of the root vdev. If it can't be opened, it
+ * indicates one or more toplevel vdevs are faulted.
+ */
+ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
+ return (ENXIO);
+
+ /*
+ * Claim log blocks that haven't been committed yet, and update all
+ * top-level vdevs to sync any config changes found in vdev_load().
+ * This must all happen in a single txg.
+ */
+ if ((spa_mode & FWRITE) && !readonly) {
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa),
+ spa_first_txg(spa));
+ dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
+ vdev_config_dirty(rvd);
+ dmu_tx_commit(tx);
+
+ spa->spa_sync_on = B_TRUE;
+ txg_sync_start(spa->spa_dsl_pool);
+
+ /*
+ * Wait for all claims to sync.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ }
+
+ return (0);
+}
+
+/*
+ * Pool Open/Import
+ *
+ * The import case is identical to an open except that the configuration is sent
+ * down from userland, instead of grabbed from the configuration cache.  For the
+ * case of an open, the pool configuration will exist in the
+ * POOL_STATE_UNINITIALIZED state.
+ *
+ * The stats information (gen/count/ustats) is used to gather vdev statistics at
+ * the same time we open the pool, without having to keep around the spa_t in
+ * some ambiguous state.
+ */
+static int
+spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
+{
+ spa_t *spa;
+ int error;
+ int loaded = B_FALSE;
+ int locked = B_FALSE;
+
+ *spapp = NULL;
+
+ /*
+ * As disgusting as this is, we need to support recursive calls to this
+ * function because dsl_dir_open() is called during spa_load(), and ends
+ * up calling spa_open() again. The real fix is to figure out how to
+ * avoid dsl_dir_open() calling this in the first place.
+ */
+ if (mutex_owner(&spa_namespace_lock) != curthread) {
+ mutex_enter(&spa_namespace_lock);
+ locked = B_TRUE;
+ }
+
+ if ((spa = spa_lookup(pool)) == NULL) {
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ return (ENOENT);
+ }
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
+
+ spa_activate(spa);
+
+ error = spa_load(spa, spa->spa_config,
+ B_FALSE, B_FALSE, B_FALSE);
+
+ if (error == EBADF) {
+ /*
+ * If vdev_load() returns EBADF, it indicates that one
+ * of the vdevs indicates that the pool has been
+ * exported or destroyed. If this is the case, the
+ * config cache is out of sync and we should remove the
+ * pool from the namespace.
+ */
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ spa_config_sync();
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ return (ENOENT);
+ } if (error) {
+ /*
+ * We can't open the pool, but we still have useful
+ * information: the state of each vdev after the
+ * attempted vdev_open(). Return this to the user.
+ */
+ if (config != NULL && spa->spa_root_vdev != NULL)
+ *config = spa_config_generate(spa, NULL, -1ULL,
+ B_TRUE);
+ spa_unload(spa);
+ spa_deactivate(spa);
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ *spapp = NULL;
+ return (error);
+ }
+
+ loaded = B_TRUE;
+ }
+
+ spa_open_ref(spa, tag);
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+
+ *spapp = spa;
+
+ if (config != NULL) {
+ spa_config_enter(spa, RW_READER);
+ *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+ spa_config_exit(spa);
+ }
+
+ /*
+ * If we just loaded the pool, resilver anything that's out of date.
+ */
+ if (loaded && (spa_mode & FWRITE))
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+ return (0);
+}
+
+int
+spa_open(const char *name, spa_t **spapp, void *tag)
+{
+ return (spa_open_common(name, spapp, tag, NULL));
+}
+
+int
+spa_get_stats(const char *name, nvlist_t **config)
+{
+ int error;
+ spa_t *spa;
+
+ *config = NULL;
+ error = spa_open_common(name, &spa, FTAG, config);
+
+ if (spa != NULL)
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+/*
+ * Pool Creation
+ */
+int
+spa_create(const char *pool, nvlist_t *nvroot, char *altroot)
+{
+ spa_t *spa;
+ dsl_pool_t *dp;
+ dmu_tx_t *tx;
+ int error;
+ uint64_t txg = TXG_INITIAL;
+
+ /*
+ * If this pool already exists, return failure.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if (spa_lookup(pool) != NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (EEXIST);
+ }
+ spa = spa_add(pool);
+
+ /*
+ * Allocate a new spa_t structure.
+ */
+ spa_activate(spa);
+
+ spa->spa_uberblock.ub_txg = txg - 1;
+ spa->spa_ubsync = spa->spa_uberblock;
+
+ error = spa_vdev_add(spa, nvroot);
+
+ if (error) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (error);
+ }
+
+ if (altroot != NULL) {
+ spa->spa_root = spa_strdup(altroot);
+ atomic_add_32(&spa_active_count, 1);
+ }
+
+ spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
+ spa->spa_meta_objset = dp->dp_meta_objset;
+
+ tx = dmu_tx_create_assigned(dp, txg);
+
+ /*
+ * Create the pool config object.
+ */
+ spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
+ DMU_OT_PACKED_NVLIST, 1 << 14,
+ DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
+
+ VERIFY(zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
+ sizeof (uint64_t), 1, &spa->spa_config_object, tx) == 0);
+
+ /*
+ * Create the deferred-free bplist object. Turn off compression
+ * because sync-to-convergence takes longer if the blocksize
+ * keeps changing.
+ */
+ spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
+ 1 << 14, tx);
+ dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
+ ZIO_COMPRESS_OFF, tx);
+
+ VERIFY(zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
+ sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) == 0);
+
+ dmu_tx_commit(tx);
+
+ spa->spa_sync_on = B_TRUE;
+ txg_sync_start(spa->spa_dsl_pool);
+
+ /*
+ * We explicitly wait for the first transaction to complete so that our
+ * bean counters are appropriately updated.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, txg);
+
+ spa_config_sync();
+
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+/*
+ * Import the given pool into the system. We set up the necessary spa_t and
+ * then call spa_load() to do the dirty work.
+ */
+int
+spa_import(const char *pool, nvlist_t *config, char *altroot)
+{
+ spa_t *spa;
+ int error;
+
+ if (!(spa_mode & FWRITE))
+ return (EROFS);
+
+ /*
+ * If a pool with this name exists, return failure.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if (spa_lookup(pool) != NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (EEXIST);
+ }
+
+ /*
+ * Create an initialize the spa structure
+ */
+ spa = spa_add(pool);
+ spa_activate(spa);
+
+ /*
+ * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig
+ * so that we don't try to open the pool if the config is damaged.
+ */
+ error = spa_load(spa, config, B_FALSE, B_TRUE, B_TRUE);
+
+ if (error) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (error);
+ }
+
+ /*
+ * Set the alternate root, if there is one.
+ */
+ if (altroot != NULL) {
+ atomic_add_32(&spa_active_count, 1);
+ spa->spa_root = spa_strdup(altroot);
+ }
+
+ /*
+ * Initialize the config based on the in-core state.
+ */
+ config = spa_config_generate(spa, NULL, spa_last_synced_txg(spa), 0);
+
+ spa_config_set(spa, config);
+
+ /*
+ * Sync the configuration cache.
+ */
+ spa_config_sync();
+
+ mutex_exit(&spa_namespace_lock);
+
+ /*
+ * Resilver anything that's out of date.
+ */
+ if (spa_mode & FWRITE)
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+ return (0);
+}
+
+/*
+ * This (illegal) pool name is used when temporarily importing a spa_t in order
+ * to get the vdev stats associated with the imported devices.
+ */
+#define TRYIMPORT_NAME "$import"
+
+/*
+ * Temporarily import the pool described by 'tryconfig' under the
+ * reserved name TRYIMPORT_NAME, just long enough to generate an
+ * up-to-date config nvlist, then tear the spa_t back down again.
+ * Returns NULL if 'tryconfig' lacks a pool name or state, or if it
+ * could not be parsed into a vdev tree; otherwise returns the newly
+ * generated config (ownership presumably passes to the caller --
+ * confirm against spa_config_generate()).
+ */
+nvlist_t *
+spa_tryimport(nvlist_t *tryconfig)
+{
+ nvlist_t *config = NULL;
+ char *poolname;
+ spa_t *spa;
+ uint64_t state;
+
+ if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
+ return (NULL);
+
+ if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
+ return (NULL);
+
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_add(TRYIMPORT_NAME);
+
+ ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+
+ /*
+ * Initialize the spa_t structure.
+ */
+ spa_activate(spa);
+
+ /*
+ * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig
+ * so we don't try to open the pool if the config is damaged.
+ */
+ (void) spa_load(spa, tryconfig, B_TRUE, B_TRUE, B_TRUE);
+
+ /*
+ * If 'tryconfig' was at least parsable, return the current config.
+ */
+ if (spa->spa_root_vdev != NULL) {
+ config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
+ poolname) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ state) == 0);
+ }
+
+ /*
+ * Discard the temporary spa_t regardless of the outcome.
+ */
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+
+ return (config);
+}
+
+/*
+ * Pool export/destroy
+ *
+ * The act of destroying or exporting a pool is very simple. We make sure there
+ * is no more pending I/O and any references to the pool are gone. Then, we
+ * update the pool state and sync all the labels to disk, removing the
+ * configuration from the cache afterwards.
+ *
+ * Returns 0 on success, EROFS if the module is loaded read-only,
+ * ENOENT if 'pool' is not in the namespace, or EBUSY if the pool
+ * still has active references after a forced sync.
+ */
+static int
+spa_export_common(char *pool, int new_state)
+{
+ spa_t *spa;
+
+ if (!(spa_mode & FWRITE))
+ return (EROFS);
+
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(pool)) == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (ENOENT);
+ }
+
+ /*
+ * The pool will be in core if it's openable,
+ * in which case we can modify its state.
+ */
+ if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
+ /*
+ * Objsets may be open only because they're dirty, so we
+ * have to force it to sync before checking spa_refcnt.
+ */
+ spa_scrub_suspend(spa);
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+
+ if (!spa_refcount_zero(spa)) {
+ spa_scrub_resume(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (EBUSY);
+ }
+
+ /*
+ * Update the pool state.
+ */
+ spa->spa_state = new_state;
+
+ spa_scrub_resume(spa);
+ VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
+
+ if (spa->spa_root != NULL)
+ atomic_add_32(&spa_active_count, -1);
+
+ /*
+ * We want this to be reflected on every label,
+ * so mark them all dirty. spa_unload() will do the
+ * final sync that pushes these changes out.
+ */
+ vdev_config_dirty(spa->spa_root_vdev);
+ }
+
+ if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ }
+
+ /*
+ * Drop the pool from the namespace and rewrite the config cache
+ * so it no longer appears there.
+ */
+ spa_remove(spa);
+ spa_config_sync();
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+/*
+ * Destroy a storage pool: mark it POOL_STATE_DESTROYED and remove it
+ * from the namespace and the config cache.  Returns 0 or an errno
+ * from spa_export_common().
+ */
+int
+spa_destroy(char *pool)
+{
+ return (spa_export_common(pool, POOL_STATE_DESTROYED));
+}
+
+/*
+ * Export a storage pool: mark it POOL_STATE_EXPORTED (so it can be
+ * imported elsewhere) and remove it from the namespace and the config
+ * cache.  Returns 0 or an errno from spa_export_common().
+ */
+int
+spa_export(char *pool)
+{
+ return (spa_export_common(pool, POOL_STATE_EXPORTED));
+}
+
+/*
+ * ==========================================================================
+ * Device manipulation
+ * ==========================================================================
+ */
+
+/*
+ * Add capacity to a storage pool.  'nvroot' describes one or more new
+ * top-level vdevs; each is transferred under the pool's root vdev and
+ * has its metaslabs initialized.  Also serves spa_create(), which
+ * enters with spa_root_vdev still NULL.  Returns 0, or EINVAL (bad
+ * nvroot) or another errno, via spa_vdev_exit().
+ */
+int
+spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
+{
+ uint64_t txg;
+ int c, error;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd;
+
+ txg = spa_vdev_enter(spa);
+
+ vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);
+
+ if (vd == NULL)
+ return (spa_vdev_exit(spa, vd, txg, EINVAL));
+
+ if (rvd == NULL) /* spa_create() */
+ spa->spa_root_vdev = rvd = vd;
+
+ if ((error = vdev_create(vd, txg)) != 0)
+ return (spa_vdev_exit(spa, vd, txg, error));
+
+ /*
+ * Transfer each top-level vdev from the temporary root
+ * to the spa's root and initialize its metaslabs.
+ */
+ for (c = 0; c < vd->vdev_children; c++) {
+ vdev_t *tvd = vd->vdev_child[c];
+ if (vd != rvd) {
+ vdev_remove_child(vd, tvd);
+ tvd->vdev_id = rvd->vdev_children;
+ vdev_add_child(rvd, tvd);
+ }
+ vdev_init(tvd, txg);
+ vdev_config_dirty(tvd);
+ }
+
+ /*
+ * Update the config based on the new in-core state.
+ */
+ spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));
+
+ return (spa_vdev_exit(spa, vd, txg, 0));
+}
+
+/*
+ * Attach a device to a mirror. The arguments are the path to any device
+ * in the mirror, and the nvroot for the new device. If the path specifies
+ * a device that is not mirrored, we automatically insert the mirror vdev.
+ *
+ * If 'replacing' is specified, the new device is intended to replace the
+ * existing device; in this case the two devices are made into their own
+ * mirror using the 'replacing' vdev, which is functionally identical to
+ * the mirror vdev (it actually reuses all the same ops) but has a few
+ * extra rules: you can't attach to it after it's been created, and upon
+ * completion of resilvering, the first disk (the one being replaced)
+ * is automatically detached.
+ *
+ * Returns 0 on success.  Fails with ENODEV if 'path' is not found,
+ * ENOTSUP if the parent vdev cannot accept the attach, EINVAL if the
+ * new device is not a single leaf, EOVERFLOW if it is smaller than
+ * the old device, or EDOM if its sector size (ashift) differs.
+ */
+int
+spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing)
+{
+ uint64_t txg, open_txg;
+ int error;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
+ vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops;
+
+ txg = spa_vdev_enter(spa);
+
+ oldvd = vdev_lookup_by_path(rvd, path);
+
+ if (oldvd == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ pvd = oldvd->vdev_parent;
+
+ /*
+ * The parent must be a mirror or the root, unless we're replacing;
+ * in that case, the parent can be anything but another replacing vdev.
+ */
+ if (pvd->vdev_ops != &vdev_mirror_ops &&
+ pvd->vdev_ops != &vdev_root_ops &&
+ (!replacing || pvd->vdev_ops == &vdev_replacing_ops))
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);
+
+ if (newrootvd == NULL || newrootvd->vdev_children != 1)
+ return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
+
+ newvd = newrootvd->vdev_child[0];
+
+ if (!newvd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
+
+ if ((error = vdev_create(newrootvd, txg)) != 0)
+ return (spa_vdev_exit(spa, newrootvd, txg, error));
+
+ if (newvd->vdev_psize < oldvd->vdev_psize)
+ return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
+
+ if (newvd->vdev_ashift != oldvd->vdev_ashift && oldvd->vdev_ashift != 0)
+ return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
+
+ /*
+ * If this is an in-place replacement, update oldvd's path and devid
+ * to make it distinguishable from newvd, and unopenable from now on.
+ */
+ if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
+ spa_strfree(oldvd->vdev_path);
+ /* "/old" suffix plus NUL accounts for the extra 5 bytes */
+ oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
+ KM_SLEEP);
+ (void) sprintf(oldvd->vdev_path, "%s/%s",
+ newvd->vdev_path, "old");
+ if (oldvd->vdev_devid != NULL) {
+ spa_strfree(oldvd->vdev_devid);
+ oldvd->vdev_devid = NULL;
+ }
+ }
+
+ /*
+ * If the parent is not a mirror, or if we're replacing,
+ * insert the new mirror/replacing vdev above oldvd.
+ */
+ if (pvd->vdev_ops != pvops)
+ pvd = vdev_add_parent(oldvd, pvops);
+
+ ASSERT(pvd->vdev_top->vdev_parent == rvd);
+ ASSERT(pvd->vdev_ops == pvops);
+ ASSERT(oldvd->vdev_parent == pvd);
+
+ /*
+ * Extract the new device from its root and add it to pvd.
+ */
+ vdev_remove_child(newrootvd, newvd);
+ newvd->vdev_id = pvd->vdev_children;
+ vdev_add_child(pvd, newvd);
+
+ tvd = newvd->vdev_top;
+ ASSERT(pvd->vdev_top == tvd);
+ ASSERT(tvd->vdev_parent == rvd);
+
+ /*
+ * Update the config based on the new in-core state.
+ */
+ spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));
+
+ vdev_config_dirty(tvd);
+
+ /*
+ * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate
+ * upward when spa_vdev_exit() calls vdev_dtl_reassess().
+ */
+ open_txg = txg + TXG_CONCURRENT_STATES - 1;
+
+ mutex_enter(&newvd->vdev_dtl_lock);
+ space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
+ open_txg - TXG_INITIAL + 1);
+ mutex_exit(&newvd->vdev_dtl_lock);
+
+ /*
+ * Mark newvd's DTL dirty in this txg.
+ */
+ vdev_dirty(tvd, VDD_DTL, txg);
+ (void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg);
+
+ dprintf("attached %s, replacing=%d\n", path, replacing);
+
+ (void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
+
+ /*
+ * Kick off a resilver to update newvd.
+ */
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+ return (0);
+}
+
+/*
+ * Detach a device from a mirror or replacing vdev.
+ * If 'replace_done' is specified, only detach if the parent
+ * is a replacing vdev.
+ *
+ * Returns 0 on success; ENODEV if 'path' (or the optional 'guid')
+ * cannot be resolved, ENOTSUP if the parent vdev type does not
+ * support detach, or EBUSY if this device may hold the only valid
+ * copy of some data.
+ */
+int
+spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done)
+{
+ uint64_t txg;
+ int c, t, error;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd, *pvd, *cvd, *tvd;
+
+ txg = spa_vdev_enter(spa);
+
+ vd = vdev_lookup_by_path(rvd, path);
+
+ if (vd == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ if (guid != 0 && vd->vdev_guid != guid)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ pvd = vd->vdev_parent;
+
+ /*
+ * If replace_done is specified, only remove this device if it's
+ * the first child of a replacing vdev.
+ */
+ if (replace_done &&
+ (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops))
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ /*
+ * Only mirror and replacing vdevs support detach.
+ */
+ if (pvd->vdev_ops != &vdev_replacing_ops &&
+ pvd->vdev_ops != &vdev_mirror_ops)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ /*
+ * If there's only one replica, you can't detach it.
+ */
+ if (pvd->vdev_children <= 1)
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+ /*
+ * If all siblings have non-empty DTLs, this device may have the only
+ * valid copy of the data, which means we cannot safely detach it.
+ *
+ * XXX -- as in the vdev_offline() case, we really want a more
+ * precise DTL check.
+ */
+ for (c = 0; c < pvd->vdev_children; c++) {
+ uint64_t dirty;
+
+ cvd = pvd->vdev_child[c];
+ if (cvd == vd)
+ continue;
+ if (vdev_is_dead(cvd))
+ continue;
+ mutex_enter(&cvd->vdev_dtl_lock);
+ dirty = cvd->vdev_dtl_map.sm_space |
+ cvd->vdev_dtl_scrub.sm_space;
+ mutex_exit(&cvd->vdev_dtl_lock);
+ if (!dirty)
+ break;
+ }
+ /*
+ * Loop ran to completion: no live sibling had an empty DTL,
+ * so vd may be the sole good copy -- refuse the detach.
+ */
+ if (c == pvd->vdev_children)
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+ /*
+ * Erase the disk labels so the disk can be used for other things.
+ * This must be done after all other error cases are handled,
+ * but before we disembowel vd (so we can still do I/O to it).
+ * But if we can't do it, don't treat the error as fatal --
+ * it may be that the unwritability of the disk is the reason
+ * it's being detached!
+ */
+ error = vdev_label_init(vd, 0);
+ if (error)
+ dprintf("unable to erase labels on %s\n", vdev_description(vd));
+
+ /*
+ * Remove vd from its parent and compact the parent's children.
+ */
+ vdev_remove_child(pvd, vd);
+ vdev_compact_children(pvd);
+
+ /*
+ * Remember one of the remaining children so we can get tvd below.
+ */
+ cvd = pvd->vdev_child[0];
+
+ /*
+ * If the parent mirror/replacing vdev only has one child,
+ * the parent is no longer needed. Remove it from the tree.
+ */
+ if (pvd->vdev_children == 1)
+ vdev_remove_parent(cvd);
+
+ /*
+ * We don't set tvd until now because the parent we just removed
+ * may have been the previous top-level vdev.
+ */
+ tvd = cvd->vdev_top;
+ ASSERT(tvd->vdev_parent == rvd);
+
+ /*
+ * Reopen this top-level vdev to reassess health after detach.
+ */
+ vdev_reopen(tvd, NULL);
+
+ /*
+ * If the device we just detached was smaller than the others,
+ * it may be possible to add metaslabs (i.e. grow the pool).
+ */
+ vdev_metaslab_init(tvd, txg);
+
+ /*
+ * Update the config based on the new in-core state.
+ */
+ spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));
+
+ vdev_config_dirty(tvd);
+
+ /*
+ * Mark vd's DTL as dirty in this txg.
+ * vdev_dtl_sync() will see that vd->vdev_detached is set
+ * and free vd's DTL object in syncing context.
+ * But first make sure we're not on any *other* txg's DTL list,
+ * to prevent vd from being accessed after it's freed.
+ */
+ vdev_dirty(tvd, VDD_DTL, txg);
+ vd->vdev_detached = B_TRUE;
+ for (t = 0; t < TXG_SIZE; t++)
+ (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
+ (void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
+
+ dprintf("detached %s\n", path);
+
+ return (spa_vdev_exit(spa, vd, txg, 0));
+}
+
+/*
+ * If there are any replacing vdevs that have finished replacing, detach them.
+ * We can't hold the config lock across detaches, so we lock the config,
+ * build a list of candidates, unlock the config, and try each candidate.
+ */
+typedef struct vdev_detach_link {
+ char *vdl_path; /* copy of the candidate vdev's path */
+ uint64_t vdl_guid; /* its guid, to guard against path reuse */
+ list_node_t vdl_node; /* linkage on the candidate list */
+} vdev_detach_link_t;
+
+/*
+ * Recursively walk the vdev tree under 'vd' and append to list 'l' an
+ * entry (path + guid of child 0, the device being replaced) for every
+ * replacing vdev whose second child has an empty DTL -- i.e. whose
+ * resilver has completed.
+ */
+static void
+spa_vdev_replace_done_make_list(list_t *l, vdev_t *vd)
+{
+ int c;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ spa_vdev_replace_done_make_list(l, vd->vdev_child[c]);
+
+ if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
+ vdev_t *cvd0 = vd->vdev_child[0];
+ vdev_t *cvd1 = vd->vdev_child[1];
+ vdev_detach_link_t *vdl;
+ int dirty1;
+
+ mutex_enter(&cvd1->vdev_dtl_lock);
+ dirty1 = cvd1->vdev_dtl_map.sm_space |
+ cvd1->vdev_dtl_scrub.sm_space;
+ mutex_exit(&cvd1->vdev_dtl_lock);
+
+ if (!dirty1) {
+ vdl = kmem_zalloc(sizeof (*vdl), KM_SLEEP);
+ vdl->vdl_path = spa_strdup(cvd0->vdev_path);
+ vdl->vdl_guid = cvd0->vdev_guid;
+ list_insert_tail(l, vdl);
+ }
+ }
+}
+
+/*
+ * Detach every fully-resilvered replacing vdev in the pool.  See the
+ * comment above vdev_detach_link_t for the locking strategy: build
+ * the candidate list under the config lock, then detach lock-free.
+ */
+void
+spa_vdev_replace_done(spa_t *spa)
+{
+ vdev_detach_link_t *vdl;
+ list_t vdlist;
+
+ list_create(&vdlist, sizeof (vdev_detach_link_t),
+ offsetof(vdev_detach_link_t, vdl_node));
+
+ spa_config_enter(spa, RW_READER);
+ spa_vdev_replace_done_make_list(&vdlist, spa->spa_root_vdev);
+ spa_config_exit(spa);
+
+ while ((vdl = list_head(&vdlist)) != NULL) {
+ list_remove(&vdlist, vdl);
+ /* Best effort: a failed detach is simply skipped. */
+ (void) spa_vdev_detach(spa, vdl->vdl_path, vdl->vdl_guid,
+ B_TRUE);
+ spa_strfree(vdl->vdl_path);
+ kmem_free(vdl, sizeof (*vdl));
+ }
+
+ list_destroy(&vdlist);
+}
+
+/*
+ * ==========================================================================
+ * SPA Scrubbing
+ * ==========================================================================
+ */
+
+static int spa_scrub_locked(spa_t *, pool_scrub_type_t, boolean_t);
+
+/*
+ * zio completion callback for scrub/resilver reads: free the data
+ * buffer, tally any error against both the pool and the target vdev,
+ * and wake spa_scrub_thread() when the last in-flight I/O drains.
+ */
+static void
+spa_scrub_io_done(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+
+ zio_buf_free(zio->io_data, zio->io_size);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ if (zio->io_error)
+ spa->spa_scrub_errors++;
+ if (--spa->spa_scrub_inflight == 0)
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&spa->spa_scrub_lock);
+
+ if (zio->io_error) {
+ vdev_t *vd = zio->io_vd;
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_scrub_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+}
+
+/*
+ * Issue an asynchronous read of 'bp' for scrub/resilver purposes and
+ * bump spa_scrub_inflight; spa_scrub_io_done() frees the buffer and
+ * drops the count.  NOTE(review): any repair of bad copies presumably
+ * happens inside the zio pipeline via the RESILVER/SCRUB flags --
+ * confirm in zio.c.
+ */
+static void
+spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags)
+{
+ size_t size = BP_GET_LSIZE(bp);
+ void *data = zio_buf_alloc(size);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_inflight++;
+ mutex_exit(&spa->spa_scrub_lock);
+
+ zio_nowait(zio_read(NULL, spa, bp, data, size,
+ spa_scrub_io_done, NULL, priority, flags));
+}
+
+/*
+ * Traverse callback invoked for each block pointer visited.  Records
+ * unreadable blocks as errors and continues; otherwise accounts the
+ * examined bytes and issues a scrub read, or -- for resilvers -- a
+ * resilver read only if the block's birth txg falls in the relevant
+ * DTL.  Returns ERESTART for blocks it cannot handle here (exact
+ * semantics defined by the traverse code -- confirm in dmu_traverse.c)
+ * and 0 otherwise.
+ */
+/* ARGSUSED */
+static int
+spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
+{
+ blkptr_t *bp = &bc->bc_blkptr;
+ vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0]));
+
+ if (bc->bc_errno || vd == NULL) {
+ /*
+ * We can't scrub this block, but we can continue to scrub
+ * the rest of the pool. Note the error and move along.
+ */
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_errors++;
+ mutex_exit(&spa->spa_scrub_lock);
+
+ if (vd != NULL) {
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_scrub_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+
+ return (ERESTART);
+ }
+
+ ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);
+
+ /*
+ * Keep track of how much data we've examined so that
+ * zpool(1M) status can make useful progress reports.
+ */
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp);
+ mutex_exit(&vd->vdev_stat_lock);
+
+ if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
+ if (DVA_GET_GANG(&bp->blk_dva[0])) {
+ /*
+ * Gang members may be spread across multiple vdevs,
+ * so the best we can do is look at the pool-wide DTL.
+ * XXX -- it would be better to change our allocation
+ * policy to ensure that this can't happen.
+ */
+ vd = spa->spa_root_vdev;
+ }
+ if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) {
+ spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY |
+ ZIO_FLAG_RESILVER);
+ }
+ } else {
+ spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SCRUB);
+ }
+
+ return (0);
+}
+
+/*
+ * Body of the scrub/resilver thread created by spa_scrub_locked().
+ * Repeatedly calls traverse_more() (which feeds spa_scrub_cb()),
+ * honoring suspend/stop/restart requests between chunks, then waits
+ * for outstanding scrub I/Os, updates DTLs and scrub stats, detaches
+ * completed replacements, and finally either exits or restarts a new
+ * scrub if a restart txg was posted while it ran.
+ */
+static void
+spa_scrub_thread(spa_t *spa)
+{
+ callb_cpr_t cprinfo;
+ traverse_handle_t *th = spa->spa_scrub_th;
+ vdev_t *rvd = spa->spa_root_vdev;
+ pool_scrub_type_t scrub_type = spa->spa_scrub_type;
+ int error = 0;
+ boolean_t complete;
+
+ CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);
+
+ spa_config_enter(spa, RW_WRITER);
+ vdev_reopen(rvd, NULL); /* purge all vdev caches */
+ vdev_config_dirty(rvd); /* rewrite all disk labels */
+ vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
+ spa_config_exit(spa);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_errors = 0;
+ spa->spa_scrub_active = 1;
+
+ while (!spa->spa_scrub_stop) {
+ /* CPR-safe wait: park here while a suspend is in effect. */
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ while (spa->spa_scrub_suspend) {
+ spa->spa_scrub_active = 0;
+ cv_broadcast(&spa->spa_scrub_cv);
+ cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
+ spa->spa_scrub_active = 1;
+ }
+ CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);
+
+ if (spa->spa_scrub_restart_txg != 0)
+ break;
+
+ mutex_exit(&spa->spa_scrub_lock);
+ error = traverse_more(th);
+ mutex_enter(&spa->spa_scrub_lock);
+ /* EAGAIN means "more to traverse"; anything else ends it. */
+ if (error != EAGAIN)
+ break;
+ }
+
+ while (spa->spa_scrub_inflight)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+
+ if (spa->spa_scrub_restart_txg != 0)
+ error = ERESTART;
+
+ spa->spa_scrub_active = 0;
+ cv_broadcast(&spa->spa_scrub_cv);
+
+ /*
+ * If the traverse completed, and there were no errors,
+ * then the scrub was completely successful.
+ */
+ complete = (error == 0 && spa->spa_scrub_errors == 0);
+
+ dprintf("scrub to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
+ spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
+ error, spa->spa_scrub_errors, spa->spa_scrub_stop);
+
+ mutex_exit(&spa->spa_scrub_lock);
+
+ /*
+ * If the scrub/resilver completed, update all DTLs to reflect this.
+ * Whether it succeeded or not, vacate all temporary scrub DTLs.
+ */
+ spa_config_enter(spa, RW_WRITER);
+ vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
+ complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
+ spa_config_exit(spa);
+
+ spa_vdev_replace_done(spa);
+
+ spa_config_enter(spa, RW_READER);
+ vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
+ spa_config_exit(spa);
+
+ mutex_enter(&spa->spa_scrub_lock);
+
+ spa->spa_scrub_type = POOL_SCRUB_NONE;
+ spa->spa_scrub_active = 0;
+ spa->spa_scrub_thread = NULL;
+
+ cv_broadcast(&spa->spa_scrub_cv);
+
+ /*
+ * If we were told to restart, our final act is to start a new scrub.
+ */
+ if (error == ERESTART)
+ VERIFY(spa_scrub_locked(spa, scrub_type, B_TRUE) == 0);
+
+ CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */
+ thread_exit();
+}
+
+/*
+ * Suspend the scrub thread: bump the suspend count, then wait until
+ * the thread has gone inactive and all its I/Os have drained.
+ * Suspends nest; each call must be paired with spa_scrub_resume().
+ */
+void
+spa_scrub_suspend(spa_t *spa)
+{
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_suspend++;
+ while (spa->spa_scrub_active) {
+ cv_broadcast(&spa->spa_scrub_cv);
+ cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
+ }
+ while (spa->spa_scrub_inflight)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+/*
+ * Drop one suspend reference; wake the scrub thread once the count
+ * reaches zero.  Must balance a prior spa_scrub_suspend().
+ */
+void
+spa_scrub_resume(spa_t *spa)
+{
+ mutex_enter(&spa->spa_scrub_lock);
+ ASSERT(spa->spa_scrub_suspend != 0);
+ if (--spa->spa_scrub_suspend == 0)
+ cv_broadcast(&spa->spa_scrub_cv);
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+/*
+ * Request that any in-progress scrub restart from txg 'txg'; the scrub
+ * thread notices the nonzero restart txg between traverse chunks.
+ */
+void
+spa_scrub_restart(spa_t *spa, uint64_t txg)
+{
+ /*
+ * Something happened (e.g. snapshot create/delete) that means
+ * we must restart any in-progress scrubs. The itinerary will
+ * fix this properly.
+ */
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_restart_txg = txg;
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+/*
+ * Start a scrub of the given type (or just stop the current one, for
+ * POOL_SCRUB_NONE).  Caller must hold spa_scrub_lock.  Any scrub in
+ * progress is stopped first -- except an unforced resilver, which is
+ * refused with EBUSY -- then the txg range to examine is computed and
+ * spa_scrub_thread() is kicked off.  Returns ENOTSUP for an unknown
+ * type, EBUSY as above, 0 otherwise.
+ */
+static int
+spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force)
+{
+ space_seg_t *ss;
+ uint64_t mintxg, maxtxg;
+ vdev_t *rvd = spa->spa_root_vdev;
+ int advance = 0;
+
+ if ((uint_t)type >= POOL_SCRUB_TYPES)
+ return (ENOTSUP);
+
+ /*
+ * If there's a scrub or resilver already in progress, stop it.
+ */
+ while (spa->spa_scrub_thread != NULL) {
+ /*
+ * Don't stop a resilver unless forced.
+ */
+ if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force)
+ return (EBUSY);
+
+ spa->spa_scrub_stop = 1;
+ cv_broadcast(&spa->spa_scrub_cv);
+ cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
+ }
+
+ /*
+ * Terminate the previous traverse.
+ */
+ if (spa->spa_scrub_th != NULL) {
+ traverse_fini(spa->spa_scrub_th);
+ spa->spa_scrub_th = NULL;
+ }
+
+ spa->spa_scrub_stop = 0;
+ spa->spa_scrub_type = type;
+ spa->spa_scrub_restart_txg = 0;
+
+ mintxg = TXG_INITIAL - 1;
+ maxtxg = spa_last_synced_txg(spa) + 1;
+
+ switch (type) {
+
+ case POOL_SCRUB_NONE:
+ break;
+
+ case POOL_SCRUB_RESILVER:
+ /*
+ * Determine the resilvering boundaries.
+ *
+ * Note: (mintxg, maxtxg) is an open interval,
+ * i.e. mintxg and maxtxg themselves are not included.
+ *
+ * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
+ * so we don't claim to resilver a txg that's still changing.
+ */
+ mutex_enter(&rvd->vdev_dtl_lock);
+ ss = avl_first(&rvd->vdev_dtl_map.sm_root);
+ mintxg = ss ? ss->ss_start - 1 : 0;
+ ss = avl_last(&rvd->vdev_dtl_map.sm_root);
+ maxtxg = ss ? ss->ss_end : 0;
+ maxtxg = MIN(maxtxg, spa_last_synced_txg(spa) + 1);
+ mutex_exit(&rvd->vdev_dtl_lock);
+
+ advance = ADVANCE_PRE | ADVANCE_PRUNE;
+ break;
+
+ case POOL_SCRUB_EVERYTHING:
+ /*
+ * A scrub is like a resilver, but not pruned by DTL.
+ */
+ advance = ADVANCE_PRE;
+ break;
+ }
+
+ /*
+ * Skip thread creation entirely when there is nothing to do
+ * (type NONE, or a resilver whose DTL turned out to be empty).
+ */
+ if (mintxg != 0 && maxtxg != 0 && type != POOL_SCRUB_NONE) {
+ spa->spa_scrub_maxtxg = maxtxg;
+ spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
+ advance, ZIO_FLAG_CANFAIL);
+ traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
+ spa->spa_scrub_thread = thread_create(NULL, 0,
+ spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
+ }
+
+ return (0);
+}
+
+/*
+ * Public entry point: take spa_scrub_lock and delegate to
+ * spa_scrub_locked().  If no scrub thread was started (e.g. nothing
+ * to resilver) and a scrub was requested, reap any already-completed
+ * replacing vdevs directly.
+ */
+int
+spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
+{
+ int error;
+ traverse_handle_t *th;
+
+ mutex_enter(&spa->spa_scrub_lock);
+ error = spa_scrub_locked(spa, type, force);
+ th = spa->spa_scrub_th;
+ mutex_exit(&spa->spa_scrub_lock);
+
+ if (th == NULL && type != POOL_SCRUB_NONE)
+ spa_vdev_replace_done(spa);
+
+ return (error);
+}
+
+/*
+ * ==========================================================================
+ * SPA syncing routines
+ * ==========================================================================
+ */
+
+/*
+ * Free every block recorded on the pool's deferred-free bplist, wait
+ * for the frees to complete, then vacate the list in txg 'txg'.
+ */
+static void
+spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
+{
+ bplist_t *bpl = &spa->spa_sync_bplist;
+ dmu_tx_t *tx;
+ blkptr_t blk;
+ uint64_t itor = 0;
+ zio_t *zio;
+ int error;
+ uint8_t c = 1;
+
+ zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);
+
+ while (bplist_iterate(bpl, &itor, &blk) == 0)
+ zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));
+
+ error = zio_wait(zio);
+ ASSERT3U(error, ==, 0);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ bplist_vacate(bpl, tx);
+
+ /*
+ * Pre-dirty the first block so we sync to convergence faster.
+ * (Usually only the first block is needed.)
+ */
+ dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
+ dmu_tx_commit(tx);
+}
+
+/*
+ * If the vdev configuration is dirty, regenerate it, pack it as an
+ * XDR-encoded nvlist, and write it into the pool's config object;
+ * the object's bonus buffer records the packed size.
+ */
+static void
+spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
+{
+ nvlist_t *config;
+ char *packed = NULL;
+ size_t nvsize = 0;
+ dmu_buf_t *db;
+
+ if (list_is_empty(&spa->spa_dirty_list))
+ return;
+
+ config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);
+
+ spa_config_set(spa, config);
+
+ VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0);
+
+ packed = kmem_alloc(nvsize, KM_SLEEP);
+
+ VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, 0) == 0);
+
+ dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize,
+ packed, tx);
+
+ kmem_free(packed, nvsize);
+
+ db = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object);
+ dmu_buf_will_dirty(db, tx);
+ *(uint64_t *)db->db_data = nvsize;
+ dmu_buf_rele(db);
+}
+
+/*
+ * Sync the specified transaction group. New blocks may be dirtied as
+ * part of the process, so we iterate until it converges.
+ *
+ * Takes the config lock as reader for the duration.  The final label
+ * rewrite (spa_sync_labels) is retried until it succeeds, reopening
+ * the vdev tree between attempts.
+ */
+void
+spa_sync(spa_t *spa, uint64_t txg)
+{
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ objset_t *mos = spa->spa_meta_objset;
+ bplist_t *bpl = &spa->spa_sync_bplist;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd;
+ dmu_tx_t *tx;
+ int dirty_vdevs;
+
+ /*
+ * Lock out configuration changes.
+ */
+ spa_config_enter(spa, RW_READER);
+
+ spa->spa_syncing_txg = txg;
+ spa->spa_sync_pass = 0;
+
+ bplist_open(bpl, mos, spa->spa_sync_bplist_obj);
+
+ /*
+ * If anything has changed in this txg, push the deferred frees
+ * from the previous txg. If not, leave them alone so that we
+ * don't generate work on an otherwise idle system.
+ */
+ if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
+ !txg_list_empty(&dp->dp_dirty_dirs, txg))
+ spa_sync_deferred_frees(spa, txg);
+
+ /*
+ * Iterate to convergence.
+ */
+ do {
+ spa->spa_sync_pass++;
+
+ tx = dmu_tx_create_assigned(dp, txg);
+ spa_sync_config_object(spa, tx);
+ dmu_tx_commit(tx);
+
+ dsl_pool_sync(dp, txg);
+
+ dirty_vdevs = 0;
+ /* NOTE: assignment in the loop condition is intentional. */
+ while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
+ vdev_sync(vd, txg);
+ dirty_vdevs++;
+ }
+
+ tx = dmu_tx_create_assigned(dp, txg);
+ bplist_sync(bpl, tx);
+ dmu_tx_commit(tx);
+
+ } while (dirty_vdevs);
+
+ bplist_close(bpl);
+
+ dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
+
+ /*
+ * Rewrite the vdev configuration (which includes the uberblock)
+ * to commit the transaction group.
+ */
+ while (spa_sync_labels(spa, txg)) {
+ dprintf("waiting for devices to heal\n");
+ delay(hz);
+ vdev_reopen(rvd, NULL);
+ }
+
+ /*
+ * Make a stable copy of the fully synced uberblock.
+ * We use this as the root for pool traversals.
+ */
+ spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */
+
+ spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */
+
+ rw_enter(&spa->spa_traverse_lock, RW_WRITER);
+ spa->spa_traverse_wanted = 0;
+ spa->spa_ubsync = spa->spa_uberblock;
+ rw_exit(&spa->spa_traverse_lock);
+
+ spa_scrub_resume(spa); /* resume scrub with new ubsync */
+
+ /*
+ * Clean up the ZIL records for the synced txg.
+ */
+ dsl_pool_zil_clean(dp);
+
+ /*
+ * Update usable space statistics.
+ */
+ while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
+ vdev_sync_done(vd, txg);
+
+ /*
+ * It had better be the case that we didn't dirty anything
+ * since spa_sync_labels().
+ */
+ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
+ ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
+ ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
+ ASSERT(bpl->bpl_queue == NULL);
+
+ spa_config_exit(spa);
+}
+
+/*
+ * Sync all pools. We don't want to hold the namespace lock across these
+ * operations, so we take a reference on the spa_t and drop the lock during the
+ * sync.
+ */
+void
+spa_sync_allpools(void)
+{
+ spa_t *spa = NULL;
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL) {
+ if (spa_state(spa) != POOL_STATE_ACTIVE)
+ continue;
+ /*
+ * Hold the spa so it can't disappear while we drop the
+ * namespace lock for the (potentially long) sync wait.
+ */
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+ }
+ mutex_exit(&spa_namespace_lock);
+}
+
+/*
+ * ==========================================================================
+ * Miscellaneous routines
+ * ==========================================================================
+ */
+
+/*
+ * Returns nonzero while spa_active_count is nonzero; the count is
+ * adjusted where spa_root (the alternate root) is set and cleared --
+ * see the import path above and spa_export_common().
+ */
+int
+spa_busy(void)
+{
+ return (spa_active_count != 0);
+}
+
+/*
+ * Remove all pools in the system: stop their scrub activity, unload
+ * them, and delete every spa_t from the namespace.  All pools should
+ * already be closed, so each spa_t should be unreferenced.
+ */
+void
+spa_evict_all(void)
+{
+ spa_t *spa;
+
+ /*
+ * Remove all cached state. All pools should be closed now,
+ * so every spa in the AVL tree should be unreferenced.
+ */
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(NULL)) != NULL) {
+ /*
+ * Stop all scrub and resilver activity. spa_scrub() needs to
+ * wait for the scrub thread, which may do a detach and sync the
+ * configs, which needs spa_namespace_lock. Drop the lock while
+ * maintaining a hold on the spa_t.
+ */
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+
+ if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ }
+ spa_remove(spa);
+ }
+ mutex_exit(&spa_namespace_lock);
+}
diff --git a/usr/src/uts/common/fs/zfs/spa_config.c b/usr/src/uts/common/fs/zfs/spa_config.c
new file mode 100644
index 0000000000..abcd67ddb9
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/spa_config.c
@@ -0,0 +1,308 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/nvpair.h>
+#include <sys/uio.h>
+#include <sys/fs/zfs.h>
+#include <sys/vdev_impl.h>
+#include <sys/zfs_ioctl.h>
+
+/*
+ * Pool configuration repository.
+ *
+ * The configuration for all pools, in addition to being stored on disk, is
+ * stored in /kernel/drv/zpool.cache as a packed nvlist. The kernel maintains
+ * this list as pools are created, destroyed, or modified.
+ *
+ * We have a single nvlist which holds all the configuration information. When
+ * the module loads, we read this information from the cache and populate the
+ * SPA namespace. This namespace is maintained independently in spa.c.
+ * Whenever the namespace is modified, or the configuration of a pool is
+ * changed, we call spa_config_sync(), which walks through all the active pools
+ * and writes the configuration to disk.
+ */
+
+static uint64_t spa_config_generation = 1;
+
+/*
+ * This can be overridden in userland to preserve an alternate namespace for
+ * userland pools when doing testing.
+ */
+const char *spa_config_dir = ZPOOL_CACHE_DIR;
+
+/*
+ * Called when the module is first loaded, this routine loads the configuration
+ * file into the SPA namespace. It does not actually open or load the pools; it
+ * only populates the namespace.
+ */
+void
+spa_config_load(void)
+{
+ vnode_t *vp;
+ void *buf = NULL;
+ vattr_t vattr;
+ ssize_t resid;
+ nvlist_t *nvlist, *child;
+ nvpair_t *nvpair;
+ spa_t *spa;
+ char pathname[128];
+
+ /*
+ * Open the configuration file. A missing or unreadable cache file
+ * is not an error -- we simply start with an empty namespace.
+ */
+ (void) snprintf(pathname, sizeof (pathname), "./%s/%s", spa_config_dir,
+ ZPOOL_CACHE_FILE);
+ if (vn_openat(pathname, UIO_SYSSPACE, FREAD | FOFFMAX, 0, &vp, 0, 0,
+ rootdir) != 0)
+ return;
+
+ /*
+ * Read the nvlist from the file.
+ * NOTE(review): vattr.va_mask is not set before VOP_GETATTR;
+ * presumably the filesystem fills in va_size regardless -- confirm.
+ */
+ if (VOP_GETATTR(vp, &vattr, 0, kcred) != 0)
+ goto out;
+
+ buf = kmem_alloc(vattr.va_size, KM_SLEEP);
+
+ if (vn_rdwr(UIO_READ, vp, buf, vattr.va_size, 0, UIO_SYSSPACE,
+ 0, RLIM64_INFINITY, kcred, &resid) != 0)
+ goto out;
+
+ /* A short read means the file changed size underneath us; bail. */
+ if (resid != 0)
+ goto out;
+
+ /*
+ * Unpack the nvlist.
+ */
+ if (nvlist_unpack(buf, vattr.va_size, &nvlist, KM_SLEEP) != 0)
+ goto out;
+
+ /*
+ * Iterate over all elements in the nvlist, creating a new spa_t for
+ * each one with the specified configuration.
+ */
+ mutex_enter(&spa_namespace_lock);
+ nvpair = NULL;
+ while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
+
+ if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
+ continue;
+
+ VERIFY(nvpair_value_nvlist(nvpair, &child) == 0);
+
+ /* Skip pools that already exist (e.g. created before load). */
+ if (spa_lookup(nvpair_name(nvpair)) != NULL)
+ continue;
+ spa = spa_add(nvpair_name(nvpair));
+
+ /*
+ * We blindly duplicate the configuration here. If it's
+ * invalid, we will catch it when the pool is first opened.
+ */
+ VERIFY(nvlist_dup(child, &spa->spa_config, 0) == 0);
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ nvlist_free(nvlist);
+
+out:
+ /* Common exit for success and failure: release buffer and vnode. */
+ if (buf != NULL)
+ kmem_free(buf, vattr.va_size);
+
+ (void) VOP_CLOSE(vp, FREAD | FOFFMAX, 1, 0, kcred);
+ VN_RELE(vp);
+}
+
+/*
+ * Synchronize all pools to disk. This must be called with the namespace lock
+ * held.
+ */
+void
+spa_config_sync(void)
+{
+ spa_t *spa = NULL;
+ nvlist_t *config;
+ size_t buflen;
+ char *buf;
+ vnode_t *vp;
+ int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX;
+ char pathname[128];
+ char pathname2[128];
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
+
+ /*
+ * Add all known pools to the configuration list, ignoring those with
+ * alternate root paths.
+ */
+ spa = NULL;
+ while ((spa = spa_next(spa)) != NULL) {
+ mutex_enter(&spa->spa_config_cache_lock);
+ if (spa->spa_config && spa->spa_name && spa->spa_root == NULL)
+ VERIFY(nvlist_add_nvlist(config, spa->spa_name,
+ spa->spa_config) == 0);
+ mutex_exit(&spa->spa_config_cache_lock);
+ }
+
+ /*
+ * Pack the configuration into a buffer.
+ */
+ VERIFY(nvlist_size(config, &buflen, NV_ENCODE_XDR) == 0);
+
+ buf = kmem_alloc(buflen, KM_SLEEP);
+
+ VERIFY(nvlist_pack(config, &buf, &buflen, NV_ENCODE_XDR, 0) == 0);
+
+ /*
+ * Write the configuration to disk. We need to do the traditional
+ * 'write to temporary file, sync, move over original' to make sure we
+ * always have a consistent view of the data.
+ */
+ (void) snprintf(pathname, sizeof (pathname), "%s/%s", spa_config_dir,
+ ZPOOL_CACHE_TMP);
+
+ if (vn_open(pathname, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0) != 0)
+ goto out;
+
+ /* Only rename into place if both the write and the fsync succeed. */
+ if (vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE,
+ 0, RLIM64_INFINITY, kcred, NULL) == 0 &&
+ VOP_FSYNC(vp, FSYNC, kcred) == 0) {
+ (void) snprintf(pathname2, sizeof (pathname2), "%s/%s",
+ spa_config_dir, ZPOOL_CACHE_FILE);
+ (void) vn_rename(pathname, pathname2, UIO_SYSSPACE);
+ }
+
+ (void) VOP_CLOSE(vp, oflags, 1, 0, kcred);
+ VN_RELE(vp);
+
+out:
+ /*
+ * Remove the temp file (a no-op after a successful rename).
+ * NOTE(review): the generation is bumped even when vn_open fails,
+ * so spa_all_configs() callers still see a change -- looks
+ * intentional, but confirm.
+ */
+ (void) vn_remove(pathname, UIO_SYSSPACE, RMFILE);
+ spa_config_generation++;
+
+ kmem_free(buf, buflen);
+ nvlist_free(config);
+}
+
+/*
+ * Sigh. Inside a local zone, we don't have access to /kernel/drv/zpool.cache,
+ * and we don't want to allow the local zone to see all the pools anyway.
+ * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration
+ * information for all pools visible within the zone.
+ */
+nvlist_t *
+spa_all_configs(uint64_t *generation)
+{
+ nvlist_t *pools;
+ spa_t *spa;
+
+ /*
+ * If the caller's generation is current, there is nothing new to
+ * report. (Read without the namespace lock; a stale read only
+ * costs an extra round trip.)
+ */
+ if (*generation == spa_config_generation)
+ return (NULL);
+
+ VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, 0) == 0);
+
+ spa = NULL;
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL) {
+ /* Non-global zones only see pools with visible datasets. */
+ if (INGLOBALZONE(curproc) ||
+ zone_dataset_visible(spa_name(spa), NULL)) {
+ mutex_enter(&spa->spa_config_cache_lock);
+ VERIFY(nvlist_add_nvlist(pools, spa_name(spa),
+ spa->spa_config) == 0);
+ mutex_exit(&spa->spa_config_cache_lock);
+ }
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ *generation = spa_config_generation;
+
+ return (pools);
+}
+
+/*
+ * Replace the cached config nvlist for 'spa', freeing any previous one.
+ * Takes ownership of 'config' (which may be NULL, e.g. from spa_remove()).
+ */
+void
+spa_config_set(spa_t *spa, nvlist_t *config)
+{
+ mutex_enter(&spa->spa_config_cache_lock);
+ if (spa->spa_config != NULL)
+ nvlist_free(spa->spa_config);
+ spa->spa_config = config;
+ mutex_exit(&spa->spa_config_cache_lock);
+}
+
+/*
+ * Generate the pool's configuration based on the current in-core state.
+ * We infer whether to generate a complete config or just one top-level config
+ * based on whether vd is the root vdev.
+ */
+nvlist_t *
+spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
+{
+ nvlist_t *config, *nvroot;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ /* NULL means "generate the complete config from the root vdev". */
+ if (vd == NULL)
+ vd = rvd;
+
+ /*
+ * If txg is -1, report the current value of spa->spa_config_txg.
+ * If txg is any other non-zero value, update spa->spa_config_txg.
+ */
+ if (txg == -1ULL)
+ txg = spa->spa_config_txg;
+ else if (txg != 0 && vd == rvd)
+ spa->spa_config_txg = txg;
+
+ VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
+
+ /* Pool-wide identity and state. */
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
+ UBERBLOCK_VERSION) == 0);
+ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
+ spa_name(spa)) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ spa_state(spa)) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ txg) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ spa_guid(spa)) == 0);
+
+ /* For a non-root vdev, record which top-level vdev this label is for. */
+ if (vd != rvd) {
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID,
+ vd->vdev_top->vdev_guid) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_GUID,
+ vd->vdev_guid) == 0);
+ vd = vd->vdev_top; /* label contains top config */
+ }
+
+ nvroot = vdev_config_generate(vd, getstats);
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
+ nvlist_free(nvroot);
+
+ return (config);
+}
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
new file mode 100644
index 0000000000..c1b6017509
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -0,0 +1,848 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * SPA locking
+ *
+ * There are four basic locks for managing spa_t structures:
+ *
+ * spa_namespace_lock (global mutex)
+ *
+ * This lock must be acquired to do any of the following:
+ *
+ * - Lookup a spa_t by name
+ * - Add or remove a spa_t from the namespace
+ * - Increase spa_refcount from non-zero
+ * - Check if spa_refcount is zero
+ * - Rename a spa_t
+ * - Held for the duration of create/destroy/import/export
+ *
+ * It does not need to handle recursion. A create or destroy may
+ * reference objects (files or zvols) in other pools, but by
+ * definition they must have an existing reference, and will never need
+ * to lookup a spa_t by name.
+ *
+ * spa_refcount (per-spa refcount_t protected by mutex)
+ *
+ * This reference count keeps track of any active users of the spa_t. The
+ * spa_t cannot be destroyed or freed while this is non-zero. Internally,
+ * the refcount is never really 'zero' - opening a pool implicitly keeps
+ * some references in the DMU. Internally we check against SPA_MINREF, but
+ * present the image of a zero/non-zero value to consumers.
+ *
+ * spa_config_lock (per-spa crazy rwlock)
+ *
+ * This SPA special is a recursive rwlock, capable of being acquired from
+ * asynchronous threads. It protects the spa_t from config changes,
+ * and must be held in the following circumstances:
+ *
+ * - RW_READER to perform I/O to the spa
+ * - RW_WRITER to change the vdev config
+ *
+ * spa_config_cache_lock (per-spa mutex)
+ *
+ * This mutex prevents the spa_config nvlist from being updated. No
+ * other locks are required to obtain this lock, although implicitly you
+ * must have the namespace lock or non-zero refcount to have any kind
+ * of spa_t pointer at all.
+ *
+ * spa_vdev_lock (global mutex)
+ *
+ * This special lock is a global mutex used to serialize attempts to
+ * access devices through ZFS. It makes sure that we do not try to add
+ * a single vdev to multiple pools at the same time. It must be held
+ * when adding or removing a device from the pool.
+ *
+ *
+ * The locking order is fairly straightforward:
+ *
+ * spa_namespace_lock -> spa_refcount
+ *
+ * The namespace lock must be acquired to increase the refcount from 0
+ * or to check if it is zero.
+ *
+ * spa_refcount -> spa_config_lock
+ *
+ * There must be at least one valid reference on the spa_t to acquire
+ * the config lock.
+ *
+ * spa_vdev_lock -> spa_config_lock
+ *
+ * There are no locks required for spa_vdev_lock, but it must be
+ * acquired before spa_config_lock.
+ *
+ *
+ * The spa_namespace_lock and spa_config_cache_lock can be acquired directly and
+ * are globally visible.
+ *
+ * The namespace is manipulated using the following functions, all which require
+ * the spa_namespace_lock to be held.
+ *
+ * spa_lookup() Lookup a spa_t by name.
+ *
+ * spa_add() Create a new spa_t in the namespace.
+ *
+ * spa_remove() Remove a spa_t from the namespace. This also
+ * frees up any memory associated with the spa_t.
+ *
+ * spa_next() Returns the next spa_t in the system, or the
+ * first if NULL is passed.
+ *
+ * spa_evict_all() Shutdown and remove all spa_t structures in
+ * the system.
+ *
+ *
+ * The spa_refcount is manipulated using the following functions:
+ *
+ * spa_open_ref() Adds a reference to the given spa_t. Must be
+ * called with spa_namespace_lock held if the
+ * refcount is currently zero.
+ *
+ * spa_close() Remove a reference from the spa_t. This will
+ * not free the spa_t or remove it from the
+ * namespace. No locking is required.
+ *
+ * spa_refcount_zero() Returns true if the refcount is currently
+ * zero. Must be called with spa_namespace_lock
+ * held.
+ *
+ * The spa_config_lock is manipulated using the following functions:
+ *
+ * spa_config_enter() Acquire the config lock as RW_READER or
+ * RW_WRITER. At least one reference on the spa_t
+ * must exist.
+ *
+ * spa_config_exit() Release the config lock.
+ *
+ * spa_config_held() Returns true if the config lock is currently
+ * held in the given state.
+ *
+ * The spa_vdev_lock, while acquired directly, is hidden by the following
+ * functions, which imply additional semantics that must be followed:
+ *
+ * spa_vdev_enter() Acquire the vdev lock and the config lock for
+ * writing.
+ *
+ * spa_vdev_exit() Release the config lock, wait for all I/O
+ * to complete, release the vdev lock, and sync
+ * the updated configs to the cache.
+ *
+ * The spa_name() function also requires either the spa_namespace_lock
+ * or the spa_config_lock, as both are needed to do a rename. spa_rename() is
+ * also implemented within this file since it requires manipulation of the
+ * namespace.
+ */
+
+static avl_tree_t spa_namespace_avl;
+kmutex_t spa_namespace_lock;
+static kcondvar_t spa_namespace_cv;
+
+kmem_cache_t *spa_buffer_pool;
+int spa_mode;
+
+#ifdef ZFS_DEBUG
+int zfs_flags = ~0;
+#else
+int zfs_flags = 0;
+#endif
+
+static kmutex_t spa_vdev_lock;
+
+#define SPA_MINREF 5 /* spa_refcnt for an open-but-idle pool */
+
+/*
+ * ==========================================================================
+ * SPA namespace functions
+ * ==========================================================================
+ */
+
+/*
+ * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held.
+ * Returns NULL if no matching spa_t is found.
+ */
+spa_t *
+spa_lookup(const char *name)
+{
+ spa_t search, *spa;
+ avl_index_t where;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ /* Build a stack search key; avl_compare only looks at spa_name. */
+ search.spa_name = (char *)name;
+ spa = avl_find(&spa_namespace_avl, &search, &where);
+
+ return (spa);
+}
+
+/*
+ * Create an uninitialized spa_t with the given name. Requires
+ * spa_namespace_lock. The caller must ensure that the spa_t doesn't already
+ * exist by calling spa_lookup() first.
+ */
+spa_t *
+spa_add(const char *name)
+{
+ spa_t *spa;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
+
+ spa->spa_name = spa_strdup(name);
+ spa->spa_state = POOL_STATE_UNINITIALIZED;
+ /* UINT64_MAX means "not frozen"; see spa_freeze(). */
+ spa->spa_freeze_txg = UINT64_MAX;
+
+ refcount_create(&spa->spa_refcount);
+
+ avl_add(&spa_namespace_avl, spa);
+
+ return (spa);
+}
+
+/*
+ * Removes a spa_t from the namespace, freeing up any memory used. Requires
+ * spa_namespace_lock. This is called only after the spa_t has been closed and
+ * deactivated.
+ */
+void
+spa_remove(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+ ASSERT(spa->spa_scrub_thread == NULL);
+
+ avl_remove(&spa_namespace_avl, spa);
+ /* Wake anyone waiting on spa_namespace_cv for this name to go away. */
+ cv_broadcast(&spa_namespace_cv);
+
+ if (spa->spa_root)
+ spa_strfree(spa->spa_root);
+
+ if (spa->spa_name)
+ spa_strfree(spa->spa_name);
+
+ /* Frees the cached config nvlist, if any. */
+ spa_config_set(spa, NULL);
+
+ refcount_destroy(&spa->spa_refcount);
+
+ kmem_free(spa, sizeof (spa_t));
+}
+
+/*
+ * Given a pool, return the next pool in the namespace, or NULL if there is
+ * none. If 'prev' is NULL, return the first pool.
+ */
+spa_t *
+spa_next(spa_t *prev)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ /* NULL 'prev' restarts the walk from the first pool. */
+ if (prev)
+ return (AVL_NEXT(&spa_namespace_avl, prev));
+ else
+ return (avl_first(&spa_namespace_avl));
+}
+
+/*
+ * ==========================================================================
+ * SPA refcount functions
+ * ==========================================================================
+ */
+
+/*
+ * Add a reference to the given spa_t. Must have at least one reference, or
+ * have the namespace lock held.
+ */
+void
+spa_open_ref(spa_t *spa, void *tag)
+{
+ /* Going from "zero" (== SPA_MINREF) requires the namespace lock. */
+ ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF ||
+ MUTEX_HELD(&spa_namespace_lock));
+
+ (void) refcount_add(&spa->spa_refcount, tag);
+}
+
+/*
+ * Remove a reference to the given spa_t. Must have at least one reference, or
+ * have the namespace lock held.
+ */
+void
+spa_close(spa_t *spa, void *tag)
+{
+ /* Dropping to "zero" (== SPA_MINREF) requires the namespace lock. */
+ ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF ||
+ MUTEX_HELD(&spa_namespace_lock));
+
+ (void) refcount_remove(&spa->spa_refcount, tag);
+}
+
+/*
+ * Check to see if the spa refcount is zero. Must be called with
+ * spa_namespace_lock held. We really compare against SPA_MINREF, which is the
+ * number of references acquired when opening a pool
+ */
+boolean_t
+spa_refcount_zero(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ /* SPA_MINREF internal references are the logical "zero". */
+ return (refcount_count(&spa->spa_refcount) == SPA_MINREF);
+}
+
+/*
+ * ==========================================================================
+ * SPA config locking
+ * ==========================================================================
+ */
+
+/*
+ * Acquire the config lock. The config lock is a special rwlock that allows for
+ * recursive enters. Because these enters come from the same thread as well as
+ * asynchronous threads working on behalf of the owner, we must unilaterally
+ * allow read access as long as at least one reader is held (even if a write
+ * is requested). This has the side effect of write starvation, but write locks
+ * are extremely rare, and a solution to this problem would be significantly
+ * more complex (if even possible).
+ *
+ * We would like to assert that the namespace lock isn't held, but this is a
+ * valid use during create.
+ */
+void
+spa_config_enter(spa_t *spa, krw_t rw)
+{
+ spa_config_lock_t *scl = &spa->spa_config_lock;
+
+ mutex_enter(&scl->scl_lock);
+
+ /* The current writer may recursively re-enter without blocking. */
+ if (scl->scl_writer != curthread) {
+ if (rw == RW_READER) {
+ /* Readers wait only for an active writer. */
+ while (scl->scl_writer != NULL)
+ cv_wait(&scl->scl_cv, &scl->scl_lock);
+ } else {
+ /* Writers wait for everyone, then claim ownership. */
+ while (scl->scl_writer != NULL || scl->scl_count > 0)
+ cv_wait(&scl->scl_cv, &scl->scl_lock);
+ scl->scl_writer = curthread;
+ }
+ }
+
+ /* scl_count counts all holds -- readers and writer recursion alike. */
+ scl->scl_count++;
+
+ mutex_exit(&scl->scl_lock);
+}
+
+/*
+ * Release the spa config lock, notifying any waiters in the process.
+ */
+void
+spa_config_exit(spa_t *spa)
+{
+ spa_config_lock_t *scl = &spa->spa_config_lock;
+
+ mutex_enter(&scl->scl_lock);
+
+ ASSERT(scl->scl_count > 0);
+ /* Last hold out: clear writer ownership and wake all waiters. */
+ if (--scl->scl_count == 0) {
+ cv_broadcast(&scl->scl_cv);
+ scl->scl_writer = NULL; /* OK in either case */
+ }
+
+ mutex_exit(&scl->scl_lock);
+}
+
+/*
+ * Returns true if the config lock is held in the given manner.
+ */
+boolean_t
+spa_config_held(spa_t *spa, krw_t rw)
+{
+ spa_config_lock_t *scl = &spa->spa_config_lock;
+ boolean_t held;
+
+ mutex_enter(&scl->scl_lock);
+ if (rw == RW_WRITER)
+ held = (scl->scl_writer == curthread);
+ else
+ /* Reader check is "held by anyone", not "held by curthread". */
+ held = (scl->scl_count != 0);
+ mutex_exit(&scl->scl_lock);
+
+ return (held);
+}
+
+/*
+ * ==========================================================================
+ * SPA vdev locking
+ * ==========================================================================
+ */
+
+/*
+ * Lock the given spa_t for the purpose of adding or removing a vdev. This
+ * grabs the global spa_vdev_lock as well as the spa config lock for writing.
+ * It returns the next transaction group for the spa_t.
+ */
+uint64_t
+spa_vdev_enter(spa_t *spa)
+{
+ /* Lock order: spa_vdev_lock before spa_config_lock (see above). */
+ mutex_enter(&spa_vdev_lock);
+
+ spa_config_enter(spa, RW_WRITER);
+
+ /* The config change will commit in the next txg. */
+ return (spa_last_synced_txg(spa) + 1);
+}
+
+/*
+ * Unlock the spa_t after adding or removing a vdev. Besides undoing the
+ * locking of spa_vdev_enter(), we also want to make sure the transactions have
+ * synced to disk, and then update the global configuration cache with the new
+ * information.
+ */
+int
+spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
+{
+ /* Recompute dirty-time-log state before publishing the new config. */
+ vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
+
+ spa_config_exit(spa);
+
+ if (vd == spa->spa_root_vdev) { /* spa_create() */
+ mutex_exit(&spa_vdev_lock);
+ return (error);
+ }
+
+ /*
+ * Note: this txg_wait_synced() is important because it ensures
+ * that there won't be more than one config change per txg.
+ * This allows us to use the txg as the generation number.
+ */
+ if (error == 0)
+ txg_wait_synced(spa->spa_dsl_pool, txg);
+
+ mutex_exit(&spa_vdev_lock);
+
+ /* 'vd' is a vdev the caller wants torn down (e.g. a failed add). */
+ if (vd != NULL) {
+ ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0);
+ vdev_free(vd);
+ }
+
+ /*
+ * If we're in the middle of export or destroy, don't sync the
+ * config -- it will do that anyway, and we deadlock if we try.
+ */
+ if (error == 0 && spa->spa_state == POOL_STATE_ACTIVE) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_sync();
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ return (error);
+}
+
+/*
+ * ==========================================================================
+ * Miscellaneous functions
+ * ==========================================================================
+ */
+
+/*
+ * Rename a spa_t.
+ */
+int
+spa_rename(const char *name, const char *newname)
+{
+ spa_t *spa;
+ int err;
+
+ /*
+ * Lookup the spa_t and grab the config lock for writing. We need to
+ * actually open the pool so that we can sync out the necessary labels.
+ * It's OK to call spa_open() with the namespace lock held because we
+ * allow recursive calls for other reasons.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if ((err = spa_open(name, &spa, FTAG)) != 0) {
+ mutex_exit(&spa_namespace_lock);
+ return (err);
+ }
+
+ spa_config_enter(spa, RW_WRITER);
+
+ /* Re-key the AVL entry: the tree is sorted by spa_name. */
+ avl_remove(&spa_namespace_avl, spa);
+ spa_strfree(spa->spa_name);
+ spa->spa_name = spa_strdup(newname);
+ avl_add(&spa_namespace_avl, spa);
+
+ /*
+ * Sync all labels to disk with the new names by marking the root vdev
+ * dirty and waiting for it to sync. It will pick up the new pool name
+ * during the sync.
+ */
+ vdev_config_dirty(spa->spa_root_vdev);
+
+ spa_config_exit(spa);
+
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+
+ /*
+ * Sync the updated config cache.
+ */
+ spa_config_set(spa,
+ spa_config_generate(spa, NULL, spa_last_synced_txg(spa), 0));
+ spa_config_sync();
+
+ spa_close(spa, FTAG);
+
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+
+/*
+ * Determine whether a pool with given pool_guid exists. If device_guid is
+ * non-zero, determine whether the pool exists *and* contains a device with the
+ * specified device_guid.
+ */
+boolean_t
+spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
+{
+ spa_t *spa;
+ avl_tree_t *t = &spa_namespace_avl;
+ boolean_t locked = B_FALSE;
+
+ /* Take the namespace lock only if the caller doesn't already hold it. */
+ if (mutex_owner(&spa_namespace_lock) != curthread) {
+ mutex_enter(&spa_namespace_lock);
+ locked = B_TRUE;
+ }
+
+ for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED)
+ continue;
+ /* spa_guid() dereferences spa_root_vdev, so skip if unset. */
+ if (spa->spa_root_vdev == NULL)
+ continue;
+ if (spa_guid(spa) == pool_guid && (device_guid == 0 ||
+ vdev_lookup_by_guid(spa->spa_root_vdev, device_guid)))
+ break;
+ }
+
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+
+ /* Loop breaks with spa != NULL exactly when a match was found. */
+ return (spa != NULL);
+}
+
+/*
+ * Duplicate 's' into kmem. Pair with spa_strfree(), which recomputes the
+ * length -- hence the guaranteed NUL terminator here.
+ */
+char *
+spa_strdup(const char *s)
+{
+ size_t len;
+ char *new;
+
+ len = strlen(s);
+ new = kmem_alloc(len + 1, KM_SLEEP);
+ bcopy(s, new, len);
+ new[len] = '\0';
+
+ return (new);
+}
+
+/* Free a string allocated by spa_strdup(). */
+void
+spa_strfree(char *s)
+{
+ kmem_free(s, strlen(s) + 1);
+}
+
+/*
+ * Return a pseudo-random value in [0, range). Modulo bias is negligible
+ * for the small ranges this is used with.
+ */
+uint64_t
+spa_get_random(uint64_t range)
+{
+ uint64_t r;
+
+ ASSERT(range != 0);
+
+ (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t));
+
+ return (r % range);
+}
+
+/*
+ * Format a block pointer into 'buf' for debugging output. 'buf' is written
+ * unbounded via sprintf(), so callers must supply a generously sized buffer.
+ */
+void
+sprintf_blkptr(char *buf, blkptr_t *bp)
+{
+ dva_t *dva;
+
+ /*
+ * Check bp before touching it: the original computed BP_IDENTITY(bp)
+ * first, evaluating a member address through a null pointer when
+ * bp == NULL (undefined behavior).
+ */
+ if (bp == NULL) {
+ (void) sprintf(buf, "<NULL>");
+ return;
+ }
+
+ if (BP_IS_HOLE(bp)) {
+ (void) sprintf(buf, "<hole>");
+ return;
+ }
+
+ /* XXBP - Need to see if we want all DVAs or not */
+ dva = BP_IDENTITY(bp);
+
+ (void) sprintf(buf, "[L%llu %s] vdev=%llu offset=%llx "
+ "size=%llxL/%llxP/%llxA %s %s %s %s",
+ (u_longlong_t)BP_GET_LEVEL(bp),
+ dmu_ot[BP_GET_TYPE(bp)].ot_name,
+ (u_longlong_t)DVA_GET_VDEV(dva),
+ (u_longlong_t)DVA_GET_OFFSET(dva),
+ (u_longlong_t)BP_GET_LSIZE(bp),
+ (u_longlong_t)BP_GET_PSIZE(bp),
+ (u_longlong_t)DVA_GET_ASIZE(dva),
+ zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name,
+ zio_compress_table[BP_GET_COMPRESS(bp)].ci_name,
+ BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",
+ DVA_GET_GANG(dva) == 0 ? "contiguous" : "gang");
+
+ (void) sprintf(buf + strlen(buf), " birth=%llu fill=%llu"
+ " cksum=%llx:%llx:%llx:%llx",
+ (u_longlong_t)bp->blk_birth,
+ (u_longlong_t)bp->blk_fill,
+ (u_longlong_t)bp->blk_cksum.zc_word[0],
+ (u_longlong_t)bp->blk_cksum.zc_word[1],
+ (u_longlong_t)bp->blk_cksum.zc_word[2],
+ (u_longlong_t)bp->blk_cksum.zc_word[3]);
+}
+
+/*
+ * Freeze the pool: pick a freeze txg a full TXG_SIZE ahead of the last
+ * synced txg and wait until it syncs. Idempotent -- a second call sees
+ * spa_freeze_txg already set and does nothing.
+ */
+void
+spa_freeze(spa_t *spa)
+{
+ uint64_t freeze_txg = 0;
+
+ spa_config_enter(spa, RW_WRITER);
+ if (spa->spa_freeze_txg == UINT64_MAX) {
+ freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
+ spa->spa_freeze_txg = freeze_txg;
+ }
+ spa_config_exit(spa);
+ /* Wait outside the config lock; txg sync needs reader access. */
+ if (freeze_txg != 0)
+ txg_wait_synced(spa_get_dsl(spa), freeze_txg);
+}
+
+/*
+ * ==========================================================================
+ * Accessor functions
+ * ==========================================================================
+ */
+
+/* Return the pool's traverse lock (taken by callers, not here). */
+krwlock_t *
+spa_traverse_rwlock(spa_t *spa)
+{
+ return (&spa->spa_traverse_lock);
+}
+
+/* Nonzero when a traverse is waiting for the lock. */
+int
+spa_traverse_wanted(spa_t *spa)
+{
+ return (spa->spa_traverse_wanted);
+}
+
+/* Return the pool's DSL pool. */
+dsl_pool_t *
+spa_get_dsl(spa_t *spa)
+{
+ return (spa->spa_dsl_pool);
+}
+
+/* Root blkptr of the last-synced uberblock (spa_ubsync). */
+blkptr_t *
+spa_get_rootblkptr(spa_t *spa)
+{
+ return (&spa->spa_ubsync.ub_rootbp);
+}
+
+/*
+ * Set the root blkptr of the in-flight uberblock (spa_uberblock) --
+ * note the asymmetry with spa_get_rootblkptr(), which reads spa_ubsync.
+ */
+void
+spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
+{
+ spa->spa_uberblock.ub_rootbp = *bp;
+}
+
+/*
+ * Copy the pool's alternate root path into 'buf' ("" if none).
+ * NOTE(review): strncpy leaves buf unterminated when spa_root is
+ * buflen bytes or longer -- presumably callers size buf at MAXPATHLEN;
+ * confirm.
+ */
+void
+spa_altroot(spa_t *spa, char *buf, size_t buflen)
+{
+ if (spa->spa_root == NULL)
+ buf[0] = '\0';
+ else
+ (void) strncpy(buf, spa->spa_root, buflen);
+}
+
+/* Current sync pass within the active txg. */
+int
+spa_sync_pass(spa_t *spa)
+{
+ return (spa->spa_sync_pass);
+}
+
+char *
+spa_name(spa_t *spa)
+{
+ /*
+ * Accessing the name requires holding either the namespace lock or the
+ * config lock, both of which are required to do a rename.
+ */
+ ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+ spa_config_held(spa, RW_READER) || spa_config_held(spa, RW_WRITER));
+
+ return (spa->spa_name);
+}
+
+/* The pool GUID is the root vdev's GUID. */
+uint64_t
+spa_guid(spa_t *spa)
+{
+ return (spa->spa_root_vdev->vdev_guid);
+}
+
+/* Txg of the last-synced uberblock. */
+uint64_t
+spa_last_synced_txg(spa_t *spa)
+{
+ return (spa->spa_ubsync.ub_txg);
+}
+
+/* First txg of this pool's current activation. */
+uint64_t
+spa_first_txg(spa_t *spa)
+{
+ return (spa->spa_first_txg);
+}
+
+/* Current POOL_STATE_* value. */
+int
+spa_state(spa_t *spa)
+{
+ return (spa->spa_state);
+}
+
+/* Freeze txg, or UINT64_MAX if the pool is not frozen. */
+uint64_t
+spa_freeze_txg(spa_t *spa)
+{
+ return (spa->spa_freeze_txg);
+}
+
+/*
+ * In the future, this may select among different metaslab classes
+ * depending on the zdp. For now, there's no such distinction.
+ */
+metaslab_class_t *
+spa_metaslab_class_select(spa_t *spa)
+{
+ return (spa->spa_normal_class);
+}
+
+/*
+ * Return pool-wide allocated space.
+ */
+uint64_t
+spa_get_alloc(spa_t *spa)
+{
+ return (spa->spa_root_vdev->vdev_stat.vs_alloc);
+}
+
+/*
+ * Return pool-wide total space (comment was a copy-paste of the
+ * "allocated" one above; vs_space is the total).
+ */
+uint64_t
+spa_get_space(spa_t *spa)
+{
+ return (spa->spa_root_vdev->vdev_stat.vs_space);
+}
+
+/* ARGSUSED */
+uint64_t
+spa_get_asize(spa_t *spa, uint64_t lsize)
+{
+ /*
+ * For now, the worst case is 512-byte RAID-Z blocks, in which
+ * case the space requirement is exactly 2x; so just assume that.
+ */
+ return (lsize << 1);
+}
+
+/*
+ * ==========================================================================
+ * Initialization and Termination
+ * ==========================================================================
+ */
+
+/*
+ * AVL comparator for the pool namespace: ordered by pool name, with the
+ * strcmp() result normalized to {-1, 0, 1} as avl_create() requires.
+ */
+static int
+spa_name_compare(const void *a1, const void *a2)
+{
+ const spa_t *s1 = a1;
+ const spa_t *s2 = a2;
+ int s;
+
+ s = strcmp(s1->spa_name, s2->spa_name);
+ if (s > 0)
+ return (1);
+ if (s < 0)
+ return (-1);
+ return (0);
+}
+
+/*
+ * Module initialization: set up the namespace, bring up the lower layers
+ * in dependency order, then populate the namespace from the config cache.
+ * 'mode' records whether devices are opened read-only or read-write.
+ */
+void
+spa_init(int mode)
+{
+ mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
+
+ avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
+ offsetof(spa_t, spa_avl));
+
+ spa_mode = mode;
+
+ refcount_init();
+ unique_init();
+ zio_init();
+ dmu_init();
+ zil_init();
+ /* Last: reads the on-disk cache file and fills the namespace. */
+ spa_config_load();
+}
+
+/* Module teardown: evict all pools, then unwind spa_init() in reverse. */
+void
+spa_fini(void)
+{
+ spa_evict_all();
+
+ zil_fini();
+ dmu_fini();
+ zio_fini();
+ refcount_fini();
+
+ avl_destroy(&spa_namespace_avl);
+
+ cv_destroy(&spa_namespace_cv);
+ mutex_destroy(&spa_namespace_lock);
+}
diff --git a/usr/src/uts/common/fs/zfs/space_map.c b/usr/src/uts/common/fs/zfs/space_map.c
new file mode 100644
index 0000000000..25f66bf94b
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/space_map.c
@@ -0,0 +1,406 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/space_map.h>
+
+/*
+ * Space map routines.
+ * NOTE: caller is responsible for all locking.
+ */
+/*
+ * AVL comparator for space segments, ordered by starting offset.
+ * Any two segments that overlap compare equal, so an avl_find() with
+ * a search segment locates a resident segment that intersects it;
+ * space_map_add()/remove()/contains() all rely on this to detect
+ * overlap in a single lookup.
+ */
+static int
+space_map_seg_compare(const void *x1, const void *x2)
+{
+ const space_seg_t *s1 = x1;
+ const space_seg_t *s2 = x2;
+
+ if (s1->ss_start < s2->ss_start) {
+ if (s1->ss_end > s2->ss_start)
+ return (0);
+ return (-1);
+ }
+ if (s1->ss_start > s2->ss_start) {
+ if (s1->ss_start < s2->ss_end)
+ return (0);
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * Initialize an in-core space map covering [start, start + size).
+ * 'shift' is the log2 of the allocation quantum: all offsets and
+ * sizes handed to this map must be multiples of (1ULL << shift).
+ * 'lp' is the caller-supplied lock that protects the map; the
+ * mutators below assert it is held.
+ */
+void
+space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint64_t shift,
+ kmutex_t *lp)
+{
+ avl_create(&sm->sm_root, space_map_seg_compare,
+ sizeof (space_seg_t), offsetof(struct space_seg, ss_node));
+ sm->sm_start = start;
+ sm->sm_end = start + size;
+ sm->sm_size = size;
+ sm->sm_shift = shift;
+ sm->sm_space = 0;
+ sm->sm_lock = lp;
+}
+
+/*
+ * Destroy a space map. The map must already be empty (sm_space == 0);
+ * callers drain it first (e.g. via space_map_vacate()).
+ */
+void
+space_map_destroy(space_map_t *sm)
+{
+ VERIFY3U(sm->sm_space, ==, 0);
+ avl_destroy(&sm->sm_root);
+}
+
+/*
+ * Add the segment [start, start + size) to the map. The range must
+ * lie within the map, be quantum-aligned, and must not overlap any
+ * existing segment (VERIFY'd via the overlap-compares-equal trick in
+ * space_map_seg_compare()). Adjacent segments are coalesced.
+ */
+void
+space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ avl_index_t where;
+ space_seg_t ssearch, *ss_before, *ss_after, *ss;
+ uint64_t end = start + size;
+ int merge_before, merge_after;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+ VERIFY(size != 0);
+ VERIFY3U(start, >=, sm->sm_start);
+ VERIFY3U(end, <=, sm->sm_end);
+ VERIFY(sm->sm_space + size <= sm->sm_size);
+ VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
+ VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
+
+ ssearch.ss_start = start;
+ ssearch.ss_end = end;
+ ss = avl_find(&sm->sm_root, &ssearch, &where);
+
+ /* Make sure we don't overlap with either of our neighbors */
+ VERIFY(ss == NULL);
+
+ ss_before = avl_nearest(&sm->sm_root, where, AVL_BEFORE);
+ ss_after = avl_nearest(&sm->sm_root, where, AVL_AFTER);
+
+ merge_before = (ss_before != NULL && ss_before->ss_end == start);
+ merge_after = (ss_after != NULL && ss_after->ss_start == end);
+
+ if (merge_before && merge_after) {
+ /* New range exactly bridges two segments: fuse them into one. */
+ avl_remove(&sm->sm_root, ss_before);
+ ss_after->ss_start = ss_before->ss_start;
+ kmem_free(ss_before, sizeof (*ss_before));
+ } else if (merge_before) {
+ /* Extend the preceding segment forward. */
+ ss_before->ss_end = end;
+ } else if (merge_after) {
+ /* Extend the following segment backward. */
+ ss_after->ss_start = start;
+ } else {
+ /* No neighbors to coalesce with: insert a new segment. */
+ ss = kmem_alloc(sizeof (*ss), KM_SLEEP);
+ ss->ss_start = start;
+ ss->ss_end = end;
+ avl_insert(&sm->sm_root, ss, where);
+ }
+
+ sm->sm_space += size;
+}
+
+/*
+ * Remove the segment [start, start + size) from the map. The range
+ * must be quantum-aligned and must lie entirely within one existing
+ * segment (VERIFY'd below); removing from the middle splits that
+ * segment in two.
+ */
+void
+space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ avl_index_t where;
+ space_seg_t ssearch, *ss, *newseg;
+ uint64_t end = start + size;
+ int left_over, right_over;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+ VERIFY(size != 0);
+ VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
+ VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
+
+ ssearch.ss_start = start;
+ ssearch.ss_end = end;
+ ss = avl_find(&sm->sm_root, &ssearch, &where);
+
+ /* Make sure we completely overlap with someone */
+ VERIFY(ss != NULL);
+ VERIFY3U(ss->ss_start, <=, start);
+ VERIFY3U(ss->ss_end, >=, end);
+ VERIFY(sm->sm_space - size <= sm->sm_size);
+
+ left_over = (ss->ss_start != start);
+ right_over = (ss->ss_end != end);
+
+ if (left_over && right_over) {
+ /* Carving a hole in the middle: split into two segments. */
+ newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP);
+ newseg->ss_start = end;
+ newseg->ss_end = ss->ss_end;
+ ss->ss_end = start;
+ avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER);
+ } else if (left_over) {
+ /* Trimming the tail of the segment. */
+ ss->ss_end = start;
+ } else if (right_over) {
+ /* Trimming the head of the segment. */
+ ss->ss_start = end;
+ } else {
+ /* Exact match: drop the whole segment. */
+ avl_remove(&sm->sm_root, ss);
+ kmem_free(ss, sizeof (*ss));
+ }
+
+ sm->sm_space -= size;
+}
+
+/*
+ * Return nonzero iff [start, start + size) is entirely contained
+ * within a single segment of the map.
+ */
+int
+space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ avl_index_t where;
+ space_seg_t ssearch, *ss;
+ uint64_t end = start + size;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+ VERIFY(size != 0);
+ VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
+ VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
+
+ ssearch.ss_start = start;
+ ssearch.ss_end = end;
+ ss = avl_find(&sm->sm_root, &ssearch, &where);
+
+ return (ss != NULL && ss->ss_start <= start && ss->ss_end >= end);
+}
+
+/*
+ * Empty the map, freeing all segments. If 'func' is non-NULL, apply
+ * it to each (start, size) pair with 'mdest' as its map argument
+ * before the segment is freed (e.g. space_map_merge() passes
+ * space_map_add to transfer the contents into another map).
+ */
+void
+space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
+{
+ space_seg_t *ss;
+ void *cookie = NULL;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+
+ while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
+ if (func != NULL)
+ func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
+ kmem_free(ss, sizeof (*ss));
+ }
+ sm->sm_space = 0;
+}
+
+/*
+ * Apply 'func' to each (start, size) segment, in offset order,
+ * without modifying the map. Unlike the mutators above this does not
+ * assert sm_lock, but the caller is still responsible for locking
+ * (see the NOTE at the top of this file).
+ */
+void
+space_map_iterate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
+{
+ space_seg_t *ss;
+
+ for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
+ func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
+}
+
+/*
+ * Move the entire contents of 'src' into 'dest', leaving 'src' empty.
+ * Any overlap with segments already in 'dest' trips the VERIFY in
+ * space_map_add(); use space_map_union() when overlap is possible.
+ */
+void
+space_map_merge(space_map_t *src, space_map_t *dest)
+{
+ space_map_vacate(src, space_map_add, dest);
+}
+
+/*
+ * Remove any portions of existing segments that intersect
+ * [start, start + size). Segments straddling either edge are trimmed
+ * rather than removed whole; segments entirely outside the range are
+ * untouched.
+ */
+void
+space_map_excise(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ avl_tree_t *t = &sm->sm_root;
+ avl_index_t where;
+ space_seg_t *ss, search;
+ uint64_t end = start + size;
+ uint64_t rm_start, rm_end;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+
+ search.ss_start = start;
+ search.ss_end = start;
+
+ for (;;) {
+ /*
+ * Find a segment overlapping 'start'; failing that, the
+ * first segment after it. Stop once we're past 'end'.
+ */
+ ss = avl_find(t, &search, &where);
+
+ if (ss == NULL)
+ ss = avl_nearest(t, where, AVL_AFTER);
+
+ if (ss == NULL || ss->ss_start >= end)
+ break;
+
+ /* Clip the removal to the intersection with [start, end). */
+ rm_start = MAX(ss->ss_start, start);
+ rm_end = MIN(ss->ss_end, end);
+
+ space_map_remove(sm, rm_start, rm_end - rm_start);
+ }
+}
+
+/*
+ * Replace smd with the union of smd and sms.
+ * Excising each source range from the destination first guarantees
+ * the subsequent space_map_add() never sees an overlap.
+ */
+void
+space_map_union(space_map_t *smd, space_map_t *sms)
+{
+ avl_tree_t *t = &sms->sm_root;
+ space_seg_t *ss;
+
+ ASSERT(MUTEX_HELD(smd->sm_lock));
+
+ /*
+ * For each source segment, remove any intersections with the
+ * destination, then add the source segment to the destination.
+ */
+ for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) {
+ space_map_excise(smd, ss->ss_start, ss->ss_end - ss->ss_start);
+ space_map_add(smd, ss->ss_start, ss->ss_end - ss->ss_start);
+ }
+}
+
+/*
+ * Replay an on-disk space map object into the (empty) in-core map.
+ * 'maptype' selects the view: entries of that type are added, entries
+ * of the other type are removed. For the SM_FREE view we seed the map
+ * with the entire range and let the replay punch out and restore
+ * segments; in that case the caller passes allocated space in 'space'
+ * and the expected free space is sm_size - space. 'end' is the size
+ * of the object in bytes; reads are chunked to SPACE_MAP_CHUNKSIZE.
+ * Always returns 0 (dmu_read() here exposes no error to propagate).
+ */
+int
+space_map_load(space_map_t *sm, space_map_obj_t *smo, uint8_t maptype,
+ objset_t *os, uint64_t end, uint64_t space)
+{
+ uint64_t *entry, *entry_map, *entry_map_end;
+ uint64_t bufsize, size, offset;
+ uint64_t mapstart = sm->sm_start;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+ VERIFY3U(sm->sm_space, ==, 0);
+
+ bufsize = MIN(end, SPACE_MAP_CHUNKSIZE);
+ entry_map = kmem_alloc(bufsize, KM_SLEEP);
+
+ if (maptype == SM_FREE) {
+ space_map_add(sm, sm->sm_start, sm->sm_size);
+ space = sm->sm_size - space;
+ }
+
+ for (offset = 0; offset < end; offset += bufsize) {
+ size = MIN(end - offset, bufsize);
+ VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
+ VERIFY(size != 0);
+
+ dprintf("object=%llu offset=%llx size=%llx\n",
+ smo->smo_object, offset, size);
+ dmu_read(os, smo->smo_object, offset, size, entry_map);
+
+ entry_map_end = entry_map + (size / sizeof (uint64_t));
+ for (entry = entry_map; entry < entry_map_end; entry++) {
+ uint64_t e = *entry;
+
+ if (SM_DEBUG_DECODE(e)) /* Skip debug entries */
+ continue;
+
+ /*
+ * Decode the <offset, run> pair (both stored in
+ * sm_shift units, offset relative to sm_start) and
+ * apply it: matching type adds, opposite removes.
+ */
+ (SM_TYPE_DECODE(e) == maptype ?
+ space_map_add : space_map_remove)(sm,
+ (SM_OFFSET_DECODE(e) << sm->sm_shift) + mapstart,
+ SM_RUN_DECODE(e) << sm->sm_shift);
+ }
+ }
+ /* The replayed map must account for exactly the expected space. */
+ VERIFY3U(sm->sm_space, ==, space);
+
+ kmem_free(entry_map, bufsize);
+
+ return (0);
+}
+
+/*
+ * Drain the in-core map to disk: convert every segment into
+ * <offset, run, type> entries (run lengths capped at SM_RUN_MAX) and
+ * append them to the space map object, preceded by a debug entry
+ * recording the action, sync pass, and txg. Entries are staged in a
+ * buffer that is flushed with dmu_write() whenever it fills. If
+ * 'dest' is non-NULL, each segment is also added to that map as it
+ * is consumed. The in-core map is empty on return.
+ */
+void
+space_map_sync(space_map_t *sm, space_map_t *dest, space_map_obj_t *smo,
+ uint8_t maptype, objset_t *os, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_objset_spa(os);
+ void *cookie = NULL;
+ space_seg_t *ss;
+ uint64_t bufsize, start, size, run_len;
+ uint64_t *entry, *entry_map, *entry_map_end;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+
+ if (sm->sm_space == 0)
+ return;
+
+ dprintf("object %4llu, txg %llu, pass %d, %c, count %lu, space %llx\n",
+ smo->smo_object, dmu_tx_get_txg(tx), spa_sync_pass(spa),
+ maptype == SM_ALLOC ? 'A' : 'F', avl_numnodes(&sm->sm_root),
+ sm->sm_space);
+
+ /*
+ * Size the staging buffer for the common case (one entry per
+ * segment plus slack), bounded by SPACE_MAP_CHUNKSIZE.
+ */
+ bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t);
+ bufsize = MIN(bufsize, SPACE_MAP_CHUNKSIZE);
+ entry_map = kmem_alloc(bufsize, KM_SLEEP);
+ entry_map_end = entry_map + (bufsize / sizeof (uint64_t));
+ entry = entry_map;
+
+ *entry++ = SM_DEBUG_ENCODE(1) |
+ SM_DEBUG_ACTION_ENCODE(maptype) |
+ SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) |
+ SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
+
+ while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
+ size = ss->ss_end - ss->ss_start;
+ /* On-disk offsets are relative to sm_start, in shift units. */
+ start = (ss->ss_start - sm->sm_start) >> sm->sm_shift;
+
+ if (dest)
+ space_map_add(dest, ss->ss_start, size);
+
+ sm->sm_space -= size;
+ size >>= sm->sm_shift;
+
+ while (size) {
+ run_len = MIN(size, SM_RUN_MAX);
+
+ /* Flush the staging buffer when it fills up. */
+ if (entry == entry_map_end) {
+ dmu_write(os, smo->smo_object, smo->smo_objsize,
+ bufsize, entry_map, tx);
+ smo->smo_objsize += bufsize;
+ entry = entry_map;
+ }
+
+ *entry++ = SM_OFFSET_ENCODE(start) |
+ SM_TYPE_ENCODE(maptype) |
+ SM_RUN_ENCODE(run_len);
+
+ start += run_len;
+ size -= run_len;
+ }
+ kmem_free(ss, sizeof (*ss));
+ }
+
+ /* Write out whatever remains in the staging buffer. */
+ if (entry != entry_map) {
+ size = (entry - entry_map) * sizeof (uint64_t);
+ dmu_write(os, smo->smo_object, smo->smo_objsize,
+ size, entry_map, tx);
+ smo->smo_objsize += size;
+ }
+
+ kmem_free(entry_map, bufsize);
+
+ VERIFY3U(sm->sm_space, ==, 0);
+}
+
+/*
+ * Rewrite the space map object from scratch (condensing the log):
+ * free the old on-disk contents, then sync the in-core map as pure
+ * ALLOC entries. The in-core map must account for exactly the
+ * object's recorded allocated space.
+ */
+void
+space_map_write(space_map_t *sm, space_map_obj_t *smo, objset_t *os,
+ dmu_tx_t *tx)
+{
+ uint64_t oldsize = smo->smo_objsize;
+
+ dmu_free_range(os, smo->smo_object, 0, smo->smo_objsize, tx);
+
+ smo->smo_objsize = 0;
+
+ VERIFY3U(sm->sm_space, ==, smo->smo_alloc);
+ space_map_sync(sm, NULL, smo, SM_ALLOC, os, tx);
+
+ dprintf("write sm object %llu from %llu to %llu bytes in txg %llu\n",
+ smo->smo_object, oldsize, smo->smo_objsize, dmu_tx_get_txg(tx));
+}
diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h
new file mode 100644
index 0000000000..b11cd42b6d
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h
@@ -0,0 +1,90 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ARC_H
+#define _SYS_ARC_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zio.h>
+
+typedef struct arc_buf_hdr arc_buf_hdr_t;
+typedef struct arc_buf arc_buf_t;
+typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private);
+typedef void arc_byteswap_func_t(void *buf, size_t size);
+
+/* generic arc_done_func_t's which you can use */
+arc_done_func_t arc_bcopy_func;
+arc_done_func_t arc_getbuf_func;
+
+/*
+ * An ARC buffer handle as seen by consumers; the header it points to
+ * is opaque here (arc_buf_hdr_t is declared but defined in arc.c).
+ */
+struct arc_buf {
+ arc_buf_hdr_t *b_hdr; /* backing header (internal to the arc) */
+ arc_buf_t *b_next; /* NOTE(review): presumably links bufs
+ sharing one hdr -- confirm in arc.c */
+ void *b_data; /* the buffer's data */
+};
+
+/*
+ * These are the flags we pass into calls to the arc
+ */
+#define ARC_WAIT (1 << 1) /* perform I/O synchronously */
+#define ARC_NOWAIT (1 << 2) /* perform I/O asynchronously */
+#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */
+
+arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag);
+void arc_buf_free(arc_buf_t *buf, void *tag);
+int arc_buf_size(arc_buf_t *buf);
+void arc_release(arc_buf_t *buf, void *tag);
+int arc_released(arc_buf_t *buf);
+
+int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
+ arc_done_func_t *done, void *private, int priority, int flags,
+ uint32_t arc_flags);
+int arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+ uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
+ arc_done_func_t *done, void *private, int priority, int flags,
+ uint32_t arc_flags);
+int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ zio_done_func_t *done, void *private, uint32_t arc_flags);
+int arc_tryread(spa_t *spa, blkptr_t *bp, void *data);
+
+void arc_flush(void);
+void arc_tempreserve_clear(uint64_t tempreserve);
+int arc_tempreserve_space(uint64_t tempreserve);
+
+void arc_init(void);
+void arc_fini(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ARC_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/bplist.h b/usr/src/uts/common/fs/zfs/sys/bplist.h
new file mode 100644
index 0000000000..0933cb977b
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/bplist.h
@@ -0,0 +1,83 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_BPLIST_H
+#define _SYS_BPLIST_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct bplist_phys {
+ /*
+ * This is the bonus buffer for the dead lists. The object's
+ * contents is an array of bpl_entries blkptr_t's, representing
+ * a total of bpl_bytes physical space.
+ */
+ uint64_t bpl_entries;
+ uint64_t bpl_bytes;
+} bplist_phys_t;
+
+typedef struct bplist_q {
+ blkptr_t bpq_blk;
+ void *bpq_next;
+} bplist_q_t;
+
+/*
+ * In-core state for an on-disk block pointer list (see bplist_phys_t
+ * above for the bonus-buffer layout).
+ */
+typedef struct bplist {
+ kmutex_t bpl_lock; /* protects the list state */
+ objset_t *bpl_mos; /* objset containing the object */
+ uint64_t bpl_object; /* the bplist's object number */
+ int bpl_blockshift; /* presumably log2 of block size -- confirm */
+ int bpl_bpshift; /* presumably log2 of bps per block -- confirm */
+ bplist_q_t *bpl_queue; /* deferred entries (bplist_enqueue_deferred) */
+ bplist_phys_t *bpl_phys; /* -> bonus buffer contents */
+ dmu_buf_t *bpl_dbuf; /* hold on the bonus buffer */
+ dmu_buf_t *bpl_cached_dbuf; /* NOTE(review): looks like an MRU data
+ block cache -- confirm in bplist.c */
+} bplist_t;
+
+extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx);
+extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx);
+extern void bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object);
+extern void bplist_close(bplist_t *bpl);
+extern boolean_t bplist_empty(bplist_t *bpl);
+extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp);
+extern void bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx);
+extern void bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp);
+extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx);
+extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BPLIST_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h
new file mode 100644
index 0000000000..3cf45f5985
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h
@@ -0,0 +1,302 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DBUF_H
+#define _SYS_DBUF_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/arc.h>
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define DB_BONUS_BLKID (-1ULL)
+#define IN_DMU_SYNC ((blkptr_t *)-1)
+
+/*
+ * define flags for dbuf_read and friends
+ */
+
+#define DB_RF_MUST_SUCCEED 0
+#define DB_RF_CANFAIL (1 << 1)
+#define DB_RF_HAVESTRUCT (1 << 2)
+#define DB_RF_NOPREFETCH (1 << 3)
+
+/*
+ * The state transition diagram for dbufs looks like:
+ *
+ * +----> READ ----+
+ * | |
+ * | V
+ * (alloc)-->UNCACHED CACHED-->(free)
+ * | ^
+ * | |
+ * +----> FILL ----+
+ */
+typedef enum dbuf_states {
+ DB_UNCACHED, /* no valid data (initial/terminal state) */
+ DB_FILL, /* being filled by the user -- see diagram above */
+ DB_READ, /* read in progress -- see diagram above */
+ DB_CACHED /* valid data present */
+} dbuf_states_t;
+
+struct objset_impl;
+struct dnode;
+struct dmu_tx;
+
+/*
+ * level = 0 means the user data
+ * level = 1 means the single indirect block
+ * etc.
+ */
+
+#define LIST_LINK_INACTIVE(link) \
+ ((link)->list_next == NULL && (link)->list_prev == NULL)
+
+typedef struct dmu_buf_impl {
+ /*
+ * The following members are immutable, with the exception of
+ * db.db_data, which is protected by db_mtx.
+ */
+
+ /* the publicly visible structure */
+ dmu_buf_t db;
+
+ /* the objset we belong to */
+ struct objset_impl *db_objset;
+
+ /*
+ * the dnode we belong to (NULL when evicted)
+ */
+ struct dnode *db_dnode;
+
+ /*
+ * our parent buffer; if the dnode points to us directly,
+ * db_parent == db_dnode->dn_dbuf
+ * only accessed by sync thread ???
+ * (NULL when evicted)
+ */
+ struct dmu_buf_impl *db_parent;
+
+ /*
+ * link for hash table of all dmu_buf_impl_t's
+ */
+ struct dmu_buf_impl *db_hash_next;
+
+ /* our block number */
+ uint64_t db_blkid;
+
+ /*
+ * Pointer to the blkptr_t which points to us. May be NULL if we
+ * don't have one yet. (NULL when evicted)
+ */
+ blkptr_t *db_blkptr;
+
+ /*
+ * Our indirection level. Data buffers have db_level==0.
+ * Indirect buffers which point to data buffers have
+ * db_level==1. etc. Buffers which contain dnodes have
+ * db_level==0, since the dnodes are stored in a file.
+ */
+ uint8_t db_level;
+
+ /* db_mtx protects the members below */
+ kmutex_t db_mtx;
+
+ /*
+ * Current state of the buffer
+ */
+ dbuf_states_t db_state;
+
+ /*
+ * Refcount accessed by dmu_buf_{hold,rele}.
+ * If nonzero, the buffer can't be destroyed.
+ * Protected by db_mtx.
+ */
+ refcount_t db_holds;
+
+ /* buffer holding our data */
+ arc_buf_t *db_buf;
+
+ /*
+ * NOTE(review): db_changed is presumably signaled on db_state
+ * transitions and db_data_pending holds data being synced --
+ * confirm against dbuf.c.
+ */
+ kcondvar_t db_changed;
+ arc_buf_t *db_data_pending;
+
+ /*
+ * Last time (transaction group) this buffer was dirtied.
+ */
+ uint64_t db_dirtied;
+
+ /*
+ * If dd_dnode != NULL, our link on the owner dnodes's dn_dbufs list.
+ * Protected by its dn_mtx.
+ */
+ list_node_t db_link;
+
+ /* Our link on dn_dirty_dbufs[txg] */
+ list_node_t db_dirty_node[TXG_SIZE];
+ /* count of txgs in which we are dirty -- presumably; confirm */
+ uint8_t db_dirtycnt;
+
+ /*
+ * Data which is unique to data (leaf) blocks:
+ */
+ struct {
+ /* stuff we store for the user (see dmu_buf_set_user) */
+ void *db_user_ptr;
+ void **db_user_data_ptr_ptr;
+ dmu_buf_evict_func_t *db_evict_func;
+ uint8_t db_immediate_evict;
+ uint8_t db_freed_in_flight;
+
+ /*
+ * db_data_old[txg&TXG_MASK] is set when we
+ * dirty the buffer, so that we can retain the
+ * pointer even if it gets COW'd in a subsequent
+ * transaction group.
+ *
+ * If the buffer is dirty in any txg, it can't
+ * be destroyed.
+ */
+ /*
+ * XXX Protected by db_mtx and dn_dirty_mtx.
+ * db_mtx must be held to read db_dirty[], and
+ * both db_mtx and dn_dirty_mtx must be held to
+ * modify (dirty or clean). db_mtx must be held
+ * before dn_dirty_mtx.
+ */
+ arc_buf_t *db_data_old[TXG_SIZE];
+ blkptr_t *db_overridden_by[TXG_SIZE];
+ } db_d;
+} dmu_buf_impl_t;
+
+/* Note: the dbuf hash table is exposed only for the mdb module */
+#define DBUF_MUTEXES 256
+#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
+typedef struct dbuf_hash_table {
+ uint64_t hash_table_mask;
+ dmu_buf_impl_t **hash_table;
+ kmutex_t hash_mutexes[DBUF_MUTEXES];
+} dbuf_hash_table_t;
+
+
+uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
+
+dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
+
+dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid);
+dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
+ void *tag);
+dmu_buf_impl_t *dbuf_hold_bonus(struct dnode *dn, void *tag);
+int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create,
+ void *tag, dmu_buf_impl_t **dbp);
+
+void dbuf_prefetch(struct dnode *dn, uint64_t blkid);
+
+void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
+void dbuf_remove_ref(dmu_buf_impl_t *db, void *tag);
+uint64_t dbuf_refcount(dmu_buf_impl_t *db);
+
+void dbuf_rele(dmu_buf_impl_t *db);
+
+dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid);
+
+void dbuf_read(dmu_buf_impl_t *db);
+int dbuf_read_canfail(dmu_buf_impl_t *db);
+void dbuf_read_havestruct(dmu_buf_impl_t *db);
+void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
+void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dbuf_will_fill(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
+void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
+void dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+
+void dbuf_evict(dmu_buf_impl_t *db);
+
+void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx);
+void dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg);
+
+void dbuf_free_range(struct dnode *dn, uint64_t blkid, uint64_t nblks,
+ struct dmu_tx *);
+
+void dbuf_downgrade(dmu_buf_impl_t *db, int evicting);
+void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
+
+void dbuf_init(void);
+void dbuf_fini(void);
+
+#ifdef ZFS_DEBUG
+
+/*
+ * There should be a ## between the string literal and fmt, to make it
+ * clear that we're joining two strings together, but that piece of shit
+ * gcc doesn't support that preprocessor token.
+ */
+#define dprintf_dbuf(dbuf, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char __db_buf[32]; \
+ uint64_t __db_obj = (dbuf)->db.db_object; \
+ if (__db_obj == DMU_META_DNODE_OBJECT) \
+ (void) strcpy(__db_buf, "mdn"); \
+ else \
+ (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
+ (u_longlong_t)__db_obj); \
+ dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \
+ "obj=%s lvl=%u blkid=%lld " fmt, \
+ __db_buf, (dbuf)->db_level, \
+ (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+
+#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char __blkbuf[200]; \
+ sprintf_blkptr(__blkbuf, bp); \
+ dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+
+#else
+
+#define dprintf_dbuf(db, fmt, ...)
+#define dprintf_dbuf_bp(db, bp, fmt, ...)
+
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DBUF_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
new file mode 100644
index 0000000000..f51ab89a90
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -0,0 +1,635 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DMU_H
+#define _SYS_DMU_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * This file describes the interface that the DMU provides for its
+ * consumers.
+ *
+ * The DMU also interacts with the SPA. That interface is described in
+ * dmu_spa.h.
+ */
+
+#include <sys/inttypes.h>
+#include <sys/types.h>
+#include <sys/param.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct uio;
+struct vnode;
+struct spa;
+struct zilog;
+struct zio;
+struct blkptr;
+struct zap_cursor;
+struct dsl_dataset;
+struct dsl_pool;
+struct dnode;
+struct drr_begin;
+struct drr_end;
+
+typedef struct objset objset_t;
+typedef struct dmu_tx dmu_tx_t;
+typedef struct dsl_dir dsl_dir_t;
+
+/*
+ * Every DMU object has a type, which determines how its contents are
+ * byteswapped and interpreted (see dmu_ot[]).  The comment after each
+ * entry notes the storage format of that object's data.
+ */
+typedef enum dmu_object_type {
+ DMU_OT_NONE,
+ /* general: */
+ DMU_OT_OBJECT_DIRECTORY, /* ZAP */
+ DMU_OT_OBJECT_ARRAY, /* UINT64 */
+ DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */
+ DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */
+ DMU_OT_BPLIST, /* UINT64 */
+ DMU_OT_BPLIST_HDR, /* UINT64 */
+ /* spa: */
+ DMU_OT_SPACE_MAP_HEADER, /* UINT64 */
+ DMU_OT_SPACE_MAP, /* UINT64 */
+ /* zil: */
+ DMU_OT_INTENT_LOG, /* UINT64 */
+ /* dmu: */
+ DMU_OT_DNODE, /* DNODE */
+ DMU_OT_OBJSET, /* OBJSET */
+ /* dsl: */
+ DMU_OT_DSL_DATASET, /* UINT64 */
+ DMU_OT_DSL_DATASET_CHILD_MAP, /* ZAP */
+ DMU_OT_DSL_OBJSET_SNAP_MAP, /* ZAP */
+ DMU_OT_DSL_PROPS, /* ZAP */
+ DMU_OT_DSL_OBJSET, /* UINT64 */
+ /* zpl: */
+ DMU_OT_ZNODE, /* ZNODE */
+ DMU_OT_ACL, /* ACL */
+ DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */
+ DMU_OT_DIRECTORY_CONTENTS, /* ZAP */
+ DMU_OT_MASTER_NODE, /* ZAP */
+ DMU_OT_DELETE_QUEUE, /* ZAP */
+ /* zvol: */
+ DMU_OT_ZVOL, /* UINT8 */
+ DMU_OT_ZVOL_PROP, /* ZAP */
+ /* other; for testing only! */
+ DMU_OT_PLAIN_OTHER, /* UINT8 */
+ DMU_OT_UINT64_OTHER, /* UINT64 */
+ DMU_OT_ZAP_OTHER, /* ZAP */
+
+ DMU_OT_NUMTYPES
+} dmu_object_type_t;
+
+/* The kind of consumer data an objset holds (see dmu_objset_open()). */
+typedef enum dmu_objset_type {
+ DMU_OST_NONE,
+ DMU_OST_META,
+ DMU_OST_ZFS,
+ DMU_OST_ZVOL,
+ DMU_OST_OTHER, /* For testing only! */
+ DMU_OST_ANY, /* Be careful! */
+ DMU_OST_NUMTYPES
+} dmu_objset_type_t;
+
+void byteswap_uint64_array(void *buf, size_t size);
+void byteswap_uint32_array(void *buf, size_t size);
+void byteswap_uint16_array(void *buf, size_t size);
+void byteswap_uint8_array(void *buf, size_t size);
+void zap_byteswap(void *buf, size_t size);
+void zfs_acl_byteswap(void *buf, size_t size);
+void zfs_znode_byteswap(void *buf, size_t size);
+
+#define DS_MODE_NONE 0 /* invalid, to aid debugging */
+#define DS_MODE_STANDARD 1 /* normal access, no special needs */
+#define DS_MODE_PRIMARY 2 /* the "main" access, e.g. a mount */
+#define DS_MODE_EXCLUSIVE 3 /* exclusive access, e.g. to destroy */
+#define DS_MODE_LEVELS 4
+#define DS_MODE_LEVEL(x) ((x) & (DS_MODE_LEVELS - 1))
+#define DS_MODE_READONLY 0x8
+#define DS_MODE_IS_READONLY(x) ((x) & DS_MODE_READONLY)
+#define DS_MODE_RESTORE 0x10
+#define DS_MODE_IS_RESTORE(x) ((x) & DS_MODE_RESTORE)
+
+#define DS_FIND_SNAPSHOTS 0x01
+
+/*
+ * The maximum number of bytes that can be accessed as part of one
+ * operation, including metadata.
+ */
+#define DMU_MAX_ACCESS (10<<20) /* 10MB */
+
+/*
+ * Public routines to create, destroy, open, and close objsets.
+ */
+int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
+ objset_t **osp);
+void dmu_objset_close(objset_t *os);
+int dmu_objset_create(const char *name, dmu_objset_type_t type,
+ objset_t *clone_parent,
+ void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg);
+int dmu_objset_destroy(const char *name);
+int dmu_objset_rollback(const char *name);
+int dmu_objset_rename(const char *name, const char *newname);
+void dmu_objset_set_quota(objset_t *os, uint64_t quota);
+uint64_t dmu_objset_get_quota(objset_t *os);
+int dmu_objset_request_reservation(objset_t *os, uint64_t reservation);
+void dmu_objset_find(char *name, void func(char *, void *), void *arg,
+ int flags);
+void dmu_objset_byteswap(void *buf, size_t size);
+
+/*
+ * Public view of a held DMU buffer.  Consumers access the contents via
+ * db_data, after calling dmu_buf_read() (to read) or dmu_buf_will_dirty()
+ * (to modify) -- see the comments above dmu_buf_hold() below.
+ */
+typedef struct dmu_buf {
+ uint64_t db_object; /* object that this buffer is part of */
+ uint64_t db_offset; /* byte offset in this object */
+ uint64_t db_size; /* size of buffer in bytes */
+ void *db_data; /* data in buffer */
+} dmu_buf_t;
+
+typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
+
+/*
+ * Callback function to perform byte swapping on a block.
+ */
+typedef void dmu_byteswap_func_t(void *buf, size_t size);
+
+#define DMU_POOL_DIRECTORY_OBJECT 1
+#define DMU_POOL_CONFIG "config"
+#define DMU_POOL_ROOT_DATASET "root_dataset"
+#define DMU_POOL_SYNC_BPLIST "sync_bplist"
+
+/*
+ * Allocate an object from this objset. The range of object numbers
+ * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode.
+ *
+ * The transaction must be assigned to a txg. The newly allocated
+ * object will be "held" in the transaction (ie. you can modify the
+ * newly allocated object in this transaction).
+ *
+ * dmu_object_alloc() chooses an object and returns it in *objectp.
+ *
+ * dmu_object_claim() allocates a specific object number. If that
+ * number is already allocated, it fails and returns EEXIST.
+ *
+ * Return 0 on success, or ENOSPC or EEXIST as specified above.
+ */
+uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
+int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
+int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+
+/*
+ * Free an object from this objset.
+ *
+ * The object's data will be freed as well (ie. you don't need to call
+ * dmu_free(object, 0, -1, tx)).
+ *
+ * The object need not be held in the transaction.
+ *
+ * If there are any holds on this object's buffers (via dmu_buf_hold()),
+ * or tx holds on the object (via dmu_tx_hold_object()), you can not
+ * free it; it fails and returns EBUSY.
+ *
+ * If the object is not allocated, it fails and returns ENOENT.
+ *
+ * Return 0 on success, or EBUSY or ENOENT as specified above.
+ */
+int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx);
+
+/*
+ * Find the next allocated or free object.
+ *
+ * The objectp parameter is in-out. It will be updated to be the next
+ * object which is allocated.
+ *
+ * XXX Can only be called on an objset with no dirty data.
+ *
+ * Returns 0 on success, or ENOENT if there are no more objects.
+ */
+int dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole);
+
+/*
+ * Set the data blocksize for an object.
+ *
+ * The object cannot have any blocks allocated beyond the first. If
+ * the first block is allocated already, the new size must be greater
+ * than the current block size. If these conditions are not met,
+ * ENOTSUP will be returned.
+ *
+ * Returns 0 on success, or EBUSY if there are any holds on the object
+ * contents, or ENOTSUP as described above.
+ */
+int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
+ int ibs, dmu_tx_t *tx);
+
+/*
+ * Set the checksum property on a dnode. The new checksum algorithm will
+ * apply to all newly written blocks; existing blocks will not be affected.
+ */
+void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
+ dmu_tx_t *tx);
+
+/*
+ * Set the compress property on a dnode. The new compression algorithm will
+ * apply to all newly written blocks; existing blocks will not be affected.
+ */
+void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
+ dmu_tx_t *tx);
+
+/*
+ * The bonus data is accessed more or less like a regular buffer.
+ * You must dmu_bonus_hold() to get the buffer, which will give you a
+ * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
+ * data. As with any normal buffer, you must call dmu_buf_read() to
+ * read db_data, dmu_buf_will_dirty() before modifying it, and the
+ * object must be held in an assigned transaction before calling
+ * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus
+ * buffer as well. You must release your hold with dmu_buf_rele().
+ */
+dmu_buf_t *dmu_bonus_hold(objset_t *os, uint64_t object);
+dmu_buf_t *dmu_bonus_hold_tag(objset_t *os, uint64_t object, void *tag);
+int dmu_bonus_max(void);
+
+/*
+ * Obtain the DMU buffer from the specified object which contains the
+ * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so
+ * that it will remain in memory. You must release the hold with
+ * dmu_buf_rele(). You mustn't access the dmu_buf_t after releasing your
+ * hold. You must have a hold on any dmu_buf_t* you pass to the DMU.
+ *
+ * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill
+ * on the returned buffer before reading or writing the buffer's
+ * db_data. The comments for those routines describe what particular
+ * operations are valid after calling them.
+ *
+ * The object number must be a valid, allocated object number.
+ */
+dmu_buf_t *dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset);
+void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
+void dmu_buf_remove_ref(dmu_buf_t *db, void* tag);
+void dmu_buf_rele(dmu_buf_t *db);
+void dmu_buf_rele_tag(dmu_buf_t *db, void *tag);
+uint64_t dmu_buf_refcount(dmu_buf_t *db);
+
+/*
+ * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a
+ * range of an object. A pointer to an array of dmu_buf_t*'s is
+ * returned (in *dbpp).
+ *
+ * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and
+ * frees the array. The hold on the array of buffers MUST be released
+ * with dmu_buf_rele_array. You can NOT release the hold on each buffer
+ * individually with dmu_buf_rele.
+ */
+dmu_buf_t **dmu_buf_hold_array(objset_t *os, uint64_t object,
+ uint64_t offset, uint64_t length, int *numbufs);
+void dmu_buf_rele_array(dmu_buf_t **, int numbufs);
+
+/*
+ * Returns NULL on success, or the existing user ptr if it's already
+ * been set.
+ *
+ * user_ptr is for use by the user and can be obtained via dmu_buf_get_user().
+ *
+ * user_data_ptr_ptr should be NULL, or a pointer to a pointer which
+ * will be set to db->db_data when you are allowed to access it. Note
+ * that db->db_data (the pointer) can change when you do dmu_buf_read(),
+ * dmu_buf_tryupgrade(), dmu_buf_will_dirty(), or dmu_buf_will_fill().
+ * *user_data_ptr_ptr will be set to the new value when it changes.
+ *
+ * If non-NULL, pageout func will be called when this buffer is being
+ * excised from the cache, so that you can clean up the data structure
+ * pointed to by user_ptr.
+ *
+ * dmu_evict_user() will call the pageout func for all buffers in an
+ * objset with a given pageout func.
+ */
+void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr,
+ dmu_buf_evict_func_t *pageout_func);
+/*
+ * set_user_ie is the same as set_user, but request immediate eviction
+ * when hold count goes to zero.
+ */
+void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr,
+ void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func);
+void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr,
+ void *user_ptr, void *user_data_ptr_ptr,
+ dmu_buf_evict_func_t *pageout_func);
+void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func);
+
+void dmu_buf_hold_data(dmu_buf_t *db);
+void dmu_buf_rele_data(dmu_buf_t *db);
+
+/*
+ * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set.
+ */
+void *dmu_buf_get_user(dmu_buf_t *db);
+
+/*
+ * Indicate that you are going to read the buffer's data (db_data).
+ *
+ * This routine will read the data from disk if necessary.
+ *
+ * These routines will return 0 on success, or an errno if there is a
+ * nonrecoverable I/O error.
+ */
+void dmu_buf_read(dmu_buf_t *db);
+int dmu_buf_read_canfail(dmu_buf_t *db);
+void dmu_buf_read_array(dmu_buf_t **dbp, int numbufs);
+int dmu_buf_read_array_canfail(dmu_buf_t **dbp, int numbufs);
+
+/*
+ * Indicate that you are going to modify the buffer's data (db_data).
+ *
+ * The transaction (tx) must be assigned to a txg (ie. you've called
+ * dmu_tx_assign()). The buffer's object must be held in the tx
+ * (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
+ */
+void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
+
+/*
+ * Indicate that you are going to modify the entire contents of the
+ * buffer's data ("fill" it).
+ *
+ * This routine is the same as dmu_buf_will_dirty, except that it won't
+ * read the contents off the disk, so the contents may be uninitialized
+ * and you must overwrite it.
+ *
+ * The transaction (tx) must be assigned to a txg (ie. you've called
+ * dmu_tx_assign()). The buffer's object must be held in the tx (ie.
+ * you've called dmu_tx_hold_object(tx, db->db_object)).
+ */
+/* void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); */
+
+/*
+ * You must create a transaction, then hold the objects which you will
+ * (or might) modify as part of this transaction. Then you must assign
+ * the transaction to a transaction group. Once the transaction has
+ * been assigned, you can modify buffers which belong to held objects as
+ * part of this transaction. You can't modify buffers before the
+ * transaction has been assigned; you can't modify buffers which don't
+ * belong to objects which this transaction holds; you can't hold
+ * objects once the transaction has been assigned. You may hold an
+ * object which you are going to free (with dmu_object_free()), but you
+ * don't have to.
+ *
+ * You can abort the transaction before it has been assigned.
+ *
+ * Note that you may hold buffers (with dmu_buf_hold) at any time,
+ * regardless of transaction state.
+ */
+
+#define DMU_NEW_OBJECT (-1ULL)
+#define DMU_OBJECT_END (-1ULL)
+
+dmu_tx_t *dmu_tx_create(objset_t *os);
+void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
+void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
+ uint64_t len);
+void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int ops);
+void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_abort(dmu_tx_t *tx);
+int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
+void dmu_tx_commit(dmu_tx_t *tx);
+
+/*
+ * Free up the data blocks for a defined range of a file. If size is
+ * zero, the range from offset to end-of-file is freed.
+ */
+void dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t size, dmu_tx_t *tx);
+
+/*
+ * Convenience functions.
+ *
+ * Canfail routines will return 0 on success, or an errno if there is a
+ * nonrecoverable I/O error.
+ */
+void dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ void *buf);
+int dmu_read_canfail(objset_t *dd, uint64_t object, uint64_t offset,
+ uint64_t size, void *buf);
+void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ const void *buf, dmu_tx_t *tx);
+int dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ struct uio *uio, dmu_tx_t *tx);
+
+/*
+ * Asynchronously try to read in the data.
+ */
+void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t len);
+
+/* Snapshot of an object's metadata, filled in by dmu_object_info(). */
+typedef struct dmu_object_info {
+ /* All sizes are in bytes. */
+ uint32_t doi_data_block_size;
+ uint32_t doi_metadata_block_size;
+ uint64_t doi_bonus_size;
+ dmu_object_type_t doi_type;
+ dmu_object_type_t doi_bonus_type;
+ uint8_t doi_indirection; /* 2 = dnode->indirect->data */
+ uint8_t doi_checksum;
+ uint8_t doi_compress;
+ uint8_t doi_pad[5];
+ /* Values below are number of 512-byte blocks. */
+ uint64_t doi_physical_blks; /* data + metadata */
+ uint64_t doi_max_block_offset;
+} dmu_object_info_t;
+
+/*
+ * Per-object-type attributes, indexed by dmu_object_type_t in the
+ * global dmu_ot[] table below.
+ */
+typedef struct dmu_object_type_info {
+ dmu_byteswap_func_t *ot_byteswap; /* byteswap routine for this type */
+ boolean_t ot_metadata; /* B_TRUE if the type holds metadata */
+ char *ot_name; /* human-readable type name */
+} dmu_object_type_info_t;
+
+extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
+
+/*
+ * Get information on a DMU object.
+ *
+ * Return 0 on success or ENOENT if object is not allocated.
+ *
+ * If doi is NULL, just indicates whether the object exists.
+ */
+int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
+void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
+void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
+void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
+ u_longlong_t *nblk512);
+
+/*
+ * Get the maximum nonzero offset in the object (ie. this offset and all
+ * offsets following are zero).
+ *
+ * XXX Perhaps integrate this with dmu_object_info(), although that
+ * would then have to bring in the indirect blocks.
+ */
+uint64_t dmu_object_max_nonzero_offset(objset_t *os, uint64_t object);
+
+/*
+ * Statistics and properties describing a dataset, as reported by
+ * dmu_objset_stats().
+ */
+typedef struct dmu_objset_stats {
+ dmu_objset_type_t dds_type;
+ uint8_t dds_is_snapshot;
+ uint8_t dds_is_placeholder;
+ uint8_t dds_pad[2];
+
+ uint64_t dds_creation_time;
+ uint64_t dds_creation_txg;
+
+ char dds_clone_of[MAXNAMELEN];
+
+ /* How much data is there in this objset? */
+
+ /*
+ * Space referenced, taking into account pending writes and
+ * frees. Only relevant to filesystems and snapshots (not
+ * collections).
+ */
+ uint64_t dds_space_refd;
+
+ /*
+ * Space "used", taking into account pending writes and frees, and
+ * children's reservations (in bytes). This is the amount of
+ * space that will be freed if this and all dependent items are
+ * destroyed (eg. child datasets, objsets, and snapshots). So
+ * for snapshots, this is the amount of space unique to this
+ * snapshot.
+ */
+ uint64_t dds_space_used;
+
+ /*
+ * Compressed and uncompressed bytes consumed. Does not take
+ * into account reservations. Used for computing compression
+ * ratio.
+ */
+ uint64_t dds_compressed_bytes;
+ uint64_t dds_uncompressed_bytes;
+
+ /*
+ * The dds_fsid_guid is a 56-bit ID that can change to avoid
+ * collisions. The dds_guid is a 64-bit ID that will never
+ * change, so there is a small probability that it will collide.
+ */
+ uint64_t dds_fsid_guid;
+ uint64_t dds_guid;
+
+ uint64_t dds_objects_used; /* number of objects used */
+ uint64_t dds_objects_avail; /* number of objects available */
+
+ uint64_t dds_num_clones; /* number of clones of this */
+
+ /* The dataset's administratively-set quota, in bytes. */
+ uint64_t dds_quota;
+
+ /* The dataset's administratively-set reservation, in bytes */
+ uint64_t dds_reserved;
+
+ /*
+ * The amount of additional space that this dataset can consume.
+ * Takes into account quotas & reservations.
+ * (Assuming that no other datasets consume it first.)
+ */
+ uint64_t dds_available;
+
+ /*
+ * Various properties.
+ */
+ uint64_t dds_compression;
+ uint64_t dds_checksum;
+ uint64_t dds_zoned;
+ char dds_compression_setpoint[MAXNAMELEN];
+ char dds_checksum_setpoint[MAXNAMELEN];
+ char dds_zoned_setpoint[MAXNAMELEN];
+ char dds_altroot[MAXPATHLEN];
+
+ /* The following are for debugging purposes only */
+ uint64_t dds_last_txg;
+ uint64_t dds_dir_obj;
+ uint64_t dds_objset_obj;
+ uint64_t dds_clone_of_obj;
+} dmu_objset_stats_t;
+
+/*
+ * Get stats on a dataset.
+ */
+void dmu_objset_stats(objset_t *os, dmu_objset_stats_t *dds);
+
+int dmu_objset_is_snapshot(objset_t *os);
+
+extern struct spa *dmu_objset_spa(objset_t *os);
+extern struct zilog *dmu_objset_zil(objset_t *os);
+extern struct dsl_pool *dmu_objset_pool(objset_t *os);
+extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
+extern void dmu_objset_name(objset_t *os, char *buf);
+extern dmu_objset_type_t dmu_objset_type(objset_t *os);
+extern uint64_t dmu_objset_id(objset_t *os);
+extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
+ uint64_t *id, uint64_t *offp);
+
+/*
+ * Return the txg number for the given assigned transaction.
+ */
+uint64_t dmu_tx_get_txg(dmu_tx_t *tx); /* XXX */
+
+/*
+ * Synchronous write.
+ * On success returns 0 and fills in the blk pointed at by bp.
+ * Note that while the data covered by this function will be on stable
+ * storage when the function returns this new data does not become a
+ * permanent part of the file until the associated transaction commits.
+ */
+int dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
+ struct blkptr *bp, uint64_t txg);
+
+/*
+ * Find the next hole or data block in file starting at *off
+ * Return found offset in *off. Return ESRCH for end of file.
+ */
+int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
+ uint64_t *off);
+
+/*
+ * Initial setup and final teardown.
+ */
+extern void dmu_init(void);
+extern void dmu_fini(void);
+
+typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
+ uint64_t object, uint64_t offset, int len);
+void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
+ dmu_traverse_cb_t cb, void *arg);
+
+int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp);
+int dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
+ struct vnode *vp, uint64_t voffset);
+
+/* CRC64 table */
+#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */
+extern uint64_t zfs_crc64_table[256];
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h
new file mode 100644
index 0000000000..b6e8b62ec2
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h
@@ -0,0 +1,230 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DMU_IMPL_H
+#define _SYS_DMU_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/txg_impl.h>
+#include <sys/zio.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This is the locking strategy for the DMU. Numbers in parenthesis are
+ * cases that use that lock order, referenced below:
+ *
+ * ARC is self-contained
+ * bplist is self-contained
+ * refcount is self-contained
+ * txg is self-contained (hopefully!)
+ * zst_lock
+ * zf_rwlock
+ *
+ * XXX try to improve evicting path?
+ *
+ * dp_config_rwlock > os_obj_lock > dn_struct_rwlock >
+ * dn_dbufs_mtx > hash_mutexes > db_mtx > leafs
+ *
+ * dp_config_rwlock
+ * must be held before: everything
+ * protects dd namespace changes
+ * protects property changes globally
+ * held from:
+ * dsl_dir_open/r:
+ * dsl_dir_create_sync/w:
+ * dsl_dir_sync_destroy/w:
+ * dsl_dir_rename_sync/w:
+ * dsl_prop_changed_notify/r:
+ *
+ * os_obj_lock
+ * must be held before:
+ * everything except dp_config_rwlock
+ * protects os_obj_next
+ * held from:
+ * dmu_object_alloc: dn_dbufs_mtx, db_mtx, hash_mutexes, dn_struct_rwlock
+ *
+ * dn_struct_rwlock
+ * must be held before:
+ * everything except dp_config_rwlock and os_obj_lock
+ * protects structure of dnode (eg. nlevels)
+ * db_blkptr can change when syncing out change to nlevels
+ * dn_maxblkid
+ * dn_nlevels
+ * dn_*blksz*
+ * phys nlevels, maxblkid, physical blkptr_t's (?)
+ * held from:
+ * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch
+ * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz)
+ * dmu_tx_count_free:
+ * dbuf_read_impl: db_mtx, dmu_zfetch()
+ * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch()
+ * dbuf_new_size: db_mtx
+ * dbuf_dirty: db_mtx
+ * dbuf_findbp: (callers, phys? - the real need)
+ * dbuf_create: dn_dbufs_mtx, hash_mutexes, db_mtx (phys?)
+ * dbuf_prefetch: dn_dirty_mtx, hash_mutexes, db_mtx, dn_dbufs_mtx
+ * dbuf_hold_impl: hash_mutexes, db_mtx, dn_dbufs_mtx, dbuf_findbp()
+ * dnode_sync/w (increase_indirection): db_mtx (phys)
+ * dnode_set_blksz/w: dn_dbufs_mtx (dn_*blksz*)
+ * dnode_new_blkid/w: (dn_maxblkid)
+ * dnode_free_range/w: dn_dirty_mtx (dn_maxblkid)
+ * dnode_next_offset: (phys)
+ *
+ * dn_dbufs_mtx
+ * must be held before:
+ * db_mtx, hash_mutexes
+ * protects:
+ * dn_dbufs
+ * dn_evicted
+ * held from:
+ * dmu_evict_user: db_mtx (dn_dbufs)
+ * dbuf_free_range: db_mtx (dn_dbufs)
+ * dbuf_remove_ref: db_mtx, callees:
+ * dbuf_hash_remove: hash_mutexes, db_mtx
+ * dbuf_create: hash_mutexes, db_mtx (dn_dbufs)
+ * dnode_set_blksz: (dn_dbufs)
+ *
+ * hash_mutexes (global)
+ * must be held before:
+ * db_mtx
+ * protects dbuf_hash_table (global) and db_hash_next
+ * held from:
+ * dbuf_find: db_mtx
+ * dbuf_hash_insert: db_mtx
+ * dbuf_hash_remove: db_mtx
+ *
+ * db_mtx (meta-leaf)
+ * must be held before:
+ * dn_mtx, dn_dirty_mtx, dd_lock (leaf mutexes)
+ * protects:
+ * db_state
+ * db_holds
+ * db_buf
+ * db_changed
+ * db_data_pending
+ * db_dirtied
+ * db_link
+ * db_dirty_node (??)
+ * db_dirtycnt
+ * db_d.*
+ * db.*
+ * held from:
+ * dbuf_dirty: dn_mtx, dn_dirty_mtx
+ * dbuf_dirty->dsl_dir_willuse_space: dd_lock
+ * dbuf_dirty->dbuf_new_block->dsl_dataset_block_freeable: dd_lock
+ * dbuf_undirty: dn_dirty_mtx (db_d)
+ * dbuf_write_done: dn_dirty_mtx (db_state)
+ * dbuf_*
+ * dmu_buf_update_user: none (db_d)
+ * dmu_evict_user: none (db_d) (maybe can eliminate)
+ * dbuf_find: none (db_holds)
+ * dbuf_hash_insert: none (db_holds)
+ * dmu_buf_read_array_impl: none (db_state, db_changed)
+ * dmu_sync: none (db_dirty_node, db_d)
+ * dnode_reallocate: none (db)
+ *
+ * dn_mtx (leaf)
+ * protects:
+ * dn_dirty_dbufs
+ * dn_ranges
+ * phys accounting
+ * dn_allocated_txg
+ * dn_free_txg
+ * dn_assigned_txg
+ * dd_assigned_tx
+ * dn_notxholds
+ * dn_dirtyctx
+ * dn_dirtyctx_firstset
+ * (dn_phys copy fields?)
+ * (dn_phys contents?)
+ * held from:
+ * dnode_*
+ * dbuf_dirty: none
+ * dbuf_sync: none (phys accounting)
+ * dbuf_undirty: none (dn_ranges, dn_dirty_dbufs)
+ * dbuf_write_done: none (phys accounting)
+ * dmu_object_info_from_dnode: none (accounting)
+ * dmu_tx_commit: none
+ * dmu_tx_hold_object_impl: none
+ * dmu_tx_try_assign: dn_notxholds(cv)
+ * dmu_tx_unassign: none
+ *
+ * dd_lock (leaf)
+ * protects:
+ * dd_prop_cbs
+ * dd_sync_*
+ * dd_used_bytes
+ * dd_tempreserved
+ * dd_space_towrite
+ * dd_myname
+ * dd_phys accounting?
+ * held from:
+ * dsl_dir_*
+ * dsl_prop_changed_notify: none (dd_prop_cbs)
+ * dsl_prop_register: none (dd_prop_cbs)
+ * dsl_prop_unregister: none (dd_prop_cbs)
+ * dsl_dataset_block_freeable: none (dd_sync_*)
+ *
+ * os_lock (leaf)
+ * protects:
+ * os_dirty_dnodes
+ * os_free_dnodes
+ * os_dnodes
+ * os_downgraded_dbufs
+ * dn_dirtyblksz
+ * dn_dirty_link
+ * held from:
+ * dnode_create: none (os_dnodes)
+ * dnode_destroy: none (os_dnodes)
+ * dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes)
+ * dnode_free: none (dn_dirtyblksz, os_*_dnodes)
+ *
+ * ds_lock (leaf)
+ * protects:
+ * ds_user_ptr
+ * ds_user_evict_func
+ * ds_open_refcount
+ * ds_snapname
+ * ds_phys accounting
+ * held from:
+ * dsl_dataset_*
+ *
+ */
+
+struct objset;
+struct dmu_pool;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
new file mode 100644
index 0000000000..d0a77fcfb9
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
@@ -0,0 +1,122 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DMU_OBJSET_H
+#define _SYS_DMU_OBJSET_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zfs_context.h>
+#include <sys/dnode.h>
+#include <sys/zio.h>
+#include <sys/zil.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+struct dmu_tx;
+struct objset_impl;
+
+/*
+ * On-disk objset header: the meta-dnode, the ZIL header, and the objset
+ * type, padded out to 1024 bytes.
+ */
+typedef struct objset_phys {
+ dnode_phys_t os_meta_dnode;
+ zil_header_t os_zil_header;
+ uint64_t os_type;
+ char os_pad[1024 - sizeof (dnode_phys_t) - sizeof (zil_header_t) -
+ sizeof (uint64_t)];
+} objset_phys_t;
+
+/* Consumer-visible handle for an open objset; os points at the impl. */
+struct objset {
+ struct objset_impl *os;
+ int os_mode;
+};
+
+/*
+ * In-core state for an open objset.  The roles of os_obj_lock and
+ * os_lock are documented in the lock-order comment in dmu_impl.h.
+ */
+typedef struct objset_impl {
+ /* Immutable: */
+ struct dsl_dataset *os_dsl_dataset;
+ spa_t *os_spa;
+ objset_phys_t *os_phys;
+ dnode_t *os_meta_dnode;
+ zilog_t *os_zil;
+ objset_t os;
+ uint8_t os_checksum; /* can change, under dsl_dir's locks */
+ uint8_t os_compress; /* can change, under dsl_dir's locks */
+ uint8_t os_md_checksum;
+ uint8_t os_md_compress;
+
+ /* no lock needed: */
+ struct dmu_tx *os_synctx; /* XXX sketchy */
+ blkptr_t os_rootbp;
+
+ /* Protected by os_obj_lock */
+ kmutex_t os_obj_lock;
+ uint64_t os_obj_next;
+
+ /* Protected by os_lock */
+ kmutex_t os_lock;
+ list_t os_dirty_dnodes[TXG_SIZE];
+ list_t os_free_dnodes[TXG_SIZE];
+ list_t os_dnodes;
+ list_t os_downgraded_dbufs;
+} objset_impl_t;
+
+#define DMU_PRIVATE_OBJECT (1ULL << 63)
+
+#define DMU_META_DNODE_OBJECT (1ULL << 63)
+
+/* XXX rename this to DMU_IS_DNODE_OBJECT? */
+#define IS_DNODE_DNODE(object) ((object) == DMU_META_DNODE_OBJECT)
+
+/* called from zpl */
+int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
+ objset_t **osp);
+void dmu_objset_close(objset_t *os);
+int dmu_objset_create(const char *name, dmu_objset_type_t type,
+ objset_t *clone_parent,
+ void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg);
+int dmu_objset_destroy(const char *name);
+int dmu_objset_rollback(const char *name);
+void dmu_objset_stats(objset_t *os, dmu_objset_stats_t *dds);
+void dmu_objset_find(char *name, void func(char *, void *), void *arg,
+ int flags);
+void dmu_objset_byteswap(void *buf, size_t size);
+
+/* called from dsl */
+void dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx);
+objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
+ dmu_objset_type_t type, dmu_tx_t *tx);
+objset_impl_t *dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds,
+ blkptr_t *bp);
+void dmu_objset_evict(struct dsl_dataset *ds, void *arg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_OBJSET_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h
new file mode 100644
index 0000000000..7087912e00
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h
@@ -0,0 +1,125 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DMU_TRAVERSE_H
+#define _SYS_DMU_TRAVERSE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+#include <sys/dnode.h>
+#include <sys/arc.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ADVANCE_POST 0 /* post-order traversal */
+#define ADVANCE_PRE 0x01 /* pre-order traversal */
+#define ADVANCE_PRUNE 0x02 /* prune by prev snapshot birth time */
+#define ADVANCE_DATA 0x04 /* read user data blocks */
+#define ADVANCE_HOLES 0x08 /* visit holes */
+#define ADVANCE_NOLOCK 0x10 /* Don't grab SPA sync lock */
+
+#define ZB_NO_LEVEL -2
+#define ZB_MAXLEVEL 32 /* Next power of 2 >= DN_MAX_LEVELS */
+#define ZB_MAXBLKID (1ULL << 62)
+#define ZB_MAXOBJSET (1ULL << 62)
+#define ZB_MAXOBJECT (1ULL << 62)
+
+#define ZB_MOS_CACHE 0
+#define ZB_MDN_CACHE 1
+#define ZB_DN_CACHE 2
+#define ZB_DEPTH 3
+
+typedef struct zbookmark {
+ uint64_t zb_objset;
+ uint64_t zb_object;
+ int zb_level;
+ uint64_t zb_blkid;
+} zbookmark_t;
+
+typedef struct zseg {
+ uint64_t seg_mintxg;
+ uint64_t seg_maxtxg;
+ zbookmark_t seg_start;
+ zbookmark_t seg_end;
+ list_node_t seg_node;
+} zseg_t;
+
+typedef struct traverse_blk_cache {
+ zbookmark_t bc_bookmark;
+ blkptr_t bc_blkptr;
+ void *bc_data;
+ dnode_phys_t *bc_dnode;
+ int bc_errno;
+ int bc_pad1;
+ uint64_t bc_pad2;
+} traverse_blk_cache_t;
+
+typedef int (blkptr_cb_t)(traverse_blk_cache_t *bc, spa_t *spa, void *arg);
+
+struct traverse_handle {
+ spa_t *th_spa;
+ blkptr_cb_t *th_func;
+ void *th_arg;
+ int th_advance;
+ int th_zio_flags;
+ list_t th_seglist;
+ traverse_blk_cache_t th_cache[ZB_DEPTH][ZB_MAXLEVEL];
+ uint64_t th_hits;
+ uint64_t th_arc_hits;
+ uint64_t th_reads;
+ uint64_t th_callbacks;
+ uint64_t th_syncs;
+ uint64_t th_restarts;
+ zbookmark_t th_noread;
+ zbookmark_t th_lastcb;
+};
+
+int traverse_dsl_dataset(struct dsl_dataset *ds, uint64_t txg_start,
+ int advance, blkptr_cb_t func, void *arg);
+
+traverse_handle_t *traverse_init(spa_t *spa, blkptr_cb_t *func, void *arg,
+ int advance, int zio_flags);
+void traverse_fini(traverse_handle_t *th);
+
+void traverse_add_dnode(traverse_handle_t *th,
+ uint64_t mintxg, uint64_t maxtxg, uint64_t objset, uint64_t object);
+void traverse_add_objset(traverse_handle_t *th,
+ uint64_t mintxg, uint64_t maxtxg, uint64_t objset);
+void traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg);
+
+int traverse_more(traverse_handle_t *th);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_TRAVERSE_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_tx.h b/usr/src/uts/common/fs/zfs/sys/dmu_tx.h
new file mode 100644
index 0000000000..5d2f1127ce
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_tx.h
@@ -0,0 +1,154 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DMU_TX_H
+#define _SYS_DMU_TX_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/inttypes.h>
+#include <sys/dmu.h>
+#include <sys/txg.h>
+#include <sys/refcount.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dmu_buf_impl;
+struct dnode_link;
+struct dsl_pool;
+struct dnode;
+struct dsl_dir;
+
+struct dmu_tx {
+ /*
+ * No synchronization is needed because a tx can only be handled
+ * by one thread.
+ */
+ list_t tx_holds; /* list of dmu_tx_hold_t */
+ objset_t *tx_objset;
+ struct dsl_dir *tx_dir;
+ struct dsl_pool *tx_pool;
+ uint64_t tx_txg;
+ txg_handle_t tx_txgh;
+ uint64_t tx_space_towrite;
+ refcount_t tx_space_written;
+ uint64_t tx_space_tofree;
+ refcount_t tx_space_freed;
+ uint64_t tx_space_tooverwrite;
+ void *tx_tempreserve_cookie;
+ uint8_t tx_anyobj;
+ uint8_t tx_privateobj;
+#ifdef ZFS_DEBUG
+ char *tx_debug_buf;
+ int tx_debug_len;
+#endif
+};
+
+enum dmu_tx_hold_type {
+ THT_NEWOBJECT,
+ THT_WRITE,
+ THT_BONUS,
+ THT_FREE,
+ THT_ZAP,
+ THT_SPACE,
+ THT_NUMTYPES
+};
+
+typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
+ uint64_t arg1, uint64_t arg2);
+
+
+typedef struct dmu_tx_hold {
+ list_node_t dth_node;
+ struct dnode *dth_dnode;
+ enum dmu_tx_hold_type dth_type;
+ dmu_tx_hold_func_t dth_func;
+ uint64_t dth_arg1;
+ uint64_t dth_arg2;
+ /* XXX track what the actual estimates were for this hold */
+} dmu_tx_hold_t;
+
+
+/*
+ * These routines are defined in dmu.h, and are called by the user.
+ */
+dmu_tx_t *dmu_tx_create(objset_t *dd);
+int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
+void dmu_tx_commit(dmu_tx_t *tx);
+void dmu_tx_abort(dmu_tx_t *tx);
+uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
+
+/*
+ * These routines are defined in dmu_spa.h, and are called by the SPA.
+ */
+extern dmu_tx_t *dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg);
+
+/*
+ * These routines are only called by the DMU.
+ */
+dmu_tx_t *dmu_tx_create_ds(dsl_dir_t *dd);
+int dmu_tx_is_syncing(dmu_tx_t *tx);
+int dmu_tx_private_ok(dmu_tx_t *tx);
+void dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object);
+void dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta);
+void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db);
+int dmu_tx_holds(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space);
+
+#ifdef ZFS_DEBUG
+
+extern int dmu_use_tx_debug_bufs;
+
+#define dprintf_tx(tx, fmt, ...) \
+ if (dmu_use_tx_debug_bufs) \
+ do { \
+ char *__bufp; \
+ int __len; \
+ if (tx->tx_debug_buf == NULL) { \
+ __bufp = kmem_zalloc(4096, KM_SLEEP); \
+ tx->tx_debug_buf = __bufp; \
+ tx->tx_debug_len = __len = 4096; \
+ } else { \
+ __len = tx->tx_debug_len; \
+ __bufp = &tx->tx_debug_buf[4096-__len]; \
+ } \
+ tx->tx_debug_len -= snprintf(__bufp, __len, fmt, __VA_ARGS__); \
+_NOTE(CONSTCOND) } while (0); \
+ else dprintf(fmt, __VA_ARGS__)
+
+#else
+
+#define dprintf_tx(tx, fmt, ...)
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_TX_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h b/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h
new file mode 100644
index 0000000000..35466d6874
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h
@@ -0,0 +1,76 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _DFETCH_H
+#define _DFETCH_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern uint64_t zfetch_array_rd_sz;
+
+struct dnode; /* so we can reference dnode */
+
+typedef enum zfetch_dirn {
+ ZFETCH_FORWARD = 1, /* prefetch increasing block numbers */
+ ZFETCH_BACKWARD = -1 /* prefetch decreasing block numbers */
+} zfetch_dirn_t;
+
+typedef struct zstream {
+ uint64_t zst_offset; /* offset of starting block in range */
+ uint64_t zst_len; /* length of range, in blocks */
+ zfetch_dirn_t zst_direction; /* direction of prefetch */
+ uint64_t zst_stride; /* length of stride, in blocks */
+ uint64_t zst_ph_offset; /* prefetch offset, in blocks */
+ uint64_t zst_cap; /* prefetch limit (cap), in blocks */
+ kmutex_t zst_lock; /* protects stream */
+ clock_t zst_last; /* lbolt of last prefetch */
+ avl_node_t zst_node; /* embed avl node here */
+} zstream_t;
+
+typedef struct zfetch {
+ krwlock_t zf_rwlock; /* protects zfetch structure */
+	list_t		zf_stream;	/* list of zstream_t's */
+ struct dnode *zf_dnode; /* dnode that owns this zfetch */
+ uint32_t zf_stream_cnt; /* # of active streams */
+ uint64_t zf_alloc_fail; /* # of failed attempts to alloc strm */
+} zfetch_t;
+
+void dmu_zfetch_init(zfetch_t *, struct dnode *);
+void dmu_zfetch_rele(zfetch_t *);
+void dmu_zfetch(zfetch_t *, uint64_t, uint64_t);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _DFETCH_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dnode.h b/usr/src/uts/common/fs/zfs/sys/dnode.h
new file mode 100644
index 0000000000..2a5ef92b52
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dnode.h
@@ -0,0 +1,301 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DNODE_H
+#define _SYS_DNODE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/refcount.h>
+#include <sys/dmu_zfetch.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Flags.
+ */
+#define DNODE_MUST_BE_ALLOCATED 1
+#define DNODE_MUST_BE_FREE 2
+
+/*
+ * Fixed constants.
+ */
+#define DNODE_SHIFT 9 /* 512 bytes */
+#define DN_MIN_INDBLKSHIFT 10 /* 1k */
+#define DN_MAX_INDBLKSHIFT 14 /* 16k */
+#define DNODE_BLOCK_SHIFT 14 /* 16k */
+#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */
+#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */
+#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */
+
+/*
+ * Derived constants.
+ */
+#define DNODE_SIZE (1 << DNODE_SHIFT)
+#define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
+#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
+
+#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
+#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
+#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
+
+#define DN_META_DNODE_LEVELS \
+ (1 + (DN_MAX_OBJECT_SHIFT - DNODE_SHIFT + SPA_BLKPTRSHIFT - \
+ DNODES_PER_BLOCK_SHIFT) / DNODES_PER_LEVEL_SHIFT)
+
+/* The +2 here is a cheesy way to round up */
+#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
+ (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT)))
+
+#define DN_MAX_OBJECT \
+ ((uint64_t)DN_MAX_NBLKPTR << (DNODES_PER_BLOCK_SHIFT + \
+ (DN_META_DNODE_LEVELS - 1) * DNODES_PER_LEVEL_SHIFT))
+
+#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \
+ (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t))))
+
+#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift))
+
+struct dmu_buf_impl;
+struct objset_impl;
+struct zio;
+
+enum dnode_dirtycontext {
+ DN_UNDIRTIED,
+ DN_DIRTY_OPEN,
+ DN_DIRTY_SYNC
+};
+
+typedef struct dnode_phys {
+ uint8_t dn_type; /* dmu_object_type_t */
+ uint8_t dn_indblkshift; /* ln2(indirect block size) */
+ uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */
+ uint8_t dn_nblkptr; /* length of dn_blkptr */
+ uint8_t dn_bonustype; /* type of data in bonus buffer */
+ uint8_t dn_checksum; /* ZIO_CHECKSUM type */
+ uint8_t dn_compress; /* ZIO_COMPRESS type */
+ uint8_t dn_pad1[1];
+ uint16_t dn_datablkszsec; /* data block size in 512b sectors */
+ uint16_t dn_bonuslen; /* length of dn_bonus */
+ uint8_t dn_pad2[4];
+
+ /* accounting is protected by dn_dirty_mtx */
+ uint64_t dn_maxblkid; /* largest allocated block ID */
+ uint64_t dn_secphys; /* 512b sectors of disk space used */
+
+ uint64_t dn_pad3[4];
+
+ blkptr_t dn_blkptr[1];
+ uint8_t dn_bonus[DN_MAX_BONUSLEN];
+} dnode_phys_t;
+
+typedef struct dnode {
+ /*
+ * lock ordering:
+ *
+ * db_mtx > dn_dirty_mtx
+ * dbuf_syncdone
+ *
+ * dn_struct_rwlock/r > dn_dirty_mtx
+ * dmu_object_info
+ *
+ * dn_struct_rwlock/r > db_mtx > dn_dirty_mtx
+ * dbuf_dirty
+ * dbuf_setdirty
+ *
+ * dn_struct_rwlock/w > db_mtx > dn_mtx
+ * dnode_increase_indirection -> dbuf_find
+ * dbuf_hold_impl
+ * dnode_set_bonus
+ *
+ * dn_struct_rwlock/w > dn_mtx
+ * dnode_increase_indirection
+ *
+ * dn_dirty_mtx > dn_mtx
+ * dnode_buf_pageout
+ *
+ * db_mtx > dn_mtx
+ * dbuf_create
+ */
+
+ /*
+ * dn_struct_rwlock protects the structure of the dnode.
+ * In particular, it protects the number of levels of indirection.
+ */
+ krwlock_t dn_struct_rwlock;
+
+ /*
+ * Our link on dataset's dd_dnodes list.
+ * Protected by dd_accounting_mtx.
+ */
+ list_node_t dn_link;
+
+ /* immutable: */
+ struct objset_impl *dn_objset;
+ uint64_t dn_object;
+ struct dmu_buf_impl *dn_dbuf;
+ dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
+
+ /*
+ * Copies of stuff in dn_phys. They're valid here even before
+ * the dnode is first synced.
+ */
+ dmu_object_type_t dn_type; /* object type (immutable) */
+ uint8_t dn_bonustype; /* bonus type (immutable) */
+ uint16_t dn_bonuslen; /* bonus length (immutable) */
+ uint8_t dn_nblkptr; /* number of blkptrs (immutable) */
+ uint8_t dn_datablkshift; /* zero if blksz not power of 2! */
+ uint32_t dn_datablksz; /* in bytes */
+ uint16_t dn_datablkszsec; /* in 512b sectors */
+
+ uint8_t dn_checksum; /* ZIO_CHECKSUM type */
+ uint8_t dn_compress; /* ZIO_COMPRESS type */
+
+ /*
+ * The following are kept up-to-date in the *open* context, the syncing
+ * context should only pay attention to the dn_next_* values.
+ */
+ uint8_t dn_nlevels;
+ uint8_t dn_indblkshift;
+
+ uint8_t dn_next_nlevels[TXG_SIZE];
+ uint8_t dn_next_indblkshift[TXG_SIZE];
+
+ /* protected by os_lock: */
+ uint32_t dn_dirtyblksz[TXG_SIZE]; /* dirty block size in bytes */
+ list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */
+
+ /* protected by dn_mtx: */
+ kmutex_t dn_mtx;
+ list_t dn_dirty_dbufs[TXG_SIZE];
+ uint64_t dn_maxblkid;
+ avl_tree_t dn_ranges[TXG_SIZE];
+ uint64_t dn_allocated_txg;
+ uint64_t dn_free_txg;
+ uint64_t dn_assigned_txg;
+ struct dmu_tx *dn_assigned_tx; /* if only one tx cares */
+ kcondvar_t dn_notxholds;
+ enum dnode_dirtycontext dn_dirtyctx;
+ uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */
+
+ /* protected by own devices */
+ refcount_t dn_tx_holds;
+ refcount_t dn_holds;
+
+ kmutex_t dn_dbufs_mtx;
+	list_t		dn_dbufs;	/* linked list of descendant dbuf_t's */
+ kcondvar_t dn_evicted; /* a child dbuf has been evicted */
+
+ /*
+ * Performance hack: whenever we have a hold on the bonus buffer of a
+ * ZAP object, we will also have a hold on db0. This will keep the
+ * meta-data for a micro-zap object cached as long as the znode for the
+ * object is in the znode cache.
+ */
+ struct dmu_buf_impl *dn_db0;
+
+ /* holds prefetch structure */
+ struct zfetch dn_zfetch;
+} dnode_t;
+
+typedef struct free_range {
+ avl_node_t fr_node;
+ uint64_t fr_blkid;
+ uint64_t fr_nblks;
+} free_range_t;
+
+dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp,
+ uint64_t object);
+void dnode_special_close(dnode_t *dn);
+
+dnode_t *dnode_hold(struct objset_impl *dd, uint64_t object, void *ref);
+dnode_t *dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag,
+ void *ref);
+void dnode_add_ref(dnode_t *dn, void *ref);
+void dnode_rele(dnode_t *dn, void *ref);
+void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
+int dnode_sync(dnode_t *dn, int level, struct zio *zio, dmu_tx_t *tx);
+void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+void dnode_free(dnode_t *dn, dmu_tx_t *tx);
+void dnode_byteswap(dnode_phys_t *dnp);
+void dnode_buf_byteswap(void *buf, size_t size);
+void dnode_verify(dnode_t *dn);
+int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx);
+uint64_t dnode_current_max_length(dnode_t *dn);
+uint64_t dnode_max_nonzero_offset(dnode_t *dn);
+void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx);
+void dnode_clear_range(dnode_t *dn, uint64_t blkid,
+ uint64_t nblks, dmu_tx_t *tx);
+void dnode_diduse_space(dnode_t *dn, int64_t space);
+void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx);
+void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx);
+uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid);
+void dnode_init(void);
+void dnode_fini(void);
+int dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *off, int minlvl,
+ uint64_t blkfill);
+
+#ifdef ZFS_DEBUG
+
+/*
+ * There should be a ## between the string literal and fmt, to make it
+ * clear that we're joining two strings together, but that piece of shit
+ * gcc doesn't support that preprocessor token.
+ */
+#define dprintf_dnode(dn, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char __db_buf[32]; \
+ uint64_t __db_obj = (dn)->dn_object; \
+ if (__db_obj == DMU_META_DNODE_OBJECT) \
+ (void) strcpy(__db_buf, "mdn"); \
+ else \
+ (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
+ (u_longlong_t)__db_obj);\
+ dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \
+ __db_buf, __VA_ARGS__); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+
+#else
+
+#define dprintf_dnode(db, fmt, ...)
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DNODE_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
new file mode 100644
index 0000000000..e56c8a67d9
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -0,0 +1,164 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DSL_DATASET_H
+#define _SYS_DSL_DATASET_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/bplist.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+struct dsl_dir;
+struct dsl_pool;
+
+typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *);
+
+typedef struct dsl_dataset_phys {
+ uint64_t ds_dir_obj;
+ uint64_t ds_prev_snap_obj;
+ uint64_t ds_prev_snap_txg;
+ uint64_t ds_next_snap_obj;
+ uint64_t ds_snapnames_zapobj; /* zap obj of snaps; ==0 for snaps */
+ uint64_t ds_num_children; /* clone/snap children; ==0 for head */
+ uint64_t ds_creation_time; /* seconds since 1970 */
+ uint64_t ds_creation_txg;
+ uint64_t ds_deadlist_obj;
+ uint64_t ds_used_bytes;
+ uint64_t ds_compressed_bytes;
+ uint64_t ds_uncompressed_bytes;
+	uint64_t ds_unique_bytes;	/* only relevant to snapshots */
+ /*
+ * The ds_fsid_guid is a 56-bit ID that can change to avoid
+ * collisions. The ds_guid is a 64-bit ID that will never
+ * change, so there is a small probability that it will collide.
+ */
+ uint64_t ds_fsid_guid;
+ uint64_t ds_guid;
+ uint64_t ds_restoring; /* boolean */
+ blkptr_t ds_bp;
+ uint64_t ds_pad[8]; /* pad out to 256 bytes for good measure */
+} dsl_dataset_phys_t;
+
+typedef struct dsl_dataset {
+ /* Immutable: */
+ struct dsl_dir *ds_dir;
+ dsl_dataset_phys_t *ds_phys;
+ dmu_buf_t *ds_dbuf;
+ uint64_t ds_object;
+
+ /* only used in syncing context: */
+ struct dsl_dataset *ds_prev; /* only valid for non-snapshots */
+
+ /* has internal locking: */
+ bplist_t ds_deadlist;
+
+ /* protected by lock on pool's dp_dirty_datasets list */
+ txg_node_t ds_dirty_link;
+ list_node_t ds_synced_link;
+
+ /*
+ * ds_phys->ds_<accounting> is also protected by ds_lock.
+ * Protected by ds_lock:
+ */
+ kmutex_t ds_lock;
+ void *ds_user_ptr;
+ dsl_dataset_evict_func_t *ds_user_evict_func;
+ uint64_t ds_open_refcount;
+
+ /* Protected by ds_lock; keep at end of struct for better locality */
+ char ds_snapname[MAXNAMELEN];
+} dsl_dataset_t;
+
+#define dsl_dataset_is_snapshot(ds) \
+ ((ds)->ds_phys->ds_num_children != 0)
+
+int dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
+ void *tag, dsl_dataset_t **dsp);
+int dsl_dataset_open(const char *name, int mode, void *tag,
+ dsl_dataset_t **dsp);
+dsl_dataset_t *dsl_dataset_open_obj(struct dsl_pool *dp, uint64_t dsobj,
+ const char *tail, int mode, void *tag);
+void dsl_dataset_name(dsl_dataset_t *ds, char *name);
+void dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag);
+int dsl_dataset_create_sync(dsl_dir_t *pds, const char *fullname,
+ const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx);
+int dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx);
+int dsl_dataset_destroy(const char *name);
+int dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx);
+int dsl_dataset_rollback(const char *name);
+int dsl_dataset_rollback_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx);
+int dsl_dataset_rename(const char *name, const char *newname);
+
+void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
+ void *p, dsl_dataset_evict_func_t func);
+void *dsl_dataset_get_user_ptr(dsl_dataset_t *ds);
+
+void dsl_dataset_get_blkptr(dsl_dataset_t *ds, blkptr_t *bp);
+void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
+
+spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds);
+
+void dsl_dataset_sync(dsl_dataset_t *os, dmu_tx_t *tx);
+
+void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
+void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
+int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth,
+ dmu_tx_t *tx);
+
+void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx);
+void dsl_dataset_stats(dsl_dataset_t *os, dmu_objset_stats_t *dds);
+struct dsl_pool *dsl_dataset_pool(dsl_dataset_t *ds);
+
+void dsl_dataset_create_root(struct dsl_pool *dp, uint64_t *ddobjp,
+ dmu_tx_t *tx);
+
+#ifdef ZFS_DEBUG
+#define dprintf_ds(ds, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char *__ds_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); \
+ dsl_dataset_name(ds, __ds_name); \
+ dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \
+ kmem_free(__ds_name, MAXNAMELEN); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+#else
+#define dprintf_ds(dd, fmt, ...)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DATASET_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h
new file mode 100644
index 0000000000..0499d731e6
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h
@@ -0,0 +1,143 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DSL_DIR_H
+#define _SYS_DSL_DIR_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/refcount.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+
+typedef struct dsl_dir_phys {
+ uint64_t dd_creation_time;
+ uint64_t dd_head_dataset_obj;
+ uint64_t dd_parent_obj;
+ uint64_t dd_clone_parent_obj;
+ uint64_t dd_child_dir_zapobj;
+ /*
+ * how much space our children are accounting for; for leaf
+ * datasets, == physical space used by fs + snaps
+ */
+ uint64_t dd_used_bytes;
+ uint64_t dd_compressed_bytes;
+ uint64_t dd_uncompressed_bytes;
+ /* Administrative quota setting */
+ uint64_t dd_quota;
+ /* Administrative reservation setting */
+ uint64_t dd_reserved;
+ uint64_t dd_props_zapobj;
+ uint64_t dd_pad[21]; /* pad out to 256 bytes for good measure */
+} dsl_dir_phys_t;
+
+struct dsl_dir {
+ /* These are immutable; no lock needed: */
+ uint64_t dd_object;
+ dsl_dir_phys_t *dd_phys;
+ dmu_buf_t *dd_dbuf;
+ dsl_pool_t *dd_pool;
+
+ /* protected by lock on pool's dp_dirty_dirs list */
+ txg_node_t dd_dirty_link;
+
+ /* protected by dp_config_rwlock */
+ dsl_dir_t *dd_parent;
+
+ /* Protected by dd_lock */
+ kmutex_t dd_lock;
+ list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */
+ /* Thing to do when we sync */
+ uint64_t dd_sync_txg;
+ int (*dd_sync_func)(dsl_dir_t *dd, void *arg, dmu_tx_t *tx);
+ void *dd_sync_arg;
+ int dd_sync_err;
+
+ /* Accounting */
+ /* reflects any changes to dd_phys->dd_used_bytes made this syncing */
+ int64_t dd_used_bytes;
+ /* int64_t dd_compressed_bytes; */
+ /* int64_t dd_uncompressed_bytes; */
+ /* gross estimate of space used by in-flight tx's */
+ uint64_t dd_tempreserved[TXG_SIZE];
+ /* amount of space we expect to write; == amount of dirty data */
+ int64_t dd_space_towrite[TXG_SIZE];
+
+ /* protected by dd_lock; keep at end of struct for better locality */
+ char dd_myname[MAXNAMELEN];
+};
+
+void dsl_dir_close(dsl_dir_t *dd, void *tag);
+dsl_dir_t *dsl_dir_open(const char *name, void *tag, const char **tail);
+dsl_dir_t *dsl_dir_open_spa(spa_t *spa, const char *name, void *tag,
+ const char **tailp);
+dsl_dir_t *dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
+ const char *tail, void *tag);
+void dsl_dir_name(dsl_dir_t *dd, char *buf);
+int dsl_dir_is_private(dsl_dir_t *dd);
+int dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx);
+void dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx);
+int dsl_dir_destroy_sync(dsl_dir_t *pds, void *arg, dmu_tx_t *tx);
+void dsl_dir_stats(dsl_dir_t *dd, dmu_objset_stats_t *dds);
+void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx);
+void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx);
+int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem,
+ uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx);
+void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx);
+void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx);
+void dsl_dir_diduse_space(dsl_dir_t *dd,
+ int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx);
+int dsl_dir_sync_task(dsl_dir_t *dd,
+ int (*func)(dsl_dir_t *, void*, dmu_tx_t *), void *arg, uint64_t space);
+int dsl_dir_set_quota(const char *ddname, uint64_t quota);
+int dsl_dir_set_reservation(const char *ddname, uint64_t reservation);
+int dsl_dir_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx);
+
+#ifdef ZFS_DEBUG
+#define dprintf_dd(dd, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char *__ds_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); \
+ dsl_dir_name(dd, __ds_name); \
+ dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \
+ kmem_free(__ds_name, MAXNAMELEN); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+#else
+#define dprintf_dd(dd, fmt, ...)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DIR_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
new file mode 100644
index 0000000000..4fca4548ad
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
@@ -0,0 +1,82 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DSL_POOL_H
+#define _SYS_DSL_POOL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/txg_impl.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct objset;
+struct dsl_dir;
+
+typedef struct dsl_pool {
+ /* Immutable */
+ spa_t *dp_spa;
+ struct objset *dp_meta_objset;
+ struct dsl_dir *dp_root_dir;
+ struct dsl_dir *dp_mos_dir;
+ uint64_t dp_root_dir_obj;
+
+ /* No lock needed - sync context only */
+ blkptr_t dp_meta_rootbp;
+ list_t dp_synced_objsets;
+
+ /* Has its own locking */
+ tx_state_t dp_tx;
+ txg_list_t dp_dirty_datasets;
+ txg_list_t dp_dirty_dirs;
+
+ /*
+ * Protects administrative changes (properties, namespace)
+ * It is only held for write in syncing context. Therefore
+ * syncing context does not need to ever have it for read, since
+ * nobody else could possibly have it for write.
+ */
+ krwlock_t dp_config_rwlock;
+} dsl_pool_t;
+
+dsl_pool_t *dsl_pool_open(spa_t *spa, uint64_t txg);
+void dsl_pool_close(dsl_pool_t *dp);
+dsl_pool_t *dsl_pool_create(spa_t *spa, uint64_t txg);
+void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
+void dsl_pool_zil_clean(dsl_pool_t *dp);
+int dsl_pool_sync_context(dsl_pool_t *dp);
+uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_POOL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_prop.h b/usr/src/uts/common/fs/zfs/sys/dsl_prop.h
new file mode 100644
index 0000000000..ea810b03ab
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_prop.h
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DSL_PROP_H
+#define _SYS_DSL_PROP_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+
+/* The callback func may not call into the DMU or DSL! */
+typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval);
+
+#define DSL_PROP_VALUE_UNDEFINED (-1ULL)
+
+typedef struct dsl_prop_cb_record {
+ list_node_t cbr_node; /* link on dd_prop_cbs */
+ const char *cbr_propname;
+ dsl_prop_changed_cb_t *cbr_func;
+ void *cbr_arg;
+} dsl_prop_cb_record_t;
+
+int dsl_prop_register(struct dsl_dataset *ds, const char *propname,
+ dsl_prop_changed_cb_t *callback, void *cbarg);
+int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname,
+ dsl_prop_changed_cb_t *callback, void *cbarg);
+
+int dsl_prop_get(const char *ddname, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint);
+int dsl_prop_get_string(const char *ddname, const char *propname,
+ char *value, int valuelen, char *setpoint);
+int dsl_prop_get_integer(const char *ddname, const char *propname,
+ uint64_t *valuep, char *setpoint);
+int dsl_prop_get_ds_integer(dsl_dir_t *dd, const char *propname,
+ uint64_t *valuep, char *setpoint);
+
+int dsl_prop_set(const char *ddname, const char *propname,
+ int intsz, int numints, const void *buf);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_PROP_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h
new file mode 100644
index 0000000000..e592b388fd
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h
@@ -0,0 +1,73 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_METASLAB_H
+#define _SYS_METASLAB_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/space_map.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/avl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct metaslab_class metaslab_class_t;
+typedef struct metaslab_group metaslab_group_t;
+
+extern void metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
+ metaslab_t **mspp, uint64_t offset, uint64_t size, uint64_t txg);
+extern void metaslab_fini(metaslab_t *msp);
+extern void metaslab_sync(metaslab_t *msp, uint64_t txg);
+extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
+
+extern int metaslab_alloc(spa_t *spa, uint64_t size, dva_t *dva, uint64_t txg);
+extern void metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg);
+extern int metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg);
+
+extern metaslab_class_t *metaslab_class_create(void);
+extern void metaslab_class_destroy(metaslab_class_t *mc);
+extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg);
+extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg);
+
+extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,
+ vdev_t *vd);
+extern void metaslab_group_destroy(metaslab_group_t *mg);
+extern void metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp,
+ uint64_t weight);
+extern void metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp);
+extern void metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp,
+ uint64_t weight);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_METASLAB_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
new file mode 100644
index 0000000000..5b1e388727
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -0,0 +1,125 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_METASLAB_IMPL_H
+#define _SYS_METASLAB_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/metaslab.h>
+#include <sys/space_map.h>
+#include <sys/vdev.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct metaslab_class {
+ metaslab_group_t *mc_rotor;
+ uint64_t mc_allocated;
+};
+
+struct metaslab_group {
+ kmutex_t mg_lock;
+ avl_tree_t mg_metaslab_tree;
+ uint64_t mg_aliquot;
+ int64_t mg_bias;
+ metaslab_class_t *mg_class;
+ vdev_t *mg_vd;
+ metaslab_group_t *mg_prev;
+ metaslab_group_t *mg_next;
+};
+
+/*
+ * Each metaslab's free block list is kept in its own DMU object in the
+ * metaslab freelist dataset. To minimize space consumption, the list
+ * is circular.
+ *
+ * Allocations and frees can happen in multiple transaction groups at
+ * the same time, which makes it a bit challenging to keep the metaslab
+ * consistent. For example, we cannot allow frees from different
+ * transaction groups to be interleaved in the metaslab's free block list.
+ *
+ * We address this in several ways:
+ *
+ * We don't allow allocations from the same metaslab in concurrent
+ * transaction groups. metaslab_alloc() enforces this by checking
+ * the ms_last_alloc field, which specifies the last txg in which
+ * the metaslab was used for allocations.
+ *
+ * We can't segregate frees this way because we can't choose which
+ * DVAs someone wants to free. So we keep separate in-core freelists
+ * for each active transaction group. This in-core data is only
+ * written to the metaslab's on-disk freelist in metaslab_sync(),
+ * which solves the interleave problem: we only append frees from
+ * the syncing txg to the on-disk freelist, so the appends all occur
+ * in txg order.
+ *
+ * We cannot allow a block which was freed in a given txg to be
+ * allocated again until that txg has closed; otherwise, if we
+ * failed to sync that txg and had to roll back to txg - 1,
+ * changes in txg + 1 could have overwritten the data. Therefore,
+ * we partition the free blocks into "available" and "limbo" states.
+ * A block is available if the txg in which it was freed has closed;
+ * until then, the block is in limbo. Each time metaslab_sync() runs,
+ * it first adds any limbo blocks to the avail list, clears the limbo
+ * list, and starts writing the new limbo blocks (i.e. the ones that
+ * were freed in the syncing txg).
+ */
+
+struct metaslab {
+ kmutex_t ms_lock; /* metaslab lock */
+ space_map_obj_t *ms_smo; /* space map object */
+ uint64_t ms_last_alloc; /* txg of last alloc */
+ uint64_t ms_usable_end; /* end of free_obj at last sync */
+ uint64_t ms_usable_space; /* usable space at last sync */
+ metaslab_group_t *ms_group; /* metaslab group */
+ avl_node_t ms_group_node; /* node in metaslab group tree */
+ uint64_t ms_weight; /* weight vs. others in group */
+ uint8_t ms_dirty[TXG_SIZE]; /* per-txg dirty flags */
+ space_map_t ms_allocmap[TXG_SIZE]; /* allocated this txg */
+ space_map_t ms_freemap[TXG_SIZE]; /* freed this txg */
+ txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
+ space_map_t ms_map; /* in-core free space map */
+ uint8_t ms_map_incore; /* space map contents are valid */
+ uint64_t ms_map_cursor[SPA_ASIZEBITS]; /* XXX -- PPD */
+};
+
+/*
+ * ms_dirty[] flags
+ */
+#define MSD_ALLOC 0x01 /* allocated from in this txg */
+#define MSD_FREE 0x02 /* freed to in this txg */
+#define MSD_ADD 0x04 /* added to the pool in this txg */
+#define MSD_CONDENSE 0x08 /* condensed in this txg */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_METASLAB_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/refcount.h b/usr/src/uts/common/fs/zfs/sys/refcount.h
new file mode 100644
index 0000000000..f9fffd2443
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/refcount.h
@@ -0,0 +1,105 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_REFCOUNT_H
+#define _SYS_REFCOUNT_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/inttypes.h>
+#include <sys/list.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * If the reference is held only by the calling function and not any
+ * particular object, use FTAG (which is a string) for the holder_tag.
+ * Otherwise, use the object that holds the reference.
+ */
+#define FTAG ((void*)__func__)
+
+#if defined(DEBUG) || !defined(_KERNEL)
+typedef struct reference {
+ list_node_t ref_link;
+ void *ref_holder;
+ uint64_t ref_number;
+ uint8_t *ref_removed;
+} reference_t;
+
+typedef struct refcount {
+ kmutex_t rc_mtx;
+ list_t rc_list;
+ list_t rc_removed;
+ int64_t rc_count;
+ int64_t rc_removed_count;
+} refcount_t;
+
+/* Note: refcount_t should be initialized to zero before use. */
+
+void refcount_create(refcount_t *rc);
+void refcount_destroy(refcount_t *rc);
+void refcount_destroy_many(refcount_t *rc, uint64_t number);
+int refcount_is_zero(refcount_t *rc);
+int64_t refcount_count(refcount_t *rc);
+int64_t refcount_add(refcount_t *rc, void *holder_tag);
+int64_t refcount_remove(refcount_t *rc, void *holder_tag);
+int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag);
+int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);
+
+void refcount_init(void);
+void refcount_fini(void);
+
+#else /* DEBUG */
+
+typedef struct refcount {
+ uint64_t rc_count;
+} refcount_t;
+
+#define refcount_create(rc) ((rc)->rc_count = 0)
+#define refcount_destroy(rc) ((rc)->rc_count = 0)
+#define refcount_destroy_many(rc, number) ((rc)->rc_count = 0)
+#define refcount_is_zero(rc) ((rc)->rc_count == 0)
+#define refcount_count(rc) ((rc)->rc_count)
+#define refcount_add(rc, holder) atomic_add_64_nv(&(rc)->rc_count, 1)
+#define refcount_remove(rc, holder) atomic_add_64_nv(&(rc)->rc_count, -1)
+#define refcount_add_many(rc, number, holder) \
+ atomic_add_64_nv(&(rc)->rc_count, number)
+#define refcount_remove_many(rc, number, holder) \
+ atomic_add_64_nv(&(rc)->rc_count, -number)
+
+#define refcount_init()
+#define refcount_fini()
+
+#endif /* DEBUG */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_REFCOUNT_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
new file mode 100644
index 0000000000..9bf0f89d49
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -0,0 +1,406 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SPA_H
+#define _SYS_SPA_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/avl.h>
+#include <sys/zfs_context.h>
+#include <sys/nvpair.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/fs/zfs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Forward references that lots of things need.
+ */
+typedef struct spa spa_t;
+typedef struct vdev vdev_t;
+typedef struct metaslab metaslab_t;
+typedef struct zilog zilog_t;
+typedef struct traverse_handle traverse_handle_t;
+struct dsl_pool;
+
+/*
+ * General-purpose 32-bit and 64-bit bitfield encodings.
+ */
+#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len))
+#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len))
+#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low))
+#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low))
+
+#define BF32_GET(x, low, len) BF32_DECODE(x, low, len)
+#define BF64_GET(x, low, len) BF64_DECODE(x, low, len)
+
+#define BF32_SET(x, low, len, val) \
+ ((x) ^= BF32_ENCODE((x >> low) ^ val, low, len))
+#define BF64_SET(x, low, len, val) \
+ ((x) ^= BF64_ENCODE((x >> low) ^ val, low, len))
+
+#define BF32_GET_SB(x, low, len, shift, bias) \
+ ((BF32_GET(x, low, len) + (bias)) << (shift))
+#define BF64_GET_SB(x, low, len, shift, bias) \
+ ((BF64_GET(x, low, len) + (bias)) << (shift))
+
+#define BF32_SET_SB(x, low, len, shift, bias, val) \
+ BF32_SET(x, low, len, ((val) >> (shift)) - (bias))
+#define BF64_SET_SB(x, low, len, shift, bias, val) \
+ BF64_SET(x, low, len, ((val) >> (shift)) - (bias))
+
+/*
+ * We currently support nine block sizes, from 512 bytes to 128K.
+ * We could go higher, but the benefits are near-zero and the cost
+ * of COWing a giant block to modify one byte would become excessive.
+ */
+#define SPA_MINBLOCKSHIFT 9
+#define SPA_MAXBLOCKSHIFT 17
+#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT)
+#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT)
+
+#define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
+
+/*
+ * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
+ * The ASIZE encoding should be at least 64 times larger (6 more bits)
+ * to support up to 4-way RAID-Z mirror mode with worst-case gang block
+ * overhead, three DVAs per bp, plus one more bit in case we do anything
+ * else that expands the ASIZE.
+ */
+#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */
+#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */
+#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */
+
+/*
+ * All SPA data is represented by 128-bit data virtual addresses (DVAs).
+ * The members of the dva_t should be considered opaque outside the SPA.
+ */
+typedef struct dva {
+ uint64_t dva_word[2];
+} dva_t;
+
+/*
+ * Each block has a 256-bit checksum -- strong enough for cryptographic hashes.
+ */
+typedef struct zio_cksum {
+ uint64_t zc_word[4];
+} zio_cksum_t;
+
+/*
+ * Each block is described by its DVAs, time of birth, checksum, etc.
+ * The word-by-word, bit-by-bit layout of the blkptr is as follows:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0 | vdev1 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 1 |G| offset1 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 2 | vdev2 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 3 |G| offset2 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 4 | vdev3 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 5 |G| offset3 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 6 |E| lvl | type | cksum | comp | PSIZE | LSIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 7 | padding |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 8 | padding |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 9 | padding |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * a | birth txg |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * b | fill count |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * c | checksum[0] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * d | checksum[1] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * e | checksum[2] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * f | checksum[3] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Legend:
+ *
+ * vdev virtual device ID
+ * offset offset into virtual device
+ * LSIZE logical size
+ * PSIZE physical size (after compression)
+ * ASIZE allocated size (including RAID-Z parity and gang block headers)
+ * GRID RAID-Z layout information (reserved for future use)
+ * cksum checksum function
+ * comp compression function
+ * G gang block indicator
+ * E endianness
+ * type DMU object type
+ * lvl level of indirection
+ * birth txg transaction group in which the block was born
+ * fill count number of non-zero blocks under this bp
+ * checksum[4] 256-bit checksum of the data this bp describes
+ */
+typedef struct blkptr {
+ dva_t blk_dva[3]; /* 128-bit Data Virtual Address */
+ uint64_t blk_prop; /* size, compression, type, etc */
+ uint64_t blk_pad[3]; /* Extra space for the future */
+ uint64_t blk_birth; /* transaction group at birth */
+ uint64_t blk_fill; /* fill count */
+ zio_cksum_t blk_cksum; /* 256-bit checksum */
+} blkptr_t;
+
+#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
+#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
+
+/*
+ * Macros to get and set fields in a bp or DVA.
+ */
+#define DVA_GET_ASIZE(dva) \
+ BF64_GET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0)
+#define DVA_SET_ASIZE(dva, x) \
+ BF64_SET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0, x)
+
+#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8)
+#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x)
+
+#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32)
+#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x)
+
+#define DVA_GET_OFFSET(dva) \
+ BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
+#define DVA_SET_OFFSET(dva, x) \
+ BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x)
+
+#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1)
+#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x)
+
+#define BP_GET_LSIZE(bp) \
+ (BP_IS_HOLE(bp) ? 0 : \
+ BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1))
+#define BP_SET_LSIZE(bp, x) \
+ BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define BP_GET_PSIZE(bp) \
+ BF64_GET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1)
+#define BP_SET_PSIZE(bp, x) \
+ BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8)
+#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x)
+
+#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8)
+#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x)
+
+#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
+#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)
+
+#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
+#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
+
+#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1))
+#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
+
+#define BP_GET_ASIZE(bp) \
+ (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+ DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+
+#define DVA_EQUAL(dva1, dva2) \
+ ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
+ (dva1)->dva_word[0] == (dva2)->dva_word[0])
+
+#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0)
+
+#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \
+{ \
+ (zcp)->zc_word[0] = w0; \
+ (zcp)->zc_word[1] = w1; \
+ (zcp)->zc_word[2] = w2; \
+ (zcp)->zc_word[3] = w3; \
+}
+
+#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0)
+
+#define BP_IDENTITY(bp) (&(bp)->blk_dva[0])
+
+#define BP_ZERO(bp) \
+{ \
+ (bp)->blk_dva[0].dva_word[0] = 0; \
+ (bp)->blk_dva[0].dva_word[1] = 0; \
+ (bp)->blk_dva[1].dva_word[0] = 0; \
+ (bp)->blk_dva[1].dva_word[1] = 0; \
+ (bp)->blk_dva[2].dva_word[0] = 0; \
+ (bp)->blk_dva[2].dva_word[1] = 0; \
+ (bp)->blk_prop = 0; \
+ (bp)->blk_pad[0] = 0; \
+ (bp)->blk_pad[1] = 0; \
+ (bp)->blk_pad[2] = 0; \
+ (bp)->blk_birth = 0; \
+ (bp)->blk_fill = 0; \
+ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
+}
+
+/*
+ * Note: the byteorder is either 0 or -1, both of which are palindromes.
+ * This simplifies the endianness handling a bit.
+ */
+#ifdef _BIG_ENDIAN
+#define ZFS_HOST_BYTEORDER (0ULL)
+#else
+#define ZFS_HOST_BYTEORDER (-1ULL)
+#endif
+
+#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)
+
+#include <sys/dmu.h>
+
+/*
+ * Routines found in spa.c
+ */
+
+/* state manipulation functions */
+extern int spa_open(const char *pool, spa_t **, void *tag);
+extern int spa_get_stats(const char *pool, nvlist_t **config);
+extern int spa_create(const char *pool, nvlist_t *config, char *altroot);
+extern int spa_import(const char *pool, nvlist_t *config, char *altroot);
+extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
+extern int spa_destroy(char *pool);
+extern int spa_export(char *pool);
+
+/* device manipulation */
+extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
+extern int spa_vdev_add_unlocked(spa_t *spa, nvlist_t *nvroot);
+extern int spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot,
+ int replacing);
+extern int spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid,
+ int replace_done);
+extern void spa_vdev_replace_done(spa_t *spa);
+
+/* scrubbing */
+extern int spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force);
+extern void spa_scrub_suspend(spa_t *spa);
+extern void spa_scrub_resume(spa_t *spa);
+extern void spa_scrub_restart(spa_t *spa, uint64_t txg);
+
+/* spa syncing */
+extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
+extern void spa_sync_allpools(void);
+
+/*
+ * SPA configuration functions in spa_config.c
+ */
+extern void spa_config_sync(void);
+extern void spa_config_load(void);
+extern nvlist_t *spa_all_configs(uint64_t *);
+extern void spa_config_set(spa_t *spa, nvlist_t *config);
+extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
+ int getstats);
+
+/*
+ * Miscellaneous SPA routines in spa_misc.c
+ */
+
+/* Namespace manipulation */
+extern spa_t *spa_lookup(const char *name);
+extern spa_t *spa_add(const char *name);
+extern void spa_remove(spa_t *spa);
+extern spa_t *spa_next(spa_t *prev);
+
+/* Refcount functions */
+extern void spa_open_ref(spa_t *spa, void *tag);
+extern void spa_close(spa_t *spa, void *tag);
+extern boolean_t spa_refcount_zero(spa_t *spa);
+
+/* Pool configuration lock */
+extern void spa_config_enter(spa_t *spa, krw_t rw);
+extern void spa_config_exit(spa_t *spa);
+extern boolean_t spa_config_held(spa_t *spa, krw_t rw);
+
+/* Pool vdev add/remove lock */
+extern uint64_t spa_vdev_enter(spa_t *spa);
+extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
+
+/* Accessor functions */
+extern krwlock_t *spa_traverse_rwlock(spa_t *spa);
+extern int spa_traverse_wanted(spa_t *spa);
+extern struct dsl_pool *spa_get_dsl(spa_t *spa);
+extern blkptr_t *spa_get_rootblkptr(spa_t *spa);
+extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
+extern void spa_altroot(spa_t *, char *, size_t);
+extern int spa_sync_pass(spa_t *spa);
+extern char *spa_name(spa_t *spa);
+extern uint64_t spa_guid(spa_t *spa);
+extern uint64_t spa_last_synced_txg(spa_t *spa);
+extern uint64_t spa_first_txg(spa_t *spa);
+extern int spa_state(spa_t *spa);
+extern uint64_t spa_freeze_txg(spa_t *spa);
+struct metaslab_class;
+extern struct metaslab_class *spa_metaslab_class_select(spa_t *spa);
+extern uint64_t spa_get_alloc(spa_t *spa);
+extern uint64_t spa_get_space(spa_t *spa);
+extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
+extern int spa_busy(void);
+
+/* Miscellaneous support routines */
+extern int spa_rename(const char *oldname, const char *newname);
+extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
+extern char *spa_strdup(const char *);
+extern void spa_strfree(char *);
+extern uint64_t spa_get_random(uint64_t range);
+extern void sprintf_blkptr(char *buf, blkptr_t *bp);
+extern void spa_freeze(spa_t *spa);
+extern void spa_evict_all(void);
+
+/* Initialization and termination */
+extern void spa_init(int flags);
+extern void spa_fini(void);
+
+#ifdef ZFS_DEBUG
+#define dprintf_bp(bp, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char __blkbuf[200]; \
+ sprintf_blkptr(__blkbuf, (bp)); \
+ dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+#else
+#define dprintf_bp(bp, fmt, ...)
+#endif
+
+extern int spa_mode; /* mode, e.g. FREAD | FWRITE */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SPA_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
new file mode 100644
index 0000000000..0fcef6c48b
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
@@ -0,0 +1,118 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SPA_IMPL_H
+#define _SYS_SPA_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/vdev.h>
+#include <sys/metaslab.h>
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/uberblock_impl.h>
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <sys/refcount.h>
+#include <sys/bplist.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct spa_config_lock {
+ kmutex_t scl_lock;
+ uint64_t scl_count;
+ kthread_t *scl_writer;
+ kcondvar_t scl_cv;
+} spa_config_lock_t;
+
+struct spa {
+ /*
+ * Fields protected by spa_namespace_lock.
+ */
+ char *spa_name;
+ avl_node_t spa_avl;
+ int spa_anon;
+ nvlist_t *spa_config;
+ uint64_t spa_config_txg; /* txg of last config change */
+ spa_config_lock_t spa_config_lock; /* configuration changes */
+ kmutex_t spa_config_cache_lock; /* for spa_config RW_READER */
+ int spa_sync_pass; /* iterate-to-convergence */
+ int spa_state; /* pool state */
+ uint8_t spa_minref; /* min refcnt of open pool */
+ uint8_t spa_traverse_wanted; /* traverse lock wanted */
+ taskq_t *spa_vdev_retry_taskq;
+ taskq_t *spa_zio_issue_taskq[ZIO_TYPES];
+ taskq_t *spa_zio_intr_taskq[ZIO_TYPES];
+ dsl_pool_t *spa_dsl_pool;
+ metaslab_class_t *spa_normal_class; /* normal data class */
+ uint64_t spa_first_txg; /* first txg after spa_open() */
+ uint64_t spa_freeze_txg; /* freeze pool at this txg */
+ objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */
+ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */
+ vdev_t *spa_root_vdev; /* top-level vdev container */
+ list_t spa_dirty_list; /* vdevs with dirty labels */
+ uint64_t spa_config_object; /* MOS object for pool config */
+ uint64_t spa_syncing_txg; /* txg currently syncing */
+ uint64_t spa_sync_bplist_obj; /* object for deferred frees */
+ bplist_t spa_sync_bplist; /* deferred-free bplist */
+ krwlock_t spa_traverse_lock; /* traverse vs. spa_sync() */
+ uberblock_t spa_ubsync; /* last synced uberblock */
+ uberblock_t spa_uberblock; /* current uberblock */
+ kmutex_t spa_scrub_lock; /* resilver/scrub lock */
+ kthread_t *spa_scrub_thread; /* scrub/resilver thread */
+ traverse_handle_t *spa_scrub_th; /* scrub traverse handle */
+ uint64_t spa_scrub_restart_txg; /* need to restart */
+ uint64_t spa_scrub_maxtxg; /* max txg we'll scrub */
+ uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */
+ uint64_t spa_scrub_errors; /* scrub I/O error count */
+ kcondvar_t spa_scrub_cv; /* scrub thread state change */
+ kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */
+ uint8_t spa_scrub_stop; /* tell scrubber to stop */
+ uint8_t spa_scrub_suspend; /* tell scrubber to suspend */
+ uint8_t spa_scrub_active; /* active or suspended? */
+ uint8_t spa_scrub_type; /* type of scrub we're doing */
+ int spa_sync_on; /* sync threads are running */
+ char *spa_root; /* alternate root directory */
+ kmutex_t spa_uberblock_lock; /* vdev_uberblock_load_done() */
+ /*
+ * spa_refcnt must be the last element because it changes size based on
+ * compilation options. In order for the MDB module to function
+ * correctly, the other fields must remain in the same location.
+ */
+ refcount_t spa_refcount; /* number of opens */
+};
+
+extern const char *spa_config_dir;
+extern kmutex_t spa_namespace_lock;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SPA_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/space_map.h b/usr/src/uts/common/fs/zfs/sys/space_map.h
new file mode 100644
index 0000000000..9f0cf83c9a
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/space_map.h
@@ -0,0 +1,144 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SPACE_MAP_H
+#define _SYS_SPACE_MAP_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/avl.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct space_map {
+ avl_tree_t sm_root; /* Root of the AVL tree */
+ uint64_t sm_start; /* Start of map (inclusive) */
+ uint64_t sm_end; /* End of map (exclusive) */
+ uint64_t sm_size; /* Size of map (end - start) */
+ uint64_t sm_shift; /* Unit shift */
+ uint64_t sm_space; /* Sum of all segments in the map */
+ kmutex_t *sm_lock; /* pointer to lock that protects map */
+} space_map_t;
+
+typedef struct space_seg {
+ avl_node_t ss_node; /* AVL node */
+ uint64_t ss_start; /* starting offset of this segment */
+ uint64_t ss_end; /* ending offset (non-inclusive) */
+} space_seg_t;
+
+typedef struct space_map_obj {
+ uint64_t smo_object; /* on-disk space map object */
+ uint64_t smo_objsize; /* size of the object */
+ uint64_t smo_alloc; /* space allocated from the map */
+} space_map_obj_t;
+
+/*
+ * debug entry
+ *
+ * 1 3 10 50
+ * ,---+--------+------------+---------------------------------.
+ * | 1 | action | syncpass | txg (lower bits) |
+ * `---+--------+------------+---------------------------------'
+ * 63 62 60 59 50 49 0
+ *
+ *
+ *
+ * non-debug entry
+ *
+ * 1 47 1 15
+ * ,-----------------------------------------------------------.
+ * | 0 | offset (sm_shift units) | type | run |
+ * `-----------------------------------------------------------'
+ * 63  62                        16   15   14               0
+ */
+
+/* All this stuff takes and returns bytes */
+#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, 15) + 1)
+#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, 15)
+#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1)
+#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1)
+#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, 47)
+#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, 47)
+#define SM_DEBUG_DECODE(x) BF64_DECODE(x, 63, 1)
+#define SM_DEBUG_ENCODE(x) BF64_ENCODE(x, 63, 1)
+
+#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 3)
+#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 3)
+
+#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10)
+#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10)
+
+#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50)
+#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50)
+
+#define SM_RUN_MAX SM_RUN_DECODE(~0ULL)
+
+#define SM_ALLOC 0x0
+#define SM_FREE 0x1
+
+/*
+ * The data for a given space map can be kept on blocks of any size.
+ * Larger blocks entail fewer i/o operations, but they also cause the
+ * DMU to keep more data in-core, and also to waste more i/o bandwidth
+ * when only a few blocks have changed since the last transaction group.
+ * This could use a lot more research, but for now, set the freelist
+ * block size to 4k (2^12).
+ */
+#define SPACE_MAP_BLOCKSHIFT 12
+
+#define SPACE_MAP_CHUNKSIZE (1<<20)
+
+typedef void space_map_func_t(space_map_t *sm, uint64_t start, uint64_t size);
+
+extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size,
+ uint64_t shift, kmutex_t *lp);
+extern void space_map_destroy(space_map_t *sm);
+extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size);
+extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size);
+extern int space_map_contains(space_map_t *sm, uint64_t start, uint64_t size);
+extern void space_map_vacate(space_map_t *sm,
+ space_map_func_t *func, space_map_t *mdest);
+extern void space_map_iterate(space_map_t *sm,
+ space_map_func_t *func, space_map_t *mdest);
+extern void space_map_merge(space_map_t *dest, space_map_t *src);
+extern void space_map_excise(space_map_t *sm, uint64_t start, uint64_t size);
+extern void space_map_union(space_map_t *smd, space_map_t *sms);
+
+extern int space_map_load(space_map_t *sm, space_map_obj_t *smo,
+ uint8_t maptype, objset_t *os, uint64_t end, uint64_t space);
+extern void space_map_sync(space_map_t *sm, space_map_t *dest,
+ space_map_obj_t *smo, uint8_t maptype, objset_t *os, dmu_tx_t *tx);
+extern void space_map_write(space_map_t *sm, space_map_obj_t *smo,
+ objset_t *os, dmu_tx_t *tx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SPACE_MAP_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/txg.h b/usr/src/uts/common/fs/zfs/sys/txg.h
new file mode 100644
index 0000000000..dae129c2e5
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/txg.h
@@ -0,0 +1,120 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_TXG_H
+#define _SYS_TXG_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define TXG_CONCURRENT_STATES 3 /* open, quiescing, syncing */
+#define TXG_SIZE 4 /* next power of 2 */
+#define TXG_MASK (TXG_SIZE - 1) /* mask for size */
+#define TXG_INITIAL TXG_SIZE /* initial txg */
+#define TXG_IDX (txg & TXG_MASK)
+
+#define TXG_WAIT 1ULL
+#define TXG_NOWAIT 2ULL
+
+typedef struct tx_cpu tx_cpu_t;
+
+typedef struct txg_handle {
+ tx_cpu_t *th_cpu;
+ uint64_t th_txg;
+} txg_handle_t;
+
+typedef struct txg_node {
+ struct txg_node *tn_next[TXG_SIZE];
+ uint8_t tn_member[TXG_SIZE];
+} txg_node_t;
+
+typedef struct txg_list {
+ kmutex_t tl_lock;
+ size_t tl_offset;
+ txg_node_t *tl_head[TXG_SIZE];
+} txg_list_t;
+
+struct dsl_pool;
+
+extern void txg_init(struct dsl_pool *dp, uint64_t txg);
+extern void txg_fini(struct dsl_pool *dp);
+extern void txg_sync_start(struct dsl_pool *dp);
+extern void txg_sync_stop(struct dsl_pool *dp);
+extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp);
+extern void txg_rele_to_quiesce(txg_handle_t *txghp);
+extern void txg_rele_to_sync(txg_handle_t *txghp);
+extern void txg_suspend(struct dsl_pool *dp);
+extern void txg_resume(struct dsl_pool *dp);
+
+/*
+ * Wait until the given transaction group has finished syncing.
+ * Try to make this happen as soon as possible (eg. kick off any
+ * necessary syncs immediately). If txg==0, wait for the currently open
+ * txg to finish syncing.
+ */
+extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg);
+
+/*
+ * Wait until the given transaction group, or one after it, is
+ * the open transaction group. Try to make this happen as soon
+ * as possible (eg. kick off any necessary syncs immediately).
+ * If txg == 0, wait for the next open txg.
+ */
+extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg);
+
+/*
+ * Returns TRUE if we are "backed up" waiting for the syncing
+ * transaction to complete; otherwise returns FALSE.
+ */
+extern int txg_stalled(struct dsl_pool *dp);
+
+/*
+ * Per-txg object lists.
+ */
+
+#define TXG_CLEAN(txg) ((txg) - 1)
+
+extern void txg_list_create(txg_list_t *tl, size_t offset);
+extern void txg_list_destroy(txg_list_t *tl);
+extern int txg_list_empty(txg_list_t *tl, uint64_t txg);
+extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg);
+extern void *txg_list_remove(txg_list_t *tl, uint64_t txg);
+extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg);
+extern int txg_list_member(txg_list_t *tl, void *p, uint64_t txg);
+extern void *txg_list_head(txg_list_t *tl, uint64_t txg);
+extern void *txg_list_next(txg_list_t *tl, void *p, uint64_t txg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_TXG_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/txg_impl.h b/usr/src/uts/common/fs/zfs/sys/txg_impl.h
new file mode 100644
index 0000000000..45a138afaa
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/txg_impl.h
@@ -0,0 +1,77 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_TXG_IMPL_H
+#define _SYS_TXG_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/txg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct tx_cpu {
+ kmutex_t tc_lock;
+ kcondvar_t tc_cv[TXG_SIZE];
+ uint64_t tc_count[TXG_SIZE];
+ char tc_pad[16];
+};
+
+typedef struct tx_state {
+ tx_cpu_t *tx_cpu; /* protects right to enter txg */
+ kmutex_t tx_sync_lock; /* protects tx_state_t */
+ krwlock_t tx_suspend;
+ uint64_t tx_open_txg; /* currently open txg id */
+ uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */
+ uint64_t tx_syncing_txg; /* currently syncing txg id */
+ uint64_t tx_synced_txg; /* last synced txg id */
+
+ uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */
+ uint64_t tx_quiesce_txg_waiting; /* txg we're waiting to open */
+
+ kcondvar_t tx_sync_more_cv;
+ kcondvar_t tx_sync_done_cv;
+ kcondvar_t tx_quiesce_more_cv;
+ kcondvar_t tx_quiesce_done_cv;
+ kcondvar_t tx_timeout_exit_cv;
+ kcondvar_t tx_exit_cv; /* wait for all threads to exit */
+
+ uint8_t tx_threads; /* number of threads */
+ uint8_t tx_exiting; /* set when we're exiting */
+
+ kthread_t *tx_sync_thread;
+ kthread_t *tx_quiesce_thread;
+ kthread_t *tx_timelimit_thread;
+} tx_state_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_TXG_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/uberblock.h b/usr/src/uts/common/fs/zfs/sys/uberblock.h
new file mode 100644
index 0000000000..93d936ae4b
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/uberblock.h
@@ -0,0 +1,50 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_UBERBLOCK_H
+#define _SYS_UBERBLOCK_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/vdev.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct uberblock uberblock_t;
+
+extern int uberblock_verify(uberblock_t *ub);
+extern int uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_UBERBLOCK_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h b/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h
new file mode 100644
index 0000000000..5bfcea097d
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h
@@ -0,0 +1,76 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_UBERBLOCK_IMPL_H
+#define _SYS_UBERBLOCK_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/uberblock.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * The uberblock version is incremented whenever an incompatible on-disk
+ * format change is made to the SPA, DMU, or ZAP.
+ *
+ * Note: the first two fields should never be moved. When a storage pool
+ * is opened, the uberblock must be read off the disk before the version
+ * can be checked. If the ub_version field is moved, we may not detect
+ * version mismatch. If the ub_magic field is moved, applications that
+ * expect the magic number in the first word won't work.
+ */
+
+#define UBERBLOCK_SHIFT (10)
+#define UBERBLOCK_SIZE (1ULL << UBERBLOCK_SHIFT)
+
+#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */
+
+#define UBERBLOCK_VERSION 1ULL
+
+struct uberblock {
+ uint64_t ub_magic; /* UBERBLOCK_MAGIC */
+ uint64_t ub_version; /* UBERBLOCK_VERSION */
+ uint64_t ub_txg; /* txg of last sync */
+ uint64_t ub_guid_sum; /* sum of all vdev guids */
+ uint64_t ub_timestamp; /* UTC time of last sync */
+ blkptr_t ub_rootbp; /* MOS objset_phys_t */
+};
+
+typedef struct uberblock_phys {
+ uberblock_t ubp_uberblock;
+ char ubp_pad[UBERBLOCK_SIZE - sizeof (uberblock_t) -
+ sizeof (zio_block_tail_t)];
+ zio_block_tail_t ubp_zbt;
+} uberblock_phys_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_UBERBLOCK_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/unique.h b/usr/src/uts/common/fs/zfs/sys/unique.h
new file mode 100644
index 0000000000..c8c177e3ca
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/unique.h
@@ -0,0 +1,56 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_UNIQUE_H
+#define _SYS_UNIQUE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* The number of significant bits in each unique value. */
+#define UNIQUE_BITS 56
+
+void unique_init(void);
+
+/* Return a new unique value. */
+uint64_t unique_create(void);
+
+/* Return a unique value, which equals the one passed in if possible. */
+uint64_t unique_insert(uint64_t value);
+
+/* Indicate that this value no longer needs to be uniquified against. */
+void unique_remove(uint64_t value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_UNIQUE_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h
new file mode 100644
index 0000000000..4113ff2ca6
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h
@@ -0,0 +1,135 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VDEV_H
+#define _SYS_VDEV_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+#include <sys/space_map.h>
+#include <sys/fs/zfs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Vdev knobs.
+ */
+typedef struct vdev_knob {
+ char *vk_name; /* knob name */
+ char *vk_desc; /* knob description */
+ uint64_t vk_min; /* minimum legal value */
+ uint64_t vk_max; /* maximum legal value */
+ uint64_t vk_default; /* default value */
+ size_t vk_offset; /* offset into vdev_t */
+} vdev_knob_t;
+
+/*
+ * Fault injection modes.
+ */
+#define VDEV_FAULT_NONE 0
+#define VDEV_FAULT_RANDOM 1
+#define VDEV_FAULT_COUNT 2
+
+extern int vdev_open(vdev_t *);
+extern void vdev_close(vdev_t *);
+extern int vdev_create(vdev_t *, uint64_t txg);
+extern void vdev_init(vdev_t *, uint64_t txg);
+extern void vdev_reopen(vdev_t *, zio_t **zq);
+
+extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
+extern vdev_t *vdev_lookup_by_path(vdev_t *vd, const char *path);
+extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
+extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size);
+extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size);
+extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
+ int scrub_done);
+
+extern const char *vdev_description(vdev_t *vd);
+
+extern void vdev_metaslab_init(vdev_t *vd, uint64_t txg);
+extern void vdev_metaslab_fini(vdev_t *vd);
+
+extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
+extern void vdev_stat_update(zio_t *zio);
+extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type,
+ boolean_t complete);
+extern void vdev_checksum_error(zio_t *zio, vdev_t *vd);
+extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec);
+extern void vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux);
+
+extern void vdev_space_update(vdev_t *vd, uint64_t space_delta,
+ uint64_t alloc_delta);
+
+extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
+
+extern void vdev_io_start(zio_t *zio);
+extern void vdev_io_done(zio_t *zio);
+
+extern int vdev_online(spa_t *spa, const char *path);
+extern int vdev_offline(spa_t *spa, const char *path);
+
+extern int vdev_error_setup(spa_t *spa, const char *path, int mode, int mask,
+ uint64_t arg);
+extern int vdev_error_inject(vdev_t *vd, zio_t *zio);
+extern int vdev_is_dead(vdev_t *vd);
+
+extern void vdev_cache_init(vdev_t *vd);
+extern void vdev_cache_fini(vdev_t *vd);
+extern int vdev_cache_read(zio_t *zio);
+extern void vdev_cache_write(zio_t *zio);
+
+extern void vdev_queue_init(vdev_t *vd);
+extern void vdev_queue_fini(vdev_t *vd);
+extern zio_t *vdev_queue_io(zio_t *zio);
+extern void vdev_queue_io_done(zio_t *zio);
+
+extern vdev_knob_t *vdev_knob_next(vdev_knob_t *vk);
+
+extern void vdev_config_dirty(vdev_t *vd);
+extern void vdev_config_clean(vdev_t *vd);
+
+extern nvlist_t *vdev_config_generate(vdev_t *vd, int getstats);
+
+/*
+ * Label routines
+ */
+struct uberblock;
+extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
+extern nvlist_t *vdev_label_read_config(vdev_t *vd);
+extern void vdev_uberblock_load(zio_t *zio, vdev_t *vd, struct uberblock *ub);
+int vdev_label_init(vdev_t *vd, uint64_t create_txg);
+extern int spa_sync_labels(spa_t *spa, uint64_t txg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_disk.h b/usr/src/uts/common/fs/zfs/sys/vdev_disk.h
new file mode 100644
index 0000000000..95536a77db
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_disk.h
@@ -0,0 +1,52 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VDEV_DISK_H
+#define _SYS_VDEV_DISK_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/vdev.h>
+#ifdef _KERNEL
+#include <sys/sunldi.h>
+#include <sys/sunddi.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct vdev_disk {
+ ddi_devid_t vd_devid;
+ char *vd_minor;
+ ldi_handle_t vd_lh;
+} vdev_disk_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_DISK_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_file.h b/usr/src/uts/common/fs/zfs/sys/vdev_file.h
new file mode 100644
index 0000000000..cd49673577
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_file.h
@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VDEV_FILE_H
+#define _SYS_VDEV_FILE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/vdev.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct vdev_file {
+ vnode_t *vf_vnode;
+} vdev_file_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_FILE_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
new file mode 100644
index 0000000000..4ae3467619
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -0,0 +1,287 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VDEV_IMPL_H
+#define _SYS_VDEV_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/avl.h>
+#include <sys/dmu.h>
+#include <sys/metaslab.h>
+#include <sys/nvpair.h>
+#include <sys/space_map.h>
+#include <sys/vdev.h>
+#include <sys/dkio.h>
+#include <sys/uberblock_impl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Virtual device descriptors.
+ *
+ * All storage pool operations go through the virtual device framework,
+ * which provides data replication and I/O scheduling.
+ */
+
+/*
+ * Forward declarations that lots of things need.
+ */
+typedef struct vdev_queue vdev_queue_t;
+typedef struct vdev_cache vdev_cache_t;
+typedef struct vdev_cache_entry vdev_cache_entry_t;
+
+/*
+ * Virtual device operations
+ */
+typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *ashift);
+typedef void vdev_close_func_t(vdev_t *vd);
+typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
+typedef void vdev_io_start_func_t(zio_t *zio);
+typedef void vdev_io_done_func_t(zio_t *zio);
+typedef void vdev_state_change_func_t(vdev_t *vd, int, int);
+
+typedef struct vdev_ops {
+ vdev_open_func_t *vdev_op_open;
+ vdev_close_func_t *vdev_op_close;
+ vdev_asize_func_t *vdev_op_asize;
+ vdev_io_start_func_t *vdev_op_io_start;
+ vdev_io_done_func_t *vdev_op_io_done;
+ vdev_state_change_func_t *vdev_op_state_change;
+ char vdev_op_type[16];
+ boolean_t vdev_op_leaf;
+} vdev_ops_t;
+
+/*
+ * Virtual device properties
+ */
+struct vdev_cache_entry {
+ char *ve_data;
+ uint64_t ve_offset;
+ uint64_t ve_lastused;
+ avl_node_t ve_offset_node;
+ avl_node_t ve_lastused_node;
+ uint32_t ve_hits;
+ uint16_t ve_missed_update;
+ zio_t *ve_fill_io;
+};
+
+struct vdev_cache {
+ uint64_t vc_size;
+ uint64_t vc_bshift;
+ uint64_t vc_blocksize;
+ uint64_t vc_max;
+ avl_tree_t vc_offset_tree;
+ avl_tree_t vc_lastused_tree;
+ kmutex_t vc_lock;
+};
+
+struct vdev_queue {
+ uint64_t vq_min_pending;
+ uint64_t vq_max_pending;
+ uint64_t vq_agg_limit;
+ uint64_t vq_time_shift;
+ uint64_t vq_ramp_rate;
+ avl_tree_t vq_deadline_tree;
+ avl_tree_t vq_read_tree;
+ avl_tree_t vq_write_tree;
+ avl_tree_t vq_pending_tree;
+ kmutex_t vq_lock;
+};
+
+/*
+ * Virtual device descriptor
+ */
+struct vdev {
+ /*
+ * Common to all vdev types.
+ */
+ uint64_t vdev_id; /* child number in vdev parent */
+ uint64_t vdev_guid; /* unique ID for this vdev */
+ uint64_t vdev_guid_sum; /* self guid + all child guids */
+ uint64_t vdev_asize; /* allocatable device capacity */
+ uint64_t vdev_ashift; /* block alignment shift */
+ uint64_t vdev_state; /* see VDEV_STATE_* #defines */
+ vdev_ops_t *vdev_ops; /* vdev operations */
+ spa_t *vdev_spa; /* spa for this vdev */
+ void *vdev_tsd; /* type-specific data */
+ vdev_t *vdev_top; /* top-level vdev */
+ vdev_t *vdev_parent; /* parent vdev */
+ vdev_t **vdev_child; /* array of children */
+ uint64_t vdev_children; /* number of children */
+ space_map_t vdev_dtl_map; /* dirty time log in-core state */
+ space_map_t vdev_dtl_scrub; /* DTL for scrub repair writes */
+ vdev_stat_t vdev_stat; /* virtual device statistics */
+
+ /*
+ * Top-level vdev state.
+ */
+ uint64_t vdev_ms_array; /* metaslab array object */
+ uint64_t vdev_ms_shift; /* metaslab size shift */
+ uint64_t vdev_ms_count; /* number of metaslabs */
+ metaslab_group_t *vdev_mg; /* metaslab group */
+ metaslab_t **vdev_ms; /* metaslab array */
+ space_map_obj_t *vdev_smo; /* metaslab space map array */
+ txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */
+ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
+ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
+ uint8_t vdev_dirty[TXG_SIZE]; /* per-txg dirty flags */
+ int vdev_is_dirty; /* on config dirty list? */
+ list_node_t vdev_dirty_node; /* config dirty list */
+ zio_t *vdev_io_retry; /* I/O retry list */
+ list_t vdev_io_pending; /* I/O pending list */
+
+ /*
+ * Leaf vdev state.
+ */
+ uint64_t vdev_psize; /* physical device capacity */
+ space_map_obj_t vdev_dtl; /* dirty time log on-disk state */
+ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */
+ char *vdev_path; /* vdev path (if any) */
+ char *vdev_devid; /* vdev devid (if any) */
+ uint64_t vdev_fault_arg; /* fault injection parameter */
+ int vdev_fault_mask; /* zio types to fault */
+ uint8_t vdev_fault_mode; /* fault injection mode */
+ uint8_t vdev_cache_active; /* vdev_cache and vdev_queue */
+ uint8_t vdev_offline; /* device taken offline? */
+ uint8_t vdev_detached; /* device detached? */
+ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
+ vdev_cache_t vdev_cache; /* physical block cache */
+
+ /*
+ * For DTrace to work in userland (libzpool) context, these fields must
+ * remain at the end of the structure. DTrace will use the kernel's
+ * CTF definition for 'struct vdev', and since the size of a kmutex_t is
+ * larger in userland, the offsets for the rest of the fields would be
+ * incorrect.
+ */
+ kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */
+ kmutex_t vdev_dirty_lock; /* vdev_dirty[] */
+ kmutex_t vdev_io_lock; /* vdev_io_pending list */
+ kcondvar_t vdev_io_cv; /* vdev_io_pending list empty? */
+ kmutex_t vdev_stat_lock; /* vdev_stat */
+};
+
+#define VDEV_SKIP_SIZE (8 << 10)
+#define VDEV_BOOT_HEADER_SIZE (8 << 10)
+#define VDEV_PHYS_SIZE (112 << 10)
+#define VDEV_UBERBLOCKS ((128 << 10) >> UBERBLOCK_SHIFT)
+
+#define VDEV_BOOT_MAGIC 0x2f5b007b10c /* ZFS boot block */
+#define VDEV_BOOT_VERSION 1 /* version number */
+
+typedef struct vdev_boot_header {
+ uint64_t vb_magic; /* VDEV_BOOT_MAGIC */
+ uint64_t vb_version; /* VDEV_BOOT_VERSION */
+ uint64_t vb_offset; /* start offset (bytes) */
+ uint64_t vb_size; /* size (bytes) */
+ char vb_pad[VDEV_BOOT_HEADER_SIZE - 4 * sizeof (uint64_t)];
+} vdev_boot_header_t;
+
+typedef struct vdev_phys {
+ char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)];
+ zio_block_tail_t vp_zbt;
+} vdev_phys_t;
+
+typedef struct vdev_label {
+ char vl_pad[VDEV_SKIP_SIZE]; /* 8K */
+ vdev_boot_header_t vl_boot_header; /* 8K */
+ vdev_phys_t vl_vdev_phys; /* 120K */
+ uberblock_phys_t vl_uberblock[VDEV_UBERBLOCKS]; /* 128K */
+} vdev_label_t; /* 256K total */
+
+/*
+ * Size and offset of embedded boot loader region on each label.
+ * The total size of the first two labels plus the boot area is 4MB.
+ */
+#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t))
+#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */
+
+/*
+ * vdev_dirty[] flags
+ */
+#define VDD_ALLOC 0x01 /* allocated from in this txg */
+#define VDD_FREE 0x02 /* freed to in this txg */
+#define VDD_ADD 0x04 /* added to the pool in this txg */
+#define VDD_DTL 0x08 /* dirty time log entry in this txg */
+
+/*
+ * Size of label regions at the start and end of each leaf device.
+ */
+#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
+#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t))
+#define VDEV_LABELS 4
+
+#define VDEV_ALLOC_LOAD 0
+#define VDEV_ALLOC_ADD 1
+
+/*
+ * Allocate or free a vdev
+ */
+extern vdev_t *vdev_alloc(spa_t *spa, nvlist_t *config, vdev_t *parent,
+ uint_t id, int alloctype);
+extern void vdev_free(vdev_t *vd);
+
+/*
+ * Add or remove children and parents
+ */
+extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd);
+extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd);
+extern void vdev_compact_children(vdev_t *pvd);
+extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops);
+extern void vdev_remove_parent(vdev_t *cvd);
+
+/*
+ * vdev sync load and sync
+ */
+extern int vdev_load(vdev_t *vd, int import);
+extern void vdev_sync(vdev_t *vd, uint64_t txg);
+extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
+extern void vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg);
+
+/*
+ * Available vdev types.
+ */
+extern vdev_ops_t vdev_root_ops;
+extern vdev_ops_t vdev_mirror_ops;
+extern vdev_ops_t vdev_replacing_ops;
+extern vdev_ops_t vdev_raidz_ops;
+extern vdev_ops_t vdev_disk_ops;
+extern vdev_ops_t vdev_file_ops;
+extern vdev_ops_t vdev_missing_ops;
+
+/*
+ * Common asize function
+ */
+extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zap.h b/usr/src/uts/common/fs/zfs/sys/zap.h
new file mode 100644
index 0000000000..94ad0ffebe
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zap.h
@@ -0,0 +1,353 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZAP_H
+#define _SYS_ZAP_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * ZAP - ZFS Attribute Processor
+ *
+ * The ZAP is a module which sits on top of the DMU (Data Management
+ * Unit) and implements a higher-level storage primitive using DMU
+ * objects. Its primary consumer is the ZPL (ZFS Posix Layer).
+ *
+ * A "zapobj" is a DMU object which the ZAP uses to store attributes.
+ * Users should use only zap routines to access a zapobj - they should
+ * not access the DMU object directly using DMU routines.
+ *
+ * The attributes stored in a zapobj are name-value pairs. The name is
+ * a zero-terminated string of up to 256 bytes (including terminating
+ * NULL). The value is an array of integers (whose length is limited
+ * only by the size of the zapobj). The integers may be 1, 2, 4, or 8
+ * bytes long. Note that an 8-byte integer value can be used to store
+ * the location (object number) of another dmu object (which may be
+ * itself a zapobj). Note that you can use a zero-length attribute to
+ * store a single bit of information - the attribute is present or not.
+ *
+ * The ZAP routines are thread-safe. However, you must observe the
+ * DMU's restriction that a transaction may not be operated on
+ * concurrently.
+ *
+ * Any of the routines that return an int may return an I/O error (EIO
+ * or ECHECKSUM).
+ *
+ *
+ * Implementation / Performance Notes:
+ *
+ * The ZAP is intended to operate most efficiently on attributes with
+ * short (23 bytes or less) names and short (23 bytes or less) values.
+ * The ZAP should be efficient enough so that the user does not need to
+ * cache these attributes.
+ *
+ * Using extremely long (~256 bytes or more) attribute names or
+ * values will result in poor performance, due to the memcpy from the
+ * user's buffer into the ZAP object. This penalty can be avoided by
+ * creating an integer-type attribute to store an object number, and
+ * accessing that object using the DMU directly.
+ *
+ * The ZAP's locking scheme makes its routines thread-safe. Operations
+ * on different zapobjs will be processed concurrently. Operations on
+ * the same zapobj which only read data will be processed concurrently.
+ * Operations on the same zapobj which modify data will be processed
+ * concurrently when there are many attributes in the zapobj (because
+ * the ZAP uses per-block locking - more than 32 * (number of cpus)
+ * small attributes will suffice).
+ */
+
+/*
+ * We're using zero-terminated byte strings (ie. ASCII or UTF-8 C
+ * strings) for the names of attributes, rather than a byte string
+ * bounded by an explicit length. If some day we want to support names
+ * in character sets which have embedded zeros (eg. UTF-16, UTF-32),
+ * we'll have to add routines for using length-bounded strings.
+ */
+
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Create a new zapobj with no attributes and return its object number.
+ */
+uint64_t zap_create(objset_t *ds, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+
+/*
+ * Create a new zapobj with no attributes from the given (unallocated)
+ * object number.
+ */
+int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+
+/*
+ * The zapobj passed in must be a valid ZAP object for all of the
+ * following routines.
+ */
+
+/*
+ * Destroy this zapobj and all its attributes.
+ *
+ * Frees the object number using dmu_object_free.
+ */
+int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx);
+
+/*
+ * Manipulate attributes.
+ *
+ * 'integer_size' is in bytes, and must be 1, 2, 4, or 8.
+ */
+
+/*
+ * Retrieve the contents of the attribute with the given name.
+ *
+ * If the requested attribute does not exist, the call will fail and
+ * return ENOENT.
+ *
+ * If 'integer_size' is smaller than the attribute's integer size, the
+ * call will fail and return EINVAL.
+ *
+ * If 'integer_size' is equal to or larger than the attribute's integer
+ * size, the call will succeed and return 0. When converting to a
+ * larger integer size, the integers will be treated as unsigned (ie. no
+ * sign-extension will be performed).
+ *
+ * 'num_integers' is the length (in integers) of 'buf'.
+ *
+ * If the attribute is longer than the buffer, as many integers as will
+ * fit will be transferred to 'buf'. If the entire attribute was not
+ * transferred, the call will return EOVERFLOW.
+ */
+int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf);
+
+/*
+ * Create an attribute with the given name and value.
+ *
+ * If an attribute with the given name already exists, the call will
+ * fail and return EEXIST.
+ */
+int zap_add(objset_t *ds, uint64_t zapobj, const char *name,
+ int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx);
+
+/*
+ * Set the attribute with the given name to the given value. If an
+ * attribute with the given name does not exist, it will be created. If
+ * an attribute with the given name already exists, the previous value
+ * will be overwritten. The integer_size may be different from the
+ * existing attribute's integer size, in which case the attribute's
+ * integer size will be updated to the new value.
+ */
+int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
+
+/*
+ * Get the length (in integers) and the integer size of the specified
+ * attribute.
+ *
+ * If the requested attribute does not exist, the call will fail and
+ * return ENOENT.
+ */
+int zap_length(objset_t *ds, uint64_t zapobj, const char *name,
+ uint64_t *integer_size, uint64_t *num_integers);
+
+/*
+ * Remove the specified attribute.
+ *
+ * If the specified attribute does not exist, the call will fail and
+ * return ENOENT.
+ */
+int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx);
+
+/*
+ * Returns (in *count) the number of attributes in the specified zap
+ * object.
+ */
+int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count);
+
+
+/*
+ * Returns (in name) the name of the entry whose value
+ * (za_first_integer) is value, or ENOENT if not found. The string
+ * pointed to by name must be at least 256 bytes long.
+ */
+int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name);
+
+typedef struct zap_cursor {
+ /* This structure is opaque! */
+ objset_t *zc_objset;
+ uint64_t zc_zapobj;
+ uint64_t zc_hash;
+ uint32_t zc_cd;
+} zap_cursor_t;
+
+typedef struct {
+ int za_integer_length;
+ uint64_t za_num_integers;
+ uint64_t za_first_integer; /* no sign extension for <8byte ints */
+ char za_name[MAXNAMELEN];
+} zap_attribute_t;
+
+/*
+ * The interface for listing all the attributes of a zapobj can be
+ * thought of as cursor moving down a list of the attributes one by
+ * one. The cookie returned by the zap_cursor_serialize routine is
+ * persistent across system calls (and across reboot, even).
+ */
+
+/*
+ * Initialize a zap cursor, pointing to the "first" attribute of the
+ * zapobj.
+ */
+void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj);
+
+/*
+ * Get the attribute currently pointed to by the cursor. Returns
+ * ENOENT if at the end of the attributes.
+ */
+int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za);
+
+/*
+ * Advance the cursor to the next attribute.
+ */
+void zap_cursor_advance(zap_cursor_t *zc);
+
+/*
+ * Get a persistent cookie pointing to the current position of the zap
+ * cursor. The low 4 bits in the cookie are always zero, and thus can
+ * be used to differentiate a serialized cookie from a different type
+ * of value. The cookie will be less than 2^32 as long as there are
+ * fewer than 2^22 (4.2 million) entries in the zap object.
+ */
+uint64_t zap_cursor_serialize(zap_cursor_t *zc);
+
+/*
+ * Initialize a zap cursor pointing to the position recorded by
+ * zap_cursor_serialize (in the "serialized" argument). You can also
+ * use a "serialized" argument of 0 to start at the beginning of the
+ * zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to
+ * zap_cursor_init(...).)
+ */
+void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds,
+ uint64_t zapobj, uint64_t serialized);
+
+
+#define ZAP_HISTOGRAM_SIZE 10
+
+typedef struct zap_stats {
+ /*
+ * Size of the pointer table (in number of entries).
+ * This is always a power of 2, or zero if it's a microzap.
+ * In general, it should be considerably greater than zs_num_leafs.
+ */
+ uint64_t zs_ptrtbl_len;
+
+ uint64_t zs_blocksize; /* size of zap blocks */
+
+ uint64_t zs_num_leafs; /* The number of leaf blocks */
+
+ uint64_t zs_num_entries; /* The number of zap entries */
+
+ /*
+ * The number of blocks used. Note that some blocks may be
+ * wasted because old ptrtbl's and large name/value blocks are
+ * not reused. (Although their space is reclaimed, we don't
+ * reuse those offsets in the object.)
+ */
+ uint64_t zs_num_blocks;
+
+ /* The number of blocks used for large names or values */
+ uint64_t zs_num_blocks_large;
+
+ /*
+ * Histograms. For all histograms, the last index
+ * (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater
+ * than what can be represented. For example
+ * zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE-1] is the number
+ * of leafs with more than 45 entries.
+ */
+
+ /*
+ * zs_leafs_with_2n_pointers[n] is the number of leafs with
+ * 2^n pointers to it.
+ */
+ uint64_t zs_leafs_with_2n_pointers[ZAP_HISTOGRAM_SIZE];
+
+ /*
+ * zs_leafs_with_n_chained[n] is the number of leafs with n
+ * chained blocks. zs_leafs_with_n_chained[0] (leafs with no
+ * chained blocks) should be very close to zs_num_leafs.
+ */
+ uint64_t zs_leafs_with_n_chained[ZAP_HISTOGRAM_SIZE];
+
+ /*
+ * zs_blocks_with_n5_entries[n] is the number of leafs with
+ * [n*5, (n+1)*5) entries. In the current implementation, there
+ * can be at most 55 entries in any block, but there may be
+ * fewer if the name or value is large, or the block is not
+ * completely full.
+ */
+ uint64_t zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE];
+
+ /*
+ * zs_blocks_n_tenths_full[n] is the number of leafs whose
+ * fullness is in the range [n/10, (n+1)/10).
+ */
+ uint64_t zs_blocks_n_tenths_full[ZAP_HISTOGRAM_SIZE];
+
+ /*
+ * zs_entries_using_n_chunks[n] is the number of entries which
+ * consume n 24-byte chunks. (Note, large names/values only use
+ * one chunk, but contribute to zs_num_blocks_large.)
+ */
+ uint64_t zs_entries_using_n_chunks[ZAP_HISTOGRAM_SIZE];
+
+ /*
+ * zs_buckets_with_n_entries[n] is the number of buckets (each
+ * leaf has 64 buckets) with n entries.
+ * zs_buckets_with_n_entries[1] should be very close to
+ * zs_num_entries.
+ */
+ uint64_t zs_buckets_with_n_entries[ZAP_HISTOGRAM_SIZE];
+} zap_stats_t;
+
+/*
+ * Get statistics about a ZAP object. Note: you need to be aware of the
+ * internal implementation of the ZAP to correctly interpret some of the
+ * statistics. This interface shouldn't be relied on unless you really
+ * know what you're doing.
+ */
+int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZAP_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zap_impl.h b/usr/src/uts/common/fs/zfs/sys/zap_impl.h
new file mode 100644
index 0000000000..6593e20a14
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zap_impl.h
@@ -0,0 +1,190 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZAP_IMPL_H
+#define _SYS_ZAP_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zap.h>
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZAP_MAGIC 0x2F52AB2AB
+
+#define ZAP_BLOCK_SHIFT 17
+
+#define ZAP_MAXCD (uint32_t)(-1)
+#define ZAP_HASHBITS 28
+#define MZAP_ENT_LEN 64
+#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2)
+#define MZAP_MAX_BLKSHIFT ZAP_BLOCK_SHIFT
+#define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT)
+
+typedef struct mzap_ent_phys {
+ uint64_t mze_value;
+ uint32_t mze_cd;
+ uint16_t mze_pad; /* in case we want to chain them someday */
+ char mze_name[MZAP_NAME_LEN];
+} mzap_ent_phys_t;
+
+typedef struct mzap_phys {
+ uint64_t mz_block_type; /* ZBT_MICRO */
+ uint64_t mz_salt;
+ uint64_t mz_pad[6];
+ mzap_ent_phys_t mz_chunk[1];
+ /* actually variable size depending on block size */
+} mzap_phys_t;
+
+typedef struct mzap_ent {
+ avl_node_t mze_node;
+ int mze_chunkid;
+ uint64_t mze_hash;
+ mzap_ent_phys_t mze_phys;
+} mzap_ent_t;
+
+
+/*
+ * The (fat) zap is stored in one object. It is an array of
+ * 1<<ZAP_BLOCK_SHIFT byte blocks. The layout looks like one of:
+ *
+ * ptrtbl fits in first block:
+ * [zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ...
+ *
+ * ptrtbl too big for first block:
+ * [zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ...
+ *
+ */
+
+struct dmu_buf;
+struct zap_leaf;
+
+#define ZBT_LEAF ((1ULL << 63) + 0)
+#define ZBT_HEADER ((1ULL << 63) + 1)
+#define ZBT_MICRO ((1ULL << 63) + 3)
+/* any other values are ptrtbl blocks */
+
+/* 1/2 the block size */
+#define ZAP_PTRTBL_MIN_SHIFT (ZAP_BLOCK_SHIFT - 3 - 1)
+
+/*
+ * TAKE NOTE:
+ * If zap_phys_t is modified, zap_byteswap() must be modified.
+ */
+typedef struct zap_phys {
+ uint64_t zap_block_type; /* ZBT_HEADER */
+ uint64_t zap_magic; /* ZAP_MAGIC */
+
+ struct zap_table_phys {
+ uint64_t zt_blk; /* starting block number */
+ uint64_t zt_numblks; /* number of blocks */
+ uint64_t zt_shift; /* bits to index it */
+ uint64_t zt_nextblk; /* next (larger) copy start block */
+ uint64_t zt_blks_copied; /* number source blocks copied */
+ } zap_ptrtbl;
+
+ uint64_t zap_freeblk; /* the next free block */
+ uint64_t zap_num_leafs; /* number of leafs */
+ uint64_t zap_num_entries; /* number of entries */
+ uint64_t zap_salt; /* salt to stir into hash function */
+ uint64_t zap_pad[8181];
+ uint64_t zap_leafs[1 << ZAP_PTRTBL_MIN_SHIFT];
+} zap_phys_t;
+
+typedef struct zap_table_phys zap_table_phys_t;
+
+typedef struct zap {
+ objset_t *zap_objset;
+ uint64_t zap_object;
+ struct dmu_buf *zap_dbuf;
+ krwlock_t zap_rwlock;
+ int zap_ismicro;
+ uint64_t zap_salt;
+ union {
+ struct {
+ zap_phys_t *zap_phys;
+
+ /*
+ * zap_num_entries_mtx protects
+ * zap_num_entries
+ */
+ kmutex_t zap_num_entries_mtx;
+ } zap_fat;
+ struct {
+ mzap_phys_t *zap_phys;
+ int16_t zap_num_entries;
+ int16_t zap_num_chunks;
+ int16_t zap_alloc_next;
+ avl_tree_t zap_avl;
+ } zap_micro;
+ } zap_u;
+} zap_t;
+
+#define zap_f zap_u.zap_fat
+#define zap_m zap_u.zap_micro
+
+uint64_t zap_hash(zap_t *zap, const char *name);
+int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
+ krw_t lti, int fatreader, zap_t **zapp);
+void zap_unlockdir(zap_t *zap);
+void zap_pageout(dmu_buf_t *db, void *vmzap);
+
+void zap_print(zap_t *);
+struct zap_leaf *zap_create_leaf(zap_t *zd, dmu_tx_t *tx);
+void zap_destroy_leaf(zap_t *zap, struct zap_leaf *l, dmu_tx_t *tx);
+uint64_t zap_allocate_blocks(zap_t *zap, int nblocks, dmu_tx_t *tx);
+
+#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
+
+void fzap_byteswap(void *buf, size_t size);
+int fzap_count(zap_t *zap, uint64_t *count);
+int fzap_lookup(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf);
+int fzap_add(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx);
+int fzap_update(zap_t *zap, const char *name,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
+int fzap_length(zap_t *zap, const char *name,
+ uint64_t *integer_size, uint64_t *num_integers);
+int fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx);
+int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za);
+void fzap_get_stats(zap_t *zap, zap_stats_t *zs);
+
+int fzap_add_cd(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers,
+ const void *val, uint32_t cd, dmu_tx_t *tx, struct zap_leaf **lp);
+void fzap_upgrade(zap_t *zap, dmu_tx_t *tx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZAP_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zap_leaf.h b/usr/src/uts/common/fs/zfs/sys/zap_leaf.h
new file mode 100644
index 0000000000..aee70ae633
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zap_leaf.h
@@ -0,0 +1,204 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZAP_LEAF_H
+#define _SYS_ZAP_LEAF_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct zap;
+
+#define ZAP_LEAF_MAGIC 0x2AB1EAF
+
+/* chunk size = 24 bytes */
+
+#define ZAP_LEAF_NUMCHUNKS 5118
+#define ZAP_LEAF_ARRAY_BYTES 21
+#define ZAP_LEAF_HASH_SHIFT 12
+#define ZAP_LEAF_HASH_NUMENTRIES (1 << ZAP_LEAF_HASH_SHIFT)
+#define ZAP_LLA_DATA_BYTES ((1 << ZAP_BLOCK_SHIFT) - 16)
+
+typedef enum zap_entry_type {
+ ZAP_LEAF_FREE = 253,
+ ZAP_LEAF_ENTRY = 252,
+ ZAP_LEAF_ARRAY = 251,
+ ZAP_LEAF_TYPE_MAX = 250
+} zap_entry_type_t;
+
+/*
+ * TAKE NOTE:
+ * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified.
+ */
+typedef struct zap_leaf_phys {
+ struct zap_leaf_header {
+ uint64_t lhr_block_type; /* ZBT_LEAF */
+ uint64_t lhr_next; /* next block in leaf chain */
+ uint64_t lhr_prefix;
+ uint32_t lhr_magic; /* ZAP_LEAF_MAGIC */
+ uint16_t lhr_nfree; /* number free chunks */
+ uint16_t lhr_nentries; /* number of entries */
+ uint16_t lhr_prefix_len;
+
+#define lh_block_type l_phys->l_hdr.lhr_block_type
+#define lh_magic l_phys->l_hdr.lhr_magic
+#define lh_next l_phys->l_hdr.lhr_next
+#define lh_prefix l_phys->l_hdr.lhr_prefix
+#define lh_nfree l_phys->l_hdr.lhr_nfree
+#define lh_prefix_len l_phys->l_hdr.lhr_prefix_len
+#define lh_nentries l_phys->l_hdr.lhr_nentries
+
+/* above is accessible to zap, below is zap_leaf private */
+
+ uint16_t lh_freelist; /* chunk head of free list */
+ uint8_t lh_pad2[12];
+ } l_hdr; /* 2 24-byte chunks */
+
+ uint16_t l_hash[ZAP_LEAF_HASH_NUMENTRIES];
+ /* 170 24-byte chunks plus 16 bytes leftover space */
+
+ union zap_leaf_chunk {
+ struct zap_leaf_entry {
+ uint8_t le_type; /* always ZAP_LEAF_ENTRY */
+ uint8_t le_int_size; /* size of ints */
+ uint16_t le_next; /* next entry in hash chain */
+ uint16_t le_name_chunk; /* first chunk of the name */
+ uint16_t le_name_length; /* bytes in name, incl null */
+ uint16_t le_value_chunk; /* first chunk of the value */
+ uint16_t le_value_length; /* value length in ints */
+ uint32_t le_cd; /* collision differentiator */
+ uint64_t le_hash; /* hash value of the name */
+ } l_entry;
+ struct zap_leaf_array {
+ uint8_t la_type;
+ uint8_t la_array[ZAP_LEAF_ARRAY_BYTES];
+ uint16_t la_next; /* next blk or CHAIN_END */
+ } l_array;
+ struct zap_leaf_free {
+ uint8_t lf_type; /* always ZAP_LEAF_FREE */
+ uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES];
+ uint16_t lf_next; /* next in free list, or CHAIN_END */
+ } l_free;
+ } l_chunk[ZAP_LEAF_NUMCHUNKS];
+} zap_leaf_phys_t;
+
+typedef struct zap_leaf {
+ krwlock_t l_rwlock; /* only used on head of chain */
+ uint64_t l_blkid; /* 1<<ZAP_BLOCK_SHIFT byte block off */
+ struct zap_leaf *l_next; /* next in chain */
+ dmu_buf_t *l_dbuf;
+ zap_leaf_phys_t *l_phys;
+} zap_leaf_t;
+
+
+typedef struct zap_entry_handle {
+ /* below is set by zap_leaf.c and is public to zap.c */
+ uint64_t zeh_num_integers;
+ uint64_t zeh_hash;
+ uint32_t zeh_cd;
+ uint8_t zeh_integer_size;
+
+ /* below is private to zap_leaf.c */
+ uint16_t zeh_fakechunk;
+ uint16_t *zeh_chunkp;
+ zap_leaf_t *zeh_head_leaf;
+ zap_leaf_t *zeh_found_leaf;
+} zap_entry_handle_t;
+
+/*
+ * Return a handle to the named entry, or ENOENT if not found. The hash
+ * value must equal zap_hash(name).
+ */
+extern int zap_leaf_lookup(zap_leaf_t *l,
+ const char *name, uint64_t h, zap_entry_handle_t *zeh);
+
+/*
+ * Return a handle to the entry with this hash+cd, or the entry with the
+ * next closest hash+cd.
+ */
+extern int zap_leaf_lookup_closest(zap_leaf_t *l,
+ uint64_t hash, uint32_t cd, zap_entry_handle_t *zeh);
+
+/*
+ * Read the first num_integers in the attribute. Integer size
+ * conversion will be done without sign extension. Return EINVAL if
+ * integer_size is too small. Return EOVERFLOW if there are more than
+ * num_integers in the attribute.
+ */
+extern int zap_entry_read(const zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, void *buf);
+
+extern int zap_entry_read_name(const zap_entry_handle_t *zeh,
+ uint16_t buflen, char *buf);
+
+/*
+ * Replace the value of an existing entry.
+ *
+ * zap_entry_update may fail if it runs out of space (ENOSPC).
+ */
+extern int zap_entry_update(zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, const void *buf);
+
+/*
+ * Remove an entry.
+ */
+extern void zap_entry_remove(zap_entry_handle_t *zeh);
+
+/*
+ * Create an entry. An equal entry must not exist, and this entry must
+ * belong in this leaf (according to its hash value). Fills in the
+ * entry handle on success. Returns 0 on success or ENOSPC on failure.
+ */
+extern int zap_entry_create(zap_leaf_t *l,
+ const char *name, uint64_t h, uint32_t cd,
+ uint8_t integer_size, uint64_t num_integers, const void *buf,
+ zap_entry_handle_t *zeh);
+
+/*
+ * Other stuff.
+ */
+
+extern void zap_leaf_init(zap_leaf_t *l);
+extern void zap_leaf_byteswap(zap_leaf_phys_t *buf);
+
+extern zap_leaf_t *zap_leaf_split(struct zap *zap, zap_leaf_t *l, dmu_tx_t *tx);
+
+extern int zap_leaf_merge(zap_leaf_t *l, zap_leaf_t *sibling);
+
+extern zap_leaf_t *zap_leaf_chainmore(zap_leaf_t *l, zap_leaf_t *nl);
+
+extern int zap_leaf_advance(zap_leaf_t *l, zap_cursor_t *zc);
+
+extern void zap_stats_leaf(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZAP_LEAF_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_acl.h b/usr/src/uts/common/fs/zfs/sys/zfs_acl.h
new file mode 100644
index 0000000000..0050316eb5
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_acl.h
@@ -0,0 +1,113 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FS_ZFS_ACL_H
+#define _SYS_FS_ZFS_ACL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef _KERNEL
+#include <sys/isa_defs.h>
+#include <sys/types32.h>
+#endif
+#include <sys/acl.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct znode_phys;
+
+#define ACCESS_UNDETERMINED -1
+
+#define ACE_SLOT_CNT 6
+
+typedef struct zfs_znode_acl {
+ uint64_t z_acl_extern_obj; /* ext acl pieces */
+ uint32_t z_acl_count; /* Number of ACEs */
+ uint16_t z_acl_version; /* acl version */
+ uint16_t z_acl_state; /* goop */
+ ace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */
+} zfs_znode_acl_t;
+
+#define ACL_DATA_ALLOCED 0x1
+
+/*
+ * Max ACL size is prepended deny for all entries + the
+ * canonical six tacked on the end.
+ */
+#define MAX_ACL_SIZE (MAX_ACL_ENTRIES * 2 + 6)
+
+typedef struct zfs_acl {
+ int z_slots; /* number of allocated slots for ACEs */
+ int z_acl_count;
+ uint_t z_state;
+ ace_t *z_acl;
+} zfs_acl_t;
+
+#define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt))
+
+/*
+ * Property values for acl_mode and acl_inherit.
+ *
+ * acl_mode can take discard, noallow, groupmask and passthrough.
+ * whereas acl_inherit has secure instead of groupmask.
+ */
+
+#define DISCARD 0
+#define NOALLOW 1
+#define GROUPMASK 2
+#define PASSTHROUGH 3
+#define SECURE 4
+
+struct znode;
+
+#ifdef _KERNEL
+void zfs_perm_init(struct znode *, struct znode *, int, vattr_t *,
+ dmu_tx_t *, cred_t *);
+int zfs_getacl(struct znode *, vsecattr_t *, cred_t *);
+int zfs_mode_update(struct znode *, uint64_t, dmu_tx_t *);
+int zfs_setacl(struct znode *, vsecattr_t *, cred_t *);
+void zfs_acl_rele(void *);
+void zfs_ace_byteswap(ace_t *, int);
+extern int zfs_zaccess(struct znode *, int, cred_t *);
+extern int zfs_zaccess_rwx(struct znode *, mode_t, cred_t *);
+extern int zfs_acl_access(struct znode *, int, cred_t *);
+int zfs_acl_chmod_setattr(struct znode *, uint64_t, dmu_tx_t *);
+int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *);
+int zfs_zaccess_rename(struct znode *, struct znode *,
+ struct znode *, struct znode *, cred_t *cr);
+int zfs_zaccess_v4_perm(struct znode *, int, cred_t *);
+void zfs_acl_free(zfs_acl_t *);
+zfs_acl_t *zfs_acl_node_read(struct znode *);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* _SYS_FS_ZFS_ACL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_context.h b/usr/src/uts/common/fs/zfs/sys/zfs_context.h
new file mode 100644
index 0000000000..2f0e3e792d
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_context.h
@@ -0,0 +1,71 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZFS_CONTEXT_H
+#define _SYS_ZFS_CONTEXT_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/note.h>
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/atomic.h>
+#include <sys/sysmacros.h>
+#include <sys/bitmap.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/taskq.h>
+#include <sys/buf.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/cpuvar.h>
+#include <sys/kobj.h>
+#include <sys/conf.h>
+#include <sys/disp.h>
+#include <sys/debug.h>
+#include <sys/random.h>
+#include <sys/byteorder.h>
+#include <sys/systm.h>
+#include <sys/list.h>
+#include <sys/uio.h>
+#include <sys/dirent.h>
+#include <sys/time.h>
+#include <vm/seg_kmem.h>
+#include <sys/zone.h>
+#include <sys/uio.h>
+#include <sys/zfs_debug.h>
+
+#define CPU_SEQID (CPU->cpu_seqid)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_CONTEXT_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ctldir.h b/usr/src/uts/common/fs/zfs/sys/zfs_ctldir.h
new file mode 100644
index 0000000000..78d82ccbe2
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ctldir.h
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZFS_CTLDIR_H
+#define _ZFS_CTLDIR_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/pathname.h>
+#include <sys/vnode.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZFS_CTLDIR_NAME ".zfs"
+
+#define zfs_has_ctldir(zdp) \
+ ((zdp)->z_id == (zdp)->z_zfsvfs->z_root && \
+ ((zdp)->z_zfsvfs->z_ctldir != NULL))
+#define zfs_show_ctldir(zdp) \
+ (zfs_has_ctldir(zdp) && \
+ ((zdp)->z_zfsvfs->z_show_ctldir))
+
+void zfsctl_create(zfsvfs_t *);
+void zfsctl_destroy(zfsvfs_t *);
+vnode_t *zfsctl_root(znode_t *);
+void zfsctl_init(void);
+void zfsctl_fini(void);
+
+int zfsctl_rename_snapshot(const char *from, const char *to);
+int zfsctl_destroy_snapshot(const char *snapname, int force);
+int zfsctl_umount_snapshots(vfs_t *, int, cred_t *);
+
+int zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
+ int flags, vnode_t *rdir, cred_t *cr);
+
+int zfsctl_make_fid(zfsvfs_t *zfsvfsp, uint64_t object, uint32_t gen,
+ fid_t *fidp);
+int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp);
+
+#define ZFSCTL_INO_ROOT 0x1
+#define ZFSCTL_INO_SNAPDIR 0x2
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFS_CTLDIR_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_debug.h b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h
new file mode 100644
index 0000000000..07eb3d2da8
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h
@@ -0,0 +1,73 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZFS_DEBUG_H
+#define _SYS_ZFS_DEBUG_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+/*
+ * ZFS debugging
+ */
+
+#if defined(DEBUG) || !defined(_KERNEL)
+#define ZFS_DEBUG
+#endif
+
+extern int zfs_flags;
+
+#define ZFS_DEBUG_DPRINTF 0x0001
+#define ZFS_DEBUG_DBUF_VERIFY 0x0002
+#define ZFS_DEBUG_DNODE_VERIFY 0x0004
+#define ZFS_DEBUG_SNAPNAMES 0x0008
+
+#ifdef ZFS_DEBUG
+extern void __dprintf(const char *file, const char *func,
+ int line, const char *fmt, ...);
+#define dprintf(...) \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) \
+ __dprintf(__FILE__, __func__, __LINE__, __VA_ARGS__)
+#else
+#define dprintf(...) ((void)0)
+#endif /* ZFS_DEBUG */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_DEBUG_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_dir.h b/usr/src/uts/common/fs/zfs/sys/zfs_dir.h
new file mode 100644
index 0000000000..8ab760f618
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_dir.h
@@ -0,0 +1,70 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FS_ZFS_DIR_H
+#define _SYS_FS_ZFS_DIR_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/zfs_znode.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* zfs_dirent_lock() flags */
+#define ZNEW 0x0001 /* entry should not exist */
+#define ZEXISTS 0x0002 /* entry should exist */
+#define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */
+#define ZXATTR 0x0008 /* we want the xattr dir */
+#define ZRENAMING 0x0010 /* znode is being renamed */
+
+/* mknode flags */
+#define IS_ROOT_NODE 0x01 /* create a root node */
+#define IS_XATTR 0x02 /* create an extended attribute node */
+#define IS_REPLAY 0x04 /* we are replaying intent log */
+
+extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **,
+ int);
+extern void zfs_dirent_unlock(zfs_dirlock_t *);
+extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int);
+extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int, int *);
+extern int zfs_dirlook(znode_t *, char *, vnode_t **);
+extern void zfs_mknode(znode_t *, vattr_t *, uint64_t *,
+ dmu_tx_t *, cred_t *, uint_t, znode_t **, int);
+extern void zfs_rmnode(znode_t *);
+extern boolean_t zfs_dirempty(znode_t *);
+extern void zfs_dq_add(znode_t *, dmu_tx_t *);
+extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr);
+extern int zfs_get_xattrdir(znode_t *, vnode_t **, cred_t *);
+extern int zfs_make_xattrdir(znode_t *, vattr_t *, vnode_t **, cred_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_DIR_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
new file mode 100644
index 0000000000..cbe8bbc5cb
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -0,0 +1,187 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZFS_IOCTL_H
+#define _SYS_ZFS_IOCTL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/cred.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZFS_DRIVER_NAME "zfs"
+#define ZFS_DS_TYPE "zfs"
+
+/*
+ * Property values for snapdir
+ */
+#define HIDDEN 0
+#define VISIBLE 1
+
+typedef struct zfs_stats {
+ uint64_t zs_atime;
+ uint64_t zs_recordsize;
+ uint64_t zs_readonly;
+ uint64_t zs_devices;
+ uint64_t zs_exec;
+ uint64_t zs_setuid;
+ uint64_t zs_snapdir;
+ uint64_t zs_acl_mode;
+ uint64_t zs_acl_inherit;
+ char zs_mountpoint[MAXNAMELEN];
+ char zs_atime_setpoint[MAXNAMELEN];
+ char zs_recordsize_setpoint[MAXNAMELEN];
+ char zs_readonly_setpoint[MAXNAMELEN];
+ char zs_devices_setpoint[MAXNAMELEN];
+ char zs_setuid_setpoint[MAXNAMELEN];
+ char zs_exec_setpoint[MAXNAMELEN];
+ char zs_mountpoint_setpoint[MAXNAMELEN];
+ char zs_sharenfs[MAXNAMELEN];
+ char zs_sharenfs_setpoint[MAXNAMELEN];
+ char zs_snapdir_setpoint[MAXNAMELEN];
+ char zs_acl_mode_setpoint[MAXNAMELEN];
+ char zs_acl_inherit_setpoint[MAXNAMELEN];
+} zfs_stats_t;
+
+#define DMU_BACKUP_VERSION (1ULL)
+#define DMU_BACKUP_MAGIC 0x2F5bacbacULL
+
+/*
+ * zfs ioctl command structure
+ */
+typedef struct dmu_replay_record {
+ enum {
+ DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
+ DRR_WRITE, DRR_FREE, DRR_END,
+ } drr_type;
+ uint32_t drr_pad;
+ union {
+ struct drr_begin {
+ uint64_t drr_magic;
+ uint64_t drr_version;
+ uint64_t drr_creation_time;
+ dmu_objset_type_t drr_type;
+ uint32_t drr_pad;
+ uint64_t drr_toguid;
+ uint64_t drr_fromguid;
+ char drr_toname[MAXNAMELEN];
+ } drr_begin;
+ struct drr_end {
+ uint64_t drr_checksum;
+ } drr_end;
+ struct drr_object {
+ uint64_t drr_object;
+ dmu_object_type_t drr_type;
+ dmu_object_type_t drr_bonustype;
+ uint32_t drr_blksz;
+ uint32_t drr_bonuslen;
+ uint8_t drr_checksum;
+ uint8_t drr_compress;
+ uint8_t drr_pad[6];
+ } drr_object;
+ struct drr_freeobjects {
+ uint64_t drr_firstobj;
+ uint64_t drr_numobjs;
+ } drr_freeobjects;
+ struct drr_write {
+ uint64_t drr_object;
+ dmu_object_type_t drr_type;
+ uint32_t drr_pad;
+ uint64_t drr_offset;
+ uint64_t drr_length;
+ } drr_write;
+ struct drr_free {
+ uint64_t drr_object;
+ uint64_t drr_offset;
+ uint64_t drr_length;
+ } drr_free;
+ } drr_u;
+} dmu_replay_record_t;
+
+typedef struct zfs_cmd {
+ char zc_name[MAXNAMELEN];
+ char zc_prop_name[MAXNAMELEN];
+ char zc_prop_value[MAXPATHLEN];
+ char zc_root[MAXPATHLEN];
+ char zc_filename[MAXPATHLEN];
+ uint32_t zc_intsz;
+ uint32_t zc_numints;
+ uint64_t zc_pool_guid;
+ uint64_t zc_config_src; /* really (char *) */
+ uint64_t zc_config_src_size;
+ uint64_t zc_config_dst; /* really (char *) */
+ uint64_t zc_config_dst_size;
+ uint64_t zc_cookie;
+ uint64_t zc_cred;
+ uint64_t zc_dev;
+ uint64_t zc_volsize;
+ uint64_t zc_volblocksize;
+ uint64_t zc_objset_type;
+ zfs_stats_t zc_zfs_stats;
+ dmu_object_info_t zc_object_info;
+ dmu_objset_stats_t zc_objset_stats;
+ struct drr_begin zc_begin_record;
+} zfs_cmd_t;
+
+#ifdef _KERNEL
+
+extern dev_info_t *zfs_dip;
+
+extern int zfs_secpolicy_write(const char *dataset, const char *, cred_t *cr);
+extern int zfs_busy(void);
+
+extern int zvol_check_volsize(zfs_cmd_t *zc);
+extern int zvol_check_volblocksize(zfs_cmd_t *zc);
+extern int zvol_get_stats(zfs_cmd_t *zc, objset_t *os);
+extern void zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx);
+extern int zvol_create_minor(zfs_cmd_t *zc);
+extern int zvol_remove_minor(zfs_cmd_t *zc);
+extern int zvol_set_volsize(zfs_cmd_t *zc);
+extern int zvol_set_volblocksize(zfs_cmd_t *zc);
+extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr);
+extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr);
+extern int zvol_strategy(buf_t *bp);
+extern int zvol_read(dev_t dev, uio_t *uiop, cred_t *cr);
+extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr);
+extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr);
+extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr);
+extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr,
+ int *rvalp);
+extern int zvol_busy(void);
+extern void zvol_init(void);
+extern void zvol_fini(void);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_IOCTL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h b/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h
new file mode 100644
index 0000000000..cd0700f641
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h
@@ -0,0 +1,116 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FS_ZFS_VFSOPS_H
+#define _SYS_FS_ZFS_VFSOPS_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/isa_defs.h>
+#include <sys/types32.h>
+#include <sys/list.h>
+#include <sys/vfs.h>
+#include <sys/zil.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct zfs_delete_list {
+ kmutex_t z_mutex;
+ kcondvar_t z_cv;
+ kcondvar_t z_quiesce_cv;
+ uint8_t z_drained;
+ uint8_t z_draining;
+ uint32_t z_thread_target;
+ uint32_t z_thread_count;
+ uint64_t z_znode_count;
+ list_t z_znodes;
+} zfs_delete_t;
+
+typedef struct zfsvfs zfsvfs_t;
+
+struct zfsvfs {
+ vfs_t *z_vfs; /* generic fs struct */
+ zfsvfs_t *z_parent; /* parent fs */
+ objset_t *z_os; /* objset reference */
+ uint64_t z_root; /* id of root znode */
+ uint64_t z_dqueue; /* delete queue */
+ uint64_t z_max_blksz; /* maximum block size for files */
+ uint64_t z_assign; /* TXG_NOWAIT or set by zil_replay() */
+ zilog_t *z_log; /* intent log pointer */
+ uint_t z_acl_mode; /* acl chmod/mode behavior */
+ uint_t z_acl_inherit; /* acl inheritance behavior */
+ boolean_t z_atime; /* enable atimes mount option */
+ boolean_t z_unmounted1; /* unmounted phase 1 */
+ boolean_t z_unmounted2; /* unmounted phase 2 */
+ uint32_t z_op_cnt; /* vnode/vfs operations ref count */
+ krwlock_t z_um_lock; /* rw lock for umount phase 2 */
+ zfs_delete_t z_delete_head; /* zfs delete list */
+ list_t z_all_znodes; /* all vnodes in the fs */
+ kmutex_t z_znodes_lock; /* lock for z_all_znodes */
+ vnode_t *z_ctldir; /* .zfs directory pointer */
+ boolean_t z_show_ctldir; /* expose .zfs in the root dir */
+ boolean_t z_issnap; /* true if this is a snapshot */
+#define ZFS_OBJ_MTX_SZ 64
+ kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */
+};
+
+/*
+ * The total file ID size is limited to 12 bytes (including the length
+ * field) in the NFSv2 protocol. For historical reasons, this same limit
+ * is currently being imposed by the Solaris NFSv3 implementation...
+ * although the protocol actually permits a maximum of 64 bytes. It will
+ * not be possible to expand beyond 12 bytes without abandoning support
+ * of NFSv2 and making some changes to the Solaris NFSv3 implementation.
+ *
+ * For the time being, we will partition up the available space as follows:
+ * 2 bytes fid length (required)
+ * 6 bytes object number (48 bits)
+ * 4 bytes generation number (32 bits)
+ * We reserve only 48 bits for the object number, as this is the limit
+ * currently defined and imposed by the DMU.
+ */
+typedef struct zfid_short {
+ uint16_t zf_len;
+ uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */
+ uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */
+} zfid_short_t;
+
+typedef struct zfid_long {
+ zfid_short_t z_fid;
+ uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */
+ uint8_t zf_setgen[4]; /* gen[i] = gen >> (8 * i) */
+} zfid_long_t;
+
+#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t))
+#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t))
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_VFSOPS_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
new file mode 100644
index 0000000000..d3f28df4cd
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
@@ -0,0 +1,283 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FS_ZFS_ZNODE_H
+#define _SYS_FS_ZFS_ZNODE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef _KERNEL
+#include <sys/isa_defs.h>
+#include <sys/types32.h>
+#include <sys/list.h>
+#include <sys/dmu.h>
+#include <sys/zfs_vfsops.h>
+#endif
+#include <sys/zfs_acl.h>
+#include <sys/zil.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Define special zfs pflags
+ */
+#define ZFS_XATTR 0x1 /* is an extended attribute */
+#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */
+
+#define MASTER_NODE_OBJ 1
+
+/*
+ * special attributes for master node.
+ */
+
+#define ZFS_FSID "FSID"
+#define ZFS_DELETE_QUEUE "DELETE_QUEUE"
+#define ZFS_ROOT_OBJ "ROOT"
+#define ZFS_VERSION_OBJ "VERSION"
+#define ZFS_PROP_BLOCKPERPAGE "BLOCKPERPAGE"
+#define ZFS_PROP_NOGROWBLOCKS "NOGROWBLOCKS"
+
+#define ZFS_FLAG_BLOCKPERPAGE 0x1
+#define ZFS_FLAG_NOGROWBLOCKS 0x2
+
+/*
+ * ZFS version - rev'd whenever an incompatible on-disk format change
+ * occurs. Independent of SPA/DMU/ZAP versioning.
+ */
+
+#define ZFS_VERSION 1ULL
+
+#define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE)
+
+/* Path component length */
+/*
+ * The generic fs code uses MAXNAMELEN to represent
+ * what the largest component length is. Unfortunately,
+ * this length includes the terminating NULL. ZFS needs
+ * to tell the users via pathconf() and statvfs() what the
+ * true maximum length of a component is, excluding the NULL.
+ */
+#define ZFS_MAXNAMELEN (MAXNAMELEN - 1)
+
+/*
+ * This is the persistent portion of the znode. It is stored
+ * in the "bonus buffer" of the file. Short symbolic links
+ * are also stored in the bonus buffer.
+ */
+typedef struct znode_phys {
+ uint64_t zp_atime[2]; /* 0 - last file access time */
+ uint64_t zp_mtime[2]; /* 16 - last file modification time */
+ uint64_t zp_ctime[2]; /* 32 - last file change time */
+ uint64_t zp_crtime[2]; /* 48 - creation time */
+ uint64_t zp_gen; /* 64 - generation (txg of creation) */
+ uint64_t zp_mode; /* 72 - file mode bits */
+ uint64_t zp_size; /* 80 - size of file */
+ uint64_t zp_parent; /* 88 - directory parent (`..') */
+ uint64_t zp_links; /* 96 - number of links to file */
+ uint64_t zp_xattr; /* 104 - DMU object for xattrs */
+ uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */
+ uint64_t zp_flags; /* 120 - persistent flags */
+ uint64_t zp_uid; /* 128 - file owner */
+ uint64_t zp_gid; /* 136 - owning group */
+ uint64_t zp_pad[4]; /* 144 - future */
+ zfs_znode_acl_t zp_acl; /* 176 - 263 ACL */
+ /*
+ * Data may pad out any remaining bytes in the znode buffer, eg:
+ *
+ * |<---------------------- dnode_phys (512) ------------------------>|
+ * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->|
+ * |<---- znode (264) ---->|<---- data (56) ---->|
+ *
+ * At present, we only use this space to store symbolic links.
+ */
+} znode_phys_t;
+
+/*
+ * Directory entry locks control access to directory entries.
+ * They are used to protect creates, deletes, and renames.
+ * Each directory znode has a mutex and a list of locked names.
+ */
+#ifdef _KERNEL
+typedef struct zfs_dirlock {
+ char *dl_name; /* directory entry being locked */
+ uint32_t dl_sharecnt; /* 0 if exclusive, > 0 if shared */
+ uint16_t dl_namesize; /* set if dl_name was allocated */
+ kcondvar_t dl_cv; /* wait for entry to be unlocked */
+ struct znode *dl_dzp; /* directory znode */
+ struct zfs_dirlock *dl_next; /* next in z_dirlocks list */
+} zfs_dirlock_t;
+
+struct zcache_state;
+
+typedef struct znode {
+ struct zfsvfs *z_zfsvfs;
+ vnode_t *z_vnode;
+ list_node_t z_list_node; /* deleted znodes */
+ uint64_t z_id; /* object ID for this znode */
+ kmutex_t z_lock; /* znode modification lock */
+ krwlock_t z_map_lock; /* page map lock */
+ krwlock_t z_grow_lock; /* grow block size lock */
+ krwlock_t z_append_lock; /* append-mode lock */
+ zfs_dirlock_t *z_dirlocks; /* directory entry lock list */
+ uint8_t z_active; /* znode is in use */
+ uint8_t z_reap; /* reap file at last reference */
+ uint8_t z_atime_dirty; /* atime needs to be synced */
+ uint8_t z_dbuf_held; /* Is z_dbuf already held? */
+ uint_t z_mapcnt; /* number of memory maps to file */
+ uint_t z_blksz; /* block size in bytes */
+ uint_t z_seq; /* modification sequence number */
+ uint64_t z_last_itx; /* last ZIL itx on this znode */
+ kmutex_t z_acl_lock; /* acl data lock */
+ list_node_t z_link_node; /* all znodes in fs link */
+ list_node_t z_zcache_node;
+ struct zcache_state *z_zcache_state;
+ uint64_t z_zcache_access;
+
+ /*
+ * These are dmu managed fields.
+ */
+ znode_phys_t *z_phys; /* pointer to persistent znode */
+ dmu_buf_t *z_dbuf; /* buffer containing the z_phys */
+} znode_t;
+
+/*
+ * The grow_lock is only applicable to "regular" files.
+ * The parent_lock is only applicable to directories.
+ */
+#define z_parent_lock z_grow_lock
+
+/*
+ * Convert between znode pointers and vnode pointers
+ */
+#define ZTOV(ZP) ((ZP)->z_vnode)
+#define VTOZ(VP) ((znode_t *)(VP)->v_data)
+
+/*
+ * ZFS_ENTER() is called on entry to each ZFS vnode and vfs operation.
+ * ZFS_EXIT() must be called before exiting the vop.
+ */
+#define ZFS_ENTER(zfsvfs) \
+ { \
+ atomic_add_32(&(zfsvfs)->z_op_cnt, 1); \
+ if ((zfsvfs)->z_unmounted1) { \
+ ZFS_EXIT(zfsvfs); \
+ return (EIO); \
+ } \
+ }
+#define ZFS_EXIT(zfsvfs) atomic_add_32(&(zfsvfs)->z_op_cnt, -1)
+
+/*
+ * Macros for dealing with dmu_buf_hold
+ */
+#define ZFS_OBJ_HASH(obj_num) (obj_num & (ZFS_OBJ_MTX_SZ - 1))
+#define ZFS_OBJ_MUTEX(zp) \
+ (&zp->z_zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(zp->z_id)])
+#define ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \
+ mutex_enter(&zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]);
+
+#define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \
+ mutex_exit(&zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(obj_num)])
+
+/*
+ * Macros to encode/decode ZFS stored time values from/to struct timespec
+ */
+#define ZFS_TIME_ENCODE(tp, stmp) \
+{ \
+ stmp[0] = (uint64_t)(tp)->tv_sec; \
+ stmp[1] = (uint64_t)(tp)->tv_nsec; \
+}
+
+#define ZFS_TIME_DECODE(tp, stmp) \
+{ \
+ (tp)->tv_sec = (time_t)stmp[0]; \
+ (tp)->tv_nsec = (long)stmp[1]; \
+}
+
+/*
+ * Timestamp defines
+ */
+#define ACCESSED (AT_ATIME)
+#define STATE_CHANGED (AT_CTIME)
+#define CONTENT_MODIFIED (AT_MTIME | AT_CTIME)
+
+#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \
+ if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \
+ zfs_time_stamper(zp, ACCESSED, NULL)
+
+extern int zfs_init_fs(zfsvfs_t *, znode_t **, cred_t *);
+extern void zfs_set_dataprop(objset_t *);
+extern void zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx);
+extern void zfs_time_stamper(znode_t *, uint_t, dmu_tx_t *);
+extern void zfs_time_stamper_locked(znode_t *, uint_t, dmu_tx_t *);
+extern int zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *);
+extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, dmu_tx_t *,
+ cred_t *cr);
+extern void zfs_znode_init(void);
+extern void zfs_znode_fini(void);
+extern znode_t *zfs_znode_alloc(zfsvfs_t *, dmu_buf_t *, uint64_t, int);
+extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **);
+extern void zfs_zinactive(znode_t *);
+extern void zfs_znode_delete(znode_t *, dmu_tx_t *);
+extern void zfs_znode_free(znode_t *);
+extern int zfs_delete_thread_target(zfsvfs_t *zfsvfs, int nthreads);
+extern void zfs_delete_wait_empty(zfsvfs_t *zfsvfs);
+extern void zfs_zcache_flush(zfsvfs_t *zfsvf);
+extern void zfs_remove_op_tables();
+extern int zfs_create_op_tables();
+extern int zfs_sync(vfs_t *vfsp, short flag, cred_t *cr);
+
+extern uint64_t zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *dzp, znode_t *zp, char *name);
+extern uint64_t zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *dzp, char *name);
+extern uint64_t zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *dzp, znode_t *zp, char *name);
+extern uint64_t zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *dzp, znode_t *zp, char *name, char *link);
+extern uint64_t zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp);
+extern uint64_t zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, offset_t off, ssize_t len, int ioflag, uio_t *uio);
+extern uint64_t zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, uint64_t off, uint64_t len);
+extern uint64_t zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, vattr_t *vap, uint_t mask_applied);
+extern uint64_t zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, int aclcnt, ace_t *z_ace);
+
+extern zil_get_data_t zfs_get_data;
+extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
+extern int zfsfstype;
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_ZNODE_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zil.h b/usr/src/uts/common/fs/zfs/sys/zil.h
new file mode 100644
index 0000000000..a03dcc6bc9
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zil.h
@@ -0,0 +1,242 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZIL_H
+#define _SYS_ZIL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Intent log format:
+ *
+ * Each objset has its own intent log. The log header (zil_header_t)
+ * for objset N's intent log is kept in the Nth object of the SPA's
+ * intent_log objset. The log header points to a chain of log blocks,
+ * each of which contains log records (i.e., transactions) followed by
+ * a log block trailer (zil_trailer_t). The format of a log record
+ * depends on the record (or transaction) type, but all records begin
+ * with a common structure that defines the type, length, and txg.
+ */
+
+/*
+ * Intent log header - this on disk structure holds fields to manage
+ * the log. All fields are 64 bit to easily handle cross architectures.
+ */
+typedef struct zil_header {
+	uint64_t zh_claim_txg;	/* txg in which log blocks were claimed */
+	uint64_t zh_replay_seq;	/* highest replayed sequence number */
+	blkptr_t zh_log;	/* log chain */
+	uint64_t zit_pad[6];	/* reserved; NOTE(review): zit_ prefix looks like a slip for zh_ */
+} zil_header_t;
+
+/*
+ * Log block trailer - structure at the end of the header and each log block
+ *
+ * The zit_bt contains a zbt_cksum which for the intent log is
+ * the sequence number of this log block. A seq of 0 is invalid.
+ * The zbt_cksum is checked by the SPA against the sequence
+ * number passed in the blk_cksum field of the blkptr_t
+ */
+typedef struct zil_trailer {
+ uint64_t zit_pad;
+ blkptr_t zit_next_blk; /* next block in chain */
+ uint64_t zit_nused; /* bytes in log block used */
+ zio_block_tail_t zit_bt; /* block trailer */
+} zil_trailer_t;
+
+#define ZIL_MIN_BLKSZ 4096
+#define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE
+#define ZIL_BLK_DATA_SZ(lwb) ((lwb)->lwb_sz - sizeof (zil_trailer_t))
+
+/*
+ * Intent log transaction types and record structures
+ */
+#define TX_CREATE 1 /* Create file */
+#define TX_MKDIR 2 /* Make directory */
+#define TX_MKXATTR 3 /* Make XATTR directory */
+#define TX_SYMLINK 4 /* Create symbolic link to a file */
+#define TX_REMOVE 5 /* Remove file */
+#define TX_RMDIR 6 /* Remove directory */
+#define TX_LINK 7 /* Create hard link to a file */
+#define TX_RENAME 8 /* Rename a file */
+#define TX_WRITE 9 /* File write */
+#define TX_TRUNCATE 10 /* Truncate a file */
+#define TX_SETATTR 11 /* Set file attributes */
+#define TX_ACL 12 /* Set acl */
+#define TX_MAX_TYPE 13 /* Max transaction type */
+
+/*
+ * Format of log records.
+ * The fields are carefully defined to allow them to be aligned
+ * and sized the same on sparc & intel architectures.
+ * Each log record has a common structure at the beginning.
+ */
+typedef struct { /* common log record header */
+ uint64_t lrc_txtype; /* intent log transaction type */
+ uint64_t lrc_reclen; /* transaction record length */
+ uint64_t lrc_txg; /* dmu transaction group number */
+ uint64_t lrc_seq; /* intent log sequence number */
+} lr_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_doid; /* object id of directory */
+ uint64_t lr_foid; /* object id of created file object */
+ uint64_t lr_mode; /* mode of object */
+ uint64_t lr_uid; /* uid of object */
+ uint64_t lr_gid; /* gid of object */
+ uint64_t lr_gen; /* generation (txg of creation) */
+ uint64_t lr_crtime[2]; /* creation time */
+ uint64_t lr_rdev; /* rdev of object to create */
+ /* name of object to create follows this */
+ /* for symlinks, link content follows name */
+} lr_create_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_doid; /* obj id of directory */
+ /* name of object to remove follows this */
+} lr_remove_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_doid; /* obj id of directory */
+ uint64_t lr_link_obj; /* obj id of link */
+ /* name of object to link follows this */
+} lr_link_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_sdoid; /* obj id of source directory */
+ uint64_t lr_tdoid; /* obj id of target directory */
+ /* 2 strings: names of source and destination follow this */
+} lr_rename_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* file object to write */
+ uint64_t lr_offset; /* offset to write to */
+ uint64_t lr_length; /* user data length to write */
+ uint64_t lr_blkoff; /* offset represented by lr_blkptr */
+ blkptr_t lr_blkptr; /* spa block pointer for replay */
+ /* write data will follow for small writes */
+} lr_write_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* object id of file to truncate */
+ uint64_t lr_offset; /* offset to truncate from */
+ uint64_t lr_length; /* length to truncate */
+} lr_truncate_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* file object to change attributes */
+ uint64_t lr_mask; /* mask of attributes to set */
+ uint64_t lr_mode; /* mode to set */
+ uint64_t lr_uid; /* uid to set */
+ uint64_t lr_gid; /* gid to set */
+ uint64_t lr_size; /* size to set */
+ uint64_t lr_atime[2]; /* access time */
+ uint64_t lr_mtime[2]; /* modification time */
+} lr_setattr_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* obj id of file */
+ uint64_t lr_aclcnt; /* number of acl entries */
+ /* lr_aclcnt number of ace_t entries follow this */
+} lr_acl_t;
+
+/*
+ * ZIL structure definitions, interface function prototype and globals.
+ */
+
+/*
+ * ZFS intent log transaction structure
+ */
+typedef struct itx {
+ list_node_t itx_node; /* linkage on zl_itx_list */
+ void *itx_private; /* type-specific opaque data */
+ uint8_t itx_data_copied; /* TX_WRITE only: write data already */
+ /* copied into itx data buffer */
+ lr_t itx_lr; /* common part of log record */
+ /* followed by type-specific part of lr_xx_t and its immediate data */
+} itx_t;
+
+typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
+ uint64_t txg);
+typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
+ uint64_t txg);
+typedef int zil_replay_func_t();
+typedef int zil_get_data_t(void *arg, lr_write_t *lr);
+
+extern void zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
+ zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);
+
+extern void zil_init(void);
+extern void zil_fini(void);
+
+extern zilog_t *zil_alloc(objset_t *os, zil_header_t *zh_phys);
+extern void zil_free(zilog_t *zilog);
+
+extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data);
+extern void zil_close(zilog_t *zilog);
+
+extern void zil_replay(objset_t *os, void *arg, uint64_t *txgp,
+ zil_replay_func_t *replay_func[TX_MAX_TYPE], void (*rm_wait)(void *));
+extern void zil_destroy(zilog_t *zilog);
+
+extern itx_t *zil_itx_create(int txtype, size_t lrsize);
+extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
+
+extern void zil_commit(zilog_t *zilog, uint64_t seq, int ioflag);
+
+extern void zil_claim(char *osname, void *txarg);
+extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
+extern void zil_clean(zilog_t *zilog);
+
+extern int zil_suspend(zilog_t *zilog);
+extern void zil_resume(zilog_t *zilog);
+
+extern int zil_disable;
+extern int zil_always;
+extern int zil_purge;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zil_impl.h b/usr/src/uts/common/fs/zfs/sys/zil_impl.h
new file mode 100644
index 0000000000..6286fc5aa3
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zil_impl.h
@@ -0,0 +1,111 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZIL_IMPL_H
+#define _SYS_ZIL_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zil.h>
+#include <sys/dmu_objset.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum lwb_state_type {
+ UNWRITTEN, /* buffer yet to be written */
+ SEQ_INCOMPLETE, /* buffer written, but there's an unwritten buffer in */
+ /* the sequence before this */
+ SEQ_COMPLETE, /* no unwritten buffers before this */
+} lwb_state_t;
+
+/*
+ * Log write buffer.
+ */
+typedef struct lwb {
+ zilog_t *lwb_zilog; /* back pointer to log struct */
+ blkptr_t lwb_blk; /* on disk address of this log blk */
+ int lwb_nused; /* # used bytes in buffer */
+ int lwb_sz; /* size of block and buffer */
+ char *lwb_buf; /* log write buffer */
+ uint64_t lwb_max_txg; /* highest txg in this lwb */
+ uint64_t lwb_seq; /* highest log record seq number */
+ txg_handle_t lwb_txgh; /* txg handle for txg_exit() */
+ list_node_t lwb_node; /* zilog->zl_lwb_list linkage */
+ lwb_state_t lwb_state; /* buffer state */
+} lwb_t;
+
+/*
+ * [vdev, seq] element for use in flushing device write caches
+ */
+typedef struct zil_vdev {
+ uint64_t vdev; /* device written */
+ uint64_t seq; /* itx sequence */
+ list_node_t vdev_seq_node; /* zilog->zl_vdev_list linkage */
+} zil_vdev_t;
+
+/*
+ * Stable storage intent log management structure. One per dataset.
+ */
+struct zilog {
+ kmutex_t zl_lock; /* protects most zilog_t fields */
+ struct dsl_pool *zl_dmu_pool; /* DSL pool */
+ spa_t *zl_spa; /* handle for read/write log */
+ zil_header_t *zl_header; /* log header buffer */
+ objset_t *zl_os; /* object set we're logging */
+ zil_get_data_t *zl_get_data; /* callback to get object content */
+ uint64_t zl_itx_seq; /* itx sequence number */
+ uint64_t zl_ss_seq; /* last tx on stable storage */
+ uint64_t zl_destroy_txg; /* txg of last zil_destroy() */
+ uint64_t zl_replay_seq[TXG_SIZE]; /* seq of last replayed rec */
+ uint32_t zl_suspend; /* log suspend count */
+ kcondvar_t zl_cv_write; /* for waiting to write to log */
+ kcondvar_t zl_cv_seq; /* for committing a sequence */
+ uint8_t zl_stop_replay; /* don't replay any further */
+ uint8_t zl_stop_sync; /* for debugging */
+ uint8_t zl_writer; /* boolean: write setup in progress */
+ uint8_t zl_log_error; /* boolean: log write error */
+ list_t zl_itx_list; /* in-memory itx list */
+ uint64_t zl_itx_list_sz; /* total size of records on list */
+ uint64_t zl_prev_blk_sz; /* previous log block size */
+ list_t zl_lwb_list; /* in-flight log write list */
+ list_t zl_vdev_list; /* list of [vdev, seq] pairs */
+ taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */
+ avl_tree_t zl_dva_tree; /* track DVAs during log parse */
+ kmutex_t zl_destroy_lock; /* serializes zil_destroy() calls */
+};
+
+typedef struct zil_dva_node {
+ dva_t zn_dva;
+ avl_node_t zn_node;
+} zil_dva_node_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIL_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
new file mode 100644
index 0000000000..5d3227e546
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -0,0 +1,298 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZIO_H
+#define _ZIO_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/dkio.h>
+#include <sys/fs/zfs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define	ZBT_MAGIC	0x210da7ab10c7a11ULL	/* zio data block tail */
+
+typedef struct zio_block_tail {
+	uint64_t	zbt_magic;	/* for validation, endianness */
+	zio_cksum_t	zbt_cksum;	/* 256-bit checksum */
+} zio_block_tail_t;
+
+/*
+ * Gang block headers are self-checksumming and contain an array
+ * of block pointers.
+ */
+#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE
+#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \
+ sizeof (zio_block_tail_t)) / sizeof (blkptr_t))
+#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \
+ sizeof (zio_block_tail_t) - \
+ (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
+ sizeof (uint64_t))
+
+#define ZIO_GET_DVA(zio) (&(zio)->io_bp->blk_dva[(zio)->io_dva_index])
+#define ZIO_GET_IOSIZE(zio) \
+ (DVA_GET_GANG(ZIO_GET_DVA(zio)) ? \
+ SPA_GANGBLOCKSIZE : BP_GET_PSIZE((zio)->io_bp))
+
+typedef struct zio_gbh {
+ blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS];
+ uint64_t zg_filler[SPA_GBH_FILLER];
+ zio_block_tail_t zg_tail;
+} zio_gbh_phys_t;
+
+enum zio_checksum {
+ ZIO_CHECKSUM_INHERIT = 0,
+ ZIO_CHECKSUM_ON,
+ ZIO_CHECKSUM_OFF,
+ ZIO_CHECKSUM_LABEL,
+ ZIO_CHECKSUM_GANG_HEADER,
+ ZIO_CHECKSUM_ZILOG,
+ ZIO_CHECKSUM_FLETCHER_2,
+ ZIO_CHECKSUM_FLETCHER_4,
+ ZIO_CHECKSUM_SHA256,
+ ZIO_CHECKSUM_FUNCTIONS
+};
+
+#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_2
+#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON
+
+enum zio_compress {
+ ZIO_COMPRESS_INHERIT = 0,
+ ZIO_COMPRESS_ON,
+ ZIO_COMPRESS_OFF,
+ ZIO_COMPRESS_LZJB,
+ ZIO_COMPRESS_FUNCTIONS
+};
+
+#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB
+#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF
+
+#define ZIO_PRIORITY_NOW (zio_priority_table[0])
+#define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1])
+#define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2])
+#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[3])
+#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[4])
+#define ZIO_PRIORITY_FREE (zio_priority_table[5])
+#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[6])
+#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[7])
+#define ZIO_PRIORITY_RESILVER (zio_priority_table[8])
+#define ZIO_PRIORITY_SCRUB (zio_priority_table[9])
+#define ZIO_PRIORITY_TABLE_SIZE 10
+
+#define ZIO_FLAG_MUSTSUCCEED 0x0000
+#define ZIO_FLAG_CANFAIL 0x0001
+#define ZIO_FLAG_FAILFAST 0x0002
+#define ZIO_FLAG_CONFIG_HELD 0x0004
+
+#define ZIO_FLAG_DONT_CACHE 0x0010
+#define ZIO_FLAG_DONT_QUEUE 0x0020
+#define ZIO_FLAG_DONT_PROPAGATE 0x0040
+#define ZIO_FLAG_DONT_RETRY 0x0080
+
+#define ZIO_FLAG_PHYSICAL 0x0100
+#define ZIO_FLAG_IO_BYPASS 0x0200
+#define ZIO_FLAG_IO_REPAIR 0x0400
+#define ZIO_FLAG_SPECULATIVE 0x0800
+
+#define ZIO_FLAG_RESILVER 0x1000
+#define ZIO_FLAG_SCRUB 0x2000
+
+#define ZIO_FLAG_GANG_INHERIT \
+ (ZIO_FLAG_CANFAIL | \
+ ZIO_FLAG_FAILFAST | \
+ ZIO_FLAG_CONFIG_HELD | \
+ ZIO_FLAG_DONT_RETRY | \
+ ZIO_FLAG_IO_REPAIR | \
+ ZIO_FLAG_SPECULATIVE | \
+ ZIO_FLAG_RESILVER | \
+ ZIO_FLAG_SCRUB)
+
+#define ZIO_FLAG_VDEV_INHERIT \
+ (ZIO_FLAG_GANG_INHERIT | \
+ ZIO_FLAG_DONT_CACHE | \
+ ZIO_FLAG_PHYSICAL)
+
+/*
+ * We'll take the unused errno 'EBADE' (from the Convergent graveyard)
+ * to indicate checksum errors.
+ */
+#define ECKSUM EBADE
+
+typedef struct zio zio_t;
+typedef void zio_done_func_t(zio_t *zio);
+typedef struct zio_transform zio_transform_t;
+
+extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE];
+extern char *zio_type_name[ZIO_TYPES];
+
+struct zio {
+ /* Core information about this I/O */
+ zio_t *io_parent;
+ zio_t *io_root;
+ spa_t *io_spa;
+ int io_checksum;
+ int io_compress;
+ int io_dva_index;
+ uint64_t io_txg;
+ blkptr_t *io_bp;
+ blkptr_t io_bp_copy;
+ zio_t *io_child;
+ zio_t *io_sibling_prev;
+ zio_t *io_sibling_next;
+ zio_transform_t *io_transform_stack;
+
+ /* Callback info */
+ zio_done_func_t *io_done;
+ void *io_private;
+ blkptr_t io_bp_orig;
+
+ /* Data represented by this I/O */
+ void *io_data;
+ uint64_t io_size;
+
+ /* Stuff for the vdev stack */
+ vdev_t *io_vd;
+ void *io_vsd;
+ uint64_t io_offset;
+ uint64_t io_deadline;
+ uint64_t io_timestamp;
+ avl_node_t io_offset_node;
+ avl_node_t io_deadline_node;
+ avl_tree_t *io_vdev_tree;
+ zio_t *io_delegate_list;
+ zio_t *io_delegate_next;
+ zio_t *io_retry_next;
+ list_node_t io_pending;
+
+ /* Internal pipeline state */
+ int io_flags;
+ uint8_t io_type;
+ uint8_t io_stage;
+ uint8_t io_stalled;
+ uint8_t io_priority;
+ struct dk_callback io_dk_callback;
+ int io_cmd;
+ int io_retries;
+ int io_error;
+ uint32_t io_numerrors;
+ uint32_t io_pipeline;
+ uint32_t io_async_stages;
+ uint64_t io_children_notready;
+ uint64_t io_children_notdone;
+ void *io_waiter;
+ kmutex_t io_lock;
+ kcondvar_t io_cv;
+};
+
+extern zio_t *zio_null(zio_t *pio, spa_t *spa,
+ zio_done_func_t *done, void *private, int flags);
+
+extern zio_t *zio_root(spa_t *spa,
+ zio_done_func_t *done, void *private, int flags);
+
+extern zio_t *zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
+ uint64_t size, zio_done_func_t *done, void *private,
+ int priority, int flags);
+
+extern zio_t *zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+ uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+ zio_done_func_t *done, void *private, int priority, int flags);
+
+extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
+ uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+ zio_done_func_t *done, void *private, int priority, int flags);
+
+extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ zio_done_func_t *done, void *private);
+
+extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ zio_done_func_t *done, void *private);
+
+extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
+ zio_done_func_t *done, void *private, int priority, int flags);
+
+extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *data, int checksum,
+ zio_done_func_t *done, void *private, int priority, int flags);
+
+extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *data, int checksum,
+ zio_done_func_t *done, void *private, int priority, int flags);
+
+extern int zio_alloc_blk(spa_t *spa, int checksum, uint64_t size,
+ blkptr_t *bp, uint64_t txg);
+extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg);
+
+extern int zio_wait(zio_t *zio);
+extern void zio_nowait(zio_t *zio);
+
+extern void *zio_buf_alloc(size_t size);
+extern void zio_buf_free(void *buf, size_t size);
+
+/*
+ * Move an I/O to the next stage of the pipeline and execute that stage.
+ * There's no locking on io_stage because there's no legitimate way for
+ * multiple threads to be attempting to process the same I/O.
+ */
+extern void zio_next_stage(zio_t *zio);
+extern void zio_next_stage_async(zio_t *zio);
+extern void zio_wait_children_done(zio_t *zio);
+
+/*
+ * Delegate I/O to a child vdev.
+ */
+extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
+ uint64_t offset, void *data, uint64_t size, int type, int priority,
+ int flags, zio_done_func_t *done, void *private);
+
+extern void zio_vdev_io_bypass(zio_t *zio);
+extern void zio_vdev_io_reissue(zio_t *zio);
+extern void zio_vdev_io_redone(zio_t *zio);
+
+extern void zio_checksum_verified(zio_t *zio);
+extern void zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp);
+
+extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent);
+extern uint8_t zio_compress_select(uint8_t child, uint8_t parent);
+
+/*
+ * Initial setup and teardown.
+ */
+extern void zio_init(void);
+extern void zio_fini(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZIO_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h
new file mode 100644
index 0000000000..ba3dc48d28
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZIO_CHECKSUM_H
+#define _SYS_ZIO_CHECKSUM_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Signature for checksum functions.
+ */
+typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
+
+/*
+ * Information about each checksum function.
+ */
+typedef struct zio_checksum_info {
+ zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */
+ int ci_correctable; /* number of correctable bits */
+ int ci_zbt; /* uses zio block tail? */
+ char *ci_name; /* descriptive name */
+} zio_checksum_info_t;
+
+extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS];
+
+/*
+ * Checksum routines.
+ */
+extern zio_checksum_t fletcher_2_native;
+extern zio_checksum_t fletcher_4_native;
+
+extern zio_checksum_t fletcher_2_byteswap;
+extern zio_checksum_t fletcher_4_byteswap;
+
+extern zio_checksum_t zio_checksum_SHA256;
+
+extern void zio_checksum(uint_t checksum, zio_cksum_t *zcp,
+ void *data, uint64_t size);
+extern int zio_checksum_error(zio_t *zio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIO_CHECKSUM_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zio_compress.h b/usr/src/uts/common/fs/zfs/sys/zio_compress.h
new file mode 100644
index 0000000000..7eddf1e8d1
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zio_compress.h
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZIO_COMPRESS_H
+#define _SYS_ZIO_COMPRESS_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Common signature for all zio compress/decompress functions.
+ */
+typedef size_t zio_compress_func_t(void *src, void *dst,
+ size_t s_len, size_t d_len);
+typedef int zio_decompress_func_t(void *src, void *dst,
+ size_t s_len, size_t d_len);
+
+/*
+ * Information about each compression function.
+ */
+typedef struct zio_compress_info {
+ zio_compress_func_t *ci_compress;
+ zio_decompress_func_t *ci_decompress;
+ char *ci_name;
+} zio_compress_info_t;
+
+extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS];
+
+/*
+ * Compression routines.
+ */
+extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len);
+extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len);
+
+/*
+ * Compress and decompress data if necessary.
+ */
+extern int zio_compress_data(int cpfunc, void *src, uint64_t srcsize,
+ void **destp, uint64_t *destsizep, uint64_t *destbufsizep);
+extern int zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
+ void *dest, uint64_t destsize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIO_COMPRESS_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zio_impl.h b/usr/src/uts/common/fs/zfs/sys/zio_impl.h
new file mode 100644
index 0000000000..0b2b07de29
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zio_impl.h
@@ -0,0 +1,208 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZIO_IMPL_H
+#define _ZIO_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * I/O Groups: pipeline stage definitions.
+ */
+
+typedef enum zio_stage {
+ ZIO_STAGE_OPEN = 0, /* RWFCI */
+ ZIO_STAGE_WAIT_CHILDREN_READY, /* RWFCI */
+
+ ZIO_STAGE_WRITE_COMPRESS, /* -W--- */
+ ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */
+
+ ZIO_STAGE_GANG_PIPELINE, /* -WFC- */
+
+ ZIO_STAGE_GET_GANG_HEADER, /* -WFC- */
+ ZIO_STAGE_REWRITE_GANG_MEMBERS, /* -W--- */
+ ZIO_STAGE_FREE_GANG_MEMBERS, /* --F-- */
+ ZIO_STAGE_CLAIM_GANG_MEMBERS, /* ---C- */
+
+ ZIO_STAGE_DVA_ALLOCATE, /* -W--- */
+ ZIO_STAGE_DVA_FREE, /* --F-- */
+ ZIO_STAGE_DVA_CLAIM, /* ---C- */
+
+ ZIO_STAGE_GANG_CHECKSUM_GENERATE, /* -W--- */
+
+ ZIO_STAGE_READY, /* RWFCI */
+
+ ZIO_STAGE_DVA_TRANSLATE, /* RW--- */
+
+ ZIO_STAGE_VDEV_IO_SETUP, /* RW--I */
+ ZIO_STAGE_VDEV_IO_START, /* RW--I */
+ ZIO_STAGE_VDEV_IO_DONE, /* RW--I */
+ ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */
+
+ ZIO_STAGE_WAIT_CHILDREN_DONE, /* RWFCI */
+
+ ZIO_STAGE_CHECKSUM_VERIFY, /* R---- */
+ ZIO_STAGE_READ_GANG_MEMBERS, /* R---- */
+ ZIO_STAGE_READ_DECOMPRESS, /* R---- */
+
+ ZIO_STAGE_DONE /* RWFCI */
+} zio_stage_t;
+
+/*
+ * The stages for which there's some performance value in going async.
+ * When compression is enabled, ZIO_STAGE_WRITE_COMPRESS is ORed in as well.
+ */
+#define ZIO_ASYNC_PIPELINE_STAGES \
+ ((1U << ZIO_STAGE_CHECKSUM_GENERATE) | \
+ (1U << ZIO_STAGE_VDEV_IO_DONE) | \
+ (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \
+ (1U << ZIO_STAGE_READ_DECOMPRESS))
+
+#define ZIO_VDEV_IO_PIPELINE \
+ ((1U << ZIO_STAGE_VDEV_IO_SETUP) | \
+ (1U << ZIO_STAGE_VDEV_IO_START) | \
+ (1U << ZIO_STAGE_VDEV_IO_DONE) | \
+ (1U << ZIO_STAGE_VDEV_IO_ASSESS))
+
+#define ZIO_READ_PHYS_PIPELINE \
+ ((1U << ZIO_STAGE_OPEN) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_READY) | \
+ ZIO_VDEV_IO_PIPELINE | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_READ_PIPELINE \
+ ((1U << ZIO_STAGE_DVA_TRANSLATE) | \
+ ZIO_READ_PHYS_PIPELINE)
+
+#define ZIO_WRITE_PHYS_PIPELINE \
+ ((1U << ZIO_STAGE_OPEN) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_CHECKSUM_GENERATE) | \
+ (1U << ZIO_STAGE_READY) | \
+ ZIO_VDEV_IO_PIPELINE | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_WRITE_COMMON_PIPELINE \
+ ((1U << ZIO_STAGE_DVA_TRANSLATE) | \
+ ZIO_WRITE_PHYS_PIPELINE)
+
+#define ZIO_WRITE_PIPELINE \
+ ((1U << ZIO_STAGE_WRITE_COMPRESS) | \
+ ZIO_WRITE_COMMON_PIPELINE)
+
+#define ZIO_GANG_STAGES \
+ ((1U << ZIO_STAGE_GET_GANG_HEADER) | \
+ (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \
+ (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \
+ (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \
+ (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \
+ (1U << ZIO_STAGE_READ_GANG_MEMBERS))
+
+#define ZIO_REWRITE_PIPELINE \
+ ((1U << ZIO_STAGE_GANG_PIPELINE) | \
+ (1U << ZIO_STAGE_GET_GANG_HEADER) | \
+ (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \
+ (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \
+ ZIO_WRITE_COMMON_PIPELINE)
+
+#define ZIO_WRITE_ALLOCATE_PIPELINE \
+ ((1U << ZIO_STAGE_DVA_ALLOCATE) | \
+ ZIO_WRITE_COMMON_PIPELINE)
+
+#define ZIO_GANG_FREE_STAGES \
+ ((1U << ZIO_STAGE_GET_GANG_HEADER) | \
+ (1U << ZIO_STAGE_FREE_GANG_MEMBERS))
+
+#define ZIO_FREE_PIPELINE \
+ ((1U << ZIO_STAGE_OPEN) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_GANG_PIPELINE) | \
+ (1U << ZIO_STAGE_GET_GANG_HEADER) | \
+ (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \
+ (1U << ZIO_STAGE_DVA_FREE) | \
+ (1U << ZIO_STAGE_READY) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_CLAIM_PIPELINE \
+ ((1U << ZIO_STAGE_OPEN) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_GANG_PIPELINE) | \
+ (1U << ZIO_STAGE_GET_GANG_HEADER) | \
+ (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \
+ (1U << ZIO_STAGE_DVA_CLAIM) | \
+ (1U << ZIO_STAGE_READY) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_IOCTL_PIPELINE \
+ ((1U << ZIO_STAGE_OPEN) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_READY) | \
+ ZIO_VDEV_IO_PIPELINE | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_WAIT_FOR_CHILDREN_PIPELINE \
+ ((1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_READY) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE \
+ ((1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_VDEV_CHILD_PIPELINE \
+ (ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE | \
+ ZIO_VDEV_IO_PIPELINE)
+
+#define ZIO_ERROR_PIPELINE_MASK \
+ ZIO_WAIT_FOR_CHILDREN_PIPELINE
+
+struct zio_transform {
+ void *zt_data;
+ uint64_t zt_size;
+ uint64_t zt_bufsize;
+ zio_transform_t *zt_next;
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZIO_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c
new file mode 100644
index 0000000000..81ab16cd3d
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/txg.c
@@ -0,0 +1,583 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/txg_impl.h>
+#include <sys/dmu_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/callb.h>
+
+/*
+ * Pool-wide transaction groups.
+ */
+
+static void txg_sync_thread(dsl_pool_t *dp);
+static void txg_quiesce_thread(dsl_pool_t *dp);
+static void txg_timelimit_thread(dsl_pool_t *dp);
+
+int txg_time = 5; /* max 5 seconds worth of delta per txg */
+
+/*
+ * Prepare the txg subsystem: zero the pool's tx_state_t, allocate the
+ * per-CPU entry counters, and make 'txg' the initially open group.
+ */
+void
+txg_init(dsl_pool_t *dp, uint64_t txg)
+{
+	tx_state_t *tx = &dp->dp_tx;
+
+	bzero(tx, sizeof (tx_state_t));
+
+	/* Per-CPU counters let txg_hold_open() avoid a pool-wide lock. */
+	tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);
+
+	rw_init(&tx->tx_suspend, NULL, RW_DEFAULT, NULL);
+
+	tx->tx_open_txg = txg;
+}
+
+/*
+ * Close down the txg subsystem and release its resources.  The three
+ * service threads must already be stopped (see txg_sync_stop()).
+ */
+void
+txg_fini(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+
+	ASSERT(tx->tx_threads == 0);
+
+	rw_destroy(&tx->tx_suspend);
+
+	kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));
+
+	bzero(tx, sizeof (tx_state_t));
+}
+
+/*
+ * Start syncing transaction groups by launching the three service
+ * threads: quiesce, sync, and the time-limit (txg push) thread.
+ */
+void
+txg_sync_start(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+
+	mutex_enter(&tx->tx_sync_lock);
+
+	dprintf("pool %p\n", dp);
+
+	ASSERT(tx->tx_threads == 0);
+
+	/* One thread per txg state transition, plus the timer. */
+	tx->tx_threads = 3;
+
+	tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
+	    dp, 0, &p0, TS_RUN, minclsyspri);
+
+	tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread,
+	    dp, 0, &p0, TS_RUN, minclsyspri);
+
+	tx->tx_timelimit_thread = thread_create(NULL, 0, txg_timelimit_thread,
+	    dp, 0, &p0, TS_RUN, minclsyspri);
+
+	mutex_exit(&tx->tx_sync_lock);
+}
+
+/* Register the calling service thread with CPR and take tx_sync_lock. */
+static void
+txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr)
+{
+	CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG);
+	mutex_enter(&tx->tx_sync_lock);
+}
+
+/*
+ * Clear the thread pointer, wake txg_sync_stop(), and terminate the
+ * calling thread.  Never returns.
+ */
+static void
+txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
+{
+	ASSERT(*tpp != NULL);
+	*tpp = NULL;
+	tx->tx_threads--;
+	cv_broadcast(&tx->tx_exit_cv);
+	CALLB_CPR_EXIT(cpr);		/* drops &tx->tx_sync_lock */
+	thread_exit();
+}
+
+/*
+ * CPR-safe wait on 'cv'; if 'secmax' is nonzero, wait at most that many
+ * seconds.  Called with tx_sync_lock held; returns with it held.
+ */
+static void
+txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, int secmax)
+{
+	CALLB_CPR_SAFE_BEGIN(cpr);
+
+	if (secmax)
+		(void) cv_timedwait(cv, &tx->tx_sync_lock, lbolt + secmax * hz);
+	else
+		cv_wait(cv, &tx->tx_sync_lock);
+
+	CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
+}
+
+/*
+ * Stop syncing transaction groups: drain outstanding work, then wake
+ * all three service threads and wait for each to exit.
+ */
+void
+txg_sync_stop(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+
+	dprintf("pool %p\n", dp);
+	/*
+	 * Finish off any work in progress.
+	 */
+	ASSERT(tx->tx_threads == 3);
+	txg_wait_synced(dp, 0);
+
+	/*
+	 * Wake all 3 sync threads (one per state) and wait for them to die.
+	 */
+	mutex_enter(&tx->tx_sync_lock);
+
+	ASSERT(tx->tx_threads == 3);
+
+	tx->tx_exiting = 1;
+
+	cv_broadcast(&tx->tx_quiesce_more_cv);
+	cv_broadcast(&tx->tx_quiesce_done_cv);
+	cv_broadcast(&tx->tx_sync_more_cv);
+	cv_broadcast(&tx->tx_timeout_exit_cv);
+
+	while (tx->tx_threads != 0)
+		cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock);
+
+	/* Reset so the threads can be restarted by txg_sync_start(). */
+	tx->tx_exiting = 0;
+
+	mutex_exit(&tx->tx_sync_lock);
+}
+
+/*
+ * Take a hold on the currently open txg; returns its number and fills
+ * in the caller's handle.  NOTE: this CPU's tc_lock is intentionally
+ * left held until txg_rele_to_quiesce() is called.
+ */
+uint64_t
+txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
+{
+	tx_state_t *tx = &dp->dp_tx;
+	tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
+	uint64_t txg;
+
+	mutex_enter(&tc->tc_lock);
+
+	txg = tx->tx_open_txg;
+	tc->tc_count[txg & TXG_MASK]++;
+
+	th->th_cpu = tc;
+	th->th_txg = txg;
+
+	return (txg);
+}
+
+/* Drop the tc_lock taken by txg_hold_open(); the hold itself remains. */
+void
+txg_rele_to_quiesce(txg_handle_t *th)
+{
+	tx_cpu_t *tc = th->th_cpu;
+
+	mutex_exit(&tc->tc_lock);
+}
+
+/*
+ * Release the hold; when the last hold on this txg drops, wake anyone
+ * (i.e. txg_quiesce()) waiting for the group to drain.
+ */
+void
+txg_rele_to_sync(txg_handle_t *th)
+{
+	tx_cpu_t *tc = th->th_cpu;
+	int g = th->th_txg & TXG_MASK;
+
+	mutex_enter(&tc->tc_lock);
+	ASSERT(tc->tc_count[g] != 0);
+	if (--tc->tc_count[g] == 0)
+		cv_broadcast(&tc->tc_cv[g]);
+	mutex_exit(&tc->tc_lock);
+
+	th->th_cpu = NULL;	/* defensive */
+}
+
+/*
+ * Close 'txg' to new entrants by advancing tx_open_txg, then wait for
+ * every existing hold on it to be released via txg_rele_to_sync().
+ * Called by the quiesce thread with tx_sync_lock dropped.
+ */
+static void
+txg_quiesce(dsl_pool_t *dp, uint64_t txg)
+{
+	tx_state_t *tx = &dp->dp_tx;
+	int g = txg & TXG_MASK;
+	int c;
+
+	/*
+	 * Grab all tx_cpu locks so nobody else can get into this txg.
+	 */
+	for (c = 0; c < max_ncpus; c++)
+		mutex_enter(&tx->tx_cpu[c].tc_lock);
+
+	ASSERT(txg == tx->tx_open_txg);
+	tx->tx_open_txg++;
+
+	/*
+	 * Now that we've incremented tx_open_txg, we can let threads
+	 * enter the next transaction group.
+	 */
+	for (c = 0; c < max_ncpus; c++)
+		mutex_exit(&tx->tx_cpu[c].tc_lock);
+
+	/*
+	 * Quiesce the transaction group by waiting for everyone to txg_exit().
+	 */
+	for (c = 0; c < max_ncpus; c++) {
+		tx_cpu_t *tc = &tx->tx_cpu[c];
+		mutex_enter(&tc->tc_lock);
+		while (tc->tc_count[g] != 0)
+			cv_wait(&tc->tc_cv[g], &tc->tc_lock);
+		mutex_exit(&tc->tc_lock);
+	}
+}
+
+/*
+ * Service thread: waits for the quiesce thread to hand off a quiesced
+ * txg, then syncs it via spa_sync().  tx_suspend is write-held while
+ * tx_syncing_txg/tx_synced_txg are updated so txg_suspend() readers
+ * see a consistent view.
+ */
+static void
+txg_sync_thread(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+	callb_cpr_t cpr;
+
+	txg_thread_enter(tx, &cpr);
+
+	for (;;) {
+		uint64_t txg;
+
+		/*
+		 * We sync when there's someone waiting on us, or the
+		 * quiesce thread has handed off a txg to us.
+		 */
+		while (!tx->tx_exiting &&
+		    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
+		    tx->tx_quiesced_txg == 0) {
+			dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
+			    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
+			txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, 0);
+		}
+
+		/*
+		 * Wait until the quiesce thread hands off a txg to us,
+		 * prompting it to do so if necessary.
+		 */
+		while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
+			if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
+				tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
+			cv_broadcast(&tx->tx_quiesce_more_cv);
+			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
+		}
+
+		if (tx->tx_exiting)
+			txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
+
+		rw_enter(&tx->tx_suspend, RW_WRITER);
+
+		/*
+		 * Consume the quiesced txg which has been handed off to
+		 * us.  This may cause the quiescing thread to now be
+		 * able to quiesce another txg, so we must signal it.
+		 */
+		txg = tx->tx_quiesced_txg;
+		tx->tx_quiesced_txg = 0;
+		tx->tx_syncing_txg = txg;
+		cv_broadcast(&tx->tx_quiesce_more_cv);
+		rw_exit(&tx->tx_suspend);
+
+		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+		    txg, tx->tx_quiesce_txg_waiting,
+		    tx->tx_sync_txg_waiting);
+		/* Drop the lock while syncing -- spa_sync() can be slow. */
+		mutex_exit(&tx->tx_sync_lock);
+		spa_sync(dp->dp_spa, txg);
+		mutex_enter(&tx->tx_sync_lock);
+		rw_enter(&tx->tx_suspend, RW_WRITER);
+		tx->tx_synced_txg = txg;
+		tx->tx_syncing_txg = 0;
+		rw_exit(&tx->tx_suspend);
+		cv_broadcast(&tx->tx_sync_done_cv);
+	}
+}
+
+/*
+ * Service thread: quiesces the open txg when requested and hands the
+ * quiesced group off to the sync thread, one txg at a time.
+ */
+static void
+txg_quiesce_thread(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+	callb_cpr_t cpr;
+
+	txg_thread_enter(tx, &cpr);
+
+	for (;;) {
+		uint64_t txg;
+
+		/*
+		 * We quiesce when there's someone waiting on us.
+		 * However, we can only have one txg in "quiescing" or
+		 * "quiesced, waiting to sync" state.  So we wait until
+		 * the "quiesced, waiting to sync" txg has been consumed
+		 * by the sync thread.
+		 */
+		while (!tx->tx_exiting &&
+		    (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
+		    tx->tx_quiesced_txg != 0))
+			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);
+
+		if (tx->tx_exiting)
+			txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);
+
+		txg = tx->tx_open_txg;
+		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+		    txg, tx->tx_quiesce_txg_waiting,
+		    tx->tx_sync_txg_waiting);
+		/* txg_quiesce() blocks on active holders; drop the lock. */
+		mutex_exit(&tx->tx_sync_lock);
+		txg_quiesce(dp, txg);
+		mutex_enter(&tx->tx_sync_lock);
+
+		/*
+		 * Hand this txg off to the sync thread.
+		 */
+		dprintf("quiesce done, handing off txg %llu\n", txg);
+		tx->tx_quiesced_txg = txg;
+		cv_broadcast(&tx->tx_sync_more_cv);
+		cv_broadcast(&tx->tx_quiesce_done_cv);
+	}
+}
+
+/*
+ * Block until 'txg' (or, if txg == 0, the currently open txg) has been
+ * synced to disk, prodding the sync thread as needed.
+ */
+void
+txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
+{
+	tx_state_t *tx = &dp->dp_tx;
+
+	mutex_enter(&tx->tx_sync_lock);
+	ASSERT(tx->tx_threads == 3);
+	if (txg == 0)
+		txg = tx->tx_open_txg;
+	if (tx->tx_sync_txg_waiting < txg)
+		tx->tx_sync_txg_waiting = txg;
+	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+	    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
+	while (tx->tx_synced_txg < txg) {
+		dprintf("broadcasting sync more "
+		    "tx_synced=%llu waiting=%llu dp=%p\n",
+		    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
+		cv_broadcast(&tx->tx_sync_more_cv);
+		cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
+	}
+	mutex_exit(&tx->tx_sync_lock);
+}
+
+/*
+ * Block until 'txg' (or, if txg == 0, the next txg) is the open group,
+ * prodding the quiesce thread to push out the current one.
+ */
+void
+txg_wait_open(dsl_pool_t *dp, uint64_t txg)
+{
+	tx_state_t *tx = &dp->dp_tx;
+
+	mutex_enter(&tx->tx_sync_lock);
+	ASSERT(tx->tx_threads == 3);
+	if (txg == 0)
+		txg = tx->tx_open_txg + 1;
+	if (tx->tx_quiesce_txg_waiting < txg)
+		tx->tx_quiesce_txg_waiting = txg;
+	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+	    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
+	while (tx->tx_open_txg < txg) {
+		cv_broadcast(&tx->tx_quiesce_more_cv);
+		cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
+	}
+	mutex_exit(&tx->tx_sync_lock);
+}
+
+/*
+ * Service thread: roughly every txg_time seconds, requests that the
+ * open txg be pushed out, bounding the amount of change per group.
+ */
+static void
+txg_timelimit_thread(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+	callb_cpr_t cpr;
+
+	txg_thread_enter(tx, &cpr);
+
+	while (!tx->tx_exiting) {
+		uint64_t txg = tx->tx_open_txg + 1;
+
+		/* Sleep for up to txg_time seconds (or until stop). */
+		txg_thread_wait(tx, &cpr, &tx->tx_timeout_exit_cv, txg_time);
+
+		if (tx->tx_quiesce_txg_waiting < txg)
+			tx->tx_quiesce_txg_waiting = txg;
+
+		while (!tx->tx_exiting && tx->tx_open_txg < txg) {
+			dprintf("pushing out %llu\n", txg);
+			cv_broadcast(&tx->tx_quiesce_more_cv);
+			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
+		}
+	}
+	txg_thread_exit(tx, &cpr, &tx->tx_timelimit_thread);
+}
+
+/*
+ * Returns nonzero if someone is waiting for the open txg to advance
+ * but it hasn't yet (i.e. the quiesce pipeline is behind).
+ */
+int
+txg_stalled(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+	return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg);
+}
+
+/* Block sync-state transitions; paired with txg_resume(). */
+void
+txg_suspend(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+	/* XXX some code paths suspend when they are already suspended! */
+	rw_enter(&tx->tx_suspend, RW_READER);
+}
+
+/* Release the hold taken by txg_suspend(). */
+void
+txg_resume(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+	rw_exit(&tx->tx_suspend);
+}
+
+/*
+ * Per-txg object lists.  Objects embed a txg_node_t at 'offset'; each
+ * node carries one link and membership flag per in-flight txg state.
+ */
+void
+txg_list_create(txg_list_t *tl, size_t offset)
+{
+	int t;
+
+	mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	/* Byte offset of the embedded txg_node_t within each object. */
+	tl->tl_offset = offset;
+
+	for (t = 0; t < TXG_SIZE; t++)
+		tl->tl_head[t] = NULL;
+}
+
+/* Destroy a txg list; all per-txg sublists must already be empty. */
+void
+txg_list_destroy(txg_list_t *tl)
+{
+	int t;
+
+	for (t = 0; t < TXG_SIZE; t++)
+		ASSERT(txg_list_empty(tl, t));
+
+	mutex_destroy(&tl->tl_lock);
+}
+
+/* Returns nonzero if the sublist for 'txg' has no entries. */
+int
+txg_list_empty(txg_list_t *tl, uint64_t txg)
+{
+	return (tl->tl_head[txg & TXG_MASK] == NULL);
+}
+
+/*
+ * Add an entry to the list.
+ * Returns 0 if it's a new entry, 1 if it's already there.
+ */
+int
+txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
+{
+	int t = txg & TXG_MASK;
+	/* Locate the txg_node_t embedded in the object. */
+	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+	int already_on_list;
+
+	mutex_enter(&tl->tl_lock);
+	already_on_list = tn->tn_member[t];
+	if (!already_on_list) {
+		/* Push onto the head of this txg's singly linked list. */
+		tn->tn_member[t] = 1;
+		tn->tn_next[t] = tl->tl_head[t];
+		tl->tl_head[t] = tn;
+	}
+	mutex_exit(&tl->tl_lock);
+
+	return (already_on_list);
+}
+
+/*
+ * Remove the head of the list and return it, or NULL if the list for
+ * this txg is empty.
+ */
+void *
+txg_list_remove(txg_list_t *tl, uint64_t txg)
+{
+	int t = txg & TXG_MASK;
+	txg_node_t *tn;
+	void *p = NULL;
+
+	mutex_enter(&tl->tl_lock);
+	if ((tn = tl->tl_head[t]) != NULL) {
+		/* Convert the embedded node back to the object pointer. */
+		p = (char *)tn - tl->tl_offset;
+		tl->tl_head[t] = tn->tn_next[t];
+		tn->tn_next[t] = NULL;
+		tn->tn_member[t] = 0;
+	}
+	mutex_exit(&tl->tl_lock);
+
+	return (p);
+}
+
+/*
+ * Remove a specific item from the list and return it, or NULL if 'p'
+ * is not on the list for this txg.
+ */
+void *
+txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
+{
+	int t = txg & TXG_MASK;
+	txg_node_t *tn, **tp;
+
+	mutex_enter(&tl->tl_lock);
+
+	/* Linear scan, tracking the link to patch on removal. */
+	for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) {
+		if ((char *)tn - tl->tl_offset == p) {
+			*tp = tn->tn_next[t];
+			tn->tn_next[t] = NULL;
+			tn->tn_member[t] = 0;
+			mutex_exit(&tl->tl_lock);
+			return (p);
+		}
+	}
+
+	mutex_exit(&tl->tl_lock);
+
+	return (NULL);
+}
+
+/*
+ * Returns nonzero if 'p' is on the list for 'txg'.  Reads the flag
+ * without tl_lock, so the result is only advisory under concurrency.
+ */
+int
+txg_list_member(txg_list_t *tl, void *p, uint64_t txg)
+{
+	int t = txg & TXG_MASK;
+	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+
+	return (tn->tn_member[t]);
+}
+
+/*
+ * Walk a txg list -- only safe if you know it's not changing.
+ */
+void *
+txg_list_head(txg_list_t *tl, uint64_t txg)
+{
+	int t = txg & TXG_MASK;
+	txg_node_t *tn = tl->tl_head[t];
+
+	return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
+}
+
+/* Successor of 'p' on the list for 'txg'; same caveat as above. */
+void *
+txg_list_next(txg_list_t *tl, void *p, uint64_t txg)
+{
+	int t = txg & TXG_MASK;
+	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+
+	tn = tn->tn_next[t];
+
+	return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
+}
diff --git a/usr/src/uts/common/fs/zfs/uberblock.c b/usr/src/uts/common/fs/zfs/uberblock.c
new file mode 100644
index 0000000000..63bff0ae4b
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/uberblock.c
@@ -0,0 +1,67 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/uberblock_impl.h>
+#include <sys/vdev_impl.h>
+
+/* Keep the uberblock version in a variable so we can get at it with mdb */
+static uint64_t uberblock_version = UBERBLOCK_VERSION;
+
+/*
+ * Validate an uberblock read from disk, byteswapping it in place first
+ * if it was written with the opposite endianness.  Returns 0 on
+ * success, EINVAL on bad magic, ENOTSUP on unsupported version.
+ */
+int
+uberblock_verify(uberblock_t *ub)
+{
+	if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC))
+		byteswap_uint64_array(ub, sizeof (uberblock_t));
+
+	if (ub->ub_magic != UBERBLOCK_MAGIC)
+		return (EINVAL);
+
+	if (ub->ub_version != UBERBLOCK_VERSION)
+		return (ENOTSUP);
+
+	return (0);
+}
+
+/*
+ * Update the uberblock and return a boolean value indicating whether
+ * anything changed in this transaction group.
+ */
+int
+uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg)
+{
+	ASSERT(ub->ub_txg < txg);
+
+	ub->ub_magic = UBERBLOCK_MAGIC;
+	ub->ub_version = UBERBLOCK_VERSION;
+	ub->ub_txg = txg;
+	ub->ub_guid_sum = rvd->vdev_guid_sum;
+	ub->ub_timestamp = gethrestime_sec();
+
+	/* "Changed" means the root block pointer was born in this txg. */
+	return (ub->ub_rootbp.blk_birth == txg);
+}
diff --git a/usr/src/uts/common/fs/zfs/unique.c b/usr/src/uts/common/fs/zfs/unique.c
new file mode 100644
index 0000000000..56fbddd78e
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/unique.c
@@ -0,0 +1,107 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <sys/unique.h>
+
+/* Global set of unique values currently in use, ordered by value. */
+static avl_tree_t unique_avl;
+static kmutex_t unique_mtx;
+
+/* One reserved value; un_link threads it into unique_avl. */
+typedef struct unique {
+	avl_node_t un_link;
+	uint64_t un_value;
+} unique_t;
+
+#define	UNIQUE_MASK ((1ULL << UNIQUE_BITS) - 1)
+
+/* AVL comparator: total order on un_value. */
+static int
+unique_compare(const void *a, const void *b)
+{
+	const unique_t *una = a;
+	const unique_t *unb = b;
+
+	if (una->un_value < unb->un_value)
+		return (-1);
+	if (una->un_value > unb->un_value)
+		return (+1);
+	return (0);
+}
+
+/* Set up the global AVL tree of in-use unique values. */
+void
+unique_init(void)
+{
+	avl_create(&unique_avl, unique_compare,
+	    sizeof (unique_t), offsetof(unique_t, un_link));
+}
+
+/* Allocate a brand-new unique value (0 forces random generation). */
+uint64_t
+unique_create(void)
+{
+	return (unique_insert(0));
+}
+
+/*
+ * Reserve 'value' as unique; if it is 0, out of range, or already
+ * taken, pick random UNIQUE_MASK-bounded values until one is free.
+ * Returns the value actually reserved.
+ */
+uint64_t
+unique_insert(uint64_t value)
+{
+	avl_index_t idx;
+	unique_t *un = kmem_alloc(sizeof (unique_t), KM_SLEEP);
+
+	un->un_value = value;
+
+	mutex_enter(&unique_mtx);
+	while (un->un_value == 0 || un->un_value & ~UNIQUE_MASK ||
+	    avl_find(&unique_avl, un, &idx)) {
+		/*
+		 * Safe to drop the lock while regenerating: 'un' is not
+		 * in the tree yet, and the loop re-checks under the lock.
+		 */
+		mutex_exit(&unique_mtx);
+		(void) random_get_pseudo_bytes((void*)&un->un_value,
+		    sizeof (un->un_value));
+		un->un_value &= UNIQUE_MASK;
+		mutex_enter(&unique_mtx);
+	}
+
+	/* 'idx' was set by the avl_find() that returned NULL above. */
+	avl_insert(&unique_avl, un, idx);
+	mutex_exit(&unique_mtx);
+
+	return (un->un_value);
+}
+
+/*
+ * Release a previously reserved unique value.  Silently ignores values
+ * that are not currently reserved.
+ */
+void
+unique_remove(uint64_t value)
+{
+	unique_t un_tofind;
+	unique_t *un;
+
+	un_tofind.un_value = value;
+	mutex_enter(&unique_mtx);
+	un = avl_find(&unique_avl, &un_tofind, NULL);
+	if (un != NULL) {
+		avl_remove(&unique_avl, un);
+		kmem_free(un, sizeof (unique_t));
+	}
+	mutex_exit(&unique_mtx);
+}
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
new file mode 100644
index 0000000000..990c690bff
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -0,0 +1,1738 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/vdev_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/space_map.h>
+#include <sys/zio.h>
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device management.
+ */
+
+/* All known vdev types; terminated by NULL for vdev_getops(). */
+static vdev_ops_t *vdev_ops_table[] = {
+	&vdev_root_ops,
+	&vdev_raidz_ops,
+	&vdev_mirror_ops,
+	&vdev_replacing_ops,
+	&vdev_disk_ops,
+	&vdev_file_ops,
+	&vdev_missing_ops,
+	NULL
+};
+
+/*
+ * Given a vdev type, return the appropriate ops vector.
+ * Returns NULL (the table sentinel) for an unknown type string.
+ */
+static vdev_ops_t *
+vdev_getops(const char *type)
+{
+	vdev_ops_t *ops, **opspp;
+
+	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
+		if (strcmp(ops->vdev_op_type, type) == 0)
+			break;
+
+	return (ops);
+}
+
+/*
+ * Default asize function: return the MAX of psize with the asize of
+ * all children.  This is what's used by anything other than RAID-Z.
+ */
+uint64_t
+vdev_default_asize(vdev_t *vd, uint64_t psize)
+{
+	/* Round the physical size up to this vdev's allocation unit. */
+	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_ashift);
+	uint64_t csize;
+	uint64_t c;
+
+	for (c = 0; c < vd->vdev_children; c++) {
+		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
+		asize = MAX(asize, csize);
+	}
+
+	return (asize);
+}
+
+/* Return top-level vdev 'vdev' of the pool, or NULL if out of range. */
+vdev_t *
+vdev_lookup_top(spa_t *spa, uint64_t vdev)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+
+	if (vdev < rvd->vdev_children)
+		return (rvd->vdev_child[vdev]);
+
+	return (NULL);
+}
+
+/* Depth-first search of the subtree rooted at 'vd' for a device path. */
+vdev_t *
+vdev_lookup_by_path(vdev_t *vd, const char *path)
+{
+	int c;
+	vdev_t *mvd;
+
+	if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
+		return (vd);
+
+	for (c = 0; c < vd->vdev_children; c++)
+		if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
+		    NULL)
+			return (mvd);
+
+	return (NULL);
+}
+
+/*
+ * Depth-first search for a guid; only leaf vdevs (no children) are
+ * matched.
+ */
+vdev_t *
+vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
+{
+	int c;
+	vdev_t *mvd;
+
+	if (vd->vdev_children == 0 && vd->vdev_guid == guid)
+		return (vd);
+
+	for (c = 0; c < vd->vdev_children; c++)
+		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
+		    NULL)
+			return (mvd);
+
+	return (NULL);
+}
+
+/*
+ * Link 'cvd' into 'pvd' at slot cvd->vdev_id, growing the child array
+ * if needed, and propagate its guid sum up through all ancestors.
+ * A NULL parent just detaches cvd (used for the root vdev).
+ */
+void
+vdev_add_child(vdev_t *pvd, vdev_t *cvd)
+{
+	size_t oldsize, newsize;
+	uint64_t id = cvd->vdev_id;
+	vdev_t **newchild;
+
+	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
+	ASSERT(cvd->vdev_parent == NULL);
+
+	cvd->vdev_parent = pvd;
+
+	if (pvd == NULL)
+		return;
+
+	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
+
+	/* Grow the child array to cover slot 'id'; holes stay NULL. */
+	oldsize = pvd->vdev_children * sizeof (vdev_t *);
+	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
+	newsize = pvd->vdev_children * sizeof (vdev_t *);
+
+	newchild = kmem_zalloc(newsize, KM_SLEEP);
+	if (pvd->vdev_child != NULL) {
+		bcopy(pvd->vdev_child, newchild, oldsize);
+		kmem_free(pvd->vdev_child, oldsize);
+	}
+
+	pvd->vdev_child = newchild;
+	pvd->vdev_child[id] = cvd;
+
+	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
+	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
+
+	/*
+	 * Walk up all ancestors to update guid sum.
+	 */
+	for (; pvd != NULL; pvd = pvd->vdev_parent)
+		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
+}
+
+/*
+ * Unlink 'cvd' from 'pvd', freeing the child array if it becomes
+ * entirely empty, and subtract its guid sum from all ancestors.
+ */
+void
+vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
+{
+	int c;
+	uint_t id = cvd->vdev_id;
+
+	ASSERT(cvd->vdev_parent == pvd);
+
+	if (pvd == NULL)
+		return;
+
+	ASSERT(id < pvd->vdev_children);
+	ASSERT(pvd->vdev_child[id] == cvd);
+
+	pvd->vdev_child[id] = NULL;
+	cvd->vdev_parent = NULL;
+
+	/* If no children remain, release the array entirely. */
+	for (c = 0; c < pvd->vdev_children; c++)
+		if (pvd->vdev_child[c])
+			break;
+
+	if (c == pvd->vdev_children) {
+		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
+		pvd->vdev_child = NULL;
+		pvd->vdev_children = 0;
+	}
+
+	/*
+	 * Walk up all ancestors to update guid sum.
+	 */
+	for (; pvd != NULL; pvd = pvd->vdev_parent)
+		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
+}
+
+/*
+ * Remove any holes in the child array, renumbering each surviving
+ * child's vdev_id to its new (dense) slot.
+ */
+void
+vdev_compact_children(vdev_t *pvd)
+{
+	vdev_t **newchild, *cvd;
+	int oldc = pvd->vdev_children;
+	int newc, c;
+
+	ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER));
+
+	/* First pass: count survivors. */
+	for (c = newc = 0; c < oldc; c++)
+		if (pvd->vdev_child[c])
+			newc++;
+
+	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
+
+	/* Second pass: pack and renumber. */
+	for (c = newc = 0; c < oldc; c++) {
+		if ((cvd = pvd->vdev_child[c]) != NULL) {
+			newchild[newc] = cvd;
+			cvd->vdev_id = newc++;
+		}
+	}
+
+	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
+	pvd->vdev_child = newchild;
+	pvd->vdev_children = newc;
+}
+
+/*
+ * Allocate and minimally initialize a vdev_t.  A guid of 0 means
+ * "generate a new nonzero random guid".  The vdev starts CLOSED.
+ */
+static vdev_t *
+vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
+{
+	vdev_t *vd;
+
+	while (guid == 0)
+		guid = spa_get_random(-1ULL);
+
+	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
+
+	vd->vdev_spa = spa;
+	vd->vdev_id = id;
+	vd->vdev_guid = guid;
+	vd->vdev_guid_sum = guid;	/* no children yet */
+	vd->vdev_ops = ops;
+	vd->vdev_state = VDEV_STATE_CLOSED;
+
+	mutex_init(&vd->vdev_io_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&vd->vdev_io_cv, NULL, CV_DEFAULT, NULL);
+	list_create(&vd->vdev_io_pending, sizeof (zio_t),
+	    offsetof(zio_t, io_pending));
+	mutex_init(&vd->vdev_dirty_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
+	/* DTL = dirty time log; both maps cover the full offset range. */
+	space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
+	space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
+	txg_list_create(&vd->vdev_ms_list,
+	    offsetof(struct metaslab, ms_txg_node));
+	txg_list_create(&vd->vdev_dtl_list,
+	    offsetof(struct vdev, vdev_dtl_node));
+	vd->vdev_stat.vs_timestamp = gethrtime();
+
+	return (vd);
+}
+
+/*
+ * Free a vdev_t that has been removed from service: release its
+ * strings, txg lists, DTL space maps, and locks, then the vdev itself.
+ */
+static void
+vdev_free_common(vdev_t *vd)
+{
+	if (vd->vdev_path)
+		spa_strfree(vd->vdev_path);
+	if (vd->vdev_devid)
+		spa_strfree(vd->vdev_devid);
+
+	txg_list_destroy(&vd->vdev_ms_list);
+	txg_list_destroy(&vd->vdev_dtl_list);
+	mutex_enter(&vd->vdev_dtl_lock);
+	/* Vacate before destroy so the maps are empty when torn down. */
+	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
+	space_map_destroy(&vd->vdev_dtl_map);
+	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
+	space_map_destroy(&vd->vdev_dtl_scrub);
+	mutex_exit(&vd->vdev_dtl_lock);
+	mutex_destroy(&vd->vdev_dtl_lock);
+	mutex_destroy(&vd->vdev_dirty_lock);
+	list_destroy(&vd->vdev_io_pending);
+	mutex_destroy(&vd->vdev_io_lock);
+	cv_destroy(&vd->vdev_io_cv);
+
+	kmem_free(vd, sizeof (vdev_t));
+}
+
+/*
+ * Allocate a new vdev.  The 'alloctype' is used to control whether we are
+ * creating a new vdev or loading an existing one - the behavior is slightly
+ * different for each case.  Returns NULL if the nvlist is malformed or
+ * (on load) its id/guid don't check out.
+ */
+vdev_t *
+vdev_alloc(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype)
+{
+	vdev_ops_t *ops;
+	char *type;
+	uint64_t guid = 0;
+	vdev_t *vd;
+
+	ASSERT(spa_config_held(spa, RW_WRITER));
+
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
+		return (NULL);
+
+	if ((ops = vdev_getops(type)) == NULL)
+		return (NULL);
+
+	/*
+	 * If this is a load, get the vdev guid from the nvlist.
+	 * Otherwise, vdev_alloc_common() will generate one for us.
+	 */
+	if (alloctype == VDEV_ALLOC_LOAD) {
+		uint64_t label_id;
+
+		/* The on-label id must match the caller's expected slot. */
+		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
+		    label_id != id)
+			return (NULL);
+
+		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+			return (NULL);
+	}
+
+	vd = vdev_alloc_common(spa, id, guid, ops);
+
+	/* Dup the strings: the lookup returns pointers into the nvlist. */
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
+		vd->vdev_path = spa_strdup(vd->vdev_path);
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
+		vd->vdev_devid = spa_strdup(vd->vdev_devid);
+
+	/*
+	 * If we're a top-level vdev, try to load the allocation parameters.
+	 */
+	if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
+		    &vd->vdev_ms_array);
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
+		    &vd->vdev_ms_shift);
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
+		    &vd->vdev_ashift);
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
+		    &vd->vdev_asize);
+	}
+
+	/*
+	 * If we're a leaf vdev, try to load the DTL object.
+	 */
+	if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) {
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
+		    &vd->vdev_dtl.smo_object);
+	}
+
+	/*
+	 * Add ourselves to the parent's list of children.
+	 */
+	vdev_add_child(parent, vd);
+
+	return (vd);
+}
+
+/*
+ * Tear down a vdev and its entire subtree: close it, detach it from
+ * the dirty-config list and its parent, and free all state.
+ */
+void
+vdev_free(vdev_t *vd)
+{
+	int c;
+
+	/*
+	 * vdev_free() implies closing the vdev first.  This is simpler than
+	 * trying to ensure complicated semantics for all callers.
+	 */
+	vdev_close(vd);
+
+	/*
+	 * It's possible to free a vdev that's been added to the dirty
+	 * list when in the middle of spa_vdev_add().  Handle that case
+	 * correctly here.
+	 */
+	if (vd->vdev_is_dirty)
+		vdev_config_clean(vd);
+
+	/*
+	 * Free all children.
+	 */
+	for (c = 0; c < vd->vdev_children; c++)
+		vdev_free(vd->vdev_child[c]);
+
+	/* Each child's vdev_free() removed itself from our child array. */
+	ASSERT(vd->vdev_child == NULL);
+	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
+
+	/*
+	 * Discard allocation state.
+	 */
+	if (vd == vd->vdev_top)
+		vdev_metaslab_fini(vd);
+
+	ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
+	ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
+
+	/*
+	 * Remove this vdev from its parent's child list.
+	 */
+	vdev_remove_child(vd->vdev_parent, vd);
+
+	ASSERT(vd->vdev_parent == NULL);
+
+	vdev_free_common(vd);
+}
+
+/*
+ * Transfer top-level vdev state from svd to tvd.  Used when inserting
+ * or removing a mirror/replacing vdev changes which vdev is top-level;
+ * metaslab state, stats, per-txg lists, and dirty flags all move over.
+ */
+static void
+vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
+{
+	spa_t *spa = svd->vdev_spa;
+	metaslab_t *msp;
+	vdev_t *vd;
+	int t;
+
+	ASSERT(tvd == tvd->vdev_top);
+
+	tvd->vdev_ms_array = svd->vdev_ms_array;
+	tvd->vdev_ms_shift = svd->vdev_ms_shift;
+	tvd->vdev_ms_count = svd->vdev_ms_count;
+
+	svd->vdev_ms_array = 0;
+	svd->vdev_ms_shift = 0;
+	svd->vdev_ms_count = 0;
+
+	tvd->vdev_mg = svd->vdev_mg;
+	/* Repoint the metaslab group at its new owner. */
+	tvd->vdev_mg->mg_vd = tvd;
+	tvd->vdev_ms = svd->vdev_ms;
+	tvd->vdev_smo = svd->vdev_smo;
+
+	svd->vdev_mg = NULL;
+	svd->vdev_ms = NULL;
+	svd->vdev_smo = NULL;
+
+	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
+	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
+
+	svd->vdev_stat.vs_alloc = 0;
+	svd->vdev_stat.vs_space = 0;
+
+	/* Migrate all per-txg bookkeeping from svd to tvd. */
+	for (t = 0; t < TXG_SIZE; t++) {
+		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
+			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
+		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
+			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
+		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
+			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
+		tvd->vdev_dirty[t] = svd->vdev_dirty[t];
+		svd->vdev_dirty[t] = 0;
+	}
+
+	if (svd->vdev_is_dirty) {
+		vdev_config_clean(svd);
+		vdev_config_dirty(tvd);
+	}
+
+	ASSERT(svd->vdev_io_retry == NULL);
+	ASSERT(list_is_empty(&svd->vdev_io_pending));
+}
+
+/* Recursively set vdev_top to 'tvd' throughout the subtree at 'vd'. */
+static void
+vdev_top_update(vdev_t *tvd, vdev_t *vd)
+{
+	int c;
+
+	if (vd == NULL)
+		return;
+
+	vd->vdev_top = tvd;
+
+	for (c = 0; c < vd->vdev_children; c++)
+		vdev_top_update(tvd, vd->vdev_child[c]);
+}
+
+/*
+ * Add a mirror/replacing vdev above an existing vdev.  'cvd' becomes
+ * the sole child of the new interposed vdev, which takes over cvd's
+ * old slot (and its top-level state, if cvd was top-level).
+ */
+vdev_t *
+vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
+{
+	spa_t *spa = cvd->vdev_spa;
+	vdev_t *pvd = cvd->vdev_parent;
+	vdev_t *mvd;
+
+	ASSERT(spa_config_held(spa, RW_WRITER));
+
+	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
+	vdev_remove_child(pvd, cvd);
+	vdev_add_child(pvd, mvd);
+	/* cvd becomes child 0 of mvd (vdev_children is 0 here). */
+	cvd->vdev_id = mvd->vdev_children;
+	vdev_add_child(mvd, cvd);
+	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
+
+	/* The interposed vdev inherits the child's geometry and state. */
+	mvd->vdev_asize = cvd->vdev_asize;
+	mvd->vdev_ashift = cvd->vdev_ashift;
+	mvd->vdev_state = cvd->vdev_state;
+
+	if (mvd == mvd->vdev_top)
+		vdev_top_transfer(cvd, mvd);
+
+	return (mvd);
+}
+
+/*
+ * Remove a 1-way mirror/replacing vdev from the tree: splice 'cvd'
+ * into its grandparent's slot and free the interposed vdev.
+ */
+void
+vdev_remove_parent(vdev_t *cvd)
+{
+	vdev_t *mvd = cvd->vdev_parent;
+	vdev_t *pvd = mvd->vdev_parent;
+
+	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
+
+	ASSERT(mvd->vdev_children == 1);
+	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
+	    mvd->vdev_ops == &vdev_replacing_ops);
+
+	vdev_remove_child(mvd, cvd);
+	vdev_remove_child(pvd, mvd);
+	/* cvd takes over mvd's old slot in the grandparent. */
+	cvd->vdev_id = mvd->vdev_id;
+	vdev_add_child(pvd, cvd);
+	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
+
+	if (cvd == cvd->vdev_top)
+		vdev_top_transfer(mvd, cvd);
+
+	ASSERT(mvd->vdev_children == 0);
+	vdev_free(mvd);
+}
+
+/*
+ * (Re)initialize the metaslab arrays of a top-level vdev to cover its
+ * current asize, preserving any existing metaslabs and creating new
+ * ones for the added range.  txg == 0 means "loading an existing pool"
+ * and triggers reading the space-map objects from the MOS.
+ */
+void
+vdev_metaslab_init(vdev_t *vd, uint64_t txg)
+{
+	spa_t *spa = vd->vdev_spa;
+	metaslab_class_t *mc = spa_metaslab_class_select(spa);
+	uint64_t c;
+	uint64_t oldc = vd->vdev_ms_count;
+	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
+	space_map_obj_t *smo = vd->vdev_smo;
+	metaslab_t **mspp = vd->vdev_ms;
+
+	dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);
+
+	ASSERT(oldc <= newc);
+
+	vd->vdev_smo = kmem_zalloc(newc * sizeof (*smo), KM_SLEEP);
+	vd->vdev_ms = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
+	vd->vdev_ms_count = newc;
+
+	if (vd->vdev_mg == NULL) {
+		if (txg == 0) {
+			dmu_buf_t *db;
+			uint64_t *ms_array;
+
+			ms_array = kmem_zalloc(newc * sizeof (uint64_t),
+			    KM_SLEEP);
+
+			/* Read the array of space-map object numbers. */
+			dmu_read(spa->spa_meta_objset, vd->vdev_ms_array,
+			    0, newc * sizeof (uint64_t), ms_array);
+
+			for (c = 0; c < newc; c++) {
+				/* 0 == no space map written for this slot. */
+				if (ms_array[c] == 0)
+					continue;
+				db = dmu_bonus_hold(spa->spa_meta_objset,
+				    ms_array[c]);
+				dmu_buf_read(db);
+				ASSERT3U(db->db_size, ==, sizeof (*smo));
+				bcopy(db->db_data, &vd->vdev_smo[c],
+				    db->db_size);
+				ASSERT3U(vd->vdev_smo[c].smo_object, ==,
+				    ms_array[c]);
+				dmu_buf_rele(db);
+			}
+			kmem_free(ms_array, newc * sizeof (uint64_t));
+		}
+		vd->vdev_mg = metaslab_group_create(mc, vd);
+	}
+
+	/* Carry existing metaslabs over into the new (larger) arrays. */
+	for (c = 0; c < oldc; c++) {
+		vd->vdev_smo[c] = smo[c];
+		vd->vdev_ms[c] = mspp[c];
+		mspp[c]->ms_smo = &vd->vdev_smo[c];
+	}
+
+	for (c = oldc; c < newc; c++)
+		metaslab_init(vd->vdev_mg, &vd->vdev_smo[c], &vd->vdev_ms[c],
+		    c << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
+
+	if (oldc != 0) {
+		kmem_free(smo, oldc * sizeof (*smo));
+		kmem_free(mspp, oldc * sizeof (*mspp));
+	}
+
+}
+
+/*
+ * Tear down a vdev's metaslabs and free the vdev_ms/vdev_smo arrays.
+ * Safe to call when the arrays were never allocated.
+ */
+void
+vdev_metaslab_fini(vdev_t *vd)
+{
+ uint64_t m;
+ uint64_t count = vd->vdev_ms_count;
+
+ if (vd->vdev_ms != NULL) {
+ for (m = 0; m < count; m++)
+ metaslab_fini(vd->vdev_ms[m]);
+ kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
+ vd->vdev_ms = NULL;
+ }
+
+ if (vd->vdev_smo != NULL) {
+ kmem_free(vd->vdev_smo, count * sizeof (space_map_obj_t));
+ vd->vdev_smo = NULL;
+ }
+}
+
+/*
+ * Prepare a virtual device for access.
+ *
+ * Opens the device via its vdev_op_open method, updates vdev_state
+ * (HEALTHY/DEGRADED/CANT_OPEN/OFFLINE), and validates or records the
+ * device's usable size (asize) and alignment shift (ashift).
+ * Returns 0 on success or an errno on failure.
+ */
+int
+vdev_open(vdev_t *vd)
+{
+ int error;
+ vdev_knob_t *vk;
+ int c;
+ uint64_t osize = 0;
+ uint64_t asize, psize;
+ uint64_t ashift = -1ULL;
+
+ ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
+ vd->vdev_state == VDEV_STATE_CANT_OPEN ||
+ vd->vdev_state == VDEV_STATE_OFFLINE);
+
+ /*
+ * Decay fault-injection state across reopens: halve the remaining
+ * fault count, or clear any other fault mode entirely.
+ */
+ if (vd->vdev_fault_mode == VDEV_FAULT_COUNT)
+ vd->vdev_fault_arg >>= 1;
+ else
+ vd->vdev_fault_mode = VDEV_FAULT_NONE;
+
+ vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
+
+ /* Reset every tunable knob to its default, clamped to [min, max]. */
+ for (vk = vdev_knob_next(NULL); vk != NULL; vk = vdev_knob_next(vk)) {
+ uint64_t *valp = (uint64_t *)((char *)vd + vk->vk_offset);
+
+ *valp = vk->vk_default;
+ *valp = MAX(*valp, vk->vk_min);
+ *valp = MIN(*valp, vk->vk_max);
+ }
+
+ /* Leaf vdevs get a read-ahead cache and an I/O scheduling queue. */
+ if (vd->vdev_ops->vdev_op_leaf) {
+ vdev_cache_init(vd);
+ vdev_queue_init(vd);
+ vd->vdev_cache_active = B_TRUE;
+ }
+
+ if (vd->vdev_offline) {
+ ASSERT(vd->vdev_children == 0);
+ dprintf("OFFLINE: %s = ENXIO\n", vdev_description(vd));
+ vd->vdev_state = VDEV_STATE_OFFLINE;
+ return (ENXIO);
+ }
+
+ error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
+
+ dprintf("%s = %d, osize %llu, state = %d\n",
+ vdev_description(vd), error, osize, vd->vdev_state);
+
+ if (error) {
+ dprintf("%s in %s failed to open, error %d, aux %d\n",
+ vdev_description(vd),
+ vdev_description(vd->vdev_parent),
+ error,
+ vd->vdev_stat.vs_aux);
+
+ vd->vdev_state = VDEV_STATE_CANT_OPEN;
+ return (error);
+ }
+
+ /* Healthy unless any child failed to come up fully. */
+ vd->vdev_state = VDEV_STATE_HEALTHY;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY)
+ vd->vdev_state = VDEV_STATE_DEGRADED;
+
+ osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
+
+ /*
+ * Leaves must reserve room for the front and back vdev labels;
+ * interior vdevs report the size their children already netted out.
+ */
+ if (vd->vdev_children == 0) {
+ if (osize < SPA_MINDEVSIZE) {
+ vd->vdev_state = VDEV_STATE_CANT_OPEN;
+ vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
+ return (EOVERFLOW);
+ }
+ psize = osize;
+ asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
+ } else {
+ if (osize < SPA_MINDEVSIZE -
+ (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
+ vd->vdev_state = VDEV_STATE_CANT_OPEN;
+ vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
+ return (EOVERFLOW);
+ }
+ psize = 0;
+ asize = osize;
+ }
+
+ vd->vdev_psize = psize;
+
+ if (vd->vdev_asize == 0) {
+ /*
+ * This is the first-ever open, so use the computed values.
+ */
+ vd->vdev_asize = asize;
+ vd->vdev_ashift = ashift;
+ } else {
+ /*
+ * Make sure the alignment requirement hasn't increased.
+ */
+ if (ashift > vd->vdev_ashift) {
+ dprintf("%s: ashift grew\n", vdev_description(vd));
+ vd->vdev_state = VDEV_STATE_CANT_OPEN;
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+
+ /*
+ * Make sure the device hasn't shrunk.
+ */
+ if (asize < vd->vdev_asize) {
+ dprintf("%s: device shrank\n", vdev_description(vd));
+ vd->vdev_state = VDEV_STATE_CANT_OPEN;
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+
+ /*
+ * If all children are healthy and the asize has increased,
+ * then we've experienced dynamic LUN growth.
+ */
+ if (vd->vdev_state == VDEV_STATE_HEALTHY &&
+ asize > vd->vdev_asize) {
+ dprintf("%s: device grew\n", vdev_description(vd));
+ vd->vdev_asize = asize;
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Close a virtual device.
+ *
+ * Requires that no I/O is pending on the vdev.  Tears down the cache
+ * and queue created by vdev_open() and moves the state to OFFLINE or
+ * CLOSED depending on the vdev's offline setting.
+ */
+void
+vdev_close(vdev_t *vd)
+{
+ ASSERT3P(list_head(&vd->vdev_io_pending), ==, NULL);
+
+ vd->vdev_ops->vdev_op_close(vd);
+
+ if (vd->vdev_cache_active) {
+ vdev_cache_fini(vd);
+ vdev_queue_fini(vd);
+ vd->vdev_cache_active = B_FALSE;
+ }
+
+ if (vd->vdev_offline)
+ vd->vdev_state = VDEV_STATE_OFFLINE;
+ else
+ vd->vdev_state = VDEV_STATE_CLOSED;
+}
+
+/*
+ * Close and reopen a top-level vdev (or, if vd is the root vdev,
+ * every top-level vdev in the pool).  If 'rq' is non-NULL, the caller
+ * receives ownership of the vdev's pending retry I/O list.
+ * Afterwards the root vdev's state is recomputed as the minimum of
+ * its children's states.
+ */
+void
+vdev_reopen(vdev_t *vd, zio_t **rq)
+{
+ vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+ int c;
+
+ if (vd == rvd) {
+ ASSERT(rq == NULL);
+ for (c = 0; c < rvd->vdev_children; c++)
+ vdev_reopen(rvd->vdev_child[c], NULL);
+ return;
+ }
+
+ /* only valid for top-level vdevs */
+ ASSERT3P(vd, ==, vd->vdev_top);
+
+ /*
+ * vdev_state can change when spa_config_lock is held as writer,
+ * or when it's held as reader and we're doing a vdev_reopen().
+ * To handle the latter case, we grab rvd's io_lock to serialize
+ * reopens. This ensures that there's never more than one vdev
+ * state changer active at a time.
+ */
+ mutex_enter(&rvd->vdev_io_lock);
+
+ /* Wait for in-flight I/O to drain before closing the device. */
+ mutex_enter(&vd->vdev_io_lock);
+ while (list_head(&vd->vdev_io_pending) != NULL)
+ cv_wait(&vd->vdev_io_cv, &vd->vdev_io_lock);
+ vdev_close(vd);
+ (void) vdev_open(vd);
+ if (rq != NULL) {
+ *rq = vd->vdev_io_retry;
+ vd->vdev_io_retry = NULL;
+ }
+ mutex_exit(&vd->vdev_io_lock);
+
+ /*
+ * Reassess root vdev's health.
+ */
+ rvd->vdev_state = VDEV_STATE_HEALTHY;
+ for (c = 0; c < rvd->vdev_children; c++) {
+ uint64_t state = rvd->vdev_child[c]->vdev_state;
+ rvd->vdev_state = MIN(rvd->vdev_state, state);
+ }
+
+ mutex_exit(&rvd->vdev_io_lock);
+}
+
+/*
+ * Open a vdev for the first time as part of pool/vdev creation and
+ * write initial labels to every leaf.  Unlike normal opens, any
+ * component that fails to open fails the whole create.
+ * Returns 0 on success or an errno (the vdev is closed on failure).
+ */
+int
+vdev_create(vdev_t *vd, uint64_t txg)
+{
+ int error;
+
+ /*
+ * Normally, partial opens (e.g. of a mirror) are allowed.
+ * For a create, however, we want to fail the request if
+ * there are any components we can't open.
+ */
+ error = vdev_open(vd);
+
+ if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
+ vdev_close(vd);
+ return (error ? error : ENXIO);
+ }
+
+ /*
+ * Recursively initialize all labels.
+ */
+ if ((error = vdev_label_init(vd, txg)) != 0) {
+ vdev_close(vd);
+ return (error);
+ }
+
+ return (0);
+}
+
+/*
+ * This is the latter half of vdev_create(). It is distinct because it
+ * involves initiating transactions in order to do metaslab creation.
+ * For creation, we want to try to create all vdevs at once and then undo it
+ * if anything fails; this is much harder if we have pending transactions.
+ */
+void
+vdev_init(vdev_t *vd, uint64_t txg)
+{
+ /*
+ * Aim for roughly 200 metaslabs per vdev.
+ */
+ vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
+ vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
+
+ /*
+ * Initialize the vdev's metaslabs.
+ */
+ vdev_metaslab_init(vd, txg);
+}
+
+/*
+ * Mark a vdev's top-level ancestor dirty for the given txg with the
+ * given VDD_* flags, adding it to the per-pool vdev txg list the first
+ * time any flag is set for that txg.
+ */
+void
+vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg)
+{
+ vdev_t *tvd = vd->vdev_top;
+
+ mutex_enter(&tvd->vdev_dirty_lock);
+ if ((tvd->vdev_dirty[txg & TXG_MASK] & flags) != flags) {
+ tvd->vdev_dirty[txg & TXG_MASK] |= flags;
+ (void) txg_list_add(&tvd->vdev_spa->spa_vdev_txg_list,
+ tvd, txg);
+ }
+ mutex_exit(&tvd->vdev_dirty_lock);
+}
+
+/*
+ * Add the range [txg, txg + size) to a DTL (dirty time log) space map,
+ * unless it is already wholly contained.
+ */
+void
+vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
+{
+ mutex_enter(sm->sm_lock);
+ if (!space_map_contains(sm, txg, size))
+ space_map_add(sm, txg, size);
+ mutex_exit(sm->sm_lock);
+}
+
+/*
+ * Return nonzero if the DTL space map contains [txg, txg + size).
+ */
+int
+vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
+{
+ int dirty;
+
+ /*
+ * Quick test without the lock -- covers the common case that
+ * there are no dirty time segments.
+ */
+ if (sm->sm_space == 0)
+ return (0);
+
+ mutex_enter(sm->sm_lock);
+ dirty = space_map_contains(sm, txg, size);
+ mutex_exit(sm->sm_lock);
+
+ return (dirty);
+}
+
+/*
+ * Reassess DTLs after a config change or scrub completion.
+ *
+ * For a leaf: excise everything below scrub_txg from the DTL and fold in
+ * whatever couldn't be scrubbed, then (if txg != 0) schedule the DTL for
+ * syncing.  For an interior vdev: rebuild its DTLs as the union of its
+ * children's, recursing first.  Caller holds the config lock as writer.
+ */
+void
+vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
+{
+ int c;
+
+ ASSERT(spa_config_held(vd->vdev_spa, RW_WRITER));
+
+ if (vd->vdev_children == 0) {
+ mutex_enter(&vd->vdev_dtl_lock);
+ /*
+ * We've successfully scrubbed everything up to scrub_txg.
+ * Therefore, excise all old DTLs up to that point, then
+ * fold in the DTLs for everything we couldn't scrub.
+ */
+ if (scrub_txg != 0) {
+ space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
+ space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
+ }
+ if (scrub_done)
+ space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
+ mutex_exit(&vd->vdev_dtl_lock);
+ if (txg != 0) {
+ vdev_t *tvd = vd->vdev_top;
+ vdev_dirty(tvd, VDD_DTL, txg);
+ (void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
+ }
+ return;
+ }
+
+ /* Interior vdev: recompute DTLs from scratch as union of children. */
+ mutex_enter(&vd->vdev_dtl_lock);
+ space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
+ space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
+ mutex_enter(&vd->vdev_dtl_lock);
+ space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
+ space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
+ mutex_exit(&vd->vdev_dtl_lock);
+ }
+}
+
+/*
+ * Load a leaf vdev's DTL from disk: read the space map object header
+ * out of its bonus buffer, then load the allocated segments into the
+ * in-core vdev_dtl_map.  Returns 0 if there is no DTL object, else the
+ * space_map_load() result.
+ */
+static int
+vdev_dtl_load(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ space_map_obj_t *smo = &vd->vdev_dtl;
+ dmu_buf_t *db;
+ int error;
+
+ ASSERT(vd->vdev_children == 0);
+
+ if (smo->smo_object == 0)
+ return (0);
+
+ db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
+ dmu_buf_read(db);
+ ASSERT3U(db->db_size, ==, sizeof (*smo));
+ bcopy(db->db_data, smo, db->db_size);
+ dmu_buf_rele(db);
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ error = space_map_load(&vd->vdev_dtl_map, smo, SM_ALLOC,
+ spa->spa_meta_objset, smo->smo_objsize, smo->smo_alloc);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ return (error);
+}
+
+/*
+ * Write a leaf vdev's DTL out to disk for the given txg.
+ *
+ * If the vdev has been detached, its DTL object is freed instead.
+ * Otherwise the old on-disk contents are freed and the current in-core
+ * DTL is copied (under a private lock, so vdev_dtl_lock is held only
+ * briefly) into a temporary space map which is then synced, and the
+ * updated space map header is written back to the bonus buffer.
+ */
+void
+vdev_dtl_sync(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ space_map_obj_t *smo = &vd->vdev_dtl;
+ space_map_t *sm = &vd->vdev_dtl_map;
+ space_map_t smsync;
+ kmutex_t smlock;
+ avl_tree_t *t = &sm->sm_root;
+ space_seg_t *ss;
+ dmu_buf_t *db;
+ dmu_tx_t *tx;
+
+ dprintf("%s in txg %llu pass %d\n",
+ vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ if (vd->vdev_detached) {
+ if (smo->smo_object != 0) {
+ int err = dmu_object_free(spa->spa_meta_objset,
+ smo->smo_object, tx);
+ ASSERT3U(err, ==, 0);
+ smo->smo_object = 0;
+ }
+ dmu_tx_commit(tx);
+ return;
+ }
+
+ if (smo->smo_object == 0) {
+ ASSERT(smo->smo_objsize == 0);
+ ASSERT(smo->smo_alloc == 0);
+ /* First DTL write for this vdev: allocate the object. */
+ smo->smo_object = dmu_object_alloc(spa->spa_meta_objset,
+ DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
+ DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
+ ASSERT(smo->smo_object != 0);
+ vdev_config_dirty(vd->vdev_top);
+ }
+
+ /* Rewrite from scratch: free the old on-disk representation. */
+ dmu_free_range(spa->spa_meta_objset, smo->smo_object,
+ 0, smo->smo_objsize, tx);
+
+ mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
+
+ space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
+ &smlock);
+
+ mutex_enter(&smlock);
+
+ /* Snapshot the live DTL into the private map under vdev_dtl_lock. */
+ mutex_enter(&vd->vdev_dtl_lock);
+ for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss))
+ space_map_add(&smsync, ss->ss_start, ss->ss_end - ss->ss_start);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ smo->smo_objsize = 0;
+ smo->smo_alloc = smsync.sm_space;
+
+ space_map_sync(&smsync, NULL, smo, SM_ALLOC, spa->spa_meta_objset, tx);
+ space_map_destroy(&smsync);
+
+ mutex_exit(&smlock);
+ mutex_destroy(&smlock);
+
+ /* Persist the updated space map header in the bonus buffer. */
+ db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
+ dmu_buf_will_dirty(db, tx);
+ ASSERT3U(db->db_size, ==, sizeof (*smo));
+ bcopy(smo, db->db_data, db->db_size);
+ dmu_buf_rele(db);
+
+ dmu_tx_commit(tx);
+}
+
+/*
+ * Load a vdev subtree's persistent state during pool open/import.
+ *
+ * Recursively loads children, validates each leaf's label (pool GUID,
+ * vdev GUID, pool state), initializes metaslabs for top-level vdevs,
+ * and loads each leaf's DTL.  Most validation failures are reported by
+ * setting the vdev state to CANT_OPEN and returning 0; EBADF is
+ * returned only when the pool was exported/destroyed behind our back,
+ * so the caller can drop it from the config cache.
+ */
+int
+vdev_load(vdev_t *vd, int import)
+{
+ spa_t *spa = vd->vdev_spa;
+ int c, error;
+ nvlist_t *label;
+ uint64_t guid, state;
+
+ dprintf("loading %s\n", vdev_description(vd));
+
+ /*
+ * Recursively load all children.
+ */
+ for (c = 0; c < vd->vdev_children; c++)
+ if ((error = vdev_load(vd->vdev_child[c], import)) != 0)
+ return (error);
+
+ /*
+ * If this is a leaf vdev, make sure it agrees with its disk labels.
+ */
+ if (vd->vdev_ops->vdev_op_leaf) {
+
+ if (vdev_is_dead(vd))
+ return (0);
+
+ /*
+ * XXX state transitions don't propagate to parent here.
+ * Also, merely setting the state isn't sufficient because
+ * it's not persistent; a vdev_reopen() would make us
+ * forget all about it.
+ */
+ if ((label = vdev_label_read_config(vd)) == NULL) {
+ dprintf("can't load label config\n");
+ vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ return (0);
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
+ &guid) != 0 || guid != spa_guid(spa)) {
+ dprintf("bad or missing pool GUID (%llu)\n", guid);
+ vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ return (0);
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) ||
+ guid != vd->vdev_guid) {
+ dprintf("bad or missing vdev guid (%llu != %llu)\n",
+ guid, vd->vdev_guid);
+ vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ return (0);
+ }
+
+ /*
+ * If we find a vdev with a matching pool guid and vdev guid,
+ * but the pool state is not active, it indicates that the user
+ * exported or destroyed the pool without affecting the config
+ * cache (if / was mounted readonly, for example). In this
+ * case, immediately return EBADF so the caller can remove it
+ * from the config.
+ */
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ &state)) {
+ dprintf("missing pool state\n");
+ vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ return (0);
+ }
+
+ if (state != POOL_STATE_ACTIVE &&
+ (!import || state != POOL_STATE_EXPORTED)) {
+ dprintf("pool state not active (%llu)\n", state);
+ nvlist_free(label);
+ return (EBADF);
+ }
+
+ nvlist_free(label);
+ }
+
+ /*
+ * If this is a top-level vdev, make sure its allocation parameters
+ * exist and initialize its metaslabs.
+ */
+ if (vd == vd->vdev_top) {
+
+ if (vd->vdev_ms_array == 0 ||
+ vd->vdev_ms_shift == 0 ||
+ vd->vdev_ashift == 0 ||
+ vd->vdev_asize == 0) {
+ vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ return (0);
+ }
+
+ vdev_metaslab_init(vd, 0);
+ }
+
+ /*
+ * If this is a leaf vdev, load its DTL.
+ */
+ if (vd->vdev_ops->vdev_op_leaf) {
+ error = vdev_dtl_load(vd);
+ if (error) {
+ dprintf("can't load DTL for %s, error %d\n",
+ vdev_description(vd), error);
+ vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ return (0);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Post-sync cleanup for a vdev: run metaslab_sync_done() on every
+ * metaslab that was synced in this (now clean) txg.
+ */
+void
+vdev_sync_done(vdev_t *vd, uint64_t txg)
+{
+ metaslab_t *msp;
+
+ dprintf("%s txg %llu\n", vdev_description(vd), txg);
+
+ while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
+ metaslab_sync_done(msp, txg);
+}
+
+/*
+ * Sync-context work for a newly added top-level vdev: allocate its
+ * on-disk metaslab object array (if not yet allocated) and dirty the
+ * vdev config so the addition is recorded.
+ */
+void
+vdev_add_sync(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ ASSERT(vd == vd->vdev_top);
+
+ if (vd->vdev_ms_array == 0)
+ vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
+ DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
+
+ ASSERT(vd->vdev_ms_array != 0);
+
+ vdev_config_dirty(vd);
+
+ dmu_tx_commit(tx);
+}
+
+/*
+ * Sync a top-level vdev for the given txg: consume and clear its dirty
+ * flags, handle a pending add (VDD_ADD), sync all dirty metaslabs and
+ * child DTLs, then requeue the vdev for post-sync cleanup.
+ */
+void
+vdev_sync(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *lvd;
+ metaslab_t *msp;
+ uint8_t *dirtyp = &vd->vdev_dirty[txg & TXG_MASK];
+ uint8_t dirty = *dirtyp;
+
+ /* Atomically snapshot and clear this txg's dirty flags. */
+ mutex_enter(&vd->vdev_dirty_lock);
+ *dirtyp &= ~(VDD_ALLOC | VDD_FREE | VDD_ADD | VDD_DTL);
+ mutex_exit(&vd->vdev_dirty_lock);
+
+ dprintf("%s txg %llu pass %d\n",
+ vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
+
+ if (dirty & VDD_ADD)
+ vdev_add_sync(vd, txg);
+
+ while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL)
+ metaslab_sync(msp, txg);
+
+ while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
+ vdev_dtl_sync(lvd, txg);
+
+ /* Schedule vdev_sync_done() for when this txg is clean. */
+ (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
+}
+
+/*
+ * Convert a physical size to the allocated size for this vdev type
+ * (e.g. accounting for replication overhead) via the vdev op vector.
+ */
+uint64_t
+vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
+{
+ return (vd->vdev_ops->vdev_op_asize(vd, psize));
+}
+
+/*
+ * Dispatch an I/O to the vdev-type-specific start routine.
+ */
+void
+vdev_io_start(zio_t *zio)
+{
+ zio->io_vd->vdev_ops->vdev_op_io_start(zio);
+}
+
+/*
+ * Dispatch I/O completion to the vdev-type-specific done routine.
+ */
+void
+vdev_io_done(zio_t *zio)
+{
+ zio->io_vd->vdev_ops->vdev_op_io_done(zio);
+}
+
+/*
+ * Return a human-readable description of a vdev for messages:
+ * its device path if it has one, the pool name for the root vdev,
+ * otherwise the vdev type name.  Never returns NULL.
+ */
+const char *
+vdev_description(vdev_t *vd)
+{
+ if (vd == NULL || vd->vdev_ops == NULL)
+ return ("<unknown>");
+
+ if (vd->vdev_path != NULL)
+ return (vd->vdev_path);
+
+ if (vd->vdev_parent == NULL)
+ return (spa_name(vd->vdev_spa));
+
+ return (vd->vdev_ops->vdev_op_type);
+}
+
+/*
+ * Bring the vdev identified by 'path' back online: clear its offline
+ * flag and error counters, reopen its top-level vdev, and kick off a
+ * resilver to catch up on anything missed while offline.
+ * Returns 0 on success or ENODEV if no vdev matches the path.
+ */
+int
+vdev_online(spa_t *spa, const char *path)
+{
+ vdev_t *vd;
+
+ spa_config_enter(spa, RW_WRITER);
+
+ if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
+ spa_config_exit(spa);
+ return (ENODEV);
+ }
+
+ dprintf("ONLINE: %s\n", vdev_description(vd));
+
+ vd->vdev_offline = B_FALSE;
+
+ /*
+ * Clear the error counts. The idea is that you expect to see all
+ * zeroes when everything is working, so if you've just onlined a
+ * device, you don't want to keep hearing about errors from before.
+ */
+ vd->vdev_stat.vs_read_errors = 0;
+ vd->vdev_stat.vs_write_errors = 0;
+ vd->vdev_stat.vs_checksum_errors = 0;
+
+ vdev_reopen(vd->vdev_top, NULL);
+
+ spa_config_exit(spa);
+
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+ return (0);
+}
+
+/*
+ * Take the vdev identified by 'path' offline.
+ * Fails with ENODEV if the path doesn't match a vdev, or EBUSY if
+ * offlining it would lose data (non-empty DTL) or render the top-level
+ * vdev unusable (in which case the change is rolled back).
+ */
+int
+vdev_offline(spa_t *spa, const char *path)
+{
+ vdev_t *vd;
+
+ spa_config_enter(spa, RW_WRITER);
+
+ if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
+ spa_config_exit(spa);
+ return (ENODEV);
+ }
+
+ dprintf("OFFLINE: %s\n", vdev_description(vd));
+
+ /*
+ * If this device's top-level vdev has a non-empty DTL,
+ * don't allow the device to be offlined.
+ *
+ * XXX -- we should make this more precise by allowing the offline
+ * as long as the remaining devices don't have any DTL holes.
+ */
+ if (vd->vdev_top->vdev_dtl_map.sm_space != 0) {
+ spa_config_exit(spa);
+ return (EBUSY);
+ }
+
+ /*
+ * Set this device to offline state and reopen its top-level vdev.
+ * If this action results in the top-level vdev becoming unusable,
+ * undo it and fail the request.
+ */
+ vd->vdev_offline = B_TRUE;
+ vdev_reopen(vd->vdev_top, NULL);
+ if (vdev_is_dead(vd->vdev_top)) {
+ vd->vdev_offline = B_FALSE;
+ vdev_reopen(vd->vdev_top, NULL);
+ spa_config_exit(spa);
+ return (EBUSY);
+ }
+
+ spa_config_exit(spa);
+
+ return (0);
+}
+
+/*
+ * Configure fault injection for the vdev identified by 'path':
+ * 'mode' selects the VDEV_FAULT_* behavior, 'mask' the zio types to
+ * affect, and 'arg' the mode-specific parameter (e.g. count or rate).
+ * Returns ENODEV if the path doesn't match a vdev.
+ */
+int
+vdev_error_setup(spa_t *spa, const char *path, int mode, int mask, uint64_t arg)
+{
+ vdev_t *vd;
+
+ spa_config_enter(spa, RW_WRITER);
+
+ if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
+ spa_config_exit(spa);
+ return (ENODEV);
+ }
+
+ vd->vdev_fault_mode = mode;
+ vd->vdev_fault_mask = mask;
+ vd->vdev_fault_arg = arg;
+
+ spa_config_exit(spa);
+
+ return (0);
+}
+
+/*
+ * Return nonzero if the vdev is unusable (CANT_OPEN or worse).
+ */
+int
+vdev_is_dead(vdev_t *vd)
+{
+ return (vd->vdev_state <= VDEV_STATE_CANT_OPEN);
+}
+
+/*
+ * Apply any configured fault injection to 'zio'.  Returns the error to
+ * inject (EIO) or 0 to let the I/O proceed normally.  In COUNT mode the
+ * remaining fault count is decremented and the mode self-clears when
+ * it reaches zero.
+ */
+int
+vdev_error_inject(vdev_t *vd, zio_t *zio)
+{
+ int error = 0;
+
+ if (vd->vdev_fault_mode == VDEV_FAULT_NONE)
+ return (0);
+
+ /* Only inject faults for the configured zio types. */
+ if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0)
+ return (0);
+
+ switch (vd->vdev_fault_mode) {
+ case VDEV_FAULT_RANDOM:
+ if (spa_get_random(vd->vdev_fault_arg) == 0)
+ error = EIO;
+ break;
+
+ case VDEV_FAULT_COUNT:
+ if ((int64_t)--vd->vdev_fault_arg <= 0)
+ vd->vdev_fault_mode = VDEV_FAULT_NONE;
+ error = EIO;
+ break;
+ }
+
+ if (error != 0) {
+ dprintf("returning %d for type %d on %s state %d offset %llx\n",
+ error, zio->io_type, vdev_description(vd),
+ vd->vdev_state, zio->io_offset);
+ }
+
+ return (error);
+}
+
+/*
+ * Get statistics for the given vdev.
+ *
+ * Copies vdev_stat into *vs under vdev_stat_lock, converting the stored
+ * timestamp into an elapsed time.  For the root vdev, the per-type I/O
+ * and error counts are additionally aggregated over all top-level vdevs.
+ */
+void
+vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
+{
+ vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+ int c, t;
+
+ mutex_enter(&vd->vdev_stat_lock);
+ bcopy(&vd->vdev_stat, vs, sizeof (*vs));
+ vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
+ vs->vs_state = vd->vdev_state;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ /*
+ * If we're getting stats on the root vdev, aggregate the I/O counts
+ * over all top-level vdevs (i.e. the direct children of the root).
+ */
+ if (vd == rvd) {
+ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *cvd = rvd->vdev_child[c];
+ vdev_stat_t *cvs = &cvd->vdev_stat;
+
+ mutex_enter(&vd->vdev_stat_lock);
+ for (t = 0; t < ZIO_TYPES; t++) {
+ vs->vs_ops[t] += cvs->vs_ops[t];
+ vs->vs_bytes[t] += cvs->vs_bytes[t];
+ }
+ vs->vs_read_errors += cvs->vs_read_errors;
+ vs->vs_write_errors += cvs->vs_write_errors;
+ vs->vs_checksum_errors += cvs->vs_checksum_errors;
+ vs->vs_scrub_examined += cvs->vs_scrub_examined;
+ vs->vs_scrub_errors += cvs->vs_scrub_errors;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+ }
+}
+
+/*
+ * Update a vdev's statistics on I/O completion.
+ *
+ * On success, bump op/byte counts and repair/self-heal byte totals.
+ * On failure, bump the appropriate error counter (unless the I/O was
+ * speculative or the vdev is already dead) and, for leaf writes,
+ * record the txg in the relevant DTLs so the data can be resilvered.
+ */
+void
+vdev_stat_update(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *pvd;
+ uint64_t txg = zio->io_txg;
+ vdev_stat_t *vs = &vd->vdev_stat;
+ zio_type_t type = zio->io_type;
+ int flags = zio->io_flags;
+
+ if (zio->io_error == 0) {
+ if (!(flags & ZIO_FLAG_IO_BYPASS)) {
+ mutex_enter(&vd->vdev_stat_lock);
+ vs->vs_ops[type]++;
+ vs->vs_bytes[type] += zio->io_size;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+ if ((flags & ZIO_FLAG_IO_REPAIR) &&
+ zio->io_delegate_list == NULL) {
+ mutex_enter(&vd->vdev_stat_lock);
+ if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))
+ vs->vs_scrub_repaired += zio->io_size;
+ else
+ vs->vs_self_healed += zio->io_size;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+ return;
+ }
+
+ /* Speculative (read-ahead) failures don't count as errors. */
+ if (flags & ZIO_FLAG_SPECULATIVE)
+ return;
+
+ if (!vdev_is_dead(vd)) {
+ mutex_enter(&vd->vdev_stat_lock);
+ if (type == ZIO_TYPE_READ) {
+ if (zio->io_error == ECKSUM)
+ vs->vs_checksum_errors++;
+ else
+ vs->vs_read_errors++;
+ }
+ if (type == ZIO_TYPE_WRITE)
+ vs->vs_write_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+
+ if (type == ZIO_TYPE_WRITE) {
+ /* DTL updates apply only to leaf vdevs with a known txg. */
+ if (txg == 0 || vd->vdev_children != 0)
+ return;
+ if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
+ ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+ for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
+ vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
+ }
+ if (!(flags & ZIO_FLAG_IO_REPAIR)) {
+ vdev_t *tvd = vd->vdev_top;
+ if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
+ return;
+ vdev_dirty(tvd, VDD_DTL, txg);
+ (void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
+ for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
+ vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
+ }
+ }
+}
+
+/*
+ * Recursively update scrub statistics for a vdev subtree.
+ * POOL_SCRUB_NONE records the end of a scrub (completion status and end
+ * time); any other type records the start of a new scrub and resets the
+ * progress counters.
+ */
+void
+vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
+{
+ int c;
+ vdev_stat_t *vs = &vd->vdev_stat;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
+
+ mutex_enter(&vd->vdev_stat_lock);
+
+ if (type == POOL_SCRUB_NONE) {
+ /*
+ * Update completion and end time. Leave everything else alone
+ * so we can report what happened during the previous scrub.
+ */
+ vs->vs_scrub_complete = complete;
+ vs->vs_scrub_end = gethrestime_sec();
+ } else {
+ vs->vs_scrub_type = type;
+ vs->vs_scrub_complete = 0;
+ vs->vs_scrub_examined = 0;
+ vs->vs_scrub_repaired = 0;
+ vs->vs_scrub_errors = 0;
+ vs->vs_scrub_start = gethrestime_sec();
+ vs->vs_scrub_end = 0;
+ }
+
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
+/*
+ * Report checksum errors that a vdev didn't realize it made.
+ * This can happen, for example, when RAID-Z combinatorial reconstruction
+ * infers that one of its components returned bad data.
+ */
+void
+vdev_checksum_error(zio_t *zio, vdev_t *vd)
+{
+ dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
+ vdev_description(vd));
+
+ /* Speculative I/O errors are not charged against the vdev. */
+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+}
+
+/*
+ * Update the in-core space usage stats for this vdev and the root vdev.
+ *
+ * Applies the (signed, passed as uint64_t) deltas to this top-level
+ * vdev and walks up to every ancestor, so totals stay consistent at
+ * each level of the tree.
+ */
+void
+vdev_space_update(vdev_t *vd, uint64_t space_delta, uint64_t alloc_delta)
+{
+ ASSERT(vd == vd->vdev_top);
+
+ do {
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_space += space_delta;
+ vd->vdev_stat.vs_alloc += alloc_delta;
+ mutex_exit(&vd->vdev_stat_lock);
+ } while ((vd = vd->vdev_parent) != NULL);
+}
+
+/*
+ * Various knobs to tune a vdev.
+ *
+ * Each entry is { name, description, min, max, default, offset }, where
+ * 'offset' locates the uint64_t field within struct vdev that the knob
+ * controls.  vdev_open() resets every knob to its clamped default.
+ */
+static vdev_knob_t vdev_knob[] = {
+ {
+ "cache_size",
+ "size of the read-ahead cache",
+ 0,
+ 1ULL << 30,
+ 10ULL << 20,
+ offsetof(struct vdev, vdev_cache.vc_size)
+ },
+ {
+ "cache_bshift",
+ "log2 of cache blocksize",
+ SPA_MINBLOCKSHIFT,
+ SPA_MAXBLOCKSHIFT,
+ 16,
+ offsetof(struct vdev, vdev_cache.vc_bshift)
+ },
+ {
+ "cache_max",
+ "largest block size to cache",
+ 0,
+ SPA_MAXBLOCKSIZE,
+ 1ULL << 14,
+ offsetof(struct vdev, vdev_cache.vc_max)
+ },
+ {
+ "min_pending",
+ "minimum pending I/Os to the disk",
+ 1,
+ 10000,
+ 2,
+ offsetof(struct vdev, vdev_queue.vq_min_pending)
+ },
+ {
+ "max_pending",
+ "maximum pending I/Os to the disk",
+ 1,
+ 10000,
+ 35,
+ offsetof(struct vdev, vdev_queue.vq_max_pending)
+ },
+ {
+ "agg_limit",
+ "maximum size of aggregated I/Os",
+ 0,
+ SPA_MAXBLOCKSIZE,
+ SPA_MAXBLOCKSIZE,
+ offsetof(struct vdev, vdev_queue.vq_agg_limit)
+ },
+ {
+ "time_shift",
+ "deadline = pri + (lbolt >> time_shift)",
+ 0,
+ 63,
+ 4,
+ offsetof(struct vdev, vdev_queue.vq_time_shift)
+ },
+ {
+ "ramp_rate",
+ "exponential I/O issue ramp-up rate",
+ 1,
+ 10000,
+ 2,
+ offsetof(struct vdev, vdev_queue.vq_ramp_rate)
+ },
+};
+
+/*
+ * Iterate over the vdev_knob[] table: pass NULL to get the first entry,
+ * a previous entry to get the next, NULL return marks the end.
+ */
+vdev_knob_t *
+vdev_knob_next(vdev_knob_t *vk)
+{
+ if (vk == NULL)
+ return (vdev_knob);
+
+ if (++vk == vdev_knob + sizeof (vdev_knob) / sizeof (vdev_knob_t))
+ return (NULL);
+
+ return (vk);
+}
+
+/*
+ * Mark a top-level vdev's config as dirty, placing it on the dirty list
+ * so that it will be written out next time the vdev configuration is synced.
+ * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
+ */
+void
+vdev_config_dirty(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ int c;
+
+ if (vd == rvd) {
+ for (c = 0; c < rvd->vdev_children; c++)
+ vdev_config_dirty(rvd->vdev_child[c]);
+ } else {
+ ASSERT(vd == vd->vdev_top);
+
+ /* Idempotent: only list the vdev once. */
+ if (!vd->vdev_is_dirty) {
+ list_insert_head(&spa->spa_dirty_list, vd);
+ vd->vdev_is_dirty = B_TRUE;
+ }
+ }
+}
+
+/*
+ * Remove a top-level vdev from the pool's config dirty list.
+ * The vdev must currently be marked dirty.
+ */
+void
+vdev_config_clean(vdev_t *vd)
+{
+ ASSERT(vd->vdev_is_dirty);
+
+ list_remove(&vd->vdev_spa->spa_dirty_list, vd);
+ vd->vdev_is_dirty = B_FALSE;
+}
+
+/*
+ * Set a vdev's state, updating any parent's state as well.
+ *
+ * After recording the new state and aux reason, the parent's faulted
+ * and degraded child counts are recomputed and handed to the parent's
+ * vdev_op_state_change method so interior vdev state stays consistent.
+ */
+void
+vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux)
+{
+ if (state == vd->vdev_state)
+ return;
+
+ vd->vdev_state = state;
+ vd->vdev_stat.vs_aux = aux;
+
+ if (vd->vdev_parent != NULL) {
+ int c;
+ int degraded = 0, faulted = 0;
+ vdev_t *parent, *child;
+
+ parent = vd->vdev_parent;
+ for (c = 0; c < parent->vdev_children; c++) {
+ child = parent->vdev_child[c];
+ if (child->vdev_state <= VDEV_STATE_CANT_OPEN)
+ faulted++;
+ else if (child->vdev_state == VDEV_STATE_DEGRADED)
+ degraded++;
+ }
+
+ vd->vdev_parent->vdev_ops->vdev_op_state_change(
+ vd->vdev_parent, faulted, degraded);
+ }
+}
diff --git a/usr/src/uts/common/fs/zfs/vdev_cache.c b/usr/src/uts/common/fs/zfs/vdev_cache.c
new file mode 100644
index 0000000000..e1e7c1a36f
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_cache.c
@@ -0,0 +1,374 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+
+/*
+ * Virtual device read-ahead caching.
+ *
+ * This file implements a simple LRU read-ahead cache. When the DMU reads
+ * a given block, it will often want other, nearby blocks soon thereafter.
+ * We take advantage of this by reading a larger disk region and caching
+ * the result. In the best case, this can turn 256 back-to-back 512-byte
+ * reads into a single 128k read followed by 255 cache hits; this reduces
+ * latency dramatically. In the worst case, it can turn an isolated 512-byte
+ * read into a 128k read, which doesn't affect latency all that much but is
+ * terribly wasteful of bandwidth. A more intelligent version of the cache
+ * could keep track of access patterns and not do read-ahead unless it sees
+ * at least two temporally close I/Os to the same region. It could also
+ * take advantage of semantic information about the I/O. And it could use
+ * something faster than an AVL tree; that was chosen solely for convenience.
+ *
+ * There are five cache operations: allocate, fill, read, write, evict.
+ *
+ * (1) Allocate. This reserves a cache entry for the specified region.
+ * We separate the allocate and fill operations so that multiple threads
+ * don't generate I/O for the same cache miss.
+ *
+ * (2) Fill. When the I/O for a cache miss completes, the fill routine
+ * places the data in the previously allocated cache entry.
+ *
+ * (3) Read. Read data from the cache.
+ *
+ * (4) Write. Update cache contents after write completion.
+ *
+ * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry
+ * if the total cache size exceeds vc_size.
+ */
+
+/*
+ * AVL comparator for vc_offset_tree: order cache entries by device offset.
+ */
+static int
+vdev_cache_offset_compare(const void *a1, const void *a2)
+{
+	const vdev_cache_entry_t *ve1 = a1;
+	const vdev_cache_entry_t *ve2 = a2;
+
+	if (ve1->ve_offset < ve2->ve_offset)
+		return (-1);
+	if (ve1->ve_offset > ve2->ve_offset)
+		return (1);
+	return (0);
+}
+
+/*
+ * AVL comparator for vc_lastused_tree: order entries by last-use time,
+ * so avl_first() yields the least-recently-used entry for eviction.
+ */
+static int
+vdev_cache_lastused_compare(const void *a1, const void *a2)
+{
+	const vdev_cache_entry_t *ve1 = a1;
+	const vdev_cache_entry_t *ve2 = a2;
+
+	if (ve1->ve_lastused < ve2->ve_lastused)
+		return (-1);
+	if (ve1->ve_lastused > ve2->ve_lastused)
+		return (1);
+
+	/*
+	 * Among equally old entries, sort by offset to ensure uniqueness.
+	 */
+	return (vdev_cache_offset_compare(a1, a2));
+}
+
+/*
+ * Evict the specified entry from the cache: unlink it from both AVL
+ * trees and free its data buffer.  Caller must hold vc_lock, and the
+ * entry must not have a fill I/O in flight.
+ */
+static void
+vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
+{
+	ASSERT(MUTEX_HELD(&vc->vc_lock));
+	ASSERT(ve->ve_fill_io == NULL);
+	ASSERT(ve->ve_data != NULL);
+
+	dprintf("evicting %p, off %llx, LRU %llu, age %lu, hits %u, stale %u\n",
+	    vc, ve->ve_offset, ve->ve_lastused, lbolt - ve->ve_lastused,
+	    ve->ve_hits, ve->ve_missed_update);
+
+	avl_remove(&vc->vc_lastused_tree, ve);
+	avl_remove(&vc->vc_offset_tree, ve);
+	zio_buf_free(ve->ve_data, vc->vc_blocksize);
+	kmem_free(ve, sizeof (vdev_cache_entry_t));
+}
+
+/*
+ * Allocate an entry in the cache.  At this point we don't have the data,
+ * we're just creating a placeholder so that multiple threads don't all
+ * go off and read the same blocks.  Caller must hold vc_lock.  Returns
+ * NULL if the cache is disabled or the LRU victim is still being filled.
+ */
+static vdev_cache_entry_t *
+vdev_cache_allocate(zio_t *zio)
+{
+	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+	uint64_t offset = P2ALIGN(zio->io_offset, vc->vc_blocksize);
+	vdev_cache_entry_t *ve;
+
+	ASSERT(MUTEX_HELD(&vc->vc_lock));
+
+	/* vc_size == 0 means caching is disabled for this vdev. */
+	if (vc->vc_size == 0)
+		return (NULL);
+
+	/*
+	 * If adding a new entry would exceed the cache size,
+	 * evict the oldest entry (LRU).
+	 */
+	if ((avl_numnodes(&vc->vc_lastused_tree) << vc->vc_bshift) >
+	    vc->vc_size) {
+		ve = avl_first(&vc->vc_lastused_tree);
+		if (ve->ve_fill_io != NULL) {
+			/* Victim is mid-fill; can't evict it safely. */
+			dprintf("can't evict in %p, still filling\n", vc);
+			return (NULL);
+		}
+		ASSERT(ve->ve_hits != 0);
+		vdev_cache_evict(vc, ve);
+	}
+
+	ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
+	ve->ve_offset = offset;
+	ve->ve_lastused = lbolt;
+	ve->ve_data = zio_buf_alloc(vc->vc_blocksize);
+
+	avl_add(&vc->vc_offset_tree, ve);
+	avl_add(&vc->vc_lastused_tree, ve);
+
+	return (ve);
+}
+
+/*
+ * Satisfy a read zio from a filled cache entry: refresh the entry's
+ * LRU position and copy the requested sub-range into the zio's buffer.
+ * Caller must hold vc_lock.
+ */
+static void
+vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
+{
+	uint64_t cache_phase = P2PHASE(zio->io_offset, vc->vc_blocksize);
+
+	ASSERT(MUTEX_HELD(&vc->vc_lock));
+	ASSERT(ve->ve_fill_io == NULL);
+
+	/* Re-key in the LRU tree only if the timestamp actually changes. */
+	if (ve->ve_lastused != lbolt) {
+		avl_remove(&vc->vc_lastused_tree, ve);
+		ve->ve_lastused = lbolt;
+		avl_add(&vc->vc_lastused_tree, ve);
+	}
+
+	ve->ve_hits++;
+	bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size);
+}
+
+/*
+ * Fill a previously allocated cache entry with data.  This is the done
+ * callback of the cache-fill child I/O issued by vdev_cache_read(); it
+ * also completes every read zio that was delegated to the fill while it
+ * was in flight.
+ */
+static void
+vdev_cache_fill(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_cache_t *vc = &vd->vdev_cache;
+	vdev_cache_entry_t *ve = zio->io_private;
+	zio_t *dio;
+
+	ASSERT(zio->io_size == vc->vc_blocksize);
+
+	/*
+	 * Add data to the cache.
+	 */
+	mutex_enter(&vc->vc_lock);
+
+	ASSERT(ve->ve_fill_io == zio);
+	ASSERT(ve->ve_offset == zio->io_offset);
+	ASSERT(ve->ve_data == zio->io_data);
+
+	ve->ve_fill_io = NULL;
+
+	/*
+	 * Even if this cache line was invalidated by a missed write update,
+	 * any reads that were queued up before the missed update are still
+	 * valid, so we can satisfy them from this line before we evict it.
+	 */
+	for (dio = zio->io_delegate_list; dio; dio = dio->io_delegate_next)
+		vdev_cache_hit(vc, ve, dio);
+
+	if (zio->io_error || ve->ve_missed_update)
+		vdev_cache_evict(vc, ve);
+
+	mutex_exit(&vc->vc_lock);
+
+	/* Advance each delegated zio, propagating the fill's error code. */
+	while ((dio = zio->io_delegate_list) != NULL) {
+		zio->io_delegate_list = dio->io_delegate_next;
+		dio->io_delegate_next = NULL;
+		dio->io_error = zio->io_error;
+		zio_next_stage(dio);
+	}
+}
+
+/*
+ * Read data from the cache.  Returns 0 on cache hit, errno on a miss.
+ * On a miss we may allocate an entry, issue a cache-fill child I/O, and
+ * delegate this zio to it; the errno returns tell the caller to issue
+ * the read to the device normally.
+ */
+int
+vdev_cache_read(zio_t *zio)
+{
+	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+	vdev_cache_entry_t *ve, ve_search;
+	uint64_t cache_offset = P2ALIGN(zio->io_offset, vc->vc_blocksize);
+	uint64_t cache_phase = P2PHASE(zio->io_offset, vc->vc_blocksize);
+	zio_t *fio;
+
+	ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+	if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
+		return (EINVAL);
+
+	/* Reads larger than vc_max are not worth caching. */
+	if (zio->io_size > vc->vc_max)
+		return (EOVERFLOW);
+
+	/*
+	 * If the I/O straddles two or more cache blocks, don't cache it.
+	 */
+	if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1,
+	    vc->vc_blocksize))
+		return (EXDEV);
+
+	ASSERT(cache_phase + zio->io_size <= vc->vc_blocksize);
+
+	mutex_enter(&vc->vc_lock);
+
+	ve_search.ve_offset = cache_offset;
+	ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);
+
+	if (ve != NULL) {
+		/* Entry was invalidated by a write during its fill. */
+		if (ve->ve_missed_update) {
+			mutex_exit(&vc->vc_lock);
+			return (ESTALE);
+		}
+
+		/*
+		 * Fill still in flight: chain this zio onto the fill I/O's
+		 * delegate list; vdev_cache_fill() will complete it.
+		 */
+		if ((fio = ve->ve_fill_io) != NULL) {
+			zio->io_delegate_next = fio->io_delegate_list;
+			fio->io_delegate_list = zio;
+			zio_vdev_io_bypass(zio);
+			mutex_exit(&vc->vc_lock);
+			return (0);
+		}
+
+		/* Plain cache hit: copy the data and bypass device I/O. */
+		vdev_cache_hit(vc, ve, zio);
+		zio_vdev_io_bypass(zio);
+
+		mutex_exit(&vc->vc_lock);
+		zio_next_stage(zio);
+		return (0);
+	}
+
+	ve = vdev_cache_allocate(zio);
+
+	if (ve == NULL) {
+		mutex_exit(&vc->vc_lock);
+		return (ENOMEM);
+	}
+
+	/*
+	 * Miss: read the entire cache block; the entry will be completed
+	 * (and this zio satisfied) by vdev_cache_fill().
+	 */
+	fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset,
+	    ve->ve_data, vc->vc_blocksize, ZIO_TYPE_READ,
+	    ZIO_PRIORITY_CACHE_FILL,
+	    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY,
+	    vdev_cache_fill, ve);
+
+	ve->ve_fill_io = fio;
+	fio->io_delegate_list = zio;
+	zio_vdev_io_bypass(zio);
+
+	mutex_exit(&vc->vc_lock);
+	zio_nowait(fio);
+
+	return (0);
+}
+
+/*
+ * Update cache contents upon write completion so cached data stays
+ * coherent with the device.  Entries whose fill is still in flight
+ * cannot be patched in place, so they are marked missed_update and
+ * will be evicted when the fill completes.
+ */
+void
+vdev_cache_write(zio_t *zio)
+{
+	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+	vdev_cache_entry_t *ve, ve_search;
+	uint64_t io_start = zio->io_offset;
+	uint64_t io_end = io_start + zio->io_size;
+	uint64_t min_offset = P2ALIGN(io_start, vc->vc_blocksize);
+	uint64_t max_offset = P2ROUNDUP(io_end, vc->vc_blocksize);
+	avl_index_t where;
+
+	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+
+	mutex_enter(&vc->vc_lock);
+
+	/* Find the first cache entry that could overlap the write range. */
+	ve_search.ve_offset = min_offset;
+	ve = avl_find(&vc->vc_offset_tree, &ve_search, &where);
+
+	if (ve == NULL)
+		ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER);
+
+	while (ve != NULL && ve->ve_offset < max_offset) {
+		/* Clip the copy to the overlap of the write and entry. */
+		uint64_t start = MAX(ve->ve_offset, io_start);
+		uint64_t end = MIN(ve->ve_offset + vc->vc_blocksize, io_end);
+
+		if (ve->ve_fill_io != NULL) {
+			ve->ve_missed_update = 1;
+		} else {
+			bcopy((char *)zio->io_data + start - io_start,
+			    ve->ve_data + start - ve->ve_offset, end - start);
+		}
+		ve = AVL_NEXT(&vc->vc_offset_tree, ve);
+	}
+	mutex_exit(&vc->vc_lock);
+}
+
+/*
+ * Initialize a vdev's read-ahead cache: its lock, the two AVL indexes
+ * (by offset and by last-use time), and the derived block size.
+ */
+void
+vdev_cache_init(vdev_t *vd)
+{
+	vdev_cache_t *vc = &vd->vdev_cache;
+
+	mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare,
+	    sizeof (vdev_cache_entry_t),
+	    offsetof(struct vdev_cache_entry, ve_offset_node));
+
+	avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare,
+	    sizeof (vdev_cache_entry_t),
+	    offsetof(struct vdev_cache_entry, ve_lastused_node));
+
+	/* vc_bshift is log2 of the cache block size. */
+	vc->vc_blocksize = 1ULL << vc->vc_bshift;
+}
+
+/*
+ * Tear down a vdev's read-ahead cache: evict every remaining entry,
+ * then destroy the AVL trees and the lock.
+ */
+void
+vdev_cache_fini(vdev_t *vd)
+{
+	vdev_cache_t *vc = &vd->vdev_cache;
+	vdev_cache_entry_t *ve;
+
+	mutex_enter(&vc->vc_lock);
+	while ((ve = avl_first(&vc->vc_offset_tree)) != NULL)
+		vdev_cache_evict(vc, ve);
+	mutex_exit(&vc->vc_lock);
+
+	avl_destroy(&vc->vc_offset_tree);
+	avl_destroy(&vc->vc_lastused_tree);
+
+	mutex_destroy(&vc->vc_lock);
+}
diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c
new file mode 100644
index 0000000000..9255ecf03e
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_disk.c
@@ -0,0 +1,307 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_disk.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/sunddi.h>
+
+/*
+ * Virtual device vector for disks.
+ */
+
+extern ldi_ident_t zfs_li;
+
+/*
+ * A buf_t with its owning zio attached: the buf is handed to
+ * ldi_strategy() and the zio recovered in the biodone callback
+ * (vdev_disk_io_intr) by casting the buf pointer back.
+ */
+typedef struct vdev_disk_buf {
+	buf_t vdb_buf;
+	zio_t *vdb_io;
+} vdev_disk_buf_t;
+
+/*
+ * Open a disk-backed vdev via LDI, trying path first (whole-disk "s0"
+ * then the literal path) and falling back to the stored devid if the
+ * device appears to have moved.  On success, returns 0 and sets *psize
+ * to the device size and *ashift to the minimum allocation shift.
+ * On failure, returns errno with vs_aux indicating the failure class.
+ */
+static int
+vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+{
+	vdev_disk_t *dvd;
+	int error;
+
+	/*
+	 * We must have a pathname, and it must be absolute.
+	 */
+	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (EINVAL);
+	}
+
+	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
+
+	/*
+	 * When opening a disk device, we want to preserve the user's original
+	 * intent. We always want to open the device by the path the user gave
+	 * us, even if it is one of multiple paths to the same device. But we
+	 * also want to be able to survive disks being removed/recabled.
+	 * Therefore the sequence of opening devices is:
+	 *
+	 * 1. Try opening the device by path.
+	 *
+	 *	a. First append "s0" to see if this is a whole disk
+	 *	b. Fall back to path otherwise
+	 *
+	 * 2. If the devid of the device matches the stored value, return
+	 *    success.
+	 *
+	 * 3. Otherwise, the device may have moved.  Try opening the device
+	 *    by the devid instead.
+	 *
+	 */
+	if (vd->vdev_devid != NULL) {
+		/* Decode the stored devid string into devid + minor name. */
+		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
+		    &dvd->vd_minor) != 0) {
+			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+			return (EINVAL);
+		}
+	}
+
+	error = EINVAL;		/* presume failure */
+
+	if (vd->vdev_path != NULL) {
+		size_t len = strlen(vd->vdev_path) + 3;
+		char *buf = kmem_alloc(len, KM_SLEEP);
+		ddi_devid_t devid;
+
+		(void) snprintf(buf, len, "%ss0", vd->vdev_path);
+
+		/*
+		 * Try whole disk first, then slice name.
+		 */
+		if ((error = ldi_open_by_name(buf, spa_mode, kcred,
+		    &dvd->vd_lh, zfs_li)) != 0)
+			error = ldi_open_by_name(vd->vdev_path,
+			    spa_mode, kcred, &dvd->vd_lh, zfs_li);
+
+		kmem_free(buf, len);
+
+		/*
+		 * Compare the devid to the stored value.
+		 */
+		if (error == 0 && vd->vdev_devid != NULL &&
+		    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
+			/* Mismatch: wrong device at this path; close it. */
+			if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
+				error = EINVAL;
+				(void) ldi_close(dvd->vd_lh, spa_mode, kcred);
+				dvd->vd_lh = NULL;
+			}
+			ddi_devid_free(devid);
+		}
+	}
+
+	/*
+	 * If we were unable to open by path, or the devid check fails, open by
+	 * devid instead.
+	 */
+	if (error != 0 && vd->vdev_devid != NULL)
+		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
+		    spa_mode, kcred, &dvd->vd_lh, zfs_li);
+
+	if (error) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (error);
+	}
+
+	/*
+	 * Determine the actual size of the device.
+	 */
+	if (ldi_get_size(dvd->vd_lh, psize) != 0) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (EINVAL);
+	}
+
+	*ashift = SPA_MINBLOCKSHIFT;
+
+	return (0);
+}
+
+/*
+ * Close a disk-backed vdev, releasing the devid/minor strings, the LDI
+ * handle, and the vdev_disk_t itself.  Safe to call on a vdev whose
+ * open failed partway through (each field is individually checked).
+ */
+static void
+vdev_disk_close(vdev_t *vd)
+{
+	vdev_disk_t *dvd = vd->vdev_tsd;
+
+	if (dvd == NULL)
+		return;
+
+	dprintf("removing disk %s, devid %s\n",
+	    vd->vdev_path ? vd->vdev_path : "<none>",
+	    vd->vdev_devid ? vd->vdev_devid : "<none>");
+
+	if (dvd->vd_minor != NULL)
+		ddi_devid_str_free(dvd->vd_minor);
+
+	if (dvd->vd_devid != NULL)
+		ddi_devid_free(dvd->vd_devid);
+
+	if (dvd->vd_lh != NULL)
+		(void) ldi_close(dvd->vd_lh, spa_mode, kcred);
+
+	kmem_free(dvd, sizeof (vdev_disk_t));
+	vd->vdev_tsd = NULL;
+}
+
+/*
+ * biodone callback for disk I/O: recover the zio from the enclosing
+ * vdev_disk_buf_t, translate the buf status into io_error (a short
+ * transfer with no error code becomes EIO), and resume the pipeline.
+ */
+static void
+vdev_disk_io_intr(buf_t *bp)
+{
+	vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp;
+	zio_t *zio = vdb->vdb_io;
+
+	if ((zio->io_error = geterror(bp)) == 0 && bp->b_resid != 0)
+		zio->io_error = EIO;
+
+	kmem_free(vdb, sizeof (vdev_disk_buf_t));
+
+	/* Interrupt context: advance the zio asynchronously. */
+	zio_next_stage_async(zio);
+}
+
+/*
+ * Completion callback for asynchronous ioctls (e.g. cache flush):
+ * record the result and resume the zio pipeline.
+ */
+static void
+vdev_disk_ioctl_done(void *zio_arg, int error)
+{
+	zio_t *zio = zio_arg;
+
+	zio->io_error = error;
+
+	zio_next_stage_async(zio);
+}
+
+/*
+ * Issue a zio to a disk vdev.  Ioctls (currently only write-cache
+ * flush) are dispatched via ldi_ioctl(); reads may be satisfied by the
+ * vdev cache; everything else goes through the vdev queue and then to
+ * ldi_strategy() as a buf.
+ */
+static void
+vdev_disk_io_start(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_disk_t *dvd = vd->vdev_tsd;
+	vdev_disk_buf_t *vdb;
+	buf_t *bp;
+	int flags, error;
+
+	if (zio->io_type == ZIO_TYPE_IOCTL) {
+		zio_vdev_io_bypass(zio);
+
+		/* XXPOLICY */
+		if (vdev_is_dead(vd)) {
+			zio->io_error = ENXIO;
+			zio_next_stage_async(zio);
+			return;
+		}
+
+		switch (zio->io_cmd) {
+
+		case DKIOCFLUSHWRITECACHE:
+
+			zio->io_dk_callback.dkc_callback = vdev_disk_ioctl_done;
+			zio->io_dk_callback.dkc_cookie = zio;
+
+			error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
+			    (uintptr_t)&zio->io_dk_callback,
+			    FKIOCTL, kcred, NULL);
+
+			if (error == 0) {
+				/*
+				 * The ioctl will be done asynchronously,
+				 * and will call vdev_disk_ioctl_done()
+				 * upon completion.
+				 */
+				return;
+			}
+			zio->io_error = error;
+			break;
+
+		default:
+			zio->io_error = ENOTSUP;
+		}
+
+		zio_next_stage_async(zio);
+		return;
+	}
+
+	/* Reads may be satisfied from (or delegated to) the vdev cache. */
+	if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
+		return;
+
+	/* Queue the I/O; NULL means it was deferred or aggregated. */
+	if ((zio = vdev_queue_io(zio)) == NULL)
+		return;
+
+	flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
+	flags |= B_BUSY | B_NOCACHE;
+	if (zio->io_flags & ZIO_FLAG_FAILFAST)
+		flags |= B_FAILFAST;
+
+	vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP);
+
+	vdb->vdb_io = zio;
+	bp = &vdb->vdb_buf;
+
+	bioinit(bp);
+	bp->b_flags = flags;
+	bp->b_bcount = zio->io_size;
+	bp->b_un.b_addr = zio->io_data;
+	bp->b_lblkno = lbtodb(zio->io_offset);
+	bp->b_bufsize = zio->io_size;
+	bp->b_iodone = (int (*)())vdev_disk_io_intr;
+
+	/* XXPOLICY */
+	error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio);
+	if (error) {
+		/* Complete the buf locally so io_intr runs and cleans up. */
+		zio->io_error = error;
+		bioerror(bp, error);
+		bp->b_resid = bp->b_bcount;
+		bp->b_iodone(bp);
+		return;
+	}
+
+	error = ldi_strategy(dvd->vd_lh, bp);
+	/* ldi_strategy() will return non-zero only on programming errors */
+	ASSERT(error == 0);
+}
+
+/*
+ * I/O completion for disk vdevs: notify the vdev queue, keep the vdev
+ * cache coherent with completed writes, and advance the pipeline.
+ */
+static void
+vdev_disk_io_done(zio_t *zio)
+{
+	vdev_queue_io_done(zio);
+
+	if (zio->io_type == ZIO_TYPE_WRITE)
+		vdev_cache_write(zio);
+
+	zio_next_stage(zio);
+}
+
+/* Virtual device operations vector for physical disks (kernel). */
+vdev_ops_t vdev_disk_ops = {
+	vdev_disk_open,
+	vdev_disk_close,
+	vdev_default_asize,
+	vdev_disk_io_start,
+	vdev_disk_io_done,
+	NULL,			/* no state-change callback for leaves */
+	VDEV_TYPE_DISK,		/* name of this vdev type */
+	B_TRUE			/* leaf vdev */
+};
diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c
new file mode 100644
index 0000000000..a789008e17
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_file.c
@@ -0,0 +1,223 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_file.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device vector for files.
+ */
+
+/*
+ * Open a file-backed vdev.  The path must be absolute; it is opened
+ * relative to the global zone's root.  On success, returns 0 and sets
+ * *psize to the file size and *ashift to the minimum allocation shift;
+ * on failure, returns errno with vs_aux describing the failure.
+ */
+static int
+vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+{
+	vdev_file_t *vf;
+	vnode_t *vp;
+	vattr_t vattr;
+	int error;
+
+	/*
+	 * We must have a pathname, and it must be absolute.
+	 */
+	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (EINVAL);
+	}
+
+	vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
+
+#ifdef _KERNEL
+	/*
+	 * When using a file vdev in kernel context, the underlying filesystem
+	 * will already be caching the data.  Don't cache it again here.
+	 */
+	vd->vdev_cache.vc_size = 0;
+#endif
+
+	/*
+	 * We always open the files from the root of the global zone, even if
+	 * we're in a local zone.  If the user has gotten to this point, the
+	 * administrator has already decided that the pool should be available
+	 * to local zone users, so the underlying devices should be as well.
+	 */
+	ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
+	error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, spa_mode | FOFFMAX,
+	    0, &vp, 0, 0, rootdir);
+
+	if (error) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (error);
+	}
+
+	vf->vf_vnode = vp;
+
+#ifdef _KERNEL
+	/*
+	 * Make sure it's a regular file.
+	 */
+	if (vp->v_type != VREG) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (ENODEV);
+	}
+#endif
+
+	/*
+	 * Determine the physical size of the file.
+	 */
+	vattr.va_mask = AT_SIZE;
+	error = VOP_GETATTR(vp, &vattr, 0, kcred);
+	if (error) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (error);
+	}
+
+	*psize = vattr.va_size;
+	*ashift = SPA_MINBLOCKSHIFT;
+
+	return (0);
+}
+
+/*
+ * Close a file-backed vdev: invalidate cached pages, close and release
+ * the vnode, and free the vdev_file_t.  Safe on a partially-opened vdev.
+ */
+static void
+vdev_file_close(vdev_t *vd)
+{
+	vdev_file_t *vf = vd->vdev_tsd;
+
+	if (vf == NULL)
+		return;
+
+	if (vf->vf_vnode != NULL) {
+		(void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred);
+		(void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred);
+		VN_RELE(vf->vf_vnode);
+	}
+
+	kmem_free(vf, sizeof (vdev_file_t));
+	vd->vdev_tsd = NULL;
+}
+
+/*
+ * Issue a zio to a file vdev.  Cache-flush ioctls map to VOP_FSYNC();
+ * reads may be satisfied by the vdev cache; everything else goes
+ * through the vdev queue and is performed synchronously via vn_rdwr().
+ */
+static void
+vdev_file_io_start(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_file_t *vf = vd->vdev_tsd;
+	ssize_t resid;
+	int error;
+
+	if (zio->io_type == ZIO_TYPE_IOCTL) {
+		zio_vdev_io_bypass(zio);
+
+		/* XXPOLICY */
+		if (vdev_is_dead(vd)) {
+			zio->io_error = ENXIO;
+			zio_next_stage_async(zio);
+			return;
+		}
+
+		switch (zio->io_cmd) {
+		case DKIOCFLUSHWRITECACHE:
+			/* For a file, "flush write cache" is an fsync. */
+			zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
+			    kcred);
+			dprintf("fsync(%s) = %d\n", vdev_description(vd),
+			    zio->io_error);
+			break;
+		default:
+			zio->io_error = ENOTSUP;
+		}
+
+		zio_next_stage_async(zio);
+		return;
+	}
+
+	/* Reads may be satisfied from (or delegated to) the vdev cache. */
+	if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
+		return;
+
+	/* Queue the I/O; NULL means it was deferred or aggregated. */
+	if ((zio = vdev_queue_io(zio)) == NULL)
+		return;
+
+	/* XXPOLICY */
+	error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio);
+	if (error) {
+		zio->io_error = error;
+		zio_next_stage_async(zio);
+		return;
+	}
+
+	zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
+	    UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data,
+	    zio->io_size, zio->io_offset, UIO_SYSSPACE,
+	    0, RLIM64_INFINITY, kcred, &resid);
+
+	/* A short transfer with no error code is treated as ENOSPC. */
+	if (resid != 0 && zio->io_error == 0)
+		zio->io_error = ENOSPC;
+
+	zio_next_stage_async(zio);
+}
+
+/*
+ * I/O completion for file vdevs: notify the vdev queue, keep the vdev
+ * cache coherent with completed writes, and advance the pipeline.
+ */
+static void
+vdev_file_io_done(zio_t *zio)
+{
+	vdev_queue_io_done(zio);
+
+	if (zio->io_type == ZIO_TYPE_WRITE)
+		vdev_cache_write(zio);
+
+	zio_next_stage(zio);
+}
+
+/* Virtual device operations vector for regular files. */
+vdev_ops_t vdev_file_ops = {
+	vdev_file_open,
+	vdev_file_close,
+	vdev_default_asize,
+	vdev_file_io_start,
+	vdev_file_io_done,
+	NULL,			/* no state-change callback for leaves */
+	VDEV_TYPE_FILE,		/* name of this vdev type */
+	B_TRUE			/* leaf vdev */
+};
+
+/*
+ * From userland we access disks just like files.
+ * (The kernel vdev_disk_ops lives in vdev_disk.c; this userland
+ * definition simply reuses the file vdev routines.)
+ */
+#ifndef _KERNEL
+
+vdev_ops_t vdev_disk_ops = {
+	vdev_file_open,
+	vdev_file_close,
+	vdev_default_asize,
+	vdev_file_io_start,
+	vdev_file_io_done,
+	NULL,			/* no state-change callback for leaves */
+	VDEV_TYPE_DISK,		/* name of this vdev type */
+	B_TRUE			/* leaf vdev */
+};
+
+#endif
diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c
new file mode 100644
index 0000000000..6671a68fa9
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c
@@ -0,0 +1,848 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Virtual Device Labels
+ * ---------------------
+ *
+ * The vdev label serves several distinct purposes:
+ *
+ * 1. Uniquely identify this device as part of a ZFS pool and confirm its
+ * identity within the pool.
+ *
+ * 2. Verify that all the devices given in a configuration are present
+ * within the pool.
+ *
+ * 3. Determine the uberblock for the pool.
+ *
+ * 4. In case of an import operation, determine the configuration of the
+ * toplevel vdev of which it is a part.
+ *
+ * 5. If an import operation cannot find all the devices in the pool,
+ * provide enough information to the administrator to determine which
+ * devices are missing.
+ *
+ * It is important to note that while the kernel is responsible for writing the
+ * label, it only consumes the information in the first three cases. The
+ * latter information is only consumed in userland when determining the
+ * configuration to import a pool.
+ *
+ *
+ * Label Organization
+ * ------------------
+ *
+ * Before describing the contents of the label, it's important to understand how
+ * the labels are written and updated with respect to the uberblock.
+ *
+ * When the pool configuration is altered, either because it was newly created
+ * or a device was added, we want to update all the labels such that we can deal
+ * with fatal failure at any point. To this end, each disk has two labels which
+ * are updated before and after the uberblock is synced. Assuming we have
+ * labels and an uberblock with the following transaction groups:
+ *
+ * L1 UB L2
+ * +------+ +------+ +------+
+ * | | | | | |
+ * | t10 | | t10 | | t10 |
+ * | | | | | |
+ * +------+ +------+ +------+
+ *
+ * In this stable state, the labels and the uberblock were all updated within
+ * the same transaction group (10). Each label is mirrored and checksummed, so
+ * that we can detect when we fail partway through writing the label.
+ *
+ * In order to identify which labels are valid, the labels are written in the
+ * following manner:
+ *
+ * 1. For each vdev, update 'L1' to the new label
+ * 2. Update the uberblock
+ * 3. For each vdev, update 'L2' to the new label
+ *
+ * Given arbitrary failure, we can determine the correct label to use based on
+ * the transaction group. If we fail after updating L1 but before updating the
+ * UB, we will notice that L1's transaction group is greater than the uberblock,
+ * so L2 must be valid. If we fail after writing the uberblock but before
+ * writing L2, we will notice that L2's transaction group is less than L1, and
+ * therefore L1 is valid.
+ *
+ * Another added complexity is that not every label is updated when the config
+ * is synced. If we add a single device, we do not want to have to re-write
+ * every label for every device in the pool. This means that both L1 and L2 may
+ * be older than the pool uberblock, because the necessary information is stored
+ * on another vdev.
+ *
+ *
+ * On-disk Format
+ * --------------
+ *
+ * The vdev label consists of two distinct parts, and is wrapped within the
+ * vdev_label_t structure.  The label includes 8k of padding to permit legacy
+ * VTOC disk labels; the padding is otherwise ignored.
+ *
+ * The first half of the label is a packed nvlist which contains pool wide
+ * properties, per-vdev properties, and configuration information. It is
+ * described in more detail below.
+ *
+ * The latter half of the label consists of a redundant array of uberblocks.
+ * These uberblocks are updated whenever a transaction group is committed,
+ * or when the configuration is updated. When a pool is loaded, we scan each
+ * vdev for the 'best' uberblock.
+ *
+ *
+ * Configuration Information
+ * -------------------------
+ *
+ * The nvlist describing the pool and vdev contains the following elements:
+ *
+ * version ZFS on-disk version
+ * name Pool name
+ * state Pool state
+ * txg Transaction group in which this label was written
+ * pool_guid Unique identifier for this pool
+ * vdev_tree An nvlist describing vdev tree.
+ *
+ * Each leaf device label also contains the following:
+ *
+ * top_guid Unique ID for top-level vdev in which this is contained
+ * guid Unique ID for the leaf vdev
+ *
+ * The 'vs' configuration follows the format described in 'spa_config.c'.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/metaslab.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Basic routines to read and write from a vdev label.
+ * Used throughout the rest of this file.
+ *
+ * Compute the byte offset of label l within a device of physical size
+ * psize: the first half of the labels sit at the front of the device,
+ * the second half at the end.
+ */
+uint64_t
+vdev_label_offset(uint64_t psize, int l, uint64_t offset)
+{
+	return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
+	    0 : psize - VDEV_LABELS * sizeof (vdev_label_t)));
+}
+
+/*
+ * Issue an asynchronous physical read of `size' bytes at `offset'
+ * within label l of leaf vdev vd, as a child of the given root zio.
+ */
+static void
+vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
+	uint64_t size, zio_done_func_t *done, void *private)
+{
+	ASSERT(vd->vdev_children == 0);
+
+	zio_nowait(zio_read_phys(zio, vd,
+	    vdev_label_offset(vd->vdev_psize, l, offset),
+	    size, buf, ZIO_CHECKSUM_LABEL, done, private,
+	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_SPECULATIVE |
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_DONT_RETRY));
+}
+
+/*
+ * Issue an asynchronous physical write of `size' bytes at `offset'
+ * within label l of leaf vdev vd, as a child of the given root zio.
+ */
+static void
+vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
+	uint64_t size, zio_done_func_t *done, void *private)
+{
+	ASSERT(vd->vdev_children == 0);
+
+	zio_nowait(zio_write_phys(zio, vd,
+	    vdev_label_offset(vd->vdev_psize, l, offset),
+	    size, buf, ZIO_CHECKSUM_LABEL, done, private,
+	    ZIO_PRIORITY_SYNC_WRITE,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_DONT_RETRY));
+}
+
+/*
+ * Generate the nvlist representing this vdev's config: type, id, guid,
+ * optional path/devid, top-level metaslab and allocation parameters,
+ * the DTL object, optional stats, and (recursively) all children.
+ * Caller owns the returned nvlist and must nvlist_free() it.
+ */
+nvlist_t *
+vdev_config_generate(vdev_t *vd, int getstats)
+{
+	nvlist_t *nv = NULL;
+
+	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) == 0);
+
+	VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
+	    vd->vdev_ops->vdev_op_type) == 0);
+	VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id) == 0);
+	VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0);
+
+	if (vd->vdev_path != NULL)
+		VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PATH,
+		    vd->vdev_path) == 0);
+
+	if (vd->vdev_devid != NULL)
+		VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID,
+		    vd->vdev_devid) == 0);
+
+	/* Metaslab and allocation parameters exist only on top-level vdevs. */
+	if (vd == vd->vdev_top) {
+		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
+		    vd->vdev_ms_array) == 0);
+		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
+		    vd->vdev_ms_shift) == 0);
+		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT,
+		    vd->vdev_ashift) == 0);
+		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
+		    vd->vdev_asize) == 0);
+	}
+
+	if (vd->vdev_dtl.smo_object != 0)
+		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
+		    vd->vdev_dtl.smo_object) == 0);
+
+	if (getstats) {
+		vdev_stat_t vs;
+		vdev_get_stats(vd, &vs);
+		VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS,
+		    (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0);
+	}
+
+	/* Interior vdevs: recurse and attach the children array. */
+	if (!vd->vdev_ops->vdev_op_leaf) {
+		nvlist_t **child;
+		int c;
+
+		child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
+		    KM_SLEEP);
+
+		for (c = 0; c < vd->vdev_children; c++)
+			child[c] = vdev_config_generate(vd->vdev_child[c],
+			    getstats);
+
+		VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+		    child, vd->vdev_children) == 0);
+
+		for (c = 0; c < vd->vdev_children; c++)
+			nvlist_free(child[c]);
+
+		kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
+	}
+
+	return (nv);
+}
+
+/*
+ * Read and unpack a vdev's label config, trying each of the labels in
+ * turn and accepting the first one that checksums, unpacks, and has a
+ * matching on-disk version.  Returns NULL if the vdev is dead or no
+ * label is usable; otherwise the caller owns the returned nvlist.
+ */
+nvlist_t *
+vdev_label_read_config(vdev_t *vd)
+{
+	nvlist_t *config = NULL;
+	vdev_phys_t *vp;
+	uint64_t version;
+	zio_t *zio;
+	int l;
+
+	if (vdev_is_dead(vd))
+		return (NULL);
+
+	vp = zio_buf_alloc(sizeof (vdev_phys_t));
+
+	for (l = 0; l < VDEV_LABELS; l++) {
+
+		zio = zio_root(vd->vdev_spa, NULL, NULL,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD);
+
+		vdev_label_read(zio, vd, l, vp,
+		    offsetof(vdev_label_t, vl_vdev_phys),
+		    sizeof (vdev_phys_t), NULL, NULL);
+
+		/* Accept the first label that reads, unpacks, and matches. */
+		if (zio_wait(zio) == 0 &&
+		    nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist),
+		    &config, 0) == 0 &&
+		    nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+		    &version) == 0 &&
+		    version == UBERBLOCK_VERSION)
+			break;
+
+		/* This label was bad; discard and try the next one. */
+		if (config != NULL) {
+			nvlist_free(config);
+			config = NULL;
+		}
+	}
+
+	zio_buf_free(vp, sizeof (vdev_phys_t));
+
+	return (config);
+}
+
+/*
+ * Initialize the on-disk labels of every leaf under vd.
+ *
+ * Interior vdevs simply recurse; each leaf is first checked for an
+ * existing active-pool label (EBUSY if found), then has its vdev_phys,
+ * boot header, and uberblock ring written to all labels in parallel.
+ * crtxg is the pool creation txg; crtxg == 0 skips the in-use check,
+ * which is the convention used for device removal.  Returns EIO if a
+ * leaf is unreachable, EINVAL if the config nvlist cannot be packed.
+ */
+int
+vdev_label_init(vdev_t *vd, uint64_t crtxg)
+{
+ spa_t *spa = vd->vdev_spa;
+ nvlist_t *label;
+ vdev_phys_t *vp;
+ vdev_boot_header_t *vb;
+ uberblock_phys_t *ubphys;
+ zio_t *zio;
+ int l, c, n;
+ char *buf;
+ size_t buflen;
+ int error;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ if ((error = vdev_label_init(vd->vdev_child[c], crtxg)) != 0)
+ return (error);
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (0);
+
+ /*
+ * Make sure each leaf device is writable, and zero its initial content.
+ * Along the way, also make sure that no leaf is already in use.
+ * Note that it's important to do this sequentially, not in parallel,
+ * so that we catch cases of multiple use of the same leaf vdev in
+ * the vdev we're creating -- e.g. mirroring a disk with itself.
+ */
+ if (vdev_is_dead(vd))
+ return (EIO);
+
+ /*
+ * Check whether this device is already in use.
+ * Ignore the check if crtxg == 0, which we use for device removal.
+ */
+ if (crtxg != 0 && (label = vdev_label_read_config(vd)) != NULL) {
+ uint64_t version, state, pool_guid, device_guid, txg;
+ uint64_t mycrtxg = 0;
+
+ (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
+ &mycrtxg);
+
+ /*
+ * The device is busy only if its label is current, names an
+ * active pool whose guid pair we know, and either the label
+ * has a real txg or it was stamped with this same crtxg
+ * (i.e. the same create operation saw this leaf twice).
+ */
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION,
+ &version) == 0 && version == UBERBLOCK_VERSION &&
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ &state) == 0 && state == POOL_STATE_ACTIVE &&
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
+ &pool_guid) == 0 &&
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
+ &device_guid) == 0 &&
+ spa_guid_exists(pool_guid, device_guid) &&
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
+ &txg) == 0 && (txg != 0 || mycrtxg == crtxg)) {
+ dprintf("vdev %s in use, pool_state %d\n",
+ vdev_description(vd), state);
+ nvlist_free(label);
+ return (EBUSY);
+ }
+ nvlist_free(label);
+ }
+
+ /*
+ * The device isn't in use, so initialize its label.
+ */
+ vp = zio_buf_alloc(sizeof (vdev_phys_t));
+ bzero(vp, sizeof (vdev_phys_t));
+
+ /*
+ * Generate a label describing the pool and our top-level vdev.
+ * We mark it as being from txg 0 to indicate that it's not
+ * really part of an active pool just yet. The labels will
+ * be written again with a meaningful txg by spa_sync().
+ */
+ label = spa_config_generate(spa, vd, 0ULL, 0);
+
+ /*
+ * Add our creation time. This allows us to detect multiple vdev
+ * uses as described above, and automatically expires if we fail.
+ */
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG, crtxg) == 0);
+
+ buf = vp->vp_nvlist;
+ buflen = sizeof (vp->vp_nvlist);
+
+ if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, 0) != 0) {
+ nvlist_free(label);
+ zio_buf_free(vp, sizeof (vdev_phys_t));
+ return (EINVAL);
+ }
+
+ /*
+ * Initialize boot block header.
+ */
+ vb = zio_buf_alloc(sizeof (vdev_boot_header_t));
+ bzero(vb, sizeof (vdev_boot_header_t));
+ vb->vb_magic = VDEV_BOOT_MAGIC;
+ vb->vb_version = VDEV_BOOT_VERSION;
+ vb->vb_offset = VDEV_BOOT_OFFSET;
+ vb->vb_size = VDEV_BOOT_SIZE;
+
+ /*
+ * Initialize uberblock template.
+ */
+ ubphys = zio_buf_alloc(sizeof (uberblock_phys_t));
+ bzero(ubphys, sizeof (uberblock_phys_t));
+ ubphys->ubp_uberblock = spa->spa_uberblock;
+ ubphys->ubp_uberblock.ub_txg = 0;
+
+ /*
+ * Write everything in parallel.
+ */
+ zio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+
+ for (l = 0; l < VDEV_LABELS; l++) {
+
+ vdev_label_write(zio, vd, l, vp,
+ offsetof(vdev_label_t, vl_vdev_phys),
+ sizeof (vdev_phys_t), NULL, NULL);
+
+ vdev_label_write(zio, vd, l, vb,
+ offsetof(vdev_label_t, vl_boot_header),
+ sizeof (vdev_boot_header_t), NULL, NULL);
+
+ for (n = 0; n < VDEV_UBERBLOCKS; n++) {
+
+ vdev_label_write(zio, vd, l, ubphys,
+ offsetof(vdev_label_t, vl_uberblock[n]),
+ sizeof (uberblock_phys_t), NULL, NULL);
+
+ }
+ }
+
+ error = zio_wait(zio);
+
+ nvlist_free(label);
+ zio_buf_free(ubphys, sizeof (uberblock_phys_t));
+ zio_buf_free(vb, sizeof (vdev_boot_header_t));
+ zio_buf_free(vp, sizeof (vdev_phys_t));
+
+ return (error);
+}
+
+/*
+ * ==========================================================================
+ * uberblock load/sync
+ * ==========================================================================
+ */
+
+/*
+ * Consider the following situation: txg is safely synced to disk. We've
+ * written the first uberblock for txg + 1, and then we lose power. When we
+ * come back up, we fail to see the uberblock for txg + 1 because, say,
+ * it was on a mirrored device and the replica to which we wrote txg + 1
+ * is now offline. If we then make some changes and sync txg + 1, and then
+ * the missing replica comes back, then for a few seconds we'll have two
+ * conflicting uberblocks on disk with the same txg. The solution is simple:
+ * among uberblocks with equal txg, choose the one with the latest timestamp.
+ */
+static int
+vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
+{
+ /* Primary key: transaction group (higher is newer). */
+ if (ub1->ub_txg < ub2->ub_txg)
+ return (-1);
+ if (ub1->ub_txg > ub2->ub_txg)
+ return (1);
+
+ /* Tiebreaker: wall-clock timestamp, per the scenario above. */
+ if (ub1->ub_timestamp < ub2->ub_timestamp)
+ return (-1);
+ if (ub1->ub_timestamp > ub2->ub_timestamp)
+ return (1);
+
+ return (0);
+}
+
+/*
+ * Completion callback for the label uberblock reads issued by
+ * vdev_uberblock_load().  If the buffer holds a valid uberblock that
+ * beats the best one seen so far (per vdev_uberblock_compare), record
+ * it in *ubbest under spa_uberblock_lock.  Frees the read buffer that
+ * was allocated per-I/O by the caller.
+ */
+static void
+vdev_uberblock_load_done(zio_t *zio)
+{
+ uberblock_phys_t *ubphys = zio->io_data;
+ uberblock_t *ub = &ubphys->ubp_uberblock;
+ uberblock_t *ubbest = zio->io_private;
+ spa_t *spa = zio->io_spa;
+
+ ASSERT3U(zio->io_size, ==, sizeof (uberblock_phys_t));
+
+ if (uberblock_verify(ub) == 0) {
+ /* Callbacks run concurrently; serialize updates to *ubbest. */
+ mutex_enter(&spa->spa_uberblock_lock);
+ if (vdev_uberblock_compare(ub, ubbest) > 0)
+ *ubbest = *ub;
+ mutex_exit(&spa->spa_uberblock_lock);
+ }
+
+ zio_buf_free(zio->io_data, zio->io_size);
+}
+
+/*
+ * Asynchronously read every uberblock slot in every label of every
+ * reachable leaf under vd, attached to the caller's zio.  Each read
+ * gets its own buffer; vdev_uberblock_load_done() folds the winners
+ * into *ubbest and frees the buffers.  The caller waits on zio.
+ */
+void
+vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest)
+{
+ int l, c, n;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_uberblock_load(zio, vd->vdev_child[c], ubbest);
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return;
+
+ if (vdev_is_dead(vd))
+ return;
+
+ for (l = 0; l < VDEV_LABELS; l++) {
+ for (n = 0; n < VDEV_UBERBLOCKS; n++) {
+ vdev_label_read(zio, vd, l,
+ zio_buf_alloc(sizeof (uberblock_phys_t)),
+ offsetof(vdev_label_t, vl_uberblock[n]),
+ sizeof (uberblock_phys_t),
+ vdev_uberblock_load_done, ubbest);
+ }
+ }
+}
+
+/*
+ * Write the uberblock to all labels of all leaves of the specified vdev.
+ */
+
+/*
+ * Per-write completion callback: counts successful uberblock writes in
+ * the root zio's private uint64, so the caller can distinguish total
+ * failure from partial success.
+ */
+static void
+vdev_uberblock_sync_done(zio_t *zio)
+{
+ uint64_t *good_writes = zio->io_root->io_private;
+
+ if (zio->io_error == 0)
+ atomic_add_64(good_writes, 1);
+}
+
+/*
+ * Recursively issue uberblock writes to every reachable leaf under vd.
+ * The destination slot is chosen by txg modulo the ring size (the mask
+ * assumes VDEV_UBERBLOCKS is a power of two), so successive txgs rotate
+ * through the ring rather than overwriting the previous uberblock.
+ */
+static void
+vdev_uberblock_sync(zio_t *zio, uberblock_phys_t *ubphys, vdev_t *vd,
+ uint64_t txg)
+{
+ int l, c, n;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_uberblock_sync(zio, ubphys, vd->vdev_child[c], txg);
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return;
+
+ if (vdev_is_dead(vd))
+ return;
+
+ n = txg & (VDEV_UBERBLOCKS - 1);
+
+ ASSERT(ubphys->ubp_uberblock.ub_txg == txg);
+
+ /* Write the same slot in every label copy of this leaf. */
+ for (l = 0; l < VDEV_LABELS; l++)
+ vdev_label_write(zio, vd, l, ubphys,
+ offsetof(vdev_label_t, vl_uberblock[n]),
+ sizeof (uberblock_phys_t), vdev_uberblock_sync_done, NULL);
+
+ dprintf("vdev %s in txg %llu\n", vdev_description(vd), txg);
+}
+
+/*
+ * Sync the uberblock *ub to every leaf under uvd for the given txg.
+ * Succeeds if at least one write lands (partial success suppresses the
+ * error); returns EIO if nothing was written at all, which can happen
+ * without an explicit error when every vdev is in the CANT_OPEN state.
+ */
+static int
+vdev_uberblock_sync_tree(spa_t *spa, uberblock_t *ub, vdev_t *uvd, uint64_t txg)
+{
+ uberblock_phys_t *ubphys;
+ uint64_t *good_writes;
+ zio_t *zio;
+ int error;
+
+ ubphys = zio_buf_alloc(sizeof (uberblock_phys_t));
+ bzero(ubphys, sizeof (uberblock_phys_t));
+ ubphys->ubp_uberblock = *ub;
+
+ /* Shared success counter, bumped by vdev_uberblock_sync_done(). */
+ good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+
+ zio = zio_root(spa, NULL, good_writes,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+
+ vdev_uberblock_sync(zio, ubphys, uvd, txg);
+
+ error = zio_wait(zio);
+
+ if (error && *good_writes != 0) {
+ dprintf("partial success: good_writes = %llu\n", *good_writes);
+ error = 0;
+ }
+
+ /*
+ * It's possible to have no good writes and no error if every vdev is in
+ * the CANT_OPEN state.
+ */
+ if (*good_writes == 0 && error == 0)
+ error = EIO;
+
+ kmem_free(good_writes, sizeof (uint64_t));
+ zio_buf_free(ubphys, sizeof (uberblock_phys_t));
+
+ return (error);
+}
+
+/*
+ * Sync out an individual vdev.
+ */
+
+/*
+ * Per-label-write completion callback: counts successful writes in the
+ * root zio's private uint64 (same pattern as vdev_uberblock_sync_done).
+ */
+static void
+vdev_sync_label_done(zio_t *zio)
+{
+ uint64_t *good_writes = zio->io_root->io_private;
+
+ if (zio->io_error == 0)
+ atomic_add_64(good_writes, 1);
+}
+
+/*
+ * Write label copy l on every reachable leaf under vd, stamped with
+ * txg.  A fresh config nvlist is generated per leaf and packed into a
+ * vdev_phys_t; if packing fails the write for that leaf is silently
+ * skipped (the good-writes counter then reflects the omission).
+ */
+static void
+vdev_sync_label(zio_t *zio, vdev_t *vd, int l, uint64_t txg)
+{
+ nvlist_t *label;
+ vdev_phys_t *vp;
+ char *buf;
+ size_t buflen;
+ int c;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_sync_label(zio, vd->vdev_child[c], l, txg);
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return;
+
+ if (vdev_is_dead(vd))
+ return;
+
+ /*
+ * Generate a label describing the top-level config to which we belong.
+ */
+ label = spa_config_generate(vd->vdev_spa, vd, txg, 0);
+
+ vp = zio_buf_alloc(sizeof (vdev_phys_t));
+ bzero(vp, sizeof (vdev_phys_t));
+
+ buf = vp->vp_nvlist;
+ buflen = sizeof (vp->vp_nvlist);
+
+ if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, 0) == 0)
+ vdev_label_write(zio, vd, l, vp,
+ offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t),
+ vdev_sync_label_done, NULL);
+
+ zio_buf_free(vp, sizeof (vdev_phys_t));
+ nvlist_free(label);
+
+ dprintf("%s label %d txg %llu\n", vdev_description(vd), l, txg);
+}
+
+/*
+ * Synchronously write label copy l for the top-level vdev vd (and all
+ * of its leaves) at txg.  As with the uberblock sync, partial success
+ * is treated as success; ENODEV is returned if no label write landed
+ * and no other error was reported.
+ */
+static int
+vdev_sync_labels(vdev_t *vd, int l, uint64_t txg)
+{
+ uint64_t *good_writes;
+ zio_t *zio;
+ int error;
+
+ ASSERT(vd == vd->vdev_top);
+
+ good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+
+ zio = zio_root(vd->vdev_spa, NULL, good_writes,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+
+ /*
+ * Recursively kick off writes to all labels.
+ */
+ vdev_sync_label(zio, vd, l, txg);
+
+ error = zio_wait(zio);
+
+ if (error && *good_writes != 0) {
+ dprintf("partial success: good_writes = %llu\n", *good_writes);
+ error = 0;
+ }
+
+ if (*good_writes == 0 && error == 0)
+ error = ENODEV;
+
+ kmem_free(good_writes, sizeof (uint64_t));
+
+ return (error);
+}
+
+/*
+ * Sync the entire vdev configuration.
+ *
+ * The order of operations is carefully crafted to ensure that
+ * if the system panics or loses power at any time, the state on disk
+ * is still transactionally consistent. The in-line comments below
+ * describe the failure semantics at each stage.
+ *
+ * Moreover, it is designed to be idempotent: if spa_sync_labels() fails
+ * at any time, you can just call it again, and it will resume its work.
+ */
+int
+spa_sync_labels(spa_t *spa, uint64_t txg)
+{
+ uberblock_t *ub = &spa->spa_uberblock;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd, *uvd;
+ zio_t *zio;
+ int c, l, error;
+
+ ASSERT(ub->ub_txg <= txg);
+
+ /*
+ * If this isn't a resync due to I/O errors, and nothing changed
+ * in this transaction group, and the vdev configuration hasn't changed,
+ * and this isn't an explicit sync-all, then there's nothing to do.
+ */
+ if (ub->ub_txg < txg && uberblock_update(ub, rvd, txg) == B_FALSE &&
+ list_is_empty(&spa->spa_dirty_list)) {
+ dprintf("nothing to sync in %s in txg %llu\n",
+ spa_name(spa), txg);
+ return (0);
+ }
+
+ /* A frozen pool never advances its on-disk state past the freeze. */
+ if (txg > spa_freeze_txg(spa))
+ return (0);
+
+ dprintf("syncing %s txg %llu\n", spa_name(spa), txg);
+
+ /*
+ * Flush the write cache of every disk that's been written to
+ * in this transaction group. This ensures that all blocks
+ * written in this txg will be committed to stable storage
+ * before any uberblock that references them.
+ */
+ zio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+ for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd;
+ vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) {
+ zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
+ NULL, NULL, ZIO_PRIORITY_NOW,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
+ }
+ (void) zio_wait(zio);
+
+ /*
+ * Sync out the even labels (L0, L2) for every dirty vdev. If the
+ * system dies in the middle of this process, that's OK: all of the
+ * even labels that made it to disk will be newer than any uberblock,
+ * and will therefore be considered invalid. The odd labels (L1, L3),
+ * which have not yet been touched, will still be valid.
+ */
+ for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
+ vd = list_next(&spa->spa_dirty_list, vd)) {
+ for (l = 0; l < VDEV_LABELS; l++) {
+ if (l & 1)
+ continue;
+ if ((error = vdev_sync_labels(vd, l, txg)) != 0)
+ return (error);
+ }
+ }
+
+ /*
+ * Flush the new labels to disk. This ensures that all even-label
+ * updates are committed to stable storage before the uberblock update.
+ */
+ zio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+ for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
+ vd = list_next(&spa->spa_dirty_list, vd)) {
+ zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
+ NULL, NULL, ZIO_PRIORITY_NOW,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
+ }
+ (void) zio_wait(zio);
+
+ /*
+ * If there are any dirty vdevs, sync the uberblock to all vdevs.
+ * Otherwise, pick one top-level vdev at random.
+ */
+ if (!list_is_empty(&spa->spa_dirty_list))
+ uvd = rvd;
+ else
+ uvd = rvd->vdev_child[spa_get_random(rvd->vdev_children)];
+
+ /*
+ * Sync the uberblocks. If the system dies in the middle of this
+ * step, there are two cases to consider, and the on-disk state
+ * is consistent either way:
+ *
+ * (1) If none of the new uberblocks made it to disk, then the
+ * previous uberblock will be the newest, and the odd labels
+ * (which had not yet been touched) will be valid with respect
+ * to that uberblock.
+ *
+ * (2) If one or more new uberblocks made it to disk, then they
+ * will be the newest, and the even labels (which had all
+ * been successfully committed) will be valid with respect
+ * to the new uberblocks.
+ */
+ if ((error = vdev_uberblock_sync_tree(spa, ub, uvd, txg)) != 0)
+ return (error);
+
+ /*
+ * Flush the uberblocks to disk. This ensures that the odd labels
+ * are no longer needed (because the new uberblocks and the even
+ * labels are safely on disk), so it is safe to overwrite them.
+ */
+ (void) zio_wait(zio_ioctl(NULL, spa, uvd, DKIOCFLUSHWRITECACHE,
+ NULL, NULL, ZIO_PRIORITY_NOW,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
+
+ /*
+ * Sync out odd labels for every dirty vdev. If the system dies
+ * in the middle of this process, the even labels and the new
+ * uberblocks will suffice to open the pool. The next time
+ * the pool is opened, the first thing we'll do -- before any
+ * user data is modified -- is mark every vdev dirty so that
+ * all labels will be brought up to date.
+ */
+ for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
+ vd = list_next(&spa->spa_dirty_list, vd)) {
+ for (l = 0; l < VDEV_LABELS; l++) {
+ if ((l & 1) == 0)
+ continue;
+ if ((error = vdev_sync_labels(vd, l, txg)) != 0)
+ return (error);
+ }
+ }
+
+ /*
+ * Flush the new labels to disk. This ensures that all odd-label
+ * updates are committed to stable storage before the next
+ * transaction group begins.
+ */
+ zio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+ for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
+ vd = list_next(&spa->spa_dirty_list, vd)) {
+ zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
+ NULL, NULL, ZIO_PRIORITY_NOW,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
+ }
+ (void) zio_wait(zio);
+
+ /*
+ * Clear the dirty list.
+ */
+ while (!list_is_empty(&spa->spa_dirty_list))
+ vdev_config_clean(list_head(&spa->spa_dirty_list));
+
+#ifdef DEBUG
+ for (c = 0; c < rvd->vdev_children; c++) {
+ ASSERT(rvd->vdev_child[c]->vdev_is_dirty == 0);
+ }
+#endif
+
+ return (0);
+}
diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c
new file mode 100644
index 0000000000..45eb7ce78b
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c
@@ -0,0 +1,414 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device vector for mirroring.
+ */
+
+/* Per-child I/O bookkeeping, one entry per mirror child (see io_vsd). */
+typedef struct mirror_map {
+ int mm_error; /* last I/O error seen from this child */
+ short mm_tried; /* nonzero once an I/O was issued to this child */
+ short mm_skipped; /* passed over (dead child or stale DTL) */
+} mirror_map_t;
+
+/*
+ * Allocate a zeroed mirror map with one slot per child and stash it in
+ * the zio's vendor-specific data; freed by vdev_mirror_map_free().
+ */
+static mirror_map_t *
+vdev_mirror_map_alloc(zio_t *zio)
+{
+ zio->io_vsd = kmem_zalloc(zio->io_vd->vdev_children *
+ sizeof (mirror_map_t), KM_SLEEP);
+ return (zio->io_vsd);
+}
+
+/* Release the per-child map allocated by vdev_mirror_map_alloc(). */
+static void
+vdev_mirror_map_free(zio_t *zio)
+{
+ kmem_free(zio->io_vsd,
+ zio->io_vd->vdev_children * sizeof (mirror_map_t));
+ zio->io_vsd = NULL;
+}
+
+/*
+ * Open all children of a mirror.  The mirror's usable size is the
+ * minimum child asize; the open succeeds as long as at least one child
+ * opened, with the last child-open error returned only if all failed.
+ */
+static int
+vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
+{
+ vdev_t *cvd;
+ uint64_t c;
+ int numerrors = 0;
+ int ret, lasterror = 0;
+
+ if (vd->vdev_children == 0) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ cvd = vd->vdev_child[c];
+
+ if ((ret = vdev_open(cvd)) != 0) {
+ lasterror = ret;
+ numerrors++;
+ continue;
+ }
+
+ *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
+ /*
+ * NOTE(review): *ashift simply takes the last opened
+ * child's value, so it assumes all children share an
+ * ashift -- confirm, or take the max across children.
+ */
+ *ashift = cvd->vdev_ashift;
+ }
+
+ if (numerrors == vd->vdev_children) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+ return (lasterror);
+ }
+
+ return (0);
+}
+
+/* Close every child of the mirror. */
+static void
+vdev_mirror_close(vdev_t *vd)
+{
+ uint64_t c;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_close(vd->vdev_child[c]);
+}
+
+/* Record a completed child I/O's result in its mirror-map slot. */
+static void
+vdev_mirror_child_done(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_private;
+
+ mm->mm_error = zio->io_error;
+ mm->mm_tried = 1;
+ mm->mm_skipped = 0;
+}
+
+/*
+ * Completion for a scrub read issued to one child: on success, copy
+ * this child's data into the parent zio's buffer (under the parent's
+ * lock, since siblings may complete concurrently), then free the
+ * per-child buffer and record the result like vdev_mirror_child_done.
+ */
+static void
+vdev_mirror_scrub_done(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_private;
+
+ if (zio->io_error == 0) {
+ zio_t *pio = zio->io_parent;
+ mutex_enter(&pio->io_lock);
+ bcopy(zio->io_data, pio->io_data, pio->io_size);
+ mutex_exit(&pio->io_lock);
+ }
+
+ zio_buf_free(zio->io_data, zio->io_size);
+
+ mm->mm_error = zio->io_error;
+ mm->mm_tried = 1;
+ mm->mm_skipped = 0;
+}
+
+/*
+ * Try to find a child whose DTL doesn't contain the block we want to read.
+ * If we can't, try the read on any vdev we haven't already tried.
+ * Returns the chosen child index, or -1 if every child has failed.
+ */
+static int
+vdev_mirror_child_select(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+ vdev_t *vd = zio->io_vd;
+ vdev_t *cvd;
+ uint64_t txg = zio->io_txg;
+ int i, c;
+
+ ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg);
+
+ /*
+ * Select the child we'd like to read from absent any errors.
+ * The current policy is to alternate sides at 8M granularity.
+ * XXX -- investigate other policies for read distribution.
+ */
+ c = (zio->io_offset >> (SPA_MAXBLOCKSHIFT + 6)) % vd->vdev_children;
+
+ /*
+ * If this is a replacing vdev, always try child 0 (the source) first.
+ */
+ if (vd->vdev_ops == &vdev_replacing_ops)
+ c = 0;
+
+ /*
+ * Try to find a child whose DTL doesn't contain the block to read.
+ * If a child is known to be completely inaccessible (indicated by
+ * vdev_is_dead() returning B_TRUE), don't even try.
+ */
+ for (i = 0; i < vd->vdev_children; i++, c++) {
+ if (c >= vd->vdev_children)
+ c = 0; /* wrap around from the preferred start */
+ if (mm[c].mm_tried || mm[c].mm_skipped)
+ continue;
+ cvd = vd->vdev_child[c];
+ if (vdev_is_dead(cvd)) {
+ mm[c].mm_error = ENXIO;
+ mm[c].mm_tried = 1; /* don't even try */
+ mm[c].mm_skipped = 1;
+ continue;
+ }
+ if (!vdev_dtl_contains(&cvd->vdev_dtl_map, txg, 1))
+ return (c);
+ mm[c].mm_error = ESTALE;
+ mm[c].mm_skipped = 1;
+ }
+
+ /*
+ * Every device is either missing or has this txg in its DTL.
+ * If we don't have any sibling replicas to consult, look for
+ * any child we haven't already tried before giving up.
+ */
+ if (vd == vd->vdev_top || vd->vdev_parent->vdev_children <= 1) {
+ for (c = 0; c < vd->vdev_children; c++) {
+ if (!mm[c].mm_tried)
+ return (c);
+ }
+ }
+
+ /*
+ * Every child failed. There's no place left to look.
+ */
+ return (-1);
+}
+
+/*
+ * I/O start for the mirror vdev.  Scrub reads fan out to every child;
+ * normal reads pick one child via vdev_mirror_child_select(); writes
+ * fan out to all children (or just the new side of a replacing vdev
+ * during resilver).  Children are issued asynchronously and the parent
+ * waits for them in vdev_mirror_io_done().
+ */
+static void
+vdev_mirror_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ mirror_map_t *mm;
+ int c, children;
+
+ mm = vdev_mirror_map_alloc(zio);
+
+ if (zio->io_type == ZIO_TYPE_READ) {
+ if (zio->io_flags & ZIO_FLAG_SCRUB) {
+ /*
+ * For scrubbing reads we need to allocate a read
+ * buffer for each child and issue reads to all
+ * children. If any child succeeds, it will copy its
+ * data into zio->io_data in vdev_mirror_scrub_done.
+ */
+ for (c = 0; c < vd->vdev_children; c++) {
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ vd->vdev_child[c], zio->io_offset,
+ zio_buf_alloc(zio->io_size), zio->io_size,
+ zio->io_type, zio->io_priority,
+ ZIO_FLAG_CANFAIL, vdev_mirror_scrub_done,
+ &mm[c]));
+ }
+ zio_wait_children_done(zio);
+ return;
+ }
+ /*
+ * For normal reads just pick one child.
+ */
+ c = vdev_mirror_child_select(zio);
+ children = (c >= 0); /* zero children if nothing selectable */
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+
+ /*
+ * If this is a resilvering I/O to a replacing vdev,
+ * only the last child should be written -- unless the
+ * first child happens to have a DTL entry here as well.
+ * All other writes go to all children.
+ */
+ if ((zio->io_flags & ZIO_FLAG_RESILVER) &&
+ vd->vdev_ops == &vdev_replacing_ops &&
+ !vdev_dtl_contains(&vd->vdev_child[0]->vdev_dtl_map,
+ zio->io_txg, 1)) {
+ c = vd->vdev_children - 1;
+ children = 1;
+ } else {
+ c = 0;
+ children = vd->vdev_children;
+ }
+ }
+
+ while (children--) {
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ vd->vdev_child[c], zio->io_offset, zio->io_data,
+ zio->io_size, zio->io_type, zio->io_priority,
+ ZIO_FLAG_CANFAIL, vdev_mirror_child_done, &mm[c]));
+ c++;
+ }
+
+ zio_wait_children_done(zio);
+}
+
+/*
+ * I/O completion for the mirror vdev.  Tallies per-child results from
+ * the mirror map; writes succeed if any child succeeded, failed reads
+ * are retried on the next eligible child, and a successful read may
+ * trigger self-healing writes to children that returned bad data.
+ */
+static void
+vdev_mirror_io_done(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *cvd;
+ mirror_map_t *mm = zio->io_vsd;
+ int c;
+ int good_copies = 0;
+ int unexpected_errors = 0;
+
+ ASSERT(mm != NULL);
+
+ zio->io_error = 0;
+ zio->io_numerrors = 0;
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ if (mm[c].mm_tried && mm[c].mm_error == 0) {
+ good_copies++;
+ continue;
+ }
+
+ /*
+ * We preserve any EIOs because those may be worth retrying;
+ * whereas ECKSUM and ENXIO are more likely to be persistent.
+ */
+ if (mm[c].mm_error) {
+ if (zio->io_error != EIO)
+ zio->io_error = mm[c].mm_error;
+ if (!mm[c].mm_skipped)
+ unexpected_errors++;
+ zio->io_numerrors++;
+ }
+ }
+
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ /*
+ * XXX -- for now, treat partial writes as success.
+ */
+ /* XXPOLICY */
+ if (good_copies != 0)
+ zio->io_error = 0;
+ ASSERT(mm != NULL);
+ vdev_mirror_map_free(zio);
+ zio_next_stage(zio);
+ return;
+ }
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+ /*
+ * If we don't have a good copy yet, keep trying other children.
+ */
+ /* XXPOLICY */
+ if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
+ ASSERT(c >= 0 && c < vd->vdev_children);
+ cvd = vd->vdev_child[c];
+ dprintf("%s: retrying i/o (err=%d) on child %s\n",
+ vdev_description(zio->io_vd), zio->io_error,
+ vdev_description(cvd));
+ zio->io_error = 0;
+ zio_vdev_io_redone(zio);
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp, cvd,
+ zio->io_offset, zio->io_data, zio->io_size,
+ ZIO_TYPE_READ, zio->io_priority, ZIO_FLAG_CANFAIL,
+ vdev_mirror_child_done, &mm[c]));
+ zio_wait_children_done(zio);
+ return;
+ }
+
+ /* XXPOLICY */
+ if (good_copies)
+ zio->io_error = 0;
+ else
+ ASSERT(zio->io_error != 0);
+
+ /* Self-heal only when the pool is writable and repair is warranted. */
+ if (good_copies && (spa_mode & FWRITE) &&
+ (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
+ /*
+ * Use the good data we have in hand to repair damaged children.
+ */
+ for (c = 0; c < vd->vdev_children; c++) {
+ /*
+ * Don't rewrite known good children.
+ * Not only is it unnecessary, it could
+ * actually be harmful: if the system lost
+ * power while rewriting the only good copy,
+ * there would be no good copies left!
+ */
+ cvd = vd->vdev_child[c];
+
+ if (mm[c].mm_error == 0) {
+ if (mm[c].mm_tried)
+ continue;
+ if (!vdev_dtl_contains(&cvd->vdev_dtl_map,
+ zio->io_txg, 1))
+ continue;
+ mm[c].mm_error = ESTALE;
+ }
+
+ dprintf("%s resilvered %s @ 0x%llx error %d\n",
+ vdev_description(vd),
+ vdev_description(cvd),
+ zio->io_offset, mm[c].mm_error);
+
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp, cvd,
+ zio->io_offset, zio->io_data, zio->io_size,
+ ZIO_TYPE_WRITE, zio->io_priority,
+ ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE, NULL, NULL));
+ }
+ }
+
+ vdev_mirror_map_free(zio);
+ zio_next_stage(zio);
+}
+
+/*
+ * Map child fault/degrade counts onto the mirror's state: all children
+ * faulted -> CANT_OPEN; any faulted or degraded -> DEGRADED; else HEALTHY.
+ */
+static void
+vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
+{
+ if (faulted == vd->vdev_children)
+ vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS);
+ else if (degraded + faulted != 0)
+ vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ else
+ vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+}
+
+/* Operations vector for N-way mirror vdevs. */
+vdev_ops_t vdev_mirror_ops = {
+ vdev_mirror_open,
+ vdev_mirror_close,
+ vdev_default_asize,
+ vdev_mirror_io_start,
+ vdev_mirror_io_done,
+ vdev_mirror_state_change,
+ VDEV_TYPE_MIRROR, /* name of this vdev type */
+ B_FALSE /* not a leaf vdev */
+};
+
+/*
+ * Operations vector for "replacing" vdevs: same machinery as the
+ * mirror, but reads prefer child 0 (the original) and resilver writes
+ * target only the new side (see vdev_mirror_io_start/child_select).
+ */
+vdev_ops_t vdev_replacing_ops = {
+ vdev_mirror_open,
+ vdev_mirror_close,
+ vdev_default_asize,
+ vdev_mirror_io_start,
+ vdev_mirror_io_done,
+ vdev_mirror_state_change,
+ VDEV_TYPE_REPLACING, /* name of this vdev type */
+ B_FALSE /* not a leaf vdev */
+};
diff --git a/usr/src/uts/common/fs/zfs/vdev_missing.c b/usr/src/uts/common/fs/zfs/vdev_missing.c
new file mode 100644
index 0000000000..b35f4a5bcd
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_missing.c
@@ -0,0 +1,89 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * The 'missing' vdev is a special vdev type used only during import. It
+ * signifies a placeholder in the root vdev for some vdev that we know is
+ * missing. We pass it down to the kernel to allow the rest of the
+ * configuration to be parsed and an attempt made to open all available devices.
+ * Because its GUID is always 0, we know that the guid sum will mismatch and we
+ * won't be able to open the pool anyway.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+
+/* ARGSUSED */
+static int
+vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+{
+ /*
+ * Really this should just fail. But then the root vdev will be in the
+ * faulted state with VDEV_AUX_NO_REPLICAS, when what we really want is
+ * VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we
+ * will fail the GUID sum check before ever trying to open the pool.
+ */
+ *psize = SPA_MINDEVSIZE; /* smallest plausible size/shift */
+ *ashift = SPA_MINBLOCKSHIFT;
+ return (0);
+}
+
+/* ARGSUSED */
+/* Nothing was opened, so there is nothing to close. */
+static void
+vdev_missing_close(vdev_t *vd)
+{
+}
+
+/* ARGSUSED */
+/* A missing vdev can never do I/O; fail every request with ENOTSUP. */
+static void
+vdev_missing_io_start(zio_t *zio)
+{
+ zio->io_error = ENOTSUP;
+ zio_next_stage_async(zio);
+}
+
+/* ARGSUSED */
+/* No completion work to do; just advance the pipeline. */
+static void
+vdev_missing_io_done(zio_t *zio)
+{
+ zio_next_stage(zio);
+}
+
+/* Operations vector for the import-time "missing" placeholder vdev. */
+vdev_ops_t vdev_missing_ops = {
+ vdev_missing_open,
+ vdev_missing_close,
+ vdev_default_asize,
+ vdev_missing_io_start,
+ vdev_missing_io_done,
+ NULL, /* no state-change callback needed */
+ VDEV_TYPE_MISSING, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c
new file mode 100644
index 0000000000..09831e1504
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c
@@ -0,0 +1,286 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/avl.h>
+
+/*
+ * Virtual device vector for disk I/O scheduling.
+ */
+/*
+ * AVL comparator: order zios by deadline, then by offset, and finally by
+ * the node addresses themselves so distinct zios never compare equal.
+ */
+int
+vdev_queue_deadline_compare(const void *x1, const void *x2)
+{
+	const zio_t *z1 = x1;
+	const zio_t *z2 = x2;
+
+	if (z1->io_deadline < z2->io_deadline)
+		return (-1);
+	if (z1->io_deadline > z2->io_deadline)
+		return (1);
+
+	if (z1->io_offset < z2->io_offset)
+		return (-1);
+	if (z1->io_offset > z2->io_offset)
+		return (1);
+
+	/* Tie-break on the pointers so the AVL tree allows duplicates. */
+	if (z1 < z2)
+		return (-1);
+	if (z1 > z2)
+		return (1);
+
+	return (0);
+}
+
+/*
+ * AVL comparator: order zios by device offset, tie-breaking on the node
+ * addresses so distinct zios never compare equal.
+ */
+int
+vdev_queue_offset_compare(const void *x1, const void *x2)
+{
+	const zio_t *z1 = x1;
+	const zio_t *z2 = x2;
+
+	if (z1->io_offset < z2->io_offset)
+		return (-1);
+	if (z1->io_offset > z2->io_offset)
+		return (1);
+
+	if (z1 < z2)
+		return (-1);
+	if (z1 > z2)
+		return (1);
+
+	return (0);
+}
+
+/*
+ * Set up the per-vdev I/O queue: a tree ordered by deadline, per-type
+ * (read/write) trees ordered by offset, and a tree of in-flight I/Os.
+ */
+void
+vdev_queue_init(vdev_t *vd)
+{
+	vdev_queue_t *vq = &vd->vdev_queue;
+
+	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare,
+	    sizeof (zio_t), offsetof(struct zio, io_deadline_node));
+
+	avl_create(&vq->vq_read_tree, vdev_queue_offset_compare,
+	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
+
+	avl_create(&vq->vq_write_tree, vdev_queue_offset_compare,
+	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
+
+	avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
+	    sizeof (zio_t), offsetof(struct zio, io_offset_node))
+
+/* Tear down everything created by vdev_queue_init(). */
+void
+vdev_queue_fini(vdev_t *vd)
+{
+	vdev_queue_t *vq = &vd->vdev_queue;
+
+	avl_destroy(&vq->vq_deadline_tree);
+	avl_destroy(&vq->vq_read_tree);
+	avl_destroy(&vq->vq_write_tree);
+	avl_destroy(&vq->vq_pending_tree);
+
+	mutex_destroy(&vq->vq_lock);
+}
+
+/*
+ * Completion callback for an aggregated I/O: copy read data back out to
+ * each delegated child zio, propagate the aggregate's error to each,
+ * advance each child to its next stage, and free the staging buffer.
+ */
+static void
+vdev_queue_agg_io_done(zio_t *aio)
+{
+	zio_t *dio;
+	uint64_t offset = 0;
+
+	while ((dio = aio->io_delegate_list) != NULL) {
+		if (aio->io_type == ZIO_TYPE_READ)
+			bcopy((char *)aio->io_data + offset, dio->io_data,
+			    dio->io_size);
+		offset += dio->io_size;
+		aio->io_delegate_list = dio->io_delegate_next;
+		dio->io_delegate_next = NULL;
+		dio->io_error = aio->io_error;
+		zio_next_stage(dio);
+	}
+	/* The delegates must exactly tile the aggregate buffer. */
+	ASSERT3U(offset, ==, aio->io_size);
+
+	zio_buf_free(aio->io_data, aio->io_size);
+}
+
+/* True iff 'nio' begins exactly where 'io' ends (physically contiguous). */
+#define	IS_ADJACENT(io, nio) \
+	((io)->io_offset + (io)->io_size == (nio)->io_offset)
+
+/* Signature of the function used to actually issue a selected zio. */
+typedef void zio_issue_func_t(zio_t *);
+
+/*
+ * Select the next I/O to issue. Starting from the zio with the earliest
+ * deadline, greedily aggregate physically adjacent zios of the same type
+ * (in both directions) up to vq_agg_limit bytes. Returns NULL when the
+ * pending limit is reached or nothing is queued; otherwise returns the
+ * zio to issue and sets *funcp to the function that must issue it
+ * (zio_nowait for an aggregate, zio_next_stage for a single zio).
+ * Caller must hold vq_lock.
+ */
+static zio_t *
+vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit,
+	zio_issue_func_t **funcp)
+{
+	zio_t *fio, *lio, *aio, *dio;
+	avl_tree_t *tree;
+	uint64_t size;
+
+	ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+	*funcp = NULL;
+
+	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
+	    avl_numnodes(&vq->vq_deadline_tree) == 0)
+		return (NULL);
+
+	fio = lio = avl_first(&vq->vq_deadline_tree);
+
+	tree = fio->io_vdev_tree;
+	size = fio->io_size;
+
+	/* Walk backward in offset order, prepending adjacent zios. */
+	while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) &&
+	    size + dio->io_size <= vq->vq_agg_limit) {
+		dio->io_delegate_next = fio;
+		fio = dio;
+		size += dio->io_size;
+	}
+
+	/* Walk forward in offset order, appending adjacent zios. */
+	while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) &&
+	    size + dio->io_size <= vq->vq_agg_limit) {
+		lio->io_delegate_next = dio;
+		lio = dio;
+		size += dio->io_size;
+	}
+
+	if (fio != lio) {
+		/*
+		 * We found something to aggregate: build one child I/O
+		 * covering the whole contiguous range and delegate the
+		 * originals to it.
+		 */
+		char *buf = zio_buf_alloc(size);
+		uint64_t offset = 0;
+		int nagg = 0;
+
+		ASSERT(size <= vq->vq_agg_limit);
+
+		aio = zio_vdev_child_io(fio, NULL, fio->io_vd,
+		    fio->io_offset, buf, size, fio->io_type,
+		    ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE |
+		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE,
+		    vdev_queue_agg_io_done, NULL);
+
+		aio->io_delegate_list = fio;
+
+		for (dio = fio; dio != NULL; dio = dio->io_delegate_next) {
+			ASSERT(dio->io_type == aio->io_type);
+			if (dio->io_type == ZIO_TYPE_WRITE)
+				bcopy(dio->io_data, buf + offset, dio->io_size);
+			offset += dio->io_size;
+			avl_remove(&vq->vq_deadline_tree, dio);
+			avl_remove(tree, dio);
+			zio_vdev_io_bypass(dio);
+			nagg++;
+		}
+
+		ASSERT(offset == size);
+
+		dprintf("%5s  T=%llu off=%8llx agg=%3d "
+		    "old=%5llx new=%5llx\n",
+		    zio_type_name[fio->io_type],
+		    fio->io_deadline, fio->io_offset, nagg, fio->io_size, size);
+
+		avl_add(&vq->vq_pending_tree, aio);
+
+		*funcp = zio_nowait;
+		return (aio);
+	}
+
+	/* No aggregation possible: issue the deadline-first zio alone. */
+	avl_remove(&vq->vq_deadline_tree, fio);
+	avl_remove(tree, fio);
+
+	avl_add(&vq->vq_pending_tree, fio);
+
+	*funcp = zio_next_stage;
+
+	return (fio);
+}
+
+/*
+ * Queue an incoming read or write zio. The deadline is the issue
+ * timestamp scaled down by vq_time_shift and biased by priority, so
+ * lower-priority I/O sorts later. If fewer than vq_min_pending I/Os
+ * are in flight, the next eligible I/O is issued. Returns the zio the
+ * caller should continue driving, or NULL if it was handed off (an
+ * aggregate was issued via zio_nowait instead).
+ */
+zio_t *
+vdev_queue_io(zio_t *zio)
+{
+	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+	zio_t *nio;
+	zio_issue_func_t *func;
+
+	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+
+	/* Already queued once -- don't queue (or cache) it again. */
+	if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
+		return (zio);
+
+	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
+
+	if (zio->io_type == ZIO_TYPE_READ)
+		zio->io_vdev_tree = &vq->vq_read_tree;
+	else
+		zio->io_vdev_tree = &vq->vq_write_tree;
+
+	mutex_enter(&vq->vq_lock);
+
+	zio->io_deadline = (zio->io_timestamp >> vq->vq_time_shift) +
+	    zio->io_priority;
+
+	avl_add(&vq->vq_deadline_tree, zio);
+	avl_add(zio->io_vdev_tree, zio);
+
+	nio = vdev_queue_io_to_issue(vq, vq->vq_min_pending, &func);
+
+	mutex_exit(&vq->vq_lock);
+
+	if (nio == NULL || func != zio_nowait)
+		return (nio);
+
+	/* An aggregate must be launched here; the caller never sees it. */
+	func(nio);
+	return (NULL);
+}
+
+/*
+ * Called when a queued zio completes: remove it from the pending tree,
+ * then issue up to vq_ramp_rate more I/Os, subject to vq_max_pending.
+ * vq_lock is dropped around each issue since func() may re-enter the
+ * queue.
+ */
+void
+vdev_queue_io_done(zio_t *zio)
+{
+	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+	zio_t *nio;
+	zio_issue_func_t *func;
+	int i;
+
+	mutex_enter(&vq->vq_lock);
+
+	avl_remove(&vq->vq_pending_tree, zio);
+
+	for (i = 0; i < vq->vq_ramp_rate; i++) {
+		nio = vdev_queue_io_to_issue(vq, vq->vq_max_pending, &func);
+		if (nio == NULL)
+			break;
+		mutex_exit(&vq->vq_lock);
+		/* Single zios were bypassed earlier; restart their pipeline. */
+		if (func == zio_next_stage)
+			zio_vdev_io_reissue(nio);
+		func(nio);
+		mutex_enter(&vq->vq_lock);
+	}
+
+	mutex_exit(&vq->vq_lock);
+}
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c
new file mode 100644
index 0000000000..54547a3c97
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c
@@ -0,0 +1,599 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device vector for RAID-Z.
+ */
+
+/*
+ * We currently allow up to two-way replication (i.e. single-fault
+ * reconstruction) models in RAID-Z vdevs. The blocks in such vdevs
+ * must all be multiples of two times the leaf vdev blocksize.
+ */
+#define	VDEV_RAIDZ_ALIGN	2ULL
+
+/* Per-column state for one logical RAID-Z I/O. */
+typedef struct raidz_col {
+	uint64_t rc_col;	/* child vdev index for this column */
+	uint64_t rc_offset;	/* device offset of this column */
+	uint64_t rc_size;	/* I/O size for this column */
+	void *rc_data;		/* column data buffer */
+	int rc_error;		/* error from this column's child I/O */
+	short rc_tried;		/* nonzero once the child I/O was issued */
+	short rc_skipped;	/* nonzero if the error was anticipated */
+} raidz_col_t;
+
+/* Layout of one logical RAID-Z I/O across the child vdevs. */
+typedef struct raidz_map {
+	uint64_t rm_cols;	/* number of columns in use */
+	uint64_t rm_bigcols;	/* leading columns carrying an extra sector */
+	uint64_t rm_asize;	/* allocated size, parity included */
+	int rm_missing_child;	/* column known bad before I/O, or -1 */
+	int rm_type;		/* RAIDZ_SINGLE or RAIDZ_PARITY */
+	int rm_firstdatacol;	/* index of the first data column */
+	raidz_col_t rm_col[1];	/* variable-length array of columns */
+} raidz_map_t;
+
+#define	RAIDZ_SINGLE	0	/* no parity; pure striping */
+#define	RAIDZ_PARITY	1	/* single parity, kept in column 0 */
+
+/*
+ * Map a logical I/O onto the child vdevs: compute each column's child
+ * index, device offset and size, allocate fresh buffers for the parity
+ * columns, and carve the zio's own buffer into the data columns. The
+ * map is hung off zio->io_vsd and freed by vdev_raidz_map_free().
+ */
+static raidz_map_t *
+vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
+    int raid_type)
+{
+	raidz_map_t *rm;
+	uint64_t b = zio->io_offset >> unit_shift;	/* starting sector */
+	uint64_t s = zio->io_size >> unit_shift;	/* size in sectors */
+	uint64_t f = b % dcols;				/* first column */
+	uint64_t o = (b / dcols) << unit_shift;		/* row offset */
+	uint64_t q, r, c, bc, col, acols, coff;
+	int firstdatacol;
+
+	switch (raid_type) {
+	case RAIDZ_SINGLE:
+		q = s / dcols;
+		r = s - q * dcols;
+		bc = r;
+		firstdatacol = 0;
+		break;
+	case RAIDZ_PARITY:
+		q = s / (dcols - 1);
+		r = s - q * (dcols - 1);
+		bc = r + !!r;	/* the parity column is always "big" */
+		firstdatacol = 1;
+		break;
+	default:
+		/*
+		 * Previously an unrecognized type fell through with q, r,
+		 * bc and firstdatacol uninitialized; fail loudly instead.
+		 */
+		cmn_err(CE_PANIC, "invalid RAID-Z type %d", raid_type);
+		/* NOTREACHED */
+	}
+
+	acols = (q == 0 ? bc : dcols);
+
+	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
+
+	rm->rm_cols = acols;
+	rm->rm_bigcols = bc;
+	rm->rm_asize = 0;
+	rm->rm_missing_child = -1;
+	rm->rm_type = raid_type;
+	rm->rm_firstdatacol = firstdatacol;
+
+	for (c = 0; c < acols; c++) {
+		col = f + c;
+		coff = o;
+		if (col >= dcols) {
+			col -= dcols;
+			coff += 1ULL << unit_shift;
+		}
+		rm->rm_col[c].rc_col = col;
+		rm->rm_col[c].rc_offset = coff;
+		rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
+		rm->rm_col[c].rc_data = NULL;
+		rm->rm_col[c].rc_error = 0;
+		rm->rm_col[c].rc_tried = 0;
+		rm->rm_col[c].rc_skipped = 0;
+		rm->rm_asize += rm->rm_col[c].rc_size;
+	}
+
+	rm->rm_asize = P2ROUNDUP(rm->rm_asize, VDEV_RAIDZ_ALIGN << unit_shift);
+
+	/* Parity columns get private buffers; data maps the zio's buffer. */
+	for (c = 0; c < rm->rm_firstdatacol; c++)
+		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
+
+	rm->rm_col[c].rc_data = zio->io_data;
+
+	for (c = c + 1; c < acols; c++)
+		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
+		    rm->rm_col[c - 1].rc_size;
+
+	if (raid_type == RAIDZ_PARITY) {
+		/*
+		 * To prevent hot parity disks, switch the parity and data
+		 * columns every 1MB.
+		 */
+		ASSERT(rm->rm_cols >= 2);
+		ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
+
+		if (zio->io_offset & (1ULL << 20)) {
+			col = rm->rm_col[0].rc_col;
+			o = rm->rm_col[0].rc_offset;
+			rm->rm_col[0].rc_col = rm->rm_col[1].rc_col;
+			rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
+			rm->rm_col[1].rc_col = col;
+			rm->rm_col[1].rc_offset = o;
+		}
+	}
+
+	zio->io_vsd = rm;
+	return (rm);
+}
+
+/*
+ * Free the parity buffers and the map itself; the data columns point
+ * into zio->io_data, which the zio owns.
+ */
+static void
+vdev_raidz_map_free(zio_t *zio)
+{
+	raidz_map_t *rm = zio->io_vsd;
+	int c;
+
+	for (c = 0; c < rm->rm_firstdatacol; c++)
+		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
+
+	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
+	zio->io_vsd = NULL;
+}
+
+/*
+ * Rebuild column x as the XOR of all other columns. Since single
+ * parity is a plain XOR, the same routine both regenerates lost data
+ * and generates parity (reconstructing column 0).
+ */
+static void
+vdev_raidz_reconstruct(raidz_map_t *rm, int x)
+{
+	uint64_t *dst, *src, count, xsize, csize;
+	int i, c;
+
+	for (c = 0; c < rm->rm_cols; c++) {
+		if (c == x)
+			continue;
+		src = rm->rm_col[c].rc_data;
+		dst = rm->rm_col[x].rc_data;
+		csize = rm->rm_col[c].rc_size;
+		xsize = rm->rm_col[x].rc_size;
+		count = MIN(csize, xsize) / sizeof (uint64_t);
+		if (c == !x) {
+			/*
+			 * The initial copy happens at either c == 0 or c == 1.
+			 * Both of these columns are 'big' columns, so we'll
+			 * definitely initialize all of column x.
+			 */
+			ASSERT3U(xsize, <=, csize);
+			for (i = 0; i < count; i++)
+				*dst++ = *src++;
+		} else {
+			for (i = 0; i < count; i++)
+				*dst++ ^= *src++;
+		}
+	}
+}
+
+/*
+ * Open all children; the group's usable size is the smallest child's
+ * size times the number of children (columns). More than one failed
+ * child means the single-parity group cannot be opened.
+ */
+static int
+vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
+{
+	vdev_t *cvd;
+	int c, error;
+	int lasterror = 0;
+	int numerrors = 0;
+
+	/*
+	 * XXX -- minimum children should be raid-type-specific
+	 */
+	if (vd->vdev_children < 2) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (EINVAL);
+	}
+
+	for (c = 0; c < vd->vdev_children; c++) {
+		cvd = vd->vdev_child[c];
+
+		if ((error = vdev_open(cvd)) != 0) {
+			lasterror = error;
+			numerrors++;
+			continue;
+		}
+
+		/* Track the minimum child size (the -1/+1 avoids underflow). */
+		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
+		*ashift = cvd->vdev_ashift;
+	}
+
+	*asize *= vd->vdev_children;
+
+	if (numerrors > 1) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+		return (lasterror);
+	}
+
+	return (0);
+}
+
+/* Close every child vdev in the RAID-Z group. */
+static void
+vdev_raidz_close(vdev_t *vd)
+{
+	int c;
+
+	for (c = 0; c < vd->vdev_children; c++)
+		vdev_close(vd->vdev_child[c]);
+}
+
+/*
+ * Convert a logical (physical-data) size into the space actually
+ * consumed: add one parity sector per (cols - 1) data sectors, then
+ * round up to the RAID-Z allocation alignment.
+ */
+static uint64_t
+vdev_raidz_asize(vdev_t *vd, uint64_t psize)
+{
+	uint64_t asize;
+	uint64_t cols = vd->vdev_children;
+
+	/*
+	 * These calculations assume RAIDZ_PARITY.
+	 */
+	asize = psize >> vd->vdev_ashift;
+	asize += (asize + cols - 2) / (cols - 1);
+	asize = P2ROUNDUP(asize, VDEV_RAIDZ_ALIGN) << vd->vdev_ashift;
+
+	return (asize);
+}
+
+/* Record a child I/O's result in its column's state. */
+static void
+vdev_raidz_child_done(zio_t *zio)
+{
+	raidz_col_t *rc = zio->io_private;
+
+	rc->rc_error = zio->io_error;
+	rc->rc_tried = 1;
+	rc->rc_skipped = 0;
+}
+
+/* Free the private copy of repair data once the rewrite completes. */
+static void
+vdev_raidz_repair_done(zio_t *zio)
+{
+	zio_buf_free(zio->io_data, zio->io_size);
+}
+
+/*
+ * Start a RAID-Z I/O. Writes generate parity and fan out to every
+ * column. Reads skip columns that are known dead or stale (recording
+ * which one, for later reconstruction) and, unless a column is already
+ * missing or this is a scrub, do not bother reading the parity.
+ */
+static void
+vdev_raidz_io_start(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_t *cvd;
+	blkptr_t *bp = zio->io_bp;
+	raidz_map_t *rm;
+	raidz_col_t *rc;
+	int c;
+
+	rm = vdev_raidz_map_alloc(zio, vd->vdev_ashift, vd->vdev_children,
+	    RAIDZ_PARITY);
+
+	/* Sanity-check the map against what the block pointer claims. */
+	if (DVA_GET_GANG(ZIO_GET_DVA(zio))) {
+		ASSERT3U(rm->rm_asize, ==,
+		    vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE));
+		ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
+	} else {
+		ASSERT3U(rm->rm_asize, ==, DVA_GET_ASIZE(ZIO_GET_DVA(zio)));
+		ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
+	}
+
+	if (zio->io_type == ZIO_TYPE_WRITE) {
+
+		/*
+		 * Generate RAID parity in virtual column 0.
+		 */
+		vdev_raidz_reconstruct(rm, 0);
+
+		for (c = 0; c < rm->rm_cols; c++) {
+			rc = &rm->rm_col[c];
+			cvd = vd->vdev_child[rc->rc_col];
+			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+			    rc->rc_offset, rc->rc_data, rc->rc_size,
+			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
+			    vdev_raidz_child_done, rc));
+		}
+		zio_wait_children_done(zio);
+		return;
+	}
+
+	ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+	for (c = rm->rm_cols - 1; c >= 0; c--) {
+		rc = &rm->rm_col[c];
+		cvd = vd->vdev_child[rc->rc_col];
+		if (vdev_is_dead(cvd)) {
+			rm->rm_missing_child = c;
+			rc->rc_error = ENXIO;
+			rc->rc_tried = 1;	/* don't even try */
+			rc->rc_skipped = 1;
+			continue;
+		}
+		if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
+			rm->rm_missing_child = c;
+			rc->rc_error = ESTALE;
+			rc->rc_skipped = 1;
+			continue;
+		}
+		if (c >= rm->rm_firstdatacol || rm->rm_missing_child != -1 ||
+		    (zio->io_flags & ZIO_FLAG_SCRUB)) {
+			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+			    rc->rc_offset, rc->rc_data, rc->rc_size,
+			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
+			    vdev_raidz_child_done, rc));
+		}
+	}
+
+	zio_wait_children_done(zio);
+}
+
+/*
+ * Complete a RAID-Z I/O. Writes succeed if no more columns failed than
+ * parity can cover. Reads are verified by checksum and, on failure,
+ * recovered in escalating steps: reconstruct the single expected-bad
+ * column; re-read every column not yet tried; reconstruct each column
+ * that reported an error; finally try substituting every column in turn
+ * (combinatorial reconstruction). Whenever good data is in hand, any
+ * damaged children are rewritten before the map is freed.
+ */
+static void
+vdev_raidz_io_done(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_t *cvd;
+	raidz_map_t *rm = zio->io_vsd;
+	raidz_col_t *rc;
+	blkptr_t *bp = zio->io_bp;
+	int unexpected_errors = 0;
+	int c;
+
+	ASSERT(bp != NULL);	/* XXX need to add code to enforce this */
+
+	zio->io_error = 0;
+	zio->io_numerrors = 0;
+
+	for (c = 0; c < rm->rm_cols; c++) {
+		rc = &rm->rm_col[c];
+
+		/*
+		 * We preserve any EIOs because those may be worth retrying;
+		 * whereas ECKSUM and ENXIO are more likely to be persistent.
+		 */
+		if (rc->rc_error) {
+			if (zio->io_error != EIO)
+				zio->io_error = rc->rc_error;
+			if (!rc->rc_skipped)
+				unexpected_errors++;
+			zio->io_numerrors++;
+		}
+	}
+
+	if (zio->io_type == ZIO_TYPE_WRITE) {
+		/*
+		 * If this is not a failfast write, and we were able to
+		 * write enough columns to reconstruct the data, good enough.
+		 */
+		/* XXPOLICY */
+		if (zio->io_numerrors <= rm->rm_firstdatacol &&
+		    !(zio->io_flags & ZIO_FLAG_FAILFAST))
+			zio->io_error = 0;
+
+		vdev_raidz_map_free(zio);
+		zio_next_stage(zio);
+		return;
+	}
+
+	ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+	/*
+	 * If there were no I/O errors, and the data checksums correctly,
+	 * the read is complete.
+	 */
+	/* XXPOLICY */
+	if (zio->io_numerrors == 0 && zio_checksum_error(zio) == 0) {
+		ASSERT(unexpected_errors == 0);
+		ASSERT(zio->io_error == 0);
+
+		/*
+		 * We know the data's good. If we read the parity,
+		 * verify that it's good as well. If not, fix it.
+		 */
+		for (c = 0; c < rm->rm_firstdatacol; c++) {
+			void *orig;
+			rc = &rm->rm_col[c];
+			if (!rc->rc_tried)
+				continue;
+			orig = zio_buf_alloc(rc->rc_size);
+			bcopy(rc->rc_data, orig, rc->rc_size);
+			vdev_raidz_reconstruct(rm, c);
+			if (bcmp(orig, rc->rc_data, rc->rc_size) != 0) {
+				vdev_checksum_error(zio,
+				    vd->vdev_child[rc->rc_col]);
+				rc->rc_error = ECKSUM;
+				unexpected_errors++;
+			}
+			zio_buf_free(orig, rc->rc_size);
+		}
+		goto done;
+	}
+
+	/*
+	 * If there was exactly one I/O error, it's the one we expected,
+	 * and the reconstructed data checksums, the read is complete.
+	 * This happens when one child is offline and vdev_fault_assess()
+	 * knows it, or when one child has stale data and the DTL knows it.
+	 */
+	if (zio->io_numerrors == 1 && (c = rm->rm_missing_child) != -1) {
+		rc = &rm->rm_col[c];
+		ASSERT(unexpected_errors == 0);
+		ASSERT(rc->rc_error == ENXIO || rc->rc_error == ESTALE);
+		vdev_raidz_reconstruct(rm, c);
+		if (zio_checksum_error(zio) == 0) {
+			zio->io_error = 0;
+			goto done;
+		}
+	}
+
+	/*
+	 * This isn't a typical error -- either we got a read error or
+	 * more than one child claimed a problem. Read every block we
+	 * haven't already so we can try combinatorial reconstruction.
+	 */
+	unexpected_errors = 1;
+	rm->rm_missing_child = -1;
+
+	for (c = 0; c < rm->rm_cols; c++)
+		if (!rm->rm_col[c].rc_tried)
+			break;
+
+	if (c != rm->rm_cols) {
+		/* Some columns were never read; read them and come back. */
+		zio->io_error = 0;
+		zio_vdev_io_redone(zio);
+		for (c = 0; c < rm->rm_cols; c++) {
+			rc = &rm->rm_col[c];
+			if (rc->rc_tried)
+				continue;
+			zio_nowait(zio_vdev_child_io(zio, NULL,
+			    vd->vdev_child[rc->rc_col],
+			    rc->rc_offset, rc->rc_data, rc->rc_size,
+			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
+			    vdev_raidz_child_done, rc));
+		}
+		zio_wait_children_done(zio);
+		return;
+	}
+
+	/*
+	 * If there were more errors than parity disks, give up.
+	 */
+	if (zio->io_numerrors > rm->rm_firstdatacol) {
+		ASSERT(zio->io_error != 0);
+		goto done;
+	}
+
+	/*
+	 * The number of I/O errors is correctable. Correct them here.
+	 */
+	ASSERT(zio->io_numerrors <= rm->rm_firstdatacol);
+	for (c = 0; c < rm->rm_cols; c++) {
+		rc = &rm->rm_col[c];
+		ASSERT(rc->rc_tried);
+		if (rc->rc_error) {
+			vdev_raidz_reconstruct(rm, c);
+			if (zio_checksum_error(zio) == 0)
+				zio->io_error = 0;
+			else
+				zio->io_error = rc->rc_error;
+			goto done;
+		}
+	}
+
+	/*
+	 * There were no I/O errors, but the data doesn't checksum.
+	 * Try all permutations to see if we can find one that does.
+	 */
+	ASSERT(zio->io_numerrors == 0);
+	for (c = 0; c < rm->rm_cols; c++) {
+		void *orig;
+		rc = &rm->rm_col[c];
+
+		orig = zio_buf_alloc(rc->rc_size);
+		bcopy(rc->rc_data, orig, rc->rc_size);
+		vdev_raidz_reconstruct(rm, c);
+
+		if (zio_checksum_error(zio) == 0) {
+			zio_buf_free(orig, rc->rc_size);
+			zio->io_error = 0;
+			/*
+			 * If this child didn't know that it returned bad data,
+			 * inform it.
+			 */
+			if (rc->rc_tried && rc->rc_error == 0)
+				vdev_checksum_error(zio,
+				    vd->vdev_child[rc->rc_col]);
+			rc->rc_error = ECKSUM;
+			goto done;
+		}
+
+		/* That wasn't it; restore the column and try the next one. */
+		bcopy(orig, rc->rc_data, rc->rc_size);
+		zio_buf_free(orig, rc->rc_size);
+	}
+
+	/*
+	 * All combinations failed to checksum.
+	 */
+	zio->io_error = ECKSUM;
+
+done:
+	zio_checksum_verified(zio);
+
+	if (zio->io_error == 0 && (spa_mode & FWRITE) &&
+	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
+		/*
+		 * Use the good data we have in hand to repair damaged children.
+		 */
+		for (c = 0; c < rm->rm_cols; c++) {
+			rc = &rm->rm_col[c];
+			cvd = vd->vdev_child[rc->rc_col];
+
+			if (rc->rc_error) {
+				/*
+				 * Make a copy of the data because we're
+				 * going to free the RAID-Z map below.
+				 */
+				void *data = zio_buf_alloc(rc->rc_size);
+				bcopy(rc->rc_data, data, rc->rc_size);
+
+				dprintf("%s resilvered %s @ 0x%llx error %d\n",
+				    vdev_description(vd),
+				    vdev_description(cvd),
+				    zio->io_offset, rc->rc_error);
+
+				zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+				    rc->rc_offset, data, rc->rc_size,
+				    ZIO_TYPE_WRITE, zio->io_priority,
+				    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL |
+				    ZIO_FLAG_DONT_PROPAGATE,
+				    vdev_raidz_repair_done, NULL));
+			}
+		}
+	}
+
+	vdev_raidz_map_free(zio);
+	zio_next_stage(zio);
+}
+
+/*
+ * More faults than single parity can cover means the group can't open;
+ * any fault or degradation at all leaves it degraded.
+ */
+static void
+vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
+{
+	if (faulted > 1)
+		vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS);
+	else if (degraded + faulted != 0)
+		vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+	else
+		vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+}
+
+vdev_ops_t vdev_raidz_ops = {
+	vdev_raidz_open,
+	vdev_raidz_close,
+	vdev_raidz_asize,
+	vdev_raidz_io_start,
+	vdev_raidz_io_done,
+	vdev_raidz_state_change,
+	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
+	B_FALSE			/* not a leaf vdev */
+};
diff --git a/usr/src/uts/common/fs/zfs/vdev_root.c b/usr/src/uts/common/fs/zfs/vdev_root.c
new file mode 100644
index 0000000000..4e44b5bb05
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_root.c
@@ -0,0 +1,98 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device vector for the pool's root vdev.
+ */
+
+/*
+ * Open every top-level child; the root's size is the sum of the
+ * children and its alignment the maximum of theirs. Any child failure
+ * fails the open with the last error seen.
+ */
+static int
+vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
+{
+	vdev_t *cvd;
+	int c, error;
+	int lasterror = 0;
+
+	if (vd->vdev_children == 0) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (EINVAL);
+	}
+
+	for (c = 0; c < vd->vdev_children; c++) {
+		cvd = vd->vdev_child[c];
+
+		if ((error = vdev_open(cvd)) != 0) {
+			lasterror = error;
+			continue;
+		}
+
+		*asize += cvd->vdev_asize;
+		*ashift = MAX(*ashift, cvd->vdev_ashift);
+	}
+
+	if (lasterror)
+		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+
+	return (lasterror);
+}
+
+/* Close every top-level child of the root vdev. */
+static void
+vdev_root_close(vdev_t *vd)
+{
+	int c;
+
+	for (c = 0; c < vd->vdev_children; c++)
+		vdev_close(vd->vdev_child[c]);
+}
+
+/*
+ * The root tolerates no faulted children: any fault makes the pool
+ * unopenable, any degradation degrades it.
+ */
+static void
+vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
+{
+	if (faulted > 0)
+		vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS);
+	else if (degraded != 0)
+		vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+	else
+		vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+}
+
+vdev_ops_t vdev_root_ops = {
+	vdev_root_open,
+	vdev_root_close,
+	vdev_default_asize,
+	NULL,			/* io_start - not applicable to the root */
+	NULL,			/* io_done - not applicable to the root */
+	vdev_root_state_change,
+	VDEV_TYPE_ROOT,		/* name of this vdev type */
+	B_FALSE			/* not a leaf vdev */
+};
diff --git a/usr/src/uts/common/fs/zfs/zap.c b/usr/src/uts/common/fs/zfs/zap.c
new file mode 100644
index 0000000000..1eddb9c250
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zap.c
@@ -0,0 +1,1010 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+
+/*
+ * This file contains the top half of the zfs directory structure
+ * implementation. The bottom half is in zap_leaf.c.
+ *
+ * The zdir is an extendable hash data structure. There is a table of
+ * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
+ * each a constant size and hold a variable number of directory entries.
+ * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
+ *
+ * The pointer table holds a power of 2 number of pointers.
+ * (1<<zap_t->zd_data->zd_phys->zd_prefix_len). The bucket pointed to
+ * by the pointer at index i in the table holds entries whose hash value
+ * has a zd_prefix_len-bit prefix.
+ */
+
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zfs_context.h>
+#include <sys/zap.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+
+#define MIN_FREE (ZAP_LEAF_NUMCHUNKS*9/10)
+
+static void zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx);
+static int zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx);
+static zap_leaf_t *zap_get_leaf_byblk(zap_t *zap, uint64_t blkid,
+ dmu_tx_t *tx, krw_t lt);
+static void zap_put_leaf(zap_leaf_t *l);
+static void zap_leaf_pageout(dmu_buf_t *db, void *vl);
+
+
+/*
+ * Byteswap one fat-zap block. The first word identifies the block
+ * type (in either byte order); leaf blocks have structure and get the
+ * full leaf byteswap, everything else is a flat array of uint64_t.
+ */
+void
+fzap_byteswap(void *vbuf, size_t size)
+{
+	uint64_t block_type;
+
+	ASSERT(size == (1<<ZAP_BLOCK_SHIFT));
+	block_type = *(uint64_t *)vbuf;
+
+	switch (block_type) {
+	case ZBT_LEAF:
+	case BSWAP_64(ZBT_LEAF):
+		zap_leaf_byteswap(vbuf);
+		return;
+	case ZBT_HEADER:
+	case BSWAP_64(ZBT_HEADER):
+	default:
+		/* it's a ptrtbl block */
+		byteswap_uint64_array(vbuf, 1<<ZAP_BLOCK_SHIFT);
+		return;
+	}
+}
+
+/*
+ * Convert a microzap into a fat zap: rewrite the header block as a
+ * zap_phys_t with an embedded pointer table whose entries all point at
+ * block 1, then initialize block 1 as the first (empty) leaf.
+ * Caller must hold zap_rwlock as writer.
+ */
+void
+fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
+{
+	dmu_buf_t *db;
+	zap_leaf_t *l;
+	int i;
+	zap_phys_t *zp;
+
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+	zap->zap_ismicro = FALSE;
+
+	(void) dmu_buf_update_user(zap->zap_dbuf, zap, zap,
+	    &zap->zap_f.zap_phys, zap_pageout);
+
+	mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
+
+	zp = zap->zap_f.zap_phys;
+	/*
+	 * explicitly zero it since it might be coming from an
+	 * initialized microzap
+	 */
+	ASSERT3U(sizeof (zap_phys_t), ==, zap->zap_dbuf->db_size);
+	bzero(zp, sizeof (zap_phys_t));
+	zp->zap_block_type = ZBT_HEADER;
+	zp->zap_magic = ZAP_MAGIC;
+
+	zp->zap_ptrtbl.zt_shift = ZAP_PTRTBL_MIN_SHIFT;
+
+	zp->zap_freeblk = 2;		/* block 1 will be the first leaf */
+	zp->zap_num_leafs = 1;
+	zp->zap_num_entries = 0;
+	zp->zap_salt = zap->zap_salt;
+
+	for (i = 0; i < (1<<ZAP_PTRTBL_MIN_SHIFT); i++)
+		zp->zap_leafs[i] = 1;	/* block 1 will be the first leaf */
+
+	/*
+	 * set up block 1 - the first leaf
+	 */
+	db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	    1<<ZAP_BLOCK_SHIFT);
+	dmu_buf_will_dirty(db, tx);
+
+	/* Temporary in-core leaf, used only to format the block. */
+	l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+	l->l_dbuf = db;
+	l->l_phys = db->db_data;
+
+	zap_leaf_init(l);
+
+	kmem_free(l, sizeof (zap_leaf_t));
+	dmu_buf_rele(db);
+}
+
+/*
+ * Try to obtain the zap rwlock as writer (it may already be held).
+ * On success the zap's dbuf is dirtied; returns nonzero on success,
+ * 0 if the tryupgrade raced and failed.
+ */
+static int
+zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
+{
+	if (RW_WRITE_HELD(&zap->zap_rwlock))
+		return (1);
+	if (rw_tryupgrade(&zap->zap_rwlock)) {
+		dmu_buf_will_dirty(zap->zap_dbuf, tx);
+		return (1);
+	}
+	return (0);
+}
+
+/*
+ * Generic routines for dealing with the pointer & cookie tables.
+ */
+
+/*
+ * Grow a zap table to twice its size, incrementally: each call copies
+ * exactly one old block into two new blocks (duplicating every entry
+ * via transfer_func). The first call allocates the new region; the
+ * call that copies the final block frees the old region and commits
+ * the new zt_blk/zt_numblks/zt_shift. Caller must hold zap_rwlock as
+ * writer.
+ */
+static void
+zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
+    void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
+    dmu_tx_t *tx)
+{
+	uint64_t b, newblk;
+	dmu_buf_t *db_old, *db_new;
+	int hepb = 1<<(ZAP_BLOCK_SHIFT-4);
+	/* hepb = half the number of entries in a block */
+
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+	ASSERT(tbl->zt_blk != 0);
+	ASSERT(tbl->zt_numblks > 0);
+
+	if (tbl->zt_nextblk != 0) {
+		/* A grow is already in progress; continue it. */
+		newblk = tbl->zt_nextblk;
+	} else {
+		newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2, tx);
+		tbl->zt_nextblk = newblk;
+		ASSERT3U(tbl->zt_blks_copied, ==, 0);
+		dmu_prefetch(zap->zap_objset, zap->zap_object,
+		    tbl->zt_blk << ZAP_BLOCK_SHIFT, tbl->zt_numblks <<
+		    ZAP_BLOCK_SHIFT);
+	}
+
+	/*
+	 * Copy the ptrtbl from the old to new location, leaving the odd
+	 * entries blank as we go.
+	 */
+
+	b = tbl->zt_blks_copied;
+	db_old = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	    (tbl->zt_blk + b) << ZAP_BLOCK_SHIFT);
+	dmu_buf_read(db_old);
+
+	/* first half of entries in old[b] go to new[2*b+0] */
+	db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	    (newblk + 2*b+0) << ZAP_BLOCK_SHIFT);
+	dmu_buf_will_dirty(db_new, tx);
+	transfer_func(db_old->db_data, db_new->db_data, hepb);
+	dmu_buf_rele(db_new);
+
+	/* second half of entries in old[b] go to new[2*b+1] */
+	db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	    (newblk + 2*b+1) << ZAP_BLOCK_SHIFT);
+	dmu_buf_will_dirty(db_new, tx);
+	transfer_func((uint64_t *)db_old->db_data + hepb,
+	    db_new->db_data, hepb);
+	dmu_buf_rele(db_new);
+
+	dmu_buf_rele(db_old);
+
+	tbl->zt_blks_copied++;
+
+	dprintf("copied block %llu of %llu\n",
+	    tbl->zt_blks_copied, tbl->zt_numblks);
+
+	if (tbl->zt_blks_copied == tbl->zt_numblks) {
+		/* Migration complete: free the old region, commit the new. */
+		dmu_free_range(zap->zap_objset, zap->zap_object,
+		    tbl->zt_blk << ZAP_BLOCK_SHIFT,
+		    tbl->zt_numblks << ZAP_BLOCK_SHIFT, tx);
+
+		tbl->zt_blk = newblk;
+		tbl->zt_numblks *= 2;
+		tbl->zt_shift++;
+		tbl->zt_nextblk = 0;
+		tbl->zt_blks_copied = 0;
+
+		dprintf("finished; numblocks now %llu (%lluk entries)\n",
+		    tbl->zt_numblks, 1<<(tbl->zt_shift-10));
+	}
+}
+
+/*
+ * Store 'val' at index 'idx' of a zap table, returning the previous
+ * value. If a grow is in progress (zt_nextblk != 0), also mirror the
+ * store into both duplicated entries (2*idx and 2*idx+1) of the new
+ * region so the migration stays consistent.
+ */
+static uint64_t
+zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
+    dmu_tx_t *tx)
+{
+	uint64_t blk, off, oldval;
+	dmu_buf_t *db;
+
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+	ASSERT(tbl->zt_blk != 0);
+
+	dprintf("storing %llx at index %llx\n", val, idx);
+
+	blk = idx >> (ZAP_BLOCK_SHIFT-3);
+	off = idx & ((1<<(ZAP_BLOCK_SHIFT-3))-1);
+
+	db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	    (tbl->zt_blk + blk) << ZAP_BLOCK_SHIFT);
+	dmu_buf_will_dirty(db, tx);
+	oldval = ((uint64_t *)db->db_data)[off];
+	((uint64_t *)db->db_data)[off] = val;
+	dmu_buf_rele(db);
+
+	if (tbl->zt_nextblk != 0) {
+		idx *= 2;
+		blk = idx >> (ZAP_BLOCK_SHIFT-3);
+		off = idx & ((1<<(ZAP_BLOCK_SHIFT-3))-1);
+
+		db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+		    (tbl->zt_nextblk + blk) << ZAP_BLOCK_SHIFT);
+		dmu_buf_will_dirty(db, tx);
+		((uint64_t *)db->db_data)[off] = val;
+		((uint64_t *)db->db_data)[off+1] = val;
+		dmu_buf_rele(db);
+	}
+
+	return (oldval);
+}
+
+/*
+ * Read the value at index 'idx' of a zap table. Reads always go to
+ * the old region, which remains authoritative until a grow completes.
+ */
+static uint64_t
+zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx)
+{
+	uint64_t blk, off, val;
+	dmu_buf_t *db;
+
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+	blk = idx >> (ZAP_BLOCK_SHIFT-3);
+	off = idx & ((1<<(ZAP_BLOCK_SHIFT-3))-1);
+
+	db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	    (tbl->zt_blk + blk) << ZAP_BLOCK_SHIFT);
+	dmu_buf_read(db);
+	val = ((uint64_t *)db->db_data)[off];
+	dmu_buf_rele(db);
+	return (val);
+}
+
+/*
+ * Routines for growing the ptrtbl.
+ */
+
+/*
+ * Table-doubling transfer function for the ptrtbl: each source entry is
+ * duplicated into two adjacent destination entries (both halves of the
+ * split hash range initially point at the same leaf).
+ */
+static void
+zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
+{
+ int i;
+ for (i = 0; i < n; i++) {
+ uint64_t lb = src[i];
+ dst[2*i+0] = lb;
+ dst[2*i+1] = lb;
+ }
+}
+
+/*
+ * Double the size of the pointer table.  The first growth moves the
+ * ptrtbl out of the zap header block into its own block; subsequent
+ * growths go through the generic zap_table_grow() path.  Silently a
+ * no-op once the ptrtbl reaches 2^32 entries.
+ */
+static void
+zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
+{
+ /* hard cap: hash prefix is at most 32 bits here */
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == 32)
+ return;
+
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+ /*
+ * The ptrtbl can no longer be contained in the
+ * header block. Give it its own entire block, which
+ * will quadruple the size of the ptrtbl.
+ */
+ uint64_t newblk;
+ dmu_buf_t *db_new;
+
+ ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
+ ZAP_PTRTBL_MIN_SHIFT);
+ ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0);
+
+ newblk = zap_allocate_blocks(zap, 1, tx);
+ db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ newblk << ZAP_BLOCK_SHIFT);
+
+ dmu_buf_will_dirty(db_new, tx);
+ zap_ptrtbl_transfer(zap->zap_f.zap_phys->zap_leafs,
+ db_new->db_data, 1 << ZAP_PTRTBL_MIN_SHIFT);
+ dmu_buf_rele(db_new);
+
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk;
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1;
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++;
+
+ /* new shift must exactly describe one block of entries */
+ ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks <<
+ (ZAP_BLOCK_SHIFT-3));
+ } else {
+ zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+ zap_ptrtbl_transfer, tx);
+ }
+}
+
+/*
+ * Adjust the on-disk entry count by "delta" (may be negative) under
+ * zap_num_entries_mtx, dirtying the header dbuf in this tx first.
+ */
+static void
+zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
+{
+ dmu_buf_will_dirty(zap->zap_dbuf, tx);
+ mutex_enter(&zap->zap_f.zap_num_entries_mtx);
+
+ /* a negative delta must not underflow the count */
+ ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta);
+
+ zap->zap_f.zap_phys->zap_num_entries += delta;
+
+ mutex_exit(&zap->zap_f.zap_num_entries_mtx);
+}
+
+/*
+ * Allocate "nblocks" contiguous new blocks from the zap object by
+ * atomically advancing zap_freeblk; returns the first allocated block
+ * number.  NOTE(review): when the write lock IS held the header was
+ * presumably already dirtied by the caller (see zap_create_leaf), hence
+ * the dirty call only on the !RW_WRITE_HELD path -- confirm.
+ */
+uint64_t
+zap_allocate_blocks(zap_t *zap, int nblocks, dmu_tx_t *tx)
+{
+ uint64_t newblk;
+ ASSERT(tx != NULL);
+ if (!RW_WRITE_HELD(&zap->zap_rwlock)) {
+ dmu_buf_will_dirty(zap->zap_dbuf, tx);
+ }
+ newblk = atomic_add_64_nv(&zap->zap_f.zap_phys->zap_freeblk, nblocks) -
+ nblocks;
+ return (newblk);
+}
+
+
+/*
+ * This function doesn't increment zap_num_leafs because it's used to
+ * allocate a leaf chain, which doesn't count against zap_num_leafs.
+ * The directory must be held exclusively for this tx.
+ *
+ * Returns a new, initialized, write-locked in-core leaf backed by a
+ * freshly allocated block; caller releases via zap_put_leaf().
+ */
+zap_leaf_t *
+zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
+{
+ void *winner;
+ zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
+
+ ASSERT(tx != NULL);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ /* hence we already dirtied zap->zap_dbuf */
+
+ rw_init(&l->l_rwlock, 0, 0, 0);
+ rw_enter(&l->l_rwlock, RW_WRITER);
+ l->l_blkid = zap_allocate_blocks(zap, 1, tx);
+ l->l_next = NULL;
+ l->l_dbuf = NULL;
+ l->l_phys = NULL;
+
+ l->l_dbuf = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ l->l_blkid << ZAP_BLOCK_SHIFT);
+ /* brand-new block: nobody can have attached a user before us */
+ winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout);
+ ASSERT(winner == NULL);
+ dmu_buf_will_dirty(l->l_dbuf, tx);
+
+ zap_leaf_init(l);
+
+ return (l);
+}
+
+/* ARGSUSED */
+/*
+ * Drop our lock and hold on a leaf.  The underlying block is NOT freed
+ * (see the XXX below); the in-core zap_leaf_t is reclaimed later via
+ * the zap_leaf_pageout dbuf-user callback.
+ */
+void
+zap_destroy_leaf(zap_t *zap, zap_leaf_t *l, dmu_tx_t *tx)
+{
+ /* uint64_t offset = l->l_blkid << ZAP_BLOCK_SHIFT; */
+ rw_exit(&l->l_rwlock);
+ dmu_buf_rele(l->l_dbuf);
+ /* XXX there are still holds on this block, so we can't free it? */
+ /* dmu_free_range(zap->zap_objset, zap->zap_object, */
+ /* offset, 1<<ZAP_BLOCK_SHIFT, tx); */
+}
+
+/*
+ * Return the number of entries in a fat zap via *count.  Always
+ * succeeds (returns 0).
+ */
+int
+fzap_count(zap_t *zap, uint64_t *count)
+{
+ ASSERT(!zap->zap_ismicro);
+ mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
+ *count = zap->zap_f.zap_phys->zap_num_entries;
+ mutex_exit(&zap->zap_f.zap_num_entries_mtx);
+ return (0);
+}
+
+/*
+ * Routines for obtaining zap_leaf_t's
+ */
+
+/*
+ * Release a leaf and every chained leaf hanging off it: unlock each
+ * l_rwlock and drop each dbuf hold.  The next pointer is captured
+ * before the release since the in-core leaf may be paged out.
+ */
+static void
+zap_put_leaf(zap_leaf_t *l)
+{
+ zap_leaf_t *nl = l->l_next;
+ while (nl) {
+ zap_leaf_t *nnl = nl->l_next;
+ rw_exit(&nl->l_rwlock);
+ dmu_buf_rele(nl->l_dbuf);
+ nl = nnl;
+ }
+ rw_exit(&l->l_rwlock);
+ dmu_buf_rele(l->l_dbuf);
+}
+
+_NOTE(ARGSUSED(0))
+/*
+ * dbuf user-eviction callback: tear down the in-core zap_leaf_t when
+ * its backing dbuf is evicted.  Also called directly (db == NULL) by
+ * zap_open_leaf() on the loser of a set-user race.
+ */
+static void
+zap_leaf_pageout(dmu_buf_t *db, void *vl)
+{
+ zap_leaf_t *l = vl;
+
+ rw_destroy(&l->l_rwlock);
+ kmem_free(l, sizeof (zap_leaf_t));
+}
+
+/*
+ * Construct an in-core zap_leaf_t for an existing leaf block and attach
+ * it to the dbuf as its user data.  If another thread attached one
+ * first, free ours and return the winner's.  Returned leaf is unlocked.
+ */
+static zap_leaf_t *
+zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
+{
+ zap_leaf_t *l, *winner;
+
+ ASSERT(blkid != 0);
+
+ l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
+ rw_init(&l->l_rwlock, 0, 0, 0);
+ rw_enter(&l->l_rwlock, RW_WRITER);
+ l->l_blkid = blkid;
+ l->l_next = NULL;
+ l->l_dbuf = db;
+ l->l_phys = NULL;
+
+ /* returns non-NULL iff someone else attached a user first */
+ winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout);
+
+ rw_exit(&l->l_rwlock);
+ if (winner != NULL) {
+ /* someone else set it first */
+ zap_leaf_pageout(NULL, l);
+ l = winner;
+ }
+
+ return (l);
+}
+
+/*
+ * Hold and lock (per "lt") the single leaf at block "blkid", creating
+ * the in-core leaf on first access.  Dirties the block in "tx" when
+ * taken as writer.  Does not follow the leaf chain.
+ */
+static zap_leaf_t *
+zap_get_leaf_byblk_impl(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt)
+{
+ dmu_buf_t *db;
+ zap_leaf_t *l;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ blkid << ZAP_BLOCK_SHIFT);
+
+ ASSERT3U(db->db_object, ==, zap->zap_object);
+ ASSERT3U(db->db_offset, ==, blkid << ZAP_BLOCK_SHIFT);
+ ASSERT3U(db->db_size, ==, 1 << ZAP_BLOCK_SHIFT);
+ ASSERT(blkid != 0);
+
+ dmu_buf_read(db);
+ l = dmu_buf_get_user(db);
+
+ if (l == NULL)
+ l = zap_open_leaf(blkid, db);
+
+ rw_enter(&l->l_rwlock, lt);
+ /*
+ * Must lock before dirtying, otherwise l->l_phys could change,
+ * causing ASSERT below to fail.
+ */
+ if (lt == RW_WRITER)
+ dmu_buf_will_dirty(db, tx);
+ ASSERT3U(l->l_blkid, ==, blkid);
+ ASSERT3P(l->l_dbuf, ==, db);
+ ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data);
+ ASSERT3U(l->lh_block_type, ==, ZBT_LEAF);
+ ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC);
+
+ return (l);
+}
+
+/*
+ * Like zap_get_leaf_byblk_impl(), but also brings in and links every
+ * chained overflow leaf (lh_next) so the whole chain is held/locked.
+ * Release the chain with zap_put_leaf().
+ */
+static zap_leaf_t *
+zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt)
+{
+ zap_leaf_t *l, *nl;
+
+ l = zap_get_leaf_byblk_impl(zap, blkid, tx, lt);
+
+ nl = l;
+ while (nl->lh_next != 0) {
+ zap_leaf_t *nnl;
+ nnl = zap_get_leaf_byblk_impl(zap, nl->lh_next, tx, lt);
+ nl->l_next = nnl;
+ nl = nnl;
+ }
+
+ return (l);
+}
+
+/*
+ * Map a pointer-table index to the leaf block it references, reading
+ * either the header-embedded table or the external ptrtbl.
+ */
+static uint64_t
+zap_idx_to_blk(zap_t *zap, uint64_t idx)
+{
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+ /* ptrtbl still lives inside the zap header block */
+ ASSERT3U(idx, <,
+ (1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift));
+ return (zap->zap_f.zap_phys->zap_leafs[idx]);
+ } else {
+ return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+ idx));
+ }
+}
+
+/*
+ * Set pointer-table entry "idx" to reference leaf block "blk".
+ * NOTE(review): tests zt_blk == 0 where zap_idx_to_blk tests
+ * zt_numblks == 0 -- presumably equivalent (both are 0 until the
+ * ptrtbl leaves the header); confirm.
+ */
+static void
+zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
+{
+ ASSERT(tx != NULL);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) {
+ zap->zap_f.zap_phys->zap_leafs[idx] = blk;
+ } else {
+ (void) zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+ idx, blk, tx);
+ }
+}
+
+/*
+ * Find, hold, and lock (per "lt") the leaf (chain) responsible for
+ * hash value "h": top zt_shift bits of h index the ptrtbl.
+ */
+static zap_leaf_t *
+zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt)
+{
+ uint64_t idx;
+ zap_leaf_t *l;
+
+ ASSERT(zap->zap_dbuf == NULL ||
+ zap->zap_f.zap_phys == zap->zap_dbuf->db_data);
+ ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC);
+ idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+ l = zap_get_leaf_byblk(zap, zap_idx_to_blk(zap, idx), tx, lt);
+
+ /* the leaf we landed on must own this hash prefix */
+ ASSERT3U(ZAP_HASH_IDX(h, l->lh_prefix_len), ==, l->lh_prefix);
+
+ return (l);
+}
+
+
+/*
+ * Make room for more entries destined for "hash": either split leaf
+ * "l" into two (updating the sibling's ptrtbl entries) or, if the leaf
+ * already owns a single ptrtbl slot, chain another overflow leaf onto
+ * it.  May drop and re-acquire the directory lock to upgrade to
+ * writer, in which case the world may have changed underneath us.
+ * Returns the (locked) leaf now covering "hash".
+ */
+static zap_leaf_t *
+zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx)
+{
+ zap_leaf_t *nl;
+ int prefix_diff, i, err;
+ uint64_t sibling;
+
+ ASSERT3U(l->lh_prefix_len, <=,
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ ASSERT3U(ZAP_HASH_IDX(hash, l->lh_prefix_len), ==, l->lh_prefix);
+
+ if (zap_tryupgradedir(zap, tx) == 0) {
+ /* failed to upgrade */
+ int old_prefix_len = l->lh_prefix_len;
+ objset_t *os = zap->zap_objset;
+ uint64_t object = zap->zap_object;
+
+ /* drop everything and re-lock the directory as writer */
+ zap_put_leaf(l);
+ zap_unlockdir(zap);
+ err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, &zap);
+ ASSERT3U(err, ==, 0);
+ ASSERT(!zap->zap_ismicro);
+ l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
+
+ if (l->lh_prefix_len != old_prefix_len)
+ /* it split while our locks were down */
+ return (l);
+ }
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ if (l->lh_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
+ /* There's only one pointer to us. Chain on another leaf blk. */
+ (void) zap_leaf_chainmore(l, zap_create_leaf(zap, tx));
+ dprintf("chaining leaf %x/%d\n", l->lh_prefix,
+ l->lh_prefix_len);
+ return (l);
+ }
+
+ ASSERT3U(ZAP_HASH_IDX(hash, l->lh_prefix_len), ==, l->lh_prefix);
+
+ /* There's more than one pointer to us. Split this leaf. */
+ nl = zap_leaf_split(zap, l, tx);
+
+ /* set sibling pointers */
+ prefix_diff =
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - l->lh_prefix_len;
+ sibling = (ZAP_HASH_IDX(hash, l->lh_prefix_len) | 1) << prefix_diff;
+ /* all 2^prefix_diff ptrtbl slots of the sibling range -> new leaf */
+ for (i = 0; i < (1ULL<<prefix_diff); i++) {
+ ASSERT3U(zap_idx_to_blk(zap, sibling+i), ==, l->l_blkid);
+ zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx);
+ /* dprintf("set %d to %u %x\n", sibling+i, nl->l_blkid, nl); */
+ }
+
+ zap->zap_f.zap_phys->zap_num_leafs++;
+
+ if (hash & (1ULL << (64 - l->lh_prefix_len))) {
+ /* we want the sibling */
+ zap_put_leaf(l);
+ l = nl;
+ } else {
+ zap_put_leaf(nl);
+ }
+
+ return (l);
+}
+
+/*
+ * Release leaf "l"; if it is nearly full (or chained) and already owns
+ * a single ptrtbl slot, grow the pointer table now so the next insert
+ * can split instead of chaining.  May upgrade the directory lock,
+ * retrying from scratch if the upgrade had to drop locks.
+ */
+static void
+zap_put_leaf_maybe_grow_ptrtbl(zap_t *zap,
+ zap_leaf_t *l, dmu_tx_t *tx)
+{
+ int shift, err;
+
+again:
+ shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
+
+ if (l->lh_prefix_len == shift &&
+ (l->l_next != NULL || l->lh_nfree < MIN_FREE)) {
+ /* this leaf will soon make us grow the pointer table */
+
+ if (zap_tryupgradedir(zap, tx) == 0) {
+ objset_t *os = zap->zap_objset;
+ uint64_t zapobj = zap->zap_object;
+ uint64_t blkid = l->l_blkid;
+
+ /* upgrade failed: drop, re-lock as writer, retry */
+ zap_put_leaf(l);
+ zap_unlockdir(zap);
+ err = zap_lockdir(os, zapobj, tx,
+ RW_WRITER, FALSE, &zap);
+ ASSERT3U(err, ==, 0);
+ l = zap_get_leaf_byblk(zap, blkid, tx, RW_READER);
+ goto again;
+ }
+
+ zap_put_leaf(l);
+ zap_grow_ptrtbl(zap, tx);
+ } else {
+ zap_put_leaf(l);
+ }
+}
+
+
+/*
+ * Validate a caller-supplied value description: integer_size must be a
+ * C integer width, and the total byte count must neither wrap uint64
+ * nor exceed DMU_MAX_ACCESS.  Returns 0 or EINVAL.
+ */
+static int
+fzap_checksize(uint64_t integer_size, uint64_t num_integers)
+{
+ /* Only integer sizes supported by C */
+ switch (integer_size) {
+ case 1:
+ case 2:
+ case 4:
+ case 8:
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ /* Make sure we won't overflow */
+ if (integer_size * num_integers < num_integers)
+ return (EINVAL);
+ if (integer_size * num_integers > DMU_MAX_ACCESS)
+ return (EINVAL);
+
+ return (0);
+}
+
+/*
+ * Routines for manipulating attributes.
+ */
+/*
+ * Look up "name" and copy its value into "buf" (num_integers values of
+ * integer_size bytes).  Returns 0, ENOENT, EINVAL on bad sizes, or
+ * EOVERFLOW/EINVAL from zap_entry_read on a size mismatch.
+ */
+int
+fzap_lookup(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+ zap_leaf_t *l;
+ int err;
+ uint64_t hash;
+ zap_entry_handle_t zeh;
+
+ err = fzap_checksize(integer_size, num_integers);
+ if (err != 0)
+ return (err);
+
+ hash = zap_hash(zap, name);
+ l = zap_deref_leaf(zap, hash, NULL, RW_READER);
+ err = zap_leaf_lookup(l, name, hash, &zeh);
+ if (err != 0)
+ goto out;
+ err = zap_entry_read(&zeh, integer_size, num_integers, buf);
+out:
+ zap_put_leaf(l);
+ return (err);
+}
+
+/*
+ * Add a new entry with collision differentiator "cd" (ZAP_MAXCD means
+ * "pick one for me").  Expands the leaf and retries on EAGAIN.  On
+ * return, if "lp" is non-NULL the (locked) leaf is handed back to the
+ * caller instead of being released.  Returns 0, EEXIST, or an error
+ * from zap_entry_create.
+ */
+int
+fzap_add_cd(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers,
+ const void *val, uint32_t cd, dmu_tx_t *tx, zap_leaf_t **lp)
+{
+ zap_leaf_t *l;
+ uint64_t hash;
+ int err;
+ zap_entry_handle_t zeh;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ ASSERT(!zap->zap_ismicro);
+ ASSERT(fzap_checksize(integer_size, num_integers) == 0);
+
+ hash = zap_hash(zap, name);
+ l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
+retry:
+ err = zap_leaf_lookup(l, name, hash, &zeh);
+ if (err == 0) {
+ err = EEXIST;
+ goto out;
+ }
+ ASSERT(err == ENOENT);
+
+ /* XXX If this leaf is chained, split it if we can. */
+ err = zap_entry_create(l, name, hash, cd,
+ integer_size, num_integers, val, &zeh);
+
+ if (err == 0) {
+ zap_increment_num_entries(zap, 1, tx);
+ } else if (err == EAGAIN) {
+ /* no room: split/chain the leaf and try again */
+ l = zap_expand_leaf(zap, l, hash, tx);
+ goto retry;
+ }
+
+out:
+ if (lp)
+ *lp = l;
+ else
+ zap_put_leaf(l);
+ return (err);
+}
+
+/*
+ * Public add: validates sizes, lets fzap_add_cd() pick the collision
+ * differentiator, then releases the leaf (possibly growing the ptrtbl
+ * if the leaf is close to full).
+ */
+int
+fzap_add(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx)
+{
+ int err;
+ zap_leaf_t *l;
+
+ err = fzap_checksize(integer_size, num_integers);
+ if (err != 0)
+ return (err);
+
+ err = fzap_add_cd(zap, name, integer_size, num_integers,
+ val, ZAP_MAXCD, tx, &l);
+
+ zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx);
+ return (err);
+}
+
+/*
+ * Create the entry if absent, otherwise replace its value in place.
+ * Expands the leaf and retries on EAGAIN; releases the leaf (possibly
+ * growing the ptrtbl) before returning.
+ */
+int
+fzap_update(zap_t *zap, const char *name,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+ zap_leaf_t *l;
+ uint64_t hash;
+ int err, create;
+ zap_entry_handle_t zeh;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ err = fzap_checksize(integer_size, num_integers);
+ if (err != 0)
+ return (err);
+
+ hash = zap_hash(zap, name);
+ l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
+retry:
+ err = zap_leaf_lookup(l, name, hash, &zeh);
+ create = (err == ENOENT);
+ ASSERT(err == 0 || err == ENOENT);
+
+ /* XXX If this leaf is chained, split it if we can. */
+
+ if (create) {
+ err = zap_entry_create(l, name, hash, ZAP_MAXCD,
+ integer_size, num_integers, val, &zeh);
+ if (err == 0)
+ zap_increment_num_entries(zap, 1, tx);
+ } else {
+ err = zap_entry_update(&zeh, integer_size, num_integers, val);
+ }
+
+ if (err == EAGAIN) {
+ /* no room: split/chain the leaf and try again */
+ l = zap_expand_leaf(zap, l, hash, tx);
+ goto retry;
+ }
+
+ zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx);
+ return (err);
+}
+
+/*
+ * Report the stored value's integer size and count for "name".  Either
+ * out-pointer may be NULL.  Returns 0 or ENOENT.
+ */
+int
+fzap_length(zap_t *zap, const char *name,
+ uint64_t *integer_size, uint64_t *num_integers)
+{
+ zap_leaf_t *l;
+ int err;
+ uint64_t hash;
+ zap_entry_handle_t zeh;
+
+ hash = zap_hash(zap, name);
+ l = zap_deref_leaf(zap, hash, NULL, RW_READER);
+ err = zap_leaf_lookup(l, name, hash, &zeh);
+ if (err != 0)
+ goto out;
+
+ if (integer_size)
+ *integer_size = zeh.zeh_integer_size;
+ if (num_integers)
+ *num_integers = zeh.zeh_num_integers;
+out:
+ zap_put_leaf(l);
+ return (err);
+}
+
+/*
+ * Remove the entry for "name", decrementing the entry count on
+ * success.  Returns 0 or ENOENT.
+ */
+int
+fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx)
+{
+ zap_leaf_t *l;
+ uint64_t hash;
+ int err;
+ zap_entry_handle_t zeh;
+
+ hash = zap_hash(zap, name);
+ l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
+ err = zap_leaf_lookup(l, name, hash, &zeh);
+ if (err == 0) {
+ zap_entry_remove(&zeh);
+ zap_increment_num_entries(zap, -1, tx);
+ }
+ zap_put_leaf(l);
+ dprintf("fzap_remove: ds=%p obj=%llu name=%s err=%d\n",
+ zap->zap_objset, zap->zap_object, name, err);
+ return (err);
+}
+
+/*
+ * Reverse lookup: linearly scan the zap for the first entry whose
+ * first integer equals "value" and copy its name into "name" (caller's
+ * buffer must be large enough for any stored name).  Returns 0 on a
+ * match or the cursor error (e.g. ENOENT when exhausted).
+ */
+int
+zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name)
+{
+ zap_cursor_t zc;
+ zap_attribute_t *za;
+ int err;
+
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+ for (zap_cursor_init(&zc, os, zapobj);
+ (err = zap_cursor_retrieve(&zc, za)) == 0;
+ zap_cursor_advance(&zc)) {
+ if (za->za_first_integer == value) {
+ (void) strcpy(name, za->za_name);
+ break;
+ }
+ }
+ kmem_free(za, sizeof (zap_attribute_t));
+ return (err);
+}
+
+
+/*
+ * Routines for iterating over the attributes.
+ */
+
+/*
+ * Fill in *za with the entry at or after the cursor's (zc_hash, zc_cd)
+ * position, advancing the cursor past exhausted leaves.  Returns 0 and
+ * updates the cursor to the found entry, or ENOENT when the hash space
+ * is exhausted (cursor parked at -1).
+ */
+int
+fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
+{
+ int err = ENOENT;
+ zap_entry_handle_t zeh;
+ zap_leaf_t *l;
+
+ /* retrieve the next entry at or after zc_hash/zc_cd */
+ /* if no entry, return ENOENT */
+
+again:
+ l = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER);
+ err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
+
+ if (err == ENOENT) {
+ /* skip to the first hash value beyond this leaf's range */
+ uint64_t nocare = (1ULL << (64 - l->lh_prefix_len)) - 1;
+ zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
+ zc->zc_cd = 0;
+ if (l->lh_prefix_len == 0 || zc->zc_hash == 0) {
+ /* wrapped: no more leaves to visit */
+ zc->zc_hash = -1ULL;
+ } else {
+ zap_put_leaf(l);
+ goto again;
+ }
+ }
+
+ if (err == 0) {
+ zc->zc_hash = zeh.zeh_hash;
+ zc->zc_cd = zeh.zeh_cd;
+ za->za_integer_length = zeh.zeh_integer_size;
+ za->za_num_integers = zeh.zeh_num_integers;
+ if (zeh.zeh_num_integers == 0) {
+ za->za_first_integer = 0;
+ } else {
+ err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
+ ASSERT(err == 0 || err == EOVERFLOW);
+ }
+ err = zap_entry_read_name(&zeh,
+ sizeof (za->za_name), za->za_name);
+ ASSERT(err == 0);
+ }
+ zap_put_leaf(l);
+ return (err);
+}
+
+
+/*
+ * Accumulate leaf statistics for every distinct leaf referenced by a
+ * run of "len" pointer-table entries; consecutive duplicate pointers
+ * (a leaf spanning several slots) are counted once via "lastblk".
+ */
+static void
+zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
+{
+ int i;
+ uint64_t lastblk = 0;
+
+ /*
+ * NB: if a leaf has more pointers than an entire ptrtbl block
+ * can hold, then it'll be accounted for more than once, since
+ * we won't have lastblk.
+ */
+ for (i = 0; i < len; i++) {
+ zap_leaf_t *l;
+
+ if (tbl[i] == lastblk)
+ continue;
+ lastblk = tbl[i];
+
+ l = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER);
+
+ zap_stats_leaf(zap, l, zs);
+ zap_put_leaf(l);
+ }
+}
+
+/*
+ * Gather fat-zap statistics into *zs: header counters plus per-leaf
+ * stats walked through the ptrtbl (embedded or external; the external
+ * table is prefetched first).
+ */
+void
+fzap_get_stats(zap_t *zap, zap_stats_t *zs)
+{
+ zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
+ zs->zs_blocksize = 1ULL << ZAP_BLOCK_SHIFT;
+ zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs;
+ zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries;
+ zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk;
+
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+ /* the ptrtbl is entirely in the header block. */
+ zap_stats_ptrtbl(zap, zap->zap_f.zap_phys->zap_leafs,
+ 1 << ZAP_PTRTBL_MIN_SHIFT, zs);
+ } else {
+ int b;
+
+ dmu_prefetch(zap->zap_objset, zap->zap_object,
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << ZAP_BLOCK_SHIFT,
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks <<
+ ZAP_BLOCK_SHIFT);
+
+ for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
+ b++) {
+ dmu_buf_t *db;
+
+ db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) <<
+ ZAP_BLOCK_SHIFT);
+ dmu_buf_read(db);
+ zap_stats_ptrtbl(zap, db->db_data,
+ 1<<(ZAP_BLOCK_SHIFT-3), zs);
+ dmu_buf_rele(db);
+ }
+ }
+}
diff --git a/usr/src/uts/common/fs/zfs/zap_leaf.c b/usr/src/uts/common/fs/zfs/zap_leaf.c
new file mode 100644
index 0000000000..82b786d05a
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zap_leaf.c
@@ -0,0 +1,883 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * The 512-byte leaf is broken into 32 16-byte chunks.
+ * chunk number n means l_chunk[n], even though the header precedes it.
+ * the names are stored null-terminated.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zap.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+
+#define CHAIN_END 0xffff /* end of the chunk chain */
+
+/* somewhat arbitrary, could go up to around 100k ... */
+#define MAX_ARRAY_BYTES (8<<10)
+
+#define NCHUNKS(bytes) (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES)
+
+/*
+ * XXX This will >> by a negative number when
+ * lh_prefix_len > 64-ZAP_LEAF_HASH_SHIFT.
+ */
+#define LEAF_HASH(l, h) \
+ ((ZAP_LEAF_HASH_NUMENTRIES-1) & \
+ ((h) >> (64 - ZAP_LEAF_HASH_SHIFT-(l)->lh_prefix_len)))
+
+#define LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
+
+/* #define MEMCHECK */
+
+
+/*
+ * Trivial memset replacement (byte-at-a-time fill of n bytes).
+ */
+static void
+zap_memset(void *a, int c, size_t n)
+{
+ char *cp = a;
+ char *cpend = cp + n;
+
+ while (cp < cpend)
+ *cp++ = c;
+}
+
+/*
+ * Store "value" at "addr" as a native integer of "len" (1/2/4/8)
+ * bytes.  Any other length asserts.
+ */
+static void
+stv(int len, void *addr, uint64_t value)
+{
+ switch (len) {
+ case 1:
+ *(uint8_t *)addr = value;
+ return;
+ case 2:
+ *(uint16_t *)addr = value;
+ return;
+ case 4:
+ *(uint32_t *)addr = value;
+ return;
+ case 8:
+ *(uint64_t *)addr = value;
+ return;
+ }
+ ASSERT(!"bad int len");
+}
+
+/*
+ * Load a native integer of "len" (1/2/4/8) bytes from "addr",
+ * zero-extended to 64 bits.  Any other length asserts and returns a
+ * poison value.
+ */
+static uint64_t
+ldv(int len, const void *addr)
+{
+ switch (len) {
+ case 1:
+ return (*(uint8_t *)addr);
+ case 2:
+ return (*(uint16_t *)addr);
+ case 4:
+ return (*(uint32_t *)addr);
+ case 8:
+ return (*(uint64_t *)addr);
+ }
+ ASSERT(!"bad int len");
+ return (0xFEEDFACEDEADBEEF);
+}
+
+/*
+ * Byteswap an entire on-disk leaf block in place: header, hash table,
+ * then each chunk according to its type (entry / free / array).
+ */
+void
+zap_leaf_byteswap(zap_leaf_phys_t *buf)
+{
+ int i;
+
+ buf->l_hdr.lhr_block_type = BSWAP_64(buf->l_hdr.lhr_block_type);
+ buf->l_hdr.lhr_next = BSWAP_64(buf->l_hdr.lhr_next);
+ buf->l_hdr.lhr_prefix = BSWAP_64(buf->l_hdr.lhr_prefix);
+ buf->l_hdr.lhr_magic = BSWAP_32(buf->l_hdr.lhr_magic);
+ buf->l_hdr.lhr_nfree = BSWAP_16(buf->l_hdr.lhr_nfree);
+ buf->l_hdr.lhr_nentries = BSWAP_16(buf->l_hdr.lhr_nentries);
+ buf->l_hdr.lhr_prefix_len = BSWAP_16(buf->l_hdr.lhr_prefix_len);
+ buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist);
+
+ for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES; i++)
+ buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
+
+ for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) {
+ struct zap_leaf_entry *le;
+
+ /* lf_type is a single byte, readable before any swap */
+ switch (buf->l_chunk[i].l_free.lf_type) {
+ case ZAP_LEAF_ENTRY:
+ le = &buf->l_chunk[i].l_entry;
+
+ le->le_type = BSWAP_8(le->le_type);
+ le->le_int_size = BSWAP_8(le->le_int_size);
+ le->le_next = BSWAP_16(le->le_next);
+ le->le_name_chunk = BSWAP_16(le->le_name_chunk);
+ le->le_name_length = BSWAP_16(le->le_name_length);
+ le->le_value_chunk = BSWAP_16(le->le_value_chunk);
+ le->le_value_length = BSWAP_16(le->le_value_length);
+ le->le_cd = BSWAP_32(le->le_cd);
+ le->le_hash = BSWAP_64(le->le_hash);
+ break;
+ case ZAP_LEAF_FREE:
+ buf->l_chunk[i].l_free.lf_type =
+ BSWAP_8(buf->l_chunk[i].l_free.lf_type);
+ buf->l_chunk[i].l_free.lf_next =
+ BSWAP_16(buf->l_chunk[i].l_free.lf_next);
+ break;
+ case ZAP_LEAF_ARRAY:
+ /* zap_leaf_array */
+ buf->l_chunk[i].l_array.la_type =
+ BSWAP_8(buf->l_chunk[i].l_array.la_type);
+ buf->l_chunk[i].l_array.la_next =
+ BSWAP_16(buf->l_chunk[i].l_array.la_next);
+ /* la_array doesn't need swapping */
+ break;
+ default:
+ ASSERT(!"bad leaf type");
+ }
+ }
+}
+
+/*
+ * Initialize a freshly allocated leaf: empty header, all hash buckets
+ * set to CHAIN_END, and every chunk linked onto the freelist.
+ */
+void
+zap_leaf_init(zap_leaf_t *l)
+{
+ int i;
+
+ ASSERT3U(sizeof (zap_leaf_phys_t), ==, l->l_dbuf->db_size);
+ zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header));
+ zap_memset(&l->l_phys->l_hash, CHAIN_END, sizeof (l->l_phys->l_hash));
+ for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) {
+ l->l_phys->l_chunk[i].l_free.lf_type = ZAP_LEAF_FREE;
+ l->l_phys->l_chunk[i].l_free.lf_next = i+1;
+ }
+ l->l_phys->l_chunk[ZAP_LEAF_NUMCHUNKS-1].l_free.lf_next = CHAIN_END;
+ l->lh_block_type = ZBT_LEAF;
+ l->lh_magic = ZAP_LEAF_MAGIC;
+ l->lh_nfree = ZAP_LEAF_NUMCHUNKS;
+}
+
+/*
+ * Insert overflow leaf "nl" immediately after "l" in both the in-core
+ * (l_next) and on-disk (lh_next blkid) chains; "nl" inherits l's hash
+ * prefix.  Returns nl.
+ */
+zap_leaf_t *
+zap_leaf_chainmore(zap_leaf_t *l, zap_leaf_t *nl)
+{
+ nl->lh_prefix = l->lh_prefix;
+ nl->lh_prefix_len = l->lh_prefix_len;
+ nl->l_next = l->l_next;
+ l->l_next = nl;
+ nl->lh_next = l->lh_next;
+ l->lh_next = nl->l_blkid;
+ return (nl);
+}
+
+/*
+ * Routines which manipulate leaf chunks (l_chunk[]).
+ */
+
+/*
+ * Pop a chunk off the leaf's freelist and return its index.  Caller
+ * must have verified lh_nfree > 0 (asserted here).
+ */
+static uint16_t
+zap_leaf_chunk_alloc(zap_leaf_t *l)
+{
+ int chunk;
+
+ ASSERT(l->lh_nfree > 0);
+
+ chunk = l->l_phys->l_hdr.lh_freelist;
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
+ ASSERT3U(l->l_phys->l_chunk[chunk].l_free.lf_type, ==, ZAP_LEAF_FREE);
+
+ l->l_phys->l_hdr.lh_freelist = l->l_phys->l_chunk[chunk].l_free.lf_next;
+
+#ifdef MEMCHECK
+ /* poison freshly allocated chunks to catch stale-data use */
+ zap_memset(&l->l_phys->l_chunk[chunk], 0xa1,
+ sizeof (l->l_phys->l_chunk[chunk]));
+#endif
+
+ l->lh_nfree--;
+
+ return (chunk);
+}
+
+/*
+ * Return chunk "chunk" to the head of the leaf's freelist.  Double
+ * free is caught by the lf_type assertion.
+ */
+static void
+zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
+{
+ struct zap_leaf_free *zlf = &l->l_phys->l_chunk[chunk].l_free;
+ ASSERT3U(l->lh_nfree, <, ZAP_LEAF_NUMCHUNKS);
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
+ ASSERT(zlf->lf_type != ZAP_LEAF_FREE);
+
+#ifdef MEMCHECK
+ /* poison freed chunks to catch use-after-free */
+ zap_memset(&l->l_phys->l_chunk[chunk], 0xf4,
+ sizeof (l->l_phys->l_chunk[chunk]));
+#endif
+
+ zlf->lf_type = ZAP_LEAF_FREE;
+ zlf->lf_next = l->l_phys->l_hdr.lh_freelist;
+ bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
+ l->l_phys->l_hdr.lh_freelist = chunk;
+
+ l->lh_nfree++;
+}
+
+
+/*
+ * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
+ */
+
+/*
+ * Serialize num_integers values of integer_size bytes from "buf" into
+ * a chain of ZAP_LEAF_ARRAY chunks, big-endian byte order, and return
+ * the head chunk index.  Caller guarantees enough free chunks exist.
+ */
+static uint16_t
+zap_leaf_array_create(const zap_entry_handle_t *zeh, const char *buf,
+ int integer_size, int num_integers)
+{
+ uint16_t chunk_head;
+ uint16_t *chunkp = &chunk_head;
+ int byten = 0;
+ uint64_t value;
+ int shift = (integer_size-1)*8;
+ int len = num_integers;
+ zap_leaf_t *l = zeh->zeh_found_leaf;
+
+ ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES);
+
+ while (len > 0) {
+ uint16_t chunk = zap_leaf_chunk_alloc(l);
+ struct zap_leaf_array *la = &l->l_phys->l_chunk[chunk].l_array;
+ int i;
+
+ la->la_type = ZAP_LEAF_ARRAY;
+ for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
+ /* start of a new integer: load it once */
+ if (byten == 0)
+ value = ldv(integer_size, buf);
+ /* emit most-significant byte first */
+ la->la_array[i] = (value & (0xff << shift)) >> shift;
+ value <<= 8;
+ if (++byten == integer_size) {
+ byten = 0;
+ buf += integer_size;
+ if (--len == 0)
+ break;
+ }
+ }
+
+ /* link this chunk onto the chain */
+ *chunkp = chunk;
+ chunkp = &la->la_next;
+ }
+ *chunkp = CHAIN_END;
+
+ return (chunk_head);
+}
+
+/*
+ * Free an entire array-chunk chain and set *chunkp to CHAIN_END.
+ */
+static void
+zap_leaf_array_free(zap_entry_handle_t *zeh, uint16_t *chunkp)
+{
+ uint16_t chunk = *chunkp;
+ zap_leaf_t *l = zeh->zeh_found_leaf;
+
+ *chunkp = CHAIN_END;
+
+ while (chunk != CHAIN_END) {
+ int nextchunk = l->l_phys->l_chunk[chunk].l_array.la_next;
+ ASSERT3U(l->l_phys->l_chunk[chunk].l_array.la_type, ==,
+ ZAP_LEAF_ARRAY);
+ zap_leaf_chunk_free(l, chunk);
+ chunk = nextchunk;
+ }
+}
+
+/* array_len and buf_len are in integers, not bytes */
+/*
+ * Deserialize an array-chunk chain into "buf": reassemble big-endian
+ * integers of array_int_len bytes and store them as integers of
+ * buf_int_len bytes (widening only; asserted).  Copies at most
+ * min(array_len, buf_len) integers.
+ */
+static void
+zap_leaf_array_read(const zap_entry_handle_t *zeh, uint16_t chunk,
+ int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
+ char *buf)
+{
+ int len = MIN(array_len, buf_len);
+ int byten = 0;
+ uint64_t value = 0;
+ zap_leaf_t *l = zeh->zeh_found_leaf;
+
+ ASSERT3U(array_int_len, <=, buf_int_len);
+
+ while (len > 0) {
+ struct zap_leaf_array *la = &l->l_phys->l_chunk[chunk].l_array;
+ int i;
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
+ for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
+ value = (value << 8) | la->la_array[i];
+ byten++;
+ if (byten == array_int_len) {
+ stv(buf_int_len, buf, value);
+ byten = 0;
+ len--;
+ if (len == 0)
+ return;
+ buf += buf_int_len;
+ }
+ }
+ chunk = la->la_next;
+ }
+}
+
+/*
+ * Only to be used on 8-bit arrays.
+ * array_len is actual len in bytes (not encoded le_value_length).
+ * buf is null-terminated.
+ *
+ * Returns TRUE iff the chunk-chain contents equal the first array_len
+ * bytes of buf.
+ */
+static int
+zap_leaf_array_equal(const zap_entry_handle_t *zeh, int chunk,
+ int array_len, const char *buf)
+{
+ int bseen = 0;
+ zap_leaf_t *l = zeh->zeh_found_leaf;
+
+ while (bseen < array_len) {
+ struct zap_leaf_array *la = &l->l_phys->l_chunk[chunk].l_array;
+ int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
+ if (bcmp(la->la_array, buf + bseen, toread))
+ break;
+ chunk = la->la_next;
+ bseen += toread;
+ }
+ return (bseen == array_len);
+}
+
+/*
+ * Routines which manipulate leaf entries.
+ */
+
+/*
+ * Search leaf "l" (and its overflow chain) for the entry matching
+ * "name" at hash "h".  On success fills in *zeh and returns 0;
+ * otherwise ENOENT.
+ */
+int
+zap_leaf_lookup(zap_leaf_t *l,
+ const char *name, uint64_t h, zap_entry_handle_t *zeh)
+{
+ uint16_t *chunkp;
+ struct zap_leaf_entry *le;
+
+ zeh->zeh_head_leaf = l;
+
+again:
+ ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC);
+
+ /* walk this leaf's hash-bucket chain for h */
+ for (chunkp = LEAF_HASH_ENTPTR(l, h);
+ *chunkp != CHAIN_END; chunkp = &le->le_next) {
+ uint16_t chunk = *chunkp;
+ le = &l->l_phys->l_chunk[chunk].l_entry;
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
+ ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
+
+ if (le->le_hash != h)
+ continue;
+
+ /* zeh_found_leaf is needed by zap_leaf_array_equal */
+ zeh->zeh_found_leaf = l;
+ if (zap_leaf_array_equal(zeh, le->le_name_chunk,
+ le->le_name_length, name)) {
+ zeh->zeh_num_integers = le->le_value_length;
+ zeh->zeh_integer_size = le->le_int_size;
+ zeh->zeh_cd = le->le_cd;
+ zeh->zeh_hash = le->le_hash;
+ zeh->zeh_chunkp = chunkp;
+ zeh->zeh_found_leaf = l;
+ return (0);
+ }
+ }
+
+ /* not here; try the next chained leaf */
+ if (l->l_next) {
+ l = l->l_next;
+ goto again;
+ }
+
+ return (ENOENT);
+}
+
+/* Return (h1,cd1 >= h2,cd2) -- lexicographic compare of (hash, cd) */
+static int
+hcd_gteq(uint64_t h1, uint32_t cd1, uint64_t h2, uint32_t cd2)
+{
+ if (h1 > h2)
+ return (TRUE);
+ if (h1 == h2 && cd1 >= cd2)
+ return (TRUE);
+ return (FALSE);
+}
+
+/*
+ * Find the entry with the smallest (hash, cd) >= (h, cd) in leaf "l"
+ * and its chain (used by cursor iteration).  On success fills in *zeh
+ * -- via zeh_fakechunk, since no chain pointer is tracked -- and
+ * returns 0; ENOENT if no such entry.
+ */
+int
+zap_leaf_lookup_closest(zap_leaf_t *l,
+ uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
+{
+ uint16_t chunk;
+ uint64_t besth = -1ULL;
+ uint32_t bestcd = ZAP_MAXCD;
+ uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES-1;
+ uint16_t lh;
+ struct zap_leaf_entry *le;
+
+ zeh->zeh_head_leaf = l;
+
+again:
+ ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC);
+
+ /* scan buckets from h's bucket up to the best match found so far */
+ for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
+ for (chunk = l->l_phys->l_hash[lh];
+ chunk != CHAIN_END; chunk = le->le_next) {
+ le = &l->l_phys->l_chunk[chunk].l_entry;
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
+ ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
+
+ /* candidate must be >= (h,cd) and <= current best */
+ if (hcd_gteq(le->le_hash, le->le_cd, h, cd) &&
+ hcd_gteq(besth, bestcd, le->le_hash, le->le_cd)) {
+ ASSERT3U(bestlh, >=, lh);
+ bestlh = lh;
+ besth = le->le_hash;
+ bestcd = le->le_cd;
+
+ zeh->zeh_num_integers = le->le_value_length;
+ zeh->zeh_integer_size = le->le_int_size;
+ zeh->zeh_cd = le->le_cd;
+ zeh->zeh_hash = le->le_hash;
+ zeh->zeh_fakechunk = chunk;
+ zeh->zeh_chunkp = &zeh->zeh_fakechunk;
+ zeh->zeh_found_leaf = l;
+ }
+ }
+ }
+
+ if (l->l_next) {
+ l = l->l_next;
+ goto again;
+ }
+
+ return (bestcd == ZAP_MAXCD ? ENOENT : 0);
+}
+
+/*
+ * Copy the entry's value into "buf" as num_integers integers of
+ * integer_size bytes.  EINVAL if the stored integers are wider than
+ * requested; EOVERFLOW (after a partial copy) if the caller's buffer
+ * holds fewer integers than are stored.
+ */
+int
+zap_entry_read(const zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, void *buf)
+{
+ struct zap_leaf_entry *le;
+
+ le = &zeh->zeh_found_leaf->l_phys->l_chunk[*zeh->zeh_chunkp].l_entry;
+ ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
+
+ if (le->le_int_size > integer_size)
+ return (EINVAL);
+
+ zap_leaf_array_read(zeh, le->le_value_chunk, le->le_int_size,
+ le->le_value_length, integer_size, num_integers, buf);
+
+ if (zeh->zeh_num_integers > num_integers)
+ return (EOVERFLOW);
+ return (0);
+
+}
+
+/*
+ * Copy the entry's (NUL-terminated) name into buf, at most buflen
+ * bytes.  EOVERFLOW if the name was truncated.
+ */
+int
+zap_entry_read_name(const zap_entry_handle_t *zeh, uint16_t buflen, char *buf)
+{
+ struct zap_leaf_entry *le;
+
+ le = &zeh->zeh_found_leaf->l_phys->l_chunk[*zeh->zeh_chunkp].l_entry;
+ ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
+
+ zap_leaf_array_read(zeh, le->le_name_chunk, 1,
+ le->le_name_length, 1, buflen, buf);
+ if (le->le_name_length > buflen)
+ return (EOVERFLOW);
+ return (0);
+}
+
+/*
+ * Replace the entry's value in place within its current leaf.  Returns
+ * EAGAIN if this leaf lacks the extra chunks the new value needs (the
+ * caller then expands the leaf and retries).
+ */
+int
+zap_entry_update(zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, const void *buf)
+{
+ int delta_chunks;
+ struct zap_leaf_entry *le;
+ le = &zeh->zeh_found_leaf->l_phys->l_chunk[*zeh->zeh_chunkp].l_entry;
+
+ delta_chunks = NCHUNKS(num_integers * integer_size) -
+ NCHUNKS(le->le_value_length * le->le_int_size);
+
+ if (zeh->zeh_found_leaf->lh_nfree < delta_chunks)
+ return (EAGAIN);
+
+ /*
+ * We should search other chained leaves (via
+ * zap_entry_remove,create?) otherwise returning EAGAIN will
+ * just send us into an infinite loop if we have to chain
+ * another leaf block, rather than being able to split this
+ * block.
+ */
+
+ zap_leaf_array_free(zeh, &le->le_value_chunk);
+ le->le_value_chunk =
+ zap_leaf_array_create(zeh, buf, integer_size, num_integers);
+ /*
+ * NOTE(review): MAX_ARRAY_BYTES+1 looks like an "oversized"
+ * sentinel, though zap_leaf_array_create asserts total size
+ * < MAX_ARRAY_BYTES -- confirm whether this branch is reachable.
+ */
+ le->le_value_length = (num_integers*integer_size > MAX_ARRAY_BYTES) ?
+ (MAX_ARRAY_BYTES + 1) : (num_integers);
+ le->le_int_size = integer_size;
+ return (0);
+}
+
+/*
+ * Remove the entry from its leaf: free its name and value chunk
+ * chains, unlink it from the hash bucket, and free the entry chunk.
+ * Must not be called on a handle from lookup_closest (fakechunk).
+ */
+void
+zap_entry_remove(zap_entry_handle_t *zeh)
+{
+ uint16_t entry_chunk;
+ struct zap_leaf_entry *le;
+ zap_leaf_t *l = zeh->zeh_found_leaf;
+
+ ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
+
+ entry_chunk = *zeh->zeh_chunkp;
+ le = &l->l_phys->l_chunk[entry_chunk].l_entry;
+ ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
+
+ zap_leaf_array_free(zeh, &le->le_name_chunk);
+ zap_leaf_array_free(zeh, &le->le_value_chunk);
+
+ /* unlink from the hash chain, then release the entry chunk itself */
+ *zeh->zeh_chunkp = le->le_next;
+ zap_leaf_chunk_free(l, entry_chunk);
+
+ l->lh_nentries--;
+}
+
+/*
+ * Create a new entry (name -> value array) in leaf chain 'l'.  If
+ * cd == ZAP_MAXCD, pick the lowest collision differentiator for
+ * hash 'h' not used anywhere in the chain; otherwise use the given
+ * cd as-is.  Returns ENAMETOOLONG/E2BIG for oversize entries,
+ * ENOSPC when all cd values for this hash are taken, EAGAIN when no
+ * leaf in the chain has room (caller must split or chain another
+ * leaf), and fills in *zeh on success.
+ */
+int
+zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
+ uint8_t integer_size, uint64_t num_integers, const void *buf,
+ zap_entry_handle_t *zeh)
+{
+ uint16_t chunk;
+ uint16_t *chunkp;
+ struct zap_leaf_entry *le;
+ uint64_t namelen, valuelen;
+ int numchunks;
+
+ valuelen = integer_size * num_integers;
+ namelen = strlen(name) + 1;
+ ASSERT(namelen >= 2);
+
+ zeh->zeh_head_leaf = l;
+
+ if (namelen > MAXNAMELEN)
+ return (ENAMETOOLONG);
+ /* find the first leaf in the chain that has sufficient free space */
+ numchunks = 1 + NCHUNKS(namelen) + NCHUNKS(valuelen);
+ if (numchunks > ZAP_LEAF_NUMCHUNKS)
+ return (E2BIG);
+
+ if (cd == ZAP_MAXCD) {
+ /* search every leaf in the chain for each candidate cd */
+ for (cd = 0; cd < ZAP_MAXCD; cd++) {
+ zap_leaf_t *ll;
+ for (ll = l; ll; ll = ll->l_next) {
+ for (chunk = *LEAF_HASH_ENTPTR(ll, h);
+ chunk != CHAIN_END; chunk = le->le_next) {
+ le = &ll->l_phys->l_chunk
+ [chunk].l_entry;
+ if (le->le_hash == h &&
+ le->le_cd == cd) {
+ break;
+ }
+ }
+ /*
+ * if this cd is in use, no need to
+ * check more chained leafs
+ */
+ if (chunk != CHAIN_END)
+ break;
+ }
+ /* If this cd is not in use, we are good. */
+ if (chunk == CHAIN_END)
+ break;
+ }
+ /* If we tried all the cd's, we lose. */
+ if (cd == ZAP_MAXCD)
+ return (ENOSPC);
+ }
+
+ for (; l; l = l->l_next)
+ if (l->lh_nfree >= numchunks)
+ break;
+ if (l == NULL)
+ return (EAGAIN);
+
+ zeh->zeh_found_leaf = l;
+
+ /* make the entry */
+ chunk = zap_leaf_chunk_alloc(l);
+ le = &l->l_phys->l_chunk[chunk].l_entry;
+ le->le_type = ZAP_LEAF_ENTRY;
+ le->le_name_chunk = zap_leaf_array_create(zeh, name, 1, namelen);
+ le->le_name_length = namelen;
+ le->le_value_chunk =
+ zap_leaf_array_create(zeh, buf, integer_size, num_integers);
+ /* oversize values store the sentinel length, as in zap_entry_update */
+ le->le_value_length = (num_integers*integer_size > MAX_ARRAY_BYTES) ?
+ (MAX_ARRAY_BYTES + 1) : (num_integers);
+ le->le_int_size = integer_size;
+ le->le_hash = h;
+ le->le_cd = cd;
+
+ /* link it into the hash chain */
+ chunkp = LEAF_HASH_ENTPTR(l, h);
+ le->le_next = *chunkp;
+ *chunkp = chunk;
+
+ l->lh_nentries++;
+
+ zeh->zeh_num_integers = num_integers;
+ zeh->zeh_integer_size = le->le_int_size;
+ zeh->zeh_cd = le->le_cd;
+ zeh->zeh_hash = le->le_hash;
+ zeh->zeh_chunkp = chunkp;
+
+ return (0);
+}
+
+/*
+ * Routines for transferring entries between leafs.
+ */
+
+/*
+ * Link the entry stored in chunk 'entry' onto the head of the
+ * hash-bucket chain that its le_hash maps to within leaf 'l'.
+ */
+static void
+zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
+{
+ struct zap_leaf_entry *le;
+ uint16_t *bucketp;
+
+ le = &l->l_phys->l_chunk[entry].l_entry;
+ bucketp = LEAF_HASH_ENTPTR(l, le->le_hash);
+ le->le_next = *bucketp;
+ *bucketp = entry;
+}
+
+/*
+ * Rebuild leaf 'l's hash table from scratch by re-inserting every
+ * live entry chunk.  No-op if the leaf holds no entries.
+ */
+static void
+zap_leaf_rehash_entries(zap_leaf_t *l)
+{
+ int i;
+
+ if (l->lh_nentries == 0)
+ return;
+
+ /* break existing hash chains */
+ zap_memset(l->l_phys->l_hash, CHAIN_END, sizeof (l->l_phys->l_hash));
+
+ for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) {
+ struct zap_leaf_entry *le = &l->l_phys->l_chunk[i].l_entry;
+ if (le->le_type != ZAP_LEAF_ENTRY)
+ continue;
+ zap_leaf_rehash_entry(l, i);
+ }
+}
+
+/*
+ * Move a chunk array (name or value data) headed by 'chunk' from
+ * leaf 'l' into leaf 'nl', copying each chunk and freeing the
+ * original.  Returns the head chunk of the new array in 'nl'.
+ */
+static uint16_t
+zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
+{
+ uint16_t new_chunk;
+ uint16_t *nchunkp = &new_chunk;
+
+ while (chunk != CHAIN_END) {
+ uint16_t nchunk = zap_leaf_chunk_alloc(nl);
+ struct zap_leaf_array *nla =
+ &nl->l_phys->l_chunk[nchunk].l_array;
+ struct zap_leaf_array *la =
+ &l->l_phys->l_chunk[chunk].l_array;
+ int nextchunk = la->la_next;
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
+ ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS);
+
+ *nla = *la;
+
+ zap_leaf_chunk_free(l, chunk);
+ chunk = nextchunk;
+ /* patch the previous copy's la_next to point at this one */
+ *nchunkp = nchunk;
+ nchunkp = &nla->la_next;
+ }
+ *nchunkp = CHAIN_END;
+ return (new_chunk);
+}
+
+/*
+ * Move one entry from leaf 'l' into the destination leaf chain
+ * headed by 'nhl', chaining a fresh leaf onto the destination if
+ * none has enough free chunks.
+ */
+static void
+zap_leaf_transfer_entry(zap_t *zap, zap_leaf_t *l, int entry, zap_leaf_t *nhl,
+ dmu_tx_t *tx)
+{
+ zap_leaf_t *nl;
+ struct zap_leaf_entry *le, *nle;
+ uint16_t chunk, nchunks;
+
+ le = &l->l_phys->l_chunk[entry].l_entry;
+ ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
+
+ /* find a leaf in the destination leaf chain with enough free space */
+ nchunks = 1 + NCHUNKS(le->le_name_length) +
+ NCHUNKS(le->le_value_length * le->le_int_size);
+ for (nl = nhl; nl; nl = nl->l_next)
+ if (nl->lh_nfree >= nchunks)
+ break;
+ if (nl == NULL) {
+ nl = zap_leaf_chainmore(nhl, zap_create_leaf(zap, tx));
+ dprintf("transfer_entry: chaining leaf %x/%d\n",
+ nl->lh_prefix, nl->lh_prefix_len);
+ }
+
+ chunk = zap_leaf_chunk_alloc(nl);
+ nle = &nl->l_phys->l_chunk[chunk].l_entry;
+ *nle = *le;
+
+ zap_leaf_rehash_entry(nl, chunk);
+
+ /* move the name and value chunk arrays along with the entry */
+ nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
+ nle->le_value_chunk =
+ zap_leaf_transfer_array(l, le->le_value_chunk, nl);
+
+ zap_leaf_chunk_free(l, entry);
+
+ l->lh_nentries--;
+ nl->lh_nentries++;
+}
+
+/*
+ * Transfer entries whose hash bit 'bit' is 1 to nl1, and 0 to nl0.
+ * Ignore leaf chaining in source (l), but chain in destinations.
+ * We'll re-chain all the entries in l as we go along.
+ */
+/*
+ * Transfer entries whose hash bit 'bit' is 1 to nl1, and 0 to nl0.
+ * Ignore leaf chaining in source (l), but chain in destinations.
+ * We'll re-chain all the entries in l as we go along.
+ */
+static void
+zap_leaf_transfer_entries(zap_t *zap, zap_leaf_t *l,
+ zap_leaf_t *nl0, zap_leaf_t *nl1, int bit, dmu_tx_t *tx)
+{
+ int i;
+
+ ASSERT(bit < 64 && bit >= 0);
+ /* break existing hash chains */
+ zap_memset(l->l_phys->l_hash, CHAIN_END, sizeof (l->l_phys->l_hash));
+
+ /* destinations must have valid hash tables before we insert */
+ if (nl0 != l)
+ zap_leaf_rehash_entries(nl0);
+ if (nl1 != nl0)
+ zap_leaf_rehash_entries(nl1);
+
+ for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) {
+ struct zap_leaf_entry *le = &l->l_phys->l_chunk[i].l_entry;
+ if (le->le_type != ZAP_LEAF_ENTRY)
+ continue;
+
+ /*
+ * We could find entries via hashtable instead. That
+ * would be O(hashents+numents) rather than
+ * O(numblks+numents), but this accesses memory more
+ * sequentially, and when we're called, the block is
+ * usually pretty full.
+ */
+
+ if (le->le_hash & (1ULL << bit)) {
+ zap_leaf_transfer_entry(zap, l, i, nl1, tx);
+ } else {
+ if (nl0 == l)
+ zap_leaf_rehash_entry(l, i);
+ else
+ zap_leaf_transfer_entry(zap, l, i, nl0, tx);
+ }
+ }
+
+}
+
+/*
+ * nl will contain the entries whose hash prefix ends in 1
+ * handles leaf chaining
+ */
+/*
+ * nl will contain the entries whose hash prefix ends in 1
+ * handles leaf chaining
+ */
+zap_leaf_t *
+zap_leaf_split(zap_t *zap, zap_leaf_t *hl, dmu_tx_t *tx)
+{
+ zap_leaf_t *l = hl;
+ /* the hash bit newly covered by the one-longer prefix */
+ int bit = 64 - 1 - hl->lh_prefix_len;
+ zap_leaf_t *nl = zap_create_leaf(zap, tx);
+
+ /* set new prefix and prefix_len */
+ hl->lh_prefix <<= 1;
+ hl->lh_prefix_len++;
+ nl->lh_prefix = hl->lh_prefix | 1;
+ nl->lh_prefix_len = hl->lh_prefix_len;
+
+ /* transfer odd entries from first leaf in hl chain to nl */
+ zap_leaf_transfer_entries(zap, hl, hl, nl, bit, tx);
+
+ /* take rest of chain off hl */
+ l = hl->l_next;
+ hl->l_next = NULL;
+ hl->lh_next = 0;
+
+ /* transfer even entries from hl chain back to hl, odd entries to nl */
+ while (l) {
+ zap_leaf_t *next = l->l_next;
+ zap_leaf_transfer_entries(zap, l, hl, nl, bit, tx);
+ zap_destroy_leaf(zap, l, tx);
+ l = next;
+ }
+
+ return (nl);
+}
+
+/*
+ * Accumulate histogram statistics for leaf 'l' and its chain into
+ * *zs: pointer-table coverage, entries per block, block fullness,
+ * chunks per entry, and hash-bucket chain lengths.
+ */
+void
+zap_stats_leaf(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
+{
+ int n, nchained = 0;
+
+ n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - l->lh_prefix_len;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_leafs_with_2n_pointers[n]++;
+
+ do {
+ int i;
+
+ n = l->lh_nentries/5;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_blocks_with_n5_entries[n]++;
+
+ /* fullness in tenths, derived from the free chunk count */
+ n = ((1<<ZAP_BLOCK_SHIFT) -
+ l->lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
+ (1<<ZAP_BLOCK_SHIFT);
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_blocks_n_tenths_full[n]++;
+
+ for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES; i++) {
+ int nentries = 0;
+ int chunk = l->l_phys->l_hash[i];
+
+ while (chunk != CHAIN_END) {
+ struct zap_leaf_entry *le =
+ &l->l_phys->l_chunk[chunk].l_entry;
+
+ n = 1 + NCHUNKS(le->le_name_length) +
+ NCHUNKS(le->le_value_length *
+ le->le_int_size);
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_entries_using_n_chunks[n]++;
+
+ chunk = le->le_next;
+ nentries++;
+ }
+
+ n = nentries;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_buckets_with_n_entries[n]++;
+ }
+
+ nchained++;
+ l = l->l_next;
+ } while (l);
+
+ n = nchained-1;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_leafs_with_n_chained[n]++;
+}
diff --git a/usr/src/uts/common/fs/zfs/zap_micro.c b/usr/src/uts/common/fs/zfs/zap_micro.c
new file mode 100644
index 0000000000..998b67c50f
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zap_micro.c
@@ -0,0 +1,823 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zfs_context.h>
+#include <sys/zap.h>
+#include <sys/zap_impl.h>
+#include <sys/avl.h>
+
+
+static uint64_t mzap_write_cookie(zap_t *zap, uint64_t cookie,
+ uint64_t entptr);
+static void mzap_upgrade(zap_t *zap, dmu_tx_t *tx);
+
+
+/*
+ * Byteswap an entire microzap block in place: the two 64-bit header
+ * words, then the value and cd fields of every chunk.
+ */
+static void
+mzap_byteswap(mzap_phys_t *buf, size_t size)
+{
+ int i;
+ int nent = (size / MZAP_ENT_LEN) - 1;
+
+ buf->mz_block_type = BSWAP_64(buf->mz_block_type);
+ buf->mz_salt = BSWAP_64(buf->mz_salt);
+ for (i = 0; i < nent; i++) {
+ mzap_ent_phys_t *mze = &buf->mz_chunk[i];
+
+ mze->mze_value = BSWAP_64(mze->mze_value);
+ mze->mze_cd = BSWAP_32(mze->mze_cd);
+ }
+}
+
+/*
+ * Byteswap a zap block.  Microzap blocks are identified by their
+ * leading block-type word (checked in either byte order); anything
+ * else is handed to the fatzap code.
+ */
+void
+zap_byteswap(void *buf, size_t size)
+{
+ uint64_t block_type;
+
+ block_type = *(uint64_t *)buf;
+
+ switch (block_type) {
+ case ZBT_MICRO:
+ case BSWAP_64(ZBT_MICRO):
+ /* ASSERT(magic == ZAP_LEAF_MAGIC); */
+ mzap_byteswap(buf, size);
+ return;
+ default:
+ ASSERT(size == (1<<ZAP_BLOCK_SHIFT));
+ fzap_byteswap(buf, size);
+ return;
+ }
+}
+
+/*
+ * AVL comparator: order microzap entries by hash, breaking ties
+ * with the collision differentiator (cd).
+ */
+static int
+mze_compare(const void *arg1, const void *arg2)
+{
+ const mzap_ent_t *a = arg1;
+ const mzap_ent_t *b = arg2;
+
+ if (a->mze_hash != b->mze_hash)
+ return (a->mze_hash < b->mze_hash ? -1 : +1);
+ if (a->mze_phys.mze_cd != b->mze_phys.mze_cd)
+ return (a->mze_phys.mze_cd < b->mze_phys.mze_cd ? -1 : +1);
+ return (0);
+}
+
+/*
+ * Add an in-core AVL node mirroring on-disk microzap chunk
+ * 'chunkid'.  Caller must hold the zap rwlock as writer.
+ */
+static void
+mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep)
+{
+ mzap_ent_t *mze;
+
+ ASSERT(zap->zap_ismicro);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ ASSERT(mzep->mze_cd < ZAP_MAXCD);
+ ASSERT3U(zap_hash(zap, mzep->mze_name), ==, hash);
+
+ mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
+ mze->mze_chunkid = chunkid;
+ mze->mze_hash = hash;
+ mze->mze_phys = *mzep;
+ avl_add(&zap->zap_m.zap_avl, mze);
+}
+
+/*
+ * Look up 'name' in the in-core AVL tree: seek to the first node
+ * with the given hash, then scan all same-hash nodes for a string
+ * match.  Returns NULL if not found, including when the name could
+ * never fit in a microzap entry.
+ */
+static mzap_ent_t *
+mze_find(zap_t *zap, const char *name, uint64_t hash)
+{
+ mzap_ent_t mze_tofind;
+ mzap_ent_t *mze;
+ avl_index_t idx;
+ avl_tree_t *avl = &zap->zap_m.zap_avl;
+
+ ASSERT(zap->zap_ismicro);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ ASSERT3U(zap_hash(zap, name), ==, hash);
+
+ if (strlen(name) >= sizeof (mze_tofind.mze_phys.mze_name))
+ return (NULL);
+
+ /* cd 0 sorts first, so this lands at/before all same-hash nodes */
+ mze_tofind.mze_hash = hash;
+ mze_tofind.mze_phys.mze_cd = 0;
+
+ mze = avl_find(avl, &mze_tofind, &idx);
+ if (mze == NULL)
+ mze = avl_nearest(avl, idx, AVL_AFTER);
+ for (; mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
+ if (strcmp(name, mze->mze_phys.mze_name) == 0)
+ return (mze);
+ }
+ return (NULL);
+}
+
+/*
+ * Return the lowest collision differentiator not yet used by any
+ * entry with this hash.  Relies on same-hash AVL nodes being
+ * sorted by cd, so the first gap in the sequence is the answer.
+ */
+static uint32_t
+mze_find_unused_cd(zap_t *zap, uint64_t hash)
+{
+ mzap_ent_t mze_tofind;
+ mzap_ent_t *mze;
+ avl_index_t idx;
+ avl_tree_t *avl = &zap->zap_m.zap_avl;
+ uint32_t cd;
+
+ ASSERT(zap->zap_ismicro);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ mze_tofind.mze_hash = hash;
+ mze_tofind.mze_phys.mze_cd = 0;
+
+ cd = 0;
+ for (mze = avl_find(avl, &mze_tofind, &idx);
+ mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
+ if (mze->mze_phys.mze_cd != cd)
+ break;
+ cd++;
+ }
+
+ return (cd);
+}
+
+/*
+ * Unlink a microzap entry from the in-core AVL tree and free it.
+ * Caller must hold the zap rwlock as writer.
+ */
+static void
+mze_remove(zap_t *zap, mzap_ent_t *mze)
+{
+ avl_tree_t *avl = &zap->zap_m.zap_avl;
+
+ ASSERT(zap->zap_ismicro);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ avl_remove(avl, mze);
+ kmem_free(mze, sizeof (*mze));
+}
+
+/*
+ * Free every node in the in-core AVL tree and tear the tree down.
+ * (Fix: parenthesize the assignment used as the loop condition so
+ * the intent is explicit and -Wparentheses is satisfied.)
+ */
+static void
+mze_destroy(zap_t *zap)
+{
+ mzap_ent_t *mze;
+ void *avlcookie = NULL;
+
+ while ((mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie))
+ != NULL)
+ kmem_free(mze, sizeof (mzap_ent_t));
+ avl_destroy(&zap->zap_m.zap_avl);
+}
+
+/*
+ * Construct the in-core zap_t for object 'obj' from its first block.
+ * If another thread races us and attaches a zap_t to the dbuf first,
+ * return the winner's and discard ours.  For a microzap, mirror
+ * every populated on-disk chunk into the AVL tree.
+ */
+static zap_t *
+mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
+{
+ zap_t *winner;
+ zap_t *zap;
+ int i;
+
+ ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
+
+ zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
+ rw_init(&zap->zap_rwlock, 0, 0, 0);
+ rw_enter(&zap->zap_rwlock, RW_WRITER);
+ zap->zap_objset = os;
+ zap->zap_object = obj;
+ zap->zap_dbuf = db;
+
+ if (((uint64_t *)db->db_data)[0] != ZBT_MICRO) {
+ mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
+ } else {
+ zap->zap_ismicro = TRUE;
+ }
+
+ /*
+ * Make sure that zap_ismicro is set before we let others see
+ * it, because zap_lockdir() checks zap_ismicro without the lock
+ * held.
+ */
+ winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_pageout);
+
+ if (winner != NULL) {
+ kmem_free(zap, sizeof (zap_t));
+ return (winner);
+ }
+
+ if (zap->zap_ismicro) {
+ zap->zap_salt = zap->zap_m.zap_phys->mz_salt;
+ zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
+ avl_create(&zap->zap_m.zap_avl, mze_compare,
+ sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
+
+ for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
+ mzap_ent_phys_t *mze =
+ &zap->zap_m.zap_phys->mz_chunk[i];
+ /* a NUL first name byte marks an unused chunk */
+ if (mze->mze_name[0]) {
+ zap->zap_m.zap_num_entries++;
+ mze_insert(zap, i,
+ zap_hash(zap, mze->mze_name), mze);
+ }
+ }
+ } else {
+ zap->zap_salt = zap->zap_f.zap_phys->zap_salt;
+ }
+ rw_exit(&zap->zap_rwlock);
+ return (zap);
+}
+
+/*
+ * Find (creating, if needed) the in-core zap for object 'obj' and
+ * return it locked in *zapp.  'lti' is the lock type the caller
+ * wants; if 'fatreader' is set, a fat zap may be locked as reader
+ * even when a micro zap would need the writer lock.  When a write
+ * tx is supplied and the micro zap is full, grow its block or
+ * upgrade it to a fat zap.  Always returns 0 today; callers check
+ * the error anyway for future-proofing.
+ */
+int
+zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
+    krw_t lti, int fatreader, zap_t **zapp)
+{
+ zap_t *zap;
+ dmu_buf_t *db;
+ krw_t lt;
+ int err;
+
+ *zapp = NULL;
+
+ db = dmu_buf_hold(os, obj, 0);
+
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(db, &doi);
+ ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
+ }
+#endif
+
+ /*
+ * The zap can deal with EIO here, but its callers don't yet, so
+ * spare them by doing a mustsucceed read.
+ */
+ dmu_buf_read(db);
+
+ zap = dmu_buf_get_user(db);
+ if (zap == NULL)
+ zap = mzap_open(os, obj, db);
+
+ /*
+ * We're checking zap_ismicro without the lock held, in order to
+ * tell what type of lock we want.  Once we have some sort of
+ * lock, see if it really is the right type.  In practice this
+ * can only be different if it was upgraded from micro to fat,
+ * and micro wanted WRITER but fat only needs READER.
+ */
+ lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
+ rw_enter(&zap->zap_rwlock, lt);
+ if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
+ /* it was upgraded, now we only need reader */
+ ASSERT(lt == RW_WRITER);
+ /*
+ * Bug fix: the desired-lock expression must be fully
+ * parenthesized.  Previously '==' bound tighter than
+ * '?:', so the ASSERT evaluated
+ * (RW_READER == cond) ? RW_READER : lti
+ * and did not check the intended condition at all.
+ */
+ ASSERT(RW_READER ==
+ ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
+ rw_downgrade(&zap->zap_rwlock);
+ lt = RW_READER;
+ }
+
+ zap->zap_objset = os;
+
+ if (lt == RW_WRITER)
+ dmu_buf_will_dirty(db, tx);
+
+ ASSERT3P(zap->zap_dbuf, ==, db);
+
+ ASSERT(!zap->zap_ismicro ||
+ zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
+ if (zap->zap_ismicro && tx &&
+ zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
+ /* the microzap is full: grow the block, or go fat */
+ uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
+ if (newsz > MZAP_MAX_BLKSZ) {
+ dprintf("upgrading obj %llu: num_entries=%u\n",
+ obj, zap->zap_m.zap_num_entries);
+ mzap_upgrade(zap, tx);
+ *zapp = zap;
+ return (0);
+ }
+ err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
+ ASSERT3U(err, ==, 0);
+ zap->zap_m.zap_num_chunks =
+ db->db_size / MZAP_ENT_LEN - 1;
+ }
+
+ *zapp = zap;
+ return (0);
+}
+
+/*
+ * Drop the zap lock and release the directory dbuf that
+ * zap_lockdir() acquired.
+ */
+void
+zap_unlockdir(zap_t *zap)
+{
+ dmu_buf_t *db = zap->zap_dbuf;
+
+ rw_exit(&zap->zap_rwlock);
+ dmu_buf_rele(db);
+}
+
+/*
+ * Convert a full microzap into a fatzap: snapshot the current
+ * block, reformat the object as a fatzap, then re-add every entry
+ * preserving its original collision differentiator.
+ * (Fix: the loop previously declared an inner 'int err' shadowing
+ * the outer one; the shadow is removed.)
+ */
+static void
+mzap_upgrade(zap_t *zap, dmu_tx_t *tx)
+{
+ mzap_phys_t *mzp;
+ int i, sz, nchunks, err;
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ /* copy the microzap block aside before reformatting it */
+ sz = zap->zap_dbuf->db_size;
+ mzp = kmem_alloc(sz, KM_SLEEP);
+ bcopy(zap->zap_dbuf->db_data, mzp, sz);
+ nchunks = zap->zap_m.zap_num_chunks;
+
+ err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
+ 1ULL << ZAP_BLOCK_SHIFT, 0, tx);
+ ASSERT(err == 0);
+
+ dprintf("upgrading obj=%llu with %u chunks\n",
+ zap->zap_object, nchunks);
+ mze_destroy(zap);
+
+ fzap_upgrade(zap, tx);
+
+ for (i = 0; i < nchunks; i++) {
+ mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
+ if (mze->mze_name[0] == 0)
+ continue;
+ dprintf("adding %s=%llu\n",
+ mze->mze_name, mze->mze_value);
+ err = fzap_add_cd(zap,
+ mze->mze_name, 8, 1, &mze->mze_value,
+ mze->mze_cd, tx, NULL);
+ ASSERT3U(err, ==, 0);
+ }
+ kmem_free(mzp, sz);
+}
+
+/*
+ * Salted CRC64 hash of 'name'; only the high ZAP_HASHBITS bits are
+ * kept (the low bits are cleared below for the cursor's cd field).
+ */
+uint64_t
+zap_hash(zap_t *zap, const char *name)
+{
+ const uint8_t *cp;
+ uint8_t c;
+ uint64_t crc = zap->zap_salt;
+
+ ASSERT(crc != 0);
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+ for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
+
+ /*
+ * Only use 28 bits, since we need 4 bits in the cookie for the
+ * collision differentiator. We MUST use the high bits, since
+ * those are the ones that we first pay attention to when
+ * choosing the bucket.
+ */
+ crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
+
+ return (crc);
+}
+
+
+/*
+ * Format object 'obj' as an empty microzap: set the block type and
+ * a guaranteed-nonzero per-object hash salt.
+ */
+static void
+mzap_create_impl(objset_t *os, uint64_t obj, dmu_tx_t *tx)
+{
+ dmu_buf_t *db;
+ mzap_phys_t *zp;
+
+ db = dmu_buf_hold(os, obj, 0);
+
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(db, &doi);
+ ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
+ }
+#endif
+
+ dmu_buf_will_dirty(db, tx);
+ zp = db->db_data;
+ zp->mz_block_type = ZBT_MICRO;
+ /* the trailing |1 keeps the salt nonzero, as zap_hash asserts */
+ zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
+ ASSERT(zp->mz_salt != 0);
+ dmu_buf_rele(db);
+}
+
+/*
+ * Claim object number 'obj' and initialize it as an empty microzap.
+ * Returns the error from dmu_object_claim(), or 0 on success.
+ */
+int
+zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ int err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
+
+ if (err == 0)
+ mzap_create_impl(os, obj, tx);
+ return (err);
+}
+
+/*
+ * Allocate a fresh object, initialize it as an empty microzap, and
+ * return its object number.
+ */
+uint64_t
+zap_create(objset_t *os, dmu_object_type_t ot,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ uint64_t obj;
+
+ obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
+ mzap_create_impl(os, obj, tx);
+ return (obj);
+}
+
+/*
+ * Destroy a zap object, releasing both its on-disk and in-core
+ * state.  Returns the error from dmu_object_free().
+ */
+int
+zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
+{
+ /*
+ * dmu_object_free will free the object number and free the
+ * data. Freeing the data will cause our pageout function to be
+ * called, which will destroy our data (zap_leaf_t's and zap_t).
+ */
+
+ return (dmu_object_free(os, zapobj, tx));
+}
+
+_NOTE(ARGSUSED(0))
+/*
+ * dbuf user-eviction callback: tear down in-core zap state when the
+ * directory block is evicted or the object is destroyed.
+ */
+void
+zap_pageout(dmu_buf_t *db, void *vmzap)
+{
+ zap_t *zap = vmzap;
+
+ rw_destroy(&zap->zap_rwlock);
+
+ if (zap->zap_ismicro) {
+ mze_destroy(zap);
+ }
+
+ kmem_free(zap, sizeof (zap_t));
+}
+
+
+/*
+ * Store the number of entries in the zap object in *count.
+ */
+int
+zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
+{
+ zap_t *zap;
+ int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
+
+ if (err != 0)
+ return (err);
+ if (zap->zap_ismicro)
+ *count = zap->zap_m.zap_num_entries;
+ else
+ err = fzap_count(zap, count);
+ zap_unlockdir(zap);
+ return (err);
+}
+
+/*
+ * Routines for manipulating attributes.
+ */
+
+/*
+ * Read the value of attribute 'name' into 'buf'.  Microzap entries
+ * are always a single uint64_t, so other requested shapes yield
+ * EINVAL or EOVERFLOW there.  Returns ENOENT if the name is absent.
+ */
+int
+zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+ zap_t *zap;
+ int err;
+ mzap_ent_t *mze;
+
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
+ if (err)
+ return (err);
+ if (!zap->zap_ismicro) {
+ err = fzap_lookup(zap, name,
+ integer_size, num_integers, buf);
+ } else {
+ mze = mze_find(zap, name, zap_hash(zap, name));
+ if (mze == NULL) {
+ err = ENOENT;
+ } else {
+ if (num_integers < 1)
+ err = EOVERFLOW;
+ else if (integer_size != 8)
+ err = EINVAL;
+ else
+ *(uint64_t *)buf = mze->mze_phys.mze_value;
+ }
+ }
+ zap_unlockdir(zap);
+ return (err);
+}
+
+/*
+ * Report the value shape (integer size and count) of attribute
+ * 'name'; microzap values are always one 8-byte integer.  Either
+ * out-parameter may be NULL.  Returns ENOENT if absent.
+ */
+int
+zap_length(objset_t *os, uint64_t zapobj, const char *name,
+ uint64_t *integer_size, uint64_t *num_integers)
+{
+ zap_t *zap;
+ int err;
+ mzap_ent_t *mze;
+
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
+ if (err)
+ return (err);
+ if (!zap->zap_ismicro) {
+ err = fzap_length(zap, name, integer_size, num_integers);
+ } else {
+ mze = mze_find(zap, name, zap_hash(zap, name));
+ if (mze == NULL) {
+ err = ENOENT;
+ } else {
+ if (integer_size)
+ *integer_size = 8;
+ if (num_integers)
+ *num_integers = 1;
+ }
+ }
+ zap_unlockdir(zap);
+ return (err);
+}
+
+/*
+ * Insert a new name=value pair into the microzap, scanning for a
+ * free chunk starting at the allocation rotor and wrapping around
+ * once.  Caller guarantees the name is absent and a chunk is free.
+ */
+static void
+mzap_addent(zap_t *zap, const char *name, uint64_t hash, uint64_t value)
+{
+ int i;
+ int start = zap->zap_m.zap_alloc_next;
+ uint32_t cd;
+
+ dprintf("obj=%llu %s=%llu\n", zap->zap_object, name, value);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+#ifdef ZFS_DEBUG
+ /* debug-only: verify the caller's no-duplicate guarantee */
+ for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
+ mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
+ ASSERT(strcmp(name, mze->mze_name) != 0);
+ }
+#endif
+
+ cd = mze_find_unused_cd(zap, hash);
+ /* given the limited size of the microzap, this can't happen */
+ ASSERT(cd != ZAP_MAXCD);
+
+again:
+ for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
+ mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
+ if (mze->mze_name[0] == 0) {
+ mze->mze_value = value;
+ mze->mze_cd = cd;
+ (void) strcpy(mze->mze_name, name);
+ zap->zap_m.zap_num_entries++;
+ zap->zap_m.zap_alloc_next = i+1;
+ if (zap->zap_m.zap_alloc_next ==
+ zap->zap_m.zap_num_chunks)
+ zap->zap_m.zap_alloc_next = 0;
+ mze_insert(zap, i, hash, mze);
+ return;
+ }
+ }
+ if (start != 0) {
+ start = 0;
+ goto again;
+ }
+ ASSERT(!"out of entries!");
+}
+
+/*
+ * Add a new attribute; fails with EEXIST if already present.
+ * Values that don't fit microzap constraints (a single 8-byte
+ * integer, short name) force an upgrade to a fatzap first.
+ */
+int
+zap_add(objset_t *os, uint64_t zapobj, const char *name,
+ int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+ mzap_ent_t *mze;
+ const uint64_t *intval = val;
+ uint64_t hash;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap);
+ if (err)
+ return (err);
+ if (!zap->zap_ismicro) {
+ err = fzap_add(zap, name, integer_size, num_integers, val, tx);
+ } else if (integer_size != 8 || num_integers != 1 ||
+ strlen(name) >= MZAP_NAME_LEN) {
+ dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
+ zapobj, integer_size, num_integers, name);
+ mzap_upgrade(zap, tx);
+ err = fzap_add(zap, name, integer_size, num_integers, val, tx);
+ } else {
+ hash = zap_hash(zap, name);
+ mze = mze_find(zap, name, hash);
+ if (mze != NULL) {
+ err = EEXIST;
+ } else {
+ mzap_addent(zap, name, hash, *intval);
+ }
+ }
+ zap_unlockdir(zap);
+ return (err);
+}
+
+/*
+ * Set name=val in the zap object, creating the entry if it does not
+ * exist.  Only an 8-byte single-integer value with a short name
+ * fits in a microzap; any other shape forces an upgrade to a fatzap
+ * first.  Returns 0 on success or an error from the fatzap layer.
+ */
+int
+zap_update(objset_t *os, uint64_t zapobj, const char *name,
+    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ mzap_ent_t *mze;
+ const uint64_t *intval = val;
+ uint64_t hash;
+ int err;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap);
+ if (err)
+ return (err);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ if (!zap->zap_ismicro) {
+ err = fzap_update(zap, name,
+ integer_size, num_integers, val, tx);
+ } else if (integer_size != 8 || num_integers != 1 ||
+ strlen(name) >= MZAP_NAME_LEN) {
+ dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
+ zapobj, integer_size, num_integers, name);
+ mzap_upgrade(zap, tx);
+ err = fzap_update(zap, name,
+ integer_size, num_integers, val, tx);
+ } else {
+ hash = zap_hash(zap, name);
+ mze = mze_find(zap, name, hash);
+ if (mze != NULL) {
+ /* keep the in-core AVL copy and disk in sync */
+ mze->mze_phys.mze_value = *intval;
+ zap->zap_m.zap_phys->mz_chunk
+ [mze->mze_chunkid].mze_value = *intval;
+ } else {
+ mzap_addent(zap, name, hash, *intval);
+ }
+ }
+ zap_unlockdir(zap);
+ /*
+ * Bug fix: this function previously returned 0 unconditionally,
+ * silently discarding any error from fzap_update().
+ */
+ return (err);
+}
+
+/*
+ * Remove attribute 'name'; returns ENOENT if it is absent.  For a
+ * microzap both the on-disk chunk and the in-core AVL node are
+ * cleared.
+ */
+int
+zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+ mzap_ent_t *mze;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap);
+ if (err)
+ return (err);
+ if (!zap->zap_ismicro) {
+ err = fzap_remove(zap, name, tx);
+ } else {
+ mze = mze_find(zap, name, zap_hash(zap, name));
+ if (mze == NULL) {
+ dprintf("fail: %s\n", name);
+ err = ENOENT;
+ } else {
+ dprintf("success: %s\n", name);
+ zap->zap_m.zap_num_entries--;
+ bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
+ sizeof (mzap_ent_phys_t));
+ mze_remove(zap, mze);
+ }
+ }
+ zap_unlockdir(zap);
+ return (err);
+}
+
+
+/*
+ * Routines for iterating over the attributes.
+ */
+
+/*
+ * Initialize a cursor to iterate the given zap object from the
+ * beginning.
+ */
+void
+zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
+{
+ zc->zc_hash = 0;
+ zc->zc_cd = 0;
+ zc->zc_objset = os;
+ zc->zc_zapobj = zapobj;
+}
+
+/*
+ * We want to keep the high 32 bits of the cursor zero if we can, so
+ * that 32-bit programs can access this. So use a small hash value so
+ * we can fit 4 bits of cd into the 32-bit cursor.
+ *
+ * [ 32 zero bits | 4-bit collision differentiator | 28-bit hash value ]
+ */
+/*
+ * Restore a cursor from the value produced by
+ * zap_cursor_serialize().  -1 means EOF; a corrupt cd field is
+ * reset to 0.
+ */
+void
+zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
+ uint64_t serialized)
+{
+ zc->zc_objset = os;
+ zc->zc_zapobj = zapobj;
+ if (serialized == -1ULL) {
+ zc->zc_hash = -1ULL;
+ zc->zc_cd = 0;
+ } else {
+ zc->zc_hash = serialized << (64-ZAP_HASHBITS);
+ zc->zc_cd = serialized >> ZAP_HASHBITS;
+ if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */
+ zc->zc_cd = 0;
+ }
+}
+
+/*
+ * Pack the cursor position into one 64-bit value: the cd field
+ * above the shifted-down hash.  -1 marks end-of-iteration.
+ */
+uint64_t
+zap_cursor_serialize(zap_cursor_t *zc)
+{
+ uint64_t ser;
+
+ if (zc->zc_hash == -1ULL)
+ return (-1ULL);
+ ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0);
+ ASSERT(zc->zc_cd < ZAP_MAXCD);
+ ser = zc->zc_hash >> (64-ZAP_HASHBITS);
+ ser |= (uint64_t)zc->zc_cd << ZAP_HASHBITS;
+ return (ser);
+}
+
+/*
+ * Fetch the attribute at (or after) the current cursor position
+ * into *za and move the cursor onto that entry.  Returns ENOENT
+ * when iteration is exhausted.
+ */
+int
+zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
+{
+ zap_t *zap;
+ int err;
+ avl_index_t idx;
+ mzap_ent_t mze_tofind;
+ mzap_ent_t *mze;
+
+ if (zc->zc_hash == -1ULL)
+ return (ENOENT);
+
+ err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
+ RW_READER, TRUE, &zap);
+ if (err)
+ return (err);
+ if (!zap->zap_ismicro) {
+ err = fzap_cursor_retrieve(zap, zc, za);
+ } else {
+ err = ENOENT;
+
+ /* find the first AVL node at or after (hash, cd) */
+ mze_tofind.mze_hash = zc->zc_hash;
+ mze_tofind.mze_phys.mze_cd = zc->zc_cd;
+
+ mze = avl_find(&zap->zap_m.zap_avl, &mze_tofind, &idx);
+ ASSERT(mze == NULL || 0 == bcmp(&mze->mze_phys,
+ &zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
+ sizeof (mze->mze_phys)));
+ if (mze == NULL)
+ mze = avl_nearest(&zap->zap_m.zap_avl, idx, AVL_AFTER);
+
+ if (mze) {
+ za->za_integer_length = 8;
+ za->za_num_integers = 1;
+ za->za_first_integer = mze->mze_phys.mze_value;
+ (void) strcpy(za->za_name, mze->mze_phys.mze_name);
+ zc->zc_hash = mze->mze_hash;
+ zc->zc_cd = mze->mze_phys.mze_cd;
+ err = 0;
+ } else {
+ zc->zc_hash = -1ULL;
+ }
+ }
+ zap_unlockdir(zap);
+ return (err);
+}
+
+/*
+ * Advance the cursor to the next possible position: bump the cd,
+ * and when it wraps, step to the next hash value.  A hash wrap to
+ * zero marks end-of-iteration (-1).
+ */
+void
+zap_cursor_advance(zap_cursor_t *zc)
+{
+ if (zc->zc_hash == -1ULL)
+ return;
+ if (++zc->zc_cd < ZAP_MAXCD)
+ return;
+ zc->zc_cd = 0;
+ zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS);
+ if (zc->zc_hash == 0) /* EOF */
+ zc->zc_hash = -1ULL;
+}
+
+/*
+ * Fill *zs with statistics about the zap object: a microzap reports
+ * just its single block, a fatzap gathers the full histograms.
+ */
+int
+zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
+{
+ int err;
+ zap_t *zap;
+
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
+ if (err)
+ return (err);
+
+ bzero(zs, sizeof (zap_stats_t));
+
+ if (zap->zap_ismicro) {
+ zs->zs_blocksize = zap->zap_dbuf->db_size;
+ zs->zs_num_entries = zap->zap_m.zap_num_entries;
+ zs->zs_num_blocks = 1;
+ } else {
+ fzap_get_stats(zap, zs);
+ }
+ zap_unlockdir(zap);
+ return (0);
+}
diff --git a/usr/src/uts/common/fs/zfs/zfs.conf b/usr/src/uts/common/fs/zfs/zfs.conf
new file mode 100644
index 0000000000..09881909b8
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs.conf
@@ -0,0 +1,28 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+name="zfs" parent="pseudo";
diff --git a/usr/src/uts/common/fs/zfs/zfs_acl.c b/usr/src/uts/common/fs/zfs/zfs_acl.c
new file mode 100644
index 0000000000..960de720d1
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_acl.c
@@ -0,0 +1,1537 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/fs/zfs.h>
+#include <sys/mode.h>
+#include <sys/policy.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <util/qsort.h>
+#include "fs/fs_subr.h"
+#include <acl/acl_common.h>
+
+#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE
+#define DENY ACE_ACCESS_DENIED_ACE_TYPE
+
+#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP)
+#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \
+ ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE)
+#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+#define WRITE_MASK (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS| \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|ACE_WRITE_OWNER)
+
+#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \
+ ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE)
+
+#define SECURE_NO_INHERIT (ACE_WRITE_ACL|ACE_WRITE_OWNER)
+
+#define OGE_PAD 6 /* traditional owner/group/everyone ACES */
+
+static int zfs_ace_can_use(znode_t *zp, ace_t *);
+
+/*
+ * Allocate a zfs_acl_t with room for "slots" ACEs.  With slots == 0 the
+ * caller is expected to point z_acl at external storage (see
+ * zfs_acl_node_read_internal()); z_state then records that the ACE array
+ * is not separately allocated and must not be freed.
+ */
+static zfs_acl_t *
+zfs_acl_alloc(int slots)
+{
+	zfs_acl_t *aclp;
+
+	aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP);
+	if (slots != 0) {
+		aclp->z_acl = kmem_alloc(ZFS_ACL_SIZE(slots), KM_SLEEP);
+		aclp->z_acl_count = 0;
+		aclp->z_state = ACL_DATA_ALLOCED;
+	} else {
+		aclp->z_state = 0;
+	}
+	aclp->z_slots = slots;
+	return (aclp);
+}
+
+/*
+ * Free a zfs_acl_t.  The ACE array is released only if it was allocated
+ * by us (ACL_DATA_ALLOCED); otherwise it aliases znode phys storage.
+ */
+void
+zfs_acl_free(zfs_acl_t *aclp)
+{
+	if (aclp->z_state == ACL_DATA_ALLOCED) {
+		kmem_free(aclp->z_acl, ZFS_ACL_SIZE(aclp->z_slots));
+	}
+	kmem_free(aclp, sizeof (zfs_acl_t));
+}
+
+/*
+ * Collapse an NFSv4 access mask into unix rwx "other" bits
+ * (S_IROTH/S_IWOTH/S_IXOTH); callers shift as needed.
+ */
+static uint32_t
+zfs_v4_to_unix(uint32_t access_mask)
+{
+	uint32_t new_mask = 0;
+
+	if (access_mask & (ACE_READ_DATA | ACE_LIST_DIRECTORY))
+		new_mask |= S_IROTH;
+	if (access_mask & (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_ADD_FILE))
+		new_mask |= S_IWOTH;
+	if (access_mask & (ACE_EXECUTE|ACE_READ_NAMED_ATTRS))
+		new_mask |= S_IXOTH;
+
+	return (new_mask);
+}
+
+/*
+ * Convert unix access mask to v4 access mask
+ */
+static uint32_t
+zfs_unix_to_v4(uint32_t access_mask)
+{
+ uint32_t new_mask = 0;
+
+ if (access_mask & 01)
+ new_mask |= (ACE_EXECUTE);
+ if (access_mask & 02) {
+ new_mask |= (ACE_WRITE_DATA);
+ } if (access_mask & 04) {
+ new_mask |= ACE_READ_DATA;
+ }
+ return (new_mask);
+}
+
+/*
+ * Initialize a single ACE in place.  "entry_type" is stored in a_flags
+ * (ACE_OWNER, OWNING_GROUP, ACE_EVERYONE, or 0 for an explicit who).
+ */
+static void
+zfs_set_ace(ace_t *zacep, uint32_t access_mask, int access_type,
+    uid_t uid, int entry_type)
+{
+	zacep->a_access_mask = access_mask;
+	zacep->a_type = access_type;
+	zacep->a_who = uid;
+	zacep->a_flags = entry_type;
+}
+
+/*
+ * Derive the unix mode implied by an ACL.
+ *
+ * Walks the ACEs in order; the first ACE that mentions a given rwx bit
+ * for a given class wins (tracked in "seen"), with ALLOW setting the bit
+ * and DENY leaving it clear.  everyone@ entries apply to all three
+ * classes.  The file-type and setuid/setgid/sticky bits are preserved
+ * from the existing zp_mode.
+ */
+static uint64_t
+zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
+{
+	int	i;
+	int	entry_type;
+	mode_t	mode = (zp->z_phys->zp_mode &
+	    (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
+	mode_t	seen = 0;
+	ace_t 	*acep;
+
+	for (i = 0, acep = aclp->z_acl;
+	    i != aclp->z_acl_count; i++, acep++) {
+		/* 0xf040 masks the entry-type bits of a_flags */
+		entry_type = (acep->a_flags & 0xf040);
+		if (entry_type == ACE_OWNER) {
+			if ((acep->a_access_mask & ACE_READ_DATA) &&
+			    (!(seen & S_IRUSR))) {
+				seen |= S_IRUSR;
+				if (acep->a_type == ALLOW) {
+					mode |= S_IRUSR;
+				}
+			}
+			if ((acep->a_access_mask & ACE_WRITE_DATA) &&
+			    (!(seen & S_IWUSR))) {
+				seen |= S_IWUSR;
+				if (acep->a_type == ALLOW) {
+					mode |= S_IWUSR;
+				}
+			}
+			if ((acep->a_access_mask & ACE_EXECUTE) &&
+			    (!(seen & S_IXUSR))) {
+				seen |= S_IXUSR;
+				if (acep->a_type == ALLOW) {
+					mode |= S_IXUSR;
+				}
+			}
+		} else if (entry_type == OWNING_GROUP) {
+			if ((acep->a_access_mask & ACE_READ_DATA) &&
+			    (!(seen & S_IRGRP))) {
+				seen |= S_IRGRP;
+				if (acep->a_type == ALLOW) {
+					mode |= S_IRGRP;
+				}
+			}
+			if ((acep->a_access_mask & ACE_WRITE_DATA) &&
+			    (!(seen & S_IWGRP))) {
+				seen |= S_IWGRP;
+				if (acep->a_type == ALLOW) {
+					mode |= S_IWGRP;
+				}
+			}
+			if ((acep->a_access_mask & ACE_EXECUTE) &&
+			    (!(seen & S_IXGRP))) {
+				seen |= S_IXGRP;
+				if (acep->a_type == ALLOW) {
+					mode |= S_IXGRP;
+				}
+			}
+		} else if (entry_type == ACE_EVERYONE) {
+			/* everyone@ settles any still-unseen bit in all classes */
+			if ((acep->a_access_mask & ACE_READ_DATA)) {
+				if (!(seen & S_IRUSR)) {
+					seen |= S_IRUSR;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IRUSR;
+					}
+				}
+				if (!(seen & S_IRGRP)) {
+					seen |= S_IRGRP;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IRGRP;
+					}
+				}
+				if (!(seen & S_IROTH)) {
+					seen |= S_IROTH;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IROTH;
+					}
+				}
+			}
+			if ((acep->a_access_mask & ACE_WRITE_DATA)) {
+				if (!(seen & S_IWUSR)) {
+					seen |= S_IWUSR;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IWUSR;
+					}
+				}
+				if (!(seen & S_IWGRP)) {
+					seen |= S_IWGRP;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IWGRP;
+					}
+				}
+				if (!(seen & S_IWOTH)) {
+					seen |= S_IWOTH;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IWOTH;
+					}
+				}
+			}
+			if ((acep->a_access_mask & ACE_EXECUTE)) {
+				if (!(seen & S_IXUSR)) {
+					seen |= S_IXUSR;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IXUSR;
+					}
+				}
+				if (!(seen & S_IXGRP)) {
+					seen |= S_IXGRP;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IXGRP;
+					}
+				}
+				if (!(seen & S_IXOTH)) {
+					seen |= S_IXOTH;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IXOTH;
+					}
+				}
+			}
+		}
+	}
+	return (mode);
+}
+
+/*
+ * Wrap the ACL embedded in the znode phys.  The returned aclp aliases
+ * zp_acl.z_ace_data (z_state == 0), so zfs_acl_free() will not free it.
+ */
+static zfs_acl_t *
+zfs_acl_node_read_internal(znode_t *zp)
+{
+	zfs_acl_t	*aclp;
+
+	aclp = zfs_acl_alloc(0);
+	aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count;
+	aclp->z_acl = &zp->z_phys->zp_acl.z_ace_data[0];
+
+	return (aclp);
+}
+
+/*
+ * Read an external acl object.
+ */
+zfs_acl_t *
+zfs_acl_node_read(znode_t *zp)
+{
+ uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj;
+ zfs_acl_t *aclp;
+
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+ if (zp->z_phys->zp_acl.z_acl_extern_obj == 0)
+ return (zfs_acl_node_read_internal(zp));
+
+ aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_count);
+
+ dmu_read(zp->z_zfsvfs->z_os, extacl, 0,
+ ZFS_ACL_SIZE(zp->z_phys->zp_acl.z_acl_count), aclp->z_acl);
+
+ aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count;
+
+ return (aclp);
+}
+
+/*
+ * Validate a caller-supplied ACE array.
+ *
+ * Checks the count, the type of each entry (normalizing a_who to -1 for
+ * owner@/group@/everyone@ entries), and inheritance-flag sanity (only
+ * directories may carry inherit flags; INHERIT_ONLY/NO_PROPAGATE require
+ * a file or directory inherit flag).  *inherit is set to 1 if any ACE is
+ * inheritable.  Returns B_TRUE if the ACL is acceptable.
+ */
+static boolean_t
+zfs_acl_valid(znode_t *zp, ace_t *uace, int aclcnt, int *inherit)
+{
+	ace_t 	*acep;
+	int i;
+
+	*inherit = 0;
+
+	if (aclcnt > MAX_ACL_ENTRIES || aclcnt <= 0) {
+		return (B_FALSE);
+	}
+
+	for (i = 0, acep = uace; i != aclcnt; i++, acep++) {
+
+		/*
+		 * first check type of entry
+		 */
+
+		switch (acep->a_flags & 0xf040) {
+		case ACE_OWNER:
+			acep->a_who = -1;
+			break;
+		case (ACE_IDENTIFIER_GROUP | ACE_GROUP):
+		case ACE_IDENTIFIER_GROUP:
+			if (acep->a_flags & ACE_GROUP) {
+				acep->a_who = -1;
+			}
+			break;
+		case ACE_EVERYONE:
+			acep->a_who = -1;
+			break;
+		}
+
+		/*
+		 * next check inheritance level flags
+		 */
+
+		if (acep->a_type != ALLOW && acep->a_type != DENY)
+			return (B_FALSE);
+
+		/*
+		 * Only directories should have inheritance flags.
+		 */
+		if (ZTOV(zp)->v_type != VDIR && (acep->a_flags &
+		    (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE|
+		    ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE))) {
+			return (B_FALSE);
+		}
+
+		if (acep->a_flags &
+		    (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))
+			*inherit = 1;
+
+		if (acep->a_flags &
+		    (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) {
+			if ((acep->a_flags & (ACE_FILE_INHERIT_ACE|
+			    ACE_DIRECTORY_INHERIT_ACE)) == 0) {
+				return (B_FALSE);
+			}
+		}
+	}
+
+	return (B_TRUE);
+}
+/*
+ * common code for setting acl's.
+ *
+ * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl.
+ * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's
+ * already checked the acl and knows whether to inherit.
+ */
+int
+zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, dmu_tx_t *tx, int *ihp)
+{
+ int inherit = 0;
+ int error;
+ znode_phys_t *zphys = zp->z_phys;
+ zfs_znode_acl_t *zacl = &zphys->zp_acl;
+ uint32_t acl_phys_size = ZFS_ACL_SIZE(aclp->z_acl_count);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t aoid = zphys->zp_acl.z_acl_extern_obj;
+
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+ if (ihp)
+ inherit = *ihp; /* already determined by caller */
+ else if (!zfs_acl_valid(zp, aclp->z_acl,
+ aclp->z_acl_count, &inherit)) {
+ return (EINVAL);
+ }
+
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+
+ /*
+ * Will ACL fit internally?
+ */
+ if (aclp->z_acl_count > ACE_SLOT_CNT) {
+ if (aoid == 0) {
+ aoid = dmu_object_alloc(zfsvfs->z_os,
+ DMU_OT_ACL, acl_phys_size, DMU_OT_NONE, 0, tx);
+ } else {
+ (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid,
+ acl_phys_size, 0, tx);
+ }
+ zphys->zp_acl.z_acl_extern_obj = aoid;
+ zphys->zp_acl.z_acl_count = aclp->z_acl_count;
+ dmu_write(zfsvfs->z_os, aoid, 0,
+ acl_phys_size, aclp->z_acl, tx);
+ } else {
+ /*
+ * Migrating back embedded?
+ */
+ if (zphys->zp_acl.z_acl_extern_obj) {
+ error = dmu_object_free(zfsvfs->z_os,
+ zp->z_phys->zp_acl.z_acl_extern_obj, tx);
+ if (error)
+ return (error);
+ zphys->zp_acl.z_acl_extern_obj = 0;
+ }
+ bcopy(aclp->z_acl, zacl->z_ace_data,
+ aclp->z_acl_count * sizeof (ace_t));
+ zacl->z_acl_count = aclp->z_acl_count;
+ }
+ if (inherit)
+ zp->z_phys->zp_flags |= ZFS_INHERIT_ACE;
+ else
+ zp->z_phys->zp_flags &= ~ZFS_INHERIT_ACE;
+
+ zphys->zp_mode = zfs_mode_compute(zp, aclp);
+ zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+
+ return (0);
+}
+
+/*
+ * Create space for slots_needed ACEs to be append
+ * to aclp.
+ */
+static void
+zfs_acl_append(zfs_acl_t *aclp, int slots_needed)
+{
+ ace_t *newacep;
+ ace_t *oldaclp;
+ int slot_cnt;
+ int slots_left = aclp->z_slots - aclp->z_acl_count;
+
+ if (aclp->z_state == ACL_DATA_ALLOCED)
+ ASSERT(aclp->z_slots >= aclp->z_acl_count);
+ if (slots_left < slots_needed || aclp->z_state != ACL_DATA_ALLOCED) {
+ slot_cnt = aclp->z_slots + 1 + (slots_needed - slots_left);
+ newacep = kmem_alloc(ZFS_ACL_SIZE(slot_cnt), KM_SLEEP);
+ bcopy(aclp->z_acl, newacep,
+ ZFS_ACL_SIZE(aclp->z_acl_count));
+ oldaclp = aclp->z_acl;
+ if (aclp->z_state == ACL_DATA_ALLOCED)
+ kmem_free(oldaclp, ZFS_ACL_SIZE(aclp->z_slots));
+ aclp->z_acl = newacep;
+ aclp->z_slots = slot_cnt;
+ aclp->z_state = ACL_DATA_ALLOCED;
+ }
+}
+
+/*
+ * Remove "slot" ACE from aclp
+ */
+static void
+zfs_ace_remove(zfs_acl_t *aclp, int slot)
+{
+ if (aclp->z_acl_count > 1) {
+ (void) memmove(&aclp->z_acl[slot],
+ &aclp->z_acl[slot +1], sizeof (ace_t) *
+ (--aclp->z_acl_count - slot));
+ } else
+ aclp->z_acl_count--;
+}
+
+/*
+ * Update access mask for prepended ACE
+ *
+ * This applies the "groupmask" value for aclmode property.
+ */
+static void
+zfs_acl_prepend_fixup(ace_t *acep, ace_t *origacep, mode_t mode, uid_t owner)
+{
+
+ int rmask, wmask, xmask;
+ int user_ace;
+
+ user_ace = (!(acep->a_flags &
+ (ACE_OWNER|ACE_GROUP|ACE_IDENTIFIER_GROUP)));
+
+ if (user_ace && (acep->a_who == owner)) {
+ rmask = S_IRUSR;
+ wmask = S_IWUSR;
+ xmask = S_IXUSR;
+ } else {
+ rmask = S_IRGRP;
+ wmask = S_IWGRP;
+ xmask = S_IXGRP;
+ }
+
+ if (origacep->a_access_mask & ACE_READ_DATA) {
+ if (mode & rmask)
+ acep->a_access_mask &= ~ACE_READ_DATA;
+ else
+ acep->a_access_mask |= ACE_READ_DATA;
+ }
+
+ if (origacep->a_access_mask & ACE_WRITE_DATA) {
+ if (mode & wmask)
+ acep->a_access_mask &= ~ACE_WRITE_DATA;
+ else
+ acep->a_access_mask |= ACE_WRITE_DATA;
+ }
+
+ if (origacep->a_access_mask & ACE_APPEND_DATA) {
+ if (mode & wmask)
+ acep->a_access_mask &= ~ACE_APPEND_DATA;
+ else
+ acep->a_access_mask |= ACE_APPEND_DATA;
+ }
+
+ if (origacep->a_access_mask & ACE_EXECUTE) {
+ if (mode & xmask)
+ acep->a_access_mask &= ~ACE_EXECUTE;
+ else
+ acep->a_access_mask |= ACE_EXECUTE;
+ }
+}
+
+/*
+ * Apply mode to canonical six ACEs.
+ */
+static void
+zfs_acl_fixup_canonical_six(zfs_acl_t *aclp, mode_t mode)
+{
+ int cnt;
+ ace_t *acep;
+
+ cnt = aclp->z_acl_count -1;
+ acep = aclp->z_acl;
+
+ /*
+ * Fixup final ACEs to match the mode
+ */
+
+ ASSERT(cnt >= 5);
+ adjust_ace_pair(&acep[cnt - 1], mode); /* everyone@ */
+ adjust_ace_pair(&acep[cnt - 3], (mode & 0070) >> 3); /* group@ */
+ adjust_ace_pair(&acep[cnt - 5], (mode & 0700) >> 6); /* owner@ */
+}
+
+
+/*
+ * Does an ACE exactly match the given allow/deny type, entry type
+ * (the 0xf040 bits of a_flags), and access mask?
+ */
+static int
+zfs_acl_ace_match(ace_t *acep, int allow_deny, int type, int mask)
+{
+	return (acep->a_access_mask == mask && acep->a_type == allow_deny &&
+	    ((acep->a_flags & 0xf040) == type));
+}
+
+/*
+ * Can prepended ACE be reused?
+ */
+static int
+zfs_reuse_deny(ace_t *acep, int i)
+{
+ int okay_masks;
+
+ if (i < 1)
+ return (B_FALSE);
+
+ if (acep[i-1].a_type != DENY)
+ return (B_FALSE);
+
+ if (acep[i-1].a_flags != (acep[i].a_flags & ACE_IDENTIFIER_GROUP))
+ return (B_FALSE);
+
+ okay_masks = (acep[i].a_access_mask & OKAY_MASK_BITS);
+
+ if (acep[i-1].a_access_mask & ~okay_masks)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+/*
+ * Create space to prepend an ACE
+ */
+static void
+zfs_acl_prepend(zfs_acl_t *aclp, int i)
+{
+ ace_t *oldaclp = NULL;
+ ace_t *to, *from;
+ int slots_left = aclp->z_slots - aclp->z_acl_count;
+ int oldslots;
+ int need_free = 0;
+
+ if (aclp->z_state == ACL_DATA_ALLOCED)
+ ASSERT(aclp->z_slots >= aclp->z_acl_count);
+
+ if (slots_left == 0 || aclp->z_state != ACL_DATA_ALLOCED) {
+
+ to = kmem_alloc(ZFS_ACL_SIZE(aclp->z_acl_count +
+ OGE_PAD), KM_SLEEP);
+ if (aclp->z_state == ACL_DATA_ALLOCED)
+ need_free++;
+ from = aclp->z_acl;
+ oldaclp = aclp->z_acl;
+ (void) memmove(to, from,
+ sizeof (ace_t) * aclp->z_acl_count);
+ aclp->z_state = ACL_DATA_ALLOCED;
+ } else {
+ from = aclp->z_acl;
+ to = aclp->z_acl;
+ }
+
+
+ (void) memmove(&to[i + 1], &from[i],
+ sizeof (ace_t) * (aclp->z_acl_count - i));
+
+ if (oldaclp) {
+ aclp->z_acl = to;
+ oldslots = aclp->z_slots;
+ aclp->z_slots = aclp->z_acl_count + OGE_PAD;
+ if (need_free)
+ kmem_free(oldaclp, ZFS_ACL_SIZE(oldslots));
+ }
+
+}
+
+/*
+ * Prepend deny ACE
+ */
+static void
+zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, int i,
+ mode_t mode)
+{
+ ace_t *acep;
+
+ zfs_acl_prepend(aclp, i);
+
+ acep = aclp->z_acl;
+ zfs_set_ace(&acep[i], 0, DENY, acep[i + 1].a_who,
+ (acep[i + 1].a_flags & 0xf040));
+ zfs_acl_prepend_fixup(&acep[i], &acep[i+1], mode, zp->z_phys->zp_uid);
+ aclp->z_acl_count++;
+}
+
+/*
+ * Split an inherited ACE into inherit_only ACE
+ * and original ACE with inheritance flags stripped off.
+ */
+static void
+zfs_acl_split_ace(zfs_acl_t *aclp, int i)
+{
+ ace_t *acep = aclp->z_acl;
+
+ zfs_acl_prepend(aclp, i);
+ acep = aclp->z_acl;
+ acep[i] = acep[i + 1];
+ acep[i].a_flags |= ACE_INHERIT_ONLY_ACE;
+ acep[i + 1].a_flags &= ~ALL_INHERIT;
+ aclp->z_acl_count++;
+}
+
+/*
+ * Are ACES started at index i, the canonical six ACES?
+ */
+static int
+zfs_have_canonical_six(zfs_acl_t *aclp, int i)
+{
+ ace_t *acep = aclp->z_acl;
+
+ if ((zfs_acl_ace_match(&acep[i],
+ DENY, ACE_OWNER, 0) &&
+ zfs_acl_ace_match(&acep[i + 1], ALLOW, ACE_OWNER,
+ OWNER_ALLOW_MASK) && zfs_acl_ace_match(&acep[i + 2],
+ DENY, OWNING_GROUP, 0) && zfs_acl_ace_match(&acep[i + 3],
+ ALLOW, OWNING_GROUP, 0) && zfs_acl_ace_match(&acep[i + 4],
+ DENY, ACE_EVERYONE, EVERYONE_DENY_MASK) &&
+ zfs_acl_ace_match(&acep[i + 5], ALLOW, ACE_EVERYONE,
+ EVERYONE_ALLOW_MASK))) {
+ return (1);
+ } else {
+ return (0);
+ }
+}
+
+/*
+ * Apply step 1g, to group entries
+ *
+ * Need to deal with corner case where group may have
+ * greater permissions than owner. If so then limit
+ * group permissions, based on what extra permissions
+ * group has.
+ */
+static void
+zfs_fixup_group_entries(ace_t *acep, mode_t mode)
+{
+ mode_t extramode = (mode >> 3) & 07;
+ mode_t ownermode = (mode >> 6);
+
+ if (acep[0].a_flags & ACE_IDENTIFIER_GROUP) {
+
+ extramode &= ~ownermode;
+
+ if (extramode) {
+ if (extramode & 04) {
+ acep[0].a_access_mask &= ~ACE_READ_DATA;
+ acep[1].a_access_mask &= ~ACE_READ_DATA;
+ }
+ if (extramode & 02) {
+ acep[0].a_access_mask &=
+ ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+ acep[1].a_access_mask &=
+ ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+ }
+ if (extramode & 01) {
+ acep[0].a_access_mask &= ~ACE_EXECUTE;
+ acep[1].a_access_mask &= ~ACE_EXECUTE;
+ }
+ }
+ }
+}
+
+/*
+ * Apply the chmod algorithm as described
+ * in PSARC/2002/240
+ */
+static int
+zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp,
+ dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ ace_t *acep;
+ int i;
+ int error;
+ int entry_type;
+ int reuse_deny;
+ int need_canonical_six = 1;
+
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+
+ i = 0;
+ while (i < aclp->z_acl_count) {
+ acep = aclp->z_acl;
+ entry_type = (acep[i].a_flags & 0xf040);
+
+ if ((acep[i].a_type != ALLOW && acep[i].a_type != DENY) ||
+ (acep[i].a_flags & ACE_INHERIT_ONLY_ACE)) {
+ i++;
+ continue;
+ }
+
+
+ if (zfsvfs->z_acl_mode == DISCARD) {
+ zfs_ace_remove(aclp, i);
+ continue;
+ }
+
+ /*
+ * Need to split ace into two?
+ */
+ if ((acep[i].a_flags & (ACE_FILE_INHERIT_ACE|
+ ACE_DIRECTORY_INHERIT_ACE)) &&
+ (!(acep[i].a_flags & ACE_INHERIT_ONLY_ACE))) {
+ zfs_acl_split_ace(aclp, i);
+ i++;
+ continue;
+ }
+
+ if (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE ||
+ (entry_type == OWNING_GROUP)) {
+ acep[i].a_access_mask &= ~OGE_CLEAR;
+ i++;
+ continue;
+
+ } else {
+ if (acep[i].a_type == ALLOW) {
+
+ /*
+ * Check preceding ACE if any, to see
+ * if we need to prepend a DENY ACE.
+ * This is only applicable when the acl_mode
+ * property == groupmask.
+ */
+ if (zfsvfs->z_acl_mode == GROUPMASK) {
+
+ reuse_deny = zfs_reuse_deny(acep, i);
+
+ if (reuse_deny == B_FALSE) {
+ zfs_acl_prepend_deny(zp, aclp,
+ i, mode);
+ i++;
+ acep = aclp->z_acl;
+ } else {
+ zfs_acl_prepend_fixup(
+ &acep[i - 1],
+ &acep[i], mode,
+ zp->z_phys->zp_uid);
+ }
+ zfs_fixup_group_entries(&acep[i - 1],
+ mode);
+ }
+ }
+ i++;
+ }
+ }
+
+ /*
+ * Check out last six aces, if we have six.
+ */
+
+ if (aclp->z_acl_count >= 6) {
+ i = aclp->z_acl_count - 6;
+
+ if (zfs_have_canonical_six(aclp, i)) {
+ need_canonical_six = 0;
+ }
+ }
+
+ if (need_canonical_six) {
+
+ zfs_acl_append(aclp, 6);
+ i = aclp->z_acl_count;
+ acep = aclp->z_acl;
+ zfs_set_ace(&acep[i++], 0, DENY, -1, ACE_OWNER);
+ zfs_set_ace(&acep[i++], OWNER_ALLOW_MASK, ALLOW, -1, ACE_OWNER);
+ zfs_set_ace(&acep[i++], 0, DENY, -1, OWNING_GROUP);
+ zfs_set_ace(&acep[i++], 0, ALLOW, -1, OWNING_GROUP);
+ zfs_set_ace(&acep[i++], EVERYONE_DENY_MASK,
+ DENY, -1, ACE_EVERYONE);
+ zfs_set_ace(&acep[i++], EVERYONE_ALLOW_MASK,
+ ALLOW, -1, ACE_EVERYONE);
+ aclp->z_acl_count += 6;
+ }
+
+ zfs_acl_fixup_canonical_six(aclp, mode);
+
+ zp->z_phys->zp_mode = mode;
+ error = zfs_aclset_common(zp, aclp, tx, NULL);
+ return (error);
+}
+
+
+/*
+ * Read the znode's current ACL and apply the chmod algorithm for
+ * "mode" to it within the supplied tx.  Caller must hold z_lock.
+ */
+int
+zfs_acl_chmod_setattr(znode_t *zp, uint64_t mode, dmu_tx_t *tx)
+{
+	zfs_acl_t *aclp;
+	int error;
+
+	ASSERT(MUTEX_HELD(&zp->z_lock));
+	mutex_enter(&zp->z_acl_lock);
+	aclp = zfs_acl_node_read(zp);
+	error = zfs_acl_chmod(zp, mode, aclp, tx);
+	mutex_exit(&zp->z_acl_lock);
+	zfs_acl_free(aclp);
+	return (error);
+}
+
+/*
+ * strip off write_owner and write_acl
+ */
+static void
+zfs_securemode_update(zfsvfs_t *zfsvfs, ace_t *acep)
+{
+ if ((zfsvfs->z_acl_inherit == SECURE) &&
+ acep->a_type == ALLOW)
+ acep->a_access_mask &= ~SECURE_NO_INHERIT;
+}
+
+/*
+ * inherit inheritable ACEs from parent
+ */
+static zfs_acl_t *
+zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ ace_t *pacep;
+ ace_t *acep;
+ int ace_cnt = 0;
+ int pace_cnt;
+ int i, j;
+ zfs_acl_t *aclp = NULL;
+
+ i = j = 0;
+ pace_cnt = paclp->z_acl_count;
+ pacep = paclp->z_acl;
+ if (zfsvfs->z_acl_inherit != DISCARD) {
+ for (i = 0; i != pace_cnt; i++) {
+
+ if (zfsvfs->z_acl_inherit == NOALLOW &&
+ pacep[i].a_type == ALLOW)
+ continue;
+
+ if (zfs_ace_can_use(zp, &pacep[i])) {
+ ace_cnt++;
+ if (!(pacep[i].a_flags &
+ ACE_NO_PROPAGATE_INHERIT_ACE))
+ ace_cnt++;
+ }
+ }
+ }
+
+ aclp = zfs_acl_alloc(ace_cnt + OGE_PAD);
+ if (ace_cnt && zfsvfs->z_acl_inherit != DISCARD) {
+ acep = aclp->z_acl;
+ pacep = paclp->z_acl;
+ for (i = 0; i != pace_cnt; i++) {
+
+ if (zfsvfs->z_acl_inherit == NOALLOW &&
+ pacep[i].a_type == ALLOW)
+ continue;
+
+ if (zfs_ace_can_use(zp, &pacep[i])) {
+ /*
+ * Now create entry for inherited ace
+ */
+ acep[j] = pacep[i];
+
+ if (pacep[i].a_flags &
+ ACE_NO_PROPAGATE_INHERIT_ACE) {
+ acep[j].a_flags &= ~ALL_INHERIT;
+ j++;
+ continue;
+ }
+
+ if (pacep[i].a_type != ALLOW &&
+ pacep[i].a_type != DENY) {
+ zfs_securemode_update(zfsvfs, &acep[j]);
+ j++;
+ continue;
+ }
+
+ if (ZTOV(zp)->v_type != VDIR) {
+ acep[j].a_flags &= ~ALL_INHERIT;
+ zfs_securemode_update(zfsvfs, &acep[j]);
+ j++;
+ continue;
+ }
+
+ ASSERT(ZTOV(zp)->v_type == VDIR);
+
+ /*
+ * If we are inheriting an ACE targeted for
+ * only files, then leave the inherit_only
+ * one for future propagation.
+ */
+ if ((acep[j].a_flags & (ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE)) !=
+ ACE_FILE_INHERIT_ACE)
+ acep[j].a_flags &=
+ ~ACE_INHERIT_ONLY_ACE;
+
+ zfs_securemode_update(zfsvfs, &acep[j]);
+ j++;
+ }
+ }
+ }
+ aclp->z_acl_count = j;
+ ASSERT(aclp->z_slots >= aclp->z_acl_count);
+
+ return (aclp);
+}
+
+/*
+ * Create file system object initial permissions
+ * including inheritable ACEs.
+ */
+void
+zfs_perm_init(znode_t *zp, znode_t *parent, int flag,
+ vattr_t *vap, dmu_tx_t *tx, cred_t *cr)
+{
+ uint64_t mode;
+ uid_t uid;
+ gid_t gid;
+ int error;
+ int pull_down;
+ zfs_acl_t *aclp, *paclp;
+
+ mode = MAKEIMODE(vap->va_type, vap->va_mode);
+
+ /*
+ * Determine uid and gid.
+ */
+ if ((flag & (IS_ROOT_NODE | IS_REPLAY)) ||
+ ((flag & IS_XATTR) && (vap->va_type == VDIR))) {
+ uid = vap->va_uid;
+ gid = vap->va_gid;
+ } else {
+ uid = crgetuid(cr);
+ if ((vap->va_mask & AT_GID) &&
+ ((vap->va_gid == parent->z_phys->zp_gid) ||
+ groupmember(vap->va_gid, cr) ||
+ secpolicy_vnode_create_gid(cr)))
+ gid = vap->va_gid;
+ else
+ gid = (parent->z_phys->zp_mode & S_ISGID) ?
+ parent->z_phys->zp_gid : crgetgid(cr);
+ }
+
+ /*
+ * If we're creating a directory, and the parent directory has the
+ * set-GID bit set, set in on the new directory.
+ * Otherwise, if the user is neither privileged nor a member of the
+ * file's new group, clear the file's set-GID bit.
+ */
+
+ if ((parent->z_phys->zp_mode & S_ISGID) && (vap->va_type == VDIR))
+ mode |= S_ISGID;
+ else {
+ if ((mode & S_ISGID) &&
+ secpolicy_vnode_setids_setgids(cr, gid) != 0)
+ mode &= ~S_ISGID;
+ }
+
+ zp->z_phys->zp_uid = uid;
+ zp->z_phys->zp_gid = gid;
+ zp->z_phys->zp_mode = mode;
+
+ mutex_enter(&parent->z_lock);
+ pull_down = (parent->z_phys->zp_flags & ZFS_INHERIT_ACE);
+ if (pull_down) {
+ mutex_enter(&parent->z_acl_lock);
+ paclp = zfs_acl_node_read(parent);
+ mutex_exit(&parent->z_acl_lock);
+ aclp = zfs_acl_inherit(zp, paclp);
+ zfs_acl_free(paclp);
+ } else {
+ aclp = zfs_acl_alloc(6);
+ }
+ mutex_exit(&parent->z_lock);
+ mutex_enter(&zp->z_lock);
+ mutex_enter(&zp->z_acl_lock);
+ error = zfs_acl_chmod(zp, mode, aclp, tx);
+ mutex_exit(&zp->z_lock);
+ mutex_exit(&zp->z_acl_lock);
+ ASSERT3U(error, ==, 0);
+ zfs_acl_free(aclp);
+}
+
+/*
+ * Can use be used for inheritance
+ */
+static int
+zfs_ace_can_use(znode_t *zp, ace_t *acep)
+{
+ int vtype = ZTOV(zp)->v_type;
+
+ int iflags = (acep->a_flags & 0xf);
+
+ if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE))
+ return (1);
+
+ else if (iflags & ACE_FILE_INHERIT_ACE)
+ return (1);
+
+ return (0);
+}
+
+/*
+ * Retrieve a files ACL
+ */
+int
+zfs_getacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr)
+{
+ zfs_acl_t *aclp;
+ ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
+ int error;
+
+ if (error = zfs_zaccess(zp, ACE_READ_ACL, cr)) {
+ /*
+ * If owner of file then allow reading of the
+ * ACL.
+ */
+ if (crgetuid(cr) != zp->z_phys->zp_uid)
+ return (error);
+ }
+
+ if (mask == 0)
+ return (ENOSYS);
+
+ mutex_enter(&zp->z_acl_lock);
+
+ aclp = zfs_acl_node_read(zp);
+
+ if (mask & VSA_ACECNT) {
+ vsecp->vsa_aclcnt = aclp->z_acl_count;
+ }
+
+ if (mask & VSA_ACE) {
+ vsecp->vsa_aclentp = kmem_alloc(aclp->z_acl_count *
+ sizeof (ace_t), KM_SLEEP);
+ bcopy(aclp->z_acl, vsecp->vsa_aclentp,
+ aclp->z_acl_count * sizeof (ace_t));
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+
+ zfs_acl_free(aclp);
+
+ return (0);
+}
+
+/*
+ * Set a files ACL
+ */
+int
+zfs_setacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ ace_t *acep = vsecp->vsa_aclentp;
+ int aclcnt = vsecp->vsa_aclcnt;
+ ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
+ dmu_tx_t *tx;
+ int error;
+ int inherit;
+ zfs_acl_t *aclp;
+ uint64_t seq = 0;
+
+ if (mask == 0)
+ return (EINVAL);
+
+ if (!zfs_acl_valid(zp, acep, aclcnt, &inherit))
+ return (EINVAL);
+top:
+ error = zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr);
+ if (error == EACCES || error == ACCESS_UNDETERMINED) {
+ if ((error = secpolicy_vnode_setdac(cr,
+ zp->z_phys->zp_uid)) != 0) {
+ return (error);
+ }
+ } else if (error) {
+ return (error == EROFS ? error : EPERM);
+ }
+
+ mutex_enter(&zp->z_lock);
+ mutex_enter(&zp->z_acl_lock);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+
+ if (zp->z_phys->zp_acl.z_acl_extern_obj) {
+ dmu_tx_hold_write(tx, zp->z_phys->zp_acl.z_acl_extern_obj,
+ 0, ZFS_ACL_SIZE(aclcnt));
+ } else if (aclcnt > ACE_SLOT_CNT) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, ZFS_ACL_SIZE(aclcnt));
+ }
+
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ dmu_tx_abort(tx);
+
+ mutex_exit(&zp->z_acl_lock);
+ mutex_exit(&zp->z_lock);
+
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+ goto top;
+ }
+ return (error);
+ }
+
+ aclp = zfs_acl_alloc(aclcnt);
+ bcopy(acep, aclp->z_acl, sizeof (ace_t) * aclcnt);
+ aclp->z_acl_count = aclcnt;
+ error = zfs_aclset_common(zp, aclp, tx, &inherit);
+ ASSERT(error == 0);
+
+ zfs_acl_free(aclp);
+ seq = zfs_log_acl(zilog, tx, TX_ACL, zp, aclcnt, acep);
+ dmu_tx_commit(tx);
+done:
+ mutex_exit(&zp->z_acl_lock);
+ mutex_exit(&zp->z_lock);
+
+ zil_commit(zilog, seq, 0);
+
+ return (error);
+}
+
+/*
+ * Evaluate one ACE against the wanted mode.  *working_mode accumulates
+ * rights granted so far.  Returns 0 once all wanted rights are granted,
+ * EACCES on a matching DENY, or ACCESS_UNDETERMINED otherwise.
+ */
+static int
+zfs_ace_access(ace_t *zacep, int mode_wanted, int *working_mode)
+{
+	if ((*working_mode & mode_wanted) == mode_wanted) {
+		return (0);
+	}
+
+	if (zacep->a_access_mask & mode_wanted) {
+		if (zacep->a_type == ALLOW) {
+			*working_mode |= (mode_wanted & zacep->a_access_mask);
+			if ((*working_mode & mode_wanted) == mode_wanted)
+				return (0);
+		} else if (zacep->a_type == DENY) {
+			return (EACCES);
+		}
+	}
+
+	/*
+	 * haven't been specifically denied at this point
+	 * so return UNDETERMINED.
+	 */
+
+	return (ACCESS_UNDETERMINED);
+}
+
+
+/*
+ * Core NFSv4-style access check: walk the ACL in order, evaluating each
+ * applicable ACE (owner@, group entries, everyone@, or explicit user)
+ * until access is granted or denied.  *working_mode returns the rights
+ * accumulated from ALLOW ACEs.  Returns 0, EACCES, EROFS (write to a
+ * read-only fs), EIO (corrupt entry type), or ACCESS_UNDETERMINED.
+ * ZIL replay (z_assign >= TXG_INITIAL) bypasses the check entirely.
+ */
+static int
+zfs_zaccess_common(znode_t *zp, int v4_mode, int *working_mode, cred_t *cr)
+{
+	zfs_acl_t	*aclp;
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	ace_t		*zacep;
+	gid_t		gid;
+	int		mode_wanted = v4_mode;
+	int		cnt;
+	int		i;
+	int		access_deny = ACCESS_UNDETERMINED;
+	uint_t		entry_type;
+	uid_t		uid = crgetuid(cr);
+
+	*working_mode = 0;
+
+	if (zfsvfs->z_assign >= TXG_INITIAL)		/* ZIL replay */
+		return (0);
+
+	if ((v4_mode & WRITE_MASK) &&
+	    (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
+	    (!IS_DEVVP(ZTOV(zp)))) {
+		return (EROFS);
+	}
+
+	mutex_enter(&zp->z_acl_lock);
+
+	aclp = zfs_acl_node_read(zp);
+
+	zacep = aclp->z_acl;
+	cnt = aclp->z_acl_count;
+
+	for (i = 0; i != cnt; i++) {
+
+		if (zacep[i].a_flags & ACE_INHERIT_ONLY_ACE)
+			continue;
+
+		entry_type = (zacep[i].a_flags & 0xf040);
+		switch (entry_type) {
+		case ACE_OWNER:
+			if (uid == zp->z_phys->zp_uid) {
+				access_deny = zfs_ace_access(&zacep[i],
+				    mode_wanted, working_mode);
+			}
+			break;
+		case (ACE_IDENTIFIER_GROUP | ACE_GROUP):
+		case ACE_IDENTIFIER_GROUP:
+			/*
+			 * Owning group gid is in znode not ACL
+			 */
+			if (entry_type == (ACE_IDENTIFIER_GROUP | ACE_GROUP))
+				gid = zp->z_phys->zp_gid;
+			else
+				gid = zacep[i].a_who;
+
+			if (groupmember(gid, cr)) {
+				access_deny = zfs_ace_access(&zacep[i],
+				    mode_wanted, working_mode);
+			}
+			break;
+		case ACE_EVERYONE:
+			access_deny = zfs_ace_access(&zacep[i],
+			    mode_wanted, working_mode);
+			break;
+
+		/* USER Entry */
+		default:
+			if (entry_type == 0) {
+				if (uid == zacep[i].a_who) {
+					access_deny = zfs_ace_access(&zacep[i],
+					    mode_wanted, working_mode);
+				}
+				break;
+			}
+			/* unrecognized entry type: treat ACL as corrupt */
+			zfs_acl_free(aclp);
+			mutex_exit(&zp->z_acl_lock);
+			return (EIO);
+		}
+
+		if (access_deny != ACCESS_UNDETERMINED)
+			break;
+
+	}
+
+	mutex_exit(&zp->z_acl_lock);
+	zfs_acl_free(aclp);
+
+	return (access_deny);
+}
+
+
+/*
+ * Determine whether Access should be granted/denied, invoking least
+ * priv subsytem when a deny is determined.
+ */
+int
+zfs_zaccess(znode_t *zp, int mode, cred_t *cr)
+{
+ int working_mode = 0;
+ int error;
+ int is_attr;
+ znode_t *xzp;
+ znode_t *check_zp = zp;
+
+ is_attr = ((zp->z_phys->zp_flags & ZFS_XATTR) &&
+ (ZTOV(zp)->v_type == VDIR));
+
+ /*
+ * If attribute then validate against base file
+ */
+ if (is_attr) {
+ if ((error = zfs_zget(zp->z_zfsvfs,
+ zp->z_phys->zp_parent, &xzp)) != 0) {
+ return (error);
+ }
+ check_zp = xzp;
+ /*
+ * fixup mode to map to xattr perms
+ */
+
+ if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
+ mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+ mode |= ACE_WRITE_NAMED_ATTRS;
+ }
+
+ if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
+ mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
+ mode |= ACE_READ_NAMED_ATTRS;
+ }
+ }
+
+ error = zfs_zaccess_common(check_zp, mode, &working_mode, cr);
+
+ if (error == EROFS) {
+ if (is_attr)
+ VN_RELE(ZTOV(xzp));
+ return (error);
+ }
+
+ if (error || (working_mode != mode)) {
+ error = secpolicy_vnode_access(cr, ZTOV(check_zp),
+ check_zp->z_phys->zp_uid, ~zfs_v4_to_unix(working_mode));
+ }
+
+ if (is_attr)
+ VN_RELE(ZTOV(xzp));
+
+ return (error);
+}
+
+/*
+ * Special zaccess function to check for special nfsv4 perm.
+ * doesn't call secpolicy_vnode_access() for failure, since that
+ * would probably be the wrong policy function to call.
+ * instead its up to the caller to handle that situation.
+ */
+
+int
+zfs_zaccess_v4_perm(znode_t *zp, int mode, cred_t *cr)
+{
+ int working_mode = 0;
+ return (zfs_zaccess_common(zp, mode, &working_mode, cr));
+}
+
+/*
+ * Translate tradition unix VREAD/VWRITE/VEXEC mode into
+ * native ACL format and call zfs_zaccess()
+ */
+int
+zfs_zaccess_rwx(znode_t *zp, mode_t mode, cred_t *cr)
+{
+ int v4_mode = zfs_unix_to_v4(mode >> 6);
+
+ return (zfs_zaccess(zp, v4_mode, cr));
+}
+
+/*
+ * Determine whether Access should be granted/deny, without
+ * consulting least priv subsystem.
+ *
+ *
+ * The following chart is the recommended NFSv4 enforcement for
+ * ability to delete an object.
+ *
+ * -------------------------------------------------------
+ * | Parent Dir | Target Object Permissions |
+ * | permissions | |
+ * -------------------------------------------------------
+ * | | ACL Allows | ACL Denies| Delete |
+ * | | Delete | Delete | unspecified|
+ * -------------------------------------------------------
+ * | ACL Allows | Permit | Permit | Permit |
+ * | DELETE_CHILD | |
+ * -------------------------------------------------------
+ * | ACL Denies | Permit | Deny | Deny |
+ * | DELETE_CHILD | | | |
+ * -------------------------------------------------------
+ * | ACL specifies | | | |
+ * | only allow | Permit | Permit | Permit |
+ * | write and | | | |
+ * | execute | | | |
+ * -------------------------------------------------------
+ * | ACL denies | | | |
+ * | write and | Permit | Deny | Deny |
+ * | execute | | | |
+ * -------------------------------------------------------
+ * ^
+ * |
+ * No search privilege, can't even look up file?
+ *
+ */
+int
+zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
+{
+ int dzp_working_mode = 0;
+ int zp_working_mode = 0;
+ int dzp_error, zp_error;
+
+ /*
+ * Arghh, this check is going to require a couple of questions
+ * to be asked. We want specific DELETE permissions to
+ * take precedence over WRITE/EXECUTE. We don't
+ * want an ACL such as this to mess us up.
+ * user:sloar:write_data:deny,user:sloar:delete:allow
+ *
+ * However, deny permissions may ultimately be overridden
+ * by secpolicy_vnode_access().
+ */
+
+ dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD,
+ &dzp_working_mode, cr);
+ zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, cr);
+
+ if (dzp_error == EROFS || zp_error == EROFS)
+ return (dzp_error);
+
+ /*
+ * First handle the first row
+ */
+ if (dzp_working_mode & ACE_DELETE_CHILD)
+ return (0);
+
+ /*
+ * Second row
+ */
+
+ if (zp_working_mode & ACE_DELETE)
+ return (0);
+
+ /*
+ * Third Row
+ */
+
+ dzp_error = zfs_zaccess_common(dzp, ACE_WRITE_DATA|ACE_EXECUTE,
+ &dzp_working_mode, cr);
+
+ if (dzp_error == EROFS)
+ return (dzp_error);
+
+ if (dzp_working_mode & (ACE_WRITE_DATA|ACE_EXECUTE))
+ return (0);
+
+ /*
+ * Fourth Row
+ */
+
+ if (((dzp_working_mode & (ACE_WRITE_DATA|ACE_EXECUTE)) == 0) &&
+ (zp_working_mode & ACE_DELETE))
+ return (0);
+
+ return (secpolicy_vnode_access(cr, ZTOV(zp), dzp->z_phys->zp_uid,
+ S_IWRITE|S_IEXEC));
+}
+
+/*
+ * Access check for rename: delete permission on the source (and on any
+ * existing target), plus add_file/add_subdirectory on the source's
+ * directory, plus the sticky-bit removal check.
+ */
+int
+zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
+    znode_t *tzp, cred_t *cr)
+{
+	int add_perm;
+	int error;
+
+	add_perm = (ZTOV(szp)->v_type == VDIR) ?
+	    ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE;
+
+	/*
+	 * Rename permissions are combination of delete permission +
+	 * add file/subdir permission.
+	 */
+
+	/*
+	 * first make sure we do the delete portion.
+	 *
+	 * If that succeeds then check for add_file/add_subdir permissions
+	 */
+
+	if (error = zfs_zaccess_delete(sdzp, szp, cr))
+		return (error);
+
+	/*
+	 * If we have a tzp, see if we can delete it?
+	 */
+	if (tzp) {
+		if (error = zfs_zaccess_delete(tdzp, tzp, cr))
+			return (error);
+	}
+
+	/*
+	 * Now check for add permissions
+	 */
+	if (error = zfs_zaccess(sdzp, add_perm, cr))
+		return (error);
+
+	error = zfs_sticky_remove_access(sdzp, szp, cr);
+
+	return (error);
+}
diff --git a/usr/src/uts/common/fs/zfs/zfs_byteswap.c b/usr/src/uts/common/fs/zfs/zfs_byteswap.c
new file mode 100644
index 0000000000..e1e857aa44
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_byteswap.c
@@ -0,0 +1,99 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/vfs.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_acl.h>
+
+/*
+ * Byteswap an array of ACEs in place.
+ *
+ *	ace	- first entry of the array
+ *	ace_cnt	- number of entries to swap
+ */
+void
+zfs_ace_byteswap(ace_t *ace, int ace_cnt)
+{
+	int i;
+
+	for (i = 0; i != ace_cnt; i++, ace++) {
+		ace->a_who = BSWAP_32(ace->a_who);
+		ace->a_access_mask = BSWAP_32(ace->a_access_mask);
+		ace->a_flags = BSWAP_16(ace->a_flags);
+		ace->a_type = BSWAP_16(ace->a_type);
+	}
+}
+
+/* ARGSUSED */
+void
+zfs_acl_byteswap(void *buf, size_t size)
+{
+	int cnt;
+
+	/*
+	 * Arggh, since we don't know how many ACEs are in
+	 * the array, we have to swap the entire block
+	 */
+
+	/* any trailing partial ace_t bytes are ignored by the division */
+	cnt = size / sizeof (ace_t);
+
+	zfs_ace_byteswap((ace_t *)buf, cnt);
+}
+
+/*
+ * Byteswap an on-disk znode_phys_t in place, field by field, including
+ * the embedded ACL header and its ACE_SLOT_CNT inline ACEs.
+ */
+void
+zfs_znode_byteswap(void *buf, size_t size)
+{
+	znode_phys_t *zp = buf;
+
+	ASSERT(size >= sizeof (znode_phys_t));
+
+	zp->zp_crtime[0] = BSWAP_64(zp->zp_crtime[0]);
+	zp->zp_crtime[1] = BSWAP_64(zp->zp_crtime[1]);
+	zp->zp_atime[0] = BSWAP_64(zp->zp_atime[0]);
+	zp->zp_atime[1] = BSWAP_64(zp->zp_atime[1]);
+	zp->zp_mtime[0] = BSWAP_64(zp->zp_mtime[0]);
+	zp->zp_mtime[1] = BSWAP_64(zp->zp_mtime[1]);
+	zp->zp_ctime[0] = BSWAP_64(zp->zp_ctime[0]);
+	zp->zp_ctime[1] = BSWAP_64(zp->zp_ctime[1]);
+	zp->zp_gen = BSWAP_64(zp->zp_gen);
+	zp->zp_mode = BSWAP_64(zp->zp_mode);
+	zp->zp_size = BSWAP_64(zp->zp_size);
+	zp->zp_parent = BSWAP_64(zp->zp_parent);
+	zp->zp_links = BSWAP_64(zp->zp_links);
+	zp->zp_xattr = BSWAP_64(zp->zp_xattr);
+	zp->zp_rdev = BSWAP_64(zp->zp_rdev);
+	zp->zp_flags = BSWAP_64(zp->zp_flags);
+	zp->zp_uid = BSWAP_64(zp->zp_uid);
+	zp->zp_gid = BSWAP_64(zp->zp_gid);
+	zp->zp_pad[0] = BSWAP_64(zp->zp_pad[0]);
+	zp->zp_pad[1] = BSWAP_64(zp->zp_pad[1]);
+	zp->zp_pad[2] = BSWAP_64(zp->zp_pad[2]);
+	zp->zp_pad[3] = BSWAP_64(zp->zp_pad[3]);
+
+	/* embedded ACL header and inline ACE array */
+	zp->zp_acl.z_acl_extern_obj = BSWAP_64(zp->zp_acl.z_acl_extern_obj);
+	zp->zp_acl.z_acl_count = BSWAP_32(zp->zp_acl.z_acl_count);
+	zp->zp_acl.z_acl_version = BSWAP_16(zp->zp_acl.z_acl_version);
+	zp->zp_acl.z_acl_state = BSWAP_16(zp->zp_acl.z_acl_state);
+	zfs_ace_byteswap(&zp->zp_acl.z_ace_data[0], ACE_SLOT_CNT);
+}
diff --git a/usr/src/uts/common/fs/zfs/zfs_ctldir.c b/usr/src/uts/common/fs/zfs/zfs_ctldir.c
new file mode 100644
index 0000000000..229b042c4a
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_ctldir.c
@@ -0,0 +1,936 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * ZFS control directory (a.k.a. ".zfs")
+ *
+ * This directory provides a common location for all ZFS meta-objects.
+ * Currently, this is only the 'snapshot' directory, but this may expand in the
+ * future. The elements are built using the GFS primitives, as the hierarchy
+ * does not actually exist on disk.
+ *
+ * For 'snapshot', we don't want to have all snapshots always mounted, because
+ * this would take up a huge amount of space in /etc/mnttab. We have three
+ * types of objects:
+ *
+ * ctldir ------> snapshotdir -------> snapshot
+ * |
+ * |
+ * V
+ * mounted fs
+ *
+ * The 'snapshot' node contains just enough information to lookup '..' and act
+ * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we
+ * perform an automount of the underlying filesystem and return the
+ * corresponding vnode.
+ *
+ * All mounts are handled automatically by the kernel, but unmounts are
+ * (currently) handled from user land. The main reason is that there is no
+ * reliable way to auto-unmount the filesystem when it's "no longer in use".
+ * When the user unmounts a filesystem, we call zfsctl_unmount(), which
+ * unmounts any snapshots within the snapshot directory.
+ */
+
+#include <fs/fs_subr.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/gfs.h>
+#include <sys/stat.h>
+#include <sys/dmu.h>
+#include <sys/mount.h>
+
+/*
+ * One entry per mounted snapshot, kept in the snapdir's sd_snaps AVL
+ * tree, sorted by name.
+ */
+typedef struct {
+	char *se_name;		/* snapshot name (heap-allocated copy) */
+	vnode_t *se_root;	/* root vnode covering the mounted snapshot */
+	avl_node_t se_node;	/* linkage in zfsctl_snapdir_t.sd_snaps */
+} zfs_snapentry_t;
+
+/*
+ * AVL comparator for zfs_snapentry_t, ordered by snapshot name.
+ * Normalizes strcmp()'s result to exactly -1/0/+1 as avl_create()
+ * requires.
+ */
+static int
+snapentry_compare(const void *a, const void *b)
+{
+	const zfs_snapentry_t *sa = a;
+	const zfs_snapentry_t *sb = b;
+	int ret = strcmp(sa->se_name, sb->se_name);
+
+	if (ret < 0)
+		return (-1);
+	else if (ret > 0)
+		return (1);
+	else
+		return (0);
+}
+
+vnodeops_t *zfsctl_ops_root;
+vnodeops_t *zfsctl_ops_snapdir;
+vnodeops_t *zfsctl_ops_snapshot;
+
+static const fs_operation_def_t zfsctl_tops_root[];
+static const fs_operation_def_t zfsctl_tops_snapdir[];
+static const fs_operation_def_t zfsctl_tops_snapshot[];
+
+static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
+static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
+
+static gfs_opsvec_t zfsctl_opsvec[] = {
+ { ".zfs", zfsctl_tops_root, &zfsctl_ops_root },
+ { ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir },
+ { ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot },
+ { NULL }
+};
+
+/*
+ * Common .zfs node: the GFS directory state plus the inode/object id
+ * used for getattr and fid generation.
+ */
+typedef struct zfsctl_node {
+	gfs_dir_t zc_gfs_private;	/* must be first: GFS dir state */
+	uint64_t zc_id;			/* inode number / objset id */
+} zfsctl_node_t;
+
+/*
+ * The '.zfs/snapshot' directory: a zfsctl_node_t plus the tree of
+ * currently mounted snapshots, protected by sd_lock.
+ */
+typedef struct zfsctl_snapdir {
+	zfsctl_node_t sd_node;		/* must be first: base node */
+	kmutex_t sd_lock;		/* protects sd_snaps */
+	avl_tree_t sd_snaps;		/* zfs_snapentry_t, by name */
+} zfsctl_snapdir_t;
+
+/*
+ * Root directory elements. We have only a single static entry, 'snapshot'.
+ */
+static gfs_dirent_t zfsctl_root_entries[] = {
+ { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
+ { NULL }
+};
+
+/* include . and .. in the calculation */
+#define NROOT_ENTRIES ((sizeof (zfsctl_root_entries) / \
+ sizeof (gfs_dirent_t)) + 1)
+
+
+/*
+ * Initialize the various GFS pieces we'll need to create and manipulate .zfs
+ * directories. This is called from the ZFS init routine, and initializes the
+ * vnode ops vectors that we'll be using.
+ */
+void
+zfsctl_init(void)
+{
+	/* builds zfsctl_ops_* from the zfsctl_tops_* templates */
+	VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0);
+}
+
+/*
+ * Tear down the vnode ops vectors created by zfsctl_init().  Called
+ * from the ZFS module fini path.
+ */
+void
+zfsctl_fini(void)
+{
+	/*
+	 * Remove vfsctl vnode ops
+	 */
+	if (zfsctl_ops_root)
+		vn_freevnodeops(zfsctl_ops_root);
+	if (zfsctl_ops_snapdir)
+		vn_freevnodeops(zfsctl_ops_snapdir);
+	if (zfsctl_ops_snapshot)
+		vn_freevnodeops(zfsctl_ops_snapshot);
+
+	zfsctl_ops_root = NULL;
+	zfsctl_ops_snapdir = NULL;
+	zfsctl_ops_snapshot = NULL;
+}
+
+/*
+ * Return the inode number associated with the 'snapshot' directory.
+ * GFS readdir callback for the root; 'snapshot' is the only static
+ * entry, so index must be 0.
+ */
+/* ARGSUSED */
+static ino64_t
+zfsctl_root_inode_cb(vnode_t *vp, int index)
+{
+	ASSERT(index == 0);
+	return (ZFSCTL_INO_SNAPDIR);
+}
+
+/*
+ * Create the '.zfs' directory. This directory is cached as part of the VFS
+ * structure. This results in a hold on the vfs_t. The code in zfs_umount()
+ * therefore checks against a vfs_count of 2 instead of 1. This reference
+ * is removed when the ctldir is destroyed in the unmount.
+ */
+void
+zfsctl_create(zfsvfs_t *zfsvfs)
+{
+	vnode_t *vp;
+	zfsctl_node_t *zcp;
+
+	ASSERT(zfsvfs->z_ctldir == NULL);
+
+	vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
+	    zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
+	    zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
+	zcp = vp->v_data;
+	zcp->zc_id = ZFSCTL_INO_ROOT;
+
+	/*
+	 * We're only faking the fact that we have a root of a filesystem for
+	 * the sake of the GFS interfaces. Undo the flag manipulation it did
+	 * for us.
+	 */
+	vp->v_flag &= ~(VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT);
+
+	zfsvfs->z_ctldir = vp;
+}
+
+/*
+ * Destroy the '.zfs' directory. Only called when the filesystem is
+ * unmounted, and there are no more references. Release the vnode,
+ * which will release the hold on the vfs structure.
+ */
+void
+zfsctl_destroy(zfsvfs_t *zfsvfs)
+{
+	/* the cached reference from zfsctl_create() must be the last one */
+	ASSERT(zfsvfs->z_ctldir->v_count == 1);
+	VN_RELE(zfsvfs->z_ctldir);
+	zfsvfs->z_ctldir = NULL;
+}
+
+/*
+ * Given a root znode, retrieve the associated .zfs directory.
+ * Add a hold to the vnode and return it; the caller is responsible
+ * for the VN_RELE.
+ */
+vnode_t *
+zfsctl_root(znode_t *zp)
+{
+	ASSERT(zfs_has_ctldir(zp));
+	VN_HOLD(zp->z_zfsvfs->z_ctldir);
+	return (zp->z_zfsvfs->z_ctldir);
+}
+
+/*
+ * Common open routine. Disallow any write access; everything under
+ * .zfs is read-only.
+ */
+/* ARGSUSED */
+static int
+zfsctl_common_open(vnode_t **vpp, int flags, cred_t *cr)
+{
+	if (flags & FWRITE)
+		return (EACCES);
+
+	return (0);
+}
+
+/*
+ * Common close routine. Nothing to do here: no per-open state is kept.
+ */
+/* ARGSUSED */
+static int
+zfsctl_common_close(vnode_t *vpp, int flags, int count, offset_t off,
+    cred_t *cr)
+{
+	return (0);
+}
+
+/*
+ * Common access routine. Disallow writes; read and execute are always
+ * permitted regardless of credentials.
+ */
+/* ARGSUSED */
+static int
+zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr)
+{
+	if (mode & VWRITE)
+		return (EACCES);
+
+	return (0);
+}
+
+/*
+ * Common getattr function. Fill in basic information shared by all
+ * .zfs nodes; callers are expected to fill in va_nodeid, va_nlink
+ * and va_size themselves.
+ */
+static void
+zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
+{
+	timestruc_t now;
+
+	vap->va_uid = 0;
+	vap->va_gid = 0;
+	vap->va_rdev = 0;
+	/*
+	 * We are a purely virtual object, so we have no
+	 * blocksize or allocated blocks.
+	 */
+	vap->va_blksize = 0;
+	vap->va_nblocks = 0;
+	vap->va_seq = 0;
+	vap->va_fsid = vp->v_vfsp->vfs_dev;
+	/* world r-x, no write bits anywhere: .zfs is read-only */
+	vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
+	    S_IROTH | S_IXOTH;
+	vap->va_type = VDIR;
+	/*
+	 * We live in the now.
+	 */
+	gethrestime(&now);
+	vap->va_mtime = vap->va_ctime = vap->va_atime = now;
+}
+
+/*
+ * Common VOP_FID for .zfs nodes.  Encode the node's object id into a
+ * short-form zfid with a generation of 0.  Returns ENOSPC (with the
+ * required length filled in) if the caller's fid buffer is too small.
+ */
+static int
+zfsctl_common_fid(vnode_t *vp, fid_t *fidp)
+{
+	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+	zfsctl_node_t *zcp = vp->v_data;
+	uint64_t object = zcp->zc_id;
+	zfid_short_t *zfid;
+	int i;
+
+	ZFS_ENTER(zfsvfs);
+
+	if (fidp->fid_len < SHORT_FID_LEN) {
+		fidp->fid_len = SHORT_FID_LEN;
+		/* must drop the teardown lock on this early return, too */
+		ZFS_EXIT(zfsvfs);
+		return (ENOSPC);
+	}
+
+	zfid = (zfid_short_t *)fidp;
+
+	zfid->zf_len = SHORT_FID_LEN;
+
+	for (i = 0; i < sizeof (zfid->zf_object); i++)
+		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+	/* .zfs znodes always have a generation number of 0 */
+	for (i = 0; i < sizeof (zfid->zf_gen); i++)
+		zfid->zf_gen[i] = 0;
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * .zfs inode namespace
+ *
+ * We need to generate unique inode numbers for all files and directories
+ * within the .zfs pseudo-filesystem. We use the following scheme:
+ *
+ * ENTRY ZFSCTL_INODE
+ * .zfs 1
+ * .zfs/snapshot 2
+ * .zfs/snapshot/<snap> objectid(snap)
+ */
+
+#define ZFSCTL_INO_SNAP(id) (id)
+
+/*
+ * Get root directory attributes.  Static inode number, link/size count
+ * derived from the fixed entry table (plus '.' and '..').
+ */
+/* ARGSUSED */
+static int
+zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+
+	ZFS_ENTER(zfsvfs);
+	vap->va_nodeid = ZFSCTL_INO_ROOT;
+	vap->va_nlink = vap->va_size = NROOT_ENTRIES;
+
+	zfsctl_common_getattr(vp, vap);
+	ZFS_EXIT(zfsvfs);
+
+	return (0);
+}
+
+/*
+ * Special case the handling of "..": '.zfs' hangs off the filesystem
+ * root, so '..' resolves to the real root vnode rather than through
+ * the GFS directory machinery.
+ */
+/* ARGSUSED */
+int
+zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
+    int flags, vnode_t *rdir, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+	int err;
+
+	ZFS_ENTER(zfsvfs);
+
+	if (strcmp(nm, "..") == 0) {
+		err = VFS_ROOT(dvp->v_vfsp, vpp);
+	} else {
+		err = gfs_dir_lookup(dvp, nm, vpp);
+	}
+
+	ZFS_EXIT(zfsvfs);
+
+	return (err);
+}
+
+/* vnode operations template for the '.zfs' root directory */
+static const fs_operation_def_t zfsctl_tops_root[] = {
+	{ VOPNAME_OPEN, zfsctl_common_open },
+	{ VOPNAME_CLOSE, zfsctl_common_close },
+	{ VOPNAME_IOCTL, fs_inval },
+	{ VOPNAME_GETATTR, zfsctl_root_getattr },
+	{ VOPNAME_ACCESS, zfsctl_common_access },
+	{ VOPNAME_READDIR, gfs_vop_readdir },
+	{ VOPNAME_LOOKUP, zfsctl_root_lookup },
+	{ VOPNAME_SEEK, fs_seek },
+	{ VOPNAME_INACTIVE, (fs_generic_func_p) gfs_vop_inactive },
+	{ VOPNAME_FID, zfsctl_common_fid },
+	{ NULL }
+};
+
+/*
+ * Build the full dataset name "pool/fs@name" for snapshot 'name' of the
+ * filesystem backing vp, into zname (a buffer of at least 'len' bytes).
+ * Returns ENAMETOOLONG if the result would not fit.
+ */
+static int
+zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
+{
+	objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
+
+	dmu_objset_name(os, zname);
+	(void) strcat(zname, "@");
+	if (strlen(zname) + strlen(name) >= len)
+		return (ENAMETOOLONG);
+	(void) strcat(zname, name);
+	return (0);
+}
+
+/*
+ * Unmount the snapshot named 'name' under snapdir dvp and free its AVL
+ * entry.  The caller must hold sd_lock.  Returns 0 on success, ENOENT
+ * if the snapshot is not currently mounted, or the vn_vfswlock()/
+ * dounmount() error.
+ */
+static int
+zfsctl_unmount_snap(vnode_t *dvp, const char *name, int force, cred_t *cr)
+{
+	zfsctl_snapdir_t *sdp = dvp->v_data;
+	zfs_snapentry_t search, *sep;
+	avl_index_t where;
+	int err;
+
+	ASSERT(MUTEX_HELD(&sdp->sd_lock));
+
+	search.se_name = (char *)name;
+	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL)
+		return (ENOENT);
+
+	ASSERT(vn_ismntpt(sep->se_root));
+
+	/* this will be dropped by dounmount() */
+	if ((err = vn_vfswlock(sep->se_root)) != 0)
+		return (err);
+
+	VN_HOLD(sep->se_root);
+	if ((err = dounmount(vn_mountedvfs(sep->se_root), force, kcred)) != 0) {
+		/* don't leak the hold taken above when the unmount fails */
+		VN_RELE(sep->se_root);
+		return (err);
+	}
+	ASSERT(sep->se_root->v_count == 1);
+	/* bypass VN_RELE: we hold sd_lock, see zfsctl_umount_snapshots() */
+	gfs_vop_inactive(sep->se_root, cr);
+
+	avl_remove(&sdp->sd_snaps, sep);
+	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+	kmem_free(sep, sizeof (zfs_snapentry_t));
+
+	return (0);
+}
+
+
+/*
+ * Rename the in-kernel state for a mounted snapshot: re-key its AVL
+ * entry under the new name 'nm', and rewrite the tails of the vfs
+ * mountpoint and resource strings to match.  Caller holds sd_lock.
+ */
+static int
+zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
+{
+	avl_index_t where;
+	vfs_t *vfsp;
+	refstr_t *pathref;
+	char newpath[MAXNAMELEN];
+	const char *oldpath;
+	char *tail;
+	int err;
+
+	ASSERT(MUTEX_HELD(&sdp->sd_lock));
+	ASSERT(sep != NULL);
+
+	vfsp = vn_mountedvfs(sep->se_root);
+	ASSERT(vfsp != NULL);
+
+	if (err = vfs_lock(vfsp))
+		return (err);
+
+	/*
+	 * Change the name in the AVL tree.
+	 */
+	avl_remove(&sdp->sd_snaps, sep);
+	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
+	(void) strcpy(sep->se_name, nm);
+	/* new name must not already be present */
+	VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
+	avl_insert(&sdp->sd_snaps, sep, where);
+
+	/*
+	 * Change the current mountpoint info:
+	 *	- update the tail of the mntpoint path
+	 *	- update the tail of the resource path
+	 */
+	pathref = vfs_getmntpoint(vfsp);
+	oldpath = refstr_value(pathref);
+	/* mountpoint is ".../.zfs/snapshot/<snap>"; keep up to last '/' */
+	VERIFY((tail = strrchr(oldpath, '/')) != NULL);
+	ASSERT((tail - oldpath) + strlen(nm) + 2 < MAXNAMELEN);
+	(void) strncpy(newpath, oldpath, tail - oldpath + 1);
+	(void) strcat(newpath, nm);
+	refstr_rele(pathref);
+	vfs_setmntpoint(vfsp, newpath);
+
+	pathref = vfs_getresource(vfsp);
+	oldpath = refstr_value(pathref);
+	/* resource is "pool/fs@<snap>"; keep up to the '@' */
+	VERIFY((tail = strrchr(oldpath, '@')) != NULL);
+	ASSERT((tail - oldpath) + strlen(nm) + 2 < MAXNAMELEN);
+	(void) strncpy(newpath, oldpath, tail - oldpath + 1);
+	(void) strcat(newpath, nm);
+	refstr_rele(pathref);
+	vfs_setresource(vfsp, newpath);
+
+	vfs_unlock(vfsp);
+	return (0);
+}
+
+/*
+ * VOP_RENAME for '.zfs/snapshot': rename snapshot snm to tnm.  Updates
+ * the in-kernel snapentry/vfs state (if the snapshot is mounted) and
+ * then renames the underlying dataset.  Snapshots cannot be moved out
+ * of the snapshot directory.
+ */
+static int
+zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
+    cred_t *cr)
+{
+	zfsctl_snapdir_t *sdp = sdvp->v_data;
+	zfs_snapentry_t search, *sep;
+	avl_index_t where;
+	char from[MAXNAMELEN], to[MAXNAMELEN];
+	int err;
+
+	VERIFY(zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from) == 0);
+	err = zfs_secpolicy_write(from, NULL, cr);
+	if (err)
+		return (err);
+
+	/*
+	 * Cannot move snapshots out of the snapdir.
+	 */
+	if (sdvp != tdvp)
+		return (EINVAL);
+
+	if (strcmp(snm, tnm) == 0)
+		return (0);
+
+	mutex_enter(&sdp->sd_lock);
+
+	/*
+	 * An unmounted snapshot has no AVL entry; in that case skip
+	 * straight to renaming the dataset itself.
+	 */
+	search.se_name = (char *)snm;
+	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
+		err = zfsctl_rename_snap(sdp, sep, tnm);
+		if (err) {
+			mutex_exit(&sdp->sd_lock);
+			return (err);
+		}
+	}
+
+
+	VERIFY(zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to) == 0);
+	err = dmu_objset_rename(from, to);
+
+	mutex_exit(&sdp->sd_lock);
+
+	return (err);
+}
+
+/*
+ * VOP_RMDIR for '.zfs/snapshot': unmount and destroy snapshot 'name'.
+ *
+ * NOTE(review): if the snapshot is not currently mounted,
+ * zfsctl_unmount_snap() returns ENOENT and the dataset is never
+ * destroyed -- confirm whether unmounted snapshots are expected to be
+ * removable through this path.
+ */
+/* ARGSUSED */
+static int
+zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
+{
+	zfsctl_snapdir_t *sdp = dvp->v_data;
+	char snapname[MAXNAMELEN];
+	int err;
+
+	VERIFY(zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname) == 0);
+	err = zfs_secpolicy_write(snapname, NULL, cr);
+	if (err)
+		return (err);
+
+	mutex_enter(&sdp->sd_lock);
+
+	err = zfsctl_unmount_snap(dvp, name, 0, cr);
+	if (err) {
+		mutex_exit(&sdp->sd_lock);
+		return (err);
+	}
+
+	err = dmu_objset_destroy(snapname);
+
+	mutex_exit(&sdp->sd_lock);
+
+	return (err);
+}
+
+/*
+ * Lookup entry point for the 'snapshot' directory. Try to open the
+ * snapshot if it exist, creating the pseudo filesystem vnode as necessary.
+ * Perform a mount of the associated dataset on top of the vnode.
+ */
+/* ARGSUSED */
+static int
+zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
+    int flags, vnode_t *rdir, cred_t *cr)
+{
+	zfsctl_snapdir_t *sdp = dvp->v_data;
+	objset_t *snap;
+	char snapname[MAXNAMELEN];
+	char *mountpoint;
+	zfs_snapentry_t *sep, search;
+	struct mounta margs;
+	vfs_t *vfsp;
+	size_t mountpoint_len;
+	avl_index_t where;
+	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+	int err;
+
+	ASSERT(dvp->v_type == VDIR);
+
+	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0)
+		return (0);
+
+	/*
+	 * If we get a recursive call, that means we got called
+	 * from the domount() code while it was trying to look up the
+	 * spec (which looks like a local path for zfs). We need to
+	 * add some flag to domount() to tell it not to do this lookup.
+	 */
+	if (MUTEX_HELD(&sdp->sd_lock))
+		return (ENOENT);
+
+	ZFS_ENTER(zfsvfs);
+
+	/*
+	 * Build the full dataset name ("pool/fs@snap") up front.  The
+	 * domount code below needs it on *both* paths that reach it --
+	 * the fresh-lookup path and the remount path (snapshot unmounted
+	 * behind our backs).  Building it only on the fresh-lookup path
+	 * would leave margs.spec pointing at uninitialized stack memory
+	 * for the remount case.
+	 */
+	VERIFY(zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname) == 0);
+
+	mutex_enter(&sdp->sd_lock);
+	search.se_name = (char *)nm;
+	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
+		*vpp = sep->se_root;
+		VN_HOLD(*vpp);
+		/*
+		 * If the snapshot was unmounted behind our backs, remount it.
+		 */
+		if (!vn_ismntpt(*vpp))
+			goto domount;
+		VERIFY(traverse(vpp) == 0);
+		mutex_exit(&sdp->sd_lock);
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/*
+	 * The requested snapshot is not currently mounted, look it up.
+	 */
+	if (dmu_objset_open(snapname, DMU_OST_ZFS,
+	    DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) {
+		mutex_exit(&sdp->sd_lock);
+		ZFS_EXIT(zfsvfs);
+		return (ENOENT);
+	}
+
+	sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
+	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
+	(void) strcpy(sep->se_name, nm);
+	*vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
+	avl_insert(&sdp->sd_snaps, sep, where);
+
+	dmu_objset_close(snap);
+domount:
+	mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) +
+	    strlen("/.zfs/snapshot/") + strlen(nm) + 1;
+	mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
+	(void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
+	    refstr_value(dvp->v_vfsp->vfs_mntpt), nm);
+
+	margs.spec = snapname;
+	margs.dir = mountpoint;
+	margs.flags = MS_SYSSPACE | MS_NOMNTTAB;
+	margs.fstype = "zfs";
+	margs.dataptr = NULL;
+	margs.datalen = 0;
+	margs.optptr = NULL;
+	margs.optlen = 0;
+
+	err = domount("zfs", &margs, *vpp, kcred, &vfsp);
+	ASSERT3U(err, ==, 0);
+
+	kmem_free(mountpoint, mountpoint_len);
+
+	VFS_RELE(vfsp);
+
+	/*
+	 * Fix up the root vnode.
+	 */
+	VERIFY(traverse(vpp) == 0);
+	ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
+	VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
+	(*vpp)->v_vfsp = zfsvfs->z_vfs;
+	(*vpp)->v_flag &= ~VROOT;
+	mutex_exit(&sdp->sd_lock);
+	ZFS_EXIT(zfsvfs);
+
+	return (0);
+}
+
+/*
+ * GFS readdir callback for '.zfs/snapshot': return the next snapshot
+ * name/objset-id after cookie *offp, or set *eofp when the snapshot
+ * list is exhausted.
+ */
+/* ARGSUSED */
+static int
+zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
+    offset_t *offp, offset_t *nextp, void *data)
+{
+	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+	char snapname[MAXNAMELEN];
+	uint64_t id, cookie;
+
+	ZFS_ENTER(zfsvfs);
+
+	cookie = *offp;
+	if (dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id,
+	    &cookie) == ENOENT) {
+		*eofp = 1;
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	(void) strcpy(dp->d_name, snapname);
+	dp->d_ino = ZFSCTL_INO_SNAP(id);
+	*nextp = cookie;
+
+	ZFS_EXIT(zfsvfs);
+
+	return (0);
+}
+
+/*
+ * Construct the '.zfs/snapshot' directory vnode under parent pvp,
+ * initializing its lock and (empty) tree of mounted snapshots.
+ */
+vnode_t *
+zfsctl_mknode_snapdir(vnode_t *pvp)
+{
+	vnode_t *vp;
+	zfsctl_snapdir_t *sdp;
+
+	vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp,
+	    zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
+	    zfsctl_snapdir_readdir_cb, NULL);
+	sdp = vp->v_data;
+	sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
+	mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
+	avl_create(&sdp->sd_snaps, snapentry_compare,
+	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
+	return (vp);
+}
+
+/*
+ * VOP_GETATTR for '.zfs/snapshot': link/size count is the number of
+ * currently mounted snapshots plus '.' and '..'.
+ */
+/* ARGSUSED */
+static int
+zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+	zfsctl_snapdir_t *sdp = vp->v_data;
+
+	ZFS_ENTER(zfsvfs);
+	zfsctl_common_getattr(vp, vap);
+	vap->va_nodeid = gfs_file_inode(vp);
+	vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
+	ZFS_EXIT(zfsvfs);
+
+	return (0);
+}
+
+/*
+ * VOP_INACTIVE for '.zfs/snapshot': tear down the snapdir state and
+ * let GFS free the vnode.  All snapshots must already be unmounted
+ * (and their entries freed) by this point.
+ */
+static void
+zfsctl_snapdir_inactive(vnode_t *vp, cred_t *cr)
+{
+	zfsctl_snapdir_t *sdp = vp->v_data;
+
+	ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
+	mutex_destroy(&sdp->sd_lock);
+	avl_destroy(&sdp->sd_snaps);
+	gfs_vop_inactive(vp, cr);
+}
+
+/* vnode operations template for the '.zfs/snapshot' directory */
+static const fs_operation_def_t zfsctl_tops_snapdir[] = {
+	{ VOPNAME_OPEN, zfsctl_common_open },
+	{ VOPNAME_CLOSE, zfsctl_common_close },
+	{ VOPNAME_IOCTL, fs_inval },
+	{ VOPNAME_GETATTR, zfsctl_snapdir_getattr },
+	{ VOPNAME_ACCESS, zfsctl_common_access },
+	{ VOPNAME_RENAME, zfsctl_snapdir_rename },
+	{ VOPNAME_RMDIR, zfsctl_snapdir_remove },
+	{ VOPNAME_READDIR, gfs_vop_readdir },
+	{ VOPNAME_LOOKUP, zfsctl_snapdir_lookup },
+	{ VOPNAME_SEEK, fs_seek },
+	{ VOPNAME_INACTIVE, (fs_generic_func_p) zfsctl_snapdir_inactive },
+	{ VOPNAME_FID, zfsctl_common_fid },
+	{ NULL }
+};
+
+/*
+ * Construct the vnode that acts as the mountpoint for a snapshot with
+ * the given objset id, under parent snapdir pvp.
+ */
+static vnode_t *
+zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
+{
+	vnode_t *vp;
+	zfsctl_node_t *zcp;
+
+	vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp,
+	    zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
+	zcp = vp->v_data;
+	zcp->zc_id = objset;
+
+	return (vp);
+}
+
+/*
+ * VOP_INACTIVE for a snapshot mountpoint vnode: remove the snapshot's
+ * entry from the parent snapdir's AVL tree and free it, then let GFS
+ * free the vnode.  Bails out if another reference appeared while we
+ * were acquiring sd_lock.
+ */
+static void
+zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr)
+{
+	zfsctl_snapdir_t *sdp;
+	zfs_snapentry_t *sep, *next;
+	vnode_t *dvp;
+
+	VERIFY(gfs_dir_lookup(vp, "..", &dvp) == 0);
+	sdp = dvp->v_data;
+
+	mutex_enter(&sdp->sd_lock);
+
+	if (vp->v_count > 1) {
+		/* raced with a new hold; not actually inactive */
+		mutex_exit(&sdp->sd_lock);
+		return;
+	}
+	ASSERT(!vn_ismntpt(vp));
+
+	sep = avl_first(&sdp->sd_snaps);
+	while (sep != NULL) {
+		next = AVL_NEXT(&sdp->sd_snaps, sep);
+
+		if (sep->se_root == vp) {
+			avl_remove(&sdp->sd_snaps, sep);
+			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+			kmem_free(sep, sizeof (zfs_snapentry_t));
+			break;
+		}
+		sep = next;
+	}
+	/* every snapshot vnode must have had an entry in the tree */
+	ASSERT(sep != NULL);
+
+	mutex_exit(&sdp->sd_lock);
+	VN_RELE(dvp);
+
+	gfs_vop_inactive(vp, cr);
+}
+
+
+/*
+ * These VP's should never see the light of day. They should always
+ * be covered.  Only INACTIVE is needed, to clean up the snapentry when
+ * the covered vnode goes away.
+ */
+static const fs_operation_def_t zfsctl_tops_snapshot[] = {
+	/* braced entries, matching the other op tables in this file */
+	{ VOPNAME_INACTIVE, (fs_generic_func_p) zfsctl_snapshot_inactive },
+	{ NULL }
+};
+
+/*
+ * Find the zfsvfs_t of the mounted snapshot with the given objset id
+ * under vfsp's '.zfs/snapshot' directory.  Used for fid-based lookups
+ * (e.g. NFS file handles).  Returns EINVAL if no such snapshot is
+ * currently mounted.
+ */
+int
+zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	vnode_t *dvp, *vp;
+	zfsctl_snapdir_t *sdp;
+	zfsctl_node_t *zcp;
+	zfs_snapentry_t *sep;
+	int error;
+
+	ASSERT(zfsvfs->z_ctldir != NULL);
+	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
+	    NULL, 0, NULL, kcred);
+	if (error != 0)
+		return (error);
+	sdp = dvp->v_data;
+
+	/* linear scan: the tree is keyed by name, not by objset id */
+	mutex_enter(&sdp->sd_lock);
+	sep = avl_first(&sdp->sd_snaps);
+	while (sep != NULL) {
+		vp = sep->se_root;
+		zcp = vp->v_data;
+		if (zcp->zc_id == objsetid)
+			break;
+
+		sep = AVL_NEXT(&sdp->sd_snaps, sep);
+	}
+
+	if (sep != NULL) {
+		VN_HOLD(vp);
+		/* cross into the mounted snapshot filesystem */
+		error = traverse(&vp);
+		if (error == 0)
+			*zfsvfsp = VTOZ(vp)->z_zfsvfs;
+		VN_RELE(vp);
+	} else {
+		error = EINVAL;
+	}
+
+	mutex_exit(&sdp->sd_lock);
+	VN_RELE(dvp);
+
+	return (error);
+}
+
+/*
+ * Unmount any snapshots for the given filesystem. This is called from
+ * zfs_umount() - if we have a ctldir, then go through and unmount all the
+ * snapshots.  On the first failure we return that error with some
+ * snapshots possibly still mounted.
+ */
+int
+zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	vnode_t *dvp, *svp;
+	zfsctl_snapdir_t *sdp;
+	zfs_snapentry_t *sep, *next;
+	int error;
+
+	ASSERT(zfsvfs->z_ctldir != NULL);
+	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
+	    NULL, 0, NULL, cr);
+	if (error != 0)
+		return (error);
+	sdp = dvp->v_data;
+
+	mutex_enter(&sdp->sd_lock);
+
+	sep = avl_first(&sdp->sd_snaps);
+	while (sep != NULL) {
+		svp = sep->se_root;
+		/* grab the successor before we free sep below */
+		next = AVL_NEXT(&sdp->sd_snaps, sep);
+
+		/*
+		 * If this snapshot is not mounted, then it must
+		 * have just been unmounted by somebody else, and
+		 * will be cleaned up by zfsctl_snapdir_inactive().
+		 */
+		if (vn_ismntpt(svp)) {
+			/* vfswlock is dropped by dounmount() */
+			if ((error = vn_vfswlock(svp)) != 0)
+				goto out;
+
+			VN_HOLD(svp);
+			error = dounmount(vn_mountedvfs(svp), fflags, cr);
+			if (error) {
+				VN_RELE(svp);
+				goto out;
+			}
+
+			avl_remove(&sdp->sd_snaps, sep);
+			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+			kmem_free(sep, sizeof (zfs_snapentry_t));
+
+			/*
+			 * We can't use VN_RELE(), as that will try to
+			 * invoke zfsctl_snapdir_inactive(), and that
+			 * would lead to an attempt to re-grab the sd_lock.
+			 */
+			ASSERT3U(svp->v_count, ==, 1);
+			gfs_vop_inactive(svp, cr);
+		}
+		sep = next;
+	}
+out:
+	mutex_exit(&sdp->sd_lock);
+	VN_RELE(dvp);
+
+	return (error);
+}
diff --git a/usr/src/uts/common/fs/zfs/zfs_dir.c b/usr/src/uts/common/fs/zfs/zfs_dir.c
new file mode 100644
index 0000000000..6df89ad0c4
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_dir.c
@@ -0,0 +1,853 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/mode.h>
+#include <sys/kmem.h>
+#include <sys/uio.h>
+#include <sys/pathname.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/unistd.h>
+#include <sys/random.h>
+#include <sys/policy.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/fs/zfs.h>
+#include "fs/fs_subr.h"
+#include <sys/zap.h>
+#include <sys/dmu.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ctldir.h>
+
+/*
+ * Lock a directory entry. A dirlock on <dzp, name> protects that name
+ * in dzp's directory zap object. As long as you hold a dirlock, you can
+ * assume two things: (1) dzp cannot be reaped, and (2) no other thread
+ * can change the zap entry for (i.e. link or unlink) this name.
+ *
+ * Input arguments:
+ * dzp - znode for directory
+ * name - name of entry to lock
+ * flag - ZNEW: if the entry already exists, fail with EEXIST.
+ * ZEXISTS: if the entry does not exist, fail with ENOENT.
+ * ZSHARED: allow concurrent access with other ZSHARED callers.
+ * ZXATTR: we want dzp's xattr directory
+ *
+ * Output arguments:
+ * zpp - pointer to the znode for the entry (NULL if there isn't one)
+ * dlpp - pointer to the dirlock for this entry (NULL on error)
+ *
+ * Return value: 0 on success or errno on failure.
+ *
+ * NOTE: Always checks for, and rejects, '.' and '..'.
+ */
+int
+zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
+ int flag)
+{
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zfs_dirlock_t *dl;
+ uint64_t zoid;
+ int error;
+
+ *zpp = NULL;
+ *dlpp = NULL;
+
+ /*
+ * Verify that we are not trying to lock '.', '..', or '.zfs'
+ *
+ * NOTE(review): this expression relies on '&&' binding tighter
+ * than '||' (gcc -Wparentheses flags it); explicit parens would
+ * be clearer. Also note it returns EEXIST for these names even
+ * when the caller passed ZEXISTS (a lookup) - callers appear to
+ * tolerate this, but confirm before relying on the errno.
+ */
+ if (name[0] == '.' &&
+ (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) ||
+ zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)
+ return (EEXIST);
+
+ /*
+ * Wait until there are no locks on this name.
+ */
+ mutex_enter(&dzp->z_lock);
+ for (;;) {
+ /* Directory already marked for reaping: no new entries. */
+ if (dzp->z_reap) {
+ mutex_exit(&dzp->z_lock);
+ return (ENOENT);
+ }
+ for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next)
+ if (strcmp(name, dl->dl_name) == 0)
+ break;
+ if (dl == NULL) {
+ /*
+ * Allocate a new dirlock and add it to the list.
+ */
+ dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
+ cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
+ /*
+ * dl_name aliases the caller's buffer until a second
+ * ZSHARED holder forces a private copy (below), so
+ * the caller's name must stay valid while locked.
+ */
+ dl->dl_name = name;
+ dl->dl_sharecnt = 0;
+ dl->dl_namesize = 0;
+ dl->dl_dzp = dzp;
+ dl->dl_next = dzp->z_dirlocks;
+ dzp->z_dirlocks = dl;
+ break;
+ }
+ if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
+ break;
+ cv_wait(&dl->dl_cv, &dzp->z_lock);
+ }
+
+ if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
+ /*
+ * We're the second shared reference to dl. Make a copy of
+ * dl_name in case the first thread goes away before we do.
+ * Note that we initialize the new name before storing its
+ * pointer into dl_name, because the first thread may load
+ * dl->dl_name at any time. He'll either see the old value,
+ * which is his, or the new shared copy; either is OK.
+ */
+ dl->dl_namesize = strlen(dl->dl_name) + 1;
+ name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
+ bcopy(dl->dl_name, name, dl->dl_namesize);
+ dl->dl_name = name;
+ }
+
+ mutex_exit(&dzp->z_lock);
+
+ /*
+ * We have a dirlock on the name. (Note that it is the dirlock,
+ * not the dzp's z_lock, that protects the name in the zap object.)
+ * See if there's an object by this name; if so, put a hold on it.
+ */
+ if (flag & ZXATTR) {
+ zoid = dzp->z_phys->zp_xattr;
+ error = (zoid == 0 ? ENOENT : 0);
+ } else {
+ /* zap value is one 8-byte integer: the entry's object id */
+ error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, &zoid);
+ }
+ if (error) {
+ if (error != ENOENT || (flag & ZEXISTS)) {
+ zfs_dirent_unlock(dl);
+ return (error);
+ }
+ } else {
+ if (flag & ZNEW) {
+ zfs_dirent_unlock(dl);
+ return (EEXIST);
+ }
+ error = zfs_zget(zfsvfs, zoid, zpp);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ return (error);
+ }
+ }
+
+ *dlpp = dl;
+
+ return (0);
+}
+
+/*
+ * Unlock this directory entry and wake anyone who was waiting for it.
+ */
+void
+zfs_dirent_unlock(zfs_dirlock_t *dl)
+{
+ znode_t *dzp = dl->dl_dzp;
+ zfs_dirlock_t **prev_dl, *cur_dl;
+
+ mutex_enter(&dzp->z_lock);
+ /* Fast path: other ZSHARED holders remain; just drop our reference. */
+ if (dl->dl_sharecnt > 1) {
+ dl->dl_sharecnt--;
+ mutex_exit(&dzp->z_lock);
+ return;
+ }
+ /* Last holder: unlink dl from dzp's singly-linked dirlock list. */
+ prev_dl = &dzp->z_dirlocks;
+ while ((cur_dl = *prev_dl) != dl)
+ prev_dl = &cur_dl->dl_next;
+ *prev_dl = dl->dl_next;
+ cv_broadcast(&dl->dl_cv);
+ mutex_exit(&dzp->z_lock);
+
+ /* Free outside z_lock; dl_namesize != 0 means we own a name copy. */
+ if (dl->dl_namesize != 0)
+ kmem_free(dl->dl_name, dl->dl_namesize);
+ cv_destroy(&dl->dl_cv);
+ kmem_free(dl, sizeof (*dl));
+}
+
+/*
+ * Look up an entry in a directory.
+ *
+ * NOTE: '.' and '..' are handled as special cases because
+ * no directory entries are actually stored for them. If this is
+ * the root of a filesystem, then '.zfs' is also treated as a
+ * special pseudo-directory.
+ */
+int
+zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp)
+{
+ zfs_dirlock_t *dl;
+ znode_t *zp;
+ int error = 0;
+
+ /* "" and "." both resolve to the directory itself. */
+ if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+ *vpp = ZTOV(dzp);
+ VN_HOLD(*vpp);
+ } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ /*
+ * If we are a snapshot mounted under .zfs, return
+ * the vp for the snapshot directory.
+ */
+ if (zfsvfs->z_parent != zfsvfs) {
+ /* kcred: internal lookup, not on behalf of a user */
+ error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
+ "snapshot", vpp, NULL, 0, NULL, kcred);
+ return (error);
+ }
+ /* z_parent_lock keeps zp_parent stable across a rename. */
+ rw_enter(&dzp->z_parent_lock, RW_READER);
+ error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp);
+ if (error == 0)
+ *vpp = ZTOV(zp);
+ rw_exit(&dzp->z_parent_lock);
+ } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
+ *vpp = zfsctl_root(dzp);
+ } else {
+ /* Ordinary entry: take a shared dirlock while we zget it. */
+ error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS | ZSHARED);
+ if (error == 0) {
+ *vpp = ZTOV(zp);
+ zfs_dirent_unlock(dl);
+ }
+ }
+
+ return (error);
+}
+
+/*
+ * Format 'x' as a minimal lowercase hex string, built backwards into the
+ * caller-supplied 17-byte buffer (16 digits + NUL). Returns a pointer to
+ * the first digit *within* namebuf - not necessarily namebuf itself.
+ */
+static char *
+zfs_dq_hexname(char namebuf[17], uint64_t x)
+{
+ char *name = &namebuf[16];
+ const char digits[16] = "0123456789abcdef";
+
+ *name = '\0';
+ do {
+ *--name = digits[x & 0xf];
+ x >>= 4;
+ } while (x != 0);
+
+ return (name);
+}
+
+/*
+ * Add zp to the delete queue ZAP, keyed by the hex form of its object id.
+ * Caller must already have marked the znode for reaping (z_reap set,
+ * link count zero) and must hold an assigned transaction.
+ */
+void
+zfs_dq_add(znode_t *zp, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ char obj_name[17];
+ int error;
+
+ ASSERT(zp->z_reap);
+ ASSERT3U(zp->z_phys->zp_links, ==, 0);
+
+ error = zap_add(zfsvfs->z_os, zfsvfs->z_dqueue,
+ zfs_dq_hexname(obj_name, zp->z_id), 8, 1, &zp->z_id, tx);
+ /* Can't fail: the caller reserved the zap space in the tx. */
+ ASSERT3U(error, ==, 0);
+}
+
+/*
+ * Delete the entire contents of a directory. Return a count
+ * of the number of entries that could not be deleted.
+ *
+ * NOTE: this function assumes that the directory is inactive,
+ * so there is no need to lock its entries before deletion.
+ * Also, it assumes the directory contents is *only* regular
+ * files.
+ */
+static int
+zfs_purgedir(znode_t *dzp)
+{
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ znode_t *xzp;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zfs_dirlock_t dl;
+ int skipped = 0;
+ int error;
+
+ ASSERT(dzp->z_active == 0);
+
+ for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
+ (error = zap_cursor_retrieve(&zc, &zap)) == 0;
+ zap_cursor_advance(&zc)) {
+ error = zfs_zget(zfsvfs, zap.za_first_integer, &xzp);
+ ASSERT3U(error, ==, 0);
+
+ ASSERT((ZTOV(xzp)->v_type == VREG) ||
+ (ZTOV(xzp)->v_type == VLNK));
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, dzp->z_id);
+ dmu_tx_hold_zap(tx, dzp->z_id, -1);
+ dmu_tx_hold_bonus(tx, xzp->z_id);
+ dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ /* Couldn't assign a tx; leave this entry for a retry. */
+ dmu_tx_abort(tx);
+ VN_RELE(ZTOV(xzp));
+ skipped += 1;
+ continue;
+ }
+ /*
+ * Fake a stack dirlock: the directory is inactive, so no
+ * one can race us, and zfs_link_destroy() only reads
+ * dl_dzp and dl_name.
+ */
+ bzero(&dl, sizeof (dl));
+ dl.dl_dzp = dzp;
+ dl.dl_name = zap.za_name;
+
+ error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
+ ASSERT3U(error, ==, 0);
+ dmu_tx_commit(tx);
+
+ VN_RELE(ZTOV(xzp));
+ }
+ /* Loop only exits when the cursor is exhausted. */
+ ASSERT(error == ENOENT);
+ return (skipped);
+}
+
+/*
+ * Special function to requeue the znodes for deletion that were
+ * in progress when we either crashed or umounted the file system.
+ */
+static void
+zfs_drain_dq(zfsvfs_t *zfsvfs)
+{
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ dmu_object_info_t doi;
+ znode_t *zp;
+ int error;
+
+ /*
+ * Interate over the contents of the delete queue.
+ */
+ for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_dqueue);
+ zap_cursor_retrieve(&zc, &zap) == 0;
+ zap_cursor_advance(&zc)) {
+
+ /*
+ * Need some helpers?
+ */
+ if (zfs_delete_thread_target(zfsvfs, -1) != 0)
+ return;
+
+ /*
+ * See what kind of object we have in queue
+ */
+
+ error = dmu_object_info(zfsvfs->z_os,
+ zap.za_first_integer, &doi);
+ if (error != 0)
+ continue;
+
+ ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
+ (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
+ /*
+ * We need to re-mark these queue entries for reaping,
+ * so we pull them back into core and set zp->z_reap.
+ */
+ error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
+
+ /*
+ * We may pick up znodes that are already marked for reaping.
+ * This could happen during the purge of an extended attribute
+ * directory. All we need to do is skip over them, since they
+ * are already in the system to be processed by the taskq.
+ */
+ if (error != 0) {
+ continue;
+ }
+ zp->z_reap = 1;
+ /* VN_RELE on a reaped znode kicks off its actual deletion. */
+ VN_RELE(ZTOV(zp));
+ /*
+ * NOTE(review): this 'break' stops the drain after the
+ * first successfully re-marked entry, leaving the rest of
+ * the queue unprocessed on this pass - confirm whether
+ * that is intentional (e.g. deletion of this entry is
+ * expected to re-trigger the drain) or a latent bug.
+ */
+ break;
+ }
+}
+
+/*
+ * Body of a delete worker thread: optionally drain the on-disk delete
+ * queue once per filesystem, then loop pulling znodes off the in-core
+ * list and reaping them until the thread target drops below our count.
+ * Runs with zd->z_mutex held except while doing actual work.
+ */
+void
+zfs_delete_thread(void *arg)
+{
+ zfsvfs_t *zfsvfs = arg;
+ zfs_delete_t *zd = &zfsvfs->z_delete_head;
+ znode_t *zp;
+ callb_cpr_t cprinfo;
+
+ /* Register with CPR (suspend/resume) so cv_wait below is safe. */
+ CALLB_CPR_INIT(&cprinfo, &zd->z_mutex, callb_generic_cpr, "zfs_delete");
+
+ mutex_enter(&zd->z_mutex);
+
+ /* First thread in performs the one-time drain of the on-disk queue. */
+ if (!zd->z_drained && !zd->z_draining) {
+ zd->z_draining = B_TRUE;
+ mutex_exit(&zd->z_mutex);
+ zfs_drain_dq(zfsvfs);
+ mutex_enter(&zd->z_mutex);
+ zd->z_draining = B_FALSE;
+ zd->z_drained = B_TRUE;
+ cv_broadcast(&zd->z_quiesce_cv);
+ }
+
+ while (zd->z_thread_count <= zd->z_thread_target) {
+ zp = list_head(&zd->z_znodes);
+ if (zp == NULL) {
+ ASSERT(zd->z_znode_count == 0);
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ cv_wait(&zd->z_cv, &zd->z_mutex);
+ CALLB_CPR_SAFE_END(&cprinfo, &zd->z_mutex);
+ continue;
+ }
+ ASSERT(zd->z_znode_count != 0);
+ list_remove(&zd->z_znodes, zp);
+ if (--zd->z_znode_count == 0)
+ cv_broadcast(&zd->z_quiesce_cv);
+ /* Drop the mutex for the expensive part. */
+ mutex_exit(&zd->z_mutex);
+ zfs_rmnode(zp);
+ /* Re-evaluate how many worker threads the workload needs. */
+ (void) zfs_delete_thread_target(zfsvfs, -1);
+ mutex_enter(&zd->z_mutex);
+ }
+
+ /* Last thread out wakes anyone waiting in zfs_delete_thread_target(0). */
+ ASSERT(zd->z_thread_count != 0);
+ if (--zd->z_thread_count == 0)
+ cv_broadcast(&zd->z_cv);
+
+ CALLB_CPR_EXIT(&cprinfo); /* NB: drops z_mutex */
+ thread_exit();
+}
+
+/* Tunable: one delete thread per 2^11 (2048) queued znodes. */
+static int zfs_work_per_thread_shift = 11; /* 2048 (2^11) per thread */
+
+/*
+ * Set the target number of delete threads to 'nthreads'.
+ * If nthreads == -1, choose a number based on current workload.
+ * If nthreads == 0, don't return until the threads have exited.
+ */
+int
+zfs_delete_thread_target(zfsvfs_t *zfsvfs, int nthreads)
+{
+ zfs_delete_t *zd = &zfsvfs->z_delete_head;
+
+ mutex_enter(&zd->z_mutex);
+
+ if (nthreads == -1) {
+ /* Target already forced to 0 (e.g. unmounting): refuse. */
+ if (zd->z_thread_target == 0) {
+ mutex_exit(&zd->z_mutex);
+ return (EBUSY);
+ }
+ /* Scale with queue depth, clamp to [1, 2*ncpus]. */
+ nthreads = zd->z_znode_count >> zfs_work_per_thread_shift;
+ nthreads = MIN(nthreads, ncpus << 1);
+ nthreads = MAX(nthreads, 1);
+ /* One extra thread while the on-disk queue is draining. */
+ nthreads += !!zd->z_draining;
+ }
+
+ zd->z_thread_target = nthreads;
+
+ /* Spin up workers until the count meets the target. */
+ while (zd->z_thread_count < zd->z_thread_target) {
+ (void) thread_create(NULL, 0, zfs_delete_thread, zfsvfs,
+ 0, &p0, TS_RUN, minclsyspri);
+ zd->z_thread_count++;
+ }
+
+ /* nthreads == 0: wait here until every worker has exited. */
+ while (zd->z_thread_count > zd->z_thread_target && nthreads == 0) {
+ cv_broadcast(&zd->z_cv);
+ cv_wait(&zd->z_cv, &zd->z_mutex);
+ }
+
+ mutex_exit(&zd->z_mutex);
+
+ return (0);
+}
+
+/*
+ * Wait until everything that's been queued has been deleted.
+ * Blocks until the initial drain has completed and the in-core
+ * list of znodes to reap is empty.
+ */
+void
+zfs_delete_wait_empty(zfsvfs_t *zfsvfs)
+{
+ zfs_delete_t *zd = &zfsvfs->z_delete_head;
+
+ mutex_enter(&zd->z_mutex);
+ /* Workers must exist, or nothing would ever wake us. */
+ ASSERT(zd->z_thread_target != 0);
+ while (!zd->z_drained || zd->z_znode_count != 0) {
+ ASSERT(zd->z_thread_target != 0);
+ cv_wait(&zd->z_quiesce_cv, &zd->z_mutex);
+ }
+ mutex_exit(&zd->z_mutex);
+}
+
+/*
+ * Physically reclaim a reaped znode: purge an xattr directory's
+ * contents, queue any xattr dir for deletion, remove the znode from
+ * the delete queue and free its object(s) - all in one transaction.
+ * On tx failure the znode is requeued to the in-core delete list.
+ */
+void
+zfs_rmnode(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os = zfsvfs->z_os;
+ znode_t *xzp = NULL;
+ char obj_name[17];
+ dmu_tx_t *tx;
+ uint64_t acl_obj;
+ int error;
+
+ ASSERT(zp->z_active == 0);
+ ASSERT(ZTOV(zp)->v_count == 0);
+ ASSERT(zp->z_phys->zp_links == 0);
+
+ /*
+ * If this is an attribute directory, purge its contents.
+ */
+ if (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_XATTR))
+ if (zfs_purgedir(zp) != 0) {
+ zfs_delete_t *delq = &zfsvfs->z_delete_head;
+ /*
+ * Add this back to the delete list to be retried later.
+ *
+ * XXX - this could just busy loop on us...
+ */
+ mutex_enter(&delq->z_mutex);
+ list_insert_tail(&delq->z_znodes, zp);
+ delq->z_znode_count++;
+ mutex_exit(&delq->z_mutex);
+ return;
+ }
+
+ /*
+ * If the file has extended attributes, unlink the xattr dir.
+ */
+ if (zp->z_phys->zp_xattr) {
+ error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
+ ASSERT(error == 0);
+ }
+
+ /* An externally-stored ACL object must be freed along with zp. */
+ acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
+
+ /*
+ * Set up the transaction.
+ */
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
+ dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, -1);
+ if (xzp) {
+ dmu_tx_hold_bonus(tx, xzp->z_id);
+ dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1);
+ }
+ if (acl_obj)
+ dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ zfs_delete_t *delq = &zfsvfs->z_delete_head;
+
+ dmu_tx_abort(tx);
+ /*
+ * Add this back to the delete list to be retried later.
+ *
+ * XXX - this could just busy loop on us...
+ */
+ mutex_enter(&delq->z_mutex);
+ list_insert_tail(&delq->z_znodes, zp);
+ delq->z_znode_count++;
+ mutex_exit(&delq->z_mutex);
+ return;
+ }
+
+ if (xzp) {
+ dmu_buf_will_dirty(xzp->z_dbuf, tx);
+ mutex_enter(&xzp->z_lock);
+ xzp->z_reap = 1; /* mark xzp for deletion */
+ xzp->z_phys->zp_links = 0; /* no more links to it */
+ mutex_exit(&xzp->z_lock);
+ zfs_dq_add(xzp, tx); /* add xzp to delete queue */
+ }
+
+ /*
+ * Remove this znode from delete queue
+ */
+ error = zap_remove(os, zfsvfs->z_dqueue,
+ zfs_dq_hexname(obj_name, zp->z_id), tx);
+ ASSERT3U(error, ==, 0);
+
+ zfs_znode_delete(zp, tx);
+
+ dmu_tx_commit(tx);
+
+ /* Release our zget hold; this kicks off xzp's own deletion. */
+ if (xzp)
+ VN_RELE(ZTOV(xzp));
+}
+
+/*
+ * Link zp into dl. Can only fail if zp has been reaped.
+ *
+ * Caller holds the dirlock and an assigned tx covering zp's and
+ * dzp's bonus buffers plus the directory zap. ZRENAMING skips the
+ * link-count bump since a rename moves an existing link.
+ */
+int
+zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
+{
+ znode_t *dzp = dl->dl_dzp;
+ vnode_t *vp = ZTOV(zp);
+ int zp_is_dir = (vp->v_type == VDIR);
+ int error;
+
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+ mutex_enter(&zp->z_lock);
+
+ if (!(flag & ZRENAMING)) {
+ if (zp->z_reap) { /* no new links to reaped zp */
+ ASSERT(!(flag & (ZNEW | ZEXISTS)));
+ mutex_exit(&zp->z_lock);
+ return (ENOENT);
+ }
+ zp->z_phys->zp_links++;
+ }
+ zp->z_phys->zp_parent = dzp->z_id; /* dzp is now zp's parent */
+
+ /* A brand-new znode's ctime was just set at creation; skip it. */
+ if (!(flag & ZNEW))
+ zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+ mutex_exit(&zp->z_lock);
+
+ dmu_buf_will_dirty(dzp->z_dbuf, tx);
+ mutex_enter(&dzp->z_lock);
+ dzp->z_phys->zp_size++; /* one dirent added */
+ dzp->z_phys->zp_links += zp_is_dir; /* ".." link from zp */
+ zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
+ mutex_exit(&dzp->z_lock);
+
+ /* Insert name -> object-id (one 8-byte int) into the directory zap. */
+ error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name,
+ 8, 1, &zp->z_id, tx);
+ ASSERT(error == 0);
+
+ return (0);
+}
+
+/*
+ * Unlink zp from dl, and mark zp for reaping if this was the last link.
+ * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
+ * If 'reaped_ptr' is NULL, we put reaped znodes on the delete queue.
+ * If it's non-NULL, we use it to indicate whether the znode needs reaping,
+ * and it's the caller's job to do it.
+ */
+int
+zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
+ int *reaped_ptr)
+{
+ znode_t *dzp = dl->dl_dzp;
+ vnode_t *vp = ZTOV(zp);
+ int zp_is_dir = (vp->v_type == VDIR);
+ int reaped = 0;
+ int error;
+
+ if (!(flag & ZRENAMING)) {
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+
+ if (vn_vfswlock(vp)) /* prevent new mounts on zp */
+ return (EBUSY);
+
+ if (vn_ismntpt(vp)) { /* don't remove mount point */
+ vn_vfsunlock(vp);
+ return (EBUSY);
+ }
+
+ mutex_enter(&zp->z_lock);
+ if (zp_is_dir && !zfs_dirempty(zp)) { /* dir not empty */
+ mutex_exit(&zp->z_lock);
+ vn_vfsunlock(vp);
+ return (EEXIST);
+ }
+ /* A dir's floor is zp_is_dir (its own "." self-link). */
+ ASSERT(zp->z_phys->zp_links > zp_is_dir);
+ if (--zp->z_phys->zp_links == zp_is_dir) {
+ /* Last link gone: mark for reaping, zero the count. */
+ zp->z_reap = 1;
+ zp->z_phys->zp_links = 0;
+ reaped = 1;
+ } else {
+ zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+ }
+ mutex_exit(&zp->z_lock);
+ vn_vfsunlock(vp);
+ }
+
+ dmu_buf_will_dirty(dzp->z_dbuf, tx);
+ mutex_enter(&dzp->z_lock);
+ dzp->z_phys->zp_size--; /* one dirent removed */
+ dzp->z_phys->zp_links -= zp_is_dir; /* ".." link from zp */
+ zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
+ mutex_exit(&dzp->z_lock);
+
+ error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, tx);
+ ASSERT(error == 0);
+
+ if (reaped_ptr != NULL)
+ *reaped_ptr = reaped;
+ else if (reaped)
+ zfs_dq_add(zp, tx);
+
+ return (0);
+}
+
+/*
+ * Indicate whether the directory is empty. Works with or without z_lock
+ * held, but can only be consider a hint in the latter case. Returns true
+ * if only "." and ".." remain and there's no work in progress.
+ */
+boolean_t
+zfs_dirempty(znode_t *dzp)
+{
+ /* zp_size == 2 is the empty-dir baseline; no dirlocks = no work. */
+ return (dzp->z_phys->zp_size == 2 && dzp->z_dirlocks == 0);
+}
+
+/*
+ * Create the extended attribute directory for zp with attributes 'vap'
+ * and return its held vnode in *xvpp. Requires ACE_WRITE_NAMED_ATTRS
+ * on zp. May return ERESTART from dmu_tx_assign() when z_assign is
+ * TXG_NOWAIT; the caller (zfs_get_xattrdir) retries in that case.
+ */
+int
+zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ znode_t *xzp;
+ dmu_tx_t *tx;
+ uint64_t xoid;
+ int error;
+
+ *xvpp = NULL;
+
+ if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, cr))
+ return (error);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 0);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ dmu_tx_abort(tx);
+ return (error);
+ }
+ zfs_mknode(zp, vap, &xoid, tx, cr, IS_XATTR, &xzp, 0);
+ ASSERT(xzp->z_id == xoid);
+ ASSERT(xzp->z_phys->zp_parent == zp->z_id);
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+ /* Wire the new dir into zp as its xattr directory. */
+ zp->z_phys->zp_xattr = xoid;
+
+ /* Log return value deliberately ignored (best-effort intent log). */
+ (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "");
+ dmu_tx_commit(tx);
+
+ *xvpp = ZTOV(xzp);
+
+ return (0);
+}
+
+/*
+ * Return a znode for the extended attribute directory for zp.
+ * ** If the directory does not already exist, it is created **
+ *
+ * IN: zp - znode to obtain attribute directory from
+ * cr - credentials of caller
+ *
+ * OUT: xzpp - pointer to extended attribute znode
+ *
+ * RETURN: 0 on success
+ * error number on failure
+ */
+int
+zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ znode_t *xzp;
+ zfs_dirlock_t *dl;
+ vattr_t va;
+ int error;
+top:
+ /* ZXATTR: lock zp's xattr-dir slot rather than a named entry. */
+ error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR);
+ if (error)
+ return (error);
+
+ /* Fast path: xattr dir already exists. */
+ if (xzp != NULL) {
+ *xvpp = ZTOV(xzp);
+ zfs_dirent_unlock(dl);
+ return (0);
+ }
+
+ ASSERT(zp->z_phys->zp_xattr == 0);
+
+ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+ zfs_dirent_unlock(dl);
+ return (EROFS);
+ }
+
+ /*
+ * The ability to 'create' files in an attribute
+ * directory comes from the write_xattr permission on the base file.
+ *
+ * The ability to 'search' an attribute directory requires
+ * read_xattr permission on the base file.
+ *
+ * Once in a directory the ability to read/write attributes
+ * is controlled by the permissions on the attribute file.
+ */
+ va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID;
+ va.va_type = VDIR;
+ va.va_mode = S_IFDIR | 0755;
+ va.va_uid = (uid_t)zp->z_phys->zp_uid;
+ va.va_gid = (gid_t)zp->z_phys->zp_gid;
+
+ error = zfs_make_xattrdir(zp, &va, xvpp, cr);
+ zfs_dirent_unlock(dl);
+
+ /* TXG_NOWAIT assign lost the race; wait for the next txg and retry. */
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+ goto top;
+ }
+
+ return (error);
+}
+
+/*
+ * Decide whether it is okay to remove within a sticky directory.
+ *
+ * In sticky directories, write access is not sufficient;
+ * you can remove entries from a directory only if:
+ *
+ * you own the directory,
+ * you own the entry,
+ * the entry is a plain file and you have write access,
+ * or you are privileged (checked in secpolicy...).
+ *
+ * The function returns 0 if remove access is granted.
+ */
+int
+zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
+{
+ uid_t uid;
+
+ /* During ZIL replay the original access check already passed. */
+ if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL) /* ZIL replay */
+ return (0);
+
+ if ((zdp->z_phys->zp_mode & S_ISVTX) == 0 ||
+ (uid = crgetuid(cr)) == zdp->z_phys->zp_uid ||
+ uid == zp->z_phys->zp_uid ||
+ (ZTOV(zp)->v_type == VREG &&
+ zfs_zaccess(zp, ACE_WRITE_DATA, cr) == 0))
+ return (0);
+ else
+ return (secpolicy_vnode_remove(cr));
+}
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
new file mode 100644
index 0000000000..e8723ffe89
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -0,0 +1,1323 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/buf.h>
+#include <sys/modctl.h>
+#include <sys/open.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/vdev.h>
+#include <sys/dmu.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <sys/policy.h>
+#include <sys/zone.h>
+#include <sys/nvpair.h>
+#include <sys/pathname.h>
+#include <sys/mount.h>
+#include <sys/sdt.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ctldir.h>
+
+#include "zfs_namecheck.h"
+
+extern struct modlfs zfs_modlfs;
+
+extern void zfs_init(void);
+extern void zfs_fini(void);
+
+/* LDI identity and devinfo for the zfs pseudo-device. */
+ldi_ident_t zfs_li = NULL;
+dev_info_t *zfs_dip;
+
+/* Handler for one ZFS_IOC_* command. */
+typedef int zfs_ioc_func_t(zfs_cmd_t *);
+/* Security check: (dataset name, property name, caller's cred). */
+typedef int zfs_secpolicy_func_t(const char *, const char *, cred_t *);
+
+/*
+ * One entry of the ioctl dispatch table: the handler, its security
+ * policy, and what kind of name (if any) zc_name must validate as.
+ */
+typedef struct zfs_ioc_vec {
+ zfs_ioc_func_t *zvec_func;
+ zfs_secpolicy_func_t *zvec_secpolicy;
+ enum {
+ no_name, /* zc_name unused */
+ pool_name, /* zc_name must be a valid pool name */
+ dataset_name /* zc_name must be a valid dataset name */
+ } zvec_namecheck;
+} zfs_ioc_vec_t;
+
+/* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */
+/*
+ * Back end for the dprintf() debug macro: format the message into a
+ * fixed 256-byte buffer (vsnprintf truncates longer messages) and fire
+ * the zfs-dprintf DTrace probe with file/func/line/message.
+ */
+void
+__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
+{
+ const char *newfile;
+ char buf[256];
+ va_list adx;
+
+ /*
+ * Get rid of annoying "../common/" prefix to filename.
+ */
+ newfile = strrchr(file, '/');
+ if (newfile != NULL) {
+ newfile = newfile + 1; /* Get rid of leading / */
+ } else {
+ newfile = file;
+ }
+
+ va_start(adx, fmt);
+ (void) vsnprintf(buf, sizeof (buf), fmt, adx);
+ va_end(adx);
+
+ /*
+ * To get this data, use the zfs-dprintf probe as so:
+ * dtrace -q -n 'zfs-dprintf \
+ * /stringof(arg0) == "dbuf.c"/ \
+ * {printf("%s: %s", stringof(arg1), stringof(arg3))}'
+ * arg0 = file name
+ * arg1 = function name
+ * arg2 = line number
+ * arg3 = message
+ */
+ DTRACE_PROBE4(zfs__dprintf,
+ char *, newfile, char *, func, int, line, char *, buf);
+}
+
+/*
+ * Policy for top-level read operations (list pools). Requires no privileges,
+ * and can be used in the local zone, as there is no associated dataset.
+ * Always grants access.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_none(const char *unused1, const char *unused2, cred_t *cr)
+{
+ return (0);
+}
+
+/*
+ * Policy for dataset read operations (list children, get statistics). Requires
+ * no privileges, but must be visible in the local zone.
+ * Returns ENOENT (not EPERM) for invisible datasets so callers can't
+ * distinguish "hidden" from "nonexistent".
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_read(const char *dataset, const char *unused, cred_t *cr)
+{
+ if (INGLOBALZONE(curproc) ||
+ zone_dataset_visible(dataset, NULL))
+ return (0);
+
+ return (ENOENT);
+}
+
+/*
+ * Common zone check for write-type policies: verify the dataset is
+ * visible (and writable) from the caller's zone and that its 'zoned'
+ * property is consistent with where the caller is running.
+ * Returns 0 if the zone check passes, else ENOENT/EPERM.
+ */
+static int
+zfs_dozonecheck(const char *dataset, cred_t *cr)
+{
+ uint64_t zoned;
+ int writable = 1;
+
+ /*
+ * The dataset must be visible by this zone -- check this first
+ * so they don't see EPERM on something they shouldn't know about.
+ */
+ if (!INGLOBALZONE(curproc) &&
+ !zone_dataset_visible(dataset, &writable))
+ return (ENOENT);
+
+ if (dsl_prop_get_integer(dataset, "zoned", &zoned, NULL))
+ return (ENOENT);
+
+ if (INGLOBALZONE(curproc)) {
+ /*
+ * If the fs is zoned, only root can access it from the
+ * global zone.
+ * (secpolicy_zfs() returns nonzero when the caller
+ * lacks the privilege.)
+ */
+ if (secpolicy_zfs(cr) && zoned)
+ return (EPERM);
+ } else {
+ /*
+ * If we are in a local zone, the 'zoned' property must be set.
+ */
+ if (!zoned)
+ return (EPERM);
+
+ /* must be writable by this zone */
+ if (!writable)
+ return (EPERM);
+ }
+ return (0);
+}
+
+/*
+ * Policy for dataset write operations (create children, set properties, etc).
+ * Requires SYS_MOUNT privilege, and must be writable in the local zone.
+ */
+/* ARGSUSED */
+int
+zfs_secpolicy_write(const char *dataset, const char *unused, cred_t *cr)
+{
+ int error;
+
+ if (error = zfs_dozonecheck(dataset, cr))
+ return (error);
+
+ return (secpolicy_zfs(cr));
+}
+
+/*
+ * Policy for operations that want to write a dataset's parent:
+ * create, destroy, snapshot, clone, restore.
+ * Strips the trailing "@snap" or "/child" component and applies the
+ * write policy to what remains; ENOENT if there is no parent.
+ */
+static int
+zfs_secpolicy_parent(const char *dataset, const char *unused, cred_t *cr)
+{
+ char parentname[MAXNAMELEN];
+ char *cp;
+
+ /*
+ * Remove the @bla or /bla from the end of the name to get the parent.
+ *
+ * NOTE(review): strncpy does not NUL-terminate when
+ * strlen(dataset) >= MAXNAMELEN - presumably zc_name length is
+ * validated before the secpolicy runs; confirm at the dispatch site.
+ */
+ (void) strncpy(parentname, dataset, sizeof (parentname));
+ cp = strrchr(parentname, '@');
+ if (cp != NULL) {
+ cp[0] = '\0';
+ } else {
+ cp = strrchr(parentname, '/');
+ if (cp == NULL)
+ return (ENOENT);
+ cp[0] = '\0';
+
+ }
+
+ return (zfs_secpolicy_write(parentname, unused, cr));
+}
+
+/*
+ * Policy for setting a dataset property. Same as zfs_secpolicy_write,
+ * except the 'zoned' property itself may only be changed from the
+ * global zone (a local zone could otherwise unzone itself).
+ */
+static int
+zfs_secpolicy_setprop(const char *dataset, const char *prop, cred_t *cr)
+{
+ int error;
+
+ if (error = zfs_dozonecheck(dataset, cr))
+ return (error);
+
+ if (strcmp(prop, "zoned") == 0) {
+ /*
+ * Disallow setting of 'zoned' from within a local zone.
+ */
+ if (!INGLOBALZONE(curproc))
+ return (EPERM);
+ }
+
+ return (secpolicy_zfs(cr));
+}
+
+/*
+ * Security policy for setting the quota. This is the same as
+ * zfs_secpolicy_write, except that the local zone may not change the quota at
+ * the zone-property setpoint.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_quota(const char *dataset, const char *unused, cred_t *cr)
+{
+ int error;
+
+ if (error = zfs_dozonecheck(dataset, cr))
+ return (error);
+
+ if (!INGLOBALZONE(curproc)) {
+ uint64_t zoned;
+ char setpoint[MAXNAMELEN];
+ int dslen;
+ /*
+ * Unprivileged users are allowed to modify the quota
+ * on things *under* (ie. contained by) the thing they
+ * own.
+ */
+ if (dsl_prop_get_integer(dataset, "zoned", &zoned, setpoint))
+ return (EPERM);
+ if (!zoned) /* this shouldn't happen */
+ return (EPERM);
+ dslen = strlen(dataset);
+ /*
+ * dataset must be strictly longer than the setpoint name,
+ * i.e. strictly below where 'zoned' was set.
+ */
+ if (dslen <= strlen(setpoint))
+ return (EPERM);
+ }
+
+ return (secpolicy_zfs(cr));
+}
+
+/*
+ * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires
+ * SYS_CONFIG privilege, which is not available in a local zone.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_config(const char *unused, const char *unused2, cred_t *cr)
+{
+ if (secpolicy_sys_config(cr, B_FALSE) != 0)
+ return (EPERM);
+
+ return (0);
+}
+
+/*
+ * Returns the nvlist as specified by the user in the zfs_cmd_t.
+ * Copies zc_config_src_size bytes from the user address zc_config_src
+ * and unpacks them. On success the caller owns *nvp and must
+ * nvlist_free() it.
+ */
+static int
+get_config(zfs_cmd_t *zc, nvlist_t **nvp)
+{
+ char *packed;
+ size_t size;
+ int error;
+ nvlist_t *config = NULL;
+
+ /*
+ * Read in and unpack the user-supplied nvlist. By this point, we know
+ * that the user has the SYS_CONFIG privilege, so allocating arbitrary
+ * sized regions of memory should not be a problem.
+ */
+ if ((size = zc->zc_config_src_size) == 0)
+ return (EINVAL);
+
+ packed = kmem_alloc(size, KM_SLEEP);
+
+ if ((error = xcopyin((void *)(uintptr_t)zc->zc_config_src, packed,
+ size)) != 0) {
+ kmem_free(packed, size);
+ return (error);
+ }
+
+ if ((error = nvlist_unpack(packed, size, &config, 0)) != 0) {
+ kmem_free(packed, size);
+ return (error);
+ }
+
+ /* The packed buffer is no longer needed once unpacked. */
+ kmem_free(packed, size);
+
+ *nvp = config;
+ return (0);
+}
+
+/*
+ * ZFS_IOC_POOL_CREATE: create a pool named zc_name from the user-supplied
+ * vdev config nvlist; zc_root, if non-empty, is the alternate root.
+ */
+static int
+zfs_ioc_pool_create(zfs_cmd_t *zc)
+{
+ int error;
+ nvlist_t *config;
+
+ if ((error = get_config(zc, &config)) != 0)
+ return (error);
+
+ error = spa_create(zc->zc_name, config, zc->zc_root[0] == '\0' ?
+ NULL : zc->zc_root);
+
+ nvlist_free(config);
+
+ return (error);
+}
+
+/* ZFS_IOC_POOL_DESTROY: destroy the pool named zc_name. */
+static int
+zfs_ioc_pool_destroy(zfs_cmd_t *zc)
+{
+ return (spa_destroy(zc->zc_name));
+}
+
+/*
+ * ZFS_IOC_POOL_IMPORT: import a pool from the user-supplied config.
+ * The config's pool guid must match zc_pool_guid, guarding against a
+ * stale or mismatched config from userland.
+ */
+static int
+zfs_ioc_pool_import(zfs_cmd_t *zc)
+{
+ int error;
+ nvlist_t *config;
+ uint64_t guid;
+
+ if ((error = get_config(zc, &config)) != 0)
+ return (error);
+
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
+ guid != zc->zc_pool_guid)
+ error = EINVAL;
+ else
+ error = spa_import(zc->zc_name, config,
+ zc->zc_root[0] == '\0' ? NULL : zc->zc_root);
+
+ nvlist_free(config);
+
+ return (error);
+}
+
+/* ZFS_IOC_POOL_EXPORT: export the pool named zc_name. */
+static int
+zfs_ioc_pool_export(zfs_cmd_t *zc)
+{
+ return (spa_export(zc->zc_name));
+}
+
+/*
+ * ZFS_IOC_POOL_CONFIGS: pack all pool configs (newer than the cookie in
+ * zc_cookie) into the user's buffer at zc_config_dst. EEXIST signals
+ * "nothing new". On ENOMEM, zc_config_dst_size is still set to the
+ * required size so userland can retry with a larger buffer.
+ */
+static int
+zfs_ioc_pool_configs(zfs_cmd_t *zc)
+{
+ nvlist_t *configs;
+ char *packed = NULL;
+ size_t size = 0;
+ int error;
+
+ if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL)
+ return (EEXIST);
+
+ VERIFY(nvlist_pack(configs, &packed, &size, NV_ENCODE_NATIVE, 0) == 0);
+
+ if (size > zc->zc_config_dst_size)
+ error = ENOMEM;
+ else
+ error = xcopyout(packed, (void *)(uintptr_t)zc->zc_config_dst,
+ size);
+
+ zc->zc_config_dst_size = size;
+
+ kmem_free(packed, size);
+ nvlist_free(configs);
+
+ return (error);
+}
+
+/* ZFS_IOC_POOL_GUID: return the guid of the pool zc_name in zc_pool_guid. */
+static int
+zfs_ioc_pool_guid(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error == 0) {
+ zc->zc_pool_guid = spa_guid(spa);
+ spa_close(spa, FTAG);
+ }
+ return (error);
+}
+
+/*
+ * ZFS_IOC_POOL_STATS: pack the named pool's config/stats nvlist into the
+ * user buffer. spa_get_stats() may hand back a config even on error
+ * (e.g. for a faulted pool), so the copyout happens whenever config is
+ * non-NULL; zc_config_dst_size is always updated for buffer resizing.
+ */
+static int
+zfs_ioc_pool_stats(zfs_cmd_t *zc)
+{
+ nvlist_t *config;
+ char *packed = NULL;
+ size_t size = 0;
+ int error;
+
+ error = spa_get_stats(zc->zc_name, &config);
+
+ if (config != NULL) {
+ VERIFY(nvlist_pack(config, &packed, &size,
+ NV_ENCODE_NATIVE, 0) == 0);
+
+ if (size > zc->zc_config_dst_size)
+ error = ENOMEM;
+ else if (xcopyout(packed, (void *)(uintptr_t)zc->zc_config_dst,
+ size))
+ error = EFAULT;
+
+ zc->zc_config_dst_size = size;
+
+ kmem_free(packed, size);
+ nvlist_free(config);
+ } else {
+ ASSERT(error != 0);
+ }
+
+ return (error);
+}
+
+/*
+ * Try to import the given pool, returning pool stats as appropriate so that
+ * user land knows which devices are available and overall pool health.
+ */
+static int
+zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
+{
+ nvlist_t *tryconfig, *config;
+ char *packed = NULL;
+ size_t size = 0;
+ int error;
+
+ if ((error = get_config(zc, &tryconfig)) != 0) /* unpack candidate config from user land */
+ return (error);
+
+ config = spa_tryimport(tryconfig); /* dry-run import; returns resulting config or NULL */
+
+ nvlist_free(tryconfig);
+
+ if (config == NULL)
+ return (EINVAL);
+
+ VERIFY(nvlist_pack(config, &packed, &size, NV_ENCODE_NATIVE, 0) == 0);
+
+ if (size > zc->zc_config_dst_size) /* caller's buffer too small */
+ error = ENOMEM;
+ else
+ error = xcopyout(packed, (void *)(uintptr_t)zc->zc_config_dst,
+ size);
+
+ zc->zc_config_dst_size = size; /* report actual size for retry */
+
+ kmem_free(packed, size);
+ nvlist_free(config);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_scrub(zfs_cmd_t *zc) /* ioctl: start a scrub; scrub type is in zc_cookie */
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error == 0) {
+ error = spa_scrub(spa, zc->zc_cookie, B_FALSE);
+ spa_close(spa, FTAG);
+ }
+ return (error);
+}
+
+static int
+zfs_ioc_pool_freeze(zfs_cmd_t *zc) /* ioctl: freeze the pool (test facility) */
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error == 0) {
+ spa_freeze(spa);
+ spa_close(spa, FTAG);
+ }
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_add(zfs_cmd_t *zc) /* ioctl: add the vdevs described in the packed config to the pool */
+{
+ spa_t *spa;
+ int error;
+ nvlist_t *config;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ if ((error = get_config(zc, &config)) == 0) {
+ error = spa_vdev_add(spa, config);
+ nvlist_free(config);
+ }
+
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_ioc_vdev_remove(zfs_cmd_t *zc) /* ioctl: vdev removal not yet implemented */
+{
+ return (ENOTSUP);
+}
+
+static int
+zfs_ioc_vdev_online(zfs_cmd_t *zc) /* ioctl: bring the vdev at zc_prop_value (device path) online */
+{
+ spa_t *spa;
+ char *path = zc->zc_prop_value;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+ error = vdev_online(spa, path);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_offline(zfs_cmd_t *zc) /* ioctl: take the vdev at zc_prop_value (device path) offline */
+{
+ spa_t *spa;
+ char *path = zc->zc_prop_value;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+ error = vdev_offline(spa, path);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_attach(zfs_cmd_t *zc) /* ioctl: attach new vdev (packed config) to existing one at path */
+{
+ spa_t *spa;
+ char *path = zc->zc_prop_value;
+ int replacing = zc->zc_cookie; /* nonzero: replace rather than mirror */
+ nvlist_t *config;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ if ((error = get_config(zc, &config)) == 0) {
+ error = spa_vdev_attach(spa, path, config, replacing);
+ nvlist_free(config);
+ }
+
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_detach(zfs_cmd_t *zc) /* ioctl: detach the vdev at zc_prop_value from its mirror */
+{
+ spa_t *spa;
+ char *path = zc->zc_prop_value;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = spa_vdev_detach(spa, path, 0, B_FALSE);
+
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_get_stats(zfs_cmd_t *zc) /* fill zc_zfs_stats with each DSL property's value and setpoint */
+{
+ char *name = zc->zc_name;
+ zfs_stats_t *zs = &zc->zc_zfs_stats;
+ int error;
+
+ bzero(zs, sizeof (zfs_stats_t));
+
+ if ((error = dsl_prop_get_integer(name, "atime", /* first failing lookup aborts the chain */
+ &zs->zs_atime, zs->zs_atime_setpoint)) != 0 ||
+ (error = dsl_prop_get_integer(name, "recordsize",
+ &zs->zs_recordsize, zs->zs_recordsize_setpoint)) != 0 ||
+ (error = dsl_prop_get_integer(name, "readonly",
+ &zs->zs_readonly, zs->zs_readonly_setpoint)) != 0 ||
+ (error = dsl_prop_get_integer(name, "devices",
+ &zs->zs_devices, zs->zs_devices_setpoint)) != 0 ||
+ (error = dsl_prop_get_integer(name, "setuid",
+ &zs->zs_setuid, zs->zs_setuid_setpoint)) != 0 ||
+ (error = dsl_prop_get_integer(name, "exec",
+ &zs->zs_exec, zs->zs_exec_setpoint)) != 0 ||
+ (error = dsl_prop_get_string(name, "mountpoint", zs->zs_mountpoint,
+ sizeof (zs->zs_mountpoint), zs->zs_mountpoint_setpoint)) != 0 ||
+ (error = dsl_prop_get_string(name, "sharenfs", zs->zs_sharenfs,
+ sizeof (zs->zs_sharenfs), zs->zs_sharenfs_setpoint)) != 0 ||
+ (error = dsl_prop_get_integer(name, "aclmode",
+ &zs->zs_acl_mode, zs->zs_acl_mode_setpoint)) != 0 ||
+ (error = dsl_prop_get_integer(name, "snapdir",
+ &zs->zs_snapdir, zs->zs_snapdir_setpoint)) != 0 ||
+ (error = dsl_prop_get_integer(name, "aclinherit",
+ &zs->zs_acl_inherit, zs->zs_acl_inherit_setpoint)) != 0)
+ return (error);
+
+ return (0);
+}
+
+static int
+zfs_ioc_objset_stats(zfs_cmd_t *zc) /* ioctl: fill zc_objset_stats (plus per-type stats) for a dataset */
+{
+ objset_t *os = NULL;
+ int error;
+
+retry:
+ error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &os);
+ if (error != 0) {
+ /*
+ * This is ugly: dmu_objset_open() can return EBUSY if
+ * the objset is held exclusively. Fortunately this hold is
+ * only for a short while, so we retry here.
+ * This avoids user code having to handle EBUSY,
+ * for example for a "zfs list".
+ */
+ if (error == EBUSY) {
+ delay(1); /* back off one tick before retrying */
+ goto retry;
+ }
+ return (error);
+ }
+
+ dmu_objset_stats(os, &zc->zc_objset_stats);
+
+ switch (zc->zc_objset_stats.dds_type) { /* no default: other types return the generic stats only */
+
+ case DMU_OST_ZFS:
+ error = zfs_get_stats(zc);
+ break;
+
+ case DMU_OST_ZVOL:
+ error = zvol_get_stats(zc, os);
+ break;
+ }
+
+ dmu_objset_close(os);
+ return (error);
+}
+
+static int
+zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
+{
+ dsl_dir_t *dd;
+ zap_cursor_t cursor;
+ zap_attribute_t attr;
+ int error;
+ char *p;
+
+ dd = dsl_dir_open(zc->zc_name, FTAG, NULL);
+ if (dd == NULL)
+ return (ESRCH);
+
+ if (dd->dd_phys->dd_child_dir_zapobj == 0) {
+ dsl_dir_close(dd, FTAG);
+ return (ESRCH);
+ }
+
+ p = strrchr(zc->zc_name, '/');
+ if (p == NULL || p[1] != '\0')
+ (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
+ p = zc->zc_name + strlen(zc->zc_name);
+
+ do {
+ zap_cursor_init_serialized(&cursor, dd->dd_pool->dp_meta_objset,
+ dd->dd_phys->dd_child_dir_zapobj, zc->zc_cookie);
+
+ error = zap_cursor_retrieve(&cursor, &attr);
+ if (error == ENOENT)
+ error = ESRCH;
+ if (error != 0) {
+ dsl_dir_close(dd, FTAG);
+ *p = '\0';
+ return (error);
+ }
+
+ (void) strlcpy(p, attr.za_name, sizeof (zc->zc_name) -
+ (p - zc->zc_name));
+
+ zap_cursor_advance(&cursor);
+ zc->zc_cookie = zap_cursor_serialize(&cursor);
+
+ } while (!INGLOBALZONE(curproc) &&
+ !zone_dataset_visible(zc->zc_name, NULL));
+
+ dsl_dir_close(dd, FTAG);
+
+ /*
+ * If it's a hidden dataset, don't try to get stats for it.
+ * User land will skip over it.
+ */
+ if (strchr(zc->zc_name, '$') != NULL)
+ return (0);
+
+ error = zfs_ioc_objset_stats(zc); /* will just fill in the stats */
+ return (error);
+}
+
+static int
+zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) /* ioctl: iterate snapshots of a dataset; ESRCH at end */
+{
+ zap_cursor_t cursor;
+ zap_attribute_t attr;
+ dsl_dataset_t *ds;
+ int error;
+
+retry:
+ error = dsl_dataset_open(zc->zc_name,
+ DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds);
+ if (error) {
+ /*
+ * This is ugly: dsl_dataset_open() can return EBUSY if
+ * the objset is held exclusively. Fortunately this hold is
+ * only for a short while, so we retry here.
+ * This avoids user code having to handle EBUSY,
+ * for example for a "zfs list -s".
+ */
+ if (error == EBUSY) {
+ delay(1); /* back off one tick before retrying */
+ goto retry;
+ }
+ if (error == ENOENT)
+ return (ESRCH); /* normalize "no such dataset" to end-of-list */
+ return (error);
+ }
+
+ /*
+ * If ds_snapnames_zapobj is 0, someone is trying to iterate over
+ * snapshots of a snapshot. In this case, pretend that it has no
+ * snapshots; otherwise zap_cursor_retrieve() will blow up.
+ */
+ if (ds->ds_phys->ds_snapnames_zapobj == 0) {
+ error = ESRCH;
+ goto out;
+ }
+
+ zap_cursor_init_serialized(&cursor,
+ ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_phys->ds_snapnames_zapobj, zc->zc_cookie); /* resume from caller's cookie */
+
+ error = zap_cursor_retrieve(&cursor, &attr);
+ if (error == ENOENT)
+ error = ESRCH; /* normalize end-of-iteration to ESRCH */
+ if (error != 0)
+ goto out;
+
+ if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= /* build "dataset@snap"; reject overflow */
+ sizeof (zc->zc_name) ||
+ strlcat(zc->zc_name, attr.za_name, sizeof (zc->zc_name)) >=
+ sizeof (zc->zc_name)) {
+ error = ENAMETOOLONG;
+ goto out;
+ }
+
+ zap_cursor_advance(&cursor);
+ zc->zc_cookie = zap_cursor_serialize(&cursor); /* hand next position back to user land */
+
+ error = zfs_ioc_objset_stats(zc); /* will just fill in the stats */
+
+out:
+ dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_set_prop(zfs_cmd_t *zc) /* ioctl: set a DSL property (name/size/count/value from zc) */
+{
+ return (dsl_prop_set(zc->zc_name, zc->zc_prop_name,
+ zc->zc_intsz, zc->zc_numints, zc->zc_prop_value));
+}
+
+static int
+zfs_ioc_set_quota(zfs_cmd_t *zc) /* ioctl: set dataset quota; new value in zc_cookie */
+{
+ return (dsl_dir_set_quota(zc->zc_name, zc->zc_cookie));
+}
+
+static int
+zfs_ioc_set_reservation(zfs_cmd_t *zc) /* ioctl: set dataset reservation; new value in zc_cookie */
+{
+ return (dsl_dir_set_reservation(zc->zc_name, zc->zc_cookie));
+}
+
+static int
+zfs_ioc_set_volsize(zfs_cmd_t *zc) /* ioctl: resize a zvol */
+{
+ return (zvol_set_volsize(zc));
+}
+
+static int
+zfs_ioc_set_volblocksize(zfs_cmd_t *zc) /* ioctl: change a zvol's block size */
+{
+ return (zvol_set_volblocksize(zc));
+}
+
+static int
+zfs_ioc_create_minor(zfs_cmd_t *zc) /* ioctl: create the /dev minor node for a zvol */
+{
+ return (zvol_create_minor(zc));
+}
+
+static int
+zfs_ioc_remove_minor(zfs_cmd_t *zc) /* ioctl: remove a zvol's /dev minor node */
+{
+ return (zvol_remove_minor(zc));
+}
+
+/*
+ * Search the vfs list for a specified resource. Returns a pointer to it
+ * or NULL if no suitable entry is found. The caller of this routine
+ * is responsible for releasing the returned vfs pointer.
+ */
+static vfs_t *
+zfs_get_vfs(const char *resource)
+{
+ struct vfs *vfsp;
+ struct vfs *vfs_found = NULL;
+
+ vfs_list_read_lock(); /* hold the list lock while walking the circular vfs list */
+ vfsp = rootvfs;
+ do {
+ if (strcmp(refstr_value(vfsp->vfs_resource), resource) == 0) {
+ VFS_HOLD(vfsp); /* take a hold the caller must VFS_RELE() */
+ vfs_found = vfsp;
+ break;
+ }
+ vfsp = vfsp->vfs_next;
+ } while (vfsp != rootvfs);
+ vfs_list_unlock();
+ return (vfs_found);
+}
+
+static void
+zfs_create_cb(objset_t *os, void *arg, dmu_tx_t *tx) /* dmu_objset_create() callback: set up a new ZPL fs */
+{
+ zfs_cmd_t *zc = arg;
+ zfs_create_fs(os, (cred_t *)(uintptr_t)zc->zc_cred, tx);
+}
+
+static int
+zfs_ioc_create(zfs_cmd_t *zc) /* ioctl: create a dataset, snapshot, or clone depending on inputs */
+{
+ objset_t *clone;
+ int error = 0;
+ void (*cbfunc)(objset_t *os, void *arg, dmu_tx_t *tx);
+ dmu_objset_type_t type = zc->zc_objset_type;
+
+ switch (type) { /* pick the per-type setup callback */
+
+ case DMU_OST_ZFS:
+ cbfunc = zfs_create_cb;
+ break;
+
+ case DMU_OST_ZVOL:
+ cbfunc = zvol_create_cb;
+ break;
+
+ default:
+ return (EINVAL);
+ }
+
+ if (zc->zc_filename[0] != '\0') {
+ /*
+ * We're creating a clone of an existing snapshot.
+ */
+ zc->zc_filename[sizeof (zc->zc_filename) - 1] = '\0'; /* guarantee NUL termination of user data */
+ if (dataset_namecheck(zc->zc_filename, NULL, NULL) != 0)
+ return (EINVAL);
+
+ error = dmu_objset_open(zc->zc_filename, type,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &clone);
+ if (error)
+ return (error);
+ error = dmu_objset_create(zc->zc_name, type, clone, NULL, NULL);
+ dmu_objset_close(clone);
+ } else if (strchr(zc->zc_name, '@') != 0) {
+ /*
+ * We're taking a snapshot of an existing dataset.
+ */
+ error = dmu_objset_create(zc->zc_name, type, NULL, NULL, NULL);
+ } else {
+ /*
+ * We're creating a new dataset.
+ */
+ if (type == DMU_OST_ZVOL) {
+ if ((error = zvol_check_volsize(zc)) != 0) /* validate size/blocksize before creating */
+ return (error);
+ if ((error = zvol_check_volblocksize(zc)) != 0)
+ return (error);
+ }
+ error = dmu_objset_create(zc->zc_name, type, NULL, cbfunc, zc);
+ }
+ return (error);
+}
+
+static int
+zfs_ioc_destroy(zfs_cmd_t *zc) /* ioctl: destroy a dataset; mounted snapshots are unmounted first */
+{
+ if (strchr(zc->zc_name, '@') != NULL &&
+ zc->zc_objset_type == DMU_OST_ZFS) {
+ vfs_t *vfsp;
+ int err;
+
+ /*
+ * Snapshots under .zfs control must be unmounted
+ * before they can be destroyed.
+ */
+ if ((vfsp = zfs_get_vfs(zc->zc_name)) != NULL) {
+ /*
+ * Always force the unmount for snapshots.
+ */
+ int flag = MS_FORCE;
+
+ if ((err = vn_vfswlock(vfsp->vfs_vnodecovered)) != 0) {
+ VFS_RELE(vfsp);
+ return (err);
+ }
+ VFS_RELE(vfsp); /* NOTE(review): vfsp is still passed to dounmount() after the release — presumably the vfswlock keeps it live; confirm */
+ if ((err = dounmount(vfsp, flag, kcred)) != 0)
+ return (err);
+ }
+ }
+
+ return (dmu_objset_destroy(zc->zc_name));
+}
+
+static int
+zfs_ioc_rollback(zfs_cmd_t *zc) /* ioctl: roll a dataset back to its most recent snapshot */
+{
+ return (dmu_objset_rollback(zc->zc_name));
+}
+
+static int
+zfs_ioc_rename(zfs_cmd_t *zc) /* ioctl: rename zc_name to zc_prop_value; mounted snapshots unmounted first */
+{
+ zc->zc_prop_value[sizeof (zc->zc_prop_value) - 1] = '\0'; /* guarantee NUL termination of user data */
+ if (dataset_namecheck(zc->zc_prop_value, NULL, NULL) != 0)
+ return (EINVAL);
+
+ if (strchr(zc->zc_name, '@') != NULL &&
+ zc->zc_objset_type == DMU_OST_ZFS) {
+ vfs_t *vfsp;
+ int err;
+
+ /*
+ * Snapshots under .zfs control must be unmounted
+ * before they can be renamed.
+ */
+ if ((vfsp = zfs_get_vfs(zc->zc_name)) != NULL) {
+ /*
+ * Always force the unmount for snapshots.
+ */
+ int flag = MS_FORCE;
+
+ if ((err = vn_vfswlock(vfsp->vfs_vnodecovered)) != 0) {
+ VFS_RELE(vfsp);
+ return (err);
+ }
+ VFS_RELE(vfsp); /* NOTE(review): vfsp passed to dounmount() after release — same pattern as zfs_ioc_destroy(); confirm lifetime */
+ if ((err = dounmount(vfsp, flag, kcred)) != 0)
+ return (err);
+ }
+ }
+
+ return (dmu_objset_rename(zc->zc_name, zc->zc_prop_value));
+}
+
+static int
+zfs_ioc_recvbackup(zfs_cmd_t *zc) /* ioctl: receive a backup stream from the fd in zc_cookie */
+{
+ file_t *fp;
+ int error, fd;
+
+ fd = zc->zc_cookie;
+ fp = getf(fd); /* hold the caller's file; released below */
+ if (fp == NULL)
+ return (EBADF);
+ error = dmu_recvbackup(&zc->zc_begin_record, &zc->zc_cookie,
+ fp->f_vnode, fp->f_offset);
+ releasef(fd);
+ return (error);
+}
+
+static int
+zfs_ioc_sendbackup(zfs_cmd_t *zc) /* ioctl: send a (possibly incremental) backup stream to a fd */
+{
+ objset_t *fromsnap = NULL;
+ objset_t *tosnap;
+ file_t *fp;
+ int error;
+
+ error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &tosnap);
+ if (error)
+ return (error);
+
+ if (zc->zc_prop_value[0] != '\0') { /* non-empty: incremental send from this snapshot */
+ error = dmu_objset_open(zc->zc_prop_value, DMU_OST_ANY,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &fromsnap);
+ if (error) {
+ dmu_objset_close(tosnap);
+ return (error);
+ }
+ }
+
+ fp = getf(zc->zc_cookie); /* destination fd */
+ if (fp == NULL) {
+ dmu_objset_close(tosnap);
+ if (fromsnap)
+ dmu_objset_close(fromsnap);
+ return (EBADF);
+ }
+
+ error = dmu_sendbackup(tosnap, fromsnap, fp->f_vnode);
+
+ releasef(zc->zc_cookie);
+ if (fromsnap)
+ dmu_objset_close(fromsnap);
+ dmu_objset_close(tosnap);
+ return (error);
+}
+
+static zfs_ioc_vec_t zfs_ioc_vec[] = { /* indexed by (cmd - ZFS_IOC); order must match the ZFS_IOC_* command numbering */
+ { zfs_ioc_pool_create, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_destroy, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_import, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_export, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_configs, zfs_secpolicy_none, no_name },
+ { zfs_ioc_pool_guid, zfs_secpolicy_read, pool_name },
+ { zfs_ioc_pool_stats, zfs_secpolicy_read, pool_name },
+ { zfs_ioc_pool_tryimport, zfs_secpolicy_config, no_name },
+ { zfs_ioc_pool_scrub, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_freeze, zfs_secpolicy_config, no_name },
+ { zfs_ioc_vdev_add, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_vdev_remove, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_vdev_online, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_vdev_offline, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_vdev_attach, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_vdev_detach, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_objset_stats, zfs_secpolicy_read, dataset_name },
+ { zfs_ioc_dataset_list_next, zfs_secpolicy_read, dataset_name },
+ { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, dataset_name },
+ { zfs_ioc_set_prop, zfs_secpolicy_setprop, dataset_name },
+ { zfs_ioc_set_quota, zfs_secpolicy_quota, dataset_name },
+ { zfs_ioc_set_reservation, zfs_secpolicy_write, dataset_name },
+ { zfs_ioc_set_volsize, zfs_secpolicy_config, dataset_name },
+ { zfs_ioc_set_volblocksize, zfs_secpolicy_config, dataset_name },
+ { zfs_ioc_create_minor, zfs_secpolicy_config, dataset_name },
+ { zfs_ioc_remove_minor, zfs_secpolicy_config, dataset_name },
+ { zfs_ioc_create, zfs_secpolicy_parent, dataset_name },
+ { zfs_ioc_destroy, zfs_secpolicy_parent, dataset_name },
+ { zfs_ioc_rollback, zfs_secpolicy_write, dataset_name },
+ { zfs_ioc_rename, zfs_secpolicy_write, dataset_name },
+ { zfs_ioc_recvbackup, zfs_secpolicy_write, dataset_name },
+ { zfs_ioc_sendbackup, zfs_secpolicy_write, dataset_name },
+};
+
+static int
+zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) /* ioctl dispatcher for /dev/zfs (minor 0); zvols go to zvol_ioctl() */
+{
+ zfs_cmd_t *zc;
+ uint_t vec;
+ int error;
+
+ if (getminor(dev) != 0)
+ return (zvol_ioctl(dev, cmd, arg, flag, cr, rvalp));
+
+ vec = cmd - ZFS_IOC; /* unsigned: commands below ZFS_IOC wrap and fail the bound check */
+
+ if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
+ return (EINVAL);
+
+ zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+
+ error = xcopyin((void *)arg, zc, sizeof (zfs_cmd_t));
+
+ if (error == 0) { /* security policy check before anything else touches the request */
+ zc->zc_cred = (uintptr_t)cr;
+ zc->zc_dev = dev;
+ error = zfs_ioc_vec[vec].zvec_secpolicy(zc->zc_name,
+ zc->zc_prop_name, cr);
+ }
+
+ /*
+ * Ensure that all pool/dataset names are valid before we pass down to
+ * the lower layers.
+ */
+ if (error == 0) {
+ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; /* guarantee NUL termination of user data */
+ switch (zfs_ioc_vec[vec].zvec_namecheck) {
+ case pool_name:
+ if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
+ error = EINVAL;
+ break;
+
+ case dataset_name:
+ if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
+ error = EINVAL;
+ break;
+ }
+ }
+
+ if (error == 0)
+ error = zfs_ioc_vec[vec].zvec_func(zc);
+
+ if (error == 0 || error == ENOMEM) { /* ENOMEM still copies out so callers learn the required size */
+ int rc = xcopyout(zc, (void *)arg, sizeof (zfs_cmd_t));
+ if (error == 0)
+ error = rc;
+ }
+
+ kmem_free(zc, sizeof (zfs_cmd_t));
+ return (error);
+}
+
+static int
+zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) /* DDI attach: create the /dev/zfs control node (minor 0) */
+{
+ if (cmd != DDI_ATTACH)
+ return (DDI_FAILURE);
+
+ if (ddi_create_minor_node(dip, "zfs", S_IFCHR, 0,
+ DDI_PSEUDO, 0) == DDI_FAILURE)
+ return (DDI_FAILURE);
+
+ zfs_dip = dip; /* cached for zfs_info() lookups */
+
+ ddi_report_dev(dip);
+
+ return (DDI_SUCCESS);
+}
+
+static int
+zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) /* DDI detach: refuse while any pool, fs, or zvol is busy */
+{
+ if (spa_busy() || zfs_busy() || zvol_busy())
+ return (DDI_FAILURE);
+
+ if (cmd != DDI_DETACH)
+ return (DDI_FAILURE);
+
+ zfs_dip = NULL;
+
+ ddi_prop_remove_all(dip);
+ ddi_remove_minor_node(dip, NULL); /* NULL removes all minor nodes for this dip */
+
+ return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+static int
+zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) /* DDI getinfo: map dev_t to devinfo/instance */
+{
+ switch (infocmd) {
+ case DDI_INFO_DEVT2DEVINFO:
+ *result = zfs_dip;
+ return (DDI_SUCCESS);
+
+ case DDI_INFO_DEVT2INSTANCE:
+ *result = (void *)(uintptr_t)getminor((dev_t)arg);
+ return (DDI_SUCCESS);
+ }
+
+ return (DDI_FAILURE);
+}
+
+/*
+ * OK, so this is a little weird.
+ *
+ * /dev/zfs is the control node, i.e. minor 0.
+ * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0.
+ *
+ * /dev/zfs has basically nothing to do except serve up ioctls,
+ * so most of the standard driver entry points are in zvol.c.
+ */
+static struct cb_ops zfs_cb_ops = {
+ zvol_open, /* open */
+ zvol_close, /* close */
+ zvol_strategy, /* strategy */
+ nodev, /* print */
+ nodev, /* dump */
+ zvol_read, /* read */
+ zvol_write, /* write */
+ zfsdev_ioctl, /* ioctl */
+ nodev, /* devmap */
+ nodev, /* mmap */
+ nodev, /* segmap */
+ nochpoll, /* poll */
+ ddi_prop_op, /* prop_op */
+ NULL, /* streamtab */
+ D_NEW | D_MP | D_64BIT, /* Driver compatibility flag */
+ CB_REV, /* version */
+ zvol_aread, /* async read */
+ zvol_awrite, /* async write */
+};
+
+static struct dev_ops zfs_dev_ops = { /* driver entry points for the zfs pseudo device */
+ DEVO_REV, /* version */
+ 0, /* refcnt */
+ zfs_info, /* info */
+ nulldev, /* identify */
+ nulldev, /* probe */
+ zfs_attach, /* attach */
+ zfs_detach, /* detach */
+ nodev, /* reset */
+ &zfs_cb_ops, /* driver operations */
+ NULL /* no bus operations */
+};
+
+static struct modldrv zfs_modldrv = { /* driver half of the module linkage */
+ &mod_driverops, "ZFS storage pool version 1", &zfs_dev_ops
+};
+
+static struct modlinkage modlinkage = { /* this module is both a filesystem (zfs_modlfs) and a driver */
+ MODREV_1,
+ (void *)&zfs_modlfs,
+ (void *)&zfs_modldrv,
+ NULL
+};
+
+int
+_init(void) /* module load: install linkage, then bring up spa/zfs/zvol subsystems */
+{
+ int error;
+
+ if ((error = mod_install(&modlinkage)) != 0)
+ return (error);
+
+ error = ldi_ident_from_mod(&modlinkage, &zfs_li);
+ ASSERT(error == 0); /* NOTE(review): only checked on DEBUG kernels; failure is ignored otherwise */
+
+ spa_init(FREAD | FWRITE);
+ zfs_init();
+ zvol_init();
+
+ return (0);
+}
+
+int
+_fini(void) /* module unload: refuse while busy, then tear down in reverse order of _init() */
+{
+ int error;
+
+ if (spa_busy() || zfs_busy() || zvol_busy())
+ return (EBUSY);
+
+ if ((error = mod_remove(&modlinkage)) != 0)
+ return (error);
+
+ zvol_fini();
+ zfs_fini();
+ spa_fini();
+
+ ldi_ident_release(zfs_li);
+ zfs_li = NULL;
+
+ return (error);
+}
+
+int
+_info(struct modinfo *modinfop) /* module info entry point */
+{
+ return (mod_info(&modlinkage, modinfop));
+}
diff --git a/usr/src/uts/common/fs/zfs/zfs_log.c b/usr/src/uts/common/fs/zfs/zfs_log.c
new file mode 100644
index 0000000000..dbfd87f67a
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_log.c
@@ -0,0 +1,337 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/thread.h>
+#include <sys/file.h>
+#include <sys/vfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_dir.h>
+#include <sys/zil.h>
+#include <sys/byteorder.h>
+#include <sys/policy.h>
+#include <sys/stat.h>
+#include <sys/mode.h>
+#include <sys/acl.h>
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/ddi.h>
+
+/*
+ * All the functions in this file are used to construct the log entries
+ * to record transactions. They allocate an intent log transaction
+ * structure (itx_t) and save within it all the information necessary to
+ * possibly replay the transaction. The itx is then assigned a sequence
+ * number and inserted in the in-memory list anchored in the zilog.
+ */
+
+/*
+ * zfs_log_create() is used to handle TX_CREATE, TX_MKDIR and TX_MKXATTR
+ * transactions.
+ */
+uint64_t
+zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype, /* log a create; returns assigned itx sequence (0 if no zilog) */
+ znode_t *dzp, znode_t *zp, char *name)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_create_t *lr;
+ size_t namesize = strlen(name) + 1; /* include the terminating NUL in the record */
+
+ if (zilog == NULL)
+ return (0);
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
+ lr = (lr_create_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id; /* directory object id */
+ lr->lr_foid = zp->z_id; /* new file object id */
+ lr->lr_mode = zp->z_phys->zp_mode;
+ lr->lr_uid = zp->z_phys->zp_uid;
+ lr->lr_gid = zp->z_phys->zp_gid;
+ lr->lr_gen = zp->z_phys->zp_gen;
+ lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
+ lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
+ lr->lr_rdev = zp->z_phys->zp_rdev;
+ bcopy(name, (char *)(lr + 1), namesize); /* name is appended after the fixed record */
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ dzp->z_last_itx = seq; /* remember last itx on both znodes for replay ordering */
+ zp->z_last_itx = seq;
+ return (seq);
+}
+
+/*
+ * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions.
+ */
+uint64_t
+zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype, /* returns assigned itx sequence (0 if no zilog) */
+ znode_t *dzp, char *name)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_remove_t *lr;
+ size_t namesize = strlen(name) + 1; /* include the terminating NUL */
+
+ if (zilog == NULL)
+ return (0);
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
+ lr = (lr_remove_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id; /* directory object id */
+ bcopy(name, (char *)(lr + 1), namesize); /* name is appended after the fixed record */
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ dzp->z_last_itx = seq;
+ return (seq);
+}
+
+/*
+ * zfs_log_link() handles TX_LINK transactions.
+ */
+uint64_t
+zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype, /* returns assigned itx sequence (0 if no zilog) */
+ znode_t *dzp, znode_t *zp, char *name)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_link_t *lr;
+ size_t namesize = strlen(name) + 1; /* include the terminating NUL */
+
+ if (zilog == NULL)
+ return (0);
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
+ lr = (lr_link_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id; /* directory object id */
+ lr->lr_link_obj = zp->z_id; /* object being linked */
+ bcopy(name, (char *)(lr + 1), namesize); /* link name appended after the fixed record */
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ dzp->z_last_itx = seq;
+ zp->z_last_itx = seq;
+ return (seq);
+}
+
+/*
+ * zfs_log_symlink() handles TX_SYMLINK transactions.
+ */
+uint64_t
+zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype, /* returns assigned itx sequence (0 if no zilog) */
+ znode_t *dzp, znode_t *zp, char *name, char *link)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_create_t *lr;
+ size_t namesize = strlen(name) + 1; /* both strings carry their NUL */
+ size_t linksize = strlen(link) + 1;
+
+ if (zilog == NULL)
+ return (0);
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize);
+ lr = (lr_create_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id; /* directory object id */
+ lr->lr_foid = zp->z_id; /* new symlink object id */
+ lr->lr_mode = zp->z_phys->zp_mode;
+ lr->lr_uid = zp->z_phys->zp_uid;
+ lr->lr_gid = zp->z_phys->zp_gid;
+ lr->lr_gen = zp->z_phys->zp_gen;
+ lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
+ lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
+ bcopy(name, (char *)(lr + 1), namesize); /* layout: record, then name, then link target */
+ bcopy(link, (char *)(lr + 1) + namesize, linksize);
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ dzp->z_last_itx = seq;
+ zp->z_last_itx = seq;
+ return (seq);
+}
+
+/*
+ * zfs_log_rename() handles TX_RENAME transactions.
+ */
+uint64_t
+zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype, /* returns assigned itx sequence (0 if no zilog) */
+ znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_rename_t *lr;
+ size_t snamesize = strlen(sname) + 1; /* both names carry their NUL */
+ size_t dnamesize = strlen(dname) + 1;
+
+ if (zilog == NULL)
+ return (0);
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
+ lr = (lr_rename_t *)&itx->itx_lr;
+ lr->lr_sdoid = sdzp->z_id; /* source directory object id */
+ lr->lr_tdoid = tdzp->z_id; /* target directory object id */
+ bcopy(sname, (char *)(lr + 1), snamesize); /* layout: record, source name, dest name */
+ bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize);
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ sdzp->z_last_itx = seq; /* all three involved znodes record the sequence */
+ tdzp->z_last_itx = seq;
+ szp->z_last_itx = seq;
+ return (seq);
+}
+
+/*
+ * zfs_log_write() handles TX_WRITE transactions.
+ *
+ * We store data in the log buffers if it is small enough.
+ * Otherwise we flush the data out via dmu_sync().
+ */
+ssize_t zfs_immediate_write_sz = 65536; /* tunable: max write size stored inline in the log record */
+
+uint64_t
+zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, offset_t off, ssize_t len, int ioflag, uio_t *uio)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_write_t *lr;
+ int dlen, err;
+
+ if (zilog == NULL || zp->z_reap) /* skip logging if znode is being reaped */
+ return (0);
+
+ dlen = (len <= zfs_immediate_write_sz ? len : 0); /* 0 means data will come via dmu_sync() later */
+ itx = zil_itx_create(txtype, sizeof (*lr) + dlen);
+ itx->itx_data_copied = 0;
+ if ((ioflag & FDSYNC) && (dlen != 0)) { /* synchronous small write: try to capture data now */
+ err = xcopyin(uio->uio_iov->iov_base - len,
+ (char *)itx + offsetof(itx_t, itx_lr) + sizeof (*lr),
+ len);
+ /*
+ * copyin shouldn't fault as we've already successfully
+ * copied it to a dmu buffer. However if it does we'll get
+ * the data from the dmu later.
+ */
+ if (!err)
+ itx->itx_data_copied = 1;
+ }
+ lr = (lr_write_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ lr->lr_offset = off;
+ lr->lr_length = len;
+ lr->lr_blkoff = 0;
+ BP_ZERO(&lr->lr_blkptr); /* block pointer filled in later if data is dmu_sync'd */
+
+ itx->itx_private = zp->z_zfsvfs;
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ zp->z_last_itx = seq;
+ return (seq);
+}
+
+/*
+ * zfs_log_truncate() handles TX_TRUNCATE transactions.
+ */
+uint64_t
+zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, /* returns assigned itx sequence (0 if no zilog) */
+ znode_t *zp, uint64_t off, uint64_t len)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_truncate_t *lr;
+
+ if (zilog == NULL || zp->z_reap) /* skip logging if znode is being reaped */
+ return (0);
+
+ itx = zil_itx_create(txtype, sizeof (*lr));
+ lr = (lr_truncate_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ lr->lr_offset = off;
+ lr->lr_length = len;
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ zp->z_last_itx = seq;
+ return (seq);
+}
+
+/*
+ * zfs_log_setattr() handles TX_SETATTR transactions.
+ */
+uint64_t
+zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, /* returns assigned itx sequence (0 if no zilog) */
+ znode_t *zp, vattr_t *vap, uint_t mask_applied)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_setattr_t *lr;
+
+ if (zilog == NULL || zp->z_reap) /* skip logging if znode is being reaped */
+ return (0);
+
+ itx = zil_itx_create(txtype, sizeof (*lr));
+ lr = (lr_setattr_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ lr->lr_mask = (uint64_t)mask_applied; /* which attributes were actually applied */
+ lr->lr_mode = (uint64_t)vap->va_mode;
+ lr->lr_uid = (uint64_t)vap->va_uid;
+ lr->lr_gid = (uint64_t)vap->va_gid;
+ lr->lr_size = (uint64_t)vap->va_size;
+ ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime);
+ ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime);
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ zp->z_last_itx = seq;
+ return (seq);
+}
+
+/*
+ * zfs_log_acl() handles TX_ACL transactions.
+ */
+uint64_t
+zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, int txtype, /* returns assigned itx sequence (0 if no zilog) */
+ znode_t *zp, int aclcnt, ace_t *z_ace)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_acl_t *lr;
+
+ if (zilog == NULL || zp->z_reap) /* skip logging if znode is being reaped */
+ return (0);
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + aclcnt * sizeof (ace_t));
+ lr = (lr_acl_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ lr->lr_aclcnt = (uint64_t)aclcnt;
+ bcopy(z_ace, (ace_t *)(lr + 1), aclcnt * sizeof (ace_t)); /* ACEs appended after the fixed record */
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ zp->z_last_itx = seq;
+ return (seq);
+}
diff --git a/usr/src/uts/common/fs/zfs/zfs_replay.c b/usr/src/uts/common/fs/zfs/zfs_replay.c
new file mode 100644
index 0000000000..cd5a3848cb
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_replay.c
@@ -0,0 +1,337 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/thread.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <sys/vfs.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/spa.h>
+#include <sys/zil.h>
+#include <sys/byteorder.h>
+#include <sys/stat.h>
+#include <sys/mode.h>
+#include <sys/acl.h>
+#include <sys/atomic.h>
+#include <sys/cred.h>
+
+/*
+ * Functions to replay ZFS intent log (ZIL) records
+ * The functions are called through a function vector (zfs_replay_vector)
+ * which is indexed by the transaction type.
+ */
+
+/*
+ * Initialize a vattr_t from fields carried in an intent-log record,
+ * so a replayed transaction can be pushed through the generic VOP
+ * interfaces (VOP_CREATE, VOP_MKDIR, VOP_SETATTR, ...).
+ */
+static void
+zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
+	uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
+{
+	bzero(vap, sizeof (*vap));
+	vap->va_mask = (uint_t)mask;
+	vap->va_type = IFTOVT(mode);	/* vnode type is encoded in the mode */
+	vap->va_mode = mode & MODEMASK;
+	vap->va_uid = (uid_t)uid;
+	vap->va_gid = (gid_t)gid;
+	vap->va_rdev = (dev_t)rdev;
+	vap->va_nodeid = nodeid;
+}
+
+/*
+ * Catch-all replay handler: slot 0 of zfs_replay_vector (no such
+ * transaction type) resolves here and fails with ENOTSUP.
+ */
+/* ARGSUSED */
+static int
+zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap)
+{
+	return (ENOTSUP);
+}
+
+/*
+ * Replay a TX_CREATE, TX_MKDIR, TX_MKXATTR or TX_SYMLINK record by
+ * re-issuing the corresponding VOP against the parent directory
+ * (lr_doid).  The new object's name is stored directly after the
+ * fixed-size record; for symlinks the link target follows the name.
+ */
+static int
+zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
+{
+	char *name = (char *)(lr + 1);	/* name follows lr_create_t */
+	char *link;			/* symlink content follows name */
+	znode_t *dzp;
+	vnode_t *vp = NULL;
+	vattr_t va;
+	int error;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	/* Look up the parent directory znode. */
+	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+		return (error);
+
+	zfs_init_vattr(&va, AT_TYPE | AT_MODE | AT_UID | AT_GID,
+	    lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);
+
+	/*
+	 * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
+	 * eventually end up in zfs_mknode(), which assigns the object's
+	 * creation time and generation number.  The generic VOP_CREATE()
+	 * doesn't have either concept, so we smuggle the values inside
+	 * the vattr's otherwise unused va_ctime and va_nblocks fields.
+	 */
+	ZFS_TIME_DECODE(&va.va_ctime, lr->lr_crtime);
+	va.va_nblocks = lr->lr_gen;
+
+	switch ((int)lr->lr_common.lrc_txtype) {
+	case TX_CREATE:
+		error = VOP_CREATE(ZTOV(dzp), name, &va, 0, 0, &vp, kcred, 0);
+		break;
+	case TX_MKDIR:
+		error = VOP_MKDIR(ZTOV(dzp), name, &va, &vp, kcred);
+		break;
+	case TX_MKXATTR:
+		error = zfs_make_xattrdir(dzp, &va, &vp, kcred);
+		break;
+	case TX_SYMLINK:
+		link = name + strlen(name) + 1;
+		error = VOP_SYMLINK(ZTOV(dzp), name, &va, link, kcred);
+		break;
+	default:
+		error = ENOTSUP;
+	}
+
+	/* Drop the hold the successful VOP returned on the new vnode. */
+	if (error == 0 && vp != NULL)
+		VN_RELE(vp);
+
+	VN_RELE(ZTOV(dzp));
+
+	return (error);
+}
+
+/*
+ * Replay a TX_REMOVE or TX_RMDIR record: remove the named entry
+ * (stored after the fixed record) from parent directory lr_doid.
+ */
+static int
+zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap)
+{
+	char *name = (char *)(lr + 1);	/* name follows lr_remove_t */
+	znode_t *dzp;
+	int error;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+		return (error);
+
+	switch ((int)lr->lr_common.lrc_txtype) {
+	case TX_REMOVE:
+		error = VOP_REMOVE(ZTOV(dzp), name, kcred);
+		break;
+	case TX_RMDIR:
+		error = VOP_RMDIR(ZTOV(dzp), name, NULL, kcred);
+		break;
+	default:
+		error = ENOTSUP;
+	}
+
+	VN_RELE(ZTOV(dzp));
+
+	return (error);
+}
+
+/*
+ * Replay a TX_LINK record: create a hard link named `name' in directory
+ * lr_doid pointing at object lr_link_obj.
+ */
+static int
+zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap)
+{
+	char *name = (char *)(lr + 1);	/* name follows lr_link_t */
+	znode_t *dzp, *zp;
+	int error;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	/* Directory to link into. */
+	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+		return (error);
+
+	/* Object being linked. */
+	if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) {
+		VN_RELE(ZTOV(dzp));
+		return (error);
+	}
+
+	error = VOP_LINK(ZTOV(dzp), ZTOV(zp), name, kcred);
+
+	VN_RELE(ZTOV(zp));
+	VN_RELE(ZTOV(dzp));
+
+	return (error);
+}
+
+/*
+ * Replay a TX_RENAME record.  The source and target names are stored
+ * back-to-back (NUL separated) after the fixed lr_rename_t record.
+ */
+static int
+zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap)
+{
+	char *sname = (char *)(lr + 1);	/* sname and tname follow lr_rename_t */
+	char *tname = sname + strlen(sname) + 1;
+	znode_t *sdzp, *tdzp;
+	int error;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	/* Source directory. */
+	if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0)
+		return (error);
+
+	/* Target directory. */
+	if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) {
+		VN_RELE(ZTOV(sdzp));
+		return (error);
+	}
+
+	error = VOP_RENAME(ZTOV(sdzp), sname, ZTOV(tdzp), tname, kcred);
+
+	VN_RELE(ZTOV(tdzp));
+	VN_RELE(ZTOV(sdzp));
+
+	return (error);
+}
+
+/*
+ * Replay a TX_WRITE record: rewrite lr_length bytes (stored after the
+ * fixed record) at lr_offset of object lr_foid via vn_rdwr().
+ */
+static int
+zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
+{
+	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
+	znode_t *zp;
+	int error;
+	ssize_t resid;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+		return (error);
+
+	/*
+	 * NOTE(review): resid (bytes not transferred) is not examined;
+	 * a short write would be reported as success — confirm intended.
+	 */
+	error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length,
+	    lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
+
+	VN_RELE(ZTOV(zp));
+
+	return (error);
+}
+
+/*
+ * Replay a TX_TRUNCATE record by issuing an F_FREESP space-free over
+ * [lr_offset, lr_offset + lr_length) through VOP_SPACE().
+ */
+static int
+zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap)
+{
+	znode_t *zp;
+	flock64_t fl;
+	int error;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+		return (error);
+
+	/* Describe the region to free as a write-lock flock64_t. */
+	bzero(&fl, sizeof (fl));
+	fl.l_type = F_WRLCK;
+	fl.l_whence = 0;
+	fl.l_start = lr->lr_offset;
+	fl.l_len = lr->lr_length;
+
+	error = VOP_SPACE(ZTOV(zp), F_FREESP, &fl, FWRITE | FOFFMAX,
+	    lr->lr_offset, kcred, NULL);
+
+	VN_RELE(ZTOV(zp));
+
+	return (error);
+}
+
+/*
+ * Replay a TX_SETATTR record: rebuild the vattr from the logged fields
+ * (mask, mode, uid, gid, size, atime, mtime) and re-apply it with
+ * VOP_SETATTR().
+ */
+static int
+zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap)
+{
+	znode_t *zp;
+	vattr_t va;
+	int error;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+		return (error);
+
+	zfs_init_vattr(&va, lr->lr_mask, lr->lr_mode,
+	    lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);
+
+	va.va_size = lr->lr_size;
+	ZFS_TIME_DECODE(&va.va_atime, lr->lr_atime);
+	ZFS_TIME_DECODE(&va.va_mtime, lr->lr_mtime);
+
+	error = VOP_SETATTR(ZTOV(zp), &va, 0, kcred, NULL);
+
+	VN_RELE(ZTOV(zp));
+
+	return (error);
+}
+
+/*
+ * Replay a TX_ACL record: re-apply the logged ACE array (stored after
+ * the fixed lr_acl_t record) to object lr_foid via VOP_SETSECATTR().
+ */
+static int
+zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap)
+{
+	ace_t *ace = (ace_t *)(lr + 1);	/* ace array follows lr_acl_t */
+	vsecattr_t vsa;
+	znode_t *zp;
+	int error;
+
+	if (byteswap) {
+		/* Both the fixed record and the trailing ACEs need swapping. */
+		byteswap_uint64_array(lr, sizeof (*lr));
+		zfs_ace_byteswap(ace, lr->lr_aclcnt);
+	}
+
+	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+		return (error);
+
+	bzero(&vsa, sizeof (vsa));
+	vsa.vsa_mask = VSA_ACE | VSA_ACECNT;
+	vsa.vsa_aclcnt = lr->lr_aclcnt;
+	vsa.vsa_aclentp = ace;
+
+	error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred);
+
+	VN_RELE(ZTOV(zp));
+
+	return (error);
+}
+
+/*
+ * Callback vectors for replaying records.
+ *
+ * Indexed by transaction type; the entry order must match the TX_*
+ * numbering, with slot 0 reserved for "no such transaction type".
+ */
+zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
+	zfs_replay_error,	/* 0 no such transaction type */
+	zfs_replay_create,	/* TX_CREATE */
+	zfs_replay_create,	/* TX_MKDIR */
+	zfs_replay_create,	/* TX_MKXATTR */
+	zfs_replay_create,	/* TX_SYMLINK */
+	zfs_replay_remove,	/* TX_REMOVE */
+	zfs_replay_remove,	/* TX_RMDIR */
+	zfs_replay_link,	/* TX_LINK */
+	zfs_replay_rename,	/* TX_RENAME */
+	zfs_replay_write,	/* TX_WRITE */
+	zfs_replay_truncate,	/* TX_TRUNCATE */
+	zfs_replay_setattr,	/* TX_SETATTR */
+	zfs_replay_acl,		/* TX_ACL */
+};
diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
new file mode 100644
index 0000000000..502bcf39bf
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
@@ -0,0 +1,1072 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/pathname.h>
+#include <sys/acl.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/mntent.h>
+#include <sys/mount.h>
+#include <sys/cmn_err.h>
+#include "fs/fs_subr.h"
+#include <sys/zfs_znode.h>
+#include <sys/zil.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dsl_prop.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/varargs.h>
+#include <sys/policy.h>
+#include <sys/atomic.h>
+#include <sys/mkdev.h>
+#include <sys/modctl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_ctldir.h>
+
+int zfsfstype;
+vfsops_t *zfs_vfsops = NULL;
+static major_t zfs_major;
+static minor_t zfs_minor;
+static kmutex_t zfs_dev_mtx;	/* serializes zfs_major/zfs_minor updates */
+
+static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
+static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
+static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
+static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp);
+static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp);
+static void zfs_freevfs(vfs_t *vfsp);
+static void zfs_objset_close(zfsvfs_t *zfsvfs);
+
+/* VFS operations for a normally mounted zfs filesystem. */
+static const fs_operation_def_t zfs_vfsops_template[] = {
+	VFSNAME_MOUNT, zfs_mount,
+	VFSNAME_UNMOUNT, zfs_umount,
+	VFSNAME_ROOT, zfs_root,
+	VFSNAME_STATVFS, zfs_statvfs,
+	VFSNAME_SYNC, (fs_generic_func_p) zfs_sync,
+	VFSNAME_VGET, zfs_vget,
+	VFSNAME_FREEVFS, (fs_generic_func_p) zfs_freevfs,
+	NULL, NULL
+};
+
+/*
+ * Reduced template used after forced unmount: only freevfs remains,
+ * everything else falls back to the default (erroring) behavior.
+ */
+static const fs_operation_def_t zfs_vfsops_eio_template[] = {
+	VFSNAME_FREEVFS, (fs_generic_func_p) zfs_freevfs,
+	NULL, NULL
+};
+
+/*
+ * We need to keep a count of active fs's.
+ * This is necessary to prevent our module
+ * from being unloaded after a umount -f
+ */
+static uint32_t zfs_active_fs_count = 0;
+
+/* Mutually exclusive mount options: setting one cancels the other. */
+static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
+static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };
+
+static mntopt_t mntopts[] = {
+	{ MNTOPT_XATTR, NULL, NULL, MO_NODISPLAY|MO_DEFAULT, NULL },
+	{ MNTOPT_NOATIME, noatime_cancel, NULL, MO_DEFAULT, NULL },
+	{ MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
+};
+
+static mntopts_t zfs_mntopts = {
+	sizeof (mntopts) / sizeof (mntopt_t),
+	mntopts
+};
+
+/*
+ * VFS sync entry point.  With a specific vfsp, commit that filesystem's
+ * intent log (or wait for txg sync when there is no ZIL); with a NULL
+ * vfsp, ask every pool to sync all dirty data.  Always returns 0.
+ */
+/*ARGSUSED*/
+int
+zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
+{
+	/*
+	 * Data integrity is job one.  We don't want a compromised kernel
+	 * writing to the storage pool, so we never sync during panic.
+	 */
+	if (panicstr)
+		return (0);
+
+	/*
+	 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS
+	 * to sync metadata, which they would otherwise cache indefinitely.
+	 * Semantically, the only requirement is that the sync be initiated.
+	 * The DMU syncs out txgs frequently, so there's nothing to do.
+	 */
+	if (flag & SYNC_ATTR)
+		return (0);
+
+	if (vfsp != NULL) {
+		/*
+		 * Sync a specific filesystem.
+		 */
+		zfsvfs_t *zfsvfs = vfsp->vfs_data;
+
+		ZFS_ENTER(zfsvfs);
+		if (zfsvfs->z_log != NULL)
+			zil_commit(zfsvfs->z_log, UINT64_MAX, FSYNC);
+		else
+			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+		ZFS_EXIT(zfsvfs);
+	} else {
+		/*
+		 * Sync all ZFS filesystems.  This is what happens when you
+		 * run sync(1M).  Unlike other filesystems, ZFS honors the
+		 * request by waiting for all pools to commit all dirty data.
+		 */
+		spa_sync_allpools();
+	}
+
+	return (0);
+}
+
+/*
+ * dsl property callback, registered for "atime" in zfs_mount():
+ * keeps the cached z_atime flag and the displayed atime/noatime
+ * mount options in sync with the property value.
+ */
+static void
+atime_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	if (newval == TRUE) {
+		zfsvfs->z_atime = TRUE;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
+	} else {
+		zfsvfs->z_atime = FALSE;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
+	}
+}
+
+/*
+ * dsl property callback for "recordsize": clamp out-of-range or
+ * non-power-of-two values to SPA_MAXBLOCKSIZE, then update both the
+ * cached max blocksize and the vfs block size.
+ */
+static void
+blksz_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	if (newval < SPA_MINBLOCKSIZE ||
+	    newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
+		newval = SPA_MAXBLOCKSIZE;
+
+	zfsvfs->z_max_blksz = newval;
+	zfsvfs->z_vfs->vfs_bsize = newval;
+}
+
+/*
+ * dsl property callback for "readonly": toggle VFS_RDONLY and the
+ * ro/rw mount options, and stop (0) or start (1) the delete thread
+ * accordingly.
+ */
+static void
+readonly_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	if (newval) {
+		/* XXX locking on vfs_flag? */
+		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
+		(void) zfs_delete_thread_target(zfsvfs, 0);
+	} else {
+		/* XXX locking on vfs_flag? */
+		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
+		(void) zfs_delete_thread_target(zfsvfs, 1);
+	}
+}
+
+/*
+ * dsl property callback for "devices": toggle VFS_NODEVICES and the
+ * devices/nodevices mount options.
+ */
+static void
+devices_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	if (newval == FALSE) {
+		zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0);
+	} else {
+		zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0);
+	}
+}
+
+/*
+ * dsl property callback for "setuid": toggle VFS_NOSETUID and the
+ * setuid/nosetuid mount options.
+ */
+static void
+setuid_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	if (newval == FALSE) {
+		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
+	} else {
+		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
+	}
+}
+
+/*
+ * dsl property callback for "exec": toggle VFS_NOEXEC and the
+ * exec/noexec mount options.
+ */
+static void
+exec_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	if (newval == FALSE) {
+		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
+	} else {
+		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
+	}
+}
+
+/* dsl property callback for "snapdir": cache .zfs visibility setting. */
+static void
+snapdir_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	zfsvfs->z_show_ctldir = newval;
+}
+
+/* dsl property callback for "aclmode": cache the new value. */
+static void
+acl_mode_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	zfsvfs->z_acl_mode = newval;
+}
+
+/* dsl property callback for "aclinherit": cache the new value. */
+static void
+acl_inherit_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	zfsvfs->z_acl_inherit = newval;
+}
+
+/*
+ * VFS mount entry point.
+ *
+ * Handles three cases: a remount (refresh temporary properties from the
+ * current VFS options and return), a snapshot mount (forced read-only,
+ * no ZIL replay), and a normal dataset mount (open the objset, replay
+ * the intent log, register dsl property callbacks, and re-apply any
+ * temporary mount-option overrides the registration clobbered).
+ */
+/*ARGSUSED*/
+static int
+zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = NULL;
+	znode_t *zp = NULL;
+	vnode_t *vp = NULL;
+	objset_t *os = NULL;
+	struct dsl_dataset *ds;
+	char *osname;
+	uint64_t readonly, recordsize;
+	pathname_t spn;
+	dev_t mount_dev;
+	major_t new_major;
+	int mode;
+	int error = 0;
+	uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ?
+	    UIO_SYSSPACE : UIO_USERSPACE;
+	int canwrite;
+
+	/* The mount point must be a directory. */
+	if (mvp->v_type != VDIR)
+		return (ENOTDIR);
+
+	/*
+	 * Unless this is a remount or an overlay mount, the mount point
+	 * must not be busy (held or a filesystem root).
+	 */
+	mutex_enter(&mvp->v_lock);
+	if ((uap->flags & MS_REMOUNT) == 0 &&
+	    (uap->flags & MS_OVERLAY) == 0 &&
+	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
+		mutex_exit(&mvp->v_lock);
+		return (EBUSY);
+	}
+	mutex_exit(&mvp->v_lock);
+
+	/*
+	 * ZFS does not support passing unparsed data in via MS_DATA.
+	 * Users should use the MS_OPTIONSTR interface; this means
+	 * that all option parsing is already done and the options struct
+	 * can be interrogated.
+	 */
+	if ((uap->flags & MS_DATA) && uap->datalen > 0)
+		return (EINVAL);
+
+	/*
+	 * When doing a remount, we simply refresh our temporary properties
+	 * according to those options set in the current VFS options.
+	 */
+	if (uap->flags & MS_REMOUNT) {
+		zfsvfs = vfsp->vfs_data;
+
+		if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
+			readonly_changed_cb(zfsvfs, B_TRUE);
+		else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
+			/* A snapshot can never be remounted read-write. */
+			if (dmu_objset_is_snapshot(zfsvfs->z_os))
+				return (EROFS);
+			readonly_changed_cb(zfsvfs, B_FALSE);
+		}
+
+		/* "nosuid" implies both nodevices and nosetuid. */
+		if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
+			devices_changed_cb(zfsvfs, B_FALSE);
+			setuid_changed_cb(zfsvfs, B_FALSE);
+		} else {
+			if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
+				devices_changed_cb(zfsvfs, B_FALSE);
+			else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
+				devices_changed_cb(zfsvfs, B_TRUE);
+
+			if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
+				setuid_changed_cb(zfsvfs, B_FALSE);
+			else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
+				setuid_changed_cb(zfsvfs, B_TRUE);
+		}
+
+		if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
+			exec_changed_cb(zfsvfs, B_FALSE);
+		else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
+			exec_changed_cb(zfsvfs, B_TRUE);
+
+		return (0);
+	}
+
+	/*
+	 * Get the objset name (the "special" mount argument).
+	 */
+	if (error = pn_get(uap->spec, fromspace, &spn))
+		return (error);
+
+	osname = spn.pn_path;
+
+	if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
+		goto out;
+
+	/*
+	 * Refuse to mount a filesystem if we are in a local zone and the
+	 * dataset is not visible.
+	 */
+	if (!INGLOBALZONE(curproc) &&
+	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
+		error = EPERM;
+		goto out;
+	}
+
+	/*
+	 * Initialize the zfs-specific filesystem structure.
+	 * Should probably make this a kmem cache, shuffle fields,
+	 * and just bzero up to z_hold_mtx[].
+	 */
+	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+	zfsvfs->z_vfs = vfsp;
+	zfsvfs->z_parent = zfsvfs;
+	zfsvfs->z_assign = TXG_NOWAIT;
+	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
+	zfsvfs->z_show_ctldir = VISIBLE;
+
+	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
+	    offsetof(znode_t, z_link_node));
+	rw_init(&zfsvfs->z_um_lock, NULL, RW_DEFAULT, NULL);
+
+	/*
+	 * Initialize the generic filesystem structure.
+	 */
+	vfsp->vfs_bcount = 0;
+	vfsp->vfs_data = NULL;
+
+	/*
+	 * Create a unique device for the mount: scan minors under the
+	 * current major until an unused device is found; if the minor
+	 * space is exhausted, allocate a fresh major and start over.
+	 */
+	do {
+		ASSERT3U(zfs_minor, <=, MAXMIN32);
+		/*
+		 * NOTE(review): zfs_minor is read here without holding
+		 * zfs_dev_mtx (it is only held for the updates below) —
+		 * confirm this race is benign.
+		 */
+		int start = zfs_minor;
+		do {
+			mutex_enter(&zfs_dev_mtx);
+			zfs_minor++;
+			if (zfs_minor > MAXMIN32)
+				zfs_minor = 0;
+			mount_dev = makedevice(zfs_major, zfs_minor);
+			mutex_exit(&zfs_dev_mtx);
+		} while (vfs_devismounted(mount_dev) && zfs_minor != start);
+		if (zfs_minor == start) {
+			/*
+			 * We are using all ~262,000 minor numbers
+			 * for the current major number.  Create a
+			 * new major number.
+			 */
+			if ((new_major = getudev()) == (major_t)-1) {
+				cmn_err(CE_WARN,
+				    "zfs_mount: Can't get unique"
+				    " major device number.");
+				/*
+				 * NOTE(review): error is still 0 here, so
+				 * this goto takes the success path at out:
+				 * (including VN_RELE of a NULL vp) — looks
+				 * like it should set an errno; confirm.
+				 */
+				goto out;
+			}
+			mutex_enter(&zfs_dev_mtx);
+			zfs_major = new_major;
+			zfs_minor = 0;
+			mutex_exit(&zfs_dev_mtx);
+		} else {
+			break;
+		}
+		/* CONSTANTCONDITION */
+	} while (1);
+
+	ASSERT(vfs_devismounted(mount_dev) == 0);
+
+	/* Fall back to the maximum block size if the property is unreadable. */
+	if (dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL) != 0)
+		recordsize = SPA_MAXBLOCKSIZE;
+
+	vfsp->vfs_dev = mount_dev;
+	vfsp->vfs_fstype = zfsfstype;
+	vfsp->vfs_bsize = recordsize;
+	vfsp->vfs_flag |= VFS_NOTRUNC;
+	vfsp->vfs_data = zfsvfs;
+
+	error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL);
+	if (error)
+		goto out;
+
+	if (readonly)
+		mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
+	else
+		mode = DS_MODE_PRIMARY;
+
+	/* Open the objset; retry read-only if a writable open fails EROFS. */
+	error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
+	if (error == EROFS) {
+		mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
+		error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
+		    &zfsvfs->z_os);
+	}
+	os = zfsvfs->z_os;
+
+	if (error)
+		goto out;
+
+	if (error = zfs_init_fs(zfsvfs, &zp, cr))
+		goto out;
+
+	if (dmu_objset_is_snapshot(os)) {
+		/* Snapshots are always read-only, no atime, no ZIL. */
+		ASSERT(mode & DS_MODE_READONLY);
+		atime_changed_cb(zfsvfs, B_FALSE);
+		readonly_changed_cb(zfsvfs, B_TRUE);
+		zfsvfs->z_issnap = B_TRUE;
+	} else {
+		int do_readonly = FALSE, readonly;
+		int do_setuid = FALSE, setuid;
+		int do_exec = FALSE, exec;
+		int do_devices = FALSE, devices;
+
+		/*
+		 * Start a delete thread running.
+		 */
+		(void) zfs_delete_thread_target(zfsvfs, 1);
+
+		/*
+		 * Parse and replay the intent log.
+		 */
+		zil_replay(os, zfsvfs, &zfsvfs->z_assign, zfs_replay_vector,
+		    (void (*)(void *))zfs_delete_wait_empty);
+
+		if (!zil_disable)
+			zfsvfs->z_log = zil_open(os, zfs_get_data);
+
+		/*
+		 * The act of registering our callbacks will destroy any mount
+		 * options we may have.  In order to enable temporary overrides
+		 * of mount options, we stash away the current values and
+		 * restore them after we register the callbacks.
+		 */
+		if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
+			readonly = B_TRUE;
+			do_readonly = B_TRUE;
+		} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
+			readonly = B_FALSE;
+			do_readonly = B_TRUE;
+		}
+		if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
+			devices = B_FALSE;
+			setuid = B_FALSE;
+			do_devices = B_TRUE;
+			do_setuid = B_TRUE;
+		} else {
+			if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
+				devices = B_FALSE;
+				do_devices = B_TRUE;
+			} else if (vfs_optionisset(vfsp,
+			    MNTOPT_DEVICES, NULL)) {
+				devices = B_TRUE;
+				do_devices = B_TRUE;
+			}
+
+			if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
+				setuid = B_FALSE;
+				do_setuid = B_TRUE;
+			} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
+				setuid = B_TRUE;
+				do_setuid = B_TRUE;
+			}
+		}
+		if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
+			exec = B_FALSE;
+			do_exec = B_TRUE;
+		} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
+			exec = B_TRUE;
+			do_exec = B_TRUE;
+		}
+
+		/*
+		 * Register property callbacks.
+		 */
+		ds = dmu_objset_ds(os);
+		VERIFY(dsl_prop_register(ds, "atime", atime_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_register(ds, "recordsize", blksz_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_register(ds, "readonly", readonly_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_register(ds, "devices", devices_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_register(ds, "setuid", setuid_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_register(ds, "exec", exec_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_register(ds, "snapdir", snapdir_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_register(ds, "aclmode", acl_mode_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_register(ds, "aclinherit",
+		    acl_inherit_changed_cb, zfsvfs) == 0);
+
+
+		/*
+		 * Invoke our callbacks to restore temporary mount options.
+		 */
+		if (do_readonly)
+			readonly_changed_cb(zfsvfs, readonly);
+		if (do_setuid)
+			setuid_changed_cb(zfsvfs, setuid);
+		if (do_exec)
+			exec_changed_cb(zfsvfs, exec);
+		if (do_devices)
+			devices_changed_cb(zfsvfs, devices);
+	}
+
+	vp = ZTOV(zp);
+	/* Create the '.zfs' control directory for non-snapshot mounts. */
+	if (!zfsvfs->z_issnap)
+		zfsctl_create(zfsvfs);
+out:
+	if (error) {
+		if (zp)
+			VN_RELE(vp);
+
+		if (zfsvfs) {
+			if (os)
+				dmu_objset_close(os);
+			kmem_free(zfsvfs, sizeof (zfsvfs_t));
+		}
+	} else {
+		atomic_add_32(&zfs_active_fs_count, 1);
+		VN_RELE(vp);
+	}
+
+	pn_free(&spn);
+	return (error);
+}
+
+/*
+ * VFS statvfs entry point: fill in statvfs64 from the objset statistics,
+ * reporting sizes in terms of f_frsize (the smallest supported block).
+ */
+static int
+zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	dmu_objset_stats_t dstats;
+	dev32_t d32;
+
+	ZFS_ENTER(zfsvfs);
+
+	dmu_objset_stats(zfsvfs->z_os, &dstats);
+
+	/*
+	 * The underlying storage pool actually uses multiple block sizes.
+	 * We report the fragsize as the smallest block size we support,
+	 * and we report our blocksize as the filesystem's maximum blocksize.
+	 */
+	statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT;
+	statp->f_bsize = zfsvfs->z_max_blksz;
+
+	/*
+	 * The following report "total" blocks of various kinds in the
+	 * file system, but reported in terms of f_frsize - the
+	 * "fragment" size.
+	 */
+
+	statp->f_blocks =
+	    (dstats.dds_space_refd + dstats.dds_available) >> SPA_MINBLOCKSHIFT;
+	statp->f_bfree = dstats.dds_available >> SPA_MINBLOCKSHIFT;
+	statp->f_bavail = statp->f_bfree; /* no root reservation */
+
+	/*
+	 * statvfs() should really be called statufs(), because it assumes
+	 * static metadata.  ZFS doesn't preallocate files, so the best
+	 * we can do is report the max that could possibly fit in f_files,
+	 * and that minus the number actually used in f_ffree.
+	 * For f_ffree, report the smaller of the number of object available
+	 * and the number of blocks (each object will take at least a block).
+	 */
+	statp->f_ffree = MIN(dstats.dds_objects_avail, statp->f_bfree);
+	statp->f_favail = statp->f_ffree;	/* no "root reservation" */
+	statp->f_files = statp->f_ffree + dstats.dds_objects_used;
+
+	(void) cmpldev(&d32, vfsp->vfs_dev);
+	statp->f_fsid = d32;
+
+	/*
+	 * We're a zfs filesystem.
+	 */
+	(void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name);
+
+	statp->f_flag = 0;
+
+	statp->f_namemax = ZFS_MAXNAMELEN;
+
+	/*
+	 * We have all of 32 characters to stuff a string here.
+	 * Is there anything useful we could/should provide?
+	 */
+	bzero(statp->f_fstr, sizeof (statp->f_fstr));
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * VFS root entry point: return a held vnode for the filesystem's root
+ * znode (z_root).  *vpp is set only on success.
+ */
+static int
+zfs_root(vfs_t *vfsp, vnode_t **vpp)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	znode_t *rootzp;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+
+	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
+	if (error == 0)
+		*vpp = ZTOV(rootzp);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * VFS unmount entry point.
+ *
+ * Forced unmount (MS_FORCE) marks the vfs unmounted, waits for all
+ * in-flight zfs threads to drain (z_op_cnt), and closes the objset.
+ * A normal unmount first flushes state, stops the delete threads, and
+ * refuses (EBUSY) if any vnodes beyond the expected holds are active.
+ */
+/*ARGSUSED*/
+static int
+zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	int ret;
+
+	if ((ret = secpolicy_fs_unmount(cr, vfsp)) != 0)
+		return (ret);
+
+	/*
+	 * Unmount any snapshots mounted under .zfs before unmounting the
+	 * dataset itself.
+	 */
+	if (zfsvfs->z_ctldir != NULL &&
+	    (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
+		return (ret);
+
+	if (fflag & MS_FORCE) {
+		vfsp->vfs_flag |= VFS_UNMOUNTED;
+		zfsvfs->z_unmounted1 = B_TRUE;
+
+		/*
+		 * Wait for all zfs threads to leave zfs.
+		 * Grabbing a rwlock as reader in all vops and
+		 * as writer here doesn't work because it too easy to get
+		 * multiple reader enters as zfs can re-enter itself.
+		 * This can lead to deadlock if there is an intervening
+		 * rw_enter as writer.
+		 * So a file system threads ref count (z_op_cnt) is used.
+		 * A polling loop on z_op_cnt may seem inefficient, but
+		 * - this saves all threads on exit from having to grab a
+		 * mutex in order to cv_signal
+		 * - only occurs on forced unmount in the rare case when
+		 * there are outstanding threads within the file system.
+		 */
+		while (zfsvfs->z_op_cnt) {
+			delay(1);
+		}
+
+		zfs_objset_close(zfsvfs);
+
+		return (0);
+	}
+
+	zfs_zcache_flush(zfsvfs);
+
+	/*
+	 * Stop all delete threads.
+	 */
+	(void) zfs_delete_thread_target(zfsvfs, 0);
+
+	/*
+	 * Check the number of active vnodes in the file system.
+	 * Our count is maintained in the vfs structure, but the number
+	 * is off by 1 to indicate a hold on the vfs structure itself.
+	 *
+	 * The '.zfs' directory maintains a reference of its own, and any active
+	 * references underneath are reflected in the vnode count.
+	 */
+	if (zfsvfs->z_ctldir == NULL) {
+		if (vfsp->vfs_count > 1) {
+			/* Busy: restart the delete thread if we stopped it. */
+			if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0)
+				(void) zfs_delete_thread_target(zfsvfs, 1);
+			return (EBUSY);
+		}
+	} else {
+		if (vfsp->vfs_count > 2 ||
+		    (zfsvfs->z_ctldir->v_count > 1 && !(fflag & MS_FORCE))) {
+			if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0)
+				(void) zfs_delete_thread_target(zfsvfs, 1);
+			return (EBUSY);
+		}
+	}
+
+	vfsp->vfs_flag |= VFS_UNMOUNTED;
+	zfs_objset_close(zfsvfs);
+
+	/*
+	 * We can now safely destroy the '.zfs' directory node, which will
+	 * release its hold on the vfs_t.
+	 */
+	if (zfsvfs->z_ctldir != NULL)
+		zfsctl_destroy(zfsvfs);
+
+	return (0);
+}
+
+/*
+ * VFS vget entry point: translate an NFS-style file identifier into a
+ * held vnode.  Long fids carry an objset id (for snapshots under .zfs)
+ * and redirect to that snapshot's zfsvfs; short fids identify an object
+ * plus a generation number which must match the znode's.
+ */
+static int
+zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	znode_t *zp;
+	uint64_t object = 0;
+	uint64_t fid_gen = 0;
+	uint64_t gen_mask;
+	uint64_t zp_gen;
+	int i, err;
+
+	*vpp = NULL;
+
+	ZFS_ENTER(zfsvfs);
+
+	if (fidp->fid_len == LONG_FID_LEN) {
+		zfid_long_t *zlfid = (zfid_long_t *)fidp;
+		uint64_t objsetid = 0;
+		uint64_t setgen = 0;
+
+		/* Decode the little-endian byte arrays into integers. */
+		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
+
+		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
+
+		ZFS_EXIT(zfsvfs);
+
+		/* Switch to the snapshot's own zfsvfs before continuing. */
+		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
+		if (err)
+			return (EINVAL);
+		ZFS_ENTER(zfsvfs);
+	}
+
+	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
+		zfid_short_t *zfid = (zfid_short_t *)fidp;
+
+		for (i = 0; i < sizeof (zfid->zf_object); i++)
+			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
+
+		for (i = 0; i < sizeof (zfid->zf_gen); i++)
+			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
+	} else {
+		ZFS_EXIT(zfsvfs);
+		return (EINVAL);
+	}
+
+	/* A zero fid_gen means we are in the .zfs control directories */
+	if (fid_gen == 0 &&
+	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
+		*vpp = zfsvfs->z_ctldir;
+		ASSERT(*vpp != NULL);
+		if (object == ZFSCTL_INO_SNAPDIR) {
+			VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
+			    0, NULL, NULL) == 0);
+		} else {
+			VN_HOLD(*vpp);
+		}
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/* Mask for valid generation bits; i is sizeof (zf_gen) here. */
+	gen_mask = -1ULL >> (64 - 8 * i);
+
+	/*
+	 * NOTE(review): fid_gen is uint64_t but printed with %u in both
+	 * dprintfs below — format/argument mismatch; confirm.
+	 */
+	dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
+	if (err = zfs_zget(zfsvfs, object, &zp)) {
+		ZFS_EXIT(zfsvfs);
+		return (err);
+	}
+	zp_gen = zp->z_phys->zp_gen & gen_mask;
+	if (zp_gen == 0)
+		zp_gen = 1;
+	/* Reject stale handles: reaped znode or generation mismatch. */
+	if (zp->z_reap || zp_gen != fid_gen) {
+		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
+		VN_RELE(ZTOV(zp));
+		ZFS_EXIT(zfsvfs);
+		return (EINVAL);
+	}
+
+	*vpp = ZTOV(zp);
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * Tear down all objset state at unmount time: stop the delete
+ * threads, drop every outstanding dbuf hold, unregister the dataset
+ * property callbacks, close the intent log, and finally close the
+ * objset itself.  Handles both normal and forced unmount.
+ */
+static void
+zfs_objset_close(zfsvfs_t *zfsvfs)
+{
+	zfs_delete_t *zd = &zfsvfs->z_delete_head;
+	znode_t *zp, *nextzp;
+	objset_t *os = zfsvfs->z_os;
+	struct dsl_dataset *ds;
+
+	/*
+	 * Stop all delete threads.
+	 */
+	(void) zfs_delete_thread_target(zfsvfs, 0);
+
+	/*
+	 * For forced unmount, at this point all vops except zfs_inactive
+	 * are erroring EIO. We need to now suspend zfs_inactive threads
+	 * while we are freeing dbufs before switching zfs_inactive
+	 * to use behaviour without a objset.
+	 */
+	rw_enter(&zfsvfs->z_um_lock, RW_WRITER);
+
+	zfs_zcache_flush(zfsvfs);
+
+	/*
+	 * Release all delete-in-progress znodes.
+	 * They will be processed when the file system remounts.
+	 */
+	mutex_enter(&zd->z_mutex);
+	while (zp = list_head(&zd->z_znodes)) {
+		list_remove(&zd->z_znodes, zp);
+		zp->z_dbuf_held = 0;
+		dmu_buf_rele(zp->z_dbuf);
+	}
+	mutex_exit(&zd->z_mutex);
+
+	/*
+	 * Release all holds on dbufs.
+	 * Note, although we have stopped all other vop threads and
+	 * zfs_inactive(), the dmu can callback via znode_pageout_func()
+	 * which can zfs_znode_free() the znode.
+	 * So we lock z_all_znodes; search the list for a held dbuf;
+	 * drop the lock (we know zp can't disappear while we hold a
+	 * dbuf lock); release the dbuf; then regrab the lock and
+	 * restart the scan from the head.
+	 */
+	mutex_enter(&zfsvfs->z_znodes_lock);
+	for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = nextzp) {
+		nextzp = list_next(&zfsvfs->z_all_znodes, zp);
+		if (zp->z_dbuf_held) {
+			/* dbufs should only be held when force unmounting */
+			zp->z_dbuf_held = 0;
+			mutex_exit(&zfsvfs->z_znodes_lock);
+			dmu_buf_rele(zp->z_dbuf);
+			/* Start again */
+			mutex_enter(&zfsvfs->z_znodes_lock);
+			nextzp = list_head(&zfsvfs->z_all_znodes);
+		}
+	}
+	mutex_exit(&zfsvfs->z_znodes_lock);
+
+	/*
+	 * Unregister properties.  Only datasets (not snapshots) have
+	 * property callbacks registered at mount time.
+	 */
+	if (!dmu_objset_is_snapshot(os)) {
+		ds = dmu_objset_ds(os);
+
+		VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_unregister(ds, "aclinherit",
+		    acl_inherit_changed_cb, zfsvfs) == 0);
+	}
+
+	/*
+	 * Make the dmu drop all its dbuf holds so that zfs_inactive
+	 * can then safely free znode/vnodes.
+	 */
+	txg_wait_synced(dmu_objset_pool(os), 0);
+
+	/*
+	 * Switch zfs_inactive to behaviour without an objset.
+	 * It just tosses cached pages and frees the znode & vnode.
+	 * Then re-enable zfs_inactive threads in that new behaviour.
+	 */
+	zfsvfs->z_unmounted2 = B_TRUE;
+	rw_exit(&zfsvfs->z_um_lock);	/* re-enable any zfs_inactive threads */
+
+	/*
+	 * Close the zil. Can't close the zil while zfs_inactive
+	 * threads are blocked as zil_close can call zfs_inactive.
+	 */
+	if (zfsvfs->z_log) {
+		zil_close(zfsvfs->z_log);
+		zfsvfs->z_log = NULL;
+	}
+
+	/*
+	 * Finally close the objset
+	 */
+	dmu_objset_close(os);
+
+}
+
+/*
+ * VFS_FREEVFS() entry point: drop the active-filesystem count
+ * consulted by zfs_busy() and release the per-mount zfsvfs_t
+ * allocated at mount time.
+ */
+static void
+zfs_freevfs(vfs_t *vfsp)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+
+	atomic_add_32(&zfs_active_fs_count, -1);
+	kmem_free(zfsvfs, sizeof (zfsvfs_t));
+}
+
+/*
+ * VFS_INIT() initialization.  Note that there is no VFS_FINI(),
+ * so we can't safely do any non-idempotent initialization here.
+ * Leave that to zfs_init() and zfs_fini(), which are called
+ * from the module's _init() and _fini() entry points.
+ *
+ * Registers the vfsops/vnodeops tables and reserves the device
+ * major number shared by all zfs mounts.
+ */
+/*ARGSUSED*/
+static int
+zfs_vfsinit(int fstype, char *name)
+{
+	int error;
+
+	zfsfstype = fstype;
+
+	/*
+	 * Setup vfsops and vnodeops tables.
+	 */
+	error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops);
+	if (error != 0) {
+		/*
+		 * Fail immediately: continuing with a bad vfsops
+		 * registration would leave the fs half-installed.
+		 */
+		cmn_err(CE_WARN, "zfs: bad vfs ops template");
+		return (error);
+	}
+
+	error = zfs_create_op_tables();
+	if (error) {
+		zfs_remove_op_tables();
+		cmn_err(CE_WARN, "zfs: bad vnode ops template");
+		(void) vfs_freevfsops_by_type(zfsfstype);
+		return (error);
+	}
+
+	mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+	/*
+	 * unique major number for all zfs mounts
+	 */
+	if ((zfs_major = getudev()) == (major_t)-1) {
+		cmn_err(CE_WARN,
+		    "zfs_vfsinit: Can't get unique device number.");
+		zfs_remove_op_tables();
+		(void) vfs_freevfsops_by_type(zfsfstype);
+		/*
+		 * 'error' is 0 at this point (all previous calls
+		 * succeeded), so return an explicit errno rather than
+		 * falsely reporting success.
+		 */
+		return (ENXIO);
+	}
+	zfs_minor = 0;
+
+	return (0);
+}
+
+/*
+ * Module-load initialization (called from _init()): set up the
+ * .zfs control directory structures, then the znode cache and
+ * vnode ops tables.
+ */
+void
+zfs_init(void)
+{
+	/* .zfs directory (snapshot) infrastructure first. */
+	zfsctl_init();
+
+	/* Then the znode cache, vnode ops, etc. */
+	zfs_znode_init();
+}
+
+/*
+ * Module-unload teardown (called from _fini()): undo zfs_init().
+ */
+void
+zfs_fini(void)
+{
+	zfsctl_fini();
+	zfs_znode_fini();
+}
+
+/*
+ * Report whether any zfs file systems are currently mounted, so the
+ * module framework can refuse to unload while the fs is in use.
+ */
+int
+zfs_busy(void)
+{
+	if (zfs_active_fs_count == 0)
+		return (0);
+	return (1);
+}
+
+/*
+ * File system registration table: version, fs name, init routine,
+ * capability flags, and the supported mount options.
+ */
+static vfsdef_t vfw = {
+	VFSDEF_VERSION,
+	MNTTYPE_ZFS,
+	zfs_vfsinit,
+	VSW_HASPROTO | VSW_CANRWRO | VSW_CANREMOUNT | VSW_VOLATILEDEV,
+	&zfs_mntopts
+};
+
+/*
+ * Module linkage information handed to the kernel module framework.
+ */
+struct modlfs zfs_modlfs = {
+	&mod_fsops, "ZFS filesystem version 1", &vfw
+};
diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c
new file mode 100644
index 0000000000..eb9964aa20
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c
@@ -0,0 +1,3663 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/taskq.h>
+#include <sys/uio.h>
+#include <sys/vmsystm.h>
+#include <sys/atomic.h>
+#include <vm/seg_vn.h>
+#include <vm/pvn.h>
+#include <vm/as.h>
+#include <sys/mman.h>
+#include <sys/pathname.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/refcount.h> /* temporary for debugging purposes */
+#include <sys/dbuf.h>
+#include <sys/zap.h>
+#include <sys/dirent.h>
+#include <sys/policy.h>
+#include <sys/sunddi.h>
+#include <sys/filio.h>
+#include "fs/fs_subr.h"
+#include <sys/zfs_ctldir.h>
+
+/*
+ * Programming rules.
+ *
+ * Each vnode op performs some logical unit of work. To do this, the ZPL must
+ * properly lock its in-core state, create a DMU transaction, do the work,
+ * record this work in the intent log (ZIL), commit the DMU transaction,
+ * and wait the the intent log to commit if it's is a synchronous operation.
+ * Morover, the vnode ops must work in both normal and log replay context.
+ * The ordering of events is important to avoid deadlocks and references
+ * to freed memory. The example below illustrates the following Big Rules:
+ *
+ * (1) A check must be made in each zfs thread for a mounted file system.
+ * This is done avoiding races using ZFS_ENTER(zfsvfs).
+ * A ZFS_EXIT(zfsvfs) is needed before all returns.
+ *
+ * (2) VN_RELE() should always be the last thing except for zil_commit()
+ * and ZFS_EXIT(). This is for 3 reasons:
+ * First, if it's the last reference, the vnode/znode
+ * can be freed, so the zp may point to freed memory. Second, the last
+ * reference will call zfs_zinactive(), which may induce a lot of work --
+ * pushing cached pages (which requires z_grow_lock) and syncing out
+ * cached atime changes. Third, zfs_zinactive() may require a new tx,
+ * which could deadlock the system if you were already holding one.
+ *
+ * (3) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
+ * In normal operation, this will be TXG_NOWAIT. During ZIL replay,
+ * it will be a specific txg. Either way, dmu_tx_assign() never blocks.
+ * This is critical because we don't want to block while holding locks.
+ * Note, in particular, that if a lock is sometimes acquired before
+ * the tx assigns, and sometimes after (e.g. z_lock), then failing to
+ * use a non-blocking assign can deadlock the system. The scenario:
+ *
+ * Thread A has grabbed a lock before calling dmu_tx_assign().
+ * Thread B is in an already-assigned tx, and blocks for this lock.
+ * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
+ * forever, because the previous txg can't quiesce until B's tx commits.
+ *
+ * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
+ * then drop all locks, call txg_wait_open(), and try again.
+ *
+ * (4) If the operation succeeded, generate the intent log entry for it
+ * before dropping locks. This ensures that the ordering of events
+ * in the intent log matches the order in which they actually occurred.
+ *
+ * (5) At the end of each vnode op, the DMU tx must always commit,
+ * regardless of whether there were any errors.
+ *
+ * (6) After dropping all locks, invoke zil_commit(zilog, seq, ioflag)
+ * to ensure that synchronous semantics are provided when necessary.
+ *
+ * In general, this is how things should be ordered in each vnode op:
+ *
+ * ZFS_ENTER(zfsvfs); // exit if unmounted
+ * top:
+ * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD())
+ * rw_enter(...); // grab any other locks you need
+ * tx = dmu_tx_create(...); // get DMU tx
+ * dmu_tx_hold_*(); // hold each object you might modify
+ * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign
+ * if (error) {
+ * dmu_tx_abort(tx); // abort DMU tx
+ * rw_exit(...); // drop locks
+ * zfs_dirent_unlock(dl); // unlock directory entry
+ * VN_RELE(...); // release held vnodes
+ * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ * txg_wait_open(dmu_objset_pool(os), 0);
+ * goto top;
+ * }
+ * ZFS_EXIT(zfsvfs); // finished in zfs
+ * return (error); // really out of space
+ * }
+ * error = do_real_work(); // do whatever this VOP does
+ * if (error == 0)
+ * seq = zfs_log_*(...); // on success, make ZIL entry
+ * dmu_tx_commit(tx); // commit DMU tx -- error or not
+ * rw_exit(...); // drop locks
+ * zfs_dirent_unlock(dl); // unlock directory entry
+ * VN_RELE(...); // release held vnodes
+ * zil_commit(zilog, seq, ioflag); // synchronous when necessary
+ * ZFS_EXIT(zfsvfs); // finished in zfs
+ * return (error); // done, report error
+ */
+
+/*
+ * VOP_OPEN() entry point: nothing to do at open time; always succeeds.
+ */
+/* ARGSUSED */
+static int
+zfs_open(vnode_t **vpp, int flag, cred_t *cr)
+{
+	return (0);
+}
+
+/*
+ * VOP_CLOSE() entry point: release any file locks and share
+ * reservations held by the closing process on this vnode.
+ */
+/* ARGSUSED */
+static int
+zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
+{
+	/*
+	 * Clean up any locks held by this process on the vp.
+	 */
+	cleanlocks(vp, ddi_get_pid(), 0);
+	cleanshares(vp, ddi_get_pid());
+
+	return (0);
+}
+
+/*
+ * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
+ * data (cmd == _FIO_SEEK_DATA).  "off" is an in/out parameter.
+ */
+static int
+zfs_holey(vnode_t *vp, int cmd, offset_t *off)
+{
+	znode_t *zp = VTOZ(vp);
+	uint64_t noff = (uint64_t)*off;	/* new offset */
+	uint64_t file_sz;
+	boolean_t hole;
+	int error;
+
+	/* Hold off file restructuring while we probe the object. */
+	rw_enter(&zp->z_grow_lock, RW_READER);
+	file_sz = zp->z_phys->zp_size;
+	if (noff >= file_sz) {
+		rw_exit(&zp->z_grow_lock);
+		return (ENXIO);
+	}
+
+	hole = (cmd == _FIO_SEEK_HOLE) ? B_TRUE : B_FALSE;
+
+	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
+	rw_exit(&zp->z_grow_lock);
+
+	if (error == ESRCH || noff > file_sz) {
+		/*
+		 * Nothing found before end of file.  A hole search
+		 * still succeeds, thanks to the implicit hole that
+		 * follows the last byte of every file.
+		 */
+		if (hole) {
+			*off = file_sz;
+			return (0);
+		}
+		return (ENXIO);
+	}
+
+	/* Never move the offset backwards. */
+	if (noff < *off)
+		return (error);
+	*off = noff;
+	return (error);
+}
+
+/*
+ * VOP_IOCTL() entry point.  Supports flushing the file system
+ * (_FIOFFS) and the lseek(2) hole/data probes (_FIO_SEEK_HOLE,
+ * _FIO_SEEK_DATA); any other command returns ENOTTY.
+ */
+/* ARGSUSED */
+static int
+zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
+    int *rvalp)
+{
+	offset_t off;
+	int error;
+	zfsvfs_t *zfsvfs;
+
+	switch (com) {
+	case _FIOFFS:
+		return (zfs_sync(vp->v_vfsp, 0, cred));
+
+	case _FIO_SEEK_DATA:
+	case _FIO_SEEK_HOLE:
+		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
+			return (EFAULT);
+
+		zfsvfs = VTOZ(vp)->z_zfsvfs;
+		ZFS_ENTER(zfsvfs);
+
+		/* offset parameter is in/out */
+		error = zfs_holey(vp, com, &off);
+		ZFS_EXIT(zfsvfs);
+		if (error)
+			return (error);
+		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
+			return (EFAULT);
+		return (0);
+	}
+	return (ENOTTY);
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages.  What this means:
+ *
+ * On Write:	If we find a memory mapped page, we write to *both*
+ *		the page and the dmu buffer.
+ *
+ * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
+ *	the file is memory mapped.
+ */
+static int
+mappedwrite(vnode_t *vp, uint64_t woff, int nbytes, uio_t *uio, dmu_tx_t *tx)
+{
+	znode_t	*zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int64_t	start, off;
+	int len = nbytes;
+	int error = 0;
+
+	start = uio->uio_loffset;
+	off = start & PAGEOFFSET;
+	/* Walk the range one page at a time. */
+	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
+		page_t *pp;
+		uint64_t bytes = MIN(PAGESIZE - off, len);
+
+		/*
+		 * We don't want a new page to "appear" in the middle of
+		 * the file update (because it may not get the write
+		 * update data), so we grab a lock to block
+		 * zfs_getpage().
+		 */
+		rw_enter(&zp->z_map_lock, RW_WRITER);
+		if (pp = page_lookup(vp, start, SE_SHARED)) {
+			caddr_t va;
+
+			/*
+			 * Once the page itself is locked (SE_SHARED),
+			 * the map lock can be dropped.
+			 */
+			rw_exit(&zp->z_map_lock);
+			va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1L);
+			error = uiomove(va+off, bytes, UIO_WRITE, uio);
+			if (error == 0) {
+				/* Mirror the update into the dmu buffer. */
+				dmu_write(zfsvfs->z_os, zp->z_id,
+				    woff, bytes, va+off, tx);
+			}
+			ppmapout(va);
+			page_unlock(pp);
+		} else {
+			/* No cached page: write the dmu buffer only. */
+			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
+			    woff, bytes, uio, tx);
+			rw_exit(&zp->z_map_lock);
+		}
+		len -= bytes;
+		woff += bytes;
+		off = 0;
+		if (error)
+			break;
+	}
+	return (error);
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages.  What this means:
+ *
+ * On Read:	We "read" preferentially from memory mapped pages,
+ *		else we default from the dmu buffer.
+ *
+ * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
+ *	the file is memory mapped.
+ *
+ * NOTE(review): "addr" appears to point at the caller's copy of the
+ * file data (a dmu buffer in zfs_read()) — confirm against callers.
+ */
+static int
+mappedread(vnode_t *vp, char *addr, int nbytes, uio_t *uio)
+{
+	int64_t	start, off, bytes;
+	int len = nbytes;
+	int error = 0;
+
+	start = uio->uio_loffset;
+	off = start & PAGEOFFSET;
+	/* Walk the range one page at a time. */
+	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
+		page_t *pp;
+
+		bytes = MIN(PAGESIZE - off, len);
+		if (pp = page_lookup(vp, start, SE_SHARED)) {
+			caddr_t va;
+
+			va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1L);
+			error = uiomove(va + off, bytes, UIO_READ, uio);
+			ppmapout(va);
+			page_unlock(pp);
+		} else {
+			/* XXX use dmu_read here? */
+			error = uiomove(addr, bytes, UIO_READ, uio);
+		}
+		len -= bytes;
+		addr += bytes;
+		off = 0;
+		if (error)
+			break;
+	}
+	return (error);
+}
+
+/*
+ * Upper bound on the number of bytes handled per dmu buffer batch
+ * in zfs_read().
+ */
+uint_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
+
+/*
+ * Read bytes from specified file into supplied buffer.
+ *
+ *	IN:	vp	- vnode of file to be read from.
+ *		uio	- structure supplying read location, range info,
+ *			  and return buffer.
+ *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
+ *		cr	- credentials of caller.
+ *
+ *	OUT:	uio	- updated offset and range, buffer filled.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Side Effects:
+ *	vp - atime updated if byte count > 0
+ */
+/* ARGSUSED */
+static int
+zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
+{
+	znode_t	*zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	uint64_t delta;
+	ssize_t	n, size, cnt, ndone;
+	int error, i, numbufs;
+	dmu_buf_t *dbp, **dbpp;
+
+	ZFS_ENTER(zfsvfs);
+
+	/*
+	 * Validate file offset
+	 */
+	if (uio->uio_loffset < (offset_t)0) {
+		ZFS_EXIT(zfsvfs);
+		return (EINVAL);
+	}
+
+	/*
+	 * Fasttrack empty reads
+	 */
+	if (uio->uio_resid == 0) {
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/*
+	 * Check for mandatory region locks
+	 */
+	if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
+		if (error = chklock(vp, FREAD,
+		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+	}
+
+	/*
+	 * If we're in FRSYNC mode, sync out this znode before reading it.
+	 */
+	zil_commit(zfsvfs->z_log, zp->z_last_itx, ioflag & FRSYNC);
+
+	/*
+	 * Make sure nobody restructures the file (changes block size)
+	 * in the middle of the read.
+	 */
+	rw_enter(&zp->z_grow_lock, RW_READER);
+	/*
+	 * If we are reading past end-of-file we can skip
+	 * to the end; but we might still need to set atime.
+	 */
+	if (uio->uio_loffset >= zp->z_phys->zp_size) {
+		cnt = 0;
+		error = 0;
+		goto out;
+	}
+
+	/* Total bytes to transfer, clipped at end of file. */
+	cnt = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
+
+	/* Transfer in chunks of at most zfs_read_chunk_size bytes. */
+	for (ndone = 0; ndone < cnt; ndone += zfs_read_chunk_size) {
+		ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
+		n = MIN(zfs_read_chunk_size,
+		    zp->z_phys->zp_size - uio->uio_loffset);
+		n = MIN(n, cnt);
+		dbpp = dmu_buf_hold_array(zfsvfs->z_os, zp->z_id,
+		    uio->uio_loffset, n, &numbufs);
+		if (error = dmu_buf_read_array_canfail(dbpp, numbufs)) {
+			dmu_buf_rele_array(dbpp, numbufs);
+			goto out;
+		}
+		/*
+		 * Compute the adjustment to align the dmu buffers
+		 * with the uio buffer.
+		 */
+		delta = uio->uio_loffset - dbpp[0]->db_offset;
+
+		for (i = 0; i < numbufs; i++) {
+			if (n < 0)
+				break;
+			dbp = dbpp[i];
+			size = dbp->db_size - delta;
+			/*
+			 * XXX -- this is correct, but may be suboptimal.
+			 * If the pages are all clean, we don't need to
+			 * go through mappedread().  Maybe the VMODSORT
+			 * stuff can help us here.
+			 */
+			if (vn_has_cached_data(vp)) {
+				error = mappedread(vp, (caddr_t)dbp->db_data +
+				    delta, (n < size ? n : size), uio);
+			} else {
+				error = uiomove((caddr_t)dbp->db_data + delta,
+				    (n < size ? n : size), UIO_READ, uio);
+			}
+			if (error) {
+				dmu_buf_rele_array(dbpp, numbufs);
+				goto out;
+			}
+			n -= dbp->db_size;
+			if (delta) {
+				n += delta;
+				delta = 0;
+			}
+		}
+		dmu_buf_rele_array(dbpp, numbufs);
+	}
+out:
+	rw_exit(&zp->z_grow_lock);
+
+	/* atime is stamped even for EOF reads and errors. */
+	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Fault in the pages of the first n bytes specified by the uio structure.
+ * 1 byte in each page is touched and the uio struct is unmodified.
+ * Any error will exit this routine as this is only a best
+ * attempt to get the pages resident.  This is a copy of ufs_trans_touch().
+ *
+ * This keeps slow page sources (e.g. NFS-backed user buffers) from
+ * stalling a transaction later in zfs_write().
+ */
+static void
+zfs_prefault_write(ssize_t n, struct uio *uio)
+{
+	struct iovec *iov;
+	ulong_t cnt, incr;
+	caddr_t p;
+	uint8_t tmp;
+
+	iov = uio->uio_iov;
+
+	while (n) {
+		cnt = MIN(iov->iov_len, n);
+		if (cnt == 0) {
+			/* empty iov entry */
+			iov++;
+			continue;
+		}
+		n -= cnt;
+		/*
+		 * touch each page in this segment.
+		 */
+		p = iov->iov_base;
+		while (cnt) {
+			switch (uio->uio_segflg) {
+			case UIO_USERSPACE:
+			case UIO_USERISPACE:
+				/* best-effort: abandon on fault */
+				if (fuword8(p, &tmp))
+					return;
+				break;
+			case UIO_SYSSPACE:
+				if (kcopy(p, &tmp, 1))
+					return;
+				break;
+			}
+			incr = MIN(cnt, PAGESIZE);
+			p += incr;
+			cnt -= incr;
+		}
+		/*
+		 * touch the last byte in case it straddles a page.
+		 */
+		p--;
+		switch (uio->uio_segflg) {
+		case UIO_USERSPACE:
+		case UIO_USERISPACE:
+			if (fuword8(p, &tmp))
+				return;
+			break;
+		case UIO_SYSSPACE:
+			if (kcopy(p, &tmp, 1))
+				return;
+			break;
+		}
+		iov++;
+	}
+}
+
+/*
+ * Write the bytes to a file.
+ *
+ *	IN:	vp	- vnode of file to be written to.
+ *		uio	- structure supplying write location, range info,
+ *			  and data buffer.
+ *		ioflag	- FAPPEND flag set if in append mode.
+ *		cr	- credentials of caller.
+ *
+ *	OUT:	uio	- updated offset and range.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	vp - ctime|mtime updated if byte count > 0
+ *
+ * Note: zfs_write() holds z_append_lock across calls to txg_wait_open().
+ * It has to because of the semantics of FAPPEND.  The implication is that
+ * we must never grab z_append_lock while in an assigned tx.
+ */
+/* ARGSUSED */
+static int
+zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
+{
+	znode_t	*zp = VTOZ(vp);
+	rlim64_t limit = uio->uio_llimit;
+	ssize_t	start_resid = uio->uio_resid;
+	ssize_t	tx_bytes;
+	uint64_t end_size;
+	dmu_tx_t *tx;
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	zilog_t	*zilog = zfsvfs->z_log;
+	uint64_t seq = 0;
+	offset_t woff;
+	ssize_t	n, nbytes;
+	int max_blksz = zfsvfs->z_max_blksz;
+	int need_append_lock, error;
+	krw_t grow_rw = RW_READER;
+
+	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
+		limit = MAXOFFSET_T;
+
+	n = start_resid;
+
+	/*
+	 * Fasttrack empty write
+	 */
+	if (n == 0)
+		return (0);
+
+	ZFS_ENTER(zfsvfs);
+
+	/*
+	 * Pre-fault the pages to ensure slow (eg NFS) pages don't hold up txg
+	 */
+	zfs_prefault_write(MIN(start_resid, SPA_MAXBLOCKSIZE), uio);
+
+	/*
+	 * If in append mode, set the io offset pointer to eof.
+	 */
+	need_append_lock = ioflag & FAPPEND;
+	if (need_append_lock) {
+		rw_enter(&zp->z_append_lock, RW_WRITER);
+		woff = uio->uio_loffset = zp->z_phys->zp_size;
+	} else {
+		woff = uio->uio_loffset;
+		/*
+		 * Validate file offset
+		 */
+		if (woff < 0) {
+			ZFS_EXIT(zfsvfs);
+			return (EINVAL);
+		}
+
+		/*
+		 * If this write could change the file length,
+		 * we need to synchronize with "appenders".
+		 */
+		if (woff < limit - n && woff + n > zp->z_phys->zp_size) {
+			need_append_lock = TRUE;
+			rw_enter(&zp->z_append_lock, RW_READER);
+		}
+	}
+
+	if (woff >= limit) {
+		error = EFBIG;
+		goto no_tx_done;
+	}
+
+	/* Clip the transfer to the resource limit. */
+	if ((woff + n) > limit || woff > (limit - n))
+		n = limit - woff;
+
+	/*
+	 * Check for mandatory region locks
+	 */
+	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
+	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0)
+		goto no_tx_done;
+top:
+	/*
+	 * Make sure nobody restructures the file (changes block size)
+	 * in the middle of the write.
+	 */
+	rw_enter(&zp->z_grow_lock, grow_rw);
+
+	end_size = MAX(zp->z_phys->zp_size, woff + n);
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_bonus(tx, zp->z_id);
+	dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
+	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	if (error) {
+		dmu_tx_abort(tx);
+		rw_exit(&zp->z_grow_lock);
+		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+			/* drop locks, wait for the next open txg, retry */
+			txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+			goto top;
+		}
+		goto no_tx_done;
+	}
+
+	if (end_size > zp->z_blksz &&
+	    (!ISP2(zp->z_blksz) || zp->z_blksz < max_blksz)) {
+		uint64_t new_blksz;
+		/*
+		 * This write will increase the file size beyond
+		 * the current block size so increase the block size.
+		 */
+		if (grow_rw == RW_READER && !rw_tryupgrade(&zp->z_grow_lock)) {
+			/*
+			 * Can't upgrade in place; retry from the top
+			 * taking the grow lock as writer.
+			 */
+			dmu_tx_commit(tx);
+			rw_exit(&zp->z_grow_lock);
+			grow_rw = RW_WRITER;
+			goto top;
+		}
+		if (zp->z_blksz > max_blksz) {
+			ASSERT(!ISP2(zp->z_blksz));
+			new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
+		} else {
+			new_blksz = MIN(end_size, max_blksz);
+		}
+		error = zfs_grow_blocksize(zp, new_blksz, tx);
+		if (error) {
+			tx_bytes = 0;
+			goto tx_done;
+		}
+	}
+
+	if (grow_rw == RW_WRITER) {
+		rw_downgrade(&zp->z_grow_lock);
+		grow_rw = RW_READER;
+	}
+
+	/*
+	 * The file data does not fit in the znode "cache", so we
+	 * will be writing to the file block data buffers.
+	 * Each buffer will be written in a separate transaction;
+	 * this keeps the intent log records small and allows us
+	 * to do more fine-grained space accounting.
+	 */
+	while (n > 0) {
+		/*
+		 * XXX - should we really limit each write to z_max_blksz?
+		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
+		 */
+		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
+		rw_enter(&zp->z_map_lock, RW_READER);
+
+		tx_bytes = uio->uio_resid;
+		if (vn_has_cached_data(vp)) {
+			rw_exit(&zp->z_map_lock);
+			error = mappedwrite(vp, woff, nbytes, uio, tx);
+		} else {
+			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
+			    woff, nbytes, uio, tx);
+			rw_exit(&zp->z_map_lock);
+		}
+		/* bytes actually consumed from the uio this pass */
+		tx_bytes -= uio->uio_resid;
+
+		if (error) {
+			/* XXX - do we need to "clean up" the dmu buffer? */
+			break;
+		}
+
+		ASSERT(tx_bytes == nbytes);
+
+		n -= nbytes;
+		if (n <= 0)
+			break;
+
+		/*
+		 * We have more work ahead of us, so wrap up this transaction
+		 * and start another.  Exact same logic as tx_done below.
+		 */
+		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) {
+			dmu_buf_will_dirty(zp->z_dbuf, tx);
+			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
+			    uio->uio_loffset);
+		}
+		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+		seq = zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes,
+		    ioflag, uio);
+		dmu_tx_commit(tx);
+
+		/* Pre-fault the next set of pages */
+		zfs_prefault_write(MIN(n, SPA_MAXBLOCKSIZE), uio);
+
+		/*
+		 * Start another transaction.
+		 */
+		woff = uio->uio_loffset;
+		tx = dmu_tx_create(zfsvfs->z_os);
+		dmu_tx_hold_bonus(tx, zp->z_id);
+		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
+		error = dmu_tx_assign(tx, zfsvfs->z_assign);
+		if (error) {
+			dmu_tx_abort(tx);
+			rw_exit(&zp->z_grow_lock);
+			if (error == ERESTART &&
+			    zfsvfs->z_assign == TXG_NOWAIT) {
+				txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+				goto top;
+			}
+			goto no_tx_done;
+		}
+	}
+
+tx_done:
+	/* tx_bytes is 0 only on the zfs_grow_blocksize() failure path. */
+	if (tx_bytes != 0) {
+		/*
+		 * Update the file size if it has changed; account
+		 * for possible concurrent updates.
+		 */
+		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) {
+			dmu_buf_will_dirty(zp->z_dbuf, tx);
+			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
+			    uio->uio_loffset);
+		}
+		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+		seq = zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes,
+		    ioflag, uio);
+	}
+	dmu_tx_commit(tx);
+
+	rw_exit(&zp->z_grow_lock);
+
+no_tx_done:
+
+	if (need_append_lock)
+		rw_exit(&zp->z_append_lock);
+
+	/*
+	 * If we're in replay mode, or we made no progress, return error.
+	 * Otherwise, it's at least a partial write, so it's successful.
+	 */
+	if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	zil_commit(zilog, seq, ioflag & (FSYNC | FDSYNC));
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ *
+ * Called with a lr_write_t whose lr_offset/lr_length describe the
+ * file range; fills in either the inline data (immediate write) or
+ * the block pointer (indirect write).
+ */
+int
+zfs_get_data(void *arg, lr_write_t *lr)
+{
+	zfsvfs_t *zfsvfs = arg;
+	objset_t *os = zfsvfs->z_os;
+	znode_t *zp;
+	uint64_t off = lr->lr_offset;
+	int dlen = lr->lr_length;  		/* length of user data */
+	int reclen = lr->lr_common.lrc_reclen;
+	int error = 0;
+
+	ASSERT(dlen != 0);
+
+	/*
+	 * Nothing to do if the file has been removed or truncated.
+	 */
+	if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
+		return (ENOENT);
+	if (off >= zp->z_phys->zp_size || zp->z_reap) {
+		VN_RELE(ZTOV(zp));
+		return (ENOENT);
+	}
+
+	/*
+	 * Write records come in two flavors: immediate and indirect.
+	 * For small writes it's cheaper to store the data with the
+	 * log record (immediate); for large writes it's cheaper to
+	 * sync the data and get a pointer to it (indirect) so that
+	 * we don't have to write the data twice.
+	 */
+	if (sizeof (lr_write_t) + dlen <= reclen) { /* immediate write */
+		rw_enter(&zp->z_grow_lock, RW_READER);
+		/* Copy the file data directly after the log record. */
+		dmu_buf_t *db = dmu_buf_hold(os, lr->lr_foid, off);
+		dmu_buf_read(db);
+		bcopy((char *)db->db_data + off - db->db_offset, lr + 1, dlen);
+		dmu_buf_rele(db);
+		rw_exit(&zp->z_grow_lock);
+	} else {
+		/*
+		 * We have to grab z_grow_lock as RW_WRITER because
+		 * dmu_sync() can't handle concurrent dbuf_dirty() (6313856).
+		 * z_grow_lock will be replaced with a range lock soon,
+		 * which will eliminate the concurrency hit, but dmu_sync()
+		 * really needs more thought.  It shouldn't have to rely on
+		 * the caller to provide MT safety.
+		 */
+		rw_enter(&zp->z_grow_lock, RW_WRITER);
+		txg_suspend(dmu_objset_pool(os));
+		error = dmu_sync(os, lr->lr_foid, off, &lr->lr_blkoff,
+		    &lr->lr_blkptr, lr->lr_common.lrc_txg);
+		txg_resume(dmu_objset_pool(os));
+		rw_exit(&zp->z_grow_lock);
+	}
+	VN_RELE(ZTOV(zp));
+	return (error);
+}
+
+/*
+ * VOP_ACCESS() entry point: check the requested access mode against
+ * the znode via the ZFS ACL machinery (zfs_zaccess_rwx()).
+ */
+/*ARGSUSED*/
+static int
+zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+	error = zfs_zaccess_rwx(zp, mode, cr);
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Lookup an entry in a directory, or an extended attribute directory.
+ * If it exists, return a held vnode reference for it.
+ *
+ *	IN:	dvp	- vnode of directory to search.
+ *		nm	- name of entry to lookup.
+ *		pnp	- full pathname to lookup [UNUSED].
+ *		flags	- LOOKUP_XATTR set if looking for an attribute.
+ *		rdir	- root directory vnode [UNUSED].
+ *		cr	- credentials of caller.
+ *
+ *	OUT:	vpp	- vnode of located entry, NULL if not found.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	NA
+ */
+/* ARGSUSED */
+static int
+zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
+    int flags, vnode_t *rdir, cred_t *cr)
+{
+
+	znode_t *zdp = VTOZ(dvp);
+	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
+	int	error;
+
+	ZFS_ENTER(zfsvfs);
+
+	*vpp = NULL;
+
+	if (flags & LOOKUP_XATTR) {
+		/*
+		 * We don't allow recursive attributes..
+		 * Maybe someday we will.
+		 */
+		if (zdp->z_phys->zp_flags & ZFS_XATTR) {
+			ZFS_EXIT(zfsvfs);
+			return (EINVAL);
+		}
+
+		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr)) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+
+		/*
+		 * Do we have permission to get into attribute directory?
+		 * On failure the hold taken above is dropped; callers
+		 * must check the error before using *vpp.
+		 */
+
+		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, cr)) {
+			VN_RELE(*vpp);
+		}
+
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Check accessibility of directory.
+	 */
+
+	if (error = zfs_zaccess(zdp, ACE_EXECUTE, cr)) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if ((error = zfs_dirlook(zdp, nm, vpp)) == 0) {
+
+		/*
+		 * Convert device special files
+		 */
+		if (IS_DEVVP(*vpp)) {
+			vnode_t	*svp;
+
+			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
+			VN_RELE(*vpp);
+			if (svp == NULL)
+				error = ENOSYS;
+			else
+				*vpp = svp;
+		}
+	}
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Attempt to create a new entry in a directory. If the entry
+ * already exists, truncate the file if permissible, else return
+ * an error. Return the vp of the created or trunc'd file.
+ *
+ * IN: dvp - vnode of directory to put new file entry in.
+ * name - name of new file entry.
+ * vap - attributes of new file.
+ * excl - flag indicating exclusive or non-exclusive mode.
+ * mode - mode to open file with.
+ * cr - credentials of caller.
+ * flag - large file flag [UNUSED].
+ *
+ * OUT: vpp - vnode of created or trunc'd entry.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated if new entry created
+ * vp - ctime|mtime always, atime if new
+ */
+/* ARGSUSED */
+static int
+zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
+ int mode, vnode_t **vpp, cred_t *cr, int flag)
+{
+ znode_t *zp, *dzp = VTOZ(dvp);
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ uint64_t seq = 0;
+ objset_t *os = zfsvfs->z_os;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ int error;
+ uint64_t zoid;
+
+ ZFS_ENTER(zfsvfs);
+
+ /*
+ * Restart point: we come back here (with all locks and the tx
+ * dropped) when dmu_tx_assign() returns ERESTART below.
+ */
+top:
+ *vpp = NULL;
+
+ if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
+ vap->va_mode &= ~VSVTX;
+
+ if (*name == '\0') {
+ /*
+ * Null component name refers to the directory itself.
+ */
+ VN_HOLD(dvp);
+ zp = dzp;
+ dl = NULL;
+ error = 0;
+ } else {
+ /* possible VN_HOLD(zp) */
+ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, 0)) {
+ if (strcmp(name, "..") == 0)
+ error = EISDIR;
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ /* Object id of any existing entry; -1ULL means we will create one. */
+ zoid = zp ? zp->z_id : -1ULL;
+
+ if (zp == NULL) {
+ /*
+ * Create a new file object and update the directory
+ * to reference it.
+ */
+ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
+ goto out;
+ }
+
+ /*
+ * We only support the creation of regular files in
+ * extended attribute directories.
+ */
+ if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
+ (vap->va_type != VREG)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+ dmu_tx_hold_bonus(tx, dzp->z_id);
+ dmu_tx_hold_zap(tx, dzp->z_id, 1);
+ if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, SPA_MAXBLOCKSIZE);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ dmu_tx_abort(tx);
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART &&
+ zfsvfs->z_assign == TXG_NOWAIT) {
+ txg_wait_open(dmu_objset_pool(os), 0);
+ goto top;
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
+ ASSERT(zp->z_id == zoid);
+ (void) zfs_link_create(dl, zp, tx, ZNEW);
+ seq = zfs_log_create(zilog, tx, TX_CREATE, dzp, zp, name);
+ dmu_tx_commit(tx);
+ } else {
+ /*
+ * A directory entry already exists for this name.
+ */
+ /*
+ * Can't truncate an existing file if in exclusive mode.
+ */
+ if (excl == EXCL) {
+ error = EEXIST;
+ goto out;
+ }
+ /*
+ * Can't open a directory for writing.
+ */
+ if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
+ error = EISDIR;
+ goto out;
+ }
+ /*
+ * Verify requested access to file.
+ */
+ if (mode && (error = zfs_zaccess_rwx(zp, mode, cr))) {
+ goto out;
+ }
+ /*
+ * Truncate regular files if requested.
+ */
+
+ /*
+ * Need to update dzp->z_seq?
+ */
+
+ mutex_enter(&dzp->z_lock);
+ dzp->z_seq++;
+ mutex_exit(&dzp->z_lock);
+
+ if ((ZTOV(zp)->v_type == VREG) && (zp->z_phys->zp_size != 0) &&
+ (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
+ /*
+ * Truncate the file.
+ */
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, zoid);
+ dmu_tx_hold_free(tx, zoid, 0, DMU_OBJECT_END);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ dmu_tx_abort(tx);
+ if (dl)
+ zfs_dirent_unlock(dl);
+ VN_RELE(ZTOV(zp));
+ if (error == ERESTART &&
+ zfsvfs->z_assign == TXG_NOWAIT) {
+ txg_wait_open(dmu_objset_pool(os), 0);
+ goto top;
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ /*
+ * Grab the grow_lock to serialize this change with
+ * respect to other file manipulations.
+ */
+ rw_enter(&zp->z_grow_lock, RW_WRITER);
+ error = zfs_freesp(zp, 0, 0, mode, tx, cr);
+ if (error == 0) {
+ zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+ seq = zfs_log_truncate(zilog, tx,
+ TX_TRUNCATE, zp, 0, 0);
+ }
+ rw_exit(&zp->z_grow_lock);
+ dmu_tx_commit(tx);
+ }
+ }
+out:
+
+ if (dl)
+ zfs_dirent_unlock(dl);
+
+ if (error) {
+ if (zp)
+ VN_RELE(ZTOV(zp));
+ } else {
+ *vpp = ZTOV(zp);
+ /*
+ * If vnode is for a device return a specfs vnode instead.
+ */
+ if (IS_DEVVP(*vpp)) {
+ struct vnode *svp;
+
+ svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
+ VN_RELE(*vpp);
+ if (svp == NULL) {
+ error = ENOSYS;
+ }
+ *vpp = svp;
+ }
+ }
+
+ /* seq is 0 when no intent-log record was generated above. */
+ zil_commit(zilog, seq, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Remove an entry from a directory.
+ *
+ * IN: dvp - vnode of directory to remove entry from.
+ * name - name of entry to remove.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dvp - ctime|mtime
+ * vp - ctime (if nlink > 0)
+ */
+static int
+zfs_remove(vnode_t *dvp, char *name, cred_t *cr)
+{
+ znode_t *zp, *dzp = VTOZ(dvp);
+ znode_t *xzp = NULL;
+ vnode_t *vp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ uint64_t seq = 0;
+ uint64_t acl_obj, xattr_obj;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ int may_delete_now, delete_now = FALSE;
+ int reaped;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+top:
+ /*
+ * Attempt to lock directory; fail if entry doesn't exist.
+ */
+ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ vp = ZTOV(zp);
+
+ if (error = zfs_zaccess_delete(dzp, zp, cr)) {
+ goto out;
+ }
+
+ /*
+ * Check the restrictions that apply on sticky directories.
+ */
+ if (error = zfs_sticky_remove_access(dzp, zp, cr))
+ goto out;
+
+ /*
+ * Need to use rmdir for removing directories.
+ */
+ if (vp->v_type == VDIR) {
+ error = EPERM;
+ goto out;
+ }
+
+ vnevent_remove(vp);
+
+ /*
+ * Snapshot whether we hold the only reference; this is re-checked
+ * under the assigned tx below before actually deleting in-line.
+ */
+ mutex_enter(&vp->v_lock);
+ may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
+ mutex_exit(&vp->v_lock);
+
+ /*
+ * We may delete the znode now, or we may put it on the delete queue;
+ * it depends on whether we're the last link, and on whether there are
+ * other holds on the vnode. So we dmu_tx_hold() the right things to
+ * allow for either case.
+ */
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, -1);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+ if (may_delete_now)
+ dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
+
+ /* are there any extended attributes? */
+ if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
+ /*
+ * XXX - There is a possibility that the delete
+ * of the parent file could succeed, but then we get
+ * an ENOSPC when we try to delete the xattrs...
+ * so we would need to re-try the deletes periodically
+ */
+ /* XXX - do we need this if we are deleting? */
+ dmu_tx_hold_bonus(tx, xattr_obj);
+ }
+
+ /* are there any additional acls */
+ if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
+ may_delete_now)
+ dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+
+ /* charge as an update -- would be nice not to charge at all */
+ dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, -1);
+
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ dmu_tx_abort(tx);
+ zfs_dirent_unlock(dl);
+ VN_RELE(vp);
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+ goto top;
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Remove the directory entry.
+ */
+ error = zfs_link_destroy(dl, zp, tx, 0, &reaped);
+
+ if (error) {
+ dmu_tx_commit(tx);
+ goto out;
+ }
+
+ /*
+ * reaped is set by zfs_link_destroy() — presumably when the last
+ * link to the object was removed; TODO confirm against zfs_dir.c.
+ */
+ if (reaped) {
+ /*
+ * Re-check under v_lock that nothing new appeared (hold,
+ * cached pages, xattr or external ACL) since we built the tx.
+ */
+ mutex_enter(&vp->v_lock);
+ delete_now = may_delete_now &&
+ vp->v_count == 1 && !vn_has_cached_data(vp) &&
+ zp->z_phys->zp_xattr == xattr_obj &&
+ zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
+ mutex_exit(&vp->v_lock);
+ }
+
+ if (delete_now) {
+ /* Last reference and last link: free the znode in this tx. */
+ if (zp->z_phys->zp_xattr) {
+ error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
+ ASSERT3U(error, ==, 0);
+ ASSERT3U(xzp->z_phys->zp_links, ==, 2);
+ dmu_buf_will_dirty(xzp->z_dbuf, tx);
+ mutex_enter(&xzp->z_lock);
+ xzp->z_reap = 1;
+ xzp->z_phys->zp_links = 0;
+ mutex_exit(&xzp->z_lock);
+ zfs_dq_add(xzp, tx);
+ zp->z_phys->zp_xattr = 0; /* probably unnecessary */
+ }
+ mutex_enter(&zp->z_lock);
+ mutex_enter(&vp->v_lock);
+ vp->v_count--;
+ ASSERT3U(vp->v_count, ==, 0);
+ mutex_exit(&vp->v_lock);
+ zp->z_active = 0;
+ mutex_exit(&zp->z_lock);
+ zfs_znode_delete(zp, tx);
+ VFS_RELE(zfsvfs->z_vfs);
+ } else if (reaped) {
+ zfs_dq_add(zp, tx);
+ }
+
+ seq = zfs_log_remove(zilog, tx, TX_REMOVE, dzp, name);
+
+ dmu_tx_commit(tx);
+out:
+ zfs_dirent_unlock(dl);
+
+ if (!delete_now) {
+ VN_RELE(vp);
+ } else if (xzp) {
+ /* this rele delayed to prevent nesting transactions */
+ VN_RELE(ZTOV(xzp));
+ }
+
+ zil_commit(zilog, seq, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Create a new directory and insert it into dvp using the name
+ * provided. Return a pointer to the inserted directory.
+ *
+ * IN: dvp - vnode of directory to add subdir to.
+ * dirname - name of new directory.
+ * vap - attributes of new directory.
+ * cr - credentials of caller.
+ *
+ * OUT: vpp - vnode of created directory.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated
+ * vp - ctime|mtime|atime updated
+ */
+static int
+zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
+{
+ znode_t *zp, *dzp = VTOZ(dvp);
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ uint64_t seq = 0;
+ zfs_dirlock_t *dl;
+ uint64_t zoid = 0;
+ dmu_tx_t *tx;
+ int error;
+
+ ASSERT(vap->va_type == VDIR);
+
+ ZFS_ENTER(zfsvfs);
+
+ /*
+ * Directories may not be created inside extended attribute
+ * directories (only regular files are; see zfs_create()).
+ */
+ if (dzp->z_phys->zp_flags & ZFS_XATTR) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+ /* Restart point after dmu_tx_assign() returns ERESTART below. */
+top:
+ *vpp = NULL;
+ if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, cr)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * First make sure the new directory doesn't exist.
+ */
+ if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, ZNEW)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Add a new entry to the directory.
+ */
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, 1);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 0);
+ if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, SPA_MAXBLOCKSIZE);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ dmu_tx_abort(tx);
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+ goto top;
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Create new node.
+ */
+ zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
+
+ /*
+ * Now put new name in parent dir.
+ */
+ (void) zfs_link_create(dl, zp, tx, ZNEW);
+
+ *vpp = ZTOV(zp);
+
+ seq = zfs_log_create(zilog, tx, TX_MKDIR, dzp, zp, dirname);
+ dmu_tx_commit(tx);
+
+ zfs_dirent_unlock(dl);
+
+ zil_commit(zilog, seq, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Remove a directory subdir entry. If the current working
+ * directory is the same as the subdir to be removed, the
+ * remove will fail.
+ *
+ * IN: dvp - vnode of directory to remove from.
+ * name - name of directory to be removed.
+ * cwd - vnode of current working directory.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated
+ */
+static int
+zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
+{
+ znode_t *dzp = VTOZ(dvp);
+ znode_t *zp;
+ vnode_t *vp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ uint64_t seq = 0;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ /* Restart point after dmu_tx_assign() returns ERESTART below. */
+top:
+ zp = NULL;
+
+ /*
+ * Attempt to lock directory; fail if entry doesn't exist.
+ */
+ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ vp = ZTOV(zp);
+
+ if (error = zfs_zaccess_delete(dzp, zp, cr)) {
+ goto out;
+ }
+
+ /*
+ * Check the restrictions that apply on sticky directories.
+ */
+ if (error = zfs_sticky_remove_access(dzp, zp, cr))
+ goto out;
+
+ if (vp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ }
+
+ if (vp == cwd) {
+ error = EINVAL;
+ goto out;
+ }
+
+ vnevent_rmdir(vp);
+
+ /*
+ * Grab a lock on the parent pointer make sure we play well
+ * with the treewalk and directory rename code.
+ */
+ rw_enter(&zp->z_parent_lock, RW_WRITER);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, 1);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+ /* Presumably charges the delete-queue zap update — TODO confirm. */
+ dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ dmu_tx_abort(tx);
+ rw_exit(&zp->z_parent_lock);
+ zfs_dirent_unlock(dl);
+ VN_RELE(vp);
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+ goto top;
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ error = zfs_link_destroy(dl, zp, tx, 0, NULL);
+
+ if (error == 0)
+ seq = zfs_log_remove(zilog, tx, TX_RMDIR, dzp, name);
+
+ dmu_tx_commit(tx);
+
+ rw_exit(&zp->z_parent_lock);
+out:
+ zfs_dirent_unlock(dl);
+
+ VN_RELE(vp);
+
+ zil_commit(zilog, seq, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Read as many directory entries as will fit into the provided
+ * buffer from the given directory cursor position (specified in
+ * the uio structure.
+ *
+ * IN: vp - vnode of directory to read.
+ * uio - structure supplying read location, range info,
+ * and return buffer.
+ * cr - credentials of caller.
+ *
+ * OUT: uio - updated offset and range, buffer filled.
+ * eofp - set to true if end-of-file detected.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - atime updated
+ *
+ * Note that the low 4 bits of the cookie returned by zap is always zero.
+ * This allows us to use the low range for "special" directory entries:
+ * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
+ * we use the offset 2 for the '.zfs' directory.
+ */
+/* ARGSUSED */
+static int
+zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp)
+{
+ znode_t *zp = VTOZ(vp);
+ iovec_t *iovp;
+ dirent64_t *odp;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ caddr_t outbuf;
+ size_t bufsize;
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ uint_t bytes_wanted;
+ ushort_t this_reclen;
+ uint64_t offset; /* must be unsigned; checks for < 1 */
+ off64_t *next;
+ int local_eof;
+ int outcount = 0;
+ int error = 0;
+
+ ZFS_ENTER(zfsvfs);
+
+ /*
+ * If we are not given an eof variable,
+ * use a local one.
+ */
+ if (eofp == NULL)
+ eofp = &local_eof;
+
+ /*
+ * Check for valid iov_len.
+ */
+ if (uio->uio_iov->iov_len <= 0) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ /*
+ * Quit if directory has been removed (posix)
+ */
+ if ((*eofp = zp->z_reap) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ /*
+ * Initialize the iterator cursor.
+ */
+ offset = uio->uio_loffset;
+ if (offset <= 3) {
+ /*
+ * Start iteration from the beginning of the directory.
+ */
+ zap_cursor_init(&zc, zfsvfs->z_os, zp->z_id);
+ } else {
+ /*
+ * The offset is a serialized cursor.
+ */
+ zap_cursor_init_serialized(&zc, zfsvfs->z_os, zp->z_id,
+ offset);
+ }
+
+ /*
+ * Get space to change directory entries into fs independent format.
+ * For a single in-kernel iovec we fill the caller's buffer directly;
+ * otherwise we fill a bounce buffer and uiomove() it out at the end.
+ */
+ iovp = uio->uio_iov;
+ bytes_wanted = iovp->iov_len;
+ if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
+ bufsize = bytes_wanted;
+ outbuf = kmem_alloc(bufsize, KM_SLEEP);
+ odp = (struct dirent64 *)outbuf;
+ } else {
+ bufsize = bytes_wanted;
+ odp = (struct dirent64 *)iovp->iov_base;
+ }
+
+ /*
+ * Transform to file-system independent format
+ */
+ outcount = 0;
+ while (outcount < bytes_wanted) {
+ /*
+ * Special case `.', `..', and `.zfs'.
+ */
+ if (offset == 0) {
+ (void) strcpy(zap.za_name, ".");
+ zap.za_first_integer = zp->z_id;
+ this_reclen = DIRENT64_RECLEN(1);
+ } else if (offset == 1) {
+ (void) strcpy(zap.za_name, "..");
+ zap.za_first_integer = zp->z_phys->zp_parent;
+ this_reclen = DIRENT64_RECLEN(2);
+ } else if (offset == 2 && zfs_show_ctldir(zp)) {
+ (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
+ zap.za_first_integer = ZFSCTL_INO_ROOT;
+ this_reclen =
+ DIRENT64_RECLEN(sizeof (ZFS_CTLDIR_NAME) - 1);
+ } else {
+ /*
+ * Grab next entry.
+ */
+ if (error = zap_cursor_retrieve(&zc, &zap)) {
+ if ((*eofp = (error == ENOENT)) != 0)
+ break;
+ else
+ goto update;
+ }
+
+ if (zap.za_integer_length != 8 ||
+ zap.za_num_integers != 1) {
+ cmn_err(CE_WARN, "zap_readdir: bad directory "
+ "entry, obj = %lld, offset = %lld\n",
+ (u_longlong_t)zp->z_id,
+ (u_longlong_t)offset);
+ error = ENXIO;
+ goto update;
+ }
+ this_reclen = DIRENT64_RECLEN(strlen(zap.za_name));
+ }
+
+ /*
+ * Will this entry fit in the buffer?
+ */
+ if (outcount + this_reclen > bufsize) {
+ /*
+ * Did we manage to fit anything in the buffer?
+ */
+ if (!outcount) {
+ error = EINVAL;
+ goto update;
+ }
+ break;
+ }
+ /*
+ * Add this entry:
+ */
+ odp->d_ino = (ino64_t)zap.za_first_integer;
+ odp->d_reclen = (ushort_t)this_reclen;
+ /* NOTE: d_off is the offset for the *next* entry */
+ next = &(odp->d_off);
+ (void) strncpy(odp->d_name, zap.za_name,
+ DIRENT64_NAMELEN(this_reclen));
+ outcount += this_reclen;
+ odp = (dirent64_t *)((intptr_t)odp + this_reclen);
+
+ ASSERT(outcount <= bufsize);
+
+ /* Prefetch znode */
+ dmu_prefetch(zfsvfs->z_os, zap.za_first_integer, 0, 0);
+
+ /*
+ * Move to the next entry, fill in the previous offset.
+ */
+ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
+ zap_cursor_advance(&zc);
+ offset = zap_cursor_serialize(&zc);
+ } else {
+ offset += 1;
+ }
+ /* Back-patch d_off of the entry emitted above. */
+ *next = offset;
+ }
+
+ if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
+ /* We filled the caller's buffer directly; just advance it. */
+ iovp->iov_base += outcount;
+ iovp->iov_len -= outcount;
+ uio->uio_resid -= outcount;
+ } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
+ /*
+ * Reset the pointer.
+ */
+ offset = uio->uio_loffset;
+ }
+
+update:
+ /* Free the bounce buffer only if we allocated one above. */
+ if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
+ kmem_free(outbuf, bufsize);
+
+ if (error == ENOENT)
+ error = 0;
+
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+
+ uio->uio_loffset = offset;
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr)
+{
+ znode_t *znode = VTOZ(vp);
+ zfsvfs_t *zfsp = znode->z_zfsvfs;
+
+ /*
+ * Force all intent-log records for this file, up to and including
+ * its last itx, out to stable storage before returning.
+ */
+ ZFS_ENTER(zfsp);
+ zil_commit(zfsp->z_log, znode->z_last_itx, FSYNC);
+ ZFS_EXIT(zfsp);
+ return (0);
+}
+
+/*
+ * Get the requested file attributes and place them in the provided
+ * vattr structure.
+ *
+ * IN: vp - vnode of file.
+ * vap - va_mask identifies requested attributes.
+ * flags - [UNUSED]
+ * cr - credentials of caller.
+ *
+ * OUT: vap - attribute values.
+ *
+ * RETURN: 0 (always succeeds)
+ */
+/* ARGSUSED */
+static int
+zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ znode_phys_t *pzp = zp->z_phys;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ /*
+ * Return all attributes. It's cheaper to provide the answer
+ * than to determine whether we were asked the question.
+ */
+ mutex_enter(&zp->z_lock);
+
+ vap->va_type = vp->v_type;
+ vap->va_mode = pzp->zp_mode & MODEMASK;
+ vap->va_uid = zp->z_phys->zp_uid;
+ vap->va_gid = zp->z_phys->zp_gid;
+ vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
+ vap->va_nodeid = zp->z_id;
+ vap->va_nlink = MIN(pzp->zp_links, UINT32_MAX); /* nlink_t limit! */
+ vap->va_size = pzp->zp_size;
+ vap->va_rdev = pzp->zp_rdev;
+ vap->va_seq = zp->z_seq;
+
+ ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
+ ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
+ ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
+
+ /*
+ * Owner should be allowed to always read_attributes
+ *
+ * NOTE(review): the zfs_zaccess() error is deliberately ignored
+ * when the caller's uid matches the file owner; only non-owners
+ * get the access error returned.
+ */
+ if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, cr)) {
+ if (zp->z_phys->zp_uid != crgetuid(cr)) {
+ mutex_exit(&zp->z_lock);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ mutex_exit(&zp->z_lock);
+
+ dmu_object_size_from_db(zp->z_dbuf, &vap->va_blksize, &vap->va_nblocks);
+
+ if (zp->z_blksz == 0) {
+ /*
+ * Block size hasn't been set; suggest maximal I/O transfers.
+ */
+ vap->va_blksize = zfsvfs->z_max_blksz;
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Set the file attributes to the values contained in the
+ * vattr structure.
+ *
+ * IN: vp - vnode of file to be modified.
+ * vap - new attribute values.
+ * flags - ATTR_UTIME set if non-default time values provided.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - ctime updated, mtime updated if size changed.
+ */
+/* ARGSUSED */
+static int
+zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ struct znode *zp = VTOZ(vp);
+ znode_phys_t *pzp = zp->z_phys;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ uint64_t seq = 0;
+ dmu_tx_t *tx;
+ uint_t mask = vap->va_mask;
+ uint_t mask_applied = 0;
+ vattr_t oldva;
+ uint64_t new_mode;
+ int have_grow_lock;
+ int need_policy = FALSE;
+ int err;
+
+ if (mask == 0)
+ return (0);
+
+ if (mask & AT_NOSET)
+ return (EINVAL);
+
+ if (mask & AT_SIZE && vp->v_type == VDIR)
+ return (EISDIR);
+
+ ZFS_ENTER(zfsvfs);
+
+ /* Restart point after dmu_tx_assign() returns ERESTART below. */
+top:
+ have_grow_lock = FALSE;
+
+ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+ ZFS_EXIT(zfsvfs);
+ return (EROFS);
+ }
+
+ /*
+ * First validate permissions
+ */
+
+ if (mask & AT_SIZE) {
+ err = zfs_zaccess(zp, ACE_WRITE_DATA, cr);
+ if (err) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ }
+
+ if (mask & (AT_ATIME|AT_MTIME))
+ need_policy = zfs_zaccess_v4_perm(zp, ACE_WRITE_ATTRIBUTES, cr);
+
+ if (mask & (AT_UID|AT_GID)) {
+ int idmask = (mask & (AT_UID|AT_GID));
+ int take_owner;
+ int take_group;
+
+ /*
+ * Take ownership or chgrp to group we are a member of
+ */
+
+ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
+ take_group = (mask & AT_GID) && groupmember(vap->va_gid, cr);
+
+ /*
+ * If both AT_UID and AT_GID are set then take_owner and
+ * take_group must both be set in order to allow taking
+ * ownership.
+ *
+ * Otherwise, send the check through secpolicy_vnode_setattr()
+ *
+ */
+
+ if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
+ ((idmask == AT_UID) && take_owner) ||
+ ((idmask == AT_GID) && take_group)) {
+ if (zfs_zaccess_v4_perm(zp, ACE_WRITE_OWNER, cr) == 0) {
+ /*
+ * Remove setuid/setgid for non-privileged users
+ */
+ if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 &&
+ secpolicy_vnode_setid_retain(cr,
+ (vap->va_mode & S_ISUID) != 0 &&
+ (mask & AT_UID) != 0 &&
+ vap->va_uid == 0) != 0) {
+ vap->va_mode = pzp->zp_mode;
+ vap->va_mask |= AT_MODE;
+ vap->va_mode &= ~(S_ISUID|S_ISGID);
+ }
+ } else {
+ need_policy = TRUE;
+ }
+ } else {
+ need_policy = TRUE;
+ }
+ }
+
+ if (mask & AT_MODE)
+ need_policy = TRUE;
+
+ if (need_policy) {
+ mutex_enter(&zp->z_lock);
+ oldva.va_mode = pzp->zp_mode;
+ oldva.va_uid = zp->z_phys->zp_uid;
+ oldva.va_gid = zp->z_phys->zp_gid;
+ mutex_exit(&zp->z_lock);
+ err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
+ (int (*)(void *, int, cred_t *))zfs_zaccess_rwx, zp);
+ if (err) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ }
+
+ /*
+ * secpolicy_vnode_setattr, or take ownership may have
+ * changed va_mask
+ */
+ mask = vap->va_mask;
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+
+ if (mask & AT_MODE) {
+
+ new_mode = (pzp->zp_mode & S_IFMT) | (vap->va_mode & ~S_IFMT);
+
+ if (zp->z_phys->zp_acl.z_acl_extern_obj)
+ dmu_tx_hold_write(tx,
+ pzp->zp_acl.z_acl_extern_obj, 0, SPA_MAXBLOCKSIZE);
+ else
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, ZFS_ACL_SIZE(MAX_ACL_SIZE));
+ }
+
+ if (mask & AT_SIZE) {
+ uint64_t off = vap->va_size;
+ /*
+ * Grab the grow_lock to serialize this change with
+ * respect to other file manipulations.
+ */
+ rw_enter(&zp->z_grow_lock, RW_WRITER);
+ have_grow_lock = TRUE;
+ if (off < zp->z_phys->zp_size)
+ dmu_tx_hold_free(tx, zp->z_id, off, DMU_OBJECT_END);
+ else if (zp->z_phys->zp_size &&
+ zp->z_blksz < zfsvfs->z_max_blksz && off > zp->z_blksz)
+ /* we will rewrite this block if we grow */
+ dmu_tx_hold_write(tx, zp->z_id, 0, zp->z_phys->zp_size);
+ }
+
+ err = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (err) {
+ dmu_tx_abort(tx);
+ if (have_grow_lock)
+ rw_exit(&zp->z_grow_lock);
+ if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+ goto top;
+ }
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+
+ /*
+ * Set each attribute requested.
+ * We group settings according to the locks they need to acquire.
+ *
+ * Note: you cannot set ctime directly, although it will be
+ * updated as a side-effect of calling this function.
+ */
+ if (mask & AT_SIZE) {
+ /*
+ * XXX - Note, we are not providing any open
+ * mode flags here (like FNDELAY), so we may
+ * block if there are locks present... this
+ * should be addressed in openat().
+ */
+ err = zfs_freesp(zp, vap->va_size, 0, 0, tx, cr);
+ if (err) {
+ mutex_enter(&zp->z_lock);
+ goto out;
+ }
+ mask_applied |= AT_SIZE;
+ }
+
+ mask_applied = mask; /* no errors after this point */
+
+ mutex_enter(&zp->z_lock);
+
+ if (mask & AT_MODE) {
+ err = zfs_acl_chmod_setattr(zp, new_mode, tx);
+ ASSERT3U(err, ==, 0);
+ }
+
+ /*
+ * NOTE(review): oldva is only initialized in the need_policy
+ * branch above, yet it is compared here unconditionally. If a
+ * chown/chgrp reaches this point with need_policy == FALSE
+ * (owner taking ownership with ACE_WRITE_OWNER), oldva.va_uid /
+ * oldva.va_gid appear to be read uninitialized — confirm and
+ * consider initializing oldva unconditionally.
+ */
+ if ((mask & AT_UID) && vap->va_uid != oldva.va_uid)
+ zp->z_phys->zp_uid = (uint64_t)vap->va_uid;
+
+ if ((mask & AT_GID) && vap->va_gid != oldva.va_gid)
+ zp->z_phys->zp_gid = (uint64_t)vap->va_gid;
+
+ if (mask & AT_ATIME)
+ ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
+
+ if (mask & AT_MTIME)
+ ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
+
+ if (mask_applied & AT_SIZE)
+ zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
+ else if (mask_applied != 0)
+ zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+
+out:
+ if (mask_applied != 0)
+ seq = zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap,
+ mask_applied);
+
+ mutex_exit(&zp->z_lock);
+
+ if (have_grow_lock)
+ rw_exit(&zp->z_grow_lock);
+
+ dmu_tx_commit(tx);
+
+ zil_commit(zilog, seq, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (err);
+}
+
+/*
+ * Search back through the directory tree, using the ".." entries.
+ * Lock each directory in the chain to prevent concurrent renames.
+ * Fail any attempt to move a directory into one of its own descendants.
+ * XXX - z_parent_lock can overlap with map or grow locks
+ */
+typedef struct zfs_zlock {
+ krwlock_t *zl_rwlock; /* lock we acquired */
+ znode_t *zl_znode; /* znode we held */
+ struct zfs_zlock *zl_next; /* next in list */
+} zfs_zlock_t;
+
+/*
+ * Walk from the target directory (tdzp) up toward the root, locking
+ * each parent, until we either reach the source directory (sdzp), the
+ * root, or discover that tdzp is a descendant of szp (EINVAL).
+ *
+ * Note: each zfs_zlock_t is linked into *zlpp BEFORE its lock is taken
+ * and before any error return, so the caller must always unwind with
+ * zfs_rename_unlock(zlpp), even when this function returns an error.
+ */
+static int
+zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
+{
+ zfs_zlock_t *zl;
+ znode_t *zp = tdzp;
+ uint64_t rootid = zp->z_zfsvfs->z_root;
+ uint64_t *oidp = &zp->z_id;
+ krwlock_t *rwlp = &szp->z_parent_lock;
+ krw_t rw = RW_WRITER;
+
+ /*
+ * First pass write-locks szp and compares to zp->z_id.
+ * Later passes read-lock zp and compare to zp->z_parent.
+ */
+ do {
+ zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
+ zl->zl_rwlock = rwlp;
+ zl->zl_znode = NULL;
+ zl->zl_next = *zlpp;
+ *zlpp = zl;
+
+ rw_enter(rwlp, rw);
+
+ if (*oidp == szp->z_id) /* We're a descendant of szp */
+ return (EINVAL);
+
+ if (*oidp == rootid) /* We've hit the top */
+ return (0);
+
+ if (rw == RW_READER) { /* i.e. not the first pass */
+ int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
+ if (error)
+ return (error);
+ zl->zl_znode = zp;
+ }
+ oidp = &zp->z_phys->zp_parent;
+ rwlp = &zp->z_parent_lock;
+ rw = RW_READER;
+
+ } while (zp->z_id != sdzp->z_id);
+
+ return (0);
+}
+
+/*
+ * Drop locks and release vnodes that were held by zfs_rename_lock().
+ */
+static void
+zfs_rename_unlock(zfs_zlock_t **zlpp)
+{
+ zfs_zlock_t *entry;
+
+ /* Unwind the chain: release any held znode, drop the lock, free. */
+ for (entry = *zlpp; entry != NULL; entry = *zlpp) {
+ if (entry->zl_znode != NULL)
+ VN_RELE(ZTOV(entry->zl_znode));
+ rw_exit(entry->zl_rwlock);
+ *zlpp = entry->zl_next;
+ kmem_free(entry, sizeof (*entry));
+ }
+}
+
+/*
+ * Move an entry from the provided source directory to the target
+ * directory. Change the entry name as indicated.
+ *
+ * IN: sdvp - Source directory containing the "old entry".
+ * snm - Old entry name.
+ * tdvp - Target directory to contain the "new entry".
+ * tnm - New entry name.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * sdvp,tdvp - ctime|mtime updated
+ */
+static int
+zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr)
+{
+ znode_t *tdzp, *szp, *tzp;
+ znode_t *sdzp = VTOZ(sdvp);
+ zfsvfs_t *zfsvfs = sdzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ uint64_t seq = 0;
+ vnode_t *realvp;
+ zfs_dirlock_t *sdl, *tdl;
+ dmu_tx_t *tx;
+ zfs_zlock_t *zl;
+ int cmp, serr, terr, error;
+
+ ZFS_ENTER(zfsvfs);
+
+ /*
+ * Make sure we have the real vp for the target directory.
+ */
+ if (VOP_REALVP(tdvp, &realvp) == 0)
+ tdvp = realvp;
+
+ if (tdvp->v_vfsp != sdvp->v_vfsp) {
+ ZFS_EXIT(zfsvfs);
+ return (EXDEV);
+ }
+
+ tdzp = VTOZ(tdvp);
+ /*
+ * Restart point after dmu_tx_assign() returns ERESTART below;
+ * all directory locks and holds are re-acquired from scratch.
+ */
+top:
+ szp = NULL;
+ tzp = NULL;
+ zl = NULL;
+
+ /*
+ * This is to prevent the creation of links into attribute space
+ * by renaming a linked file into/outof an attribute directory.
+ * See the comment in zfs_link() for why this is considered bad.
+ */
+ if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
+ (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ /*
+ * Lock source and target directory entries. To prevent deadlock,
+ * a lock ordering must be defined. We lock the directory with
+ * the smallest object id first, or if it's a tie, the one with
+ * the lexically first name.
+ */
+ if (sdzp->z_id < tdzp->z_id) {
+ cmp = -1;
+ } else if (sdzp->z_id > tdzp->z_id) {
+ cmp = 1;
+ } else {
+ cmp = strcmp(snm, tnm);
+ if (cmp == 0) {
+ /*
+ * POSIX: "If the old argument and the new argument
+ * both refer to links to the same existing file,
+ * the rename() function shall return successfully
+ * and perform no other action."
+ */
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+ }
+ if (cmp < 0) {
+ serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
+ terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
+ } else {
+ terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
+ serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
+ }
+
+ if (serr) {
+ /*
+ * Source entry invalid or not there.
+ */
+ if (!terr) {
+ zfs_dirent_unlock(tdl);
+ if (tzp)
+ VN_RELE(ZTOV(tzp));
+ }
+ if (strcmp(snm, "..") == 0)
+ serr = EINVAL;
+ ZFS_EXIT(zfsvfs);
+ return (serr);
+ }
+ if (terr) {
+ zfs_dirent_unlock(sdl);
+ VN_RELE(ZTOV(szp));
+ if (strcmp(tnm, "..") == 0)
+ terr = EINVAL;
+ ZFS_EXIT(zfsvfs);
+ return (terr);
+ }
+
+ /*
+ * Must have write access at the source to remove the old entry
+ * and write access at the target to create the new entry.
+ * Note that if target and source are the same, this can be
+ * done in a single check.
+ */
+
+ if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
+ goto out;
+
+ if (ZTOV(szp)->v_type == VDIR) {
+ /*
+ * Check to make sure rename is valid.
+ * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
+ * (zl is unwound at out: even if this fails partway.)
+ */
+ if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
+ goto out;
+ }
+
+ /*
+ * Does target exist?
+ */
+ if (tzp) {
+ /*
+ * Source and target must be the same type.
+ */
+ if (ZTOV(szp)->v_type == VDIR) {
+ if (ZTOV(tzp)->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ }
+ } else {
+ if (ZTOV(tzp)->v_type == VDIR) {
+ error = EISDIR;
+ goto out;
+ }
+ }
+ /*
+ * POSIX dictates that when the source and target
+ * entries refer to the same file object, rename
+ * must do nothing and exit without error.
+ */
+ if (szp->z_id == tzp->z_id) {
+ error = 0;
+ goto out;
+ }
+ }
+
+ vnevent_rename_src(ZTOV(szp));
+ if (tzp)
+ vnevent_rename_dest(ZTOV(tzp));
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */
+ dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */
+ if (sdzp != tdzp) {
+ dmu_tx_hold_zap(tx, sdzp->z_id, 1);
+ dmu_tx_hold_zap(tx, tdzp->z_id, 1);
+ dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */
+ } else {
+ dmu_tx_hold_zap(tx, sdzp->z_id, 2);
+ }
+ if (tzp) {
+ dmu_tx_hold_bonus(tx, tzp->z_id); /* nlink changes */
+ }
+ dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ dmu_tx_abort(tx);
+ if (zl != NULL)
+ zfs_rename_unlock(&zl);
+ zfs_dirent_unlock(sdl);
+ zfs_dirent_unlock(tdl);
+ VN_RELE(ZTOV(szp));
+ if (tzp)
+ VN_RELE(ZTOV(tzp));
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+ goto top;
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (tzp) /* Attempt to remove the existing target */
+ error = zfs_link_destroy(tdl, tzp, tx, 0, NULL);
+
+ if (error == 0) {
+ error = zfs_link_create(tdl, szp, tx, ZRENAMING);
+ if (error == 0) {
+ error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
+ ASSERT(error == 0);
+ seq = zfs_log_rename(zilog, tx, TX_RENAME,
+ sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
+ }
+ }
+
+ dmu_tx_commit(tx);
+out:
+ if (zl != NULL)
+ zfs_rename_unlock(&zl);
+
+ zfs_dirent_unlock(sdl);
+ zfs_dirent_unlock(tdl);
+
+ VN_RELE(ZTOV(szp));
+ if (tzp)
+ VN_RELE(ZTOV(tzp));
+
+ zil_commit(zilog, seq, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Insert the indicated symbolic reference entry into the directory.
+ *
+ * IN: dvp - Directory to contain new symbolic link.
+ * link - Name for new symlink entry.
+ * vap - Attributes of new entry.
+ * target - Target path of new symlink.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated
+ */
+static int
+zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr)
+{
+ znode_t *zp, *dzp = VTOZ(dvp);
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ uint64_t seq = 0;
+ uint64_t zoid;
+ int len = strlen(link);
+ int error;
+
+ ASSERT(vap->va_type == VLNK);
+
+ ZFS_ENTER(zfsvfs);
+ /* Restart point after dmu_tx_assign() returns ERESTART below. */
+top:
+ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (len > MAXPATHLEN) {
+ ZFS_EXIT(zfsvfs);
+ return (ENAMETOOLONG);
+ }
+
+ /*
+ * Attempt to lock directory; fail if entry already exists.
+ */
+ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZNEW)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
+ dmu_tx_hold_bonus(tx, dzp->z_id);
+ dmu_tx_hold_zap(tx, dzp->z_id, 1);
+ if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ dmu_tx_abort(tx);
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+ goto top;
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ dmu_buf_will_dirty(dzp->z_dbuf, tx);
+
+ /*
+ * Create a new object for the symlink.
+ * Put the link content into bonus buffer if it will fit;
+ * otherwise, store it just like any other file data.
+ */
+ zoid = 0;
+ if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
+ zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, len);
+ if (len != 0)
+ bcopy(link, zp->z_phys + 1, len);
+ } else {
+ dmu_buf_t *dbp;
+ zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
+
+ rw_enter(&zp->z_grow_lock, RW_WRITER);
+ error = zfs_grow_blocksize(zp, len, tx);
+ rw_exit(&zp->z_grow_lock);
+ /* On failure we still fall through to out: to commit the tx. */
+ if (error)
+ goto out;
+
+ dbp = dmu_buf_hold(zfsvfs->z_os, zoid, 0);
+ dmu_buf_will_dirty(dbp, tx);
+
+ ASSERT3U(len, <=, dbp->db_size);
+ bcopy(link, dbp->db_data, len);
+ dmu_buf_rele(dbp);
+ }
+ zp->z_phys->zp_size = len;
+
+ /*
+ * Insert the new object into the directory.
+ */
+ (void) zfs_link_create(dl, zp, tx, ZNEW);
+out:
+ if (error == 0)
+ seq = zfs_log_symlink(zilog, tx, TX_SYMLINK,
+ dzp, zp, name, link);
+
+ dmu_tx_commit(tx);
+
+ zfs_dirent_unlock(dl);
+
+ VN_RELE(ZTOV(zp));
+
+ zil_commit(zilog, seq, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Return, in the buffer contained in the provided uio structure,
+ * the symbolic path referred to by vp.
+ *
+ * IN: vp - vnode of symbolic link.
+ * uio - structure to contain the link path.
+ * cr - credentials of caller.
+ *
+ * OUT: uio - structure to contain the link path.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - atime updated
+ */
+/* ARGSUSED */
+static int
+zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	size_t bufsz;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+
+	bufsz = (size_t)zp->z_phys->zp_size;
+	/*
+	 * Short targets live in the znode's bonus buffer immediately
+	 * after the znode_phys_t; longer ones are stored as ordinary
+	 * file data (the mirror of the layout chosen in zfs_symlink()).
+	 */
+	if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
+		error = uiomove(zp->z_phys + 1,
+		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
+	} else {
+		dmu_buf_t *dbp = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0);
+		if ((error = dmu_buf_read_canfail(dbp)) != 0) {
+			dmu_buf_rele(dbp);
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+		error = uiomove(dbp->db_data,
+		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
+		dmu_buf_rele(dbp);
+	}
+
+	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Insert a new entry into directory tdvp referencing svp.
+ *
+ * IN: tdvp - Directory to contain new entry.
+ * svp - vnode of new entry.
+ * name - name of new entry.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * tdvp - ctime|mtime updated
+ * svp - ctime updated
+ */
+/* ARGSUSED */
+static int
+zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr)
+{
+	znode_t *dzp = VTOZ(tdvp);
+	znode_t *tzp, *szp;
+	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+	zilog_t *zilog = zfsvfs->z_log;
+	uint64_t seq = 0;
+	zfs_dirlock_t *dl;
+	dmu_tx_t *tx;
+	vnode_t *realvp;
+	int error;
+
+	ASSERT(tdvp->v_type == VDIR);
+
+	ZFS_ENTER(zfsvfs);
+
+	/* Peel off any stacked file system (e.g. lofs) to the real vnode. */
+	if (VOP_REALVP(svp, &realvp) == 0)
+		svp = realvp;
+
+	/* Hard links cannot span file systems. */
+	if (svp->v_vfsp != tdvp->v_vfsp) {
+		ZFS_EXIT(zfsvfs);
+		return (EXDEV);
+	}
+
+	szp = VTOZ(svp);
+top:
+	/*
+	 * We do not support links between attributes and non-attributes
+	 * because of the potential security risk of creating links
+	 * into "normal" file space in order to circumvent restrictions
+	 * imposed in attribute space.
+	 */
+	if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
+	    (dzp->z_phys->zp_flags & ZFS_XATTR)) {
+		ZFS_EXIT(zfsvfs);
+		return (EINVAL);
+	}
+
+	/*
+	 * POSIX dictates that we return EPERM here.
+	 * Better choices include ENOTSUP or EISDIR.
+	 */
+	if (svp->v_type == VDIR) {
+		ZFS_EXIT(zfsvfs);
+		return (EPERM);
+	}
+
+	/* Only the file's owner (or a privileged caller) may link to it. */
+	if ((uid_t)szp->z_phys->zp_uid != crgetuid(cr) &&
+	    secpolicy_basic_link(cr) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (EPERM);
+	}
+
+	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Attempt to lock directory; fail if entry already exists.
+	 */
+	if (error = zfs_dirent_lock(&dl, dzp, name, &tzp, ZNEW)) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_bonus(tx, szp->z_id);
+	dmu_tx_hold_zap(tx, dzp->z_id, 1);
+	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	if (error) {
+		dmu_tx_abort(tx);
+		zfs_dirent_unlock(dl);
+		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+			txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+			goto top;
+		}
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	error = zfs_link_create(dl, szp, tx, 0);
+
+	if (error == 0)
+		seq = zfs_log_link(zilog, tx, TX_LINK, dzp, szp, name);
+
+	dmu_tx_commit(tx);
+
+	zfs_dirent_unlock(dl);
+
+	zil_commit(zilog, seq, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Page-out handler installed once the file system has been forcibly
+ * unmounted: there is nowhere left to write the data, so the pages
+ * are simply invalidated and discarded.
+ */
+/* ARGSUSED */
+static int
+zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
+	size_t *lenp, int flags, cred_t *cr)
+{
+	int done_flags = B_INVAL | B_FORCE | B_ERROR;
+
+	pvn_write_done(pp, done_flags);
+	return (0);
+}
+
+/*
+ * Push a single dirty page to the DMU inside its own transaction
+ * and append a ZIL write record for it.
+ *
+ * OUT: offp/lenp - if non-NULL, set to the file offset and byte
+ * count actually written.
+ */
+/* ARGSUSED */
+static int
+zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
+	size_t *lenp, int flags, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	zilog_t *zilog = zfsvfs->z_log;
+	uint64_t seq = 0;
+	dmu_tx_t *tx;
+	u_offset_t off;
+	ssize_t len;
+	caddr_t va;
+	int err;
+
+top:
+	/* Block file restructuring (block size changes) during the write. */
+	rw_enter(&zp->z_grow_lock, RW_READER);
+
+	off = pp->p_offset;
+	/* Clip the final page to the file size. */
+	len = MIN(PAGESIZE, zp->z_phys->zp_size - off);
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_write(tx, zp->z_id, off, len);
+	dmu_tx_hold_bonus(tx, zp->z_id);
+	err = dmu_tx_assign(tx, zfsvfs->z_assign);
+	if (err != 0) {
+		dmu_tx_abort(tx);
+		rw_exit(&zp->z_grow_lock);
+		if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+			txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+			goto top;
+		}
+		/*
+		 * NOTE(review): this error return skips pvn_write_done(),
+		 * so the page appears to be left held — confirm the
+		 * caller copes, or whether an error write-done is needed.
+		 */
+		goto out;
+	}
+
+	/* Map the page into the kernel and copy it into the DMU. */
+	va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1);
+
+	dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
+
+	ppmapout(va);
+
+	zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+	seq = zfs_log_write(zilog, tx, TX_WRITE, zp, off, len, 0, NULL);
+	dmu_tx_commit(tx);
+
+	rw_exit(&zp->z_grow_lock);
+
+	pvn_write_done(pp, B_WRITE | flags);
+	if (offp)
+		*offp = off;
+	if (lenp)
+		*lenp = len;
+
+	zil_commit(zilog, seq, 0);
+out:
+	return (err);
+}
+
+/*
+ * Copy the portion of the file indicated from pages into the file.
+ * The pages are stored in a page list attached to the files vnode.
+ *
+ * IN: vp - vnode of file to push page data to.
+ * off - position in file to put data.
+ * len - amount of data to write.
+ * flags - flags to control the operation.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - ctime|mtime updated
+ */
+static int
+zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	page_t *pp;
+	size_t io_len;
+	u_offset_t io_off;
+	int error = 0;
+
+	ZFS_ENTER(zfsvfs);
+
+	ASSERT(zp->z_dbuf_held && zp->z_phys);
+
+	if (len == 0) {
+		/*
+		 * Search the entire vp list for pages >= off.
+		 */
+		error = pvn_vplist_dirty(vp, (u_offset_t)off, zfs_putapage,
+		    flags, cr);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (off > zp->z_phys->zp_size) {
+		/* past end of file */
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/* Clip the request to the current file size. */
+	len = MIN(len, zp->z_phys->zp_size - off);
+
+	io_off = off;
+	while (io_off < off + len) {
+		/*
+		 * Synchronous (or invalidating) requests may block for
+		 * the page lock; async requests must not.
+		 */
+		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
+			pp = page_lookup(vp, io_off,
+			    (flags & (B_INVAL | B_FREE)) ?
+				SE_EXCL : SE_SHARED);
+		} else {
+			pp = page_lookup_nowait(vp, io_off,
+			    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
+		}
+
+		if (pp != NULL && pvn_getdirty(pp, flags)) {
+			int err;
+
+			/*
+			 * Found a dirty page to push
+			 */
+			if (err =
+			    zfs_putapage(vp, pp, &io_off, &io_len, flags, cr))
+				error = err;
+		} else {
+			/* No dirty page here; advance by one page. */
+			io_len = PAGESIZE;
+		}
+		io_off += io_len;
+	}
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * VOP_INACTIVE handler: the last reference to the vnode is going
+ * away.  Push any cached pages, sync a dirty atime, then release
+ * the znode.  If the file system was forcibly unmounted, just toss
+ * the pages and free the znode directly.
+ */
+void
+zfs_inactive(vnode_t *vp, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int error;
+
+	rw_enter(&zfsvfs->z_um_lock, RW_READER);
+	if (zfsvfs->z_unmounted2) {
+		ASSERT(zp->z_dbuf_held == 0);
+
+		if (vn_has_cached_data(vp)) {
+			(void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
+			    B_INVAL, cr);
+		}
+
+		vp->v_count = 0; /* count arrives as 1 */
+		zfs_znode_free(zp);
+		rw_exit(&zfsvfs->z_um_lock);
+		VFS_RELE(zfsvfs->z_vfs);
+		return;
+	}
+
+	/*
+	 * Attempt to push any data in the page cache. If this fails
+	 * we will get kicked out later in zfs_zinactive().
+	 */
+	if (vn_has_cached_data(vp))
+		(void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL, cr);
+
+	/* Flush a deferred atime update unless the file is being reaped. */
+	if (zp->z_atime_dirty && zp->z_reap == 0) {
+		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+		dmu_tx_hold_bonus(tx, zp->z_id);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+		} else {
+			dmu_buf_will_dirty(zp->z_dbuf, tx);
+			mutex_enter(&zp->z_lock);
+			zp->z_atime_dirty = 0;
+			mutex_exit(&zp->z_lock);
+			dmu_tx_commit(tx);
+		}
+	}
+
+	zfs_zinactive(zp);
+	rw_exit(&zfsvfs->z_um_lock);
+}
+
+/*
+ * Validate a proposed seek target.
+ *
+ * IN: vp - vnode being seeked
+ * ooff - previous offset
+ * noffp - candidate new offset
+ *
+ * RETURN: 0 if the offset is acceptable, EINVAL otherwise.
+ */
+/* ARGSUSED */
+static int
+zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp)
+{
+	offset_t noff = *noffp;
+
+	/* Directory offsets are opaque cookies; accept anything. */
+	if (vp->v_type == VDIR)
+		return (0);
+
+	if (noff < 0 || noff > MAXOFFSET_T)
+		return (EINVAL);
+	return (0);
+}
+
+/*
+ * Pre-filter the generic locking function to trap attempts to place
+ * a mandatory lock on a memory mapped file.
+ */
+static int
+zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
+	flk_callback_t *flk_cbp, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	uint_t cnt = 1;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+
+	/*
+	 * If file is being mapped, disallow frlock. We set the mapcnt to
+	 * -1 here to signal that we are in the process of setting a lock.
+	 * This prevents a race with zfs_map().
+	 * XXX - well, sort of; since zfs_map() does not change z_mapcnt,
+	 * we could be in the middle of zfs_map() and still call fs_frlock().
+	 * Also, we are doing no checking in zfs_addmap() (where z_mapcnt
+	 * *is* manipulated).
+	 */
+	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
+	    (int)(cnt = atomic_cas_32(&zp->z_mapcnt, 0, -1)) > 0) {
+		ZFS_EXIT(zfsvfs);
+		return (EAGAIN);
+	}
+	error = fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr);
+	/*
+	 * If we installed the -1 "lock in progress" sentinel above
+	 * (cnt == 0), clear it now.  The restoring compare-and-swap
+	 * must NOT live inside the ASSERT(): asserts compile away in
+	 * non-DEBUG kernels, which would leave z_mapcnt stuck at -1
+	 * and permanently block future frlock/mmap attempts.
+	 */
+	if (cnt == 0) {
+		cnt = atomic_cas_32(&zp->z_mapcnt, -1, 0);
+		ASSERT((int)cnt == -1);
+	}
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * If we can't find a page in the cache, we will create a new page
+ * and fill it with file data. For efficiency, we may try to fill
+ * multiple pages at once (klustering).
+ *
+ * OUT: pl - null-terminated list of pages created/filled; *pl is
+ * set to NULL if another thread raced us to the page.
+ *
+ * RETURN: 0 on success, error from dmu_read_canfail() on failure
+ * (in which case the entire kluster is tossed).
+ */
+static int
+zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
+	caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
+{
+	znode_t *zp = VTOZ(vp);
+	page_t *pp, *cur_pp;
+	objset_t *os = zp->z_zfsvfs->z_os;
+	caddr_t va;
+	u_offset_t io_off, total;
+	uint64_t oid = zp->z_id;
+	size_t io_len;
+	int err;
+
+	/*
+	 * If we are only asking for a single page don't bother klustering.
+	 */
+	if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE ||
+	    off > zp->z_phys->zp_size) {
+		io_off = off;
+		io_len = PAGESIZE;
+		pp = page_create_va(vp, io_off, io_len, PG_WAIT, seg, addr);
+	} else {
+		/*
+		 * Try to fill a kluster of pages (a blocks worth).
+		 */
+		size_t klen;
+		u_offset_t koff;
+
+		if (!ISP2(zp->z_blksz)) {
+			/* Only one block in the file. */
+			klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
+			koff = 0;
+		} else {
+			klen = plsz;
+			koff = P2ALIGN(off, (u_offset_t)klen);
+		}
+		if (klen > zp->z_phys->zp_size)
+			klen = P2ROUNDUP(zp->z_phys->zp_size,
+			    (uint64_t)PAGESIZE);
+		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
+		    &io_len, koff, klen, 0);
+	}
+	if (pp == NULL) {
+		/*
+		 * Some other thread entered the page before us.
+		 * Return to zfs_getpage to retry the lookup.
+		 */
+		*pl = NULL;
+		return (0);
+	}
+
+	/*
+	 * Fill the pages in the kluster.
+	 */
+	cur_pp = pp;
+	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
+		ASSERT(io_off == cur_pp->p_offset);
+		va = ppmapin(cur_pp, PROT_READ | PROT_WRITE, (caddr_t)-1);
+		err = dmu_read_canfail(os, oid, io_off, PAGESIZE, va);
+		ppmapout(va);
+		if (err) {
+			/* On error, toss the entire kluster */
+			pvn_read_done(pp, B_ERROR);
+			return (err);
+		}
+		cur_pp = cur_pp->p_next;
+	}
+	/*
+	 * (The unused "out:" label that used to sit here was removed;
+	 * nothing in this function jumps to it, and it drew an
+	 * unused-label warning.)
+	 *
+	 * Fill in the page list array from the kluster. If
+	 * there are too many pages in the kluster, return
+	 * as many pages as possible starting from the desired
+	 * offset `off'.
+	 * NOTE: the page list will always be null terminated.
+	 */
+	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
+
+	return (0);
+}
+
+/*
+ * Return pointers to the pages for the file region [off, off + len]
+ * in the pl array. If plsz is greater than len, this function may
+ * also return page pointers from before or after the specified
+ * region (i.e. some region [off', off' + plsz]). These additional
+ * pages are only returned if they are already in the cache, or were
+ * created as part of a klustered read.
+ *
+ * IN: vp - vnode of file to get data from.
+ * off - position in file to get data from.
+ * len - amount of data to retrieve.
+ * plsz - length of provided page list.
+ * seg - segment to obtain pages for.
+ * addr - virtual address of fault.
+ * rw - mode of created pages.
+ * cr - credentials of caller.
+ *
+ * OUT: protp - protection mode of created pages.
+ * pl - list of pages created.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - atime updated
+ */
+/* ARGSUSED */
+static int
+zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
+	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
+	enum seg_rw rw, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	page_t *pp, **pl0 = pl;
+	int cnt = 0, need_unlock = 0, err = 0;
+
+	ZFS_ENTER(zfsvfs);
+
+	if (protp)
+		*protp = PROT_ALL;
+
+	ASSERT(zp->z_dbuf_held && zp->z_phys);
+
+	/* no faultahead (for now) */
+	if (pl == NULL) {
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/* can't fault past EOF */
+	if (off >= zp->z_phys->zp_size) {
+		ZFS_EXIT(zfsvfs);
+		return (EFAULT);
+	}
+
+	/*
+	 * Make sure nobody restructures the file (changes block size)
+	 * in the middle of the getpage.
+	 */
+	rw_enter(&zp->z_grow_lock, RW_READER);
+
+	/*
+	 * If we already own the lock, then we must be page faulting
+	 * in the middle of a write to this file (i.e., we are writing
+	 * to this file using data from a mapped region of the file).
+	 */
+	if (!rw_owner(&zp->z_map_lock)) {
+		rw_enter(&zp->z_map_lock, RW_WRITER);
+		need_unlock = TRUE;
+	}
+
+	/*
+	 * Loop through the requested range [off, off + len] looking
+	 * for pages. If we don't find a page, we will need to create
+	 * a new page and fill it with data from the file.
+	 */
+	while (len > 0) {
+		if (plsz < PAGESIZE)
+			break;
+		if (pp = page_lookup(vp, off, SE_SHARED)) {
+			*pl++ = pp;
+			off += PAGESIZE;
+			addr += PAGESIZE;
+			len -= PAGESIZE;
+			plsz -= PAGESIZE;
+		} else {
+			err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw);
+			/*
+			 * klustering may have changed our region
+			 * to be block aligned.
+			 */
+			if (((pp = *pl) != 0) && (off != pp->p_offset)) {
+				int delta = off - pp->p_offset;
+				len += delta;
+				off -= delta;
+				addr -= delta;
+			}
+			/* Walk past the pages zfs_fillpage() just added. */
+			while (*pl) {
+				pl++;
+				cnt++;
+				off += PAGESIZE;
+				addr += PAGESIZE;
+				plsz -= PAGESIZE;
+				if (len > PAGESIZE)
+					len -= PAGESIZE;
+				else
+					len = 0;
+			}
+		}
+		if (err)
+			goto out;
+	}
+
+	/*
+	 * Fill out the page array with any pages already in the cache.
+	 */
+	while (plsz > 0) {
+		pp = page_lookup_nowait(vp, off, SE_SHARED);
+		if (pp == NULL)
+			break;
+		*pl++ = pp;
+		off += PAGESIZE;
+		plsz -= PAGESIZE;
+	}
+
+	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+out:
+	if (err) {
+		/*
+		 * Release any pages we have locked.
+		 */
+		while (pl > pl0)
+			page_unlock(*--pl);
+	}
+	/* The returned page list is always NULL-terminated. */
+	*pl = NULL;
+
+	if (need_unlock)
+		rw_exit(&zp->z_map_lock);
+	rw_exit(&zp->z_grow_lock);
+
+	ZFS_EXIT(zfsvfs);
+	return (err);
+}
+
+/*
+ * VOP_MAP handler: validate the request and create a segvn mapping
+ * of the file into the caller's address space.
+ */
+static int
+zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
+	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	segvn_crargs_t vn_a;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+
+	if (vp->v_flag & VNOMAP) {
+		ZFS_EXIT(zfsvfs);
+		return (ENOSYS);
+	}
+
+	/* Reject negative offsets and ranges that wrap past MAXOFFSET_T. */
+	if (off < 0 || len > MAXOFFSET_T - off) {
+		ZFS_EXIT(zfsvfs);
+		return (ENXIO);
+	}
+
+	if (vp->v_type != VREG) {
+		ZFS_EXIT(zfsvfs);
+		return (ENODEV);
+	}
+
+	/*
+	 * If file is locked, disallow mapping.
+	 * XXX - since we don't modify z_mapcnt here, there is nothing
+	 * to stop a file lock being placed immediately after we complete
+	 * this check.
+	 */
+	if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
+		if (vn_has_flocks(vp) || zp->z_mapcnt == -1) {
+			ZFS_EXIT(zfsvfs);
+			return (EAGAIN);
+		}
+	}
+
+	as_rangelock(as);
+	if ((flags & MAP_FIXED) == 0) {
+		/* Let the VM system choose an address. */
+		map_addr(addrp, len, off, 1, flags);
+		if (*addrp == NULL) {
+			as_rangeunlock(as);
+			ZFS_EXIT(zfsvfs);
+			return (ENOMEM);
+		}
+	} else {
+		/*
+		 * User specified address - blow away any previous mappings
+		 */
+		(void) as_unmap(as, *addrp, len);
+	}
+
+	vn_a.vp = vp;
+	vn_a.offset = (u_offset_t)off;
+	vn_a.type = flags & MAP_TYPE;
+	vn_a.prot = prot;
+	vn_a.maxprot = maxprot;
+	vn_a.cred = cr;
+	vn_a.amp = NULL;
+	vn_a.flags = flags & ~MAP_TYPE;
+
+	error = as_map(as, *addrp, len, segvn_create, &vn_a);
+
+	as_rangeunlock(as);
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * VOP_ADDMAP handler: bump the mapped-page count by the number of
+ * pages this new mapping covers.
+ */
+/* ARGSUSED */
+static int
+zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
+	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+
+	/*
+	 * XXX - shouldn't we be checking for file locks here?
+	 */
+	ASSERT3U(zp->z_mapcnt, >=, 0);
+	atomic_add_32(&zp->z_mapcnt, btopr(len));
+	return (0);
+}
+
+/*
+ * VOP_DELMAP handler: drop the pages the departing mapping covered
+ * from the mapped-page count.
+ */
+/* ARGSUSED */
+static int
+zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
+	size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+
+	atomic_add_32(&zp->z_mapcnt, -btopr(len));
+	ASSERT3U(zp->z_mapcnt, >=, 0);
+	return (0);
+}
+
+/*
+ * Free or allocate space in a file. Currently, this function only
+ * supports the `F_FREESP' command. However, this command is somewhat
+ * misnamed, as its functionality includes the ability to allocate as
+ * well as free space.
+ *
+ * IN: vp - vnode of file to free data in.
+ * cmd - action to take (only F_FREESP supported).
+ * bfp - section of file to free/alloc.
+ * flag - current file open mode flags.
+ * offset - current file offset.
+ * cr - credentials of caller [UNUSED].
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - ctime|mtime updated
+ *
+ * NOTE: This function is limited in that it will only permit space to
+ * be freed at the end of a file. In essence, this function simply
+ * allows one to set the file size.
+ */
+/* ARGSUSED */
+static int
+zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
+	offset_t offset, cred_t *cr, caller_context_t *ct)
+{
+	dmu_tx_t *tx;
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	zilog_t *zilog = zfsvfs->z_log;
+	uint64_t seq = 0;
+	uint64_t off, len;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+
+top:
+	/*
+	 * The cmd/range validation is repeated on each pass through
+	 * the ERESTART retry loop below.
+	 */
+	if (cmd != F_FREESP) {
+		ZFS_EXIT(zfsvfs);
+		return (EINVAL);
+	}
+
+	if (error = convoff(vp, bfp, 0, offset)) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (bfp->l_len < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (EINVAL);
+	}
+
+	off = bfp->l_start;
+	len = bfp->l_len;
+	tx = dmu_tx_create(zfsvfs->z_os);
+	/*
+	 * Grab the grow_lock to serialize this change with
+	 * respect to other file size changes.
+	 */
+	dmu_tx_hold_bonus(tx, zp->z_id);
+	rw_enter(&zp->z_grow_lock, RW_WRITER);
+	if (off + len > zp->z_blksz && zp->z_blksz < zfsvfs->z_max_blksz &&
+	    off >= zp->z_phys->zp_size) {
+		/*
+		 * We are increasing the length of the file,
+		 * and this may mean a block size increase.
+		 */
+		dmu_tx_hold_write(tx, zp->z_id, 0,
+		    MIN(off + len, zfsvfs->z_max_blksz));
+	} else if (off < zp->z_phys->zp_size) {
+		/*
+		 * If len == 0, we are truncating the file.
+		 */
+		dmu_tx_hold_free(tx, zp->z_id, off, len ? len : DMU_OBJECT_END);
+	}
+
+	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	if (error) {
+		dmu_tx_abort(tx);
+		rw_exit(&zp->z_grow_lock);
+		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+			txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+			goto top;
+		}
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	error = zfs_freesp(zp, off, len, flag, tx, cr);
+
+	if (error == 0) {
+		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+		seq = zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
+	}
+
+	rw_exit(&zp->z_grow_lock);
+
+	dmu_tx_commit(tx);
+
+	zil_commit(zilog, seq, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Build an NFS-style file identifier for vp.  A long FID (which
+ * additionally encodes the objset id) is produced when this zfsvfs
+ * is not its own parent; otherwise a short FID (object + gen).
+ */
+static int
+zfs_fid(vnode_t *vp, fid_t *fidp)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	uint32_t gen = (uint32_t)zp->z_phys->zp_gen;
+	uint64_t object = zp->z_id;
+	zfid_short_t *zfid;
+	int size, i;
+
+	ZFS_ENTER(zfsvfs);
+
+	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
+	if (fidp->fid_len < size) {
+		fidp->fid_len = size;
+		/*
+		 * Fix: every other return in this file pairs ZFS_ENTER
+		 * with ZFS_EXIT; this error path previously returned
+		 * without releasing it.
+		 */
+		ZFS_EXIT(zfsvfs);
+		return (ENOSPC);
+	}
+
+	zfid = (zfid_short_t *)fidp;
+
+	zfid->zf_len = size;
+
+	for (i = 0; i < sizeof (zfid->zf_object); i++)
+		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+	/* Must have a non-zero generation number to distinguish from .zfs */
+	if (gen == 0)
+		gen = 1;
+	for (i = 0; i < sizeof (zfid->zf_gen); i++)
+		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
+
+	if (size == LONG_FID_LEN) {
+		uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
+		zfid_long_t *zlfid;
+
+		zlfid = (zfid_long_t *)fidp;
+
+		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
+
+		/* XXX - this should be the generation number for the objset */
+		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+			zlfid->zf_setgen[i] = 0;
+	}
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * Report pathconf(2)/fpathconf(2) values for vp.  Queries not
+ * handled here are delegated to the generic fs_pathconf().
+ */
+static int
+zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
+{
+	znode_t *zp, *xzp;
+	zfsvfs_t *zfsvfs;
+	zfs_dirlock_t *dl;
+	int error;
+
+	switch (cmd) {
+	case _PC_LINK_MAX:
+		*valp = ULONG_MAX;
+		return (0);
+
+	case _PC_FILESIZEBITS:
+		*valp = 64;
+		return (0);
+
+	case _PC_XATTR_EXISTS:
+		/*
+		 * Probe for a non-empty extended attribute directory;
+		 * ENOENT (no xattr dir at all) also means "no xattrs".
+		 */
+		zp = VTOZ(vp);
+		zfsvfs = zp->z_zfsvfs;
+		ZFS_ENTER(zfsvfs);
+		*valp = 0;
+		error = zfs_dirent_lock(&dl, zp, "", &xzp,
+		    ZXATTR | ZEXISTS | ZSHARED);
+		if (error == 0) {
+			zfs_dirent_unlock(dl);
+			if (!zfs_dirempty(xzp))
+				*valp = 1;
+			VN_RELE(ZTOV(xzp));
+		} else if (error == ENOENT) {
+			/*
+			 * If there aren't extended attributes, it's the
+			 * same as having zero of them.
+			 */
+			error = 0;
+		}
+		ZFS_EXIT(zfsvfs);
+		return (error);
+
+	case _PC_ACL_ENABLED:
+		*valp = _ACL_ACE_ENABLED;
+		return (0);
+
+	case _PC_MIN_HOLE_SIZE:
+		*valp = (ulong_t)SPA_MINBLOCKSIZE;
+		return (0);
+
+	default:
+		return (fs_pathconf(vp, cmd, valp, cr));
+	}
+}
+
+/*
+ * Fetch the ACL for vp into the caller-supplied vsecattr.
+ */
+/*ARGSUSED*/
+static int
+zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
+{
+	znode_t *node = VTOZ(vp);
+	zfsvfs_t *fsp = node->z_zfsvfs;
+	int err;
+
+	ZFS_ENTER(fsp);
+	err = zfs_getacl(node, vsecp, cr);
+	ZFS_EXIT(fsp);
+
+	return (err);
+}
+
+/*
+ * Replace the ACL on vp with the caller-supplied vsecattr.
+ */
+/*ARGSUSED*/
+static int
+zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
+{
+	znode_t *node = VTOZ(vp);
+	zfsvfs_t *fsp = node->z_zfsvfs;
+	int err;
+
+	ZFS_ENTER(fsp);
+	err = zfs_setacl(node, vsecp, cr);
+	ZFS_EXIT(fsp);
+	return (err);
+}
+
+/*
+ * Predeclare these here so that the compiler assumes that
+ * this is an "old style" function declaration that does
+ * not include arguments => we won't get type mismatch errors
+ * in the initializations that follow.
+ */
+static int zfs_inval();
+static int zfs_isdir();
+
+/* Stub vnode op that always fails with EINVAL (unsupported op). */
+static int
+zfs_inval()
+{
+	return (EINVAL);
+}
+
+/* Stub vnode op that always fails with EISDIR (op invalid on a dir). */
+static int
+zfs_isdir()
+{
+	return (EISDIR);
+}
+/*
+ * Directory vnode operations template
+ */
+vnodeops_t *zfs_dvnodeops;
+const fs_operation_def_t zfs_dvnodeops_template[] = {
+	VOPNAME_OPEN, zfs_open,
+	VOPNAME_CLOSE, zfs_close,
+	/* read(2)/write(2) on a directory fail with EISDIR */
+	VOPNAME_READ, zfs_isdir,
+	VOPNAME_WRITE, zfs_isdir,
+	VOPNAME_IOCTL, zfs_ioctl,
+	VOPNAME_GETATTR, zfs_getattr,
+	VOPNAME_SETATTR, zfs_setattr,
+	VOPNAME_ACCESS, zfs_access,
+	VOPNAME_LOOKUP, zfs_lookup,
+	VOPNAME_CREATE, zfs_create,
+	VOPNAME_REMOVE, zfs_remove,
+	VOPNAME_LINK, zfs_link,
+	VOPNAME_RENAME, zfs_rename,
+	VOPNAME_MKDIR, zfs_mkdir,
+	VOPNAME_RMDIR, zfs_rmdir,
+	VOPNAME_READDIR, zfs_readdir,
+	VOPNAME_SYMLINK, zfs_symlink,
+	VOPNAME_FSYNC, zfs_fsync,
+	VOPNAME_INACTIVE, (fs_generic_func_p) zfs_inactive,
+	VOPNAME_FID, zfs_fid,
+	VOPNAME_SEEK, zfs_seek,
+	VOPNAME_PATHCONF, zfs_pathconf,
+	VOPNAME_GETSECATTR, zfs_getsecattr,
+	VOPNAME_SETSECATTR, zfs_setsecattr,
+	NULL, NULL
+};
+
+/*
+ * Regular file vnode operations template.  Files additionally get
+ * the paging, mapping, locking and space-management entry points
+ * that directories do not.
+ */
+vnodeops_t *zfs_fvnodeops;
+const fs_operation_def_t zfs_fvnodeops_template[] = {
+	VOPNAME_OPEN, zfs_open,
+	VOPNAME_CLOSE, zfs_close,
+	VOPNAME_READ, zfs_read,
+	VOPNAME_WRITE, zfs_write,
+	VOPNAME_IOCTL, zfs_ioctl,
+	VOPNAME_GETATTR, zfs_getattr,
+	VOPNAME_SETATTR, zfs_setattr,
+	VOPNAME_ACCESS, zfs_access,
+	VOPNAME_LOOKUP, zfs_lookup,
+	VOPNAME_RENAME, zfs_rename,
+	VOPNAME_FSYNC, zfs_fsync,
+	VOPNAME_INACTIVE, (fs_generic_func_p)zfs_inactive,
+	VOPNAME_FID, zfs_fid,
+	VOPNAME_SEEK, zfs_seek,
+	VOPNAME_FRLOCK, zfs_frlock,
+	VOPNAME_SPACE, zfs_space,
+	VOPNAME_GETPAGE, zfs_getpage,
+	VOPNAME_PUTPAGE, zfs_putpage,
+	VOPNAME_MAP, (fs_generic_func_p) zfs_map,
+	VOPNAME_ADDMAP, (fs_generic_func_p) zfs_addmap,
+	VOPNAME_DELMAP, zfs_delmap,
+	VOPNAME_PATHCONF, zfs_pathconf,
+	VOPNAME_GETSECATTR, zfs_getsecattr,
+	VOPNAME_SETSECATTR, zfs_setsecattr,
+	VOPNAME_VNEVENT, fs_vnevent_support,
+	NULL, NULL
+};
+
+/*
+ * Symbolic link vnode operations template.  Symlinks support only
+ * attribute access, rename, and readlink.
+ */
+vnodeops_t *zfs_symvnodeops;
+const fs_operation_def_t zfs_symvnodeops_template[] = {
+	VOPNAME_GETATTR, zfs_getattr,
+	VOPNAME_SETATTR, zfs_setattr,
+	VOPNAME_ACCESS, zfs_access,
+	VOPNAME_RENAME, zfs_rename,
+	VOPNAME_READLINK, zfs_readlink,
+	VOPNAME_INACTIVE, (fs_generic_func_p) zfs_inactive,
+	VOPNAME_FID, zfs_fid,
+	VOPNAME_PATHCONF, zfs_pathconf,
+	VOPNAME_VNEVENT, fs_vnevent_support,
+	NULL, NULL
+};
+
+/*
+ * Extended attribute directory vnode operations template
+ * This template is identical to the directory vnodes
+ * operation template except for restricted operations:
+ * VOP_MKDIR()
+ * VOP_SYMLINK()
+ * Note that there are other restrictions embedded in:
+ * zfs_create() - restrict type to VREG
+ * zfs_link() - no links into/out of attribute space
+ * zfs_rename() - no moves into/out of attribute space
+ */
+vnodeops_t *zfs_xdvnodeops;
+const fs_operation_def_t zfs_xdvnodeops_template[] = {
+	VOPNAME_OPEN, zfs_open,
+	VOPNAME_CLOSE, zfs_close,
+	VOPNAME_IOCTL, zfs_ioctl,
+	VOPNAME_GETATTR, zfs_getattr,
+	VOPNAME_SETATTR, zfs_setattr,
+	VOPNAME_ACCESS, zfs_access,
+	VOPNAME_LOOKUP, zfs_lookup,
+	VOPNAME_CREATE, zfs_create,
+	VOPNAME_REMOVE, zfs_remove,
+	VOPNAME_LINK, zfs_link,
+	VOPNAME_RENAME, zfs_rename,
+	/* mkdir and symlink are disallowed inside attribute space */
+	VOPNAME_MKDIR, zfs_inval,
+	VOPNAME_RMDIR, zfs_rmdir,
+	VOPNAME_READDIR, zfs_readdir,
+	VOPNAME_SYMLINK, zfs_inval,
+	VOPNAME_FSYNC, zfs_fsync,
+	VOPNAME_INACTIVE, (fs_generic_func_p) zfs_inactive,
+	VOPNAME_FID, zfs_fid,
+	VOPNAME_SEEK, zfs_seek,
+	VOPNAME_PATHCONF, zfs_pathconf,
+	VOPNAME_GETSECATTR, zfs_getsecattr,
+	VOPNAME_SETSECATTR, zfs_setsecattr,
+	VOPNAME_VNEVENT, fs_vnevent_support,
+	NULL, NULL
+};
+
+/*
+ * Error vnode operations template: minimal ops so the vnode can
+ * still be torn down (inactive) and queried via pathconf.
+ */
+vnodeops_t *zfs_evnodeops;
+const fs_operation_def_t zfs_evnodeops_template[] = {
+	VOPNAME_INACTIVE, (fs_generic_func_p) zfs_inactive,
+	VOPNAME_PATHCONF, zfs_pathconf,
+	NULL, NULL
+};
diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c
new file mode 100644
index 0000000000..1ff11e29b8
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c
@@ -0,0 +1,1286 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/mntent.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/stat.h>
+#include <sys/mode.h>
+#include <sys/atomic.h>
+#include <vm/pvn.h>
+#include "fs/fs_subr.h"
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_znode.h>
+#include <sys/zap.h>
+#include <sys/dmu.h>
+#include <sys/fs/zfs.h>
+
+struct kmem_cache *znode_cache = NULL;
+
+/*
+ * Note that znodes can be in one of 2 states:
+ * ZCACHE_mru - recently used, currently cached
+ * ZCACHE_mfu - frequently used, currently cached
+ * When there are no active references to the znode, they
+ * are linked onto one of the lists in zcache. These are the
+ * only znodes that can be evicted.
+ */
+
+typedef struct zcache_state {
+ list_t list; /* linked list of evictable znodes in state */
+ uint64_t lcnt; /* total number of znodes in the linked list */
+ uint64_t cnt; /* total number of all znodes in this state */
+ uint64_t hits;
+ kmutex_t mtx;
+} zcache_state_t;
+
+/* The 2 states: */
+static zcache_state_t ZCACHE_mru;
+static zcache_state_t ZCACHE_mfu;
+
+static struct zcache {
+ zcache_state_t *mru;
+ zcache_state_t *mfu;
+ uint64_t p; /* Target size of mru */
+ uint64_t c; /* Target size of cache */
+ uint64_t c_max; /* Maximum target cache size */
+
+ /* performance stats */
+ uint64_t missed;
+ uint64_t evicted;
+ uint64_t skipped;
+} zcache;
+
+void zcache_kmem_reclaim(void);
+
+#define ZCACHE_MINTIME (hz>>4) /* 62 ms */
+
+/*
+ * Move the supplied znode to the indicated state. The mutex
+ * for the znode must be held by the caller.
+ */
+static void
+zcache_change_state(zcache_state_t *new_state, znode_t *zp)
+{
+ /* ASSERT(MUTEX_HELD(hash_mtx)); */
+ ASSERT(zp->z_active);
+
+ if (zp->z_zcache_state) {
+ ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
+ atomic_add_64(&zp->z_zcache_state->cnt, -1);
+ }
+ atomic_add_64(&new_state->cnt, 1);
+ zp->z_zcache_state = new_state;
+}
+
+static void
+zfs_zcache_evict(znode_t *zp, kmutex_t *hash_mtx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ ASSERT(zp->z_phys);
+ ASSERT(zp->z_dbuf_held);
+
+ zp->z_dbuf_held = 0;
+ mutex_exit(&zp->z_lock);
+ dmu_buf_rele(zp->z_dbuf);
+ mutex_exit(hash_mtx);
+ VFS_RELE(zfsvfs->z_vfs);
+}
+
+/*
+ * Evict znodes from list until we've removed the specified number
+ */
+static void
+zcache_evict_state(zcache_state_t *state, int64_t cnt, zfsvfs_t *zfsvfs)
+{
+ int znodes_evicted = 0;
+ znode_t *zp, *zp_prev;
+ kmutex_t *hash_mtx;
+
+ ASSERT(state == zcache.mru || state == zcache.mfu);
+
+ mutex_enter(&state->mtx);
+
+ for (zp = list_tail(&state->list); zp; zp = zp_prev) {
+ zp_prev = list_prev(&state->list, zp);
+ if (zfsvfs && zp->z_zfsvfs != zfsvfs)
+ continue;
+ hash_mtx = ZFS_OBJ_MUTEX(zp);
+ if (mutex_tryenter(hash_mtx)) {
+ mutex_enter(&zp->z_lock);
+ list_remove(&zp->z_zcache_state->list, zp);
+ zp->z_zcache_state->lcnt -= 1;
+ ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
+ atomic_add_64(&zp->z_zcache_state->cnt, -1);
+ zp->z_zcache_state = NULL;
+ zp->z_zcache_access = 0;
+ /* drops z_lock and hash_mtx */
+ zfs_zcache_evict(zp, hash_mtx);
+ znodes_evicted += 1;
+ atomic_add_64(&zcache.evicted, 1);
+ if (znodes_evicted >= cnt)
+ break;
+ } else {
+ atomic_add_64(&zcache.skipped, 1);
+ }
+ }
+ mutex_exit(&state->mtx);
+
+ if (znodes_evicted < cnt)
+ dprintf("only evicted %lld znodes from %x",
+ (longlong_t)znodes_evicted, state);
+}
+
+static void
+zcache_adjust(void)
+{
+ uint64_t mrucnt = zcache.mru->lcnt;
+ uint64_t mfucnt = zcache.mfu->lcnt;
+ uint64_t p = zcache.p;
+ uint64_t c = zcache.c;
+
+ if (mrucnt > p)
+ zcache_evict_state(zcache.mru, mrucnt - p, NULL);
+
+ if (mfucnt > 0 && mrucnt + mfucnt > c) {
+ int64_t toevict = MIN(mfucnt, mrucnt + mfucnt - c);
+ zcache_evict_state(zcache.mfu, toevict, NULL);
+ }
+}
+
+/*
+ * Flush all *evictable* data from the cache.
+ * NOTE: this will not touch "active" (i.e. referenced) data.
+ */
+void
+zfs_zcache_flush(zfsvfs_t *zfsvfs)
+{
+ zcache_evict_state(zcache.mru, zcache.mru->lcnt, zfsvfs);
+ zcache_evict_state(zcache.mfu, zcache.mfu->lcnt, zfsvfs);
+}
+
+static void
+zcache_try_grow(int64_t cnt)
+{
+ int64_t size;
+ /*
+ * If we're almost to the current target cache size,
+ * increment the target cache size
+ */
+ size = zcache.mru->lcnt + zcache.mfu->lcnt;
+ if ((zcache.c - size) <= 1) {
+ atomic_add_64(&zcache.c, cnt);
+ if (zcache.c > zcache.c_max)
+ zcache.c = zcache.c_max;
+ else if (zcache.p + cnt < zcache.c)
+ atomic_add_64(&zcache.p, cnt);
+ }
+}
+
+/*
+ * This routine is called whenever a znode is accessed.
+ */
+static void
+zcache_access(znode_t *zp, kmutex_t *hash_mtx)
+{
+ ASSERT(MUTEX_HELD(hash_mtx));
+
+ if (zp->z_zcache_state == NULL) {
+ /*
+ * This znode is not in the cache.
+ * Add the new znode to the MRU state.
+ */
+
+ zcache_try_grow(1);
+
+ ASSERT(zp->z_zcache_access == 0);
+ zp->z_zcache_access = lbolt;
+ zcache_change_state(zcache.mru, zp);
+ mutex_exit(hash_mtx);
+
+ /*
+ * If we are using less than 2/3 of our total target
+ * cache size, bump up the target size for the MRU
+ * list.
+ */
+ if (zcache.mru->lcnt + zcache.mfu->lcnt < zcache.c*2/3) {
+ zcache.p = zcache.mru->lcnt + zcache.c/6;
+ }
+
+ zcache_adjust();
+
+ atomic_add_64(&zcache.missed, 1);
+ } else if (zp->z_zcache_state == zcache.mru) {
+ /*
+ * This znode has been "accessed" only once so far,
+ * Move it to the MFU state.
+ */
+ if (lbolt > zp->z_zcache_access + ZCACHE_MINTIME) {
+ /*
+			 * More than 62ms have passed since we
+ * instantiated this buffer. Move it to the
+ * most frequently used state.
+ */
+ zp->z_zcache_access = lbolt;
+ zcache_change_state(zcache.mfu, zp);
+ }
+ atomic_add_64(&zcache.mru->hits, 1);
+ mutex_exit(hash_mtx);
+ } else {
+ ASSERT(zp->z_zcache_state == zcache.mfu);
+ /*
+ * This buffer has been accessed more than once.
+ * Keep it in the MFU state.
+ */
+ atomic_add_64(&zcache.mfu->hits, 1);
+ mutex_exit(hash_mtx);
+ }
+}
+
+static void
+zcache_init(void)
+{
+ zcache.c = 20;
+ zcache.c_max = 50;
+
+ zcache.mru = &ZCACHE_mru;
+ zcache.mfu = &ZCACHE_mfu;
+
+ list_create(&zcache.mru->list, sizeof (znode_t),
+ offsetof(znode_t, z_zcache_node));
+ list_create(&zcache.mfu->list, sizeof (znode_t),
+ offsetof(znode_t, z_zcache_node));
+}
+
+static void
+zcache_fini(void)
+{
+ zfs_zcache_flush(NULL);
+
+ list_destroy(&zcache.mru->list);
+ list_destroy(&zcache.mfu->list);
+}
+
+/*ARGSUSED*/
+static void
+znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr)
+{
+ znode_t *zp = user_ptr;
+ vnode_t *vp = ZTOV(zp);
+
+ if (vp->v_count == 0) {
+ vn_invalid(vp);
+ zfs_znode_free(zp);
+ }
+}
+
+/*ARGSUSED*/
+static int
+zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags)
+{
+ znode_t *zp = buf;
+
+ zp->z_vnode = vn_alloc(KM_SLEEP);
+ zp->z_vnode->v_data = (caddr_t)zp;
+ mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
+ rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
+ rw_init(&zp->z_grow_lock, NULL, RW_DEFAULT, NULL);
+ rw_init(&zp->z_append_lock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
+ zp->z_dbuf_held = 0;
+ zp->z_dirlocks = 0;
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+zfs_znode_cache_destructor(void *buf, void *cdarg)
+{
+ znode_t *zp = buf;
+
+ ASSERT(zp->z_dirlocks == 0);
+ mutex_destroy(&zp->z_lock);
+ rw_destroy(&zp->z_map_lock);
+ rw_destroy(&zp->z_grow_lock);
+ rw_destroy(&zp->z_append_lock);
+ mutex_destroy(&zp->z_acl_lock);
+
+ ASSERT(zp->z_dbuf_held == 0);
+ ASSERT(ZTOV(zp)->v_count == 0);
+ vn_free(ZTOV(zp));
+}
+
+void
+zfs_znode_init(void)
+{
+ /*
+ * Initialize zcache
+ */
+ ASSERT(znode_cache == NULL);
+ znode_cache = kmem_cache_create("zfs_znode_cache",
+ sizeof (znode_t), 0, zfs_znode_cache_constructor,
+ zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
+
+ zcache_init();
+}
+
+void
+zfs_znode_fini(void)
+{
+ zcache_fini();
+
+ /*
+ * Cleanup vfs & vnode ops
+ */
+ zfs_remove_op_tables();
+
+ /*
+ * Cleanup zcache
+ */
+ if (znode_cache)
+ kmem_cache_destroy(znode_cache);
+ znode_cache = NULL;
+}
+
+struct vnodeops *zfs_dvnodeops;
+struct vnodeops *zfs_fvnodeops;
+struct vnodeops *zfs_symvnodeops;
+struct vnodeops *zfs_xdvnodeops;
+struct vnodeops *zfs_evnodeops;
+
+void
+zfs_remove_op_tables()
+{
+ /*
+ * Remove vfs ops
+ */
+ ASSERT(zfsfstype);
+ (void) vfs_freevfsops_by_type(zfsfstype);
+ zfsfstype = 0;
+
+ /*
+ * Remove vnode ops
+ */
+ if (zfs_dvnodeops)
+ vn_freevnodeops(zfs_dvnodeops);
+ if (zfs_fvnodeops)
+ vn_freevnodeops(zfs_fvnodeops);
+ if (zfs_symvnodeops)
+ vn_freevnodeops(zfs_symvnodeops);
+ if (zfs_xdvnodeops)
+ vn_freevnodeops(zfs_xdvnodeops);
+ if (zfs_evnodeops)
+ vn_freevnodeops(zfs_evnodeops);
+
+ zfs_dvnodeops = NULL;
+ zfs_fvnodeops = NULL;
+ zfs_symvnodeops = NULL;
+ zfs_xdvnodeops = NULL;
+ zfs_evnodeops = NULL;
+}
+
+extern const fs_operation_def_t zfs_dvnodeops_template[];
+extern const fs_operation_def_t zfs_fvnodeops_template[];
+extern const fs_operation_def_t zfs_xdvnodeops_template[];
+extern const fs_operation_def_t zfs_symvnodeops_template[];
+extern const fs_operation_def_t zfs_evnodeops_template[];
+
+int
+zfs_create_op_tables()
+{
+ int error;
+
+ /*
+ * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
+	 * due to a failure to remove the 2nd modlinkage (zfs_modldrv).
+ * In this case we just return as the ops vectors are already set up.
+ */
+ if (zfs_dvnodeops)
+ return (0);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
+ &zfs_dvnodeops);
+ if (error)
+ return (error);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
+ &zfs_fvnodeops);
+ if (error)
+ return (error);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
+ &zfs_symvnodeops);
+ if (error)
+ return (error);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
+ &zfs_xdvnodeops);
+ if (error)
+ return (error);
+
+ error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
+ &zfs_evnodeops);
+
+ return (error);
+}
+
+/*
+ * zfs_init_fs - Initialize the zfsvfs struct and the file system
+ * incore "master" object. Verify version compatibility.
+ */
+int
+zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
+{
+ extern int zfsfstype;
+
+ objset_t *os = zfsvfs->z_os;
+ uint64_t zoid;
+ uint64_t version = ZFS_VERSION;
+ int i, error;
+ dmu_object_info_t doi;
+ dmu_objset_stats_t *stats;
+
+ *zpp = NULL;
+
+ /*
+ * XXX - hack to auto-create the pool root filesystem at
+ * the first attempted mount.
+ */
+ if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) {
+ dmu_tx_t *tx = dmu_tx_create(os);
+
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 3); /* master node */
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 1); /* delete queue */
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ ASSERT3U(error, ==, 0);
+ zfs_create_fs(os, cr, tx);
+ dmu_tx_commit(tx);
+ }
+
+ if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_VERSION_OBJ, 8, 1, &version)) {
+ return (EINVAL);
+ } else if (version != ZFS_VERSION) {
+ (void) printf("Mismatched versions: File system "
+ "is version %lld on-disk format, which is "
+ "incompatible with this software version %lld!",
+ (u_longlong_t)version, ZFS_VERSION);
+ return (ENOTSUP);
+ }
+
+ /*
+ * The fsid is 64 bits, composed of an 8-bit fs type, which
+ * separates our fsid from any other filesystem types, and a
+ * 56-bit objset unique ID. The objset unique ID is unique to
+ * all objsets open on this system, provided by unique_create().
+ * The 8-bit fs type must be put in the low bits of fsid[1]
+ * because that's where other Solaris filesystems put it.
+ */
+ stats = kmem_alloc(sizeof (dmu_objset_stats_t), KM_SLEEP);
+ dmu_objset_stats(os, stats);
+ ASSERT((stats->dds_fsid_guid & ~((1ULL<<56)-1)) == 0);
+ zfsvfs->z_vfs->vfs_fsid.val[0] = stats->dds_fsid_guid;
+ zfsvfs->z_vfs->vfs_fsid.val[1] = ((stats->dds_fsid_guid>>32) << 8) |
+ zfsfstype & 0xFF;
+ kmem_free(stats, sizeof (dmu_objset_stats_t));
+ stats = NULL;
+
+ if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zoid)) {
+ return (EINVAL);
+ }
+ ASSERT(zoid != 0);
+ zfsvfs->z_root = zoid;
+
+ /*
+ * Create the per mount vop tables.
+ */
+
+ /*
+ * Initialize zget mutex's
+ */
+ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+
+ error = zfs_zget(zfsvfs, zoid, zpp);
+ if (error)
+ return (error);
+ ASSERT3U((*zpp)->z_id, ==, zoid);
+
+ if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_DELETE_QUEUE, 8, 1, &zoid)) {
+ return (EINVAL);
+ }
+
+ zfsvfs->z_dqueue = zoid;
+
+ /*
+ * Initialize delete head structure
+ * Thread(s) will be started/stopped via
+ * readonly_changed_cb() depending
+ * on whether this is rw/ro mount.
+ */
+ list_create(&zfsvfs->z_delete_head.z_znodes,
+ sizeof (znode_t), offsetof(znode_t, z_list_node));
+
+ return (0);
+}
+
+/*
+ * Construct a new znode/vnode and initialize.
+ *
+ * This does not do a call to dmu_set_user() that is
+ * up to the caller to do, in case you don't want to
+ * return the znode
+ */
+znode_t *
+zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz)
+{
+ znode_t *zp;
+ vnode_t *vp;
+
+ zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+
+ ASSERT(zp->z_dirlocks == NULL);
+
+ zp->z_phys = db->db_data;
+ zp->z_zfsvfs = zfsvfs;
+ zp->z_active = 1;
+ zp->z_reap = 0;
+ zp->z_atime_dirty = 0;
+ zp->z_dbuf_held = 0;
+ zp->z_mapcnt = 0;
+ zp->z_last_itx = 0;
+ zp->z_dbuf = db;
+ zp->z_id = obj_num;
+ zp->z_blksz = blksz;
+ zp->z_seq = 0x7A4653;
+
+ bzero(&zp->z_zcache_node, sizeof (list_node_t));
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ list_insert_tail(&zfsvfs->z_all_znodes, zp);
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ vp = ZTOV(zp);
+ vn_reinit(vp);
+
+ vp->v_vfsp = zfsvfs->z_parent->z_vfs;
+ vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
+
+ switch (vp->v_type) {
+ case VDIR:
+ if (zp->z_phys->zp_flags & ZFS_XATTR) {
+ vn_setops(vp, zfs_xdvnodeops);
+ vp->v_flag |= V_XATTRDIR;
+ } else
+ vn_setops(vp, zfs_dvnodeops);
+ break;
+ case VBLK:
+ case VCHR:
+ vp->v_rdev = (dev_t)zp->z_phys->zp_rdev;
+ /*FALLTHROUGH*/
+ case VFIFO:
+ case VSOCK:
+ case VDOOR:
+ vn_setops(vp, zfs_fvnodeops);
+ break;
+ case VREG:
+ vp->v_flag |= VMODSORT;
+ vn_setops(vp, zfs_fvnodeops);
+ break;
+ case VLNK:
+ vn_setops(vp, zfs_symvnodeops);
+ break;
+ default:
+ vn_setops(vp, zfs_evnodeops);
+ break;
+ }
+
+ return (zp);
+}
+
+static void
+zfs_znode_dmu_init(znode_t *zp)
+{
+ znode_t *nzp;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ dmu_buf_t *db = zp->z_dbuf;
+
+ mutex_enter(&zp->z_lock);
+
+ nzp = dmu_buf_set_user(db, zp, &zp->z_phys, znode_pageout_func);
+
+ /*
+ * there should be no
+ * concurrent zgets on this object.
+ */
+ ASSERT3P(nzp, ==, NULL);
+
+ /*
+ * Slap on VROOT if we are the root znode
+ */
+ if (zp->z_id == zfsvfs->z_root) {
+ ZTOV(zp)->v_flag |= VROOT;
+ }
+
+ zp->z_zcache_state = NULL;
+ zp->z_zcache_access = 0;
+
+ ASSERT(zp->z_dbuf_held == 0);
+ zp->z_dbuf_held = 1;
+ VFS_HOLD(zfsvfs->z_vfs);
+ mutex_exit(&zp->z_lock);
+ vn_exists(ZTOV(zp));
+}
+
+/*
+ * Create a new DMU object to hold a zfs znode.
+ *
+ * IN: dzp - parent directory for new znode
+ * vap - file attributes for new znode
+ * tx - dmu transaction id for zap operations
+ * cr - credentials of caller
+ * flag - flags:
+ * IS_ROOT_NODE - new object will be root
+ * IS_XATTR - new object is an attribute
+ * IS_REPLAY - intent log replay
+ *
+ * OUT: oid - ID of created object
+ *
+ */
+void
+zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr,
+ uint_t flag, znode_t **zpp, int bonuslen)
+{
+ dmu_buf_t *dbp;
+ znode_phys_t *pzp;
+ znode_t *zp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ timestruc_t now;
+ uint64_t gen;
+ int err;
+
+ ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
+
+ if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */
+ *oid = vap->va_nodeid;
+ flag |= IS_REPLAY;
+ now = vap->va_ctime; /* see zfs_replay_create() */
+ gen = vap->va_nblocks; /* ditto */
+ } else {
+ *oid = 0;
+ gethrestime(&now);
+ gen = dmu_tx_get_txg(tx);
+ }
+
+ /*
+ * Create a new DMU object.
+ */
+ if (vap->va_type == VDIR) {
+ if (flag & IS_REPLAY) {
+ err = zap_create_claim(zfsvfs->z_os, *oid,
+ DMU_OT_DIRECTORY_CONTENTS,
+ DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ ASSERT3U(err, ==, 0);
+ } else {
+ *oid = zap_create(zfsvfs->z_os,
+ DMU_OT_DIRECTORY_CONTENTS,
+ DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ }
+ } else {
+ if (flag & IS_REPLAY) {
+ err = dmu_object_claim(zfsvfs->z_os, *oid,
+ DMU_OT_PLAIN_FILE_CONTENTS, 0,
+ DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ ASSERT3U(err, ==, 0);
+ } else {
+ *oid = dmu_object_alloc(zfsvfs->z_os,
+ DMU_OT_PLAIN_FILE_CONTENTS, 0,
+ DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ }
+ }
+ dbp = dmu_bonus_hold(zfsvfs->z_os, *oid);
+ dmu_buf_will_dirty(dbp, tx);
+
+ /*
+ * Initialize the znode physical data to zero.
+ */
+ ASSERT(dbp->db_size >= sizeof (znode_phys_t));
+ bzero(dbp->db_data, dbp->db_size);
+ pzp = dbp->db_data;
+
+ /*
+ * If this is the root, fix up the half-initialized parent pointer
+ * to reference the just-allocated physical data area.
+ */
+ if (flag & IS_ROOT_NODE) {
+ dzp->z_phys = pzp;
+ dzp->z_id = *oid;
+ }
+
+ /*
+ * If parent is an xattr, so am I.
+ */
+ if (dzp->z_phys->zp_flags & ZFS_XATTR)
+ flag |= IS_XATTR;
+
+ if (vap->va_type == VBLK || vap->va_type == VCHR) {
+ pzp->zp_rdev = vap->va_rdev;
+ }
+
+ if (vap->va_type == VDIR) {
+ pzp->zp_size = 2; /* contents ("." and "..") */
+ pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
+ }
+
+ pzp->zp_parent = dzp->z_id;
+ if (flag & IS_XATTR)
+ pzp->zp_flags |= ZFS_XATTR;
+
+ pzp->zp_gen = gen;
+
+ ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
+ ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
+
+ if (vap->va_mask & AT_ATIME) {
+ ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
+ } else {
+ ZFS_TIME_ENCODE(&now, pzp->zp_atime);
+ }
+
+ if (vap->va_mask & AT_MTIME) {
+ ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
+ } else {
+ ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
+ }
+
+ pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
+ zp = zfs_znode_alloc(zfsvfs, dbp, *oid, 0);
+
+ zfs_perm_init(zp, dzp, flag, vap, tx, cr);
+
+ if (zpp) {
+ kmutex_t *hash_mtx = ZFS_OBJ_MUTEX(zp);
+
+ mutex_enter(hash_mtx);
+ zfs_znode_dmu_init(zp);
+ zcache_access(zp, hash_mtx);
+ *zpp = zp;
+ } else {
+ ZTOV(zp)->v_count = 0;
+ dmu_buf_rele(dbp);
+ zfs_znode_free(zp);
+ }
+}
+
+int
+zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
+{
+ dmu_object_info_t doi;
+ dmu_buf_t *db;
+ znode_t *zp;
+
+ *zpp = NULL;
+
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
+
+ db = dmu_bonus_hold(zfsvfs->z_os, obj_num);
+ if (db == NULL) {
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (ENOENT);
+ }
+
+ dmu_object_info_from_db(db, &doi);
+ if (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ doi.doi_bonus_size < sizeof (znode_phys_t)) {
+ dmu_buf_rele(db);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (EINVAL);
+ }
+ dmu_buf_read(db);
+
+ ASSERT(db->db_object == obj_num);
+ ASSERT(db->db_offset == -1);
+ ASSERT(db->db_data != NULL);
+
+ zp = dmu_buf_get_user(db);
+
+ if (zp != NULL) {
+ mutex_enter(&zp->z_lock);
+
+ ASSERT3U(zp->z_id, ==, obj_num);
+ if (zp->z_reap) {
+ dmu_buf_rele(db);
+ mutex_exit(&zp->z_lock);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (ENOENT);
+ } else if (zp->z_dbuf_held) {
+ dmu_buf_rele(db);
+ } else {
+ zp->z_dbuf_held = 1;
+ VFS_HOLD(zfsvfs->z_vfs);
+ }
+
+ if (zp->z_active == 0) {
+ zp->z_active = 1;
+ if (list_link_active(&zp->z_zcache_node)) {
+ mutex_enter(&zp->z_zcache_state->mtx);
+ list_remove(&zp->z_zcache_state->list, zp);
+ zp->z_zcache_state->lcnt -= 1;
+ mutex_exit(&zp->z_zcache_state->mtx);
+ }
+ }
+ VN_HOLD(ZTOV(zp));
+ mutex_exit(&zp->z_lock);
+ zcache_access(zp, ZFS_OBJ_MUTEX(zp));
+ *zpp = zp;
+ return (0);
+ }
+
+ /*
+ * Not found create new znode/vnode
+ */
+ zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size);
+ ASSERT3U(zp->z_id, ==, obj_num);
+ zfs_znode_dmu_init(zp);
+ zcache_access(zp, ZFS_OBJ_MUTEX(zp));
+ *zpp = zp;
+ return (0);
+}
+
+void
+zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
+ if (zp->z_phys->zp_acl.z_acl_extern_obj) {
+ error = dmu_object_free(zfsvfs->z_os,
+ zp->z_phys->zp_acl.z_acl_extern_obj, tx);
+ ASSERT3U(error, ==, 0);
+ }
+ if (zp->z_zcache_state) {
+ ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
+ atomic_add_64(&zp->z_zcache_state->cnt, -1);
+ }
+ error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx);
+ ASSERT3U(error, ==, 0);
+ zp->z_dbuf_held = 0;
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
+ dmu_buf_rele(zp->z_dbuf);
+}
+
+void
+zfs_zinactive(znode_t *zp)
+{
+ vnode_t *vp = ZTOV(zp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t z_id = zp->z_id;
+
+ ASSERT(zp->z_dbuf_held && zp->z_phys);
+
+ /*
+	 * Don't allow a zfs_zget() while we're trying to release this znode
+ */
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
+
+ mutex_enter(&zp->z_lock);
+ mutex_enter(&vp->v_lock);
+ vp->v_count--;
+ if (vp->v_count > 0 || vn_has_cached_data(vp)) {
+ /*
+ * If the hold count is greater than zero, somebody has
+ * obtained a new reference on this znode while we were
+ * processing it here, so we are done. If we still have
+ * mapped pages then we are also done, since we don't
+ * want to inactivate the znode until the pages get pushed.
+ *
+ * XXX - if vn_has_cached_data(vp) is true, but count == 0,
+ * this seems like it would leave the znode hanging with
+ * no chance to go inactive...
+ */
+ mutex_exit(&vp->v_lock);
+ mutex_exit(&zp->z_lock);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+ return;
+ }
+ mutex_exit(&vp->v_lock);
+ zp->z_active = 0;
+
+ /*
+ * If this was the last reference to a file with no links,
+ * remove the file from the file system.
+ */
+ if (zp->z_reap) {
+ mutex_exit(&zp->z_lock);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+ ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
+ atomic_add_64(&zp->z_zcache_state->cnt, -1);
+ zp->z_zcache_state = NULL;
+ /* XATTR files are not put on the delete queue */
+ if (zp->z_phys->zp_flags & ZFS_XATTR) {
+ zfs_rmnode(zp);
+ } else {
+ mutex_enter(&zfsvfs->z_delete_head.z_mutex);
+ list_insert_tail(&zfsvfs->z_delete_head.z_znodes, zp);
+ zfsvfs->z_delete_head.z_znode_count++;
+ cv_broadcast(&zfsvfs->z_delete_head.z_cv);
+ mutex_exit(&zfsvfs->z_delete_head.z_mutex);
+ }
+ VFS_RELE(zfsvfs->z_vfs);
+ return;
+ }
+
+ /*
+ * If the file system for this znode is no longer mounted,
+ * evict the znode now, don't put it in the cache.
+ */
+ if (zfsvfs->z_unmounted1) {
+ zfs_zcache_evict(zp, ZFS_OBJ_MUTEX(zp));
+ return;
+ }
+
+ /* put znode on evictable list */
+ mutex_enter(&zp->z_zcache_state->mtx);
+ list_insert_head(&zp->z_zcache_state->list, zp);
+ zp->z_zcache_state->lcnt += 1;
+ mutex_exit(&zp->z_zcache_state->mtx);
+ mutex_exit(&zp->z_lock);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+}
+
+void
+zfs_znode_free(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ list_remove(&zfsvfs->z_all_znodes, zp);
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ kmem_cache_free(znode_cache, zp);
+}
+
+void
+zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
+{
+ timestruc_t now;
+
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+
+ gethrestime(&now);
+
+ if (tx) {
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+ zp->z_atime_dirty = 0;
+ zp->z_seq++;
+ } else {
+ zp->z_atime_dirty = 1;
+ }
+
+ if (flag & AT_ATIME)
+ ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);
+
+ if (flag & AT_MTIME)
+ ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
+
+ if (flag & AT_CTIME)
+ ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
+}
+
+/*
+ * Update the requested znode timestamps with the current time.
+ * If we are in a transaction, then go ahead and mark the znode
+ * dirty in the transaction so the timestamps will go to disk.
+ * Otherwise, we will get pushed next time the znode is updated
+ * in a transaction, or when this znode eventually goes inactive.
+ *
+ * Why is this OK?
+ * 1 - Only the ACCESS time is ever updated outside of a transaction.
+ * 2 - Multiple consecutive updates will be collapsed into a single
+ * znode update by the transaction grouping semantics of the DMU.
+ */
+void
+zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
+{
+ mutex_enter(&zp->z_lock);
+ zfs_time_stamper_locked(zp, flag, tx);
+ mutex_exit(&zp->z_lock);
+}
+
+/*
+ * Grow the block size for a file. This may involve migrating data
+ * from the bonus buffer into a data block (when we grow beyond the
+ * bonus buffer data area).
+ *
+ * IN: zp - znode of file to free data in.
+ * size - requested block size
+ * tx - open transaction.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * NOTE: this function assumes that the znode is write locked.
+ */
+int
+zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
+{
+ int error;
+ u_longlong_t dummy;
+
+ ASSERT(rw_write_held(&zp->z_grow_lock));
+
+ if (size <= zp->z_blksz)
+ return (0);
+ /*
+ * If the file size is already greater than the current blocksize,
+ * we will not grow. If there is more than one block in a file,
+ * the blocksize cannot change.
+ */
+ if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
+ return (0);
+
+ error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
+ size, 0, tx);
+ if (error == ENOTSUP)
+ return (0);
+ ASSERT3U(error, ==, 0);
+
+ /* What blocksize did we actually get? */
+ dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
+
+ return (0);
+}
+
+/*
+ * This is a dummy interface used when pvn_vplist_dirty() should *not*
+ * be calling back into the fs for a putpage(). E.g.: when truncating
+ * a file, the pages being "thrown away" don't need to be written out.
+ */
+/* ARGSUSED */
+static int
+zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
+ int flags, cred_t *cr)
+{
+ ASSERT(0);
+ return (0);
+}
+
+/*
+ * Free space in a file. Currently, this function only
+ * supports freeing space at the end of the file.
+ *
+ * IN: zp - znode of file to free data in.
+ * from - start of section to free.
+ * len - length of section to free (0 => to EOF).
+ * flag - current file open mode flags.
+ * tx - open transaction.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ */
+int
+zfs_freesp(znode_t *zp, uint64_t from, uint64_t len, int flag, dmu_tx_t *tx,
+ cred_t *cr)
+{
+ vnode_t *vp = ZTOV(zp);
+ uint64_t size = zp->z_phys->zp_size;
+ uint64_t end = from + len;
+ int have_grow_lock, error;
+
+ have_grow_lock = RW_WRITE_HELD(&zp->z_grow_lock);
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ if (len == 0 && size == from) {
+ return (0);
+ }
+
+ /*
+ * Check for any locks in the region to be freed.
+ */
+ if (MANDLOCK(vp, (mode_t)zp->z_phys->zp_mode)) {
+ uint64_t start;
+
+ if (size > from)
+ start = from;
+ else
+ start = size;
+ if (error = chklock(vp, FWRITE, start, 0, flag, NULL))
+ return (error);
+ }
+
+ if (end > zp->z_blksz && (!ISP2(zp->z_blksz) ||
+ zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
+ uint64_t new_blksz;
+ /*
+ * We are growing the file past the current block size.
+ */
+ if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
+ ASSERT(!ISP2(zp->z_blksz));
+ new_blksz = MIN(end, SPA_MAXBLOCKSIZE);
+ } else {
+ new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
+ }
+ error = zfs_grow_blocksize(zp, new_blksz, tx);
+ ASSERT(error == 0);
+ }
+ if (end > size || len == 0)
+ zp->z_phys->zp_size = end;
+ if (from > size)
+ return (0);
+
+ if (have_grow_lock)
+ rw_downgrade(&zp->z_grow_lock);
+ /*
+ * Clear any mapped pages in the truncated region.
+ */
+ rw_enter(&zp->z_map_lock, RW_WRITER);
+ if (vn_has_cached_data(vp)) {
+ page_t *pp;
+ uint64_t start = from & PAGEMASK;
+ int off = from & PAGEOFFSET;
+
+ if (off != 0 && (pp = page_lookup(vp, start, SE_SHARED))) {
+ /*
+ * We need to zero a partial page.
+ */
+ pagezero(pp, off, PAGESIZE - off);
+ start += PAGESIZE;
+ page_unlock(pp);
+ }
+ error = pvn_vplist_dirty(vp, start, zfs_no_putpage,
+ B_INVAL | B_TRUNC, cr);
+ ASSERT(error == 0);
+ }
+ rw_exit(&zp->z_map_lock);
+
+ if (!have_grow_lock)
+ rw_enter(&zp->z_grow_lock, RW_READER);
+
+ if (len == 0)
+ len = -1;
+ else if (end > size)
+ len = size - from;
+ dmu_free_range(zp->z_zfsvfs->z_os, zp->z_id, from, len, tx);
+
+ if (!have_grow_lock)
+ rw_exit(&zp->z_grow_lock);
+
+ return (0);
+}
+
+
+void
+zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx)
+{
+ zfsvfs_t zfsvfs;
+ uint64_t moid, doid, roid = 0;
+ uint64_t version = ZFS_VERSION;
+ int error;
+ znode_t *rootzp = NULL;
+ vnode_t *vp;
+ vattr_t vattr;
+
+ /*
+ * First attempt to create master node.
+ */
+ moid = MASTER_NODE_OBJ;
+ error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Set starting attributes.
+ */
+
+ error = zap_update(os, moid, ZFS_VERSION_OBJ, 8, 1, &version, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Create a delete queue.
+ */
+ doid = zap_create(os, DMU_OT_DELETE_QUEUE, DMU_OT_NONE, 0, tx);
+
+ error = zap_add(os, moid, ZFS_DELETE_QUEUE, 8, 1, &doid, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Create root znode. Create minimal znode/vnode/zfsvfs
+ * to allow zfs_mknode to work.
+ */
+ vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
+ vattr.va_type = VDIR;
+ vattr.va_mode = S_IFDIR|0755;
+ vattr.va_uid = 0;
+ vattr.va_gid = 3;
+
+ rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+ rootzp->z_zfsvfs = &zfsvfs;
+ rootzp->z_active = 1;
+ rootzp->z_reap = 0;
+ rootzp->z_atime_dirty = 0;
+ rootzp->z_dbuf_held = 0;
+
+ vp = ZTOV(rootzp);
+ vn_reinit(vp);
+ vp->v_type = VDIR;
+
+ bzero(&zfsvfs, sizeof (zfsvfs_t));
+
+ zfsvfs.z_os = os;
+ zfsvfs.z_assign = TXG_NOWAIT;
+ zfsvfs.z_parent = &zfsvfs;
+
+ mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
+ offsetof(znode_t, z_link_node));
+
+ zfs_mknode(rootzp, &vattr, &roid, tx, cr, IS_ROOT_NODE, NULL, 0);
+ ASSERT3U(rootzp->z_id, ==, roid);
+ error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &roid, tx);
+ ASSERT(error == 0);
+
+ ZTOV(rootzp)->v_count = 0;
+ kmem_cache_free(znode_cache, rootzp);
+}
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
new file mode 100644
index 0000000000..1adc8ca3df
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -0,0 +1,1242 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/arc.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+#include <sys/dsl_dataset.h>
+#include <sys/vdev.h>
+
+
+/*
+ * The zfs intent log (ZIL) saves transaction records of system calls
+ * that change the file system in memory with enough information
+ * to be able to replay them. These are stored in memory until
+ * either the DMU transaction group (txg) commits them to the stable pool
+ * and they can be discarded, or they are flushed to the stable log
+ * (also in the pool) due to a fsync, O_DSYNC or other synchronous
+ * requirement. In the event of a panic or power fail then those log
+ * records (transactions) are replayed.
+ *
+ * There is one ZIL per file system. Its on-disk (pool) format consists
+ * of 3 parts:
+ *
+ * - ZIL header
+ * - ZIL blocks
+ * - ZIL records
+ *
+ * A log record holds a system call transaction. Log blocks can
+ * hold many log records and the blocks are chained together.
+ * Each ZIL block contains a block pointer (blkptr_t) to the next
+ * ZIL block in the chain. The ZIL header points to the first
+ * block in the chain. Note there is not a fixed place in the pool
+ * to hold blocks. They are dynamically allocated and freed as
 * needed from the blocks available.
 */
+
/*
 * These global ZIL switches affect all pools
 */
int zil_disable = 0;	/* disable intent logging */
int zil_always = 0;	/* make every transaction synchronous */
int zil_purge = 0;	/* at pool open, just throw everything away */
int zil_noflush = 0;	/* don't flush write cache buffers on disks */

/* kmem cache for log write buffers (lwb_t); created in zil_init() */
static kmem_cache_t *zil_lwb_cache;
+
+static int
+zil_dva_compare(const void *x1, const void *x2)
+{
+ const dva_t *dva1 = x1;
+ const dva_t *dva2 = x2;
+
+ if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
+ return (-1);
+ if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2))
+ return (1);
+
+ if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2))
+ return (-1);
+ if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2))
+ return (1);
+
+ return (0);
+}
+
/*
 * Initialize an empty DVA tree, used to de-duplicate block claims
 * while walking the log chain (see zil_dva_tree_add()).
 */
static void
zil_dva_tree_init(avl_tree_t *t)
{
	avl_create(t, zil_dva_compare, sizeof (zil_dva_node_t),
	    offsetof(zil_dva_node_t, zn_node));
}
+
+static void
+zil_dva_tree_fini(avl_tree_t *t)
+{
+ zil_dva_node_t *zn;
+ void *cookie = NULL;
+
+ while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
+ kmem_free(zn, sizeof (zil_dva_node_t));
+
+ avl_destroy(t);
+}
+
+static int
+zil_dva_tree_add(avl_tree_t *t, dva_t *dva)
+{
+ zil_dva_node_t *zn;
+ avl_index_t where;
+
+ if (avl_find(t, dva, &where) != NULL)
+ return (EEXIST);
+
+ zn = kmem_alloc(sizeof (zil_dva_node_t), KM_SLEEP);
+ zn->zn_dva = *dva;
+ avl_insert(t, zn, where);
+
+ return (0);
+}
+
/*
 * Read a log block, make sure it's valid, and byteswap it if necessary.
 *
 * 'buf' must be at least BP_GET_LSIZE(bp) bytes.  Returns 0 on success;
 * a non-zero errno (ESTALE / ENOENT / EOVERFLOW / I/O error) terminates
 * the walk of the log chain in zil_parse().
 */
static int
zil_read_log_block(zilog_t *zilog, blkptr_t *bp, char *buf)
{
	uint64_t blksz = BP_GET_LSIZE(bp);
	/* the trailer lives in the last sizeof (zil_trailer_t) bytes */
	zil_trailer_t *ztp = (zil_trailer_t *)(buf + blksz) - 1;
	zio_cksum_t cksum;
	int error;

	error = zio_wait(zio_read(NULL, zilog->zl_spa, bp, buf, blksz,
	    NULL, NULL, ZIO_PRIORITY_SYNC_READ,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE));
	if (error) {
		dprintf_bp(bp, "zilog %p bp %p read failed, error %d: ",
		    zilog, bp, error);
		return (error);
	}

	if (BP_SHOULD_BYTESWAP(bp))
		byteswap_uint64_array(buf, blksz);

	/*
	 * Sequence numbers should be... sequential.  The checksum verifier for
	 * the next block should be: <logid[0], logid[1], objset id, seq + 1>.
	 * A mismatch means the pointer in the trailer is stale (left over
	 * from an earlier incarnation of the log), so the chain ends here.
	 */
	cksum = bp->blk_cksum;
	cksum.zc_word[3]++;	/* word[3] carries the per-block sequence */
	if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum)) != 0) {
		dprintf_bp(bp, "zilog %p bp %p stale pointer: ", zilog, bp);
		return (ESTALE);
	}

	if (BP_IS_HOLE(&ztp->zit_next_blk)) {
		dprintf_bp(bp, "zilog %p bp %p hole: ", zilog, bp);
		return (ENOENT);
	}

	/* a corrupt record-byte count would walk us off the buffer */
	if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t))) {
		dprintf("zilog %p bp %p nused exceeds blksz\n", zilog, bp);
		return (EOVERFLOW);
	}

	dprintf_bp(bp, "zilog %p bp %p good block: ", zilog, bp);

	return (0);
}
+
/*
 * Parse the intent log, and call parse_func for each valid record within.
 *
 * parse_blk_func (if non-NULL) is invoked once per block pointer — note
 * it is called even for the block on which zil_read_log_block() failed,
 * before the walk stops.  parse_lr_func (if non-NULL) is invoked for
 * each log record inside every successfully validated block.
 */
void
zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
    zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
{
	blkptr_t blk;
	char *lrbuf, *lrp;
	zil_trailer_t *ztp;
	int reclen, error;

	blk = zilog->zl_header->zh_log;
	if (BP_IS_HOLE(&blk))	/* empty log: nothing to walk */
		return;

	/*
	 * Starting at the block pointed to by zh_log we read the log chain.
	 * For each block in the chain we strongly check that block to
	 * ensure its validity.  We stop when an invalid block is found.
	 * For each block pointer in the chain we call parse_blk_func().
	 * For each record in each valid block we call parse_lr_func().
	 */
	zil_dva_tree_init(&zilog->zl_dva_tree);
	/* one max-size buffer is reused for every block in the chain */
	lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
	for (;;) {
		error = zil_read_log_block(zilog, &blk, lrbuf);

		if (parse_blk_func != NULL)
			parse_blk_func(zilog, &blk, arg, txg);

		if (error)
			break;

		/* advance to the next block via the trailer's pointer */
		ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
		blk = ztp->zit_next_blk;

		if (parse_lr_func == NULL)
			continue;

		for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) {
			lr_t *lr = (lr_t *)lrp;
			reclen = lr->lrc_reclen;
			ASSERT3U(reclen, >=, sizeof (lr_t));
			parse_lr_func(zilog, lr, arg, txg);
		}
	}
	zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
	zil_dva_tree_fini(&zilog->zl_dva_tree);
}
+
/* ARGSUSED */
/*
 * zil_parse() block callback for zil_claim(): claim one log block so
 * that pool-wide space accounting knows it is live.  Blocks born before
 * first_txg were already committed; the DVA tree filters duplicates.
 */
static void
zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
{
	spa_t *spa = zilog->zl_spa;
	int err;

	dprintf_bp(bp, "first_txg %llu: ", first_txg);

	/*
	 * Claim log block if not already committed and not already claimed.
	 */
	if (bp->blk_birth >= first_txg &&
	    zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) {
		err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL));
		ASSERT(err == 0);
	}
}
+
+static void
+zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
+{
+ if (lrc->lrc_txtype == TX_WRITE) {
+ lr_write_t *lr = (lr_write_t *)lrc;
+ zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg);
+ }
+}
+
/* ARGSUSED */
/*
 * zil_parse() block callback for zil_destroy(): free one log block
 * in the transaction group of 'tx'.
 */
static void
zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
{
	zio_free_blk(zilog->zl_spa, bp, dmu_tx_get_txg(tx));
}
+
+static void
+zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
+{
+ /*
+ * If we previously claimed it, we need to free it.
+ */
+ if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE) {
+ lr_write_t *lr = (lr_write_t *)lrc;
+ blkptr_t *bp = &lr->lr_blkptr;
+ if (bp->blk_birth >= claim_txg &&
+ !zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp))) {
+ (void) arc_free(NULL, zilog->zl_spa,
+ dmu_tx_get_txg(tx), bp, NULL, NULL, ARC_WAIT);
+ }
+ }
+}
+
/*
 * Create an on-disk intent log: allocate the first log block, seed its
 * checksum verifier, and queue the first log write buffer (lwb).
 * Waits for the creating txg to sync before returning so the header
 * update is stable on disk.
 */
static void
zil_create(zilog_t *zilog)
{
	lwb_t *lwb;
	uint64_t txg;
	dmu_tx_t *tx;
	blkptr_t blk;
	int error;

	ASSERT(zilog->zl_header->zh_claim_txg == 0);
	ASSERT(zilog->zl_header->zh_replay_seq == 0);

	/*
	 * Initialize the log header block.
	 */
	tx = dmu_tx_create(zilog->zl_os);
	(void) dmu_tx_assign(tx, TXG_WAIT);
	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
	txg = dmu_tx_get_txg(tx);

	/*
	 * Allocate the first log block and assign its checksum verifier.
	 */
	error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG,
	    ZIL_MIN_BLKSZ, &blk, txg);
	if (error == 0) {
		/* verifier = <random, random, objset id, seq 1> */
		ZIO_SET_CHECKSUM(&blk.blk_cksum,
		    spa_get_random(-1ULL), spa_get_random(-1ULL),
		    dmu_objset_id(zilog->zl_os), 1ULL);

		/*
		 * Allocate a log write buffer (lwb) for the first log block.
		 */
		lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
		lwb->lwb_zilog = zilog;
		lwb->lwb_blk = blk;
		lwb->lwb_nused = 0;
		lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk);
		lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
		lwb->lwb_max_txg = txg;
		lwb->lwb_seq = 0;
		lwb->lwb_state = UNWRITTEN;
		mutex_enter(&zilog->zl_lock);
		list_insert_tail(&zilog->zl_lwb_list, lwb);
		mutex_exit(&zilog->zl_lock);
	}

	dmu_tx_commit(tx);
	txg_wait_synced(zilog->zl_dmu_pool, txg);
}
+
/*
 * In one tx, free all log blocks and clear the log header.
 *
 * The actual header clearing happens in zil_sync() when it sees
 * zl_destroy_txg == txg; here we free the chain via zil_parse() and
 * wait for the txg to sync.  Serialized by zl_destroy_lock.
 */
void
zil_destroy(zilog_t *zilog)
{
	dmu_tx_t *tx;
	uint64_t txg;

	mutex_enter(&zilog->zl_destroy_lock);

	if (BP_IS_HOLE(&zilog->zl_header->zh_log)) {
		/* no on-disk log to destroy */
		mutex_exit(&zilog->zl_destroy_lock);
		return;
	}

	tx = dmu_tx_create(zilog->zl_os);
	(void) dmu_tx_assign(tx, TXG_WAIT);
	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
	txg = dmu_tx_get_txg(tx);

	zil_parse(zilog, zil_free_log_block, zil_free_log_record, tx,
	    zilog->zl_header->zh_claim_txg);
	zilog->zl_destroy_txg = txg;	/* zil_sync() clears the header */

	dmu_tx_commit(tx);
	txg_wait_synced(zilog->zl_dmu_pool, txg);

	mutex_exit(&zilog->zl_destroy_lock);
}
+
/*
 * Claim all blocks of the named dataset's intent log, if not already
 * claimed, so pool space accounting reflects them after import.
 * Called with an open claim transaction ('txarg') at pool open time.
 */
void
zil_claim(char *osname, void *txarg)
{
	dmu_tx_t *tx = txarg;
	uint64_t first_txg = dmu_tx_get_txg(tx);
	zilog_t *zilog;
	zil_header_t *zh;
	objset_t *os;
	int error;

	error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_STANDARD, &os);
	if (error) {
		/* best-effort: warn and skip this dataset */
		cmn_err(CE_WARN, "can't process intent log for %s", osname);
		return;
	}

	zilog = dmu_objset_zil(os);
	zh = zilog->zl_header;

	/*
	 * Claim all log blocks if we haven't already done so.
	 * zh_claim_txg != 0 marks the log as already claimed.
	 */
	ASSERT3U(zh->zh_claim_txg, <=, first_txg);
	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
		zh->zh_claim_txg = first_txg;
		zil_parse(zilog, zil_claim_log_block, zil_claim_log_record,
		    tx, first_txg);
		dsl_dataset_dirty(dmu_objset_ds(os), tx);
	}
	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
	dmu_objset_close(os);
}
+
+void
+zil_add_vdev(zilog_t *zilog, uint64_t vdev, uint64_t seq)
+{
+ zil_vdev_t *zv;
+
+ if (zil_noflush)
+ return;
+
+ ASSERT(MUTEX_HELD(&zilog->zl_lock));
+ zv = kmem_alloc(sizeof (zil_vdev_t), KM_SLEEP);
+ zv->vdev = vdev;
+ zv->seq = seq;
+ list_insert_tail(&zilog->zl_vdev_list, zv);
+}
+
+
/*
 * Issue DKIOCFLUSHWRITECACHE to every vdev that logged a sequence
 * number <= 'seq', de-duplicating per-vdev entries, then wait for all
 * flushes.  Caller holds zl_lock; it is dropped around the ioctl issue
 * and re-taken (the list head is re-read each iteration, so this is
 * safe against concurrent inserts).
 */
void
zil_flush_vdevs(zilog_t *zilog, uint64_t seq)
{
	vdev_t *vd;
	zil_vdev_t *zv, *zv2;
	zio_t *zio;
	spa_t *spa;
	uint64_t vdev;

	if (zil_noflush)
		return;

	ASSERT(MUTEX_HELD(&zilog->zl_lock));

	spa = zilog->zl_spa;
	zio = NULL;

	while ((zv = list_head(&zilog->zl_vdev_list)) != NULL &&
	    zv->seq <= seq) {
		vdev = zv->vdev;
		list_remove(&zilog->zl_vdev_list, zv);
		kmem_free(zv, sizeof (zil_vdev_t));

		/*
		 * remove all chained entries <= seq with same vdev
		 * (one flush per device is enough)
		 */
		zv = list_head(&zilog->zl_vdev_list);
		while (zv && zv->seq <= seq) {
			zv2 = list_next(&zilog->zl_vdev_list, zv);
			if (zv->vdev == vdev) {
				list_remove(&zilog->zl_vdev_list, zv);
				kmem_free(zv, sizeof (zil_vdev_t));
			}
			zv = zv2;
		}

		/* flush the write cache for this vdev */
		mutex_exit(&zilog->zl_lock);
		if (zio == NULL)
			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
		vd = vdev_lookup_top(spa, vdev);
		ASSERT(vd);
		(void) zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
		    NULL, NULL, ZIO_PRIORITY_NOW,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
		mutex_enter(&zilog->zl_lock);
	}

	/*
	 * Wait for all the flushes to complete.  Not all devices actually
	 * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
	 */
	if (zio != NULL)
		(void) zio_wait(zio);
}
+
/*
 * Function called when a log block write completes.
 *
 * Marks this lwb SEQ_COMPLETE and, because completions can arrive out
 * of order, walks forward through already-completed successors to push
 * zl_ss_seq (the highest sequence known stable) as far as possible.
 */
static void
zil_lwb_write_done(zio_t *zio)
{
	lwb_t *prev;
	lwb_t *lwb = zio->io_private;
	zilog_t *zilog = lwb->lwb_zilog;
	uint64_t max_seq;

	/*
	 * Now that we've written this log block, we have a stable pointer
	 * to the next block in the chain, so it's OK to let the txg in
	 * which we allocated the next block sync.
	 */
	txg_rele_to_sync(&lwb->lwb_txgh);

	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
	mutex_enter(&zilog->zl_lock);
	lwb->lwb_buf = NULL;	/* NULL buf marks this lwb fully written */
	if (zio->io_error) {
		zilog->zl_log_error = B_TRUE;
		mutex_exit(&zilog->zl_lock);
		cv_broadcast(&zilog->zl_cv_seq);
		return;
	}

	prev = list_prev(&zilog->zl_lwb_list, lwb);
	if (prev && prev->lwb_state != SEQ_COMPLETE) {
		/* There's an unwritten buffer in the chain before this one */
		lwb->lwb_state = SEQ_INCOMPLETE;
		mutex_exit(&zilog->zl_lock);
		return;
	}

	max_seq = lwb->lwb_seq;
	lwb->lwb_state = SEQ_COMPLETE;
	/*
	 * We must also follow up the chain for already written buffers
	 * to see if we can set zl_ss_seq even higher.
	 */
	while (lwb = list_next(&zilog->zl_lwb_list, lwb)) {
		if (lwb->lwb_state != SEQ_INCOMPLETE)
			break;
		lwb->lwb_state = SEQ_COMPLETE;
		/* lwb_seq will be zero if we've written an empty buffer */
		if (lwb->lwb_seq) {
			ASSERT3U(max_seq, <, lwb->lwb_seq);
			max_seq = lwb->lwb_seq;
		}
	}
	zilog->zl_ss_seq = MAX(max_seq, zilog->zl_ss_seq);
	mutex_exit(&zilog->zl_lock);
	cv_broadcast(&zilog->zl_cv_seq);	/* wake zil_commit() waiters */
}
+
/*
 * Start a log block write and advance to the next log block.
 * Calls are serialized.
 *
 * Returns the newly allocated next lwb, or NULL if the next block
 * could not be allocated (caller falls back to txg_wait_synced()).
 */
static lwb_t *
zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
{
	lwb_t *nlwb;
	zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
	uint64_t txg;
	uint64_t zil_blksz;
	int error;

	ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));

	/*
	 * Allocate the next block and save its address in this block
	 * before writing it in order to establish the log chain.
	 * Note that if the allocation of nlwb synced before we wrote
	 * the block that points at it (lwb), we'd leak it if we crashed.
	 * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done().
	 */
	txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh);
	txg_rele_to_quiesce(&lwb->lwb_txgh);

	/*
	 * Pick a ZIL blocksize based upon the size of the outstanding
	 * in-memory transactions, or if none the same size as the
	 * last block.
	 */
	if (zilog->zl_itx_list_sz) {
		zil_blksz = zilog->zl_itx_list_sz + sizeof (*ztp);
		zil_blksz = P2ROUNDUP(zil_blksz, ZIL_MIN_BLKSZ);
		if (zil_blksz > ZIL_MAX_BLKSZ)
			zil_blksz = ZIL_MAX_BLKSZ;
		zilog->zl_prev_blk_sz = zil_blksz;
	} else {
		zil_blksz = zilog->zl_prev_blk_sz;
	}

	error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG,
	    zil_blksz, &ztp->zit_next_blk, txg);
	if (error) {
		/* allocation failed; release the txg hold and give up */
		txg_rele_to_sync(&lwb->lwb_txgh);
		return (NULL);
	}

	ASSERT3U(ztp->zit_next_blk.blk_birth, ==, txg);
	ztp->zit_nused = lwb->lwb_nused;
	ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
	/* next block's verifier = this block's verifier with seq + 1 */
	ztp->zit_next_blk.blk_cksum = lwb->lwb_blk.blk_cksum;
	ztp->zit_next_blk.blk_cksum.zc_word[3]++;

	/*
	 * Allocate a new log write buffer (lwb).
	 */
	nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);

	nlwb->lwb_zilog = zilog;
	nlwb->lwb_blk = ztp->zit_next_blk;
	nlwb->lwb_nused = 0;
	nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
	nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
	nlwb->lwb_max_txg = txg;
	nlwb->lwb_seq = 0;
	nlwb->lwb_state = UNWRITTEN;

	/*
	 * Put new lwb at the end of the log chain,
	 * and record the vdev for later flushing
	 */
	mutex_enter(&zilog->zl_lock);
	list_insert_tail(&zilog->zl_lwb_list, nlwb);
	zil_add_vdev(zilog, DVA_GET_VDEV(BP_IDENTITY(&(lwb->lwb_blk))),
	    lwb->lwb_seq);
	mutex_exit(&zilog->zl_lock);

	/*
	 * write the old log block
	 */
	dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
	zio_nowait(zio_rewrite(NULL, zilog->zl_spa, ZIO_CHECKSUM_ZILOG, 0,
	    &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz, zil_lwb_write_done, lwb,
	    ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED));

	return (nlwb);
}
+
/*
 * Copy one in-memory intent log transaction (itx) into the current log
 * write buffer, starting a new block if it doesn't fit.  For TX_WRITE
 * records the data (or its block pointer) is fetched via zl_get_data.
 * Returns the lwb to continue with, or NULL if block allocation failed.
 */
static lwb_t *
zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
{
	lr_t *lrc = &itx->itx_lr; /* common log record */
	uint64_t seq = lrc->lrc_seq;
	uint64_t txg = lrc->lrc_txg;
	uint64_t reclen = lrc->lrc_reclen;
	int error;

	if (lwb == NULL)
		return (NULL);
	ASSERT(lwb->lwb_buf != NULL);

	/*
	 * If it's a write, fetch the data or get its blkptr as appropriate.
	 */
	if (lrc->lrc_txtype == TX_WRITE) {
		lr_write_t *lr = (lr_write_t *)lrc;
		if (txg > spa_freeze_txg(zilog->zl_spa))
			txg_wait_synced(zilog->zl_dmu_pool, txg);

		if (!itx->itx_data_copied &&
		    (error = zilog->zl_get_data(itx->itx_private, lr)) != 0) {
			/*
			 * ENOENT/EALREADY mean the record is satisfiable
			 * without data; any other error falls back to
			 * waiting for the txg to sync.  Either way the
			 * record is considered stable at 'seq'.
			 */
			if (error != ENOENT && error != EALREADY) {
				txg_wait_synced(zilog->zl_dmu_pool, txg);
				mutex_enter(&zilog->zl_lock);
				zilog->zl_ss_seq = MAX(seq, zilog->zl_ss_seq);
				zil_add_vdev(zilog,
				    DVA_GET_VDEV(BP_IDENTITY(&(lr->lr_blkptr))),
				    seq);
				mutex_exit(&zilog->zl_lock);
				return (lwb);
			}
			mutex_enter(&zilog->zl_lock);
			zil_add_vdev(zilog,
			    DVA_GET_VDEV(BP_IDENTITY(&(lr->lr_blkptr))), seq);
			mutex_exit(&zilog->zl_lock);
			return (lwb);
		}
	}

	/*
	 * If this record won't fit in the current log block, start a new one.
	 */
	if (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)) {
		lwb = zil_lwb_write_start(zilog, lwb);
		if (lwb == NULL)
			return (NULL);
		if (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)) {
			/* record is bigger than any block; fall back to sync */
			txg_wait_synced(zilog->zl_dmu_pool, txg);
			mutex_enter(&zilog->zl_lock);
			zilog->zl_ss_seq = MAX(seq, zilog->zl_ss_seq);
			mutex_exit(&zilog->zl_lock);
			return (lwb);
		}
	}

	bcopy(lrc, lwb->lwb_buf + lwb->lwb_nused, reclen);
	lwb->lwb_nused += reclen;
	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
	ASSERT3U(lwb->lwb_seq, <, seq);
	lwb->lwb_seq = seq;
	ASSERT3U(lwb->lwb_nused, <=, ZIL_BLK_DATA_SZ(lwb));
	ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0);

	return (lwb);
}
+
+itx_t *
+zil_itx_create(int txtype, size_t lrsize)
+{
+ itx_t *itx;
+
+ lrsize = P2ROUNDUP(lrsize, sizeof (uint64_t));
+
+ itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
+ itx->itx_lr.lrc_txtype = txtype;
+ itx->itx_lr.lrc_reclen = lrsize;
+ itx->itx_lr.lrc_seq = 0; /* defensive */
+
+ return (itx);
+}
+
+uint64_t
+zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
+{
+ uint64_t seq;
+
+ ASSERT(itx->itx_lr.lrc_seq == 0);
+
+ mutex_enter(&zilog->zl_lock);
+ list_insert_tail(&zilog->zl_itx_list, itx);
+ zilog->zl_itx_list_sz += itx->itx_lr.lrc_reclen;
+ itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
+ itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq;
+ mutex_exit(&zilog->zl_lock);
+
+ return (seq);
+}
+
/*
 * Free up all in-memory intent log transactions that have now been synced.
 * Also advances zl_ss_seq past the freed records and wakes waiters,
 * since a synced record is by definition stable.
 */
static void
zil_itx_clean(zilog_t *zilog)
{
	uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa);
	uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa);
	uint64_t max_seq = 0;
	itx_t *itx;

	mutex_enter(&zilog->zl_lock);
	/* list is in txg order, so stop at the first unsynced itx */
	while ((itx = list_head(&zilog->zl_itx_list)) != NULL &&
	    itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) {
		list_remove(&zilog->zl_itx_list, itx);
		zilog->zl_itx_list_sz -= itx->itx_lr.lrc_reclen;
		ASSERT3U(max_seq, <, itx->itx_lr.lrc_seq);
		max_seq = itx->itx_lr.lrc_seq;
		kmem_free(itx, offsetof(itx_t, itx_lr)
		    + itx->itx_lr.lrc_reclen);
	}
	if (max_seq > zilog->zl_ss_seq) {
		zilog->zl_ss_seq = max_seq;
		cv_broadcast(&zilog->zl_cv_seq);
	}
	mutex_exit(&zilog->zl_lock);
}
+
/*
 * Dispatch zil_itx_clean() to the per-zilog taskq if there are any
 * queued itxs that might now be freeable.
 */
void
zil_clean(zilog_t *zilog)
{
	/*
	 * Check for any log blocks that can be freed.
	 * Log blocks are only freed when the log block allocation and
	 * log records contained within are both known to be committed.
	 */
	mutex_enter(&zilog->zl_lock);
	if (list_head(&zilog->zl_itx_list) != NULL)
		(void) taskq_dispatch(zilog->zl_clean_taskq,
		    (void (*)(void *))zil_itx_clean, zilog, TQ_NOSLEEP);
	mutex_exit(&zilog->zl_lock);
}
+
/*
 * Push zfs transactions to stable storage up to the supplied sequence number.
 *
 * Only one thread writes at a time (zl_writer election below); others
 * wait on zl_cv_write.  On log-write error or with no lwb (suspended
 * log), falls back to txg_wait_synced() to honor the sync semantics.
 */
void
zil_commit(zilog_t *zilog, uint64_t seq, int ioflag)
{
	uint64_t txg;
	uint64_t max_seq;
	uint64_t reclen;
	itx_t *itx;
	lwb_t *lwb;
	spa_t *spa;

	/* nothing to do unless a sync-semantics flag (or zil_always) is set */
	if (zilog == NULL || seq == 0 ||
	    ((ioflag & (FSYNC | FDSYNC | FRSYNC)) == 0 && !zil_always))
		return;

	spa = zilog->zl_spa;
	mutex_enter(&zilog->zl_lock);

	seq = MIN(seq, zilog->zl_itx_seq);	/* cap seq at largest itx seq */

	/*
	 * Writer election: return early if already stable, otherwise
	 * either become the writer or wait for the current one.
	 */
	for (;;) {
		if (zilog->zl_ss_seq >= seq) {	/* already on stable storage */
			cv_signal(&zilog->zl_cv_write);
			mutex_exit(&zilog->zl_lock);
			return;
		}

		if (zilog->zl_writer == B_FALSE)	/* no one writing, do it */
			break;

		cv_wait(&zilog->zl_cv_write, &zilog->zl_lock);
	}

	zilog->zl_writer = B_TRUE;
	max_seq = 0;

	if (zilog->zl_suspend) {
		lwb = NULL;	/* suspended: force the sync fallback below */
	} else {
		lwb = list_tail(&zilog->zl_lwb_list);
		if (lwb == NULL) {
			/* first use: create the on-disk log */
			mutex_exit(&zilog->zl_lock);
			zil_create(zilog);
			mutex_enter(&zilog->zl_lock);
			lwb = list_tail(&zilog->zl_lwb_list);
		}
	}

	/*
	 * Loop through in-memory log transactions filling log blocks,
	 * until we reach the given sequence number and there's no more
	 * room in the write buffer.
	 */
	for (;;) {
		itx = list_head(&zilog->zl_itx_list);
		if (itx == NULL)
			break;

		reclen = itx->itx_lr.lrc_reclen;
		if ((itx->itx_lr.lrc_seq > seq) &&
		    ((lwb == NULL) || (lwb->lwb_nused + reclen >
		    ZIL_BLK_DATA_SZ(lwb))))
			break;

		list_remove(&zilog->zl_itx_list, itx);
		txg = itx->itx_lr.lrc_txg;
		ASSERT(txg);

		mutex_exit(&zilog->zl_lock);
		if (txg > spa_last_synced_txg(spa) ||
		    txg > spa_freeze_txg(spa))
			lwb = zil_lwb_commit(zilog, itx, lwb);
		else
			max_seq = itx->itx_lr.lrc_seq;	/* already synced */
		kmem_free(itx, offsetof(itx_t, itx_lr)
		    + itx->itx_lr.lrc_reclen);
		mutex_enter(&zilog->zl_lock);
		zilog->zl_itx_list_sz -= reclen;
	}

	mutex_exit(&zilog->zl_lock);

	/* write the last block out */
	if (lwb != NULL && lwb->lwb_nused != 0)
		lwb = zil_lwb_write_start(zilog, lwb);

	/* wake up others waiting to start a write */
	mutex_enter(&zilog->zl_lock);
	zilog->zl_writer = B_FALSE;
	cv_signal(&zilog->zl_cv_write);

	if (max_seq > zilog->zl_ss_seq) {
		zilog->zl_ss_seq = max_seq;
		cv_broadcast(&zilog->zl_cv_seq);
	}
	/*
	 * Wait if necessary for our seq to be committed.
	 */
	if (lwb) {
		while (zilog->zl_ss_seq < seq && zilog->zl_log_error == 0)
			cv_wait(&zilog->zl_cv_seq, &zilog->zl_lock);
		zil_flush_vdevs(zilog, seq);
	}
	if (zilog->zl_log_error || lwb == NULL) {
		/*
		 * Log write failed or log is suspended: fall back to
		 * txg_wait_synced(), after which everything is stable.
		 */
		zilog->zl_log_error = 0;
		max_seq = zilog->zl_itx_seq;
		mutex_exit(&zilog->zl_lock);
		txg_wait_synced(zilog->zl_dmu_pool, 0);
		mutex_enter(&zilog->zl_lock);
		zilog->zl_ss_seq = MAX(max_seq, zilog->zl_ss_seq);
		cv_broadcast(&zilog->zl_cv_seq);
	}
	mutex_exit(&zilog->zl_lock);
}
+
/*
 * Called in syncing context to free committed log blocks and update log header.
 * Also performs the deferred header wipe requested by zil_destroy().
 */
void
zil_sync(zilog_t *zilog, dmu_tx_t *tx)
{
	uint64_t txg = dmu_tx_get_txg(tx);
	spa_t *spa = zilog->zl_spa;
	lwb_t *lwb;

	ASSERT(zilog->zl_stop_sync == 0);

	zilog->zl_header->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK];

	if (zilog->zl_destroy_txg == txg) {
		/* zil_destroy() freed the chain in this txg; clear header */
		bzero(zilog->zl_header, sizeof (zil_header_t));
		bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq));
		zilog->zl_destroy_txg = 0;
	}

	mutex_enter(&zilog->zl_lock);
	for (;;) {
		lwb = list_head(&zilog->zl_lwb_list);
		if (lwb == NULL) {
			mutex_exit(&zilog->zl_lock);
			return;
		}
		/* stop at the first lwb still in flight or still needed */
		if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
			break;
		list_remove(&zilog->zl_lwb_list, lwb);
		zio_free_blk(spa, &lwb->lwb_blk, txg);
		kmem_cache_free(zil_lwb_cache, lwb);
	}
	/* the oldest surviving lwb becomes the new head of the chain */
	zilog->zl_header->zh_log = lwb->lwb_blk;
	mutex_exit(&zilog->zl_lock);
}
+
/*
 * One-time module initialization: create the lwb kmem cache.
 */
void
zil_init(void)
{
	zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
	    sizeof (struct lwb), NULL, NULL, NULL, NULL, NULL, NULL, 0);
}
+
/*
 * One-time module teardown: destroy the lwb kmem cache.
 */
void
zil_fini(void)
{
	kmem_cache_destroy(zil_lwb_cache);
}
+
+zilog_t *
+zil_alloc(objset_t *os, zil_header_t *zh_phys)
+{
+ zilog_t *zilog;
+
+ zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);
+
+ zilog->zl_header = zh_phys;
+ zilog->zl_os = os;
+ zilog->zl_spa = dmu_objset_spa(os);
+ zilog->zl_dmu_pool = dmu_objset_pool(os);
+ zilog->zl_prev_blk_sz = ZIL_MIN_BLKSZ;
+
+ list_create(&zilog->zl_itx_list, sizeof (itx_t),
+ offsetof(itx_t, itx_node));
+
+ list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
+ offsetof(lwb_t, lwb_node));
+
+ list_create(&zilog->zl_vdev_list, sizeof (zil_vdev_t),
+ offsetof(zil_vdev_t, vdev_seq_node));
+
+ return (zilog);
+}
+
/*
 * Tear down an in-core zilog allocated by zil_alloc(): release any
 * remaining log write buffers and vdev-flush records, destroy the
 * lists, and free the zilog itself.  The itx list must already be
 * empty (see the ASSERT below).
 */
void
zil_free(zilog_t *zilog)
{
	lwb_t *lwb;
	zil_vdev_t *zv;

	/* keep zil_sync() from touching state while we tear down */
	zilog->zl_stop_sync = 1;

	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
		list_remove(&zilog->zl_lwb_list, lwb);
		if (lwb->lwb_buf != NULL)
			zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
		kmem_cache_free(zil_lwb_cache, lwb);
	}
	list_destroy(&zilog->zl_lwb_list);

	while ((zv = list_head(&zilog->zl_vdev_list)) != NULL) {
		list_remove(&zilog->zl_vdev_list, zv);
		kmem_free(zv, sizeof (zil_vdev_t));
	}
	list_destroy(&zilog->zl_vdev_list);

	ASSERT(list_head(&zilog->zl_itx_list) == NULL);
	list_destroy(&zilog->zl_itx_list);

	kmem_free(zilog, sizeof (zilog_t));
}
+
/*
 * Open an intent log.
 *
 * Installs the caller's data-fetch callback and creates the
 * single-threaded cleanup taskq used by zil_clean().
 */
zilog_t *
zil_open(objset_t *os, zil_get_data_t *get_data)
{
	zilog_t *zilog = dmu_objset_zil(os);

	zilog->zl_get_data = get_data;
	zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri,
	    2, 2, TASKQ_PREPOPULATE);

	return (zilog);
}
+
/*
 * Close an intent log.
 *
 * Waits for everything to sync, destroys the cleanup taskq, and frees
 * the (now fully synced) in-memory itx list.
 */
void
zil_close(zilog_t *zilog)
{
	txg_wait_synced(zilog->zl_dmu_pool, 0);
	taskq_destroy(zilog->zl_clean_taskq);
	zilog->zl_clean_taskq = NULL;
	zilog->zl_get_data = NULL;

	/* everything synced above, so this frees the whole list */
	zil_itx_clean(zilog);
	ASSERT(list_head(&zilog->zl_itx_list) == NULL);
}
+
/*
 * Suspend an intent log.  While in suspended mode, we still honor
 * synchronous semantics, but we rely on txg_wait_synced() to do it.
 * We suspend the log briefly when taking a snapshot so that the snapshot
 * contains all the data it's supposed to, and has an empty intent log.
 *
 * Returns 0 on success, or EBUSY if the log still has unreplayed
 * claimed records.  Pair with zil_resume().
 */
int
zil_suspend(zilog_t *zilog)
{
	lwb_t *lwb;

	mutex_enter(&zilog->zl_lock);
	if (zilog->zl_header->zh_claim_txg != 0) {	/* unplayed log */
		mutex_exit(&zilog->zl_lock);
		return (EBUSY);
	}
	zilog->zl_suspend++;
	mutex_exit(&zilog->zl_lock);

	/* push everything queued so far out to the log chain */
	zil_commit(zilog, UINT64_MAX, FSYNC);

	mutex_enter(&zilog->zl_lock);
	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
		if (lwb->lwb_buf != NULL) {
			/*
			 * Wait for the buffer if it's in the process of
			 * being written.
			 */
			if ((lwb->lwb_seq != 0) &&
			    (lwb->lwb_state != SEQ_COMPLETE)) {
				cv_wait(&zilog->zl_cv_seq, &zilog->zl_lock);
				continue;	/* re-check from list head */
			}
			zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
		}
		list_remove(&zilog->zl_lwb_list, lwb);
		kmem_cache_free(zil_lwb_cache, lwb);
	}
	mutex_exit(&zilog->zl_lock);

	/* free the on-disk chain and clear the header */
	zil_destroy(zilog);

	return (0);
}
+
/*
 * Undo one zil_suspend(): drop the suspend count so new log writes
 * may proceed again.
 */
void
zil_resume(zilog_t *zilog)
{
	mutex_enter(&zilog->zl_lock);
	ASSERT(zilog->zl_suspend != 0);
	zilog->zl_suspend--;
	mutex_exit(&zilog->zl_lock);
}
+
/*
 * Argument bundle threaded through zil_parse() during log replay.
 */
typedef struct zil_replay_arg {
	objset_t *zr_os;		/* objset being replayed into */
	zil_replay_func_t **zr_replay;	/* per-txtype replay vectors */
	void *zr_arg;			/* opaque arg for replay vectors */
	void (*zr_rm_sync)(void *arg);	/* wait-for-deletes callback */
	uint64_t *zr_txgp;		/* txg the vectors must assign into */
	boolean_t zr_byteswap;		/* log written with other endianness */
	char *zr_lrbuf;			/* scratch: record copy + write data */
} zil_replay_arg_t;
+
/*
 * Replay a single log record via the registered replay vector, then
 * record the replayed sequence number so the header can be updated in
 * the same txg.  Skips records already committed or already replayed.
 * On unrecoverable error, logs a warning and stops further replay.
 */
static void
zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
{
	zil_replay_arg_t *zr = zra;
	zil_header_t *zh = zilog->zl_header;
	uint64_t reclen = lr->lrc_reclen;
	uint64_t txtype = lr->lrc_txtype;
	int pass, error;

	if (zilog->zl_stop_replay)
		return;

	if (lr->lrc_txg < claim_txg)		/* already committed */
		return;

	if (lr->lrc_seq <= zh->zh_replay_seq)	/* already replayed */
		return;

	/*
	 * Make a copy of the data so we can revise and extend it.
	 */
	bcopy(lr, zr->zr_lrbuf, reclen);

	/*
	 * The log block containing this lr may have been byteswapped
	 * so that we can easily examine common fields like lrc_txtype.
	 * However, the log is a mix of different data types, and only the
	 * replay vectors know how to byteswap their records.  Therefore, if
	 * the lr was byteswapped, undo it before invoking the replay vector.
	 */
	if (zr->zr_byteswap)
		byteswap_uint64_array(zr->zr_lrbuf, reclen);

	/*
	 * If this is a TX_WRITE with a blkptr, suck in the data.
	 */
	if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
		lr_write_t *lrw = (lr_write_t *)lr;
		blkptr_t *wbp = &lrw->lr_blkptr;
		uint64_t wlen = lrw->lr_length;
		char *wbuf = zr->zr_lrbuf + reclen;

		if (BP_IS_HOLE(wbp)) {	/* compressed to a hole */
			bzero(wbuf, wlen);
		} else {
			/*
			 * A subsequent write may have overwritten this block,
			 * in which case wbp may have been been freed and
			 * reallocated, and our read of wbp may fail with a
			 * checksum error.  We can safely ignore this because
			 * the later write will provide the correct data.
			 */
			(void) zio_wait(zio_read(NULL, zilog->zl_spa,
			    wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL,
			    ZIO_PRIORITY_SYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE));
			(void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen);
		}
	}

	/*
	 * We must now do two things atomically: replay this log record,
	 * and update the log header to reflect the fact that we did so.
	 * We use the DMU's ability to assign into a specific txg to do this.
	 */
	for (pass = 1; /* CONSTANTCONDITION */; pass++) {
		uint64_t replay_txg;
		dmu_tx_t *replay_tx;

		replay_tx = dmu_tx_create(zr->zr_os);
		error = dmu_tx_assign(replay_tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(replay_tx);
			break;
		}

		replay_txg = dmu_tx_get_txg(replay_tx);

		if (txtype == 0 || txtype >= TX_MAX_TYPE) {
			error = EINVAL;
		} else {
			/*
			 * On the first pass, arrange for the replay vector
			 * to fail its dmu_tx_assign().  That's the only way
			 * to ensure that those code paths remain well tested.
			 */
			*zr->zr_txgp = replay_txg - (pass == 1);
			error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
			    zr->zr_byteswap);
			*zr->zr_txgp = TXG_NOWAIT;
		}

		if (error == 0) {
			dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx);
			/* zil_sync() copies this into the on-disk header */
			zilog->zl_replay_seq[replay_txg & TXG_MASK] =
			    lr->lrc_seq;
		}

		dmu_tx_commit(replay_tx);

		if (error != ERESTART)
			break;

		if (pass != 1)
			txg_wait_open(spa_get_dsl(zilog->zl_spa),
			    replay_txg + 1);

		dprintf("pass %d, retrying\n", pass);
	}

	if (error) {
		char *name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
		dmu_objset_name(zr->zr_os, name);
		cmn_err(CE_WARN, "ZFS replay transaction error %d, "
		    "dataset %s, seq 0x%llx, txtype %llu\n",
		    error, name,
		    (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype);
		zilog->zl_stop_replay = 1;
		kmem_free(name, MAXNAMELEN);
	}

	/*
	 * The DMU's dnode layer doesn't see removes until the txg commits,
	 * so a subsequent claim can spuriously fail with EEXIST.
	 * To prevent this, if we might have removed an object,
	 * wait for the delete thread to delete it, and then
	 * wait for the transaction group to sync.
	 */
	if (txtype == TX_REMOVE || txtype == TX_RMDIR || txtype == TX_RENAME) {
		if (zr->zr_rm_sync != NULL)
			zr->zr_rm_sync(zr->zr_arg);
		txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
	}
}
+
/*
 * If this dataset has an intent log, replay it and destroy it.
 *
 * 'replay_func' is the per-txtype vector table; 'rm_sync' (optional)
 * waits for pending deletes so claims don't spuriously collide.
 */
void
zil_replay(objset_t *os, void *arg, uint64_t *txgp,
    zil_replay_func_t *replay_func[TX_MAX_TYPE], void (*rm_sync)(void *arg))
{
	zilog_t *zilog = dmu_objset_zil(os);
	zil_replay_arg_t zr;

	zr.zr_os = os;
	zr.zr_replay = replay_func;
	zr.zr_arg = arg;
	zr.zr_rm_sync = rm_sync;
	zr.zr_txgp = txgp;
	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zilog->zl_header->zh_log);
	/* 2x max block size: record copy plus pulled-in write data */
	zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);

	/*
	 * Wait for in-progress removes to sync before starting replay.
	 */
	if (rm_sync != NULL)
		rm_sync(arg);
	txg_wait_synced(zilog->zl_dmu_pool, 0);

	zilog->zl_stop_replay = 0;
	zil_parse(zilog, NULL, zil_replay_log_record, &zr,
	    zilog->zl_header->zh_claim_txg);
	kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE);

	/* replayed records are durable now; discard the log */
	zil_destroy(zilog);
}
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
new file mode 100644
index 0000000000..7323292859
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -0,0 +1,1698 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio_impl.h>
+#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
+
+static void zio_vdev_io_enter(zio_t *zio);
+static void zio_vdev_io_exit(zio_t *zio);
+
+/*
+ * ==========================================================================
+ * I/O priority table
+ * ==========================================================================
+ */
+uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
+ 0, /* ZIO_PRIORITY_NOW */
+ 0, /* ZIO_PRIORITY_SYNC_READ */
+ 0, /* ZIO_PRIORITY_SYNC_WRITE */
+ 6, /* ZIO_PRIORITY_ASYNC_READ */
+ 4, /* ZIO_PRIORITY_ASYNC_WRITE */
+ 4, /* ZIO_PRIORITY_FREE */
+ 0, /* ZIO_PRIORITY_CACHE_FILL */
+ 0, /* ZIO_PRIORITY_LOG_WRITE */
+ 10, /* ZIO_PRIORITY_RESILVER */
+ 20, /* ZIO_PRIORITY_SCRUB */
+};
+
+/*
+ * ==========================================================================
+ * I/O type descriptions
+ * ==========================================================================
+ */
+char *zio_type_name[ZIO_TYPES] = {
+ "null", "read", "write", "free", "claim", "ioctl" };
+
+/* At or above this size, force gang blocking - for testing */
+uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1;
+
+typedef struct zio_sync_pass {
+ int zp_defer_free; /* defer frees after this pass */
+ int zp_dontcompress; /* don't compress after this pass */
+ int zp_rewrite; /* rewrite new bps after this pass */
+} zio_sync_pass_t;
+
+zio_sync_pass_t zio_sync_pass = {
+ 1, /* zp_defer_free */
+ 4, /* zp_dontcompress */
+ 1, /* zp_rewrite */
+};
+
+/*
+ * ==========================================================================
+ * I/O kmem caches
+ * ==========================================================================
+ */
+kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+
+void
+zio_init(void)
+{
+ size_t c;
+
+ /*
+ * For small buffers, we want a cache for each multiple of
+ * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache
+ * for each quarter-power of 2. For large buffers, we want
+ * a cache for each multiple of PAGESIZE.
+ */
+ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
+ size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
+ size_t p2 = size;
+ size_t align = 0;
+
+ while (p2 & (p2 - 1))
+ p2 &= p2 - 1;
+
+ if (size <= 4 * SPA_MINBLOCKSIZE) {
+ align = SPA_MINBLOCKSIZE;
+ } else if (P2PHASE(size, PAGESIZE) == 0) {
+ align = PAGESIZE;
+ } else if (P2PHASE(size, p2 >> 2) == 0) {
+ align = p2 >> 2;
+ }
+
+ if (align != 0) {
+ char name[30];
+ (void) sprintf(name, "zio_buf_%lu", size);
+ zio_buf_cache[c] = kmem_cache_create(name, size,
+ align, NULL, NULL, NULL, NULL, NULL, 0);
+ dprintf("creating cache for size %5lx align %5lx\n",
+ size, align);
+ }
+ }
+
+ while (--c != 0) {
+ ASSERT(zio_buf_cache[c] != NULL);
+ if (zio_buf_cache[c - 1] == NULL)
+ zio_buf_cache[c - 1] = zio_buf_cache[c];
+ }
+}
+
+void
+zio_fini(void)
+{
+ size_t c;
+ kmem_cache_t *last_cache = NULL;
+
+ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
+ if (zio_buf_cache[c] != last_cache) {
+ last_cache = zio_buf_cache[c];
+ kmem_cache_destroy(zio_buf_cache[c]);
+ }
+ zio_buf_cache[c] = NULL;
+ }
+}
+
+/*
+ * ==========================================================================
+ * Allocate and free I/O buffers
+ * ==========================================================================
+ */
+void *
+zio_buf_alloc(size_t size)
+{
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+ return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP));
+}
+
+void
+zio_buf_free(void *buf, size_t size)
+{
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+ kmem_cache_free(zio_buf_cache[c], buf);
+}
+
+/*
+ * ==========================================================================
+ * Push and pop I/O transform buffers
+ * ==========================================================================
+ */
+static void
+zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize)
+{
+ zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
+
+ zt->zt_data = data;
+ zt->zt_size = size;
+ zt->zt_bufsize = bufsize;
+
+ zt->zt_next = zio->io_transform_stack;
+ zio->io_transform_stack = zt;
+
+ zio->io_data = data;
+ zio->io_size = size;
+}
+
+static void
+zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize)
+{
+ zio_transform_t *zt = zio->io_transform_stack;
+
+ *data = zt->zt_data;
+ *size = zt->zt_size;
+ *bufsize = zt->zt_bufsize;
+
+ zio->io_transform_stack = zt->zt_next;
+ kmem_free(zt, sizeof (zio_transform_t));
+
+ if ((zt = zio->io_transform_stack) != NULL) {
+ zio->io_data = zt->zt_data;
+ zio->io_size = zt->zt_size;
+ }
+}
+
+static void
+zio_clear_transform_stack(zio_t *zio)
+{
+ void *data;
+ uint64_t size, bufsize;
+
+ ASSERT(zio->io_transform_stack != NULL);
+
+ zio_pop_transform(zio, &data, &size, &bufsize);
+ while (zio->io_transform_stack != NULL) {
+ zio_buf_free(data, bufsize);
+ zio_pop_transform(zio, &data, &size, &bufsize);
+ }
+}
+
+/*
+ * ==========================================================================
+ * Create the various types of I/O (read, write, free)
+ * ==========================================================================
+ */
+static zio_t *
+zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ void *data, uint64_t size, zio_done_func_t *done, void *private,
+ zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline)
+{
+ zio_t *zio;
+
+ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
+
+ zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
+ zio->io_parent = pio;
+ zio->io_spa = spa;
+ zio->io_txg = txg;
+ if (bp != NULL) {
+ zio->io_bp = bp;
+ zio->io_bp_copy = *bp;
+ zio->io_bp_orig = *bp;
+ /* XXBP - Need to inherit this when it matters */
+ zio->io_dva_index = 0;
+ }
+ zio->io_done = done;
+ zio->io_private = private;
+ zio->io_type = type;
+ zio->io_priority = priority;
+ zio->io_stage = stage;
+ zio->io_pipeline = pipeline;
+ zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES;
+ zio->io_timestamp = lbolt64;
+ zio->io_flags = flags;
+ zio_push_transform(zio, data, size, size);
+
+ if (pio == NULL) {
+ if (!(flags & ZIO_FLAG_CONFIG_HELD))
+ spa_config_enter(zio->io_spa, RW_READER);
+ zio->io_root = zio;
+ } else {
+ zio->io_root = pio->io_root;
+
+ mutex_enter(&pio->io_lock);
+ if (stage < ZIO_STAGE_READY)
+ pio->io_children_notready++;
+ pio->io_children_notdone++;
+ zio->io_sibling_next = pio->io_child;
+ zio->io_sibling_prev = NULL;
+ if (pio->io_child != NULL)
+ pio->io_child->io_sibling_prev = zio;
+ pio->io_child = zio;
+ mutex_exit(&pio->io_lock);
+ }
+
+ return (zio);
+}
+
+zio_t *
+zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
+ int flags)
+{
+ zio_t *zio;
+
+ zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
+ ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN,
+ ZIO_WAIT_FOR_CHILDREN_PIPELINE);
+
+ return (zio);
+}
+
+zio_t *
+zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
+{
+ return (zio_null(NULL, spa, done, private, flags));
+}
+
+zio_t *
+zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
+ uint64_t size, zio_done_func_t *done, void *private,
+ int priority, int flags)
+{
+ zio_t *zio;
+ dva_t *dva;
+
+ ASSERT3U(size, ==, BP_GET_LSIZE(bp));
+
+ zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private,
+ ZIO_TYPE_READ, priority, flags, ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
+
+ /*
+ * Work off our copy of the bp so the caller can free it.
+ */
+ zio->io_bp = &zio->io_bp_copy;
+
+ bp = zio->io_bp;
+ dva = ZIO_GET_DVA(zio);
+
+ if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
+ uint64_t csize = BP_GET_PSIZE(bp);
+ void *cbuf = zio_buf_alloc(csize);
+
+ zio_push_transform(zio, cbuf, csize, csize);
+ zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
+ }
+
+ if (DVA_GET_GANG(dva)) {
+ uint64_t gsize = SPA_GANGBLOCKSIZE;
+ void *gbuf = zio_buf_alloc(gsize);
+
+ zio_push_transform(zio, gbuf, gsize, gsize);
+ zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
+ }
+
+ return (zio);
+}
+
+zio_t *
+zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+ uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+ zio_done_func_t *done, void *private, int priority, int flags)
+{
+ zio_t *zio;
+
+ ASSERT(checksum >= ZIO_CHECKSUM_OFF &&
+ checksum < ZIO_CHECKSUM_FUNCTIONS);
+
+ ASSERT(compress >= ZIO_COMPRESS_OFF &&
+ compress < ZIO_COMPRESS_FUNCTIONS);
+
+ zio = zio_create(pio, spa, txg, bp, data, size, done, private,
+ ZIO_TYPE_WRITE, priority, flags,
+ ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
+
+ zio->io_checksum = checksum;
+ zio->io_compress = compress;
+
+ if (compress != ZIO_COMPRESS_OFF)
+ zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS;
+
+ if (bp->blk_birth != txg) {
+ /* XXX the bp usually (always?) gets re-zeroed later */
+ BP_ZERO(bp);
+ BP_SET_LSIZE(bp, size);
+ BP_SET_PSIZE(bp, size);
+ }
+
+ return (zio);
+}
+
+zio_t *
+zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
+ uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+ zio_done_func_t *done, void *private, int priority, int flags)
+{
+ zio_t *zio;
+
+ /* XXBP - We need to re-evaluate when to insert pipeline stages */
+ zio = zio_create(pio, spa, txg, bp, data, size, done, private,
+ ZIO_TYPE_WRITE, priority, flags,
+ ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
+
+ zio->io_checksum = checksum;
+ zio->io_compress = ZIO_COMPRESS_OFF;
+
+ return (zio);
+}
+
+static zio_t *
+zio_write_allocate(zio_t *pio, spa_t *spa, int checksum,
+ uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+ zio_done_func_t *done, void *private, int priority, int flags)
+{
+ zio_t *zio;
+
+ BP_ZERO(bp);
+ BP_SET_LSIZE(bp, size);
+ BP_SET_PSIZE(bp, size);
+ BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+
+ zio = zio_create(pio, spa, txg, bp, data, size, done, private,
+ ZIO_TYPE_WRITE, priority, flags,
+ ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE);
+
+ zio->io_checksum = checksum;
+ zio->io_compress = ZIO_COMPRESS_OFF;
+
+ return (zio);
+}
+
+zio_t *
+zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ zio_done_func_t *done, void *private)
+{
+ zio_t *zio;
+
+ ASSERT(!BP_IS_HOLE(bp));
+
+ if (txg == spa->spa_syncing_txg &&
+ spa->spa_sync_pass > zio_sync_pass.zp_defer_free) {
+ bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
+ return (zio_null(pio, spa, NULL, NULL, 0));
+ }
+
+ /* XXBP - We need to re-evaluate when to insert pipeline stages */
+ zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
+ ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, 0,
+ ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
+
+ zio->io_bp = &zio->io_bp_copy;
+
+ return (zio);
+}
+
+zio_t *
+zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ zio_done_func_t *done, void *private)
+{
+ zio_t *zio;
+
+ /*
+ * A claim is an allocation of a specific block. Claims are needed
+ * to support immediate writes in the intent log. The issue is that
+ * immediate writes contain committed data, but in a txg that was
+ * *not* committed. Upon opening the pool after an unclean shutdown,
+ * the intent log claims all blocks that contain immediate write data
+ * so that the SPA knows they're in use.
+ *
+ * All claims *must* be resolved in the first txg -- before the SPA
+ * starts allocating blocks -- so that nothing is allocated twice.
+ */
+ ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
+ ASSERT3U(spa_first_txg(spa), <=, txg);
+
+ /* XXBP - We need to re-evaluate when to insert pipeline stages */
+ zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
+ ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
+ ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
+
+ zio->io_bp = &zio->io_bp_copy;
+
+ return (zio);
+}
+
+zio_t *
+zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
+ zio_done_func_t *done, void *private, int priority, int flags)
+{
+ zio_t *zio;
+ int c;
+
+ if (vd->vdev_children == 0) {
+ zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
+ ZIO_TYPE_IOCTL, priority, flags,
+ ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
+
+ zio->io_vd = vd;
+ zio->io_cmd = cmd;
+ } else {
+ zio = zio_null(pio, spa, NULL, NULL, flags);
+
+ for (c = 0; c < vd->vdev_children; c++)
+ zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
+ done, private, priority, flags));
+ }
+
+ return (zio);
+}
+
+static void
+zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
+ int checksum)
+{
+ ASSERT(vd->vdev_children == 0);
+
+ ASSERT(size <= SPA_MAXBLOCKSIZE);
+ ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
+ ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
+
+ ASSERT(offset + size <= VDEV_LABEL_START_SIZE ||
+ offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
+ ASSERT3U(offset + size, <=, vd->vdev_psize);
+
+ BP_ZERO(bp);
+
+ BP_SET_LSIZE(bp, size);
+ BP_SET_PSIZE(bp, size);
+
+ BP_SET_CHECKSUM(bp, checksum);
+ BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+
+ if (checksum != ZIO_CHECKSUM_OFF)
+ ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0);
+}
+
+zio_t *
+zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+ void *data, int checksum, zio_done_func_t *done, void *private,
+ int priority, int flags)
+{
+ zio_t *zio;
+ blkptr_t blk;
+
+ zio_phys_bp_init(vd, &blk, offset, size, checksum);
+
+ zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
+ ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL,
+ ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
+
+ zio->io_vd = vd;
+ zio->io_offset = offset;
+
+ /*
+ * Work off our copy of the bp so the caller can free it.
+ */
+ zio->io_bp = &zio->io_bp_copy;
+
+ return (zio);
+}
+
+zio_t *
+zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+ void *data, int checksum, zio_done_func_t *done, void *private,
+ int priority, int flags)
+{
+ zio_block_tail_t *zbt;
+ void *wbuf;
+ zio_t *zio;
+ blkptr_t blk;
+
+ zio_phys_bp_init(vd, &blk, offset, size, checksum);
+
+ zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
+ ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL,
+ ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
+
+ zio->io_vd = vd;
+ zio->io_offset = offset;
+
+ zio->io_bp = &zio->io_bp_copy;
+ zio->io_checksum = checksum;
+
+ if (zio_checksum_table[checksum].ci_zbt) {
+ /*
+ * zbt checksums are necessarily destructive -- they modify
+ * one word of the write buffer to hold the verifier/checksum.
+ * Therefore, we must make a local copy in case the data is
+ * being written to multiple places.
+ */
+ wbuf = zio_buf_alloc(size);
+ bcopy(data, wbuf, size);
+ zio_push_transform(zio, wbuf, size, size);
+
+ zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1;
+ zbt->zbt_cksum = blk.blk_cksum;
+ }
+
+ return (zio);
+}
+
+/*
+ * Create a child I/O to do some work for us. It has no associated bp.
+ */
+zio_t *
+zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
+ void *data, uint64_t size, int type, int priority, int flags,
+ zio_done_func_t *done, void *private)
+{
+ uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
+ zio_t *cio;
+
+ if (type == ZIO_TYPE_READ && bp != NULL) {
+ /*
+ * If we have the bp, then the child should perform the
+ * checksum and the parent need not. This pushes error
+ * detection as close to the leaves as possible and
+ * eliminates redundant checksums in the interior nodes.
+ */
+ pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
+ zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
+ }
+
+ cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
+ done, private, type, priority,
+ (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags,
+ ZIO_STAGE_VDEV_IO_SETUP - 1, pipeline);
+
+ cio->io_vd = vd;
+ cio->io_offset = offset;
+
+ return (cio);
+}
+
+/*
+ * ==========================================================================
+ * Initiate I/O, either sync or async
+ * ==========================================================================
+ */
+int
+zio_wait(zio_t *zio)
+{
+ int error;
+
+ ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
+
+ zio->io_waiter = curthread;
+
+ zio_next_stage_async(zio);
+
+ mutex_enter(&zio->io_lock);
+ while (zio->io_stalled != ZIO_STAGE_DONE)
+ cv_wait(&zio->io_cv, &zio->io_lock);
+ mutex_exit(&zio->io_lock);
+
+ error = zio->io_error;
+
+ kmem_free(zio, sizeof (zio_t));
+
+ return (error);
+}
+
+void
+zio_nowait(zio_t *zio)
+{
+ zio_next_stage_async(zio);
+}
+
+/*
+ * ==========================================================================
+ * I/O pipeline interlocks: parent/child dependency scoreboarding
+ * ==========================================================================
+ */
+static void
+zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
+{
+ mutex_enter(&zio->io_lock);
+ if (*countp == 0) {
+ ASSERT(zio->io_stalled == 0);
+ mutex_exit(&zio->io_lock);
+ zio_next_stage(zio);
+ } else {
+ if (zio->io_stage == ZIO_STAGE_VDEV_IO_START)
+ zio_vdev_io_exit(zio);
+ zio->io_stalled = stage;
+ mutex_exit(&zio->io_lock);
+ }
+}
+
+static void
+zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
+{
+ zio_t *pio = zio->io_parent;
+
+ mutex_enter(&pio->io_lock);
+ if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
+ pio->io_error = zio->io_error;
+ if (--*countp == 0 && pio->io_stalled == stage) {
+ if (pio->io_stage == ZIO_STAGE_VDEV_IO_START)
+ zio_vdev_io_enter(pio);
+ pio->io_stalled = 0;
+ mutex_exit(&pio->io_lock);
+ zio_next_stage_async(pio);
+ } else {
+ mutex_exit(&pio->io_lock);
+ }
+}
+
+static void
+zio_wait_children_ready(zio_t *zio)
+{
+ zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
+ &zio->io_children_notready);
+}
+
+void
+zio_wait_children_done(zio_t *zio)
+{
+ zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
+ &zio->io_children_notdone);
+}
+
+static void
+zio_ready(zio_t *zio)
+{
+ zio_t *pio = zio->io_parent;
+
+ if (pio != NULL)
+ zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
+ &pio->io_children_notready);
+
+ if (zio->io_bp)
+ zio->io_bp_copy = *zio->io_bp;
+
+ zio_next_stage(zio);
+}
+
+static void
+zio_done(zio_t *zio)
+{
+ zio_t *pio = zio->io_parent;
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ vdev_t *vd = zio->io_vd;
+ char blkbuf[300];
+
+ ASSERT(zio->io_children_notready == 0);
+ ASSERT(zio->io_children_notdone == 0);
+
+ if (bp != NULL) {
+ ASSERT(bp->blk_pad[0] == 0);
+ ASSERT(bp->blk_pad[1] == 0);
+ ASSERT(bp->blk_pad[2] == 0);
+ ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0);
+ if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
+ !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
+ ASSERT(!BP_SHOULD_BYTESWAP(bp));
+ }
+
+ if (vd != NULL)
+ vdev_stat_update(zio);
+
+ if (zio->io_error) {
+ sprintf_blkptr(blkbuf, bp ? bp : &zio->io_bp_copy);
+ dprintf("ZFS: %s (%s on %s off %llx: zio %p %s): error %d\n",
+ zio->io_error == ECKSUM ? "bad checksum" : "I/O failure",
+ zio_type_name[zio->io_type],
+ vdev_description(vd),
+ (u_longlong_t)zio->io_offset,
+ zio, blkbuf, zio->io_error);
+ }
+
+ if (zio->io_numerrors != 0 && zio->io_type == ZIO_TYPE_WRITE) {
+ sprintf_blkptr(blkbuf, bp ? bp : &zio->io_bp_copy);
+ dprintf("ZFS: %s (%s on %s off %llx: zio %p %s): %d errors\n",
+ "partial write",
+ zio_type_name[zio->io_type],
+ vdev_description(vd),
+ (u_longlong_t)zio->io_offset,
+ zio, blkbuf, zio->io_numerrors);
+ }
+
+ if (zio->io_error && !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
+ sprintf_blkptr(blkbuf, bp ? bp : &zio->io_bp_copy);
+ panic("ZFS: %s (%s on %s off %llx: zio %p %s): error %d",
+ zio->io_error == ECKSUM ? "bad checksum" : "I/O failure",
+ zio_type_name[zio->io_type],
+ vdev_description(vd),
+ (u_longlong_t)zio->io_offset,
+ zio, blkbuf, zio->io_error);
+ }
+
+ zio_clear_transform_stack(zio);
+
+ if (zio->io_done)
+ zio->io_done(zio);
+
+ ASSERT(zio->io_delegate_list == NULL);
+ ASSERT(zio->io_delegate_next == NULL);
+
+ if (pio != NULL) {
+ zio_t *next, *prev;
+
+ mutex_enter(&pio->io_lock);
+ next = zio->io_sibling_next;
+ prev = zio->io_sibling_prev;
+ if (next != NULL)
+ next->io_sibling_prev = prev;
+ if (prev != NULL)
+ prev->io_sibling_next = next;
+ if (pio->io_child == zio)
+ pio->io_child = next;
+ mutex_exit(&pio->io_lock);
+
+ zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
+ &pio->io_children_notdone);
+ }
+
+ if (pio == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_HELD))
+ spa_config_exit(spa);
+
+ if (zio->io_waiter != NULL) {
+ mutex_enter(&zio->io_lock);
+ ASSERT(zio->io_stage == ZIO_STAGE_DONE);
+ zio->io_stalled = zio->io_stage;
+ cv_broadcast(&zio->io_cv);
+ mutex_exit(&zio->io_lock);
+ } else {
+ kmem_free(zio, sizeof (zio_t));
+ }
+}
+
+/*
+ * ==========================================================================
+ * Compression support
+ * ==========================================================================
+ */
+static void
+zio_write_compress(zio_t *zio)
+{
+ int compress = zio->io_compress;
+ blkptr_t *bp = zio->io_bp;
+ void *cbuf;
+ uint64_t lsize = zio->io_size;
+ uint64_t csize = lsize;
+ uint64_t cbufsize = 0;
+ int pass;
+
+ if (bp->blk_birth == zio->io_txg) {
+ /*
+ * We're rewriting an existing block, which means we're
+ * working on behalf of spa_sync(). For spa_sync() to
+ * converge, it must eventually be the case that we don't
+ * have to allocate new blocks. But compression changes
+ * the blocksize, which forces a reallocate, and makes
+ * convergence take longer. Therefore, after the first
+ * few passes, stop compressing to ensure convergence.
+ */
+ pass = spa_sync_pass(zio->io_spa);
+ if (pass > zio_sync_pass.zp_dontcompress)
+ compress = ZIO_COMPRESS_OFF;
+ } else {
+ ASSERT(BP_IS_HOLE(bp));
+ pass = 1;
+ }
+
+ if (compress != ZIO_COMPRESS_OFF)
+ if (!zio_compress_data(compress, zio->io_data, zio->io_size,
+ &cbuf, &csize, &cbufsize))
+ compress = ZIO_COMPRESS_OFF;
+
+ if (compress != ZIO_COMPRESS_OFF && csize != 0)
+ zio_push_transform(zio, cbuf, csize, cbufsize);
+
+ /*
+ * The final pass of spa_sync() must be all rewrites, but the first
+ * few passes offer a trade-off: allocating blocks defers convergence,
+ * but newly allocated blocks are sequential, so they can be written
+ * to disk faster. Therefore, we allow the first few passes of
+ * spa_sync() to reallocate new blocks, but force rewrites after that.
+ * There should only be a handful of blocks after pass 1 in any case.
+ */
+ if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
+ pass > zio_sync_pass.zp_rewrite) {
+ ASSERT(csize != 0);
+ ASSERT3U(BP_GET_COMPRESS(bp), ==, compress);
+ ASSERT3U(BP_GET_LSIZE(bp), ==, lsize);
+
+ zio->io_pipeline = ZIO_REWRITE_PIPELINE;
+ } else {
+ if (bp->blk_birth == zio->io_txg) {
+ ASSERT3U(BP_GET_LSIZE(bp), ==, lsize);
+ bzero(bp, sizeof (blkptr_t));
+ }
+ if (csize == 0) {
+ BP_ZERO(bp);
+ zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
+ } else {
+ BP_SET_LSIZE(bp, lsize);
+ BP_SET_PSIZE(bp, csize);
+ BP_SET_COMPRESS(bp, compress);
+ zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE;
+ }
+ }
+
+ zio_next_stage(zio);
+}
+
+static void
+zio_read_decompress(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ void *data;
+ uint64_t size;
+ uint64_t bufsize;
+ int compress = BP_GET_COMPRESS(bp);
+
+ ASSERT(compress != ZIO_COMPRESS_OFF);
+
+ zio_pop_transform(zio, &data, &size, &bufsize);
+
+ if (zio_decompress_data(compress, data, size,
+ zio->io_data, zio->io_size))
+ zio->io_error = EIO;
+
+ zio_buf_free(data, bufsize);
+
+ zio_next_stage(zio);
+}
+
+/*
+ * ==========================================================================
+ * Gang block support
+ * ==========================================================================
+ */
+static void
+zio_gang_pipeline(zio_t *zio)
+{
+ /*
+ * By default, the pipeline assumes that we're dealing with a gang
+ * block. If we're not, strip out any gang-specific stages.
+ */
+ if (!DVA_GET_GANG(ZIO_GET_DVA(zio)))
+ zio->io_pipeline &= ~ZIO_GANG_STAGES;
+
+ zio_next_stage(zio);
+}
+
+static void
+zio_gang_byteswap(zio_t *zio)
+{
+ ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
+
+ if (BP_SHOULD_BYTESWAP(zio->io_bp))
+ byteswap_uint64_array(zio->io_data, zio->io_size);
+}
+
+static void
+zio_get_gang_header(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ uint64_t gsize = SPA_GANGBLOCKSIZE;
+ void *gbuf = zio_buf_alloc(gsize);
+
+ ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+
+ zio_push_transform(zio, gbuf, gsize, gsize);
+
+ zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize,
+ NULL, NULL, ZIO_TYPE_READ, zio->io_priority,
+ zio->io_flags & ZIO_FLAG_GANG_INHERIT,
+ ZIO_STAGE_OPEN, ZIO_READ_PIPELINE));
+
+ zio_wait_children_done(zio);
+}
+
+static void
+zio_read_gang_members(zio_t *zio)
+{
+ zio_gbh_phys_t *gbh;
+ uint64_t gsize, gbufsize, loff, lsize;
+ int i;
+
+ ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+
+ zio_gang_byteswap(zio);
+ zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
+
+ for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
+ blkptr_t *gbp = &gbh->zg_blkptr[i];
+ lsize = BP_GET_PSIZE(gbp);
+
+ ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
+ ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
+ ASSERT3U(loff + lsize, <=, zio->io_size);
+ ASSERT(i < SPA_GBH_NBLKPTRS);
+ ASSERT(!BP_IS_HOLE(gbp));
+
+ zio_nowait(zio_read(zio, zio->io_spa, gbp,
+ (char *)zio->io_data + loff, lsize, NULL, NULL,
+ zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT));
+ }
+
+ zio_buf_free(gbh, gbufsize);
+ zio_wait_children_done(zio);
+}
+
+static void
+zio_rewrite_gang_members(zio_t *zio)
+{
+ zio_gbh_phys_t *gbh;
+ uint64_t gsize, gbufsize, loff, lsize;
+ int i;
+
+ ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+ ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
+
+ zio_gang_byteswap(zio);
+ zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
+
+ ASSERT(gsize == gbufsize);
+
+ for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
+ blkptr_t *gbp = &gbh->zg_blkptr[i];
+ lsize = BP_GET_PSIZE(gbp);
+
+ ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
+ ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
+ ASSERT3U(loff + lsize, <=, zio->io_size);
+ ASSERT(i < SPA_GBH_NBLKPTRS);
+ ASSERT(!BP_IS_HOLE(gbp));
+
+ zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
+ zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
+ NULL, NULL, zio->io_priority, zio->io_flags));
+ }
+
+ zio_push_transform(zio, gbh, gsize, gbufsize);
+ zio_wait_children_ready(zio);
+}
+
+static void
+zio_free_gang_members(zio_t *zio)
+{
+ zio_gbh_phys_t *gbh;
+ uint64_t gsize, gbufsize;
+ int i;
+
+ ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+
+ zio_gang_byteswap(zio);
+ zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
+
+ for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
+ blkptr_t *gbp = &gbh->zg_blkptr[i];
+
+ if (BP_IS_HOLE(gbp))
+ continue;
+ zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
+ gbp, NULL, NULL));
+ }
+
+ zio_buf_free(gbh, gbufsize);
+ zio_next_stage(zio);
+}
+
+static void
+zio_claim_gang_members(zio_t *zio)
+{
+ zio_gbh_phys_t *gbh;
+ uint64_t gsize, gbufsize;
+ int i;
+
+ ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+
+ zio_gang_byteswap(zio);
+ zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
+
+ for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
+ blkptr_t *gbp = &gbh->zg_blkptr[i];
+ if (BP_IS_HOLE(gbp))
+ continue;
+ zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg,
+ gbp, NULL, NULL));
+ }
+
+ zio_buf_free(gbh, gbufsize);
+ zio_next_stage(zio);
+}
+
+static void
+zio_write_allocate_gang_member_done(zio_t *zio)
+{
+ zio_t *pio = zio->io_parent;
+ dva_t *cdva = ZIO_GET_DVA(zio);
+ dva_t *pdva = ZIO_GET_DVA(pio);
+ uint64_t asize;
+
+ ASSERT(DVA_GET_GANG(pdva));
+
+ /* XXBP - Need to be careful here with multiple DVAs */
+ mutex_enter(&pio->io_lock);
+ asize = DVA_GET_ASIZE(pdva);
+ asize += DVA_GET_ASIZE(cdva);
+ DVA_SET_ASIZE(pdva, asize);
+ mutex_exit(&pio->io_lock);
+}
+
+static void
+zio_write_allocate_gang_members(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ dva_t *dva = ZIO_GET_DVA(zio);
+ zio_gbh_phys_t *gbh;
+ uint64_t resid = zio->io_size;
+ uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
+ uint64_t gsize, loff, lsize;
+ uint32_t gbps_left;
+ int error;
+ int i;
+
+ gsize = SPA_GANGBLOCKSIZE;
+ gbps_left = SPA_GBH_NBLKPTRS;
+
+ error = metaslab_alloc(zio->io_spa, gsize, dva, zio->io_txg);
+ if (error == ENOSPC)
+ panic("can't allocate gang block header");
+ ASSERT(error == 0);
+
+ DVA_SET_GANG(dva, 1);
+
+ bp->blk_birth = zio->io_txg;
+
+ gbh = zio_buf_alloc(gsize);
+ bzero(gbh, gsize);
+
+ for (loff = 0, i = 0; loff != zio->io_size;
+ loff += lsize, resid -= lsize, gbps_left--, i++) {
+ blkptr_t *gbp = &gbh->zg_blkptr[i];
+ dva = &gbp->blk_dva[0];
+
+ ASSERT(gbps_left != 0);
+ maxalloc = MIN(maxalloc, resid);
+
+ while (resid <= maxalloc * gbps_left) {
+ error = metaslab_alloc(zio->io_spa, maxalloc, dva,
+ zio->io_txg);
+ if (error == 0)
+ break;
+ ASSERT3U(error, ==, ENOSPC);
+ if (maxalloc == SPA_MINBLOCKSIZE)
+ panic("really out of space");
+ maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
+ }
+
+ if (resid <= maxalloc * gbps_left) {
+ lsize = maxalloc;
+ BP_SET_LSIZE(gbp, lsize);
+ BP_SET_PSIZE(gbp, lsize);
+ BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
+ gbp->blk_birth = zio->io_txg;
+ zio_nowait(zio_rewrite(zio, zio->io_spa,
+ zio->io_checksum, zio->io_txg, gbp,
+ (char *)zio->io_data + loff, lsize,
+ zio_write_allocate_gang_member_done, NULL,
+ zio->io_priority, zio->io_flags));
+ } else {
+ lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
+ ASSERT(lsize != SPA_MINBLOCKSIZE);
+ zio_nowait(zio_write_allocate(zio, zio->io_spa,
+ zio->io_checksum, zio->io_txg, gbp,
+ (char *)zio->io_data + loff, lsize,
+ zio_write_allocate_gang_member_done, NULL,
+ zio->io_priority, zio->io_flags));
+ }
+ }
+
+ ASSERT(resid == 0 && loff == zio->io_size);
+
+ zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;
+
+ zio_push_transform(zio, gbh, gsize, gsize);
+ zio_wait_children_done(zio);
+}
+
+/*
+ * ==========================================================================
+ * Allocate and free blocks
+ * ==========================================================================
+ */
+static void
+zio_dva_allocate(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ dva_t *dva = ZIO_GET_DVA(zio);
+ int error;
+
+ ASSERT(BP_IS_HOLE(bp));
+
+ /* For testing, make some blocks above a certain size be gang blocks */
+ if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) {
+ zio_write_allocate_gang_members(zio);
+ return;
+ }
+
+ ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
+
+ error = metaslab_alloc(zio->io_spa, zio->io_size, dva, zio->io_txg);
+
+ if (error == 0) {
+ bp->blk_birth = zio->io_txg;
+ } else if (error == ENOSPC) {
+ if (zio->io_size == SPA_MINBLOCKSIZE)
+ panic("really, truly out of space");
+ zio_write_allocate_gang_members(zio);
+ return;
+ } else {
+ zio->io_error = error;
+ }
+ zio_next_stage(zio);
+}
+
+/*
+ * Return this I/O's DVA to the metaslab allocator and clear the block
+ * pointer so it reads as a hole again.
+ */
+static void
+zio_dva_free(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+	dva_t *dva = ZIO_GET_DVA(zio);
+
+	ASSERT(!BP_IS_HOLE(bp));
+
+	metaslab_free(zio->io_spa, dva, zio->io_txg);
+
+	BP_ZERO(bp);
+
+	zio_next_stage(zio);
+}
+
+/*
+ * Claim an already-allocated DVA (e.g. during intent-log replay) so the
+ * metaslab allocator knows the space is in use.  Any failure is recorded
+ * in io_error rather than handled here.
+ */
+static void
+zio_dva_claim(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+	dva_t *dva = ZIO_GET_DVA(zio);
+
+	ASSERT(!BP_IS_HOLE(bp));
+
+	zio->io_error = metaslab_claim(zio->io_spa, dva, zio->io_txg);
+
+	zio_next_stage(zio);
+}
+
+/*
+ * Translate the I/O's DVA into a (vdev, offset) pair for the device layer.
+ * Sets io_error to ENXIO for an unknown top-level vdev, or EOVERFLOW if
+ * the request extends past the vdev's allocatable size.
+ */
+static void
+zio_dva_translate(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	dva_t *dva = ZIO_GET_DVA(zio);
+	uint64_t vdev = DVA_GET_VDEV(dva);
+	uint64_t offset = DVA_GET_OFFSET(dva);
+
+	ASSERT3U(zio->io_size, ==, ZIO_GET_IOSIZE(zio));
+
+	zio->io_offset = offset;
+
+	if ((zio->io_vd = vdev_lookup_top(spa, vdev)) == NULL)
+		zio->io_error = ENXIO;
+	else if (offset + zio->io_size > zio->io_vd->vdev_asize)
+		zio->io_error = EOVERFLOW;
+
+	zio_next_stage(zio);
+}
+
+/*
+ * ==========================================================================
+ * Read and write to physical devices
+ * ==========================================================================
+ */
+/*
+ * Register this zio on its top-level vdev's pending-I/O list under
+ * vdev_io_lock.  The list_next == NULL assertion verifies the zio is
+ * not already queued.
+ */
+static void
+zio_vdev_io_enter(zio_t *zio)
+{
+	vdev_t *tvd = zio->io_vd->vdev_top;
+
+	mutex_enter(&tvd->vdev_io_lock);
+	ASSERT(zio->io_pending.list_next == NULL);
+	list_insert_tail(&tvd->vdev_io_pending, zio);
+	mutex_exit(&tvd->vdev_io_lock);
+}
+
+/*
+ * Remove this zio from its top-level vdev's pending-I/O list; when the
+ * list drains, wake anyone blocked in cv_wait on vdev_io_cv (e.g. code
+ * waiting for the vdev to quiesce).
+ */
+static void
+zio_vdev_io_exit(zio_t *zio)
+{
+	vdev_t *tvd = zio->io_vd->vdev_top;
+
+	mutex_enter(&tvd->vdev_io_lock);
+	ASSERT(zio->io_pending.list_next != NULL);
+	list_remove(&tvd->vdev_io_pending, zio);
+	if (list_head(&tvd->vdev_io_pending) == NULL)
+		cv_broadcast(&tvd->vdev_io_cv);
+	mutex_exit(&tvd->vdev_io_lock);
+}
+
+/*
+ * Taskq callback: after a short delay, reopen the top-level vdev and
+ * re-dispatch every zio that vdev_reopen() hands back on its retry
+ * queue.  The io_retry_next links are consumed (and cleared) here.
+ */
+static void
+zio_vdev_io_retry(void *vdarg)
+{
+	vdev_t *vd = vdarg;
+	zio_t *zio, *zq;
+
+	ASSERT(vd == vd->vdev_top);
+
+	/* XXPOLICY */
+	delay(hz);
+
+	vdev_reopen(vd, &zq);
+
+	while ((zio = zq) != NULL) {
+		zq = zio->io_retry_next;
+		zio->io_retry_next = NULL;
+		dprintf("async retry #%d for I/O to %s offset %llx\n",
+		    zio->io_retries, vdev_description(vd), zio->io_offset);
+		zio_next_stage_async(zio);
+	}
+}
+
+/*
+ * Prepare a zio for device I/O: mark first attempts failfast, convert a
+ * logical offset to a physical one on leaf vdevs (skipping the front
+ * label region), and register on the pending-I/O list.
+ */
+static void
+zio_vdev_io_setup(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+
+	/* XXPOLICY */
+	if (zio->io_retries == 0 && vd == vd->vdev_top)
+		zio->io_flags |= ZIO_FLAG_FAILFAST;
+
+	/* Leaf vdevs: shift past the label area exactly once. */
+	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) {
+		zio->io_flags |= ZIO_FLAG_PHYSICAL;
+		zio->io_offset += VDEV_LABEL_START_SIZE;
+	}
+
+	zio_vdev_io_enter(zio);
+
+	zio_next_stage(zio);
+}
+
+/*
+ * Issue the I/O to the vdev layer after sanity-checking alignment
+ * against the vdev's ashift.  This stage does not advance the pipeline
+ * itself; the completion interrupt does.
+ */
+static void
+zio_vdev_io_start(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+
+	ASSERT(P2PHASE(zio->io_offset, 1ULL << zio->io_vd->vdev_ashift) == 0);
+	ASSERT(P2PHASE(zio->io_size, 1ULL << zio->io_vd->vdev_ashift) == 0);
+	ASSERT(bp == NULL || ZIO_GET_IOSIZE(zio) == zio->io_size);
+	ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
+
+	vdev_io_start(zio);
+
+	/* zio_next_stage_async() gets called from io completion interrupt */
+}
+
+/* Pipeline stage: hand I/O completion processing to the vdev layer. */
+static void
+zio_vdev_io_done(zio_t *zio)
+{
+	vdev_io_done(zio);
+}
+
+/* XXPOLICY */
+/*
+ * Retry policy: decide whether a failed I/O should be re-driven.
+ * Only failed, non-delegated I/Os at a top-level vdev are eligible;
+ * speculative/can-fail I/Os give up after 300 retries, and checksum or
+ * ENXIO errors give up after the first retry.
+ */
+static boolean_t
+zio_should_retry(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+
+	if (zio->io_error == 0)
+		return (B_FALSE);
+	if (zio->io_delegate_list != NULL)
+		return (B_FALSE);
+	if (vd != vd->vdev_top)
+		return (B_FALSE);
+	if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
+		return (B_FALSE);
+	if (zio->io_retries > 300 &&
+	    (zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL)))
+		return (B_FALSE);
+	if (zio->io_retries > 1 &&
+	    (zio->io_error == ECKSUM || zio->io_error == ENXIO))
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+/*
+ * Post-I/O assessment: dequeue from the pending list, and either retry
+ * a failed I/O (immediately the first time; via the vdev's retry queue
+ * and a taskq-driven vdev_reopen() thereafter) or continue the pipeline.
+ */
+static void
+zio_vdev_io_assess(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_t *tvd = vd->vdev_top;
+
+	zio_vdev_io_exit(zio);
+
+	ASSERT(zio->io_vsd == NULL);
+
+	/*
+	 * If the I/O failed, determine whether we should attempt to retry it.
+	 */
+	/* XXPOLICY */
+	if (zio_should_retry(zio)) {
+		zio_t *zq;
+
+		ASSERT(tvd == vd);
+		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE));
+
+		zio->io_retries++;
+		zio->io_error = 0;
+		zio->io_flags &= ZIO_FLAG_VDEV_INHERIT;
+		/* XXPOLICY */
+		zio->io_flags &= ~ZIO_FLAG_FAILFAST;
+		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+		/* Rewind so the next stage advance re-enters VDEV_IO_SETUP. */
+		zio->io_stage = ZIO_STAGE_VDEV_IO_SETUP - 1;
+
+		dprintf("retry #%d for %s to %s offset %llx\n",
+		    zio->io_retries, zio_type_name[zio->io_type],
+		    vdev_description(vd), zio->io_offset);
+
+		/*
+		 * If this is the first retry, do it immediately.
+		 */
+		/* XXPOLICY */
+		if (zio->io_retries == 1) {
+			zio_next_stage_async(zio);
+			return;
+		}
+
+		/*
+		 * This was not the first retry, so go through the
+		 * longer enqueue/delay/vdev_reopen() process.
+		 */
+		mutex_enter(&tvd->vdev_io_lock);
+		ASSERT(zio->io_retry_next == NULL);
+		zio->io_retry_next = zq = tvd->vdev_io_retry;
+		tvd->vdev_io_retry = zio;
+		mutex_exit(&tvd->vdev_io_lock);
+		/* Only the thread that found the queue empty dispatches. */
+		if (zq == NULL)
+			(void) taskq_dispatch(
+			    tvd->vdev_spa->spa_vdev_retry_taskq,
+			    zio_vdev_io_retry, tvd, TQ_SLEEP);
+		return;
+	}
+
+	zio_next_stage(zio);
+}
+
+/*
+ * Rewind a successful-so-far zio so VDEV_IO_START runs again
+ * (caller arranges the re-dispatch).
+ */
+void
+zio_vdev_io_reissue(zio_t *zio)
+{
+	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
+	ASSERT(zio->io_error == 0);
+
+	zio->io_stage--;
+}
+
+/* Rewind a zio so the VDEV_IO_DONE stage will be executed again. */
+void
+zio_vdev_io_redone(zio_t *zio)
+{
+	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
+
+	zio->io_stage--;
+}
+
+/*
+ * Skip the device I/O entirely: mark the zio bypassed and jump the
+ * pipeline straight to the VDEV_IO_ASSESS stage.
+ */
+void
+zio_vdev_io_bypass(zio_t *zio)
+{
+	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
+	ASSERT(zio->io_error == 0);
+
+	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
+	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
+}
+
+/*
+ * ==========================================================================
+ * Generate and verify checksums
+ * ==========================================================================
+ */
+/*
+ * Record the checksum algorithm and host byte order in the bp, then
+ * compute the block's checksum into bp->blk_cksum.
+ */
+static void
+zio_checksum_generate(zio_t *zio)
+{
+	int checksum = zio->io_checksum;
+	blkptr_t *bp = zio->io_bp;
+
+	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
+
+	BP_SET_CHECKSUM(bp, checksum);
+	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+
+	zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);
+
+	zio_next_stage(zio);
+}
+
+/*
+ * Checksum a gang header.  The self-describing verifier (vdev, offset,
+ * birth txg) is seeded into the header's embedded block tail before the
+ * checksum is computed over the whole header.
+ */
+static void
+zio_gang_checksum_generate(zio_t *zio)
+{
+	zio_cksum_t zc;
+	zio_gbh_phys_t *gbh = zio->io_data;
+
+	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
+	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+
+	zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum);
+
+	zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);
+
+	zio_next_stage(zio);
+}
+
+/*
+ * Verify the data just read against the expected checksum; a mismatch
+ * lands in io_error (typically ECKSUM) for the assess stage to act on.
+ */
+static void
+zio_checksum_verify(zio_t *zio)
+{
+	if (zio->io_bp != NULL) {
+		zio->io_error = zio_checksum_error(zio);
+		if (zio->io_error) {
+			dprintf("bad checksum on vdev %s\n",
+			    vdev_description(zio->io_vd));
+		}
+	}
+
+	zio_next_stage(zio);
+}
+
+/*
+ * Called by RAID-Z to ensure we don't compute the checksum twice.
+ */
+/* Drop the CHECKSUM_VERIFY stage from this zio's pipeline mask. */
+void
+zio_checksum_verified(zio_t *zio)
+{
+	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
+}
+
+/*
+ * Set the external verifier for a gang block based on stuff in the bp
+ */
+void
+zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
+{
+	dva_t *dva = ZIO_GET_DVA(zio);
+
+	/* Verifier = (vdev, offset, birth txg, 0): unique per gang header. */
+	zcp->zc_word[0] = DVA_GET_VDEV(dva);
+	zcp->zc_word[1] = DVA_GET_OFFSET(dva);
+	zcp->zc_word[2] = zio->io_bp->blk_birth;
+	zcp->zc_word[3] = 0;
+}
+
+/*
+ * ==========================================================================
+ * Define the pipeline
+ * ==========================================================================
+ */
+typedef void zio_pipe_stage_t(zio_t *zio);
+
+/* Sentinel pipeline entry: reaching stage 0 or DONE+1 is a fatal bug. */
+static void
+zio_badop(zio_t *zio)
+{
+	panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio);
+}
+
+/*
+ * Pipeline dispatch table, indexed by io_stage.  Entry order must match
+ * the ZIO_STAGE_* enumeration exactly; zio_badop bounds both ends so a
+ * stray stage value panics instead of corrupting state.
+ */
+zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
+	zio_badop,
+	zio_wait_children_ready,
+	zio_write_compress,
+	zio_checksum_generate,
+	zio_gang_pipeline,
+	zio_get_gang_header,
+	zio_rewrite_gang_members,
+	zio_free_gang_members,
+	zio_claim_gang_members,
+	zio_dva_allocate,
+	zio_dva_free,
+	zio_dva_claim,
+	zio_gang_checksum_generate,
+	zio_ready,
+	zio_dva_translate,
+	zio_vdev_io_setup,
+	zio_vdev_io_start,
+	zio_vdev_io_done,
+	zio_vdev_io_assess,
+	zio_wait_children_done,
+	zio_checksum_verify,
+	zio_read_gang_members,
+	zio_read_decompress,
+	zio_done,
+	zio_badop
+};
+
+/*
+ * Move an I/O to the next stage of the pipeline and execute that stage.
+ * There's no locking on io_stage because there's no legitimate way for
+ * multiple threads to be attempting to process the same I/O.
+ */
+void
+zio_next_stage(zio_t *zio)
+{
+	uint32_t pipeline = zio->io_pipeline;
+
+	ASSERT(!MUTEX_HELD(&zio->io_lock));
+
+	if (zio->io_error) {
+		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
+		    zio, vdev_description(zio->io_vd),
+		    zio->io_offset, zio->io_stage, zio->io_error);
+		/*
+		 * Outside the vdev-I/O stages an error short-circuits the
+		 * pipeline down to the error-handling stages only.
+		 */
+		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
+			pipeline &= ZIO_ERROR_PIPELINE_MASK;
+	}
+
+	/* Advance to the next stage whose bit is set in the pipeline mask. */
+	while (((1U << ++zio->io_stage) & pipeline) == 0)
+		continue;
+
+	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
+	ASSERT(zio->io_stalled == 0);
+
+	zio_pipeline[zio->io_stage](zio);
+}
+
+/*
+ * Like zio_next_stage(), but stages marked in io_async_stages are handed
+ * to a taskq (issue taskqs before VDEV_IO_DONE, intr taskqs after)
+ * instead of being run inline.
+ */
+void
+zio_next_stage_async(zio_t *zio)
+{
+	taskq_t *tq;
+	uint32_t pipeline = zio->io_pipeline;
+
+	ASSERT(!MUTEX_HELD(&zio->io_lock));
+
+	if (zio->io_error) {
+		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
+		    zio, vdev_description(zio->io_vd),
+		    zio->io_offset, zio->io_stage, zio->io_error);
+		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
+			pipeline &= ZIO_ERROR_PIPELINE_MASK;
+	}
+
+	while (((1U << ++zio->io_stage) & pipeline) == 0)
+		continue;
+
+	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
+	ASSERT(zio->io_stalled == 0);
+
+	/*
+	 * For performance, we'll probably want two sets of task queues:
+	 * per-CPU issue taskqs and per-CPU completion taskqs.  The per-CPU
+	 * part is for read performance: since we have to make a pass over
+	 * the data to checksum it anyway, we want to do this on the same CPU
+	 * that issued the read, because (assuming CPU scheduling affinity)
+	 * that thread is probably still there.  Getting this optimization
+	 * right avoids performance-hostile cache-to-cache transfers.
+	 *
+	 * Note that having two sets of task queues is also necessary for
+	 * correctness: if all of the issue threads get bogged down waiting
+	 * for dependent reads (e.g. metaslab freelist) to complete, then
+	 * there won't be any threads available to service I/O completion
+	 * interrupts.
+	 */
+	if ((1U << zio->io_stage) & zio->io_async_stages) {
+		if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE)
+			tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
+		else
+			tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type];
+		(void) taskq_dispatch(tq,
+		    (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
+	} else {
+		zio_pipeline[zio->io_stage](zio);
+	}
+}
+
+/*
+ * Try to allocate an intent log block. Return 0 on success, errno on failure.
+ */
+int
+zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp,
+    uint64_t txg)
+{
+	int error;
+
+	/* Hold the config lock across the allocation. */
+	spa_config_enter(spa, RW_READER);
+
+	BP_ZERO(bp);
+
+	error = metaslab_alloc(spa, size, BP_IDENTITY(bp), txg);
+
+	if (error == 0) {
+		/* Fill in the bp for an uncompressed intent-log block. */
+		BP_SET_CHECKSUM(bp, checksum);
+		BP_SET_LSIZE(bp, size);
+		BP_SET_PSIZE(bp, size);
+		BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+		BP_SET_TYPE(bp, DMU_OT_INTENT_LOG);
+		BP_SET_LEVEL(bp, 0);
+		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+		bp->blk_birth = txg;
+	}
+
+	spa_config_exit(spa);
+
+	return (error);
+}
+
+/*
+ * Free an intent log block. We know it can't be a gang block, so there's
+ * nothing to do except metaslab_free() it.
+ */
+void
+zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
+{
+	/* Intent-log blocks are never gang blocks. */
+	ASSERT(DVA_GET_GANG(BP_IDENTITY(bp)) == 0);
+
+	dprintf_bp(bp, "txg %llu: ", txg);
+
+	spa_config_enter(spa, RW_READER);
+
+	metaslab_free(spa, BP_IDENTITY(bp), txg);
+
+	spa_config_exit(spa);
+}
diff --git a/usr/src/uts/common/fs/zfs/zio_checksum.c b/usr/src/uts/common/fs/zfs/zio_checksum.c
new file mode 100644
index 0000000000..dc31527ce8
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zio_checksum.c
@@ -0,0 +1,174 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+
+/*
+ * Checksum vectors.
+ *
+ * In the SPA, everything is checksummed. We support checksum vectors
+ * for three distinct reasons:
+ *
+ * 1. Different kinds of data need different levels of protection.
+ * For SPA metadata, we always want a very strong checksum.
+ * For user data, we let users make the trade-off between speed
+ * and checksum strength.
+ *
+ * 2. Cryptographic hash and MAC algorithms are an area of active research.
+ * It is likely that in future hash functions will be at least as strong
+ * as current best-of-breed, and may be substantially faster as well.
+ * We want the ability to take advantage of these new hashes as soon as
+ * they become available.
+ *
+ * 3. If someone develops hardware that can compute a strong hash quickly,
+ * we want the ability to take advantage of that hardware.
+ *
+ * Of course, we don't want a checksum upgrade to invalidate existing
+ * data, so we store the checksum *function* in five bits of the DVA.
+ * This gives us room for up to 32 different checksum functions.
+ *
+ * When writing a block, we always checksum it with the latest-and-greatest
+ * checksum function of the appropriate strength. When reading a block,
+ * we compare the expected checksum against the actual checksum, which we
+ * compute via the checksum function specified in the DVA encoding.
+ */
+
+/* "off" checksum vector: always produces an all-zero checksum. */
+/*ARGSUSED*/
+static void
+zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
+}
+
+/*
+ * Per-algorithm checksum vectors, indexed by ZIO_CHECKSUM_* value.
+ * Columns: native func, byteswap func, ci_correctable, ci_zbt
+ * (checksum stored in an embedded block tail), name.
+ */
+zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
+	NULL,			NULL,			0, 0,	"inherit",
+	NULL,			NULL,			0, 0,	"on",
+	zio_checksum_off,	zio_checksum_off,	0, 0,	"off",
+	zio_checksum_SHA256,	zio_checksum_SHA256,	1, 1,	"label",
+	zio_checksum_SHA256,	zio_checksum_SHA256,	1, 1,	"gang_header",
+	fletcher_2_native,	fletcher_2_byteswap,	0, 1,	"zilog",
+	fletcher_2_native,	fletcher_2_byteswap,	0, 0,	"fletcher2",
+	fletcher_4_native,	fletcher_4_byteswap,	1, 0,	"fletcher4",
+	zio_checksum_SHA256,	zio_checksum_SHA256,	1, 0,	"SHA256",
+};
+
+/*
+ * Resolve a dataset's checksum setting: "inherit" takes the parent's
+ * (already-resolved) value, "on" maps to the current default algorithm,
+ * and anything else stands as-is.
+ */
+uint8_t
+zio_checksum_select(uint8_t child, uint8_t parent)
+{
+	uint8_t result = child;
+
+	ASSERT(child < ZIO_CHECKSUM_FUNCTIONS);
+	ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS);
+	ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
+
+	if (result == ZIO_CHECKSUM_INHERIT)
+		result = parent;
+	else if (result == ZIO_CHECKSUM_ON)
+		result = ZIO_CHECKSUM_ON_VALUE;
+
+	return (result);
+}
+
+/*
+ * Generate the checksum.
+ */
+void
+zio_checksum(uint_t checksum, zio_cksum_t *zcp, void *data, uint64_t size)
+{
+	zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
+	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+	zio_cksum_t zbt_cksum;
+
+	ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
+	ASSERT(ci->ci_func[0] != NULL);
+
+	if (ci->ci_zbt) {
+		/*
+		 * Embedded-checksum layout: save whatever was in the block
+		 * tail into *zcp, stamp the magic, checksum the whole block
+		 * (tail included), then store the result back in the tail.
+		 */
+		*zcp = zbt->zbt_cksum;
+		zbt->zbt_magic = ZBT_MAGIC;
+		ci->ci_func[0](data, size, &zbt_cksum);
+		zbt->zbt_cksum = zbt_cksum;
+	} else {
+		/* Plain layout: checksum goes to the caller's zcp (the bp). */
+		ci->ci_func[0](data, size, zcp);
+	}
+}
+
+/*
+ * Verify a block's checksum.  Returns 0 on match, ECKSUM on mismatch,
+ * EINVAL for an unknown/unusable checksum function.  For embedded-tail
+ * (ci_zbt) checksums, the expected value lives in the block tail and the
+ * verifier (from the bp, or computed for gang headers) is swapped in
+ * before recomputing; the tail is restored afterwards.
+ */
+int
+zio_checksum_error(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+	dva_t *dva = ZIO_GET_DVA(zio);
+	zio_cksum_t zc = bp->blk_cksum;
+	uint_t checksum = DVA_GET_GANG(dva) ? ZIO_CHECKSUM_GANG_HEADER :
+	    BP_GET_CHECKSUM(bp);
+	int byteswap = BP_SHOULD_BYTESWAP(bp);
+	void *data = zio->io_data;
+	uint64_t size = zio->io_size;
+	zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
+	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+	zio_cksum_t actual_cksum, expected_cksum;
+
+	if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
+		return (EINVAL);
+
+	if (ci->ci_zbt) {
+		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
+			zio_set_gang_verifier(zio, &zc);
+
+		if (zbt->zbt_magic == BSWAP_64(ZBT_MAGIC)) {
+			/* Block was written on the other byte order. */
+			expected_cksum = zbt->zbt_cksum;
+			byteswap_uint64_array(&expected_cksum,
+			    sizeof (zio_cksum_t));
+			zbt->zbt_cksum = zc;
+			byteswap_uint64_array(&zbt->zbt_cksum,
+			    sizeof (zio_cksum_t));
+			ci->ci_func[1](data, size, &actual_cksum);
+			zbt->zbt_cksum = expected_cksum;
+			byteswap_uint64_array(&zbt->zbt_cksum,
+			    sizeof (zio_cksum_t));
+		} else {
+			expected_cksum = zbt->zbt_cksum;
+			zbt->zbt_cksum = zc;
+			ci->ci_func[0](data, size, &actual_cksum);
+			zbt->zbt_cksum = expected_cksum;
+		}
+		zc = expected_cksum;
+	} else {
+		ASSERT(!DVA_GET_GANG(dva));
+		ci->ci_func[byteswap](data, size, &actual_cksum);
+	}
+
+	/* Branch-free four-word comparison. */
+	if ((actual_cksum.zc_word[0] - zc.zc_word[0]) |
+	    (actual_cksum.zc_word[1] - zc.zc_word[1]) |
+	    (actual_cksum.zc_word[2] - zc.zc_word[2]) |
+	    (actual_cksum.zc_word[3] - zc.zc_word[3]))
+		return (ECKSUM);
+
+	return (0);
+}
diff --git a/usr/src/uts/common/fs/zfs/zio_compress.c b/usr/src/uts/common/fs/zfs/zio_compress.c
new file mode 100644
index 0000000000..51d85172bb
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zio_compress.c
@@ -0,0 +1,134 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/compress.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/zio_compress.h>
+
+/*
+ * Compression vectors.
+ */
+
+/*
+ * Per-algorithm compression vectors, indexed by ZIO_COMPRESS_* value.
+ * Columns: compress func, decompress func, name.
+ */
+zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
+	NULL,			NULL,			"inherit",
+	NULL,			NULL,			"on",
+	NULL,			NULL,			"uncompressed",
+	lzjb_compress,		lzjb_decompress,	"lzjb",
+};
+
+/*
+ * Resolve a dataset's compression setting: "inherit" takes the parent's
+ * (already-resolved) value, "on" maps to the current default algorithm,
+ * and anything else stands as-is.
+ */
+uint8_t
+zio_compress_select(uint8_t child, uint8_t parent)
+{
+	uint8_t result = child;
+
+	ASSERT(child < ZIO_COMPRESS_FUNCTIONS);
+	ASSERT(parent < ZIO_COMPRESS_FUNCTIONS);
+	ASSERT(parent != ZIO_COMPRESS_INHERIT && parent != ZIO_COMPRESS_ON);
+
+	if (result == ZIO_COMPRESS_INHERIT)
+		result = parent;
+	else if (result == ZIO_COMPRESS_ON)
+		result = ZIO_COMPRESS_ON_VALUE;
+
+	return (result);
+}
+
+/*
+ * Compress srcsize bytes at src.  Returns 1 with a newly allocated
+ * buffer in *destp (its payload size in *destsizep, allocation size in
+ * *destbufsizep) on success; all-zero input succeeds with *destsizep = 0
+ * and no buffer.  Returns 0 if the data doesn't compress by at least
+ * 12.5%.  On success the caller owns *destp (free with zio_buf_free).
+ */
+int
+zio_compress_data(int cpfunc, void *src, uint64_t srcsize, void **destp,
+    uint64_t *destsizep, uint64_t *destbufsizep)
+{
+	uint64_t *word, *word_end;
+	uint64_t ciosize, gapsize, destbufsize;
+	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+	char *dest;
+	uint_t allzero;
+
+	ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
+	ASSERT(ci->ci_compress != NULL);
+
+	/*
+	 * If the data is all zeroes, we don't even need to allocate
+	 * a block for it.  We indicate this by setting *destsizep = 0.
+	 */
+	allzero = 1;
+	word = src;
+	word_end = (uint64_t *)(uintptr_t)((uintptr_t)word + srcsize);
+	while (word < word_end) {
+		if (*word++ != 0) {
+			allzero = 0;
+			break;
+		}
+	}
+	if (allzero) {
+		*destp = NULL;
+		*destsizep = 0;
+		*destbufsizep = 0;
+		return (1);
+	}
+
+	/* Compress at least 12.5% */
+	destbufsize = P2ALIGN(srcsize - (srcsize >> 3), SPA_MINBLOCKSIZE);
+	if (destbufsize == 0)
+		return (0);
+	dest = zio_buf_alloc(destbufsize);
+
+	ciosize = ci->ci_compress(src, dest, (size_t)srcsize,
+	    (size_t)destbufsize);
+	if (ciosize > destbufsize) {
+		zio_buf_free(dest, destbufsize);
+		return (0);
+	}
+
+	/* Cool.  We compressed at least as much as we were hoping to. */
+
+	/* For security, make sure we don't write random heap crap to disk */
+	gapsize = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE) - ciosize;
+	if (gapsize != 0) {
+		bzero(dest + ciosize, gapsize);
+		ciosize += gapsize;
+	}
+
+	ASSERT3U(ciosize, <=, destbufsize);
+	ASSERT(P2PHASE(ciosize, SPA_MINBLOCKSIZE) == 0);
+	*destp = dest;
+	*destsizep = ciosize;
+	*destbufsizep = destbufsize;
+
+	return (1);
+}
+
+/*
+ * Decompress srcsize bytes at src into the caller-supplied dest buffer
+ * of destsize bytes, using the named compression vector.
+ */
+int
+zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
+    void *dest, uint64_t destsize)
+{
+	zio_compress_info_t *ci;
+
+	ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
+
+	ci = &zio_compress_table[cpfunc];
+	return (ci->ci_decompress(src, dest, srcsize, destsize));
+}
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
new file mode 100644
index 0000000000..ceb9e24d72
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -0,0 +1,793 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * ZFS volume emulation driver.
+ *
+ * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
+ * Volumes are accessed through the symbolic links named:
+ *
+ * /dev/zvol/dsk/<pool_name>/<dataset_name>
+ * /dev/zvol/rdsk/<pool_name>/<dataset_name>
+ *
+ * These links are created by the ZFS-specific devfsadm link generator.
+ * Volumes are persistent through reboot. No user command needs to be
+ * run before opening and using a device.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/aio_req.h>
+#include <sys/uio.h>
+#include <sys/buf.h>
+#include <sys/modctl.h>
+#include <sys/open.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dsl_prop.h>
+#include <sys/dkio.h>
+#include <sys/efi_partition.h>
+#include <sys/byteorder.h>
+#include <sys/pathname.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/crc32.h>
+#include <sys/dirent.h>
+#include <sys/policy.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/mkdev.h>
+
+#include "zfs_namecheck.h"
+
+/* Fixed object numbers within a zvol objset: data object and props ZAP. */
+#define	ZVOL_OBJ		1ULL
+#define	ZVOL_ZAP_OBJ		2ULL
+#define	ZVOL_MAX_MINOR	MAXMIN32
+
+/* DDI soft-state anchor, keyed by minor number. */
+static void *zvol_state;
+
+/*
+ * This lock protects the zvol_state structure from being modified
+ * while it's being used, e.g. an open that comes in before a create
+ * finishes.  It also protects temporary opens of the dataset so that,
+ * e.g., an open doesn't get a spurious EBUSY.
+ */
+static kmutex_t zvol_state_lock;
+static uint32_t zvol_minors;
+
+/*
+ * The in-core state of each volume.
+ */
+typedef struct zvol_state {
+	char		zv_name[MAXPATHLEN]; /* pool/dd name */
+	uint64_t	zv_volsize;	/* amount of space we advertise */
+	minor_t		zv_minor;	/* minor number */
+	uint8_t		zv_min_bs;	/* minimum addressable block shift */
+	uint8_t		zv_readonly;	/* hard readonly; like write-protect */
+	objset_t	*zv_objset;	/* objset handle */
+	uint32_t	zv_mode;	/* DS_MODE_* flags at open time */
+	uint32_t	zv_open_count[OTYPCNT];	/* open counts */
+	uint32_t	zv_total_opens;	/* total open count */
+} zvol_state_t;
+
+/*
+ * Publish the volume's current size as the "Size" and "Nblocks" device
+ * properties so consumers (e.g. prtvtoc, dkio) see the new capacity.
+ */
+static void
+zvol_size_changed(zvol_state_t *zv, dev_t dev)
+{
+	dev = makedevice(getmajor(dev), zv->zv_minor);
+
+	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
+	    "Size", zv->zv_volsize) == DDI_SUCCESS);
+	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
+	    "Nblocks", lbtodb(zv->zv_volsize)) == DDI_SUCCESS);
+}
+
+/*
+ * Validate (and normalize) a requested volume size: nonzero, rounded up
+ * to a SPA_MAXBLOCKSIZE multiple, and representable as a device offset
+ * on 32-bit kernels.  Note: mutates zc_volsize in place.
+ */
+int
+zvol_check_volsize(zfs_cmd_t *zc)
+{
+	if (zc->zc_volsize == 0)
+		return (EINVAL);
+
+	zc->zc_volsize = P2ROUNDUP(zc->zc_volsize, SPA_MAXBLOCKSIZE);
+#ifdef _ILP32
+	if (zc->zc_volsize - 1 > SPEC_MAXOFFSET_T)
+		return (EOVERFLOW);
+#endif
+	return (0);
+}
+
+/*
+ * Validate a requested volume block size: must be a power of two within
+ * [SPA_MINBLOCKSIZE, SPA_MAXBLOCKSIZE].  Returns 0 if valid, else EDOM.
+ */
+int
+zvol_check_volblocksize(zfs_cmd_t *zc)
+{
+	if (zc->zc_volblocksize >= SPA_MINBLOCKSIZE &&
+	    zc->zc_volblocksize <= SPA_MAXBLOCKSIZE &&
+	    ISP2(zc->zc_volblocksize))
+		return (0);
+
+	return (EDOM);
+}
+
+/* dsl_prop callback: mirror the dataset's "readonly" property in-core. */
+static void
+zvol_readonly_changed_cb(void *arg, uint64_t newval)
+{
+	zvol_state_t *zv = arg;
+
+	zv->zv_readonly = (uint8_t)newval;
+}
+
+/*
+ * Fill in the ioctl reply with the volume's size (from the props ZAP)
+ * and its block size (from the data object's DMU info).
+ */
+int
+zvol_get_stats(zfs_cmd_t *zc, objset_t *os)
+{
+	int error;
+	dmu_object_info_t doi;
+
+	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &zc->zc_volsize);
+
+	if (error)
+		return (error);
+
+	error = dmu_object_info(os, ZVOL_OBJ, &doi);
+
+	if (error == 0)
+		zc->zc_volblocksize = doi.doi_data_block_size;
+
+	return (error);
+}
+
+/*
+ * Find a free minor number.
+ */
+/*
+ * Find a free minor number by scanning the soft-state table.
+ * Returns 0 (never a valid zvol minor) when all slots are in use.
+ * Caller must hold zvol_state_lock.
+ */
+static minor_t
+zvol_minor_alloc(void)
+{
+	minor_t minor;
+
+	ASSERT(MUTEX_HELD(&zvol_state_lock));
+
+	for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++)
+		if (ddi_get_soft_state(zvol_state, minor) == NULL)
+			return (minor);
+
+	return (0);
+}
+
+/*
+ * Look up a volume's in-core state by dataset name.  Returns NULL when
+ * no minor has that name.  Caller must hold zvol_state_lock.
+ *
+ * Fix: the previous scan fell off the end of the loop with zv still
+ * holding whatever the final iteration fetched, so a populated but
+ * non-matching last slot could be returned as a false match.  Return
+ * the match from inside the loop and NULL otherwise.
+ */
+static zvol_state_t *
+zvol_minor_lookup(char *name)
+{
+	minor_t minor;
+	zvol_state_t *zv;
+
+	ASSERT(MUTEX_HELD(&zvol_state_lock));
+
+	for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) {
+		zv = ddi_get_soft_state(zvol_state, minor);
+		if (zv != NULL && strcmp(zv->zv_name, name) == 0)
+			return (zv);
+	}
+
+	return (NULL);
+}
+
+/*
+ * dmu_objset_create() callback: lay out a new zvol objset — the data
+ * object at ZVOL_OBJ, the property ZAP at ZVOL_ZAP_OBJ, and the initial
+ * "size" entry.  These claims cannot fail on a fresh objset, hence the
+ * ASSERTs rather than error returns.
+ */
+void
+zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx)
+{
+	zfs_cmd_t *zc = arg;
+	int error;
+
+	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, zc->zc_volblocksize,
+	    DMU_OT_NONE, 0, tx);
+	ASSERT(error == 0);
+
+	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
+	    DMU_OT_NONE, 0, tx);
+	ASSERT(error == 0);
+
+	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &zc->zc_volsize, tx);
+	ASSERT(error == 0);
+}
+
+/*
+ * Create a minor node for the specified volume.
+ */
+/*
+ * Create char ("minor,raw") and block ("minor") minor nodes for the
+ * named volume, preferring the minor number recorded in any existing
+ * /dev/zvol symlink so device paths stay stable across reboots.
+ * On any failure, everything acquired so far is unwound in reverse
+ * order before returning.
+ */
+int
+zvol_create_minor(zfs_cmd_t *zc)
+{
+	char *name = zc->zc_name;
+	dev_t dev = zc->zc_dev;
+	zvol_state_t *zv;
+	objset_t *os;
+	uint64_t volsize;
+	minor_t minor = 0;
+	struct pathname linkpath;
+	int ds_mode = DS_MODE_PRIMARY;
+	vnode_t *vp = NULL;
+	char *devpath;
+	size_t devpathlen = strlen(ZVOL_FULL_DEV_DIR) + 1 + strlen(name) + 1;
+	char chrbuf[30], blkbuf[30];
+	int error;
+
+	mutex_enter(&zvol_state_lock);
+
+	if ((zv = zvol_minor_lookup(name)) != NULL) {
+		mutex_exit(&zvol_state_lock);
+		return (EEXIST);
+	}
+
+	/* Snapshots ('@' in the name) are exposed read-only. */
+	if (strchr(name, '@') != 0)
+		ds_mode |= DS_MODE_READONLY;
+
+	error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os);
+
+	if (error) {
+		mutex_exit(&zvol_state_lock);
+		return (error);
+	}
+
+	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+
+	if (error) {
+		dmu_objset_close(os);
+		mutex_exit(&zvol_state_lock);
+		return (error);
+	}
+
+	/*
+	 * If there's an existing /dev/zvol symlink, try to use the
+	 * same minor number we used last time.
+	 */
+	devpath = kmem_alloc(devpathlen, KM_SLEEP);
+
+	(void) sprintf(devpath, "%s/%s", ZVOL_FULL_DEV_DIR, name);
+
+	error = lookupname(devpath, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp);
+
+	kmem_free(devpath, devpathlen);
+
+	if (error == 0 && vp->v_type != VLNK)
+		error = EINVAL;
+
+	if (error == 0) {
+		/* Parse the minor out of the ".../zfs@<minor>" link target. */
+		pn_alloc(&linkpath);
+		error = pn_getsymlink(vp, &linkpath, kcred);
+		if (error == 0) {
+			char *ms = strstr(linkpath.pn_path, ZVOL_PSEUDO_DEV);
+			if (ms != NULL) {
+				ms += strlen(ZVOL_PSEUDO_DEV);
+				minor = stoi(&ms);
+			}
+		}
+		pn_free(&linkpath);
+	}
+
+	if (vp != NULL)
+		VN_RELE(vp);
+
+	/*
+	 * If we found a minor but it's already in use, we must pick a new one.
+	 */
+	if (minor != 0 && ddi_get_soft_state(zvol_state, minor) != NULL)
+		minor = 0;
+
+	if (minor == 0)
+		minor = zvol_minor_alloc();
+
+	if (minor == 0) {
+		dmu_objset_close(os);
+		mutex_exit(&zvol_state_lock);
+		return (ENXIO);
+	}
+
+	if (ddi_soft_state_zalloc(zvol_state, minor) != DDI_SUCCESS) {
+		dmu_objset_close(os);
+		mutex_exit(&zvol_state_lock);
+		return (EAGAIN);
+	}
+
+	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME, name);
+
+	(void) sprintf(chrbuf, "%uc,raw", minor);
+
+	if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
+	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
+		ddi_soft_state_free(zvol_state, minor);
+		dmu_objset_close(os);
+		mutex_exit(&zvol_state_lock);
+		return (EAGAIN);
+	}
+
+	(void) sprintf(blkbuf, "%uc", minor);
+
+	if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
+	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
+		ddi_remove_minor_node(zfs_dip, chrbuf);
+		ddi_soft_state_free(zvol_state, minor);
+		dmu_objset_close(os);
+		mutex_exit(&zvol_state_lock);
+		return (EAGAIN);
+	}
+
+	zv = ddi_get_soft_state(zvol_state, minor);
+
+	(void) strcpy(zv->zv_name, name);
+	zv->zv_min_bs = DEV_BSHIFT;
+	zv->zv_minor = minor;
+	zv->zv_volsize = volsize;
+	zv->zv_objset = os;
+	zv->zv_mode = ds_mode;
+
+	zvol_size_changed(zv, dev);
+
+	/* Track future "readonly" property changes on this dataset. */
+	VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset),
+	    "readonly", zvol_readonly_changed_cb, zv) == 0);
+
+	zvol_minors++;
+
+	mutex_exit(&zvol_state_lock);
+
+	return (0);
+}
+
+/*
+ * Remove minor node for the specified volume.
+ */
+/*
+ * Tear down the named volume's minor nodes and in-core state.  Refused
+ * with EBUSY while the device is open; mirrors zvol_create_minor().
+ */
+int
+zvol_remove_minor(zfs_cmd_t *zc)
+{
+	zvol_state_t *zv;
+	char namebuf[30];
+
+	mutex_enter(&zvol_state_lock);
+
+	if ((zv = zvol_minor_lookup(zc->zc_name)) == NULL) {
+		mutex_exit(&zvol_state_lock);
+		return (ENXIO);
+	}
+
+	if (zv->zv_total_opens != 0) {
+		mutex_exit(&zvol_state_lock);
+		return (EBUSY);
+	}
+
+	(void) sprintf(namebuf, "%uc,raw", zv->zv_minor);
+	ddi_remove_minor_node(zfs_dip, namebuf);
+
+	(void) sprintf(namebuf, "%uc", zv->zv_minor);
+	ddi_remove_minor_node(zfs_dip, namebuf);
+
+	VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset),
+	    "readonly", zvol_readonly_changed_cb, zv) == 0);
+
+	dmu_objset_close(zv->zv_objset);
+
+	zv->zv_objset = NULL;
+
+	ddi_soft_state_free(zvol_state, zv->zv_minor);
+
+	zvol_minors--;
+
+	mutex_exit(&zvol_state_lock);
+
+	return (0);
+}
+
+/*
+ * Resize the volume: persist the new size in the props ZAP and free any
+ * data beyond it, all in one transaction; update in-core state and the
+ * device properties only once the transaction has succeeded.
+ */
+int
+zvol_set_volsize(zfs_cmd_t *zc)
+{
+	zvol_state_t *zv;
+	dev_t dev = zc->zc_dev;
+	dmu_tx_t *tx;
+	int error;
+
+	if ((error = zvol_check_volsize(zc)) != 0)
+		return (error);
+
+	mutex_enter(&zvol_state_lock);
+
+	if ((zv = zvol_minor_lookup(zc->zc_name)) == NULL) {
+		mutex_exit(&zvol_state_lock);
+		return (ENXIO);
+	}
+
+	if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) {
+		mutex_exit(&zvol_state_lock);
+		return (EROFS);
+	}
+
+	tx = dmu_tx_create(zv->zv_objset);
+	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, 1);
+	dmu_tx_hold_free(tx, ZVOL_OBJ, zc->zc_volsize, DMU_OBJECT_END);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		mutex_exit(&zvol_state_lock);
+		return (error);
+	}
+
+	error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1,
+	    &zc->zc_volsize, tx);
+	if (error == 0)
+		/* Shrinking: release blocks past the new end of volume. */
+		dmu_free_range(zv->zv_objset, ZVOL_OBJ, zc->zc_volsize,
+		    DMU_OBJECT_END, tx);
+
+	dmu_tx_commit(tx);
+
+	if (error == 0) {
+		zv->zv_volsize = zc->zc_volsize;
+		zvol_size_changed(zv, dev);
+	}
+
+	mutex_exit(&zvol_state_lock);
+
+	return (error);
+}
+
+/*
+ * Set the block size of the volume named in zc->zc_name to
+ * zc->zc_volblocksize.  Returns ENXIO if no such minor, EROFS if the
+ * volume is read-only, EBUSY if the blocksize can no longer be changed
+ * (see below), or the tx-assignment error.
+ */
+int
+zvol_set_volblocksize(zfs_cmd_t *zc)
+{
+	zvol_state_t *zv;
+	dmu_tx_t *tx;
+	int error;
+
+	mutex_enter(&zvol_state_lock);
+
+	if ((zv = zvol_minor_lookup(zc->zc_name)) == NULL) {
+		mutex_exit(&zvol_state_lock);
+		return (ENXIO);
+	}
+
+	if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) {
+		mutex_exit(&zvol_state_lock);
+		return (EROFS);
+	}
+
+	tx = dmu_tx_create(zv->zv_objset);
+	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+	} else {
+		error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
+		    zc->zc_volblocksize, 0, tx);
+		/*
+		 * Report ENOTSUP from the DMU as EBUSY to the caller —
+		 * presumably it means the object already has data laid
+		 * out at the old blocksize (NOTE(review): confirm against
+		 * dmu_object_set_blocksize()'s contract).
+		 */
+		if (error == ENOTSUP)
+			error = EBUSY;
+		dmu_tx_commit(tx);
+	}
+
+	mutex_exit(&zvol_state_lock);
+
+	return (error);
+}
+
+/*
+ * Device open entry point.  Minor 0 is the control device and always
+ * opens successfully; any other minor must have an attached zvol_state.
+ * FWRITE opens of a read-only volume fail with EROFS.
+ */
+/*ARGSUSED*/
+int
+zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
+{
+	minor_t minor = getminor(*devp);
+	zvol_state_t *zv;
+
+	if (minor == 0)			/* This is the control device */
+		return (0);
+
+	mutex_enter(&zvol_state_lock);
+
+	zv = ddi_get_soft_state(zvol_state, minor);
+	if (zv == NULL) {
+		mutex_exit(&zvol_state_lock);
+		return (ENXIO);
+	}
+
+	ASSERT(zv->zv_objset != NULL);
+
+	/* Writable open of a read-only volume is refused up front. */
+	if ((flag & FWRITE) &&
+	    (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY))) {
+		mutex_exit(&zvol_state_lock);
+		return (EROFS);
+	}
+
+	/*
+	 * Count the open: layered (OTYP_LYR) opens are counted every
+	 * time, other open types only on the first open of that type
+	 * (matching the one close the framework delivers per type).
+	 */
+	if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
+		zv->zv_open_count[otyp]++;
+		zv->zv_total_opens++;
+	}
+
+	mutex_exit(&zvol_state_lock);
+
+	return (0);
+}
+
+/*
+ * Device close entry point.  Decrements the per-type and total open
+ * counts recorded by zvol_open().  Minor 0 (the control device) is a
+ * no-op; an unknown minor returns ENXIO.
+ */
+/*ARGSUSED*/
+int
+zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
+{
+	minor_t minor = getminor(dev);
+	zvol_state_t *zv;
+
+	if (minor == 0)		/* This is the control device */
+		return (0);
+
+	mutex_enter(&zvol_state_lock);
+
+	zv = ddi_get_soft_state(zvol_state, minor);
+	if (zv == NULL) {
+		mutex_exit(&zvol_state_lock);
+		return (ENXIO);
+	}
+
+	/*
+	 * The next statement is a workaround for the following DDI bug:
+	 * 6343604 specfs race: multiple "last-close" of the same device
+	 */
+	if (zv->zv_total_opens == 0) {
+		mutex_exit(&zvol_state_lock);
+		return (0);
+	}
+
+	/*
+	 * If the open count is zero, this is a spurious close.
+	 * That indicates a bug in the kernel / DDI framework.
+	 */
+	ASSERT(zv->zv_open_count[otyp] != 0);
+	ASSERT(zv->zv_total_opens != 0);
+
+	/*
+	 * You may get multiple opens, but only one close.
+	 */
+	zv->zv_open_count[otyp]--;
+	zv->zv_total_opens--;
+
+	mutex_exit(&zvol_state_lock);
+
+	return (0);
+}
+
+/*
+ * Block-device strategy routine: service one buf(9S) against the DMU.
+ * The request is processed in chunks of at most 1MB; reads go through
+ * dmu_read_canfail(), writes each get their own transaction.  Errors
+ * are reported through bioerror()/biodone(); the function itself
+ * always returns 0, per the strategy(9E) convention of reporting
+ * failures on the buf.
+ */
+int
+zvol_strategy(buf_t *bp)
+{
+	zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(bp->b_edev));
+	uint64_t off, volsize;
+	size_t size, resid;
+	char *addr;
+	int error = 0;
+
+	if (zv == NULL) {
+		bioerror(bp, ENXIO);
+		biodone(bp);
+		return (0);
+	}
+
+	/* No I/O on the control device (minor 0). */
+	if (getminor(bp->b_edev) == 0) {
+		bioerror(bp, EINVAL);
+		biodone(bp);
+		return (0);
+	}
+
+	if (zv->zv_readonly && !(bp->b_flags & B_READ)) {
+		bioerror(bp, EROFS);
+		biodone(bp);
+		return (0);
+	}
+
+	/* Byte offset of the request; b_blkno is in 512-byte sectors. */
+	off = ldbtob(bp->b_blkno);
+	volsize = zv->zv_volsize;
+
+	ASSERT(zv->zv_objset != NULL);
+
+	/* Map the buffer into kernel address space for the DMU copies. */
+	bp_mapin(bp);
+	addr = bp->b_un.b_addr;
+	resid = bp->b_bcount;
+
+	while (resid != 0 && off < volsize) {
+
+		size = MIN(resid, 1UL << 20);	/* cap at 1MB per tx */
+
+		if (size > volsize - off)	/* don't write past the end */
+			size = volsize - off;
+
+		if (bp->b_flags & B_READ) {
+			error = dmu_read_canfail(zv->zv_objset, ZVOL_OBJ,
+			    off, size, addr);
+		} else {
+			/* Each write chunk is its own transaction. */
+			dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+			dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
+			error = dmu_tx_assign(tx, TXG_WAIT);
+			if (error) {
+				dmu_tx_abort(tx);
+			} else {
+				dmu_write(zv->zv_objset, ZVOL_OBJ,
+				    off, size, addr, tx);
+				dmu_tx_commit(tx);
+			}
+		}
+		if (error)
+			break;
+		off += size;
+		addr += size;
+		resid -= size;
+	}
+
+	/*
+	 * Only flag an error if nothing was transferred at all; a
+	 * request starting past the end of the volume yields EINVAL.
+	 * NOTE(review): the boundary test is 'off > volsize' — a request
+	 * starting exactly at volsize falls through to 'error' (0);
+	 * confirm that is the intended semantic for an at-EOF request.
+	 */
+	if ((bp->b_resid = resid) == bp->b_bcount)
+		bioerror(bp, off > volsize ? EINVAL : error);
+
+	biodone(bp);
+	return (0);
+}
+
+/*
+ * Character-device read: route the uio through physio(9F), which
+ * breaks it up and calls zvol_strategy() for the actual transfer.
+ */
+/*ARGSUSED*/
+int
+zvol_read(dev_t dev, uio_t *uiop, cred_t *cr)
+{
+	return (physio(zvol_strategy, NULL, dev, B_READ, minphys, uiop));
+}
+
+/*
+ * Character-device write: route the uio through physio(9F), which
+ * breaks it up and calls zvol_strategy() for the actual transfer.
+ */
+/*ARGSUSED*/
+int
+zvol_write(dev_t dev, uio_t *uiop, cred_t *cr)
+{
+	return (physio(zvol_strategy, NULL, dev, B_WRITE, minphys, uiop));
+}
+
+/*
+ * Async read entry point: aphysio(9F) drives zvol_strategy() for the
+ * aio request; anocancel means the request cannot be cancelled.
+ */
+/*ARGSUSED*/
+int
+zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr)
+{
+	return (aphysio(zvol_strategy, anocancel, dev, B_READ, minphys, aio));
+}
+
+/*
+ * Async write entry point: aphysio(9F) drives zvol_strategy() for the
+ * aio request; anocancel means the request cannot be cancelled.
+ */
+/*ARGSUSED*/
+int
+zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr)
+{
+	return (aphysio(zvol_strategy, anocancel, dev, B_WRITE, minphys, aio));
+}
+
+/*
+ * Dirtbag ioctls to support mkfs(1M) for UFS filesystems. See dkio(7I).
+ *
+ * Supported commands:
+ *   DKIOCINFO	    - fabricated controller info ("zvol", DKC_UNKNOWN).
+ *   DKIOCGMEDIAINFO - logical block size and capacity of the volume.
+ *   DKIOCGETEFI    - a minimal synthesized EFI label: one GPT header
+ *		      plus one reserved partition spanning the volume.
+ * Everything else returns ENOTSUP.
+ */
+/*ARGSUSED*/
+int
+zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
+{
+	zvol_state_t *zv;
+	struct dk_cinfo dkc;
+	struct dk_minfo dkm;
+	dk_efi_t efi;
+	efi_gpt_t gpt;
+	efi_gpe_t gpe;
+	struct uuid uuid = EFI_RESERVED;
+	uint32_t crc;
+	int error = 0;
+
+	mutex_enter(&zvol_state_lock);
+
+	zv = ddi_get_soft_state(zvol_state, getminor(dev));
+
+	if (zv == NULL) {
+		mutex_exit(&zvol_state_lock);
+		return (ENXIO);
+	}
+
+	switch (cmd) {
+
+	case DKIOCINFO:
+		/* Fabricated controller info; nothing here is per-volume. */
+		bzero(&dkc, sizeof (dkc));
+		(void) strcpy(dkc.dki_cname, "zvol");
+		(void) strcpy(dkc.dki_dname, "zvol");
+		dkc.dki_ctype = DKC_UNKNOWN;
+		dkc.dki_maxtransfer = 1 << 15;
+		/* Drop the lock before the (potentially faulting) copyout. */
+		mutex_exit(&zvol_state_lock);
+		if (ddi_copyout(&dkc, (void *)arg, sizeof (dkc), flag))
+			error = EFAULT;
+		return (error);
+
+	case DKIOCGMEDIAINFO:
+		/* Report size/capacity in units of the minimum block size. */
+		bzero(&dkm, sizeof (dkm));
+		dkm.dki_lbsize = 1U << zv->zv_min_bs;
+		dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
+		dkm.dki_media_type = DK_UNKNOWN;
+		mutex_exit(&zvol_state_lock);
+		if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
+			error = EFAULT;
+		return (error);
+
+	case DKIOCGETEFI:
+		if (ddi_copyin((void *)arg, &efi, sizeof (dk_efi_t), flag)) {
+			mutex_exit(&zvol_state_lock);
+			return (EFAULT);
+		}
+
+		bzero(&gpt, sizeof (gpt));
+		bzero(&gpe, sizeof (gpe));
+
+		/* Caller's buffer pointer travels in the 64-bit field. */
+		efi.dki_data = (void *)(uintptr_t)efi.dki_data_64;
+
+		/* Need room for exactly one GPT header + one entry. */
+		if (efi.dki_length < sizeof (gpt) + sizeof (gpe)) {
+			mutex_exit(&zvol_state_lock);
+			return (EINVAL);
+		}
+
+		efi.dki_length = sizeof (gpt) + sizeof (gpe);
+
+		/*
+		 * Synthesize a one-partition EFI label covering the whole
+		 * volume.  All on-disk EFI fields are little-endian.
+		 */
+		gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
+		gpt.efi_gpt_Revision = LE_32(EFI_VERSION102);
+		gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
+		gpt.efi_gpt_FirstUsableLBA = LE_64(0ULL);
+		gpt.efi_gpt_LastUsableLBA =
+		    LE_64((zv->zv_volsize >> zv->zv_min_bs) - 1);
+		gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
+		gpt.efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (gpe));
+
+		UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
+		gpe.efi_gpe_StartingLBA = gpt.efi_gpt_FirstUsableLBA;
+		gpe.efi_gpe_EndingLBA = gpt.efi_gpt_LastUsableLBA;
+
+		/* Entry-array CRC first, then the header CRC over the GPT. */
+		CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
+		gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
+
+		CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
+		gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
+
+		mutex_exit(&zvol_state_lock);
+		/* Header first, then the single entry immediately after it. */
+		if (ddi_copyout(&gpt, efi.dki_data, sizeof (gpt), flag) ||
+		    ddi_copyout(&gpe, efi.dki_data + 1, sizeof (gpe), flag))
+			error = EFAULT;
+		return (error);
+
+	default:
+		error = ENOTSUP;
+		break;
+
+	}
+	mutex_exit(&zvol_state_lock);
+	return (error);
+}
+
+/*
+ * Return nonzero if any zvol minors exist — used to decide whether
+ * the module/driver can be unloaded.
+ */
+int
+zvol_busy(void)
+{
+	return (zvol_minors != 0);
+}
+
+/*
+ * One-time module initialization: allocate the per-minor soft-state
+ * table and the global state lock.
+ */
+void
+zvol_init(void)
+{
+	VERIFY(ddi_soft_state_init(&zvol_state, sizeof (zvol_state_t), 1) == 0);
+	mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+/*
+ * Module teardown: release the global state lock and the soft-state
+ * table created by zvol_init().
+ */
+void
+zvol_fini(void)
+{
+	mutex_destroy(&zvol_state_lock);
+	ddi_soft_state_fini(&zvol_state);
+}