diff options
author | ahrens <none@none> | 2005-10-31 11:33:35 -0800 |
---|---|---|
committer | ahrens <none@none> | 2005-10-31 11:33:35 -0800 |
commit | fa9e4066f08beec538e775443c5be79dd423fcab (patch) | |
tree | 576d99665e57bb7cb70584431adb08c14d47e3ce /usr/src/uts/common/fs | |
parent | f1b64740276f67fc6914c1d855f2af601efe99ac (diff) | |
download | illumos-gate-fa9e4066f08beec538e775443c5be79dd423fcab.tar.gz |
PSARC 2002/240 ZFS
6338653 Integrate ZFS
PSARC 2004/652 - DKIOCFLUSH
5096886 Write caching disks need mechanism to flush cache to physical media
Diffstat (limited to 'usr/src/uts/common/fs')
118 files changed, 50463 insertions, 146 deletions
diff --git a/usr/src/uts/common/fs/ctfs/ctfs_all.c b/usr/src/uts/common/fs/ctfs/ctfs_all.c index dd3eeb15b6..4933edd960 100644 --- a/usr/src/uts/common/fs/ctfs/ctfs_all.c +++ b/usr/src/uts/common/fs/ctfs/ctfs_all.c @@ -99,7 +99,7 @@ ctfs_adir_do_lookup(vnode_t *vp, const char *nm, vnode_t **vpp, ino64_t *inop) if (*nm != '\0') return (ENOENT); - ct = contract_ptr(i, VTOZ(vp)->zone_uniqid); + ct = contract_ptr(i, VTOZONE(vp)->zone_uniqid); if (ct == NULL) return (ENOENT); @@ -118,7 +118,7 @@ ctfs_adir_do_readdir(vnode_t *vp, struct dirent64 *dp, int *eofp, uint64_t zuniqid; ctid_t next; - zuniqid = VTOZ(vp)->zone_uniqid; + zuniqid = VTOZONE(vp)->zone_uniqid; next = contract_lookup(zuniqid, *offp); if (next == -1) { diff --git a/usr/src/uts/common/fs/ctfs/ctfs_ctl.c b/usr/src/uts/common/fs/ctfs/ctfs_ctl.c index a13091826c..f4980d4a97 100644 --- a/usr/src/uts/common/fs/ctfs/ctfs_ctl.c +++ b/usr/src/uts/common/fs/ctfs/ctfs_ctl.c @@ -249,11 +249,11 @@ ctfs_stat_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, detail = STRUCT_FGET(st, ctst_detail); if (detail == CTD_COMMON) { mutex_enter(&ct->ct_lock); - contract_status_common(ct, VTOZ(vp), STRUCT_BUF(st), mdl); + contract_status_common(ct, VTOZONE(vp), STRUCT_BUF(st), mdl); mutex_exit(&ct->ct_lock); } else if (detail <= CTD_ALL) { VERIFY(nvlist_alloc(&foo, NV_UNIQUE_NAME, KM_SLEEP) == 0); - type->ct_type_ops->contop_status(ct, VTOZ(vp), detail, foo, + type->ct_type_ops->contop_status(ct, VTOZONE(vp), detail, foo, STRUCT_BUF(st), mdl); VERIFY(nvlist_pack(foo, &bufp, &len, NV_ENCODE_NATIVE, KM_SLEEP) == 0); diff --git a/usr/src/uts/common/fs/ctfs/ctfs_event.c b/usr/src/uts/common/fs/ctfs/ctfs_event.c index afb08a7cfc..7fa7cfb608 100644 --- a/usr/src/uts/common/fs/ctfs/ctfs_event.c +++ b/usr/src/uts/common/fs/ctfs/ctfs_event.c @@ -287,7 +287,7 @@ ctfs_ev_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, ctfs_evnode_t *evnode = vp->v_data; return (ctfs_endpoint_ioctl(&evnode->ctfs_ev_listener, 
cmd, arg, cr, - VTOZ(vp), 0)); + VTOZONE(vp), 0)); } /* @@ -430,7 +430,7 @@ ctfs_bu_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, ctfs_bunode_t *bunode = vp->v_data; return (ctfs_endpoint_ioctl(&bunode->ctfs_bu_listener, cmd, arg, cr, - VTOZ(vp), bunode->ctfs_bu_queue->ctq_listno == CTEL_BUNDLE)); + VTOZONE(vp), bunode->ctfs_bu_queue->ctq_listno == CTEL_BUNDLE)); } /* diff --git a/usr/src/uts/common/fs/ctfs/ctfs_tdir.c b/usr/src/uts/common/fs/ctfs/ctfs_tdir.c index 479f64b064..1f5dd42370 100644 --- a/usr/src/uts/common/fs/ctfs/ctfs_tdir.c +++ b/usr/src/uts/common/fs/ctfs/ctfs_tdir.c @@ -108,7 +108,7 @@ ctfs_tdir_do_readdir(vnode_t *vp, struct dirent64 *dp, int *eofp, ctid_t next; ct_type_t *ty = ct_types[gfs_file_index(vp)]; - zuniqid = VTOZ(vp)->zone_uniqid; + zuniqid = VTOZONE(vp)->zone_uniqid; next = contract_type_lookup(ty, zuniqid, *offp); if (next == -1) { @@ -135,7 +135,7 @@ ctfs_tdir_do_lookup(vnode_t *vp, const char *nm, vnode_t **vpp, ino64_t *inop) return (ENOENT); ct = contract_type_ptr(ct_types[gfs_file_index(vp)], i, - VTOZ(vp)->zone_uniqid); + VTOZONE(vp)->zone_uniqid); if (ct == NULL) return (ENOENT); diff --git a/usr/src/uts/common/fs/devfs/devfs_subr.c b/usr/src/uts/common/fs/devfs/devfs_subr.c index 0f53a24ca0..864ed2ad60 100644 --- a/usr/src/uts/common/fs/devfs/devfs_subr.c +++ b/usr/src/uts/common/fs/devfs/devfs_subr.c @@ -569,20 +569,6 @@ dv_vattr_merge(struct dv_node *dv, struct vattr *vap) } /* - * Free a vsecattr - */ -static void -dv_free_vsa(struct vsecattr *vsap) -{ - if (vsap->vsa_aclcnt > 0 && vsap->vsa_aclentp) - kmem_free(vsap->vsa_aclentp, - vsap->vsa_aclcnt * sizeof (aclent_t)); - if (vsap->vsa_dfaclcnt > 0 && vsap->vsa_dfaclentp) - kmem_free(vsap->vsa_dfaclentp, - vsap->vsa_dfaclcnt * sizeof (aclent_t)); -} - -/* * dv_shadow_node * * Given a VDIR dv_node, find/create the associated VDIR @@ -623,7 +609,6 @@ dv_shadow_node( int create_tried; int error; mperm_t mp; - struct vsecattr vsa; ASSERT(vp->v_type == VDIR || 
vp->v_type == VCHR || vp->v_type == VBLK); dv = VTODV(vp); @@ -678,19 +663,14 @@ lookup: dv->dv_attrvp = rvp; /* with one hold */ /* - * Determine if we have (non-trivial) ACLs on this node. - * NB: This should be changed call fs_acl_nontrivial for - * new ACE flavor ACLs. + * Determine if we have non-trivial ACLs on this node. + * It is not necessary to VOP_RWLOCK since fs_acl_nontrivial + * only does VOP_GETSECATTR. */ - vsa.vsa_mask = VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT; - error = VOP_GETSECATTR(rvp, &vsa, 0, cred); dv->dv_flags &= ~DV_ACL; - if (error == 0) { - if (vsa.vsa_aclcnt > MIN_ACL_ENTRIES) { - dv->dv_flags |= DV_ACL; /* non-trivial ACL */ - } - dv_free_vsa(&vsa); - } + + if (fs_acl_nontrivial(rvp, cred)) + dv->dv_flags |= DV_ACL; /* * If we have synced out the memory attributes, free diff --git a/usr/src/uts/common/fs/devfs/devfs_vnops.c b/usr/src/uts/common/fs/devfs/devfs_vnops.c index 7a3d4c1c04..b8dfce5448 100644 --- a/usr/src/uts/common/fs/devfs/devfs_vnops.c +++ b/usr/src/uts/common/fs/devfs/devfs_vnops.c @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -621,7 +621,6 @@ devfs_getsecattr(struct vnode *vp, struct vsecattr *vsap, int flags, error = VOP_GETSECATTR(avp, vsap, flags, cr); dsysdebug(error, ("vop_getsecattr %s %d\n", VTODV(vp)->dv_name, error)); - rw_exit(&dv->dv_contents); return (error); } @@ -678,10 +677,11 @@ devfs_setsecattr(struct vnode *vp, struct vsecattr *vsap, int flags, VOP_RWUNLOCK(avp, V_WRITELOCK_TRUE, NULL); /* - * NB: This code should call fs_acl_nontrivial when available so that - * DV_ACL is only set on nontrivial ACLs. + * Set DV_ACL if we have a non-trivial set of ACLs. It is not + * necessary to hold VOP_RWLOCK since fs_acl_nontrivial only does + * VOP_GETSECATTR calls. 
*/ - if (error == 0) + if (fs_acl_nontrivial(avp, cr)) dv->dv_flags |= DV_ACL; return (error); } diff --git a/usr/src/uts/common/fs/fs_subr.c b/usr/src/uts/common/fs/fs_subr.c index 7fc9dc4277..3466db3832 100644 --- a/usr/src/uts/common/fs/fs_subr.c +++ b/usr/src/uts/common/fs/fs_subr.c @@ -24,7 +24,7 @@ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -57,6 +57,7 @@ #include <sys/kmem.h> #include <sys/file.h> #include <sys/nbmlock.h> +#include <acl/acl_common.h> static callb_cpr_t *frlock_serialize_blocked(flk_cb_when_t, void *); @@ -632,3 +633,84 @@ fs_vnevent_support(vnode_t *vp, vnevent_t vnevent) ASSERT(vp != NULL); return (0); } + +/* + * return 1 for non-trivial ACL. + * + * NB: It is not necessary for the caller to VOP_RWLOCK since + * we only issue VOP_GETSECATTR. + * + * Returns 0 == trivial + * 1 == NOT Trivial + * <0 could not determine. + */ +int +fs_acl_nontrivial(vnode_t *vp, cred_t *cr) +{ + ulong_t acl_styles; + ulong_t acl_flavor; + vsecattr_t vsecattr; + int error; + int isnontrivial; + + /* determine the forms of ACLs maintained */ + error = VOP_PATHCONF(vp, _PC_ACL_ENABLED, &acl_styles, cr); + + /* clear bits we don't understand and establish default acl_style */ + acl_styles &= (_ACL_ACLENT_ENABLED | _ACL_ACE_ENABLED); + if (error || (acl_styles == 0)) + acl_styles = _ACL_ACLENT_ENABLED; + + vsecattr.vsa_aclentp = NULL; + vsecattr.vsa_dfaclentp = NULL; + vsecattr.vsa_aclcnt = 0; + vsecattr.vsa_dfaclcnt = 0; + + while (acl_styles) { + /* select one of the styles as current flavor */ + acl_flavor = 0; + if (acl_styles & _ACL_ACLENT_ENABLED) { + acl_flavor = _ACL_ACLENT_ENABLED; + vsecattr.vsa_mask = VSA_ACLCNT | VSA_DFACLCNT; + } else if (acl_styles & _ACL_ACE_ENABLED) { + acl_flavor = _ACL_ACE_ENABLED; + vsecattr.vsa_mask = VSA_ACECNT | VSA_ACE; + } + + ASSERT(vsecattr.vsa_mask && acl_flavor); + error = VOP_GETSECATTR(vp, 
&vsecattr, 0, cr); + if (error == 0) + break; + + /* that flavor failed */ + acl_styles &= ~acl_flavor; + } + + /* if all styles fail then assume trivial */ + if (acl_styles == 0) + return (0); + + /* process the flavor that worked */ + isnontrivial = 0; + if (acl_flavor & _ACL_ACLENT_ENABLED) { + if (vsecattr.vsa_aclcnt > MIN_ACL_ENTRIES) + isnontrivial = 1; + if (vsecattr.vsa_aclcnt && vsecattr.vsa_aclentp != NULL) + kmem_free(vsecattr.vsa_aclentp, + vsecattr.vsa_aclcnt * sizeof (aclent_t)); + if (vsecattr.vsa_dfaclcnt && vsecattr.vsa_dfaclentp != NULL) + kmem_free(vsecattr.vsa_dfaclentp, + vsecattr.vsa_dfaclcnt * sizeof (aclent_t)); + } + if (acl_flavor & _ACL_ACE_ENABLED) { + + isnontrivial = ace_trivial(vsecattr.vsa_aclentp, + vsecattr.vsa_aclcnt); + + if (vsecattr.vsa_aclcnt && vsecattr.vsa_aclentp != NULL) + kmem_free(vsecattr.vsa_aclentp, + vsecattr.vsa_aclcnt * sizeof (ace_t)); + /* ACE has no vsecattr.vsa_dfaclcnt */ + } + return (isnontrivial); +} diff --git a/usr/src/uts/common/fs/fs_subr.h b/usr/src/uts/common/fs/fs_subr.h index 27fc845718..8cd453edba 100644 --- a/usr/src/uts/common/fs/fs_subr.h +++ b/usr/src/uts/common/fs/fs_subr.h @@ -23,7 +23,7 @@ /* All Rights Reserved */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -81,6 +81,7 @@ extern int fs_shrlock(struct vnode *, int, struct shrlock *, int, cred_t *); extern int fs_vnevent_nosupport(vnode_t *, vnevent_t); extern int fs_vnevent_support(vnode_t *, vnevent_t); +extern int fs_acl_nontrivial(struct vnode *vp, struct cred *cr); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/fs/lookup.c b/usr/src/uts/common/fs/lookup.c index 7fd7f66510..b7fdf996e2 100644 --- a/usr/src/uts/common/fs/lookup.c +++ b/usr/src/uts/common/fs/lookup.c @@ -789,7 +789,7 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf, size_t dbuflen; struct iovec iov; struct uio uio; - int err; + int error; int eof; vnode_t *cmpvp; struct dirent64 *dp; @@ -811,8 +811,8 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf, uio.uio_extflg = UIO_COPY_CACHED; uio.uio_loffset = 0; - if ((err = VOP_ACCESS(dvp, VREAD, 0, cr)) != 0) - return (err); + if ((error = VOP_ACCESS(dvp, VREAD, 0, cr)) != 0) + return (error); while (!eof) { uio.uio_resid = dlen; @@ -820,12 +820,12 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf, iov.iov_len = dlen; (void) VOP_RWLOCK(dvp, V_WRITELOCK_FALSE, NULL); - err = VOP_READDIR(dvp, &uio, cr, &eof); + error = VOP_READDIR(dvp, &uio, cr, &eof); VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL); dbuflen = dlen - uio.uio_resid; - if (err || dbuflen == 0) + if (error || dbuflen == 0) break; dp = (dirent64_t *)dbuf; @@ -840,7 +840,7 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf, continue; } - err = VOP_LOOKUP(dvp, dp->d_name, &cmpvp, &pnp, 0, + error = VOP_LOOKUP(dvp, dp->d_name, &cmpvp, &pnp, 0, vrootp, cr); /* @@ -849,7 +849,7 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf, * just removed an entry since the readdir() call, and * the entry we want is further on in the directory. 
*/ - if (err == 0) { + if (error == 0) { if (vnode_match(tvp, cmpvp, cr)) { VN_RELE(cmpvp); *rdp = dp; @@ -857,8 +857,8 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf, } VN_RELE(cmpvp); - } else if (err != ENOENT) { - return (err); + } else if (error != ENOENT) { + return (error); } dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen); @@ -868,13 +868,26 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf, /* * Something strange has happened, this directory does not contain the * specified vnode. This should never happen in the normal case, since - * we ensured that dvp is the parent of vp. This may be possible in - * some race conditions, so fail gracefully. + * we ensured that dvp is the parent of vp. This is possible in some + * rare conditions (races and the special .zfs directory). */ - if (err == 0) - err = ENOENT; + if (error == 0) { + error = VOP_LOOKUP(dvp, ".zfs", &cmpvp, &pnp, 0, vrootp, cr); + if (error == 0) { + if (vnode_match(tvp, cmpvp, cr)) { + (void) strcpy(dp->d_name, ".zfs"); + dp->d_reclen = strlen(".zfs"); + dp->d_off = 2; + dp->d_ino = 1; + *rdp = dp; + } else { + error = ENOENT; + } + VN_RELE(cmpvp); + } + } - return (err); + return (error); } /* diff --git a/usr/src/uts/common/fs/nfs/nfs4_acl.c b/usr/src/uts/common/fs/nfs/nfs4_acl.c index 9b584f6256..96aa1756e9 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_acl.c +++ b/usr/src/uts/common/fs/nfs/nfs4_acl.c @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -80,10 +80,15 @@ static int ace4_list_to_aent(ace4_list_t *, aclent_t **, int *, uid_t, gid_t, static int ln_ace4_to_aent(nfsace4 *ace4, int n, uid_t, gid_t, aclent_t **, int *, aclent_t **, int *, int, int, int); static int ace4_cmp(nfsace4 *, nfsace4 *); -static int acet_to_ace4(ace_t *, nfsace4 *, int, int); -static int ace4_to_acet(nfsace4 *, ace_t *, uid_t, gid_t, int, int, int); +static int acet_to_ace4(ace_t *, nfsace4 *, int); +static int ace4_to_acet(nfsace4 *, ace_t *, uid_t, gid_t, int, int); static int validate_idmapping(utf8string *, uid_t, int, int, int); static int u8s_mapped_to_nobody(utf8string *, uid_t, int); +static void ace4_mask_to_acet_mask(acemask4, uint32_t *); +static void acet_mask_to_ace4_mask(uint32_t, acemask4 *); +static void ace4_flags_to_acet_flags(aceflag4, uint16_t *); +static void acet_flags_to_ace4_flags(uint16_t, aceflag4 *); + /* * The following two functions check and set ACE4_SYNCRONIZE, ACE4_WRITE_OWNER, * ACE4_DELETE and ACE4_WRITE_ATTRIBUTES. @@ -1651,7 +1656,7 @@ ln_ace4_cmp(nfsace4 *a, nfsace4* b, int n) * strings versus integer uid/gids. 
*/ static int -acet_to_ace4(ace_t *ace, nfsace4 *nfsace4, int isdir, int isserver) +acet_to_ace4(ace_t *ace, nfsace4 *nfsace4, int isserver) { int error = 0; @@ -1669,44 +1674,45 @@ acet_to_ace4(ace_t *ace, nfsace4 *nfsace4, int isdir, int isserver) } switch (ace->a_type) { - case ALLOW: + case ACE_ACCESS_ALLOWED_ACE_TYPE: nfsace4->type = ACE4_ACCESS_ALLOWED_ACE_TYPE; break; - case DENY: + case ACE_ACCESS_DENIED_ACE_TYPE: nfsace4->type = ACE4_ACCESS_DENIED_ACE_TYPE; break; default: + NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE, + "acet_to_ace4: unsupported type: %x", ace->a_type)); error = ENOTSUP; break; } if (error != 0) goto out; - nfsace4->access_mask = mode_to_ace4_access(ace->a_access_mask, - isdir, ace->a_flags & ACE_OWNER, ace->a_type == ALLOW, isserver); + acet_mask_to_ace4_mask(ace->a_access_mask, &nfsace4->access_mask); + acet_flags_to_ace4_flags(ace->a_flags, &nfsace4->flag); - nfsace4->flag = (ace->a_flags & ACE_NFSV4_SUP_FLAGS); - if (ace->a_flags & ACE_GROUPS) { + if (ace->a_flags & ACE_GROUP) { + nfsace4->flag |= ACE4_IDENTIFIER_GROUP; + (void) str_to_utf8(ACE4_WHO_GROUP, &nfsace4->who); + } else if (ace->a_flags & ACE_IDENTIFIER_GROUP) { nfsace4->flag |= ACE4_IDENTIFIER_GROUP; error = nfs_idmap_gid_str(ace->a_who, &nfsace4->who, isserver); - } else if (ace->a_flags & ACE_USER) { - error = nfs_idmap_uid_str(ace->a_who, &nfsace4->who, isserver); + if (error != 0) + NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE, + "acet_to_ace4: idmap failed with %d", error)); } else if (ace->a_flags & ACE_OWNER) { (void) str_to_utf8(ACE4_WHO_OWNER, &nfsace4->who); - } else if (ace->a_flags & ACE_GROUP) { - nfsace4->flag |= ACE4_IDENTIFIER_GROUP; - (void) str_to_utf8(ACE4_WHO_GROUP, &nfsace4->who); - } else if (ace->a_flags & ACE_OTHER) { + } else if (ace->a_flags & ACE_EVERYONE) { (void) str_to_utf8(ACE4_WHO_EVERYONE, &nfsace4->who); + } else { + error = nfs_idmap_uid_str(ace->a_who, &nfsace4->who, isserver); + if (error != 0) + NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE, + "acet_to_ace4: 
idmap failed with %d", error)); } out: -#ifdef DEBUG - if (error != 0) - NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE, - "acet_to_ace4: idmap failed with %d", error)); -#endif - return (error); } @@ -1716,10 +1722,9 @@ out: */ static int ace4_to_acet(nfsace4 *nfsace4, ace_t *ace, uid_t owner, gid_t group, - int isdir, int isserver, int just_count) + int isserver, int just_count) { int error = 0; - o_mode_t mode; if (nfsace4 == NULL) { NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE, @@ -1734,12 +1739,14 @@ ace4_to_acet(nfsace4 *nfsace4, ace_t *ace, uid_t owner, gid_t group, switch (nfsace4->type) { case ACE4_ACCESS_ALLOWED_ACE_TYPE: - ace->a_type = ALLOW; + ace->a_type = ACE_ACCESS_ALLOWED_ACE_TYPE; break; case ACE4_ACCESS_DENIED_ACE_TYPE: - ace->a_type = DENY; + ace->a_type = ACE_ACCESS_DENIED_ACE_TYPE; break; default: + NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE, + "ace4_to_acet: unsupported type: %x", nfsace4->type)); error = ENOTSUP; break; } @@ -1761,16 +1768,15 @@ ace4_to_acet(nfsace4 *nfsace4, ace_t *ace, uid_t owner, gid_t group, goto out; } - ace->a_access_mask = nfsace4->access_mask; - error = ace4_mask_to_mode(nfsace4->access_mask, &mode, isdir); - if (error != 0) - goto out; - ace->a_access_mask = mode; - if (nfsace4->flag & ~(ACE_NFSV4_SUP_FLAGS | ACE4_IDENTIFIER_GROUP)) { + ace4_mask_to_acet_mask(nfsace4->access_mask, &ace->a_access_mask); + + if (nfsace4->flag & ~ACE_NFSV4_SUP_FLAGS) { + NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE, + "ace4_to_acet: unsupported flags: %x", nfsace4->flag)); error = ENOTSUP; goto out; } - ace->a_flags = (nfsace4->flag & ACE_NFSV4_SUP_FLAGS); + ace4_flags_to_acet_flags(nfsace4->flag, &ace->a_flags); if (nfsace4->flag & ACE4_IDENTIFIER_GROUP) { if ((nfsace4->who.utf8string_len == 6) && @@ -1780,7 +1786,7 @@ ace4_to_acet(nfsace4 *nfsace4, ace_t *ace, uid_t owner, gid_t group, ace->a_flags |= ACE_GROUP; error = 0; } else { - ace->a_flags |= ACE_GROUPS; + ace->a_flags |= ACE_IDENTIFIER_GROUP; error = nfs_idmap_str_gid(&nfsace4->who, &ace->a_who, isserver); if 
(error != 0) { @@ -1807,10 +1813,9 @@ ace4_to_acet(nfsace4 *nfsace4, ace_t *ace, uid_t owner, gid_t group, } else if ((nfsace4->who.utf8string_len == 9) && (bcmp(ACE4_WHO_EVERYONE, nfsace4->who.utf8string_val, 9) == 0)) { - ace->a_flags |= ACE_OTHER; + ace->a_flags |= ACE_EVERYONE; ace->a_who = 0; } else { - ace->a_flags |= ACE_USER; error = nfs_idmap_str_uid(&nfsace4->who, &ace->a_who, isserver); if (error != 0) { @@ -1830,18 +1835,124 @@ ace4_to_acet(nfsace4 *nfsace4, ace_t *ace, uid_t owner, gid_t group, } out: -#ifdef DEBUG - if (error != 0) - NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE, - "ace4_to_acet: idmap failed with %d", error)); -#endif - return (error); } +static void +ace4_mask_to_acet_mask(acemask4 ace4_mask, uint32_t *acet_mask) +{ + *acet_mask = 0; + + if (ace4_mask & ACE4_READ_DATA) + *acet_mask |= ACE_READ_DATA; + if (ace4_mask & ACE4_WRITE_DATA) + *acet_mask |= ACE_WRITE_DATA; + if (ace4_mask & ACE4_APPEND_DATA) + *acet_mask |= ACE_APPEND_DATA; + if (ace4_mask & ACE4_READ_NAMED_ATTRS) + *acet_mask |= ACE_READ_NAMED_ATTRS; + if (ace4_mask & ACE4_WRITE_NAMED_ATTRS) + *acet_mask |= ACE_WRITE_NAMED_ATTRS; + if (ace4_mask & ACE4_EXECUTE) + *acet_mask |= ACE_EXECUTE; + if (ace4_mask & ACE4_DELETE_CHILD) + *acet_mask |= ACE_DELETE_CHILD; + if (ace4_mask & ACE4_READ_ATTRIBUTES) + *acet_mask |= ACE_READ_ATTRIBUTES; + if (ace4_mask & ACE4_WRITE_ATTRIBUTES) + *acet_mask |= ACE_WRITE_ATTRIBUTES; + if (ace4_mask & ACE4_DELETE) + *acet_mask |= ACE_DELETE; + if (ace4_mask & ACE4_READ_ACL) + *acet_mask |= ACE_READ_ACL; + if (ace4_mask & ACE4_WRITE_ACL) + *acet_mask |= ACE_WRITE_ACL; + if (ace4_mask & ACE4_WRITE_OWNER) + *acet_mask |= ACE_WRITE_OWNER; + if (ace4_mask & ACE4_SYNCHRONIZE) + *acet_mask |= ACE_SYNCHRONIZE; +} + +static void +acet_mask_to_ace4_mask(uint32_t acet_mask, acemask4 *ace4_mask) +{ + *ace4_mask = 0; + + if (acet_mask & ACE_READ_DATA) + *ace4_mask |= ACE4_READ_DATA; + if (acet_mask & ACE_WRITE_DATA) + *ace4_mask |= ACE4_WRITE_DATA; + if (acet_mask & 
ACE_APPEND_DATA) + *ace4_mask |= ACE_APPEND_DATA; + if (acet_mask & ACE4_READ_NAMED_ATTRS) + *ace4_mask |= ACE_READ_NAMED_ATTRS; + if (acet_mask & ACE_WRITE_NAMED_ATTRS) + *ace4_mask |= ACE4_WRITE_NAMED_ATTRS; + if (acet_mask & ACE_EXECUTE) + *ace4_mask |= ACE4_EXECUTE; + if (acet_mask & ACE_DELETE_CHILD) + *ace4_mask |= ACE4_DELETE_CHILD; + if (acet_mask & ACE_READ_ATTRIBUTES) + *ace4_mask |= ACE4_READ_ATTRIBUTES; + if (acet_mask & ACE_WRITE_ATTRIBUTES) + *ace4_mask |= ACE4_WRITE_ATTRIBUTES; + if (acet_mask & ACE_DELETE) + *ace4_mask |= ACE4_DELETE; + if (acet_mask & ACE_READ_ACL) + *ace4_mask |= ACE4_READ_ACL; + if (acet_mask & ACE_WRITE_ACL) + *ace4_mask |= ACE4_WRITE_ACL; + if (acet_mask & ACE_WRITE_OWNER) + *ace4_mask |= ACE4_WRITE_OWNER; + if (acet_mask & ACE_SYNCHRONIZE) + *ace4_mask |= ACE4_SYNCHRONIZE; +} + +static void +ace4_flags_to_acet_flags(aceflag4 ace4_flags, uint16_t *acet_flags) +{ + *acet_flags = 0; + + if (ace4_flags & ACE4_FILE_INHERIT_ACE) + *acet_flags |= ACE_FILE_INHERIT_ACE; + if (ace4_flags & ACE4_DIRECTORY_INHERIT_ACE) + *acet_flags |= ACE_DIRECTORY_INHERIT_ACE; + if (ace4_flags & ACE4_NO_PROPAGATE_INHERIT_ACE) + *acet_flags |= ACE_NO_PROPAGATE_INHERIT_ACE; + if (ace4_flags & ACE4_INHERIT_ONLY_ACE) + *acet_flags |= ACE_INHERIT_ONLY_ACE; + if (ace4_flags & ACE4_SUCCESSFUL_ACCESS_ACE_FLAG) + *acet_flags |= ACE_SUCCESSFUL_ACCESS_ACE_FLAG; + if (ace4_flags & ACE4_FAILED_ACCESS_ACE_FLAG) + *acet_flags |= ACE_FAILED_ACCESS_ACE_FLAG; + if (ace4_flags & ACE4_IDENTIFIER_GROUP) + *acet_flags |= ACE_IDENTIFIER_GROUP; +} + +static void +acet_flags_to_ace4_flags(uint16_t acet_flags, aceflag4 *ace4_flags) +{ + *ace4_flags = 0; + + if (acet_flags & ACE_FILE_INHERIT_ACE) + *ace4_flags |= ACE4_FILE_INHERIT_ACE; + if (acet_flags & ACE_DIRECTORY_INHERIT_ACE) + *ace4_flags |= ACE4_DIRECTORY_INHERIT_ACE; + if (acet_flags & ACE_NO_PROPAGATE_INHERIT_ACE) + *ace4_flags |= ACE4_NO_PROPAGATE_INHERIT_ACE; + if (acet_flags & ACE_INHERIT_ONLY_ACE) + *ace4_flags |= 
ACE4_INHERIT_ONLY_ACE; + if (acet_flags & ACE_SUCCESSFUL_ACCESS_ACE_FLAG) + *ace4_flags |= ACE4_SUCCESSFUL_ACCESS_ACE_FLAG; + if (acet_flags & ACE_FAILED_ACCESS_ACE_FLAG) + *ace4_flags |= ACE4_FAILED_ACCESS_ACE_FLAG; + if (acet_flags & ACE_IDENTIFIER_GROUP) + *ace4_flags |= ACE4_IDENTIFIER_GROUP; +} + int vs_ace4_to_acet(vsecattr_t *vs_ace4, vsecattr_t *vs_acet, - uid_t owner, gid_t group, int isdir, int isserver, int just_count) + uid_t owner, gid_t group, int isserver, int just_count) { int error; int i; @@ -1865,7 +1976,7 @@ vs_ace4_to_acet(vsecattr_t *vs_ace4, vsecattr_t *vs_acet, for (i = 0; i < vs_ace4->vsa_aclcnt; i++) { error = ace4_to_acet((nfsace4 *)(vs_ace4->vsa_aclentp) + i, (ace_t *)(vs_acet->vsa_aclentp) + i, owner, group, - isdir, isserver, just_count); + isserver, just_count); if (error != 0) goto out; } @@ -1879,7 +1990,7 @@ out: int vs_acet_to_ace4(vsecattr_t *vs_acet, vsecattr_t *vs_ace4, - int isdir, int isserver) + int isserver) { int error = 0; int i; @@ -1900,7 +2011,7 @@ vs_acet_to_ace4(vsecattr_t *vs_acet, vsecattr_t *vs_ace4, for (i = 0; i < vs_acet->vsa_aclcnt; i++) { error = acet_to_ace4((ace_t *)(vs_acet->vsa_aclentp) + i, - (nfsace4 *)(vs_ace4->vsa_aclentp) + i, isdir, isserver); + (nfsace4 *)(vs_ace4->vsa_aclentp) + i, isserver); if (error != 0) goto out; } diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c b/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c index 6ef0000ea3..6169621a73 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c +++ b/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c @@ -887,8 +887,7 @@ rfs4_fattr4_acl(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg, if (error != 0) break; if (whichacl & _ACL_ACE_ENABLED) { - error = vs_acet_to_ace4(&vs_native, &vs_ace4, - vp->v_type == VDIR, TRUE); + error = vs_acet_to_ace4(&vs_native, &vs_ace4, TRUE); vs_acet_destroy(&vs_native); } else { error = vs_aent_to_ace4(&vs_native, &vs_ace4, @@ -968,8 +967,7 @@ rfs4_fattr4_acl(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg, if 
(whichacl & _ACL_ACE_ENABLED) { error = vs_ace4_to_acet(&vs_ace4, &vs_native, - vap->va_uid, vap->va_gid, vp->v_type == VDIR, TRUE, - FALSE); + vap->va_uid, vap->va_gid, TRUE, FALSE); if (error != 0) break; (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); diff --git a/usr/src/uts/common/fs/nfs/nfs4_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_vnops.c index d07cedb514..9ae1d0a56c 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c @@ -11982,7 +11982,7 @@ nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr) * These are ace_t type entries. */ error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap, - vp->v_type == VDIR, FALSE); + FALSE); if (error) return (error); } @@ -12151,7 +12151,7 @@ nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap, if (orig_mask & (VSA_ACE | VSA_ACECNT)) { error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid, - isdir, FALSE, ((orig_mask & VSA_ACE) ? FALSE : TRUE)); + FALSE, ((orig_mask & VSA_ACE) ? FALSE : TRUE)); if (error) return (error); diff --git a/usr/src/uts/common/fs/nfs/nfs_acl_srv.c b/usr/src/uts/common/fs/nfs/nfs_acl_srv.c index 836297350a..1242f94e10 100644 --- a/usr/src/uts/common/fs/nfs/nfs_acl_srv.c +++ b/usr/src/uts/common/fs/nfs/nfs_acl_srv.c @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. + * Copyright 2005 Sun Microsystems, Inc. * All rights reserved. * Use is subject to license terms. */ @@ -68,6 +68,8 @@ #include <nfs/nfs_clnt.h> #include <nfs/nfs_acl.h> +#include <fs/fs_subr.h> + /* * These are the interface routines for the server side of the * NFS ACL server. See the NFS ACL protocol specification @@ -95,6 +97,25 @@ acl2_getacl(GETACL2args *args, GETACL2res *resp, struct exportinfo *exi, error = VOP_GETSECATTR(vp, &resp->resok.acl, 0, cr); + if (error == ENOSYS) { + /* + * If the underlying file system doesn't support + * aclent_t type acls, fabricate an acl. 
This is + * required in order to to support existing clients + * that require the call to VOP_GETSECATTR to + * succeed while making the assumption that all + * file systems support aclent_t type acls. This + * causes problems for servers exporting ZFS file + * systems because ZFS supports ace_t type acls, + * and fails (with ENOSYS) when asked for aclent_t + * type acls. + * + * Note: if the fs_fab_acl() fails, we have other problems. + * This error should be returned to the caller. + */ + error = fs_fab_acl(vp, &resp->resok.acl, 0, cr); + } + if (error) { VN_RELE(vp); resp->status = puterrno(error); @@ -454,6 +475,25 @@ acl3_getacl(GETACL3args *args, GETACL3res *resp, struct exportinfo *exi, error = VOP_GETSECATTR(vp, &resp->resok.acl, 0, cr); + if (error == ENOSYS) { + /* + * If the underlying file system doesn't support + * aclent_t type acls, fabricate an acl. This is + * required in order to to support existing clients + * that require the call to VOP_GETSECATTR to + * succeed while making the assumption that all + * file systems support aclent_t type acls. This + * causes problems for servers exporting ZFS file + * systems because ZFS supports ace_t type acls, + * and fails (with ENOSYS) when asked for aclent_t + * type acls. + * + * Note: if the fs_fab_acl() fails, we have other problems. + * This error should be returned to the caller. + */ + error = fs_fab_acl(vp, &resp->resok.acl, 0, cr); + } + if (error) goto out; diff --git a/usr/src/uts/common/fs/proc/prioctl.c b/usr/src/uts/common/fs/proc/prioctl.c index 79f486e9b1..844a3b7bb1 100644 --- a/usr/src/uts/common/fs/proc/prioctl.c +++ b/usr/src/uts/common/fs/proc/prioctl.c @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -498,7 +498,7 @@ startover: */ t = pr_thread(pnp); /* returns locked thread */ thread_unlock(t); - oprgetstatus(t, &un.prstat, VTOZ(vp)); + oprgetstatus(t, &un.prstat, VTOZONE(vp)); prunlock(pnp); if (copyout(&un.prstat, cmaddr, sizeof (un.prstat))) error = EFAULT; @@ -835,7 +835,7 @@ startover: break; case PIOCSTATUS: /* get process/lwp status */ - oprgetstatus(t, &un.prstat, VTOZ(vp)); + oprgetstatus(t, &un.prstat, VTOZONE(vp)); prunlock(pnp); if (copyout(&un.prstat, cmaddr, sizeof (un.prstat))) error = EFAULT; @@ -866,13 +866,13 @@ startover: Bprsp = thing; thing = NULL; prsp = Bprsp; - oprgetstatus(t, prsp, VTOZ(vp)); + oprgetstatus(t, prsp, VTOZONE(vp)); t = p->p_tlist; do { ASSERT(!(t->t_proc_flag & TP_LWPEXIT)); ASSERT(nlwp > 0); --nlwp; - oprgetstatus(t, ++prsp, VTOZ(vp)); + oprgetstatus(t, ++prsp, VTOZONE(vp)); } while ((t = t->t_forw) != p->p_tlist); ASSERT(nlwp == 0); prunlock(pnp); @@ -2053,7 +2053,7 @@ startover: */ t = pr_thread(pnp); /* returns locked thread */ thread_unlock(t); - oprgetstatus32(t, &un32.prstat, VTOZ(vp)); + oprgetstatus32(t, &un32.prstat, VTOZONE(vp)); prunlock(pnp); if (copyout(&un32.prstat, cmaddr, sizeof (un32.prstat))) error = EFAULT; @@ -2430,7 +2430,7 @@ startover: error = EOVERFLOW; break; } - oprgetstatus32(t, &un32.prstat, VTOZ(vp)); + oprgetstatus32(t, &un32.prstat, VTOZONE(vp)); prunlock(pnp); if (copyout(&un32.prstat, cmaddr, sizeof (un32.prstat))) error = EFAULT; @@ -2471,13 +2471,13 @@ startover: Bprsp = (prstatus32_t *)thing; thing = NULL; prsp = Bprsp; - oprgetstatus32(t, prsp, VTOZ(vp)); + oprgetstatus32(t, prsp, VTOZONE(vp)); t = p->p_tlist; do { ASSERT(!(t->t_proc_flag & TP_LWPEXIT)); ASSERT(nlwp > 0); --nlwp; - oprgetstatus32(t, ++prsp, VTOZ(vp)); + oprgetstatus32(t, ++prsp, VTOZONE(vp)); } while ((t = t->t_forw) != p->p_tlist); ASSERT(nlwp == 0); prunlock(pnp); diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c index dea54056c6..d12ee64e8c 100644 --- 
a/usr/src/uts/common/fs/proc/prvnops.c +++ b/usr/src/uts/common/fs/proc/prvnops.c @@ -709,7 +709,7 @@ pr_read_status(prnode_t *pnp, uio_t *uiop) */ sp = kmem_alloc(sizeof (*sp), KM_SLEEP); if ((error = prlock(pnp, ZNO)) == 0) { - prgetstatus(pnp->pr_common->prc_proc, sp, VTOZ(PTOV(pnp))); + prgetstatus(pnp->pr_common->prc_proc, sp, VTOZONE(PTOV(pnp))); prunlock(pnp); error = pr_uioread(sp, sizeof (*sp), uiop); } @@ -753,7 +753,7 @@ pr_read_lstatus(prnode_t *pnp, uio_t *uiop) if (ldp->ld_entry == NULL || (t = ldp->ld_entry->le_thread) == NULL) continue; - prgetlwpstatus(t, sp, VTOZ(PTOV(pnp))); + prgetlwpstatus(t, sp, VTOZONE(PTOV(pnp))); sp = (lwpstatus_t *)((caddr_t)sp + LSPAN(lwpstatus_t)); } prunlock(pnp); @@ -1426,7 +1426,7 @@ pr_read_lwpstatus(prnode_t *pnp, uio_t *uiop) goto out; } - prgetlwpstatus(pnp->pr_common->prc_thread, sp, VTOZ(PTOV(pnp))); + prgetlwpstatus(pnp->pr_common->prc_thread, sp, VTOZONE(PTOV(pnp))); prunlock(pnp); error = pr_uioread(sp, sizeof (*sp), uiop); @@ -1799,7 +1799,7 @@ pr_read_status_32(prnode_t *pnp, uio_t *uiop) error = EOVERFLOW; } else { prgetstatus32(pnp->pr_common->prc_proc, sp, - VTOZ(PTOV(pnp))); + VTOZONE(PTOV(pnp))); prunlock(pnp); error = pr_uioread(sp, sizeof (*sp), uiop); } @@ -1852,7 +1852,7 @@ pr_read_lstatus_32(prnode_t *pnp, uio_t *uiop) if (ldp->ld_entry == NULL || (t = ldp->ld_entry->le_thread) == NULL) continue; - prgetlwpstatus32(t, sp, VTOZ(PTOV(pnp))); + prgetlwpstatus32(t, sp, VTOZONE(PTOV(pnp))); sp = (lwpstatus32_t *)((caddr_t)sp + LSPAN32(lwpstatus32_t)); } prunlock(pnp); @@ -2471,7 +2471,7 @@ pr_read_lwpstatus_32(prnode_t *pnp, uio_t *uiop) goto out; } - prgetlwpstatus32(pnp->pr_common->prc_thread, sp, VTOZ(PTOV(pnp))); + prgetlwpstatus32(pnp->pr_common->prc_thread, sp, VTOZONE(PTOV(pnp))); prunlock(pnp); error = pr_uioread(sp, sizeof (*sp), uiop); @@ -4281,9 +4281,9 @@ pr_lookup_ctdir(vnode_t *dp, char *comp) * outside the zone. 
(see logic in contract_status_common) */ if ((ct->ct_owner != p) && - !(p == VTOZ(dp)->zone_zsched && ct->ct_state < CTS_ORPHAN && - VTOZ(dp)->zone_uniqid == contract_getzuniqid(ct) && - VTOZ(dp)->zone_uniqid != GLOBAL_ZONEUNIQID && + !(p == VTOZONE(dp)->zone_zsched && ct->ct_state < CTS_ORPHAN && + VTOZONE(dp)->zone_uniqid == contract_getzuniqid(ct) && + VTOZONE(dp)->zone_uniqid != GLOBAL_ZONEUNIQID && ct->ct_czuniqid == GLOBAL_ZONEUNIQID)) { prunlock(dpnp); prfreenode(pnp); @@ -4668,7 +4668,7 @@ pr_readdir_procdir(prnode_t *pnp, uio_t *uiop, int *eofp) ASSERT(pnp->pr_type == PR_PROCDIR); - zoneid = VTOZ(PTOV(pnp))->zone_id; + zoneid = VTOZONE(PTOV(pnp))->zone_id; if ((error = gfs_readdir_init(&gstate, PNSIZ, PRSDSIZE, uiop, PRROOTINO, PRROOTINO)) != 0) @@ -5453,7 +5453,7 @@ pr_readdir_ctdir(prnode_t *pnp, uio_t *uiop, int *eofp) return (error); } - zid = VTOZ(pnp->pr_vnode)->zone_uniqid; + zid = VTOZONE(pnp->pr_vnode)->zone_uniqid; while ((error = gfs_readdir_pred(&gstate, uiop, &n)) == 0) { id_t next = contract_plookup(p, n, zid); if (next == -1) { diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c index 1e7793ba39..4d562852af 100644 --- a/usr/src/uts/common/fs/vnode.c +++ b/usr/src/uts/common/fs/vnode.c @@ -720,28 +720,37 @@ top: vsec.vsa_dfaclcnt = 0; vsec.vsa_dfaclentp = NULL; vsec.vsa_mask = VSA_DFACLCNT; - if (error = VOP_GETSECATTR(dvp, &vsec, 0, CRED())) { + error = VOP_GETSECATTR(dvp, &vsec, 0, CRED()); + /* + * If error is ENOSYS then treat it as no error + * Don't want to force all file systems to support + * aclent_t style of ACL's. + */ + if (error == ENOSYS) + error = 0; + if (error) { if (*vpp != NULL) VN_RELE(*vpp); goto out; - } - - /* - * Apply the umask if no default ACLs. - */ - if (vsec.vsa_dfaclcnt == 0) - vap->va_mode &= ~umask; + } else { + /* + * Apply the umask if no default ACLs. 
+ */ + if (vsec.vsa_dfaclcnt == 0) + vap->va_mode &= ~umask; - /* - * VOP_GETSECATTR() may have allocated memory for ACLs we - * didn't request, so double-check and free it if necessary. - */ - if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL) - kmem_free((caddr_t)vsec.vsa_aclentp, - vsec.vsa_aclcnt * sizeof (aclent_t)); - if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL) - kmem_free((caddr_t)vsec.vsa_dfaclentp, - vsec.vsa_dfaclcnt * sizeof (aclent_t)); + /* + * VOP_GETSECATTR() may have allocated memory for + * ACLs we didn't request, so double-check and + * free it if necessary. + */ + if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL) + kmem_free((caddr_t)vsec.vsa_aclentp, + vsec.vsa_aclcnt * sizeof (aclent_t)); + if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL) + kmem_free((caddr_t)vsec.vsa_dfaclentp, + vsec.vsa_dfaclcnt * sizeof (aclent_t)); + } } /* diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c new file mode 100644 index 0000000000..0a6cc7b658 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -0,0 +1,1998 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. 
All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * DVA-based Adjustable Relpacement Cache + * + * While much of the theory of operation and algorithms used here + * are based on the self-tuning, low overhead replacement cache + * presented by Megiddo and Modha at FAST 2003, there are some + * significant differences: + * + * 1. The Megiddo and Modha model assumes any page is evictable. + * Pages in its cache cannot be "locked" into memory. This makes + * the eviction algorithm simple: evict the last page in the list. + * This also make the performance characteristics easy to reason + * about. Our cache is not so simple. At any given moment, some + * subset of the blocks in the cache are un-evictable because we + * have handed out a reference to them. Blocks are only evictable + * when there are no external references active. This makes + * eviction far more problematic: we choose to evict the evictable + * blocks that are the "lowest" in the list. + * + * There are times when it is not possible to evict the requested + * space. In these circumstances we are unable to adjust the cache + * size. To prevent the cache growing unbounded at these times we + * implement a "cache throttle" that slowes the flow of new data + * into the cache until we can make space avaiable. + * + * 2. The Megiddo and Modha model assumes a fixed cache size. + * Pages are evicted when the cache is full and there is a cache + * miss. Our model has a variable sized cache. It grows with + * high use, but also tries to react to memory preasure from the + * operating system: decreasing its size when system memory is + * tight. + * + * 3. The Megiddo and Modha model assumes a fixed page size. All + * elements of the cache are therefor exactly the same size. So + * when adjusting the cache size following a cache miss, its simply + * a matter of choosing a single page to evict. 
In our model, we + * have variable sized cache blocks (rangeing from 512 bytes to + * 128K bytes). We therefor choose a set of blocks to evict to make + * space for a cache miss that approximates as closely as possible + * the space used by the new block. + * + * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" + * by N. Megiddo & D. Modha, FAST 2003 + */ + +/* + * The locking model: + * + * A new reference to a cache buffer can be obtained in two + * ways: 1) via a hash table lookup using the DVA as a key, + * or 2) via one of the ARC lists. The arc_read() inerface + * uses method 1, while the internal arc algorithms for + * adjusting the cache use method 2. We therefor provide two + * types of locks: 1) the hash table lock array, and 2) the + * arc list locks. + * + * Buffers do not have their own mutexs, rather they rely on the + * hash table mutexs for the bulk of their protection (i.e. most + * fields in the arc_buf_hdr_t are protected by these mutexs). + * + * buf_hash_find() returns the appropriate mutex (held) when it + * locates the requested buffer in the hash table. It returns + * NULL for the mutex if the buffer was not in the table. + * + * buf_hash_remove() expects the appropriate hash mutex to be + * already held before it is invoked. + * + * Each arc state also has a mutex which is used to protect the + * buffer list associated with the state. When attempting to + * obtain a hash table lock while holding an arc list lock you + * must use: mutex_tryenter() to avoid deadlock. Also note that + * the "top" state mutex must be held before the "bot" state mutex. + * + * Note that the majority of the performance stats are manipulated + * with atomic operations. 
+ */ + +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/zfs_context.h> +#include <sys/arc.h> +#include <sys/refcount.h> +#ifdef _KERNEL +#include <sys/vmsystm.h> +#include <vm/anon.h> +#include <sys/fs/swapnode.h> +#endif +#include <sys/callb.h> + +static kmutex_t arc_reclaim_thr_lock; +static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ +static uint8_t arc_thread_exit; + +typedef enum arc_reclaim_strategy { + ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ + ARC_RECLAIM_CONS /* Conservative reclaim strategy */ +} arc_reclaim_strategy_t; + +/* number of seconds before growing cache again */ +static int arc_grow_retry = 60; + +static kmutex_t arc_reclaim_lock; +static int arc_dead; + +/* + * Note that buffers can be on one of 5 states: + * ARC_anon - anonymous (discussed below) + * ARC_mru_top - recently used, currently cached + * ARC_mru_bot - recentely used, no longer in cache + * ARC_mfu_top - frequently used, currently cached + * ARC_mfu_bot - frequently used, no longer in cache + * When there are no active references to the buffer, they + * are linked onto one of the lists in arc. These are the + * only buffers that can be evicted or deleted. + * + * Anonymous buffers are buffers that are not associated with + * a DVA. These are buffers that hold dirty block copies + * before they are written to stable storage. By definition, + * they are "ref'd" and are considered part of arc_mru_top + * that cannot be freed. Generally, they will aquire a DVA + * as they are written and migrate onto the arc_mru_top list. 
+ */ + +typedef struct arc_state { + list_t list; /* linked list of evictable buffer in state */ + uint64_t lsize; /* total size of buffers in the linked list */ + uint64_t size; /* total size of all buffers in this state */ + uint64_t hits; + kmutex_t mtx; +} arc_state_t; + +/* The 5 states: */ +static arc_state_t ARC_anon; +static arc_state_t ARC_mru_top; +static arc_state_t ARC_mru_bot; +static arc_state_t ARC_mfu_top; +static arc_state_t ARC_mfu_bot; + +static struct arc { + arc_state_t *anon; + arc_state_t *mru_top; + arc_state_t *mru_bot; + arc_state_t *mfu_top; + arc_state_t *mfu_bot; + uint64_t size; /* Actual total arc size */ + uint64_t p; /* Target size (in bytes) of mru_top */ + uint64_t c; /* Target size of cache (in bytes) */ + uint64_t c_min; /* Minimum target cache size */ + uint64_t c_max; /* Maximum target cache size */ + uint64_t incr; /* Size by which to increment arc.c */ + int64_t size_check; + + /* performance stats */ + uint64_t hits; + uint64_t misses; + uint64_t deleted; + uint64_t skipped; + uint64_t hash_elements; + uint64_t hash_elements_max; + uint64_t hash_collisions; + uint64_t hash_chains; + uint32_t hash_chain_max; + + int no_grow; /* Don't try to grow cache size */ +} arc; + +/* Default amount to grow arc.incr */ +static int64_t arc_incr_size = 1024; + +/* > 0 ==> time to increment arc.c */ +static int64_t arc_size_check_default = -1000; + +static uint64_t arc_tempreserve; + +typedef struct arc_callback arc_callback_t; + +struct arc_callback { + arc_done_func_t *acb_done; + void *acb_private; + arc_byteswap_func_t *acb_byteswap; + arc_buf_t *acb_buf; + zio_t *acb_zio_dummy; + arc_callback_t *acb_next; +}; + +struct arc_buf_hdr { + /* immutable */ + uint64_t b_size; + spa_t *b_spa; + + /* protected by hash lock */ + dva_t b_dva; + uint64_t b_birth; + uint64_t b_cksum0; + + arc_buf_hdr_t *b_hash_next; + arc_buf_t *b_buf; + uint32_t b_flags; + + kcondvar_t b_cv; + arc_callback_t *b_acb; + + /* protected by arc state mutex */ + 
arc_state_t *b_state; + list_node_t b_arc_node; + + /* updated atomically */ + clock_t b_arc_access; + + /* self protecting */ + refcount_t b_refcnt; +}; + +/* + * Private ARC flags. These flags are private ARC only flags that will show up + * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can + * be passed in as arc_flags in things like arc_read. However, these flags + * should never be passed and should only be set by ARC code. When adding new + * public flags, make sure not to smash the private ones. + */ + +#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ +#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ +#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ + +#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) +#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) +#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) + +/* + * Hash table routines + */ + +#define HT_LOCK_PAD 64 + +struct ht_lock { + kmutex_t ht_lock; +#ifdef _KERNEL + unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; +#endif +}; + +#define BUF_LOCKS 256 +typedef struct buf_hash_table { + uint64_t ht_mask; + arc_buf_hdr_t **ht_table; + struct ht_lock ht_locks[BUF_LOCKS]; +} buf_hash_table_t; + +static buf_hash_table_t buf_hash_table; + +#define BUF_HASH_INDEX(spa, dva, birth) \ + (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) +#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) +#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) +#define HDR_LOCK(buf) \ + (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) + +uint64_t zfs_crc64_table[256]; + +static uint64_t +buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) +{ + uintptr_t spav = (uintptr_t)spa; + uint8_t *vdva = (uint8_t *)dva; + uint64_t crc = -1ULL; + int i; + + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + + for (i = 0; i < sizeof (dva_t); i++) + crc = (crc >> 8) ^ 
zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; + + crc ^= (spav>>8) ^ birth; + + return (crc); +} + +#define BUF_EMPTY(buf) \ + ((buf)->b_dva.dva_word[0] == 0 && \ + (buf)->b_dva.dva_word[1] == 0 && \ + (buf)->b_birth == 0) + +#define BUF_EQUAL(spa, dva, birth, buf) \ + ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ + ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ + ((buf)->b_birth == birth) && ((buf)->b_spa == spa) + +static arc_buf_hdr_t * +buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp) +{ + uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); + kmutex_t *hash_lock = BUF_HASH_LOCK(idx); + arc_buf_hdr_t *buf; + + mutex_enter(hash_lock); + for (buf = buf_hash_table.ht_table[idx]; buf != NULL; + buf = buf->b_hash_next) { + if (BUF_EQUAL(spa, dva, birth, buf)) { + *lockp = hash_lock; + return (buf); + } + } + mutex_exit(hash_lock); + *lockp = NULL; + return (NULL); +} + +/* + * Insert an entry into the hash table. If there is already an element + * equal to elem in the hash table, then the already existing element + * will be returned and the new element will not be inserted. + * Otherwise returns NULL. 
+ */ +static arc_buf_hdr_t *fbufs[4]; /* XXX to find 6341326 */ +static kthread_t *fbufs_lastthread; +static arc_buf_hdr_t * +buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) +{ + uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); + kmutex_t *hash_lock = BUF_HASH_LOCK(idx); + arc_buf_hdr_t *fbuf; + uint32_t max, i; + + fbufs_lastthread = curthread; + *lockp = hash_lock; + mutex_enter(hash_lock); + for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; + fbuf = fbuf->b_hash_next, i++) { + if (i < sizeof (fbufs) / sizeof (fbufs[0])) + fbufs[i] = fbuf; + if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) + return (fbuf); + } + + buf->b_hash_next = buf_hash_table.ht_table[idx]; + buf_hash_table.ht_table[idx] = buf; + + /* collect some hash table performance data */ + if (i > 0) { + atomic_add_64(&arc.hash_collisions, 1); + if (i == 1) + atomic_add_64(&arc.hash_chains, 1); + } + while (i > (max = arc.hash_chain_max) && + max != atomic_cas_32(&arc.hash_chain_max, max, i)) { + continue; + } + atomic_add_64(&arc.hash_elements, 1); + if (arc.hash_elements > arc.hash_elements_max) + atomic_add_64(&arc.hash_elements_max, 1); + + return (NULL); +} + +static void +buf_hash_remove(arc_buf_hdr_t *buf) +{ + arc_buf_hdr_t *fbuf, **bufp; + uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); + + ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); + + bufp = &buf_hash_table.ht_table[idx]; + while ((fbuf = *bufp) != buf) { + ASSERT(fbuf != NULL); + bufp = &fbuf->b_hash_next; + } + *bufp = buf->b_hash_next; + buf->b_hash_next = NULL; + + /* collect some hash table performance data */ + atomic_add_64(&arc.hash_elements, -1); + if (buf_hash_table.ht_table[idx] && + buf_hash_table.ht_table[idx]->b_hash_next == NULL) + atomic_add_64(&arc.hash_chains, -1); +} + +/* + * Global data structures and functions for the buf kmem cache. 
+ */ +static kmem_cache_t *hdr_cache; +static kmem_cache_t *buf_cache; + +static void +buf_fini(void) +{ + int i; + + kmem_free(buf_hash_table.ht_table, + (buf_hash_table.ht_mask + 1) * sizeof (void *)); + for (i = 0; i < BUF_LOCKS; i++) + mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); + kmem_cache_destroy(hdr_cache); + kmem_cache_destroy(buf_cache); +} + +/* + * Constructor callback - called when the cache is empty + * and a new buf is requested. + */ +/* ARGSUSED */ +static int +hdr_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_hdr_t *buf = vbuf; + + bzero(buf, sizeof (arc_buf_hdr_t)); + refcount_create(&buf->b_refcnt); + cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); + return (0); +} + +/* + * Destructor callback - called when a cached buf is + * no longer required. + */ +/* ARGSUSED */ +static void +hdr_dest(void *vbuf, void *unused) +{ + arc_buf_hdr_t *buf = vbuf; + + refcount_destroy(&buf->b_refcnt); + cv_destroy(&buf->b_cv); +} + +void arc_kmem_reclaim(void); + +/* + * Reclaim callback -- invoked when memory is low. + */ +/* ARGSUSED */ +static void +hdr_recl(void *unused) +{ + dprintf("hdr_recl called\n"); + arc_kmem_reclaim(); +} + +static void +buf_init(void) +{ + uint64_t *ct; + uint64_t hsize = 1ULL << 10; + int i, j; + + /* + * The hash table is big enough to fill all of physical memory + * with an average 4k block size. The table will take up + * totalmem*sizeof(void*)/4k bytes (eg. 2MB/GB with 8-byte + * pointers). 
+ */ + while (hsize * 4096 < physmem * PAGESIZE) + hsize <<= 1; + + buf_hash_table.ht_mask = hsize - 1; + buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_SLEEP); + + hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), + 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); + buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + + for (i = 0; i < 256; i++) + for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) + *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); + + for (i = 0; i < BUF_LOCKS; i++) { + mutex_init(&buf_hash_table.ht_locks[i].ht_lock, + NULL, MUTEX_DEFAULT, NULL); + } +} + +#define ARC_MINTIME (hz>>4) /* 62 ms */ + +#define ARC_TAG (void *)0x05201962 + +static void +add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) +{ + ASSERT(MUTEX_HELD(hash_lock)); + + if ((refcount_add(&ab->b_refcnt, tag) == 1) && + (ab->b_state != arc.anon)) { + + ASSERT(!MUTEX_HELD(&ab->b_state->mtx)); + mutex_enter(&ab->b_state->mtx); + ASSERT(!refcount_is_zero(&ab->b_refcnt)); + ASSERT(list_link_active(&ab->b_arc_node)); + list_remove(&ab->b_state->list, ab); + ASSERT3U(ab->b_state->lsize, >=, ab->b_size); + ab->b_state->lsize -= ab->b_size; + mutex_exit(&ab->b_state->mtx); + } +} + +static int +remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) +{ + int cnt; + + ASSERT(MUTEX_HELD(hash_lock)); + + if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && + (ab->b_state != arc.anon)) { + + ASSERT(!MUTEX_HELD(&ab->b_state->mtx)); + mutex_enter(&ab->b_state->mtx); + ASSERT(!list_link_active(&ab->b_arc_node)); + list_insert_head(&ab->b_state->list, ab); + ASSERT(ab->b_buf != NULL); + ab->b_state->lsize += ab->b_size; + mutex_exit(&ab->b_state->mtx); + } + return (cnt); +} + +/* + * Move the supplied buffer to the indicated state. The mutex + * for the buffer must be held by the caller. 
+ */ +static void +arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, + kmutex_t *hash_lock) +{ + arc_buf_t *buf; + + ASSERT(MUTEX_HELD(hash_lock)); + + /* + * If this buffer is evictable, transfer it from the + * old state list to the new state list. + */ + if (refcount_is_zero(&ab->b_refcnt)) { + if (ab->b_state != arc.anon) { + int drop_mutex = FALSE; + + if (!MUTEX_HELD(&ab->b_state->mtx)) { + mutex_enter(&ab->b_state->mtx); + drop_mutex = TRUE; + } + ASSERT(list_link_active(&ab->b_arc_node)); + list_remove(&ab->b_state->list, ab); + ASSERT3U(ab->b_state->lsize, >=, ab->b_size); + ab->b_state->lsize -= ab->b_size; + if (drop_mutex) + mutex_exit(&ab->b_state->mtx); + } + if (new_state != arc.anon) { + int drop_mutex = FALSE; + + if (!MUTEX_HELD(&new_state->mtx)) { + mutex_enter(&new_state->mtx); + drop_mutex = TRUE; + } + list_insert_head(&new_state->list, ab); + ASSERT(ab->b_buf != NULL); + new_state->lsize += ab->b_size; + if (drop_mutex) + mutex_exit(&new_state->mtx); + } + } + + ASSERT(!BUF_EMPTY(ab)); + if (new_state == arc.anon && ab->b_state != arc.anon) { + buf_hash_remove(ab); + } + + /* + * If this buffer isn't being transferred to the MRU-top + * state, it's safe to clear its prefetch flag + */ + if ((new_state != arc.mru_top) && (new_state != arc.mru_bot)) { + ab->b_flags &= ~ARC_PREFETCH; + } + + buf = ab->b_buf; + if (buf == NULL) { + ASSERT3U(ab->b_state->size, >=, ab->b_size); + atomic_add_64(&ab->b_state->size, -ab->b_size); + /* we should only be here if we are deleting state */ + ASSERT(new_state == arc.anon && + (ab->b_state == arc.mru_bot || ab->b_state == arc.mfu_bot)); + } else while (buf) { + ASSERT3U(ab->b_state->size, >=, ab->b_size); + atomic_add_64(&ab->b_state->size, -ab->b_size); + atomic_add_64(&new_state->size, ab->b_size); + buf = buf->b_next; + } + ab->b_state = new_state; +} + +arc_buf_t * +arc_buf_alloc(spa_t *spa, int size, void *tag) +{ + arc_buf_hdr_t *hdr; + arc_buf_t *buf; + + ASSERT3U(size, >, 0); + hdr = 
kmem_cache_alloc(hdr_cache, KM_SLEEP); + ASSERT(BUF_EMPTY(hdr)); + hdr->b_size = size; + hdr->b_spa = spa; + hdr->b_state = arc.anon; + hdr->b_arc_access = 0; + buf = kmem_cache_alloc(buf_cache, KM_SLEEP); + buf->b_hdr = hdr; + buf->b_next = NULL; + buf->b_data = zio_buf_alloc(size); + hdr->b_buf = buf; + hdr->b_flags = 0; + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + (void) refcount_add(&hdr->b_refcnt, tag); + + atomic_add_64(&arc.size, size); + atomic_add_64(&arc.anon->size, size); + + return (buf); +} + +static void +arc_hdr_free(arc_buf_hdr_t *hdr) +{ + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + ASSERT3P(hdr->b_state, ==, arc.anon); + + if (!BUF_EMPTY(hdr)) { + /* + * We can be called with an arc state lock held, + * so we can't hold a hash lock here. + * ASSERT(not in hash table) + */ + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + bzero(&hdr->b_dva, sizeof (dva_t)); + hdr->b_birth = 0; + hdr->b_cksum0 = 0; + } + if (hdr->b_buf) { + arc_buf_t *buf = hdr->b_buf; + + ASSERT3U(hdr->b_size, >, 0); + zio_buf_free(buf->b_data, hdr->b_size); + atomic_add_64(&arc.size, -hdr->b_size); + ASSERT3U(arc.anon->size, >=, hdr->b_size); + atomic_add_64(&arc.anon->size, -hdr->b_size); + ASSERT3P(buf->b_next, ==, NULL); + kmem_cache_free(buf_cache, buf); + hdr->b_buf = NULL; + } + ASSERT(!list_link_active(&hdr->b_arc_node)); + ASSERT3P(hdr->b_hash_next, ==, NULL); + ASSERT3P(hdr->b_acb, ==, NULL); + kmem_cache_free(hdr_cache, hdr); +} + +void +arc_buf_free(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + kmutex_t *hash_lock = HDR_LOCK(hdr); + int freeable; + + mutex_enter(hash_lock); + if (remove_reference(hdr, hash_lock, tag) > 0) { + arc_buf_t **bufp = &hdr->b_buf; + arc_state_t *state = hdr->b_state; + uint64_t size = hdr->b_size; + + ASSERT(hdr->b_state != arc.anon || HDR_IO_ERROR(hdr)); + while (*bufp != buf) { + ASSERT(*bufp); + bufp = &(*bufp)->b_next; + } + *bufp = buf->b_next; + mutex_exit(hash_lock); + zio_buf_free(buf->b_data, size); + 
atomic_add_64(&arc.size, -size); + kmem_cache_free(buf_cache, buf); + ASSERT3U(state->size, >=, size); + atomic_add_64(&state->size, -size); + return; + } + + /* don't free buffers that are in the middle of an async write */ + freeable = (hdr->b_state == arc.anon && hdr->b_acb == NULL); + mutex_exit(hash_lock); + + if (freeable) + arc_hdr_free(hdr); +} + +int +arc_buf_size(arc_buf_t *buf) +{ + return (buf->b_hdr->b_size); +} + +/* + * Evict buffers from list until we've removed the specified number of + * bytes. Move the removed buffers to the appropriate evict state. + */ +static uint64_t +arc_evict_state(arc_state_t *state, int64_t bytes) +{ + arc_state_t *evicted_state; + uint64_t bytes_evicted = 0; + arc_buf_hdr_t *ab, *ab_prev; + kmutex_t *hash_lock; + + ASSERT(state == arc.mru_top || state == arc.mfu_top); + + if (state == arc.mru_top) + evicted_state = arc.mru_bot; + else + evicted_state = arc.mfu_bot; + + mutex_enter(&state->mtx); + mutex_enter(&evicted_state->mtx); + + for (ab = list_tail(&state->list); ab; ab = ab_prev) { + ab_prev = list_prev(&state->list, ab); + hash_lock = HDR_LOCK(ab); + if (mutex_tryenter(hash_lock)) { + ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); + arc_change_state(evicted_state, ab, hash_lock); + zio_buf_free(ab->b_buf->b_data, ab->b_size); + atomic_add_64(&arc.size, -ab->b_size); + ASSERT3P(ab->b_buf->b_next, ==, NULL); + kmem_cache_free(buf_cache, ab->b_buf); + ab->b_buf = NULL; + DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); + bytes_evicted += ab->b_size; + mutex_exit(hash_lock); + if (bytes_evicted >= bytes) + break; + } else { + atomic_add_64(&arc.skipped, 1); + } + } + mutex_exit(&evicted_state->mtx); + mutex_exit(&state->mtx); + + if (bytes_evicted < bytes) + dprintf("only evicted %lld bytes from %x", + (longlong_t)bytes_evicted, state); + + return (bytes_evicted); +} + +/* + * Remove buffers from list until we've removed the specified number of + * bytes. Destroy the buffers that are removed. 
+ */ +static void +arc_delete_state(arc_state_t *state, int64_t bytes) +{ + uint_t bufs_skipped = 0; + uint64_t bytes_deleted = 0; + arc_buf_hdr_t *ab, *ab_prev; + kmutex_t *hash_lock; + +top: + mutex_enter(&state->mtx); + for (ab = list_tail(&state->list); ab; ab = ab_prev) { + ab_prev = list_prev(&state->list, ab); + hash_lock = HDR_LOCK(ab); + if (mutex_tryenter(hash_lock)) { + arc_change_state(arc.anon, ab, hash_lock); + mutex_exit(hash_lock); + atomic_add_64(&arc.deleted, 1); + DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); + bytes_deleted += ab->b_size; + arc_hdr_free(ab); + if (bytes >= 0 && bytes_deleted >= bytes) + break; + } else { + if (bytes < 0) { + mutex_exit(&state->mtx); + mutex_enter(hash_lock); + mutex_exit(hash_lock); + goto top; + } + bufs_skipped += 1; + } + } + mutex_exit(&state->mtx); + + if (bufs_skipped) { + atomic_add_64(&arc.skipped, bufs_skipped); + ASSERT(bytes >= 0); + } + + if (bytes_deleted < bytes) + dprintf("only deleted %lld bytes from %p", + (longlong_t)bytes_deleted, state); +} + +static void +arc_adjust(void) +{ + int64_t top_sz, mru_over, arc_over; + + top_sz = arc.anon->size + arc.mru_top->size; + + if (top_sz > arc.p && arc.mru_top->lsize > 0) { + int64_t toevict = MIN(arc.mru_top->lsize, top_sz-arc.p); + (void) arc_evict_state(arc.mru_top, toevict); + top_sz = arc.anon->size + arc.mru_top->size; + } + + mru_over = top_sz + arc.mru_bot->size - arc.c; + + if (mru_over > 0) { + if (arc.mru_bot->lsize > 0) { + int64_t todelete = MIN(arc.mru_bot->lsize, mru_over); + arc_delete_state(arc.mru_bot, todelete); + } + } + + if ((arc_over = arc.size - arc.c) > 0) { + int64_t table_over; + + if (arc.mfu_top->lsize > 0) { + int64_t toevict = MIN(arc.mfu_top->lsize, arc_over); + (void) arc_evict_state(arc.mfu_top, toevict); + } + + table_over = arc.size + arc.mru_bot->lsize + arc.mfu_bot->lsize + - arc.c*2; + + if (table_over > 0 && arc.mfu_bot->lsize > 0) { + int64_t todelete = MIN(arc.mfu_bot->lsize, table_over); + 
arc_delete_state(arc.mfu_bot, todelete); + } + } +} + +/* + * Flush all *evictable* data from the cache. + * NOTE: this will not touch "active" (i.e. referenced) data. + */ +void +arc_flush(void) +{ + arc_delete_state(arc.mru_top, -1); + arc_delete_state(arc.mfu_top, -1); + + arc_delete_state(arc.mru_bot, -1); + arc_delete_state(arc.mfu_bot, -1); +} + +void +arc_kmem_reclaim(void) +{ + /* Remove 6.25% */ + /* + * We need arc_reclaim_lock because we don't want multiple + * threads trying to reclaim concurrently. + */ + + /* + * umem calls the reclaim func when we destroy the buf cache, + * which is after we do arc_fini(). So we set a flag to prevent + * accessing the destroyed mutexes and lists. + */ + if (arc_dead) + return; + + mutex_enter(&arc_reclaim_lock); + + atomic_add_64(&arc.c, -(arc.c >> 4)); + if (arc.c < arc.c_min) + arc.c = arc.c_min; + atomic_add_64(&arc.p, -(arc.p >> 4)); + + arc_adjust(); + + /* Cool it for a while */ + arc.incr = 0; + arc.size_check = arc_size_check_default << 3; + + mutex_exit(&arc_reclaim_lock); +} + +static int +arc_reclaim_needed(void) +{ + uint64_t extra; + +#ifdef _KERNEL + /* + * take 'desfree' extra pages, so we reclaim sooner, rather than later + */ + extra = desfree; + + /* + * check that we're out of range of the pageout scanner. It starts to + * schedule paging if freemem is less than lotsfree and needfree. + * lotsfree is the high-water mark for pageout, and needfree is the + * number of needed free pages. We add extra pages here to make sure + * the scanner doesn't start up while we're freeing memory. + */ + if (freemem < lotsfree + needfree + extra) + return (1); + + /* + * check to make sure that swapfs has enough space so that anon + * reservations can still succeeed. anon_resvmem() checks that the + * availrmem is greater than swapfs_minfree, and the number of reserved + * swap pages. We also add a bit of extra here just to prevent + * circumstances from getting really dire. 
+ */ + if (availrmem < swapfs_minfree + swapfs_reserve + extra) + return (1); + + /* + * If we're on an i386 platform, it's possible that we'll exhaust the + * kernel heap space before we ever run out of available physical + * memory. Most checks of the size of the heap_area compare against + * tune.t_minarmem, which is the minimum available real memory that we + * can have in the system. However, this is generally fixed at 25 pages + * which is so low that it's useless. In this comparison, we seek to + * calculate the total heap-size, and reclaim if more than 3/4ths of the + * heap is allocated. (Or, in the caclulation, if less than 1/4th is + * free) + */ +#if defined(__i386) + if (btop(vmem_size(heap_arena, VMEM_FREE)) < + (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) + return (1); +#endif + +#else + if (spa_get_random(100) == 0) + return (1); +#endif + return (0); +} + +static void +arc_kmem_reap_now(arc_reclaim_strategy_t strat) +{ + size_t i; + kmem_cache_t *prev_cache = NULL; + extern kmem_cache_t *zio_buf_cache[]; + + /* + * an agressive reclamation will shrink the cache size as well as reap + * free kmem buffers. The arc_kmem_reclaim function is called when the + * header-cache is reaped, so we only reap the header cache if we're + * performing an agressive reclaim. If we're not, just clean the kmem + * buffer caches. 
+ */ + if (strat == ARC_RECLAIM_AGGR) + kmem_cache_reap_now(hdr_cache); + + kmem_cache_reap_now(buf_cache); + + for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { + if (zio_buf_cache[i] != prev_cache) { + prev_cache = zio_buf_cache[i]; + kmem_cache_reap_now(zio_buf_cache[i]); + } + } +} + +static void +arc_reclaim_thread(void) +{ + clock_t growtime = 0; + arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; + callb_cpr_t cpr; + + CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); + + mutex_enter(&arc_reclaim_thr_lock); + while (arc_thread_exit == 0) { + if (arc_reclaim_needed()) { + + if (arc.no_grow) { + if (last_reclaim == ARC_RECLAIM_CONS) { + last_reclaim = ARC_RECLAIM_AGGR; + } else { + last_reclaim = ARC_RECLAIM_CONS; + } + } else { + arc.no_grow = TRUE; + last_reclaim = ARC_RECLAIM_AGGR; + membar_producer(); + } + + /* reset the growth delay for every reclaim */ + growtime = lbolt + (arc_grow_retry * hz); + + arc_kmem_reap_now(last_reclaim); + + } else if ((growtime > 0) && ((growtime - lbolt) <= 0)) { + arc.no_grow = FALSE; + } + + /* block until needed, or one second, whichever is shorter */ + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait(&arc_reclaim_thr_cv, + &arc_reclaim_thr_lock, (lbolt + hz)); + CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); + } + + arc_thread_exit = 0; + cv_broadcast(&arc_reclaim_thr_cv); + CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ + thread_exit(); +} + +static void +arc_try_grow(int64_t bytes) +{ + /* + * If we're within (2 * maxblocksize) bytes of the target + * cache size, increment the target cache size + */ + atomic_add_64((uint64_t *)&arc.size_check, 1); + + if (arc_reclaim_needed()) { + cv_signal(&arc_reclaim_thr_cv); + return; + } + + if (arc.no_grow) + return; + + /* + * return true if we successfully grow, or if there's enough space that + * we don't have to grow. Above, we return false if we can't grow, or + * if we shouldn't because a reclaim is in progress. 
+ */ + if ((arc.c - arc.size) <= (2ULL << SPA_MAXBLOCKSHIFT)) { + if (arc.size_check > 0) { + arc.size_check = arc_size_check_default; + atomic_add_64(&arc.incr, arc_incr_size); + } + atomic_add_64(&arc.c, MIN(bytes, arc.incr)); + if (arc.c > arc.c_max) + arc.c = arc.c_max; + else + atomic_add_64(&arc.p, MIN(bytes, arc.incr)); + } else if (arc.size > arc.c) { + if (arc.size_check > 0) { + arc.size_check = arc_size_check_default; + atomic_add_64(&arc.incr, arc_incr_size); + } + atomic_add_64(&arc.c, MIN(bytes, arc.incr)); + if (arc.c > arc.c_max) + arc.c = arc.c_max; + else + atomic_add_64(&arc.p, MIN(bytes, arc.incr)); + } +} + +/* + * check if the cache has reached its limits and eviction is required prior to + * insert. In this situation, we want to evict if no_grow is set Otherwise, the + * cache is either big enough that we can insert, or a arc_try_grow will result + * in more space being made available. + */ + +static int +arc_evict_needed() +{ + + if (arc_reclaim_needed()) + return (1); + + if (arc.no_grow || (arc.c > arc.c_max) || (arc.size > arc.c)) + return (1); + + return (0); +} + +/* + * The state, supplied as the first argument, is going to have something + * inserted on its behalf. So, determine which cache must be victimized to + * satisfy an insertion for this state. We have the following cases: + * + * 1. Insert for MRU, p > sizeof(arc.anon + arc.mru_top) -> + * In this situation if we're out of space, but the resident size of the MFU is + * under the limit, victimize the MFU cache to satisfy this insertion request. + * + * 2. Insert for MRU, p <= sizeof(arc.anon + arc.mru_top) -> + * Here, we've used up all of the available space for the MRU, so we need to + * evict from our own cache instead. Evict from the set of resident MRU + * entries. + * + * 3. Insert for MFU (c - p) > sizeof(arc.mfu_top) -> + * c minus p represents the MFU space in the cache, since p is the size of the + * cache that is dedicated to the MRU. 
In this situation there's still space on + * the MFU side, so the MRU side needs to be victimized. + * + * 4. Insert for MFU (c - p) < sizeof(arc.mfu_top) -> + * MFU's resident set is consuming more space than it has been allotted. In + * this situation, we must victimize our own cache, the MFU, for this insertion. + */ +static void +arc_evict_for_state(arc_state_t *state, uint64_t bytes) +{ + uint64_t mru_used; + uint64_t mfu_space; + uint64_t evicted; + + ASSERT(state == arc.mru_top || state == arc.mfu_top); + + if (state == arc.mru_top) { + mru_used = arc.anon->size + arc.mru_top->size; + if (arc.p > mru_used) { + /* case 1 */ + evicted = arc_evict_state(arc.mfu_top, bytes); + if (evicted < bytes) { + arc_adjust(); + } + } else { + /* case 2 */ + evicted = arc_evict_state(arc.mru_top, bytes); + if (evicted < bytes) { + arc_adjust(); + } + } + } else { + /* MFU_top case */ + mfu_space = arc.c - arc.p; + if (mfu_space > arc.mfu_top->size) { + /* case 3 */ + evicted = arc_evict_state(arc.mru_top, bytes); + if (evicted < bytes) { + arc_adjust(); + } + } else { + /* case 4 */ + evicted = arc_evict_state(arc.mfu_top, bytes); + if (evicted < bytes) { + arc_adjust(); + } + } + } +} + +/* + * This routine is called whenever a buffer is accessed. + */ +static void +arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) +{ + int blksz, mult; + + ASSERT(MUTEX_HELD(hash_lock)); + + blksz = buf->b_size; + + if (buf->b_state == arc.anon) { + /* + * This buffer is not in the cache, and does not + * appear in our "ghost" list. Add the new buffer + * to the MRU state. + */ + + arc_try_grow(blksz); + if (arc_evict_needed()) { + arc_evict_for_state(arc.mru_top, blksz); + } + + ASSERT(buf->b_arc_access == 0); + buf->b_arc_access = lbolt; + DTRACE_PROBE1(new_state__mru_top, arc_buf_hdr_t *, + buf); + arc_change_state(arc.mru_top, buf, hash_lock); + + /* + * If we are using less than 2/3 of our total target + * cache size, bump up the target size for the MRU + * list. 
+ */ + if (arc.size < arc.c*2/3) { + arc.p = arc.anon->size + arc.mru_top->size + arc.c/6; + } + + } else if (buf->b_state == arc.mru_top) { + /* + * If this buffer is in the MRU-top state and has the prefetch + * flag, the first read was actually part of a prefetch. In + * this situation, we simply want to clear the flag and return. + * A subsequent access should bump this into the MFU state. + */ + if ((buf->b_flags & ARC_PREFETCH) != 0) { + buf->b_flags &= ~ARC_PREFETCH; + atomic_add_64(&arc.mru_top->hits, 1); + return; + } + + /* + * This buffer has been "accessed" only once so far, + * but it is still in the cache. Move it to the MFU + * state. + */ + if (lbolt > buf->b_arc_access + ARC_MINTIME) { + /* + * More than 125ms have passed since we + * instantiated this buffer. Move it to the + * most frequently used state. + */ + buf->b_arc_access = lbolt; + DTRACE_PROBE1(new_state__mfu_top, + arc_buf_hdr_t *, buf); + arc_change_state(arc.mfu_top, buf, hash_lock); + } + atomic_add_64(&arc.mru_top->hits, 1); + } else if (buf->b_state == arc.mru_bot) { + arc_state_t *new_state; + /* + * This buffer has been "accessed" recently, but + * was evicted from the cache. Move it to the + * MFU state. + */ + + if (buf->b_flags & ARC_PREFETCH) { + new_state = arc.mru_top; + DTRACE_PROBE1(new_state__mru_top, + arc_buf_hdr_t *, buf); + } else { + new_state = arc.mfu_top; + DTRACE_PROBE1(new_state__mfu_top, + arc_buf_hdr_t *, buf); + } + + arc_try_grow(blksz); + if (arc_evict_needed()) { + arc_evict_for_state(new_state, blksz); + } + + /* Bump up the target size of the MRU list */ + mult = ((arc.mru_bot->size >= arc.mfu_bot->size) ? + 1 : (arc.mfu_bot->size/arc.mru_bot->size)); + arc.p = MIN(arc.c, arc.p + blksz * mult); + + buf->b_arc_access = lbolt; + arc_change_state(new_state, buf, hash_lock); + + atomic_add_64(&arc.mru_bot->hits, 1); + } else if (buf->b_state == arc.mfu_top) { + /* + * This buffer has been accessed more than once and is + * still in the cache. 
Keep it in the MFU state. + * + * NOTE: the add_reference() that occurred when we did + * the arc_read() should have kicked this off the list, + * so even if it was a prefetch, it will be put back at + * the head of the list when we remove_reference(). + */ + atomic_add_64(&arc.mfu_top->hits, 1); + } else if (buf->b_state == arc.mfu_bot) { + /* + * This buffer has been accessed more than once but has + * been evicted from the cache. Move it back to the + * MFU state. + */ + + arc_try_grow(blksz); + if (arc_evict_needed()) { + arc_evict_for_state(arc.mfu_top, blksz); + } + + /* Bump up the target size for the MFU list */ + mult = ((arc.mfu_bot->size >= arc.mru_bot->size) ? + 1 : (arc.mru_bot->size/arc.mfu_bot->size)); + arc.p = MAX(0, (int64_t)arc.p - blksz * mult); + + buf->b_arc_access = lbolt; + DTRACE_PROBE1(new_state__mfu_top, + arc_buf_hdr_t *, buf); + arc_change_state(arc.mfu_top, buf, hash_lock); + + atomic_add_64(&arc.mfu_bot->hits, 1); + } else { + ASSERT(!"invalid arc state"); + } + +} + +/* a generic arc_done_func_t which you can use */ +/* ARGSUSED */ +void +arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) +{ + bcopy(buf->b_data, arg, buf->b_hdr->b_size); + arc_buf_free(buf, arg); +} + +/* a generic arc_done_func_t which you can use */ +void +arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) +{ + arc_buf_t **bufp = arg; + if (zio && zio->io_error) { + arc_buf_free(buf, arg); + *bufp = NULL; + } else { + *bufp = buf; + } +} + +static void +arc_read_done(zio_t *zio) +{ + arc_buf_hdr_t *hdr; + arc_buf_t *buf; + arc_buf_t *abuf; /* buffer we're assigning to callback */ + kmutex_t *hash_lock; + arc_callback_t *callback_list, *acb; + int freeable = FALSE; + + buf = zio->io_private; + hdr = buf->b_hdr; + + if (!HDR_FREED_IN_READ(hdr)) { + arc_buf_hdr_t *found; + + found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, + &hash_lock); + + /* + * Buffer was inserted into hash-table and removed from lists + * prior to starting I/O. 
We should find this header, since + * it's in the hash table, and it should be legit since it's + * not possible to evict it during the I/O. + */ + + ASSERT(found); + ASSERT(DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))); + } + + /* byteswap if necessary */ + callback_list = hdr->b_acb; + ASSERT(callback_list != NULL); + if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap) + callback_list->acb_byteswap(buf->b_data, hdr->b_size); + + /* create copies of the data buffer for the callers */ + abuf = buf; + for (acb = callback_list; acb; acb = acb->acb_next) { + if (acb->acb_done) { + if (abuf == NULL) { + abuf = kmem_cache_alloc(buf_cache, KM_SLEEP); + abuf->b_data = zio_buf_alloc(hdr->b_size); + atomic_add_64(&arc.size, hdr->b_size); + bcopy(buf->b_data, abuf->b_data, hdr->b_size); + abuf->b_hdr = hdr; + abuf->b_next = hdr->b_buf; + hdr->b_buf = abuf; + atomic_add_64(&hdr->b_state->size, hdr->b_size); + } + acb->acb_buf = abuf; + abuf = NULL; + } else { + /* + * The caller did not provide a callback function. + * In this case, we should just remove the reference. + */ + if (HDR_FREED_IN_READ(hdr)) { + ASSERT3P(hdr->b_state, ==, arc.anon); + (void) refcount_remove(&hdr->b_refcnt, + acb->acb_private); + } else { + (void) remove_reference(hdr, hash_lock, + acb->acb_private); + } + } + } + hdr->b_acb = NULL; + hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + + ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); + + if (zio->io_error != 0) { + hdr->b_flags |= ARC_IO_ERROR; + if (hdr->b_state != arc.anon) + arc_change_state(arc.anon, hdr, hash_lock); + freeable = refcount_is_zero(&hdr->b_refcnt); + } + + if (!HDR_FREED_IN_READ(hdr)) { + /* + * Only call arc_access on anonymous buffers. This is because + * if we've issued an I/O for an evicted buffer, we've already + * called arc_access (to prevent any simultaneous readers from + * getting confused). 
+ */ + if (zio->io_error == 0 && hdr->b_state == arc.anon) + arc_access(hdr, hash_lock); + mutex_exit(hash_lock); + } else { + /* + * This block was freed while we waited for the read to + * complete. It has been removed from the hash table and + * moved to the anonymous state (so that it won't show up + * in the cache). + */ + ASSERT3P(hdr->b_state, ==, arc.anon); + freeable = refcount_is_zero(&hdr->b_refcnt); + } + + cv_broadcast(&hdr->b_cv); + + /* execute each callback and free its structure */ + while ((acb = callback_list) != NULL) { + if (acb->acb_done) + acb->acb_done(zio, acb->acb_buf, acb->acb_private); + + if (acb->acb_zio_dummy != NULL) { + acb->acb_zio_dummy->io_error = zio->io_error; + zio_nowait(acb->acb_zio_dummy); + } + + callback_list = acb->acb_next; + kmem_free(acb, sizeof (arc_callback_t)); + } + + if (freeable) + arc_hdr_free(hdr); +} + +/* + * "Read" the block block at the specified DVA (in bp) via the + * cache. If the block is found in the cache, invoke the provided + * callback immediately and return. Note that the `zio' parameter + * in the callback will be NULL in this case, since no IO was + * required. If the block is not in the cache pass the read request + * on to the spa with a substitute callback function, so that the + * requested block will be added to the cache. + * + * If a read request arrives for a block that has a read in-progress, + * either wait for the in-progress read to complete (and return the + * results); or, if this is a read with a "done" func, add a record + * to the read to invoke the "done" func when the read completes, + * and return; or just return. + * + * arc_read_done() will invoke all the requested "done" functions + * for readers of this block. 
+ */ +int +arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, + arc_done_func_t *done, void *private, int priority, int flags, + uint32_t arc_flags) +{ + arc_buf_hdr_t *hdr; + arc_buf_t *buf; + kmutex_t *hash_lock; + zio_t *rzio; + +top: + hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); + if (hdr && hdr->b_buf) { + + ASSERT((hdr->b_state == arc.mru_top) || + (hdr->b_state == arc.mfu_top) || + ((hdr->b_state == arc.anon) && + (HDR_IO_IN_PROGRESS(hdr)))); + + if (HDR_IO_IN_PROGRESS(hdr)) { + + if ((arc_flags & ARC_NOWAIT) && done) { + arc_callback_t *acb = NULL; + + acb = kmem_zalloc(sizeof (arc_callback_t), + KM_SLEEP); + acb->acb_done = done; + acb->acb_private = private; + acb->acb_byteswap = swap; + if (pio != NULL) + acb->acb_zio_dummy = zio_null(pio, + spa, NULL, NULL, flags); + + ASSERT(acb->acb_done != NULL); + acb->acb_next = hdr->b_acb; + hdr->b_acb = acb; + add_reference(hdr, hash_lock, private); + mutex_exit(hash_lock); + return (0); + } else if (arc_flags & ARC_WAIT) { + cv_wait(&hdr->b_cv, hash_lock); + mutex_exit(hash_lock); + goto top; + } + + mutex_exit(hash_lock); + return (0); + } + + /* + * If there is already a reference on this block, create + * a new copy of the data so that we will be guaranteed + * that arc_release() will always succeed. 
+ */ + + if (done) + add_reference(hdr, hash_lock, private); + if (done && refcount_count(&hdr->b_refcnt) > 1) { + buf = kmem_cache_alloc(buf_cache, KM_SLEEP); + buf->b_data = zio_buf_alloc(hdr->b_size); + ASSERT3U(refcount_count(&hdr->b_refcnt), >, 1); + atomic_add_64(&arc.size, hdr->b_size); + bcopy(hdr->b_buf->b_data, buf->b_data, hdr->b_size); + buf->b_hdr = hdr; + buf->b_next = hdr->b_buf; + hdr->b_buf = buf; + atomic_add_64(&hdr->b_state->size, hdr->b_size); + } else { + buf = hdr->b_buf; + } + DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); + arc_access(hdr, hash_lock); + mutex_exit(hash_lock); + atomic_add_64(&arc.hits, 1); + if (done) + done(NULL, buf, private); + } else { + uint64_t size = BP_GET_LSIZE(bp); + arc_callback_t *acb; + + if (hdr == NULL) { + /* this block is not in the cache */ + arc_buf_hdr_t *exists; + + buf = arc_buf_alloc(spa, size, private); + hdr = buf->b_hdr; + hdr->b_dva = *BP_IDENTITY(bp); + hdr->b_birth = bp->blk_birth; + hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; + exists = buf_hash_insert(hdr, &hash_lock); + if (exists) { + /* somebody beat us to the hash insert */ + mutex_exit(hash_lock); + bzero(&hdr->b_dva, sizeof (dva_t)); + hdr->b_birth = 0; + hdr->b_cksum0 = 0; + arc_buf_free(buf, private); + goto top; /* restart the IO request */ + } + + } else { + /* this block is in the ghost cache */ + ASSERT((hdr->b_state == arc.mru_bot) || + (hdr->b_state == arc.mfu_bot)); + add_reference(hdr, hash_lock, private); + + buf = kmem_cache_alloc(buf_cache, KM_SLEEP); + buf->b_data = zio_buf_alloc(hdr->b_size); + atomic_add_64(&arc.size, hdr->b_size); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1); + buf->b_hdr = hdr; + buf->b_next = NULL; + hdr->b_buf = buf; + } + + acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); + acb->acb_done = done; + acb->acb_private = private; + acb->acb_byteswap = swap; + + ASSERT(hdr->b_acb == NULL); + hdr->b_acb = acb; + + /* + * If this DVA is part of a prefetch, mark the 
buf + * header with the prefetch flag + */ + if (arc_flags & ARC_PREFETCH) + hdr->b_flags |= ARC_PREFETCH; + hdr->b_flags |= ARC_IO_IN_PROGRESS; + + /* + * If the buffer has been evicted, migrate it to a present state + * before issuing the I/O. Once we drop the hash-table lock, + * the header will be marked as I/O in progress and have an + * attached buffer. At this point, anybody who finds this + * buffer ought to notice that it's legit but has a pending I/O. + */ + + if ((hdr->b_state == arc.mru_bot) || + (hdr->b_state == arc.mfu_bot)) + arc_access(hdr, hash_lock); + + mutex_exit(hash_lock); + + ASSERT3U(hdr->b_size, ==, size); + DTRACE_PROBE2(arc__miss, blkptr_t *, bp, + uint64_t, size); + atomic_add_64(&arc.misses, 1); + rzio = zio_read(pio, spa, bp, buf->b_data, size, + arc_read_done, buf, priority, flags); + + if (arc_flags & ARC_WAIT) + return (zio_wait(rzio)); + + ASSERT(arc_flags & ARC_NOWAIT); + zio_nowait(rzio); + } + return (0); +} + +/* + * arc_read() variant to support pool traversal. If the block is already + * in the ARC, make a copy of it; otherwise, the caller will do the I/O. + * The idea is that we don't want pool traversal filling up memory, but + * if the ARC already has the data anyway, we shouldn't pay for the I/O. + */ +int +arc_tryread(spa_t *spa, blkptr_t *bp, void *data) +{ + arc_buf_hdr_t *hdr; + kmutex_t *hash_mtx; + int rc = 0; + + hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); + + if (hdr && hdr->b_buf && !HDR_IO_IN_PROGRESS(hdr)) + bcopy(hdr->b_buf->b_data, data, hdr->b_size); + else + rc = ENOENT; + + if (hash_mtx) + mutex_exit(hash_mtx); + + return (rc); +} + +/* + * Release this buffer from the cache. This must be done + * after a read and prior to modifying the buffer contents. + * If the buffer has more than one reference, we must make + * make a new hdr for the buffer. 
+ */ +void +arc_release(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + kmutex_t *hash_lock = HDR_LOCK(hdr); + + /* this buffer is not on any list */ + ASSERT(refcount_count(&hdr->b_refcnt) > 0); + + if (hdr->b_state == arc.anon) { + /* this buffer is already released */ + ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1); + ASSERT(BUF_EMPTY(hdr)); + return; + } + + mutex_enter(hash_lock); + + if (refcount_count(&hdr->b_refcnt) > 1) { + arc_buf_hdr_t *nhdr; + arc_buf_t **bufp; + uint64_t blksz = hdr->b_size; + spa_t *spa = hdr->b_spa; + + /* + * Pull the data off of this buf and attach it to + * a new anonymous buf. + */ + bufp = &hdr->b_buf; + while (*bufp != buf) { + ASSERT(*bufp); + bufp = &(*bufp)->b_next; + } + *bufp = (*bufp)->b_next; + (void) refcount_remove(&hdr->b_refcnt, tag); + ASSERT3U(hdr->b_state->size, >=, hdr->b_size); + atomic_add_64(&hdr->b_state->size, -hdr->b_size); + mutex_exit(hash_lock); + + nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); + nhdr->b_size = blksz; + nhdr->b_spa = spa; + nhdr->b_buf = buf; + nhdr->b_state = arc.anon; + nhdr->b_arc_access = 0; + nhdr->b_flags = 0; + buf->b_hdr = nhdr; + buf->b_next = NULL; + (void) refcount_add(&nhdr->b_refcnt, tag); + atomic_add_64(&arc.anon->size, blksz); + + hdr = nhdr; + } else { + ASSERT(!list_link_active(&hdr->b_arc_node)); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + arc_change_state(arc.anon, hdr, hash_lock); + hdr->b_arc_access = 0; + mutex_exit(hash_lock); + bzero(&hdr->b_dva, sizeof (dva_t)); + hdr->b_birth = 0; + hdr->b_cksum0 = 0; + } +} + +int +arc_released(arc_buf_t *buf) +{ + return (buf->b_hdr->b_state == arc.anon); +} + +static void +arc_write_done(zio_t *zio) +{ + arc_buf_t *buf; + arc_buf_hdr_t *hdr; + arc_callback_t *acb; + + buf = zio->io_private; + hdr = buf->b_hdr; + acb = hdr->b_acb; + hdr->b_acb = NULL; + + /* this buffer is on no lists and is not in the hash table */ + ASSERT3P(hdr->b_state, ==, arc.anon); + + hdr->b_dva = *BP_IDENTITY(zio->io_bp); + 
hdr->b_birth = zio->io_bp->blk_birth; + hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; + /* clear the "in-write" flag */ + hdr->b_hash_next = NULL; + /* This write may be all-zero */ + if (!BUF_EMPTY(hdr)) { + arc_buf_hdr_t *exists; + kmutex_t *hash_lock; + + exists = buf_hash_insert(hdr, &hash_lock); + if (exists) { + /* + * This can only happen if we overwrite for + * sync-to-convergence, because we remove + * buffers from the hash table when we arc_free(). + */ + ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), + BP_IDENTITY(zio->io_bp))); + ASSERT3U(zio->io_bp_orig.blk_birth, ==, + zio->io_bp->blk_birth); + + ASSERT(refcount_is_zero(&exists->b_refcnt)); + arc_change_state(arc.anon, exists, hash_lock); + mutex_exit(hash_lock); + arc_hdr_free(exists); + exists = buf_hash_insert(hdr, &hash_lock); + ASSERT3P(exists, ==, NULL); + } + arc_access(hdr, hash_lock); + mutex_exit(hash_lock); + } + if (acb && acb->acb_done) { + ASSERT(!refcount_is_zero(&hdr->b_refcnt)); + acb->acb_done(zio, buf, acb->acb_private); + } + + if (acb) + kmem_free(acb, sizeof (arc_callback_t)); +} + +int +arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, + uint64_t txg, blkptr_t *bp, arc_buf_t *buf, + arc_done_func_t *done, void *private, int priority, int flags, + uint32_t arc_flags) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + arc_callback_t *acb; + zio_t *rzio; + + /* this is a private buffer - no locking required */ + ASSERT3P(hdr->b_state, ==, arc.anon); + ASSERT(BUF_EMPTY(hdr)); + ASSERT(!HDR_IO_ERROR(hdr)); + acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); + acb->acb_done = done; + acb->acb_private = private; + acb->acb_byteswap = (arc_byteswap_func_t *)-1; + hdr->b_acb = acb; + rzio = zio_write(pio, spa, checksum, compress, txg, bp, + buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags); + + if (arc_flags & ARC_WAIT) + return (zio_wait(rzio)); + + ASSERT(arc_flags & ARC_NOWAIT); + zio_nowait(rzio); + + return (0); +} + +int +arc_free(zio_t *pio, spa_t *spa, 
uint64_t txg, blkptr_t *bp, + zio_done_func_t *done, void *private, uint32_t arc_flags) +{ + arc_buf_hdr_t *ab; + kmutex_t *hash_lock; + zio_t *zio; + + /* + * If this buffer is in the cache, release it, so it + * can be re-used. + */ + ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); + if (ab != NULL) { + /* + * The checksum of blocks to free is not always + * preserved (eg. on the deadlist). However, if it is + * nonzero, it should match what we have in the cache. + */ + ASSERT(bp->blk_cksum.zc_word[0] == 0 || + ab->b_cksum0 == bp->blk_cksum.zc_word[0]); + arc_change_state(arc.anon, ab, hash_lock); + if (refcount_is_zero(&ab->b_refcnt)) { + mutex_exit(hash_lock); + arc_hdr_free(ab); + atomic_add_64(&arc.deleted, 1); + } else { + ASSERT3U(refcount_count(&ab->b_refcnt), ==, 1); + if (HDR_IO_IN_PROGRESS(ab)) + ab->b_flags |= ARC_FREED_IN_READ; + ab->b_arc_access = 0; + bzero(&ab->b_dva, sizeof (dva_t)); + ab->b_birth = 0; + ab->b_cksum0 = 0; + mutex_exit(hash_lock); + } + } + + zio = zio_free(pio, spa, txg, bp, done, private); + + if (arc_flags & ARC_WAIT) + return (zio_wait(zio)); + + ASSERT(arc_flags & ARC_NOWAIT); + zio_nowait(zio); + + return (0); +} + +void +arc_tempreserve_clear(uint64_t tempreserve) +{ + atomic_add_64(&arc_tempreserve, -tempreserve); + ASSERT((int64_t)arc_tempreserve >= 0); +} + +int +arc_tempreserve_space(uint64_t tempreserve) +{ +#ifdef ZFS_DEBUG + /* + * Once in a while, fail for no reason. Everything should cope. + */ + if (spa_get_random(10000) == 0) { + dprintf("forcing random failure\n"); + return (ERESTART); + } +#endif + /* + * XXX This is kind of hacky. The limit should be adjusted + * dynamically to keep the time to sync a dataset fixed (around + * 1-5 seconds?). + * Maybe should have some sort of locking? If two requests come + * in concurrently, we might let them both succeed, when one of + * them should fail. Not a huge deal. 
+ */ + + ASSERT3U(tempreserve, <, arc.c/4); /* otherwise we'll loop forever */ + + if (arc_tempreserve + tempreserve + arc.anon->size > arc.c / 4) { + dprintf("failing, arc_tempreserve=%lluK anon=%lluK " + "tempreserve=%lluK arc.c=%lluK\n", + arc_tempreserve>>10, arc.anon->lsize>>10, + tempreserve>>10, arc.c>>10); + return (ERESTART); + } + atomic_add_64(&arc_tempreserve, tempreserve); + return (0); +} + +void +arc_init(void) +{ + mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); + + /* Start out with 1/8 of all memory */ + arc.c = physmem * PAGESIZE / 8; + +#ifdef _KERNEL + /* + * On architectures where the physical memory can be larger + * than the addressable space (intel in 32-bit mode), we may + * need to limit the cache to 1/8 of VM size. + */ + arc.c = MIN(arc.c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); +#endif + + /* use at least 1/32 of all memory, or 32MB, whichever is more */ + arc.c_min = MAX(arc.c / 4, 64<<20); + /* use at most 3/4 of all memory, or all but 1GB, whichever is more */ + if (arc.c * 8 >= 1<<30) + arc.c_max = (arc.c * 8) - (1<<30); + else + arc.c_max = arc.c_min; + arc.c_max = MAX(arc.c * 6, arc.c_max); + arc.c = arc.c_max; + arc.p = (arc.c >> 1); + + /* if kmem_flags are set, lets try to use less memory */ + if (kmem_debugging()) + arc.c = arc.c / 2; + if (arc.c < arc.c_min) + arc.c = arc.c_min; + + arc.anon = &ARC_anon; + arc.mru_top = &ARC_mru_top; + arc.mru_bot = &ARC_mru_bot; + arc.mfu_top = &ARC_mfu_top; + arc.mfu_bot = &ARC_mfu_bot; + + list_create(&arc.mru_top->list, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc.mru_bot->list, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc.mfu_top->list, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc.mfu_bot->list, sizeof (arc_buf_hdr_t), + 
offsetof(arc_buf_hdr_t, b_arc_node)); + + buf_init(); + + arc_thread_exit = 0; + + (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); +} + +void +arc_fini(void) +{ + mutex_enter(&arc_reclaim_thr_lock); + arc_thread_exit = 1; + while (arc_thread_exit != 0) + cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); + mutex_exit(&arc_reclaim_thr_lock); + + arc_flush(); + + arc_dead = TRUE; + + mutex_destroy(&arc_reclaim_lock); + mutex_destroy(&arc_reclaim_thr_lock); + cv_destroy(&arc_reclaim_thr_cv); + + list_destroy(&arc.mru_top->list); + list_destroy(&arc.mru_bot->list); + list_destroy(&arc.mfu_top->list); + list_destroy(&arc.mfu_bot->list); + + buf_fini(); +} diff --git a/usr/src/uts/common/fs/zfs/bplist.c b/usr/src/uts/common/fs/zfs/bplist.c new file mode 100644 index 0000000000..68f79ac5a2 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/bplist.c @@ -0,0 +1,239 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/bplist.h> +#include <sys/zfs_context.h> + +static void +bplist_hold(bplist_t *bpl) +{ + ASSERT(MUTEX_HELD(&bpl->bpl_lock)); + if (bpl->bpl_dbuf == NULL) { + bpl->bpl_dbuf = dmu_bonus_hold_tag(bpl->bpl_mos, + bpl->bpl_object, bpl); + dmu_buf_read(bpl->bpl_dbuf); + bpl->bpl_phys = bpl->bpl_dbuf->db_data; + } +} + +uint64_t +bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx) +{ + uint64_t obj; + + obj = dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize, + DMU_OT_BPLIST_HDR, sizeof (bplist_phys_t), tx); + + return (obj); +} + +void +bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx) +{ + VERIFY(dmu_object_free(mos, object, tx) == 0); +} + +void +bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object) +{ + dmu_object_info_t doi; + + VERIFY(dmu_object_info(mos, object, &doi) == 0); + + mutex_enter(&bpl->bpl_lock); + + ASSERT(bpl->bpl_dbuf == NULL); + ASSERT(bpl->bpl_phys == NULL); + ASSERT(bpl->bpl_cached_dbuf == NULL); + ASSERT(bpl->bpl_queue == NULL); + ASSERT(object != 0); + + bpl->bpl_mos = mos; + bpl->bpl_object = object; + bpl->bpl_blockshift = highbit(doi.doi_data_block_size - 1); + bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT; + + mutex_exit(&bpl->bpl_lock); +} + +void +bplist_close(bplist_t *bpl) +{ + mutex_enter(&bpl->bpl_lock); + + ASSERT(bpl->bpl_queue == NULL); + + if (bpl->bpl_cached_dbuf) { + dmu_buf_rele(bpl->bpl_cached_dbuf); + bpl->bpl_cached_dbuf = NULL; + } + if (bpl->bpl_dbuf) { + dmu_buf_rele_tag(bpl->bpl_dbuf, bpl); + bpl->bpl_dbuf = NULL; + bpl->bpl_phys = NULL; + } + + mutex_exit(&bpl->bpl_lock); +} + +boolean_t +bplist_empty(bplist_t *bpl) +{ + boolean_t rv; + + if (bpl->bpl_object == 0) + return (B_TRUE); + + mutex_enter(&bpl->bpl_lock); + bplist_hold(bpl); + rv = (bpl->bpl_phys->bpl_entries == 0); + mutex_exit(&bpl->bpl_lock); + + return (rv); +} + +int +bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp) +{ + uint64_t blk, off; + blkptr_t 
*bparray; + dmu_buf_t *db; + + mutex_enter(&bpl->bpl_lock); + bplist_hold(bpl); + + if (*itorp >= bpl->bpl_phys->bpl_entries) { + mutex_exit(&bpl->bpl_lock); + return (ENOENT); + } + + blk = *itorp >> bpl->bpl_bpshift; + off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift); + db = bpl->bpl_cached_dbuf; + + if (db == NULL || db->db_offset != (blk << bpl->bpl_blockshift)) { + if (db != NULL) + dmu_buf_rele(db); + bpl->bpl_cached_dbuf = db = dmu_buf_hold(bpl->bpl_mos, + bpl->bpl_object, blk << bpl->bpl_blockshift); + } + + ASSERT3U(db->db_size, ==, 1ULL << bpl->bpl_blockshift); + + dmu_buf_read(db); + bparray = db->db_data; + *bp = bparray[off]; + (*itorp)++; + mutex_exit(&bpl->bpl_lock); + return (0); +} + +void +bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx) +{ + uint64_t blk, off; + blkptr_t *bparray; + dmu_buf_t *db; + + ASSERT(!BP_IS_HOLE(bp)); + mutex_enter(&bpl->bpl_lock); + bplist_hold(bpl); + + blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift; + off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift); + db = bpl->bpl_cached_dbuf; + + if (db == NULL || db->db_offset != (blk << bpl->bpl_blockshift)) { + if (db != NULL) + dmu_buf_rele(db); + bpl->bpl_cached_dbuf = db = dmu_buf_hold(bpl->bpl_mos, + bpl->bpl_object, blk << bpl->bpl_blockshift); + } + + ASSERT3U(db->db_size, ==, 1ULL << bpl->bpl_blockshift); + + dmu_buf_will_dirty(db, tx); + bparray = db->db_data; + bparray[off] = *bp; + + /* We never need the fill count. */ + bparray[off].blk_fill = 0; + + /* The bplist will compress better if we can leave off the checksum */ + bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum)); + + dmu_buf_will_dirty(bpl->bpl_dbuf, tx); + bpl->bpl_phys->bpl_entries++; + bpl->bpl_phys->bpl_bytes += BP_GET_ASIZE(bp); + mutex_exit(&bpl->bpl_lock); +} + +/* + * Deferred entry; will be written later by bplist_sync(). 
+ */ +void +bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp) +{ + bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP); + + ASSERT(!BP_IS_HOLE(bp)); + mutex_enter(&bpl->bpl_lock); + bpq->bpq_blk = *bp; + bpq->bpq_next = bpl->bpl_queue; + bpl->bpl_queue = bpq; + mutex_exit(&bpl->bpl_lock); +} + +void +bplist_sync(bplist_t *bpl, dmu_tx_t *tx) +{ + bplist_q_t *bpq; + + mutex_enter(&bpl->bpl_lock); + while ((bpq = bpl->bpl_queue) != NULL) { + bpl->bpl_queue = bpq->bpq_next; + mutex_exit(&bpl->bpl_lock); + bplist_enqueue(bpl, &bpq->bpq_blk, tx); + kmem_free(bpq, sizeof (*bpq)); + mutex_enter(&bpl->bpl_lock); + } + mutex_exit(&bpl->bpl_lock); +} + +void +bplist_vacate(bplist_t *bpl, dmu_tx_t *tx) +{ + mutex_enter(&bpl->bpl_lock); + ASSERT3P(bpl->bpl_queue, ==, NULL); + bplist_hold(bpl); + dmu_buf_will_dirty(bpl->bpl_dbuf, tx); + dmu_free_range(bpl->bpl_mos, bpl->bpl_object, 0, -1ULL, tx); + bpl->bpl_phys->bpl_entries = 0; + bpl->bpl_phys->bpl_bytes = 0; + mutex_exit(&bpl->bpl_lock); +} diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c new file mode 100644 index 0000000000..e4b2d7f9e6 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -0,0 +1,2022 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dbuf.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dmu_tx.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu_zfetch.h> + +static void dbuf_destroy(dmu_buf_impl_t *db); +static void dbuf_verify(dmu_buf_impl_t *db); +static void dbuf_evict_user(dmu_buf_impl_t *db); +static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +static arc_done_func_t dbuf_read_done; +static arc_done_func_t dbuf_write_done; + +/* + * Global data structures and functions for the dbuf cache. 
+ */ +taskq_t *dbuf_tq; +static kmem_cache_t *dbuf_cache; + +/* ARGSUSED */ +static int +dbuf_cons(void *vdb, void *unused, int kmflag) +{ + dmu_buf_impl_t *db = vdb; + bzero(db, sizeof (dmu_buf_impl_t)); + + mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); + cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); + refcount_create(&db->db_holds); + return (0); +} + +/* ARGSUSED */ +static void +dbuf_dest(void *vdb, void *unused) +{ + dmu_buf_impl_t *db = vdb; + mutex_destroy(&db->db_mtx); + cv_destroy(&db->db_changed); + refcount_destroy(&db->db_holds); +} + +/* + * dbuf hash table routines + */ +static dbuf_hash_table_t dbuf_hash_table; + +static uint64_t dbuf_hash_count; + +static uint64_t +dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) +{ + uintptr_t osv = (uintptr_t)os; + uint64_t crc = -1ULL; + + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; + + crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); + + return (crc); +} + +#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); + +#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ + ((dbuf)->db.db_object == (obj) && \ + (dbuf)->db_objset == (os) && \ + (dbuf)->db_level == (level) && \ + (dbuf)->db_blkid == (blkid)) + +dmu_buf_impl_t * +dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + objset_impl_t *os = dn->dn_objset; + uint64_t obj = dn->dn_object; + uint64_t hv = DBUF_HASH(os, obj, level, blkid); + uint64_t idx = hv & h->hash_table_mask; + dmu_buf_impl_t *db; + + mutex_enter(DBUF_HASH_MUTEX(h, idx)); + for (db = h->hash_table[idx]; db != NULL; db = 
db->db_hash_next) { + if (DBUF_EQUAL(db, os, obj, level, blkid)) { + mutex_enter(&db->db_mtx); + if (!refcount_is_zero(&db->db_holds)) { + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + return (db); + } + mutex_exit(&db->db_mtx); + } + } + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + return (NULL); +} + +/* + * Insert an entry into the hash table. If there is already an element + * equal to elem in the hash table, then the already existing element + * will be returned and the new element will not be inserted. + * Otherwise returns NULL. + */ +static dmu_buf_impl_t * +dbuf_hash_insert(dmu_buf_impl_t *db) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + objset_impl_t *os = db->db_objset; + uint64_t obj = db->db.db_object; + int level = db->db_level; + uint64_t blkid = db->db_blkid; + uint64_t hv = DBUF_HASH(os, obj, level, blkid); + uint64_t idx = hv & h->hash_table_mask; + dmu_buf_impl_t *dbf; + + mutex_enter(DBUF_HASH_MUTEX(h, idx)); + for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { + if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { + mutex_enter(&dbf->db_mtx); + if (!refcount_is_zero(&dbf->db_holds)) { + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + return (dbf); + } + mutex_exit(&dbf->db_mtx); + } + } + + mutex_enter(&db->db_mtx); + db->db_hash_next = h->hash_table[idx]; + h->hash_table[idx] = db; + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + atomic_add_64(&dbuf_hash_count, 1); + + return (NULL); +} + +/* + * Remove an entry from the hash table. This operation will + * fail if there are any existing holds on the db. + */ +static void +dbuf_hash_remove(dmu_buf_impl_t *db) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, + db->db_level, db->db_blkid); + uint64_t idx = hv & h->hash_table_mask; + dmu_buf_impl_t *dbf, **dbp; + + /* + * We musn't hold db_mtx to maintin lock ordering: + * DBUF_HASH_MUTEX > db_mtx. 
+ */ + ASSERT(refcount_is_zero(&db->db_holds)); + ASSERT(db->db_dnode != NULL); + ASSERT(!MUTEX_HELD(&db->db_mtx)); + + mutex_enter(DBUF_HASH_MUTEX(h, idx)); + dbp = &h->hash_table[idx]; + while ((dbf = *dbp) != db) { + dbp = &dbf->db_hash_next; + ASSERT(dbf != NULL); + } + *dbp = db->db_hash_next; + db->db_hash_next = NULL; + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + atomic_add_64(&dbuf_hash_count, -1); +} + +static int dbuf_evictable(dmu_buf_impl_t *db); +static void dbuf_clear(dmu_buf_impl_t *db); + +void +dbuf_evict(dmu_buf_impl_t *db) +{ + int err; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + err = dbuf_evictable(db); + ASSERT(err == TRUE); + dbuf_clear(db); + dbuf_destroy(db); +} + +static void +dbuf_evict_user(dmu_buf_impl_t *db) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (db->db_level != 0 || db->db_d.db_evict_func == NULL) + return; + + if (db->db_d.db_user_data_ptr_ptr) + *db->db_d.db_user_data_ptr_ptr = db->db.db_data; + db->db_d.db_evict_func(&db->db, db->db_d.db_user_ptr); + db->db_d.db_user_ptr = NULL; + db->db_d.db_user_data_ptr_ptr = NULL; + db->db_d.db_evict_func = NULL; +} + +void +dbuf_init(void) +{ + uint64_t hsize = 1; + dbuf_hash_table_t *h = &dbuf_hash_table; + int i; + + /* + * The hash table is big enough to fill all of physical memory + * with an average 64k block size. The table will take up + * totalmem*sizeof(void*)/64k bytes (i.e. 128KB/GB with 8-byte + * pointers). 
+ */ + while (hsize * 65536 < physmem * PAGESIZE) + hsize <<= 1; + + h->hash_table_mask = hsize - 1; + h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_SLEEP); + + dbuf_cache = kmem_cache_create("dmu_buf_impl_t", + sizeof (dmu_buf_impl_t), + 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); + dbuf_tq = taskq_create("dbuf_tq", 8, maxclsyspri, 50, INT_MAX, + TASKQ_PREPOPULATE); + + for (i = 0; i < DBUF_MUTEXES; i++) + mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); +} + +void +dbuf_fini(void) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + int i; + + taskq_destroy(dbuf_tq); + dbuf_tq = NULL; + + for (i = 0; i < DBUF_MUTEXES; i++) + mutex_destroy(&h->hash_mutexes[i]); + kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); + kmem_cache_destroy(dbuf_cache); +} + +/* + * Other stuff. + */ + +static void +dbuf_verify(dmu_buf_impl_t *db) +{ +#ifdef ZFS_DEBUG + int i; + dnode_t *dn = db->db_dnode; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) + return; + + ASSERT(db->db_objset != NULL); + if (dn == NULL) { + ASSERT(db->db_parent == NULL); + ASSERT(db->db_blkptr == NULL); + } else { + ASSERT3U(db->db.db_object, ==, dn->dn_object); + ASSERT3P(db->db_objset, ==, dn->dn_objset); + ASSERT(list_head(&dn->dn_dbufs)); + ASSERT3U(db->db_level, <, dn->dn_nlevels); + } + if (db->db_blkid == DB_BONUS_BLKID) { + ASSERT(dn != NULL); + ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen); + ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID); + } else { + ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); + } + + if (db->db_level == 0) { + void **udpp = db->db_d.db_user_data_ptr_ptr; + /* we can be momentarily larger in dnode_set_blksz() */ + if (db->db_blkid != DB_BONUS_BLKID && dn) { + ASSERT3U(db->db.db_size, >=, dn->dn_datablksz); + } + if (udpp) { + ASSERT((refcount_is_zero(&db->db_holds) && + *udpp == NULL) || + (!refcount_is_zero(&db->db_holds) && + *udpp == db->db.db_data)); + } + + if 
(IS_DNODE_DNODE(db->db.db_object)) { + for (i = 0; i < TXG_SIZE; i++) { + /* + * it should only be modified in syncing + * context, so make sure we only have + * one copy of the data. + */ + ASSERT(db->db_d.db_data_old[i] == NULL || + db->db_d.db_data_old[i] == db->db_buf); + } + } + } + + /* verify db->db_blkptr */ + if (db->db_blkptr) { + if (db->db_parent == dn->dn_dbuf) { + /* db is pointed to by the dnode */ + /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ + if (IS_DNODE_DNODE(db->db.db_object)) + ASSERT(db->db_parent == NULL); + else + ASSERT(db->db_parent != NULL); + ASSERT3P(db->db_blkptr, ==, + &dn->dn_phys->dn_blkptr[db->db_blkid]); + } else { + /* db is pointed to by an indirect block */ + int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; + ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); + ASSERT3U(db->db_parent->db.db_object, ==, + db->db.db_object); + /* + * dnode_grow_indblksz() can make this fail if we don't + * have the struct_rwlock. XXX indblksz no longer + * grows. safe to do this now? + */ + if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) { + ASSERT3P(db->db_blkptr, ==, + ((blkptr_t *)db->db_parent->db.db_data + + db->db_blkid % epb)); + } + } + } + if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && + db->db.db_data && db->db_blkid != DB_BONUS_BLKID && + db->db_state != DB_FILL && !dn->dn_free_txg) { + /* + * If the blkptr isn't set but they have nonzero data, + * it had better be dirty, otherwise we'll lose that + * data when we evict this buffer. 
+ */ + if (db->db_dirtycnt == 0) { + uint64_t *buf = db->db.db_data; + int i; + + for (i = 0; i < db->db.db_size >> 3; i++) { + ASSERT(buf[i] == 0); + } + } + } +#endif +} + +static void +dbuf_update_data(dmu_buf_impl_t *db) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + if (db->db_level == 0 && db->db_d.db_user_data_ptr_ptr) { + ASSERT(!refcount_is_zero(&db->db_holds)); + *db->db_d.db_user_data_ptr_ptr = db->db.db_data; + } +} + +static void +dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(buf->b_data != NULL); + db->db_buf = buf; + db->db.db_data = buf->b_data; + dbuf_update_data(db); +} + +uint64_t +dbuf_whichblock(dnode_t *dn, uint64_t offset) +{ + if (dn->dn_datablkshift) { + return (offset >> dn->dn_datablkshift); + } else { + ASSERT3U(offset, <, dn->dn_datablksz); + return (0); + } +} + +static void +dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) +{ + dmu_buf_impl_t *db = vdb; + + mutex_enter(&db->db_mtx); + ASSERT3U(db->db_state, ==, DB_READ); + /* + * All reads are synchronous, so we must have a hold on the dbuf + */ + ASSERT(refcount_count(&db->db_holds) > 0); + ASSERT(db->db.db_data == NULL); + if (db->db_level == 0 && db->db_d.db_freed_in_flight) { + /* we were freed in flight; disregard any error */ + arc_release(buf, db); + bzero(buf->b_data, db->db.db_size); + db->db_d.db_freed_in_flight = FALSE; + dbuf_set_data(db, buf); + db->db_state = DB_CACHED; + } else if (zio == NULL || zio->io_error == 0) { + dbuf_set_data(db, buf); + db->db_state = DB_CACHED; + } else { + ASSERT(db->db_blkid != DB_BONUS_BLKID); + arc_buf_free(buf, db); + db->db_state = DB_UNCACHED; + ASSERT3P(db->db_buf, ==, NULL); + } + cv_broadcast(&db->db_changed); + mutex_exit(&db->db_mtx); +} + +void +dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) +{ + arc_buf_t *buf; + blkptr_t *bp; + + ASSERT(!refcount_is_zero(&db->db_holds)); + /* We need the struct_rwlock to prevent db_blkptr from changing. 
*/ + ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock)); + + /* + * prefetch only data blocks (level 0) -- don't prefetch indirect + * blocks + */ + if ((db->db_level > 0) || (db->db_blkid == DB_BONUS_BLKID)) { + flags |= DB_RF_NOPREFETCH; + } + + if (((flags & DB_RF_NOPREFETCH) == 0) && (db->db_dnode != NULL)) { + dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, + db->db.db_size); + } + + if (db->db_state == DB_CACHED) { + ASSERT(db->db.db_data != NULL); + return; + } + + mutex_enter(&db->db_mtx); + + if (db->db_state != DB_UNCACHED) { + mutex_exit(&db->db_mtx); + return; + } + + ASSERT3U(db->db_state, ==, DB_UNCACHED); + + if (db->db_blkid == DB_BONUS_BLKID) { + ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size); + buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, + DN_MAX_BONUSLEN, db); + if (db->db.db_size < DN_MAX_BONUSLEN) + bzero(buf->b_data, DN_MAX_BONUSLEN); + bcopy(DN_BONUS(db->db_dnode->dn_phys), buf->b_data, + db->db.db_size); + dbuf_set_data(db, buf); + db->db_state = DB_CACHED; + mutex_exit(&db->db_mtx); + return; + } + + if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid)) + bp = NULL; + else + bp = db->db_blkptr; + + if (bp == NULL) + dprintf_dbuf(db, "blkptr: %s\n", "NULL"); + else + dprintf_dbuf_bp(db, bp, "%s", "blkptr:"); + + if (bp == NULL || BP_IS_HOLE(bp)) { + ASSERT(bp == NULL || BP_IS_HOLE(bp)); + dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, + db->db.db_size, db)); + bzero(db->db.db_data, db->db.db_size); + db->db_state = DB_CACHED; + mutex_exit(&db->db_mtx); + return; + } + + db->db_state = DB_READ; + mutex_exit(&db->db_mtx); + + /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ + (void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp, + db->db_level > 0 ? byteswap_uint64_array : + dmu_ot[db->db_dnode->dn_type].ot_byteswap, + dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, + (flags & DB_RF_CANFAIL) ? 
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, + ARC_NOWAIT); +} + +static int +dbuf_read_generic(dmu_buf_impl_t *db, uint32_t flags) +{ + zio_t *zio; + int err; + + /* + * We don't have to hold the mutex to check db_state because it + * can't be freed while we have a hold on the buffer. + */ + ASSERT(!refcount_is_zero(&db->db_holds)); + if (db->db_state == DB_CACHED) + return (0); + + if (db->db_state == DB_UNCACHED) { + zio = zio_root(db->db_dnode->dn_objset->os_spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); + dbuf_read_impl(db, zio, flags); + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_exit(&db->db_dnode->dn_struct_rwlock); + err = zio_wait(zio); + if (err) + return (err); + } + + mutex_enter(&db->db_mtx); + while (db->db_state == DB_READ || db->db_state == DB_FILL) { + ASSERT(db->db_state == DB_READ || + (flags & DB_RF_HAVESTRUCT) == 0); + cv_wait(&db->db_changed, &db->db_mtx); + } + ASSERT3U(db->db_state, ==, DB_CACHED); + mutex_exit(&db->db_mtx); + + return (0); +} + +#pragma weak dmu_buf_read = dbuf_read +void +dbuf_read(dmu_buf_impl_t *db) +{ + int err; + + err = dbuf_read_generic(db, DB_RF_MUST_SUCCEED); + ASSERT(err == 0); +} + +#pragma weak dmu_buf_read_canfail = dbuf_read_canfail +int +dbuf_read_canfail(dmu_buf_impl_t *db) +{ + return (dbuf_read_generic(db, DB_RF_CANFAIL)); +} + +void +dbuf_read_havestruct(dmu_buf_impl_t *db) +{ + int err; + + ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock)); + err = dbuf_read_generic(db, (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH)); + ASSERT(err == 0); +} + +static void +dbuf_noread(dmu_buf_impl_t *db) +{ + ASSERT(!refcount_is_zero(&db->db_holds)); + mutex_enter(&db->db_mtx); + while (db->db_state == DB_READ || db->db_state == DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + if (db->db_state == DB_UNCACHED) { + int blksz = (db->db_blkid == DB_BONUS_BLKID) ? 
+ DN_MAX_BONUSLEN : db->db.db_size; + ASSERT(db->db.db_data == NULL); + dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, + blksz, db)); + db->db_state = DB_FILL; + } else { + ASSERT3U(db->db_state, ==, DB_CACHED); + } + mutex_exit(&db->db_mtx); +} + +/* + * This is our just-in-time copy function. It makes a copy of + * buffers, that have been modified in a previous transaction + * group, before we modify them in the current active group. + * + * This function is used in two places: when we are dirtying a + * buffer for the first time in a txg, and when we are freeing + * a range in a dnode that includes this buffer. + * + * Note that when we are called from dbuf_free_range() we do + * not put a hold on the buffer, we just traverse the active + * dbuf list for the dnode. + */ +static void +dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) +{ + arc_buf_t **quiescing, **syncing; + int size = (db->db_blkid == DB_BONUS_BLKID) ? + DN_MAX_BONUSLEN : db->db.db_size; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db.db_data != NULL); + + quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK]; + syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK]; + + /* + * If this buffer is referenced from the current quiescing + * transaction group: either make a copy and reset the reference + * to point to the copy, or (if there a no active holders) just + * null out the current db_data pointer. + */ + if (*quiescing == db->db_buf) { + /* + * If the quiescing txg is "dirty", then we better not + * be referencing the same buffer from the syncing txg. 
+ */ + ASSERT(*syncing != db->db_buf); + if (refcount_count(&db->db_holds) > db->db_dirtycnt) { + *quiescing = arc_buf_alloc( + db->db_dnode->dn_objset->os_spa, size, db); + bcopy(db->db.db_data, (*quiescing)->b_data, size); + } else { + db->db.db_data = NULL; + db->db_buf = NULL; + db->db_state = DB_UNCACHED; + } + return; + } + + /* + * If this buffer is referenced from the current syncing + * transaction group: either + * 1 - make a copy and reset the reference, or + * 2 - if there are no holders, just null the current db_data. + */ + if (*syncing == db->db_buf) { + ASSERT3P(*quiescing, ==, NULL); + ASSERT3U(db->db_dirtycnt, ==, 1); + if (refcount_count(&db->db_holds) > db->db_dirtycnt) { + /* we can't copy if we have already started a write */ + ASSERT(*syncing != db->db_data_pending); + *syncing = arc_buf_alloc( + db->db_dnode->dn_objset->os_spa, size, db); + bcopy(db->db.db_data, (*syncing)->b_data, size); + } else { + db->db.db_data = NULL; + db->db_buf = NULL; + db->db_state = DB_UNCACHED; + } + } +} + +void +dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + if (db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC) { + db->db_d.db_overridden_by[txg&TXG_MASK] = NULL; + } else if (db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) { + /* free this block */ + ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK]) || + db->db_dnode->dn_free_txg == txg); + if (!BP_IS_HOLE(db->db_d.db_overridden_by[txg&TXG_MASK])) { + /* XXX can get silent EIO here */ + (void) arc_free(NULL, db->db_dnode->dn_objset->os_spa, + txg, db->db_d.db_overridden_by[txg&TXG_MASK], + NULL, NULL, ARC_WAIT); + } + kmem_free(db->db_d.db_overridden_by[txg&TXG_MASK], + sizeof (blkptr_t)); + db->db_d.db_overridden_by[txg&TXG_MASK] = NULL; + /* release the already-written buffer */ + arc_release(db->db_d.db_data_old[txg&TXG_MASK], db); + } +} + +void +dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db, 
*db_next; + uint64_t txg = tx->tx_txg; + + dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks); + mutex_enter(&dn->dn_dbufs_mtx); + for (db = list_head(&dn->dn_dbufs); db; db = db_next) { + db_next = list_next(&dn->dn_dbufs, db); + if ((db->db_level != 0) || (db->db_blkid == DB_BONUS_BLKID)) + continue; + dprintf_dbuf(db, "found buf %s\n", ""); + if (db->db_blkid < blkid || + db->db_blkid >= blkid+nblks) + continue; + + /* found a level 0 buffer in the range */ + if (dbuf_undirty(db, tx)) + continue; + + mutex_enter(&db->db_mtx); + if (db->db_state == DB_UNCACHED) { + ASSERT(db->db.db_data == NULL); + mutex_exit(&db->db_mtx); + continue; + } + if (db->db_state == DB_READ) { + /* this will be handled in dbuf_read_done() */ + db->db_d.db_freed_in_flight = TRUE; + mutex_exit(&db->db_mtx); + continue; + } + if (db->db_state == DB_FILL) { + /* this will be handled in dbuf_rele() */ + db->db_d.db_freed_in_flight = TRUE; + mutex_exit(&db->db_mtx); + continue; + } + + /* make a copy of the data if necessary */ + dbuf_fix_old_data(db, txg); + + if (db->db.db_data) { + /* fill in with appropriate data */ + arc_release(db->db_buf, db); + bzero(db->db.db_data, db->db.db_size); + } + mutex_exit(&db->db_mtx); + } + mutex_exit(&dn->dn_dbufs_mtx); +} + +static int +dbuf_new_block(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; + uint64_t birth_txg = 0; + + /* Don't count meta-objects */ + if (ds == NULL) + return (FALSE); + + /* + * We don't need any locking to protect db_blkptr: + * If it's syncing, then db_dirtied will be set so we'll + * ignore db_blkptr. + */ + ASSERT(MUTEX_HELD(&db->db_mtx)); /* XXX strictly necessary? 
/* If we have been dirtied since the last snapshot, it's not new */
Private + * objects may be dirtied in syncing context, but only if they + * were already pre-dirtied in open context. + * XXX We may want to prohibit dirtying in syncing context even + * if they did pre-dirty. + */ + ASSERT(!(dmu_tx_is_syncing(tx) && + !BP_IS_HOLE(&dn->dn_objset->os_rootbp) && + !(dn->dn_object & DMU_PRIVATE_OBJECT) && + dn->dn_objset->os_dsl_dataset != NULL && + !dsl_dir_is_private( + dn->dn_objset->os_dsl_dataset->ds_dir))); + + /* + * We make this assert for private objects as well, but after we + * check if we're already dirty. They are allowed to re-dirty + * in syncing context. + */ + ASSERT(dn->dn_object & DMU_PRIVATE_OBJECT || + dn->dn_dirtyctx == DN_UNDIRTIED || + dn->dn_dirtyctx == + (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); + + mutex_enter(&db->db_mtx); + /* XXX make this true for indirects too? */ + ASSERT(db->db_level != 0 || db->db_state == DB_CACHED || + db->db_state == DB_FILL); + + /* + * If this buffer is currently part of an "overridden" region, + * we now need to remove it from that region. + */ + if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && + db->db_d.db_overridden_by[txgoff] != NULL) { + dbuf_unoverride(db, tx->tx_txg); + } + + mutex_enter(&dn->dn_mtx); + /* + * Don't set dirtyctx to SYNC if we're just modifying this as we + * initialize the objset. + */ + if (dn->dn_dirtyctx == DN_UNDIRTIED && + !BP_IS_HOLE(&dn->dn_objset->os_rootbp)) { + dn->dn_dirtyctx = + (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); + ASSERT(dn->dn_dirtyctx_firstset == NULL); + dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); + } + mutex_exit(&dn->dn_mtx); + + /* + * If this buffer is already dirty, we're done. + */ + if (list_link_active(&db->db_dirty_node[txgoff])) { + mutex_exit(&db->db_mtx); + return; + } + + /* + * Only valid if not already dirty. + */ + ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == + (dmu_tx_is_syncing(tx) ? 
DN_DIRTY_SYNC : DN_DIRTY_OPEN)); + + ASSERT3U(dn->dn_nlevels, >, db->db_level); + ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || + dn->dn_phys->dn_nlevels > db->db_level || + dn->dn_next_nlevels[txgoff] > db->db_level || + dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || + dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); + + /* + * We should only be dirtying in syncing context if it's the + * mos, a spa os, or we're initializing the os. However, we are + * allowed to dirty in syncing context provided we already + * dirtied it in open context. Hence we must make this + * assertion only if we're not already dirty. + */ + ASSERT(!dmu_tx_is_syncing(tx) || + os->os_dsl_dataset == NULL || + !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) || + !BP_IS_HOLE(&os->os_rootbp)); + ASSERT(db->db.db_size != 0); + + dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); + + if (db->db_level == 0) { + /* + * Release the data buffer from the cache so that we + * can modify it without impacting possible other users + * of this cached data block. Note that indirect blocks + * and private objects are not released until the syncing + * state (since they are only modified then). + * + * If this buffer is dirty in an old transaction group we need + * to make a copy of it so that the changes we make in this + * transaction group won't leak out when we sync the older txg. + */ + ASSERT(db->db_buf != NULL); + ASSERT(db->db.db_data != NULL); + ASSERT(db->db_d.db_data_old[txgoff] == NULL); + if (!(db->db.db_object & DMU_PRIVATE_OBJECT)) { + arc_release(db->db_buf, db); + dbuf_fix_old_data(db, tx->tx_txg); + ASSERT(db->db_buf != NULL); + } + db->db_d.db_data_old[txgoff] = db->db_buf; + } + + mutex_enter(&dn->dn_mtx); + /* + * We could have been freed_in_flight between the dbuf_noread + * and dbuf_dirty. We win, as though the dbuf_noread() had + * happened after the free. 
+ */ + if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { + dnode_clear_range(dn, db->db_blkid, 1, tx); + db->db_d.db_freed_in_flight = FALSE; + } + + db->db_dirtied = tx->tx_txg; + list_insert_tail(&dn->dn_dirty_dbufs[txgoff], db); + mutex_exit(&dn->dn_mtx); + + /* + * If writting this buffer will consume a new block on disk, + * then update the accounting. + */ + if (db->db_blkid != DB_BONUS_BLKID) { + if (!dbuf_new_block(db, tx) && db->db_blkptr) { + /* + * This is only a guess -- if the dbuf is dirty + * in a previous txg, we don't know how much + * space it will use on disk yet. We should + * really have the struct_rwlock to access + * db_blkptr, but since this is just a guess, + * it's OK if we get an odd answer. + */ + dnode_willuse_space(dn, + -BP_GET_ASIZE(db->db_blkptr), tx); + } + dnode_willuse_space(dn, db->db.db_size, tx); + } + + /* + * This buffer is now part of this txg + */ + dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); + db->db_dirtycnt += 1; + ASSERT3U(db->db_dirtycnt, <=, 3); + + mutex_exit(&db->db_mtx); + + if (db->db_blkid == DB_BONUS_BLKID) { + dnode_setdirty(dn, tx); + return; + } + + if (db->db_level == 0) + dnode_new_blkid(dn, db->db_blkid, tx); + + if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + drop_struct_lock = TRUE; + } + + if (db->db_level < dn->dn_nlevels-1) { + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + dmu_buf_impl_t *parent; + parent = dbuf_hold_level(dn, db->db_level+1, + db->db_blkid >> epbs, FTAG); + if (drop_struct_lock) + rw_exit(&dn->dn_struct_rwlock); + dbuf_dirty(parent, tx); + dbuf_remove_ref(parent, FTAG); + } else { + if (drop_struct_lock) + rw_exit(&dn->dn_struct_rwlock); + } + + dnode_setdirty(dn, tx); +} + +static int +dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dnode_t *dn = db->db_dnode; + int txgoff = tx->tx_txg & TXG_MASK; + + ASSERT(tx->tx_txg != 0); + + mutex_enter(&db->db_mtx); + + /* + * If this buffer is not dirty, we're done. 
+ */ + if (!list_link_active(&db->db_dirty_node[txgoff])) { + mutex_exit(&db->db_mtx); + return (0); + } + + /* + * If this buffer is currently held, we cannot undirty + * it, since one of the current holders may be in the + * middle of an update. Note that users of dbuf_undirty() + * should not place a hold on the dbuf before the call. + * XXX - this check assumes we are being called from + * dbuf_free_range(), perhaps we should move it there? + */ + if (refcount_count(&db->db_holds) > db->db_dirtycnt) { + mutex_exit(&db->db_mtx); + mutex_enter(&dn->dn_mtx); + dnode_clear_range(dn, db->db_blkid, 1, tx); + mutex_exit(&dn->dn_mtx); + return (0); + } + + dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); + + dbuf_unoverride(db, tx->tx_txg); + + ASSERT(db->db.db_size != 0); + if (db->db_level == 0) { + ASSERT(db->db_buf != NULL); + ASSERT(db->db_d.db_data_old[txgoff] != NULL); + if (db->db_d.db_data_old[txgoff] != db->db_buf) + arc_buf_free(db->db_d.db_data_old[txgoff], db); + db->db_d.db_data_old[txgoff] = NULL; + } + + /* XXX would be nice to fix up dn_towrite_space[] */ + /* XXX undo db_dirtied? but how? 
*/ + /* db->db_dirtied = tx->tx_txg; */ + + mutex_enter(&dn->dn_mtx); + list_remove(&dn->dn_dirty_dbufs[txgoff], db); + mutex_exit(&dn->dn_mtx); + + ASSERT(db->db_dirtycnt > 0); + db->db_dirtycnt -= 1; + + if (refcount_remove(&db->db_holds, + (void *)(uintptr_t)tx->tx_txg) == 0) { + /* make duf_verify() happy */ + if (db->db.db_data) + bzero(db->db.db_data, db->db.db_size); + + dbuf_evict(db); + return (1); + } + + mutex_exit(&db->db_mtx); + return (0); +} + +#pragma weak dmu_buf_will_dirty = dbuf_will_dirty +void +dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + int rf = DB_RF_MUST_SUCCEED; + + ASSERT(tx->tx_txg != 0); + ASSERT(!refcount_is_zero(&db->db_holds)); + + if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) + rf |= DB_RF_HAVESTRUCT; + (void) dbuf_read_generic(db, rf); + dbuf_dirty(db, tx); +} + +#pragma weak dmu_buf_will_fill = dbuf_will_fill +void +dbuf_will_fill(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + ASSERT(tx->tx_txg != 0); + ASSERT(db->db_level == 0); + ASSERT(!refcount_is_zero(&db->db_holds)); + + ASSERT(!(db->db.db_object & DMU_PRIVATE_OBJECT) || + dmu_tx_private_ok(tx)); + + dbuf_noread(db); + dbuf_dirty(db, tx); +} + +#pragma weak dmu_buf_fill_done = dbuf_fill_done +/* ARGSUSED */ +void +dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + mutex_enter(&db->db_mtx); + dbuf_verify(db); + + if (db->db_state == DB_FILL) { + if (db->db_level == 0 && db->db_d.db_freed_in_flight) { + /* we were freed while filling */ + /* XXX dbuf_undirty? 
*/ + bzero(db->db.db_data, db->db.db_size); + db->db_d.db_freed_in_flight = FALSE; + } + db->db_state = DB_CACHED; + cv_broadcast(&db->db_changed); + } + mutex_exit(&db->db_mtx); +} + + +static void +dbuf_clear(dmu_buf_impl_t *db) +{ + dnode_t *dn = db->db_dnode; + + ASSERT(MUTEX_HELD(&dn->dn_dbufs_mtx)); + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(refcount_is_zero(&db->db_holds)); + + if (db->db_state == DB_CACHED) { + ASSERT(db->db_buf != NULL); + arc_buf_free(db->db_buf, db); + db->db.db_data = NULL; + db->db_buf = NULL; + db->db_state = DB_UNCACHED; + } + + ASSERT3U(db->db_state, ==, DB_UNCACHED); + ASSERT(db->db_buf == NULL); + ASSERT(db->db_data_pending == NULL); + + mutex_exit(&db->db_mtx); + + /* + * If this dbuf is referened from an indirect dbuf, + * decrement the ref count on the indirect dbuf. + */ + if (db->db_parent && db->db_parent != dn->dn_dbuf) + dbuf_remove_ref(db->db_parent, db); + + /* remove from dn_dbufs */ + list_remove(&dn->dn_dbufs, db); + + dnode_rele(dn, db); + + dbuf_hash_remove(db); + + db->db_dnode = NULL; + db->db_parent = NULL; + db->db_blkptr = NULL; +} + +static int +dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, + dmu_buf_impl_t **parentp, blkptr_t **bpp) +{ + int nlevels, epbs; + + if (dn->dn_phys->dn_nlevels == 0) + nlevels = 1; + else + nlevels = dn->dn_phys->dn_nlevels; + + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + + ASSERT3U(level * epbs, <, 64); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + if (blkid == DB_BONUS_BLKID) { + /* this is the bonus buffer */ + *parentp = NULL; + *bpp = NULL; + return (0); + } else if (level >= nlevels || + (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { + /* the buffer has no parent yet */ + *parentp = NULL; + *bpp = NULL; + return (ENOENT); + } else if (level < nlevels-1) { + /* this block is referenced from an indirect block */ + int err = dbuf_hold_impl(dn, level+1, + blkid >> epbs, fail_sparse, NULL, parentp); + if (err) + return (err); + 
dbuf_read_havestruct(*parentp); + *bpp = ((blkptr_t *)(*parentp)->db.db_data) + + (blkid & ((1ULL << epbs) - 1)); + return (0); + } else { + /* the block is referenced from the dnode */ + ASSERT3U(level, ==, nlevels-1); + ASSERT(dn->dn_phys->dn_nblkptr == 0 || + blkid < dn->dn_phys->dn_nblkptr); + *parentp = dn->dn_dbuf; + *bpp = &dn->dn_phys->dn_blkptr[blkid]; + return (0); + } +} + +static dmu_buf_impl_t * +dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, + dmu_buf_impl_t *parent, blkptr_t *blkptr) +{ + objset_impl_t *os = dn->dn_objset; + dmu_buf_impl_t *db, *odb; + + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + ASSERT(dn->dn_type != DMU_OT_NONE); + + db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); + + db->db_objset = os; + db->db.db_object = dn->dn_object; + db->db_level = level; + db->db_blkid = blkid; + db->db_state = DB_UNCACHED; + + if (db->db_blkid == DB_BONUS_BLKID) { + db->db.db_size = dn->dn_bonuslen; + db->db.db_offset = DB_BONUS_BLKID; + } else { + int blocksize = + db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz; + db->db.db_size = blocksize; + db->db.db_offset = db->db_blkid * blocksize; + } + + db->db_dirtied = 0; + db->db_dirtycnt = 0; + + bzero(&db->db_d, sizeof (db->db_d)); + + /* + * Hold the dn_dbufs_mtx while we get the new dbuf + * in the hash table *and* added to the dbufs list. + * This prevents a possible deadlock with someone + * trying to look up this dbuf before its added to the + * dn_dbufs list. 
+ */ + mutex_enter(&dn->dn_dbufs_mtx); + if ((odb = dbuf_hash_insert(db)) != NULL) { + /* someone else inserted it first */ + kmem_cache_free(dbuf_cache, db); + mutex_exit(&dn->dn_dbufs_mtx); + return (odb); + } + list_insert_head(&dn->dn_dbufs, db); + mutex_exit(&dn->dn_dbufs_mtx); + + if (parent && parent != dn->dn_dbuf) + dbuf_add_ref(parent, db); + + (void) refcount_add(&dn->dn_holds, db); + + db->db_dnode = dn; + db->db_parent = parent; + db->db_blkptr = blkptr; + + dprintf_dbuf(db, "db=%p\n", db); + + return (db); +} + +static int +dbuf_evictable(dmu_buf_impl_t *db) +{ + int i; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + dbuf_verify(db); + + if (db->db_state != DB_UNCACHED && db->db_state != DB_CACHED) + return (FALSE); + + if (!refcount_is_zero(&db->db_holds)) + return (FALSE); + +#ifdef ZFS_DEBUG + for (i = 0; i < TXG_SIZE; i++) { + ASSERT(!list_link_active(&db->db_dirty_node[i])); + ASSERT(db->db_level != 0 || db->db_d.db_data_old[i] == NULL); + } +#endif + + /* + * Now we know we want to free it. + * This call must be done last, since it has side effects - + * calling the db_evict_func(). + */ + dbuf_evict_user(db); + return (TRUE); +} + +static void +dbuf_destroy(dmu_buf_impl_t *db) +{ + ASSERT(refcount_is_zero(&db->db_holds)); + + ASSERT(db->db.db_data == NULL); + ASSERT(db->db_dnode == NULL); + ASSERT(db->db_parent == NULL); + ASSERT(db->db_hash_next == NULL); + ASSERT(db->db_blkptr == NULL); + ASSERT(db->db_data_pending == NULL); + + kmem_cache_free(dbuf_cache, db); +} + +void +dbuf_prefetch(dnode_t *dn, uint64_t blkid) +{ + dmu_buf_impl_t *db, *parent = NULL; + blkptr_t *bp = NULL; + + ASSERT(blkid != DB_BONUS_BLKID); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + + if (dnode_block_freed(dn, blkid)) + return; + + /* dbuf_find() returns with db_mtx held */ + if (db = dbuf_find(dn, 0, blkid)) { + /* + * This dbuf is already in the cache. We assume that + * it is already CACHED, or else about to be either + * read or filled. 
+ */ + mutex_exit(&db->db_mtx); + return; + } + + if (dbuf_findbp(dn, 0, blkid, TRUE, &parent, &bp) == 0) { + if (bp && !BP_IS_HOLE(bp)) { + (void) arc_read(NULL, dn->dn_objset->os_spa, bp, + dmu_ot[dn->dn_type].ot_byteswap, + NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + (ARC_NOWAIT | ARC_PREFETCH)); + } + if (parent && parent != dn->dn_dbuf) + dbuf_rele(parent); + } +} + +/* + * Returns with db_holds incremented, and db_mtx not held. + * Note: dn_struct_rwlock must be held. + */ +int +dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, + void *tag, dmu_buf_impl_t **dbp) +{ + dmu_buf_impl_t *db, *parent = NULL; + + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + ASSERT3U(dn->dn_nlevels, >, level); + + *dbp = NULL; + + /* dbuf_find() returns with db_mtx held */ + db = dbuf_find(dn, level, blkid); + + if (db == NULL) { + blkptr_t *bp = NULL; + int err; + + err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); + if (fail_sparse) { + if (err == 0 && bp && BP_IS_HOLE(bp)) + err = ENOENT; + if (err) { + if (parent && parent != dn->dn_dbuf) + dbuf_rele(parent); + return (err); + } + } + db = dbuf_create(dn, level, blkid, parent, bp); + } + + /* + * If this buffer is currently syncing out, and we are + * are still referencing it from db_data, we need to make + * a copy of it in case we decide we want to dirty it + * again in this txg. + */ + if (db->db_level == 0 && db->db_state == DB_CACHED && + !(dn->dn_object & DMU_PRIVATE_OBJECT) && + db->db_data_pending == db->db_buf) { + int size = (db->db_blkid == DB_BONUS_BLKID) ? 
+ DN_MAX_BONUSLEN : db->db.db_size; + + dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, + size, db)); + bcopy(db->db_data_pending->b_data, db->db.db_data, + db->db.db_size); + } + + dbuf_add_ref(db, tag); + dbuf_update_data(db); + dbuf_verify(db); + mutex_exit(&db->db_mtx); + + /* NOTE: we can't rele the parent until after we drop the db_mtx */ + if (parent && parent != dn->dn_dbuf) + dbuf_rele(parent); + + ASSERT3P(db->db_dnode, ==, dn); + ASSERT3U(db->db_blkid, ==, blkid); + ASSERT3U(db->db_level, ==, level); + *dbp = db; + + return (0); +} + +dmu_buf_impl_t * +dbuf_hold(dnode_t *dn, uint64_t blkid) +{ + dmu_buf_impl_t *db; + (void) dbuf_hold_impl(dn, 0, blkid, FALSE, NULL, &db); + return (db); +} + +dmu_buf_impl_t * +dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) +{ + dmu_buf_impl_t *db; + (void) dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); + return (db); +} + +dmu_buf_impl_t * +dbuf_hold_bonus(dnode_t *dn, void *tag) +{ + dmu_buf_impl_t *db; + rw_enter(&dn->dn_struct_rwlock, RW_READER); + (void) dbuf_hold_impl(dn, 0, DB_BONUS_BLKID, FALSE, tag, &db); + rw_exit(&dn->dn_struct_rwlock); + return (db); +} + +void +dbuf_add_ref(dmu_buf_impl_t *db, void *tag) +{ + (void) refcount_add(&db->db_holds, tag); + /* dprintf_dbuf(db, "adding ref %p; holds up to %lld\n", tag, holds); */ +} + +void +dbuf_remove_ref(dmu_buf_impl_t *db, void *tag) +{ + int64_t holds; + dnode_t *dn = db->db_dnode; + int need_mutex; + + ASSERT(dn != NULL); + need_mutex = !MUTEX_HELD(&dn->dn_dbufs_mtx); + + if (need_mutex) { + dnode_add_ref(dn, FTAG); + mutex_enter(&dn->dn_dbufs_mtx); + } + + mutex_enter(&db->db_mtx); + dbuf_verify(db); + + holds = refcount_remove(&db->db_holds, tag); + + if (holds == 0) { + ASSERT3U(db->db_state, !=, DB_FILL); + if (db->db_level == 0 && + db->db_d.db_user_data_ptr_ptr != NULL) + *db->db_d.db_user_data_ptr_ptr = NULL; + dbuf_evict(db); + } else { + if (holds == db->db_dirtycnt && + db->db_level == 0 && 
db->db_d.db_immediate_evict) + dbuf_evict_user(db); + mutex_exit(&db->db_mtx); + } + + if (need_mutex) { + mutex_exit(&dn->dn_dbufs_mtx); + dnode_rele(dn, FTAG); + } +} + +void +dbuf_rele(dmu_buf_impl_t *db) +{ + dbuf_remove_ref(db, NULL); +} + +#pragma weak dmu_buf_refcount = dbuf_refcount +uint64_t +dbuf_refcount(dmu_buf_impl_t *db) +{ + return (refcount_count(&db->db_holds)); +} + +void * +dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, + dmu_buf_evict_func_t *evict_func) +{ + return (dmu_buf_update_user(db_fake, NULL, user_ptr, + user_data_ptr_ptr, evict_func)); +} + +void * +dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, + dmu_buf_evict_func_t *evict_func) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + db->db_d.db_immediate_evict = TRUE; + return (dmu_buf_update_user(db_fake, NULL, user_ptr, + user_data_ptr_ptr, evict_func)); +} + +void * +dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, + void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT(db->db_level == 0); + + ASSERT((user_ptr == NULL) == (evict_func == NULL)); + + mutex_enter(&db->db_mtx); + + if (db->db_d.db_user_ptr == old_user_ptr) { + db->db_d.db_user_ptr = user_ptr; + db->db_d.db_user_data_ptr_ptr = user_data_ptr_ptr; + db->db_d.db_evict_func = evict_func; + + dbuf_update_data(db); + } else { + old_user_ptr = db->db_d.db_user_ptr; + } + + mutex_exit(&db->db_mtx); + return (old_user_ptr); +} + +void * +dmu_buf_get_user(dmu_buf_t *db_fake) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT(!refcount_is_zero(&db->db_holds)); + + return (db->db_d.db_user_ptr); +} + +void +dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx) +{ + arc_buf_t **data; + uint64_t txg = tx->tx_txg; + dnode_t *dn = db->db_dnode; + objset_impl_t *os = dn->dn_objset; + int blksz; + + ASSERT(dmu_tx_is_syncing(tx)); + + dprintf_dbuf_bp(db, 
db->db_blkptr, "blkptr=%p", db->db_blkptr); + + mutex_enter(&db->db_mtx); + /* + * To be synced, we must be dirtied. But we + * might have been freed after the dirty. + */ + if (db->db_state == DB_UNCACHED) { + /* This buffer has been freed since it was dirtied */ + ASSERT(db->db.db_data == NULL); + } else if (db->db_state == DB_FILL) { + /* This buffer was freed and is now being re-filled */ + ASSERT(db->db.db_data != db->db_d.db_data_old[txg&TXG_MASK]); + } else { + ASSERT3U(db->db_state, ==, DB_CACHED); + } + dbuf_verify(db); + + /* + * Don't need a lock on db_dirty (dn_mtx), because it can't + * be modified yet. + */ + + if (db->db_level == 0) { + data = &db->db_d.db_data_old[txg&TXG_MASK]; + blksz = arc_buf_size(*data); + /* + * If this buffer is currently "in use" (i.e., there are + * active holds and db_data still references it), then make + * a copy before we start the write so that any modifications + * from the open txg will not leak into this write. + * + * NOTE: this copy does not need to be made for objects only + * modified in the syncing context (e.g. DNONE_DNODE blocks) + * or if there is no actual write involved (bonus blocks). + */ + if (!(dn->dn_object & DMU_PRIVATE_OBJECT) && + db->db_d.db_overridden_by[txg&TXG_MASK] == NULL && + db->db_blkid != DB_BONUS_BLKID) { + if (refcount_count(&db->db_holds) > 1 && + *data == db->db_buf) { + *data = arc_buf_alloc( + db->db_dnode->dn_objset->os_spa, blksz, db); + bcopy(db->db.db_data, (*data)->b_data, blksz); + } + db->db_data_pending = *data; + } else if (dn->dn_object & DMU_PRIVATE_OBJECT) { + /* + * Private object buffers are released here rather + * than in dbuf_dirty() since they are only modified + * in the syncing context and we don't want the + * overhead of making multiple copies of the data. + */ + arc_release(db->db_buf, db); + } + } else { + data = &db->db_buf; + if (*data == NULL) { + /* + * This can happen if we dirty and then free + * the level-0 data blocks in the same txg. 
So + * this indirect remains unchanged. + */ + if (db->db_dirtied == txg) + db->db_dirtied = 0; + ASSERT(db->db_dirtycnt > 0); + db->db_dirtycnt -= 1; + mutex_exit(&db->db_mtx); + dbuf_remove_ref(db, (void *)(uintptr_t)txg); + return; + } + blksz = db->db.db_size; + ASSERT3U(blksz, ==, 1<<dn->dn_phys->dn_indblkshift); + } + + ASSERT(*data != NULL); + + if (db->db_blkid == DB_BONUS_BLKID) { + /* + * Simply copy the bonus data into the dnode. It will + * be written out when the dnode is synced (and it will + * be synced, since it must have been dirty for dbuf_sync + * to be called). The bonus data will be byte swapped + * in dnode_byteswap. + */ + /* + * Use dn_phys->dn_bonuslen since db.db_size is the length + * of the bonus buffer in the open transaction rather than + * the syncing transaction. + */ + ASSERT3U(db->db_level, ==, 0); + ASSERT3U(dn->dn_phys->dn_bonuslen, <=, blksz); + bcopy((*data)->b_data, DN_BONUS(dn->dn_phys), + dn->dn_phys->dn_bonuslen); + if (*data != db->db_buf) + arc_buf_free(*data, db); + db->db_d.db_data_old[txg&TXG_MASK] = NULL; + db->db_data_pending = NULL; + if (db->db_dirtied == txg) + db->db_dirtied = 0; + ASSERT(db->db_dirtycnt > 0); + db->db_dirtycnt -= 1; + mutex_exit(&db->db_mtx); + dbuf_remove_ref(db, (void *)(uintptr_t)txg); + return; + } else if (db->db_level > 0 && !arc_released(db->db_buf)) { + /* + * This indirect buffer was marked dirty, but + * never modified (if it had been modified, then + * we would have released the buffer). There is + * no reason to write anything. 
+ */ + db->db_data_pending = NULL; + if (db->db_dirtied == txg) + db->db_dirtied = 0; + ASSERT(db->db_dirtycnt > 0); + db->db_dirtycnt -= 1; + mutex_exit(&db->db_mtx); + dbuf_remove_ref(db, (void *)(uintptr_t)txg); + return; + } else if (db->db_blkptr == NULL && + db->db_level == dn->dn_phys->dn_nlevels-1 && + db->db_blkid < dn->dn_phys->dn_nblkptr) { + /* + * This buffer was allocated at a time when there was + * no available blkptrs from the dnode, or it was + * inappropriate to hook it in (i.e., nlevels mis-match). + */ + ASSERT(db->db_blkptr == NULL); + ASSERT(db->db_parent == NULL); + db->db_parent = dn->dn_dbuf; + db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; + dbuf_verify(db); + mutex_exit(&db->db_mtx); + } else if (db->db_blkptr == NULL) { + dmu_buf_impl_t *parent = db->db_parent; + int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + + mutex_exit(&db->db_mtx); + ASSERT(dn->dn_phys->dn_nlevels > 1); + if (parent == NULL) { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + (void) dbuf_hold_impl(dn, db->db_level+1, + db->db_blkid >> epbs, FALSE, NULL, &parent); + rw_exit(&dn->dn_struct_rwlock); + dbuf_add_ref(parent, db); + db->db_parent = parent; + dbuf_rele(parent); + } + dbuf_read(parent); + } else { + mutex_exit(&db->db_mtx); + } + + ASSERT(IS_DNODE_DNODE(dn->dn_object) || db->db_parent != NULL); + + if (db->db_parent != dn->dn_dbuf) { + dmu_buf_impl_t *parent = db->db_parent; + int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + + mutex_enter(&db->db_mtx); + ASSERT(db->db_level == parent->db_level-1); + ASSERT(list_link_active(&parent->db_dirty_node[txg&TXG_MASK])); + /* + * We may have read this block after we dirtied it, + * so never released it from the cache. 
+ */ + arc_release(parent->db_buf, parent); + + db->db_blkptr = (blkptr_t *)parent->db.db_data + + (db->db_blkid & ((1ULL << epbs) - 1)); + dbuf_verify(db); + mutex_exit(&db->db_mtx); + } + ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); + +#ifdef ZFS_DEBUG + if (db->db_parent == dn->dn_dbuf) { + /* + * We don't need to dnode_setdirty(dn) because if we got + * here then the parent is already dirty. + */ + ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1); + ASSERT3P(db->db_blkptr, ==, + &dn->dn_phys->dn_blkptr[db->db_blkid]); + } +#endif + if (db->db_level == 0 && + db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) { + arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK]; + blkptr_t **bpp = &db->db_d.db_overridden_by[txg&TXG_MASK]; + int old_size = BP_GET_ASIZE(db->db_blkptr); + int new_size = BP_GET_ASIZE(*bpp); + + ASSERT(db->db_blkid != DB_BONUS_BLKID); + + dnode_diduse_space(dn, new_size-old_size); + mutex_enter(&dn->dn_mtx); + if (db->db_blkid > dn->dn_phys->dn_maxblkid) + dn->dn_phys->dn_maxblkid = db->db_blkid; + mutex_exit(&dn->dn_mtx); + + dsl_dataset_block_born(os->os_dsl_dataset, *bpp, tx); + if (!BP_IS_HOLE(db->db_blkptr)) + dsl_dataset_block_kill(os->os_dsl_dataset, + db->db_blkptr, os->os_synctx); + + mutex_enter(&db->db_mtx); + *db->db_blkptr = **bpp; + kmem_free(*bpp, sizeof (blkptr_t)); + *bpp = NULL; + + if (*old != db->db_buf) + arc_buf_free(*old, db); + *old = NULL; + db->db_data_pending = NULL; + + cv_broadcast(&db->db_changed); + + ASSERT(db->db_dirtycnt > 0); + db->db_dirtycnt -= 1; + mutex_exit(&db->db_mtx); + dbuf_remove_ref(db, (void *)(uintptr_t)txg); + } else { + int checksum, compress; + + if (db->db_level > 0) { + /* + * XXX -- we should design a compression algorithm + * that specializes in arrays of bps. + */ + checksum = ZIO_CHECKSUM_FLETCHER_4; + compress = ZIO_COMPRESS_LZJB; + } else { + /* + * Allow dnode settings to override objset settings, + * except for metadata checksums. 
+ */ + if (dmu_ot[dn->dn_type].ot_metadata) { + checksum = os->os_md_checksum; + compress = zio_compress_select(dn->dn_compress, + os->os_md_compress); + } else { + checksum = zio_checksum_select(dn->dn_checksum, + os->os_checksum); + compress = zio_compress_select(dn->dn_compress, + os->os_compress); + } + } +#ifdef ZFS_DEBUG + if (db->db_parent) { + ASSERT(list_link_active( + &db->db_parent->db_dirty_node[txg&TXG_MASK])); + ASSERT(db->db_parent == dn->dn_dbuf || + db->db_parent->db_level > 0); + if (dn->dn_object & DMU_PRIVATE_OBJECT || + db->db_level > 0) + ASSERT(*data == db->db_buf); + } +#endif + ASSERT3U(db->db_blkptr->blk_birth, <=, tx->tx_txg); + (void) arc_write(zio, os->os_spa, checksum, compress, txg, + db->db_blkptr, *data, dbuf_write_done, db, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT); + /* + * We can't access db after arc_write, since it could finish + * and be freed, and we have no locks on it. + */ + } +} + +struct dbuf_arg { + objset_impl_t *os; + blkptr_t bp; +}; + +static void +dbuf_do_born(void *arg) +{ + struct dbuf_arg *da = arg; + dsl_dataset_block_born(da->os->os_dsl_dataset, + &da->bp, da->os->os_synctx); + kmem_free(da, sizeof (struct dbuf_arg)); +} + +static void +dbuf_do_kill(void *arg) +{ + struct dbuf_arg *da = arg; + dsl_dataset_block_kill(da->os->os_dsl_dataset, + &da->bp, da->os->os_synctx); + kmem_free(da, sizeof (struct dbuf_arg)); +} + +/* ARGSUSED */ +static void +dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) +{ + dmu_buf_impl_t *db = vdb; + dnode_t *dn = db->db_dnode; + objset_impl_t *os = dn->dn_objset; + uint64_t txg = zio->io_txg; + uint64_t fill = 0; + int i; + int old_size, new_size; + + ASSERT3U(zio->io_error, ==, 0); + + dprintf_dbuf_bp(db, &zio->io_bp_orig, "bp_orig: %s", ""); + + old_size = BP_GET_ASIZE(&zio->io_bp_orig); + new_size = BP_GET_ASIZE(zio->io_bp); + + dnode_diduse_space(dn, new_size-old_size); + + mutex_enter(&db->db_mtx); + + if (db->db_dirtied == txg) + db->db_dirtied = 0; + 
+ if (db->db_level == 0) { + arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK]; + + ASSERT(db->db_blkid != DB_BONUS_BLKID); + + if (*old != db->db_buf) + arc_buf_free(*old, db); + *old = NULL; + db->db_data_pending = NULL; + + mutex_enter(&dn->dn_mtx); + if (db->db_blkid > dn->dn_phys->dn_maxblkid && + !BP_IS_HOLE(db->db_blkptr)) + dn->dn_phys->dn_maxblkid = db->db_blkid; + mutex_exit(&dn->dn_mtx); + + if (dn->dn_type == DMU_OT_DNODE) { + dnode_phys_t *dnp = db->db.db_data; + for (i = db->db.db_size >> DNODE_SHIFT; i > 0; + i--, dnp++) { + if (dnp->dn_type != DMU_OT_NONE) + fill++; + } + } else { + if (!BP_IS_HOLE(db->db_blkptr)) + fill = 1; + } + } else { + blkptr_t *bp = db->db.db_data; + ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); + if (!BP_IS_HOLE(db->db_blkptr)) { + ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, db->db.db_size); + ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, + db->db.db_size); + } + for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) { + if (BP_IS_HOLE(bp)) + continue; + ASSERT3U(BP_GET_LSIZE(bp), ==, + db->db_level == 1 ? 
dn->dn_datablksz : + (1<<dn->dn_phys->dn_indblkshift)); + fill += bp->blk_fill; + } + } + + if (!BP_IS_HOLE(db->db_blkptr)) { + db->db_blkptr->blk_fill = fill; + BP_SET_TYPE(db->db_blkptr, dn->dn_type); + BP_SET_LEVEL(db->db_blkptr, db->db_level); + } else { + ASSERT3U(fill, ==, 0); + ASSERT3U(db->db_blkptr->blk_fill, ==, 0); + } + + dprintf_dbuf_bp(db, db->db_blkptr, + "wrote %llu bytes to blkptr:", zio->io_size); + + ASSERT(db->db_parent == NULL || + list_link_active(&db->db_parent->db_dirty_node[txg&TXG_MASK])); + cv_broadcast(&db->db_changed); + ASSERT(db->db_dirtycnt > 0); + db->db_dirtycnt -= 1; + mutex_exit(&db->db_mtx); + + /* We must do this after we've set the bp's type and level */ + if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), + BP_IDENTITY(&zio->io_bp_orig))) { + struct dbuf_arg *da; + da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP); + da->os = os; + da->bp = *zio->io_bp; + (void) taskq_dispatch(dbuf_tq, dbuf_do_born, da, 0); + if (!BP_IS_HOLE(&zio->io_bp_orig)) { + da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP); + da->os = os; + da->bp = zio->io_bp_orig; + (void) taskq_dispatch(dbuf_tq, dbuf_do_kill, da, 0); + } + } + + dbuf_remove_ref(db, (void *)(uintptr_t)txg); +} diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c new file mode 100644 index 0000000000..14fab6d420 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -0,0 +1,1761 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_tx.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/zfs_context.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/dmu_zfetch.h> +#include <sys/zfs_ioctl.h> +#include <sys/zap.h> + +const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { + { byteswap_uint8_array, TRUE, "unallocated" }, + { zap_byteswap, TRUE, "object directory" }, + { byteswap_uint64_array, TRUE, "object array" }, + { byteswap_uint8_array, TRUE, "packed nvlist" }, + { byteswap_uint64_array, TRUE, "packed nvlist size" }, + { byteswap_uint64_array, TRUE, "bplist" }, + { byteswap_uint64_array, TRUE, "bplist header" }, + { byteswap_uint64_array, TRUE, "SPA space map header" }, + { byteswap_uint64_array, TRUE, "SPA space map" }, + { byteswap_uint64_array, TRUE, "ZIL intent log" }, + { dnode_buf_byteswap, TRUE, "DMU dnode" }, + { dmu_objset_byteswap, TRUE, "DMU objset" }, + { byteswap_uint64_array, TRUE, "DSL directory" }, + { zap_byteswap, TRUE, "DSL directory child map"}, + { zap_byteswap, TRUE, "DSL dataset snap map" }, + { zap_byteswap, TRUE, "DSL props" }, + { byteswap_uint64_array, TRUE, "DSL dataset" }, + { zfs_znode_byteswap, TRUE, "ZFS znode" }, + { zfs_acl_byteswap, TRUE, "ZFS ACL" }, + { byteswap_uint8_array, FALSE, "ZFS plain file" }, + { zap_byteswap, TRUE, "ZFS directory" }, + 
{ zap_byteswap, TRUE, "ZFS master node" }, + { zap_byteswap, TRUE, "ZFS delete queue" }, + { byteswap_uint8_array, FALSE, "zvol object" }, + { zap_byteswap, TRUE, "zvol prop" }, + { byteswap_uint8_array, FALSE, "other uint8[]" }, + { byteswap_uint64_array, FALSE, "other uint64[]" }, + { zap_byteswap, TRUE, "other ZAP" }, +}; + +static int +dmu_buf_read_array_impl(dmu_buf_impl_t **dbp, int numbufs, uint32_t flags) +{ + int i, err = 0; + dnode_t *dn; + zio_t *zio; + int canfail; + uint64_t rd_sz; + + if (numbufs == 0) + return (0); + + rd_sz = numbufs * dbp[0]->db.db_size; + ASSERT(rd_sz <= DMU_MAX_ACCESS); + + dn = dbp[0]->db_dnode; + if (flags & DB_RF_CANFAIL) { + canfail = 1; + } else { + canfail = 0; + } + zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, canfail); + + /* don't prefetch if read the read is large */ + if (rd_sz >= zfetch_array_rd_sz) { + flags |= DB_RF_NOPREFETCH; + } + + /* initiate async reads */ + rw_enter(&dn->dn_struct_rwlock, RW_READER); + for (i = 0; i < numbufs; i++) { + if (dbp[i]->db_state == DB_UNCACHED) + dbuf_read_impl(dbp[i], zio, flags); + } + rw_exit(&dn->dn_struct_rwlock); + err = zio_wait(zio); + + if (err) + return (err); + + /* wait for other io to complete */ + for (i = 0; i < numbufs; i++) { + mutex_enter(&dbp[i]->db_mtx); + while (dbp[i]->db_state == DB_READ || + dbp[i]->db_state == DB_FILL) + cv_wait(&dbp[i]->db_changed, &dbp[i]->db_mtx); + ASSERT(dbp[i]->db_state == DB_CACHED); + mutex_exit(&dbp[i]->db_mtx); + } + + return (0); +} + +void +dmu_buf_read_array(dmu_buf_t **dbp_fake, int numbufs) +{ + dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; + int err; + + err = dmu_buf_read_array_impl(dbp, numbufs, DB_RF_MUST_SUCCEED); + ASSERT(err == 0); +} + +int +dmu_buf_read_array_canfail(dmu_buf_t **dbp_fake, int numbufs) +{ + dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; + + return (dmu_buf_read_array_impl(dbp, numbufs, DB_RF_CANFAIL)); +} + +dmu_buf_t * +dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset) +{ + 
dnode_t *dn; + uint64_t blkid; + dmu_buf_impl_t *db; + + /* dataset_verify(dd); */ + + dn = dnode_hold(os->os, object, FTAG); + blkid = dbuf_whichblock(dn, offset); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + db = dbuf_hold(dn, blkid); + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + return (&db->db); +} + +dmu_buf_t * +dmu_bonus_hold(objset_t *os, uint64_t object) +{ + return (dmu_bonus_hold_tag(os, object, NULL)); +} + +int +dmu_bonus_max(void) +{ + return (DN_MAX_BONUSLEN); +} + +/* + * Returns held bonus buffer if the object exists, NULL if it doesn't. + */ +dmu_buf_t * +dmu_bonus_hold_tag(objset_t *os, uint64_t object, void *tag) +{ + dnode_t *dn = dnode_hold(os->os, object, FTAG); + dmu_buf_impl_t *db; + + if (dn == NULL) + return (NULL); + + db = dbuf_hold_bonus(dn, tag); + /* XXX - hack: hold the first block if this is a ZAP object */ + if (dmu_ot[dn->dn_type].ot_byteswap == zap_byteswap) { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + dn->dn_db0 = dbuf_hold(dn, 0); + rw_exit(&dn->dn_struct_rwlock); + } + dnode_rele(dn, FTAG); + return (&db->db); +} + +static dmu_buf_t ** +dbuf_hold_array(dnode_t *dn, + uint64_t offset, uint64_t length, int *numbufsp) +{ + dmu_buf_t **dbp; + uint64_t blkid, nblks, i; + + if (length == 0) { + if (numbufsp) + *numbufsp = 0; + return (NULL); + } + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (dn->dn_datablkshift) { + int blkshift = dn->dn_datablkshift; + nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) - + P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift; + } else { + ASSERT3U(offset + length, <=, dn->dn_datablksz); + nblks = 1; + } + dbp = kmem_alloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); + + blkid = dbuf_whichblock(dn, offset); + for (i = 0; i < nblks; i++) { + dmu_buf_impl_t *dbuf; + dbuf = dbuf_hold(dn, blkid+i); + dbp[i] = &dbuf->db; + } + rw_exit(&dn->dn_struct_rwlock); + + if (numbufsp) + *numbufsp = nblks; + return (dbp); +} + +dmu_buf_t ** +dmu_buf_hold_array(objset_t *os, uint64_t object, + 
uint64_t offset, uint64_t length, int *numbufsp) +{ + dnode_t *dn; + dmu_buf_t **dbp; + + ASSERT(length <= DMU_MAX_ACCESS); + + if (length == 0) { + if (numbufsp) + *numbufsp = 0; + return (NULL); + } + + dn = dnode_hold(os->os, object, FTAG); + dbp = dbuf_hold_array(dn, offset, length, numbufsp); + dnode_rele(dn, FTAG); + + return (dbp); +} + +void +dmu_buf_add_ref(dmu_buf_t *dbuf, void *tag) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; + dbuf_add_ref(db, tag); +} + +void +dmu_buf_remove_ref(dmu_buf_t *dbuf, void *tag) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; + dbuf_remove_ref(db, tag); +} + +void +dmu_buf_rele(dmu_buf_t *dbuf_fake) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf_fake; + + /* XXX - hack: hold the first block if this is a ZAP object */ + if (db->db_blkid == DB_BONUS_BLKID && + dmu_ot[db->db_dnode->dn_type].ot_byteswap == zap_byteswap) + dbuf_rele(db->db_dnode->dn_db0); + dbuf_rele(db); +} + +void +dmu_buf_rele_tag(dmu_buf_t *dbuf_fake, void *tag) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf_fake; + + /* XXX - hack: hold the first block if this is a ZAP object */ + if (db->db_blkid == DB_BONUS_BLKID && + dmu_ot[db->db_dnode->dn_type].ot_byteswap == zap_byteswap) + dbuf_rele(db->db_dnode->dn_db0); + dbuf_remove_ref(db, tag); +} + +void +dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs) +{ + int i; + dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; + + if (numbufs == 0) + return; + + ASSERT((numbufs * dbp[0]->db.db_size) <= DMU_MAX_ACCESS); + + for (i = 0; i < numbufs; i++) + dbuf_rele(dbp[i]); + + kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); +} + +void +dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) +{ + dnode_t *dn; + uint64_t blkid; + int nblks, i; + + if (len == 0) { /* they're interested in the bonus buffer */ + dn = os->os->os_meta_dnode; + + if (object == 0 || object >= DN_MAX_OBJECT) + return; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + blkid = dbuf_whichblock(dn, object * sizeof 
(dnode_phys_t)); + dbuf_prefetch(dn, blkid); + rw_exit(&dn->dn_struct_rwlock); + return; + } + + /* + * XXX - Note, if the dnode for the requested object is not + * already cached, we will do a *synchronous* read in the + * dnode_hold() call. The same is true for any indirects. + */ + dn = dnode_hold(os->os, object, FTAG); + if (dn == NULL) + return; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (dn->dn_datablkshift) { + int blkshift = dn->dn_datablkshift; + nblks = (P2ROUNDUP(offset+len, 1<<blkshift) - + P2ALIGN(offset, 1<<blkshift)) >> blkshift; + } else { + nblks = (offset < dn->dn_datablksz); + } + + if (nblks != 0) { + blkid = dbuf_whichblock(dn, offset); + for (i = 0; i < nblks; i++) + dbuf_prefetch(dn, blkid+i); + } + + rw_exit(&dn->dn_struct_rwlock); + + dnode_rele(dn, FTAG); +} + +void +dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size, dmu_tx_t *tx) +{ + dnode_t *dn = dnode_hold(os->os, object, FTAG); + ASSERT(offset < UINT64_MAX); + ASSERT(size == -1ULL || size <= UINT64_MAX - offset); + dnode_free_range(dn, offset, size, tx); + dnode_rele(dn, FTAG); +} + +static int +dmu_read_impl(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + void *buf, uint32_t flags) +{ + dnode_t *dn; + dmu_buf_t **dbp; + int numbufs, i; + + dn = dnode_hold(os->os, object, FTAG); + + if (dn->dn_datablkshift == 0) { + int newsz = offset > dn->dn_datablksz ? 0 : + MIN(size, dn->dn_datablksz - offset); + bzero((char *)buf + newsz, size - newsz); + size = newsz; + } + + dnode_rele(dn, FTAG); + + if (size == 0) + return (0); + + while (size > 0) { + uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); + int err; + + /* + * NB: we could do this block-at-a-time, but it's nice + * to be reading in parallel. 
+ */ + dbp = dmu_buf_hold_array(os, object, offset, mylen, &numbufs); + err = dmu_buf_read_array_impl((dmu_buf_impl_t **)dbp, numbufs, + flags); + if (err) { + dmu_buf_rele_array(dbp, numbufs); + return (err); + } + + for (i = 0; i < numbufs; i++) { + int tocpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + bcopy((char *)db->db_data + bufoff, buf, tocpy); + + offset += tocpy; + size -= tocpy; + buf = (char *)buf + tocpy; + } + dmu_buf_rele_array(dbp, numbufs); + } + return (0); +} + +void +dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + void *buf) +{ + int err; + + err = dmu_read_impl(os, object, offset, size, buf, DB_RF_MUST_SUCCEED); + ASSERT3U(err, ==, 0); +} + +int +dmu_read_canfail(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + void *buf) +{ + return (dmu_read_impl(os, object, offset, size, buf, DB_RF_CANFAIL)); +} + +void +dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs, i; + + dbp = dmu_buf_hold_array(os, object, offset, size, &numbufs); + + for (i = 0; i < numbufs; i++) { + int tocpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty(db, tx); + + bcopy(buf, (char *)db->db_data + bufoff, tocpy); + + if (tocpy == db->db_size) + dmu_buf_fill_done(db, tx); + + offset += tocpy; + size -= tocpy; + buf = (char *)buf + tocpy; + } + dmu_buf_rele_array(dbp, numbufs); +} + +#ifdef _KERNEL +int +dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + uio_t *uio, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs, i; + int err = 0; + + dbp = dmu_buf_hold_array(os, 
object, offset, size, &numbufs); + + for (i = 0; i < numbufs; i++) { + int tocpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty(db, tx); + + /* + * XXX uiomove could block forever (eg. nfs-backed + * pages). There needs to be a uiolockdown() function + * to lock the pages in memory, so that uiomove won't + * block. + */ + err = uiomove((char *)db->db_data + bufoff, tocpy, + UIO_WRITE, uio); + + if (tocpy == db->db_size) + dmu_buf_fill_done(db, tx); + + if (err) + break; + + offset += tocpy; + size -= tocpy; + } + dmu_buf_rele_array(dbp, numbufs); + return (err); +} +#endif + +struct backuparg { + dmu_replay_record_t *drr; + vnode_t *vp; + objset_t *os; + int err; +}; + +static int +dump_bytes(struct backuparg *ba, void *buf, int len) +{ + ssize_t resid; /* have to get resid to get detailed errno */ + /* Need to compute checksum here */ + ASSERT3U(len % 8, ==, 0); + ba->err = vn_rdwr(UIO_WRITE, ba->vp, + (caddr_t)buf, len, + 0, UIO_SYSSPACE, FAPPEND, RLIM_INFINITY, CRED(), &resid); + return (ba->err); +} + +static int +dump_free(struct backuparg *ba, uint64_t object, uint64_t offset, + uint64_t length) +{ + /* write a FREE record */ + bzero(ba->drr, sizeof (dmu_replay_record_t)); + ba->drr->drr_type = DRR_FREE; + ba->drr->drr_u.drr_free.drr_object = object; + ba->drr->drr_u.drr_free.drr_offset = offset; + ba->drr->drr_u.drr_free.drr_length = length; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) + return (EINTR); + return (0); +} + +static int +dump_data(struct backuparg *ba, dmu_object_type_t type, + uint64_t object, uint64_t offset, int blksz, void *data) +{ + /* write a DATA record */ + bzero(ba->drr, sizeof (dmu_replay_record_t)); + ba->drr->drr_type = DRR_WRITE; + ba->drr->drr_u.drr_write.drr_object 
= object; + ba->drr->drr_u.drr_write.drr_type = type; + ba->drr->drr_u.drr_write.drr_offset = offset; + ba->drr->drr_u.drr_write.drr_length = blksz; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) + return (EINTR); + if (dump_bytes(ba, data, blksz)) + return (EINTR); + return (0); +} + +static int +dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs) +{ + /* write a FREEOBJECTS record */ + bzero(ba->drr, sizeof (dmu_replay_record_t)); + ba->drr->drr_type = DRR_FREEOBJECTS; + ba->drr->drr_u.drr_freeobjects.drr_firstobj = firstobj; + ba->drr->drr_u.drr_freeobjects.drr_numobjs = numobjs; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) + return (EINTR); + return (0); +} + +static int +dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp) +{ + if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) + return (dump_freeobjects(ba, object, 1)); + + /* write an OBJECT record */ + bzero(ba->drr, sizeof (dmu_replay_record_t)); + ba->drr->drr_type = DRR_OBJECT; + ba->drr->drr_u.drr_object.drr_object = object; + ba->drr->drr_u.drr_object.drr_type = dnp->dn_type; + ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype; + ba->drr->drr_u.drr_object.drr_blksz = + dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; + ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen; + ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum; + ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) + return (EINTR); + + if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8))) + return (EINTR); + + /* free anything past the end of the file */ + if (dump_free(ba, object, (dnp->dn_maxblkid + 1) * + (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL)) + return (EINTR); + if (ba->err) + return (EINTR); + return (0); +} + +#define BP_SPAN(dnp, level) \ + (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ + (level) * (dnp->dn_indblkshift - 
SPA_BLKPTRSHIFT))) + +static int +backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) +{ + struct backuparg *ba = arg; + uint64_t object = bc->bc_bookmark.zb_object; + int level = bc->bc_bookmark.zb_level; + uint64_t blkid = bc->bc_bookmark.zb_blkid; + blkptr_t *bp = bc->bc_blkptr.blk_birth ? &bc->bc_blkptr : NULL; + dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; + void *data = bc->bc_data; + int err = 0; + + if (issig(JUSTLOOKING)) + return (EINTR); + + ASSERT(data || bp == NULL); + + if (bp == NULL && object == 0) { + uint64_t span = BP_SPAN(bc->bc_dnode, level); + uint64_t dnobj = (blkid * span) >> DNODE_SHIFT; + err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT); + } else if (bp == NULL) { + uint64_t span = BP_SPAN(bc->bc_dnode, level); + err = dump_free(ba, object, blkid * span, span); + } else if (data && level == 0 && type == DMU_OT_DNODE) { + dnode_phys_t *blk = data; + int i; + int blksz = BP_GET_LSIZE(bp); + + for (i = 0; i < blksz >> DNODE_SHIFT; i++) { + uint64_t dnobj = + (blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; + err = dump_dnode(ba, dnobj, blk+i); + if (err) + break; + } + } else if (level == 0 && + type != DMU_OT_DNODE && type != DMU_OT_OBJSET) { + int blksz = BP_GET_LSIZE(bp); + if (data == NULL) { + arc_buf_t *abuf; + + (void) arc_read(NULL, spa, bp, + dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED, + ARC_WAIT); + + if (abuf) { + err = dump_data(ba, type, object, blkid * blksz, + blksz, abuf->b_data); + arc_buf_free(abuf, &abuf); + } + } else { + err = dump_data(ba, type, object, blkid * blksz, + blksz, data); + } + } + + ASSERT(err == 0 || err == EINTR); + return (err); +} + +int +dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, vnode_t *vp) +{ + dsl_dataset_t *ds = tosnap->os->os_dsl_dataset; + dsl_dataset_t *fromds = fromsnap ? 
fromsnap->os->os_dsl_dataset : NULL; + dmu_replay_record_t *drr; + struct backuparg ba; + int err; + + /* tosnap must be a snapshot */ + if (ds->ds_phys->ds_next_snap_obj == 0) + return (EINVAL); + + /* fromsnap must be an earlier snapshot from the same fs as tosnap */ + if (fromds && (ds->ds_dir != fromds->ds_dir || + fromds->ds_phys->ds_creation_txg >= + ds->ds_phys->ds_creation_txg)) + return (EXDEV); + + drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); + drr->drr_type = DRR_BEGIN; + drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; + drr->drr_u.drr_begin.drr_version = DMU_BACKUP_VERSION; + drr->drr_u.drr_begin.drr_creation_time = + ds->ds_phys->ds_creation_time; + drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type; + drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; + if (fromds) + drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid; + dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); + + ba.drr = drr; + ba.vp = vp; + ba.os = tosnap; + + if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) { + kmem_free(drr, sizeof (dmu_replay_record_t)); + return (ba.err); + } + + err = traverse_dsl_dataset(ds, + fromds ? 
fromds->ds_phys->ds_creation_txg : 0, + ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK, + backup_cb, &ba); + + if (err) { + if (err == EINTR && ba.err) + err = ba.err; + return (err); + } + + bzero(drr, sizeof (dmu_replay_record_t)); + drr->drr_type = DRR_END; + + if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) + return (ba.err); + + kmem_free(drr, sizeof (dmu_replay_record_t)); + + return (0); +} + +struct restorearg { + int err; + int byteswap; + vnode_t *vp; + char *buf; + uint64_t voff; + int buflen; /* number of valid bytes in buf */ + int bufoff; /* next offset to read */ + int bufsize; /* amount of memory allocated for buf */ +}; + +static int +replay_incremental_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) +{ + struct drr_begin *drrb = arg; + dsl_dataset_t *ds = NULL; + dsl_dataset_t *ds_prev = NULL; + const char *snapname; + int err = EINVAL; + uint64_t val; + + /* this must be a filesytem */ + if (dd->dd_phys->dd_head_dataset_obj == 0) + goto die; + + ds = dsl_dataset_open_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj, + NULL, DS_MODE_EXCLUSIVE, FTAG); + + if (ds == NULL) { + err = EBUSY; + goto die; + } + + /* must already be a snapshot of this fs */ + if (ds->ds_phys->ds_prev_snap_obj == 0) { + err = ENODEV; + goto die; + } + + /* most recent snapshot must match fromguid */ + ds_prev = dsl_dataset_open_obj(dd->dd_pool, + ds->ds_phys->ds_prev_snap_obj, NULL, + DS_MODE_STANDARD | DS_MODE_READONLY, FTAG); + if (ds_prev->ds_phys->ds_guid != drrb->drr_fromguid) { + err = ENODEV; + goto die; + } + + /* must not have any changes since most recent snapshot */ + if (ds->ds_phys->ds_bp.blk_birth > + ds_prev->ds_phys->ds_creation_txg) { + err = ETXTBSY; + goto die; + } + + /* new snapshot name must not exist */ + snapname = strrchr(drrb->drr_toname, '@'); + if (snapname == NULL) { + err = EEXIST; + goto die; + } + snapname++; + err = zap_lookup(dd->dd_pool->dp_meta_objset, + ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &val); + if 
(err != ENOENT) { + if (err == 0) + err = EEXIST; + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + dsl_dataset_close(ds_prev, DS_MODE_STANDARD, FTAG); + return (err); + } + + dsl_dataset_close(ds_prev, DS_MODE_STANDARD, FTAG); + + /* The point of no (unsuccessful) return. */ + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_restoring = TRUE; + + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + return (0); + +die: + if (ds_prev) + dsl_dataset_close(ds_prev, DS_MODE_STANDARD, FTAG); + if (ds) + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + return (err); +} + +static int +replay_full_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) +{ + struct drr_begin *drrb = arg; + int err; + char *fsfullname, *fslastname, *cp; + dsl_dataset_t *ds; + + fsfullname = kmem_alloc(MAXNAMELEN, KM_SLEEP); + (void) strncpy(fsfullname, drrb->drr_toname, MAXNAMELEN); + cp = strchr(fsfullname, '@'); + if (cp == NULL) { + kmem_free(fsfullname, MAXNAMELEN); + return (EINVAL); + } + *cp = '\0'; + fslastname = strrchr(fsfullname, '/'); + if (fslastname == NULL) { + kmem_free(fsfullname, MAXNAMELEN); + return (EINVAL); + } + fslastname++; + + err = dsl_dataset_create_sync(dd, fsfullname, fslastname, NULL, tx); + if (err) { + kmem_free(fsfullname, MAXNAMELEN); + return (err); + } + + /* the point of no (unsuccessful) return */ + + err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, fsfullname, + DS_MODE_EXCLUSIVE, FTAG, &ds); + ASSERT3U(err, ==, 0); + kmem_free(fsfullname, MAXNAMELEN); + + (void) dmu_objset_create_impl(dsl_dataset_get_spa(ds), + ds, drrb->drr_type, tx); + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_restoring = TRUE; + + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + return (0); +} + +static int +replay_end_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) +{ + struct drr_begin *drrb = arg; + int err; + char *snapname; + dsl_dataset_t *ds; + + /* XXX verify that drr_toname is in dd */ + + snapname = strchr(drrb->drr_toname, '@'); + if (snapname == NULL) + return 
(EINVAL); + snapname++; + + /* create snapshot */ + err = dsl_dataset_snapshot_sync(dd, snapname, tx); + if (err) + return (err); + + /* set snapshot's creation time and guid */ + err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, drrb->drr_toname, + DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_RESTORE, FTAG, &ds); + ASSERT3U(err, ==, 0); + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_creation_time = drrb->drr_creation_time; + ds->ds_phys->ds_guid = drrb->drr_toguid; + ds->ds_phys->ds_restoring = FALSE; + + dsl_dataset_close(ds, DS_MODE_PRIMARY, FTAG); + + ds = dsl_dataset_open_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj, + NULL, DS_MODE_STANDARD | DS_MODE_RESTORE, FTAG); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_restoring = FALSE; + dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG); + + return (0); +} + +void * +restore_read(struct restorearg *ra, int len) +{ + void *rv; + + /* some things will require 8-byte alignment, so everything must */ + ASSERT3U(len % 8, ==, 0); + + while (ra->buflen - ra->bufoff < len) { + ssize_t resid; + int leftover = ra->buflen - ra->bufoff; + + (void) memmove(ra->buf, ra->buf + ra->bufoff, leftover); + ra->err = vn_rdwr(UIO_READ, ra->vp, + (caddr_t)ra->buf + leftover, ra->bufsize - leftover, + ra->voff, UIO_SYSSPACE, FAPPEND, + RLIM_INFINITY, CRED(), &resid); + + /* Need to compute checksum */ + + ra->voff += ra->bufsize - leftover - resid; + ra->buflen = ra->bufsize - resid; + ra->bufoff = 0; + if (resid == ra->bufsize - leftover) + ra->err = EINVAL; + if (ra->err) + return (NULL); + } + + ASSERT3U(ra->bufoff % 8, ==, 0); + ASSERT3U(ra->buflen - ra->bufoff, >=, len); + rv = ra->buf + ra->bufoff; + ra->bufoff += len; + return (rv); +} + +static void +backup_byteswap(dmu_replay_record_t *drr) +{ +#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) +#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) + drr->drr_type = BSWAP_32(drr->drr_type); + switch (drr->drr_type) { + case DRR_BEGIN: + 
DO64(drr_begin.drr_magic); + DO64(drr_begin.drr_version); + DO64(drr_begin.drr_creation_time); + DO32(drr_begin.drr_type); + DO64(drr_begin.drr_toguid); + DO64(drr_begin.drr_fromguid); + break; + case DRR_OBJECT: + DO64(drr_object.drr_object); + /* DO64(drr_object.drr_allocation_txg); */ + DO32(drr_object.drr_type); + DO32(drr_object.drr_bonustype); + DO32(drr_object.drr_blksz); + DO32(drr_object.drr_bonuslen); + break; + case DRR_FREEOBJECTS: + DO64(drr_freeobjects.drr_firstobj); + DO64(drr_freeobjects.drr_numobjs); + break; + case DRR_WRITE: + DO64(drr_write.drr_object); + DO32(drr_write.drr_type); + DO64(drr_write.drr_offset); + DO64(drr_write.drr_length); + break; + case DRR_FREE: + DO64(drr_free.drr_object); + DO64(drr_free.drr_offset); + DO64(drr_free.drr_length); + break; + case DRR_END: + DO64(drr_end.drr_checksum); + break; + } +#undef DO64 +#undef DO32 +} + +static int +restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) +{ + int err; + dmu_tx_t *tx; + + err = dmu_object_info(os, drro->drr_object, NULL); + + if (err != 0 && err != ENOENT) + return (EINVAL); + + if (drro->drr_type == DMU_OT_NONE || + drro->drr_type >= DMU_OT_NUMTYPES || + drro->drr_bonustype >= DMU_OT_NUMTYPES || + drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS || + drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || + P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || + drro->drr_blksz < SPA_MINBLOCKSIZE || + drro->drr_blksz > SPA_MAXBLOCKSIZE || + drro->drr_bonuslen > DN_MAX_BONUSLEN) { + return (EINVAL); + } + + tx = dmu_tx_create(os); + + if (err == ENOENT) { + /* currently free, want to be allocated */ + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + err = dmu_object_claim(os, drro->drr_object, + drro->drr_type, drro->drr_blksz, + drro->drr_bonustype, drro->drr_bonuslen, tx); + } else { + /* currently allocated, want to be allocated */ + 
dmu_tx_hold_bonus(tx, drro->drr_object); + /* + * We may change blocksize, so need to + * hold_write + */ + dmu_tx_hold_write(tx, drro->drr_object, 0, 1); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + err = dmu_object_reclaim(os, drro->drr_object, + drro->drr_type, drro->drr_blksz, + drro->drr_bonustype, drro->drr_bonuslen, tx); + } + if (err) { + dmu_tx_commit(tx); + return (EINVAL); + } + + dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx); + dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); + + if (drro->drr_bonuslen) { + dmu_buf_t *db; + void *data; + db = dmu_bonus_hold(os, drro->drr_object); + dmu_buf_will_dirty(db, tx); + + ASSERT3U(db->db_size, ==, drro->drr_bonuslen); + data = restore_read(ra, P2ROUNDUP(db->db_size, 8)); + if (data == NULL) { + dmu_tx_commit(tx); + return (ra->err); + } + bcopy(data, db->db_data, db->db_size); + if (ra->byteswap) { + dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data, + drro->drr_bonuslen); + } + dmu_buf_rele(db); + } + dmu_tx_commit(tx); + return (0); +} + +/* ARGSUSED */ +static int +restore_freeobjects(struct restorearg *ra, objset_t *os, + struct drr_freeobjects *drrfo) +{ + uint64_t obj; + + if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) + return (EINVAL); + + for (obj = drrfo->drr_firstobj; + obj < drrfo->drr_firstobj + drrfo->drr_numobjs; obj++) { + dmu_tx_t *tx; + int err; + + if (dmu_object_info(os, obj, NULL) != 0) + continue; + + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, obj); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + err = dmu_object_free(os, obj, tx); + dmu_tx_commit(tx); + if (err && err != ENOENT) + return (EINVAL); + } + return (0); +} + +static int +restore_write(struct restorearg *ra, objset_t *os, + struct drr_write *drrw) +{ + dmu_tx_t *tx; + void *data; + int err; + + if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || + 
drrw->drr_type >= DMU_OT_NUMTYPES) + return (EINVAL); + + data = restore_read(ra, drrw->drr_length); + if (data == NULL) + return (ra->err); + + if (dmu_object_info(os, drrw->drr_object, NULL) != 0) + return (EINVAL); + + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, drrw->drr_object, + drrw->drr_offset, drrw->drr_length); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + if (ra->byteswap) + dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length); + dmu_write(os, drrw->drr_object, + drrw->drr_offset, drrw->drr_length, data, tx); + dmu_tx_commit(tx); + return (0); +} + +/* ARGSUSED */ +static int +restore_free(struct restorearg *ra, objset_t *os, + struct drr_free *drrf) +{ + dmu_tx_t *tx; + int err; + + if (drrf->drr_length != -1ULL && + drrf->drr_offset + drrf->drr_length < drrf->drr_offset) + return (EINVAL); + + if (dmu_object_info(os, drrf->drr_object, NULL) != 0) + return (EINVAL); + + tx = dmu_tx_create(os); + + dmu_tx_hold_free(tx, drrf->drr_object, + drrf->drr_offset, drrf->drr_length); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + dmu_free_range(os, drrf->drr_object, + drrf->drr_offset, drrf->drr_length, tx); + dmu_tx_commit(tx); + return (0); +} + +int +dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep, + vnode_t *vp, uint64_t voffset) +{ + struct restorearg ra; + dmu_replay_record_t *drr; + char *cp, *tosnap; + dsl_dir_t *dd = NULL; + objset_t *os = NULL; + + bzero(&ra, sizeof (ra)); + ra.vp = vp; + ra.voff = voffset; + ra.bufsize = 1<<20; + ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); + + if (drrb->drr_magic == DMU_BACKUP_MAGIC) { + ra.byteswap = FALSE; + } else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { + ra.byteswap = TRUE; + } else { + ra.err = EINVAL; + goto out; + } + + if (ra.byteswap) { + drrb->drr_magic = BSWAP_64(drrb->drr_magic); + drrb->drr_version = BSWAP_64(drrb->drr_version); + drrb->drr_creation_time = 
BSWAP_64(drrb->drr_creation_time); + drrb->drr_type = BSWAP_32(drrb->drr_type); + drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); + drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); + } + + ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); + + tosnap = drrb->drr_toname; + if (drrb->drr_version != DMU_BACKUP_VERSION || + drrb->drr_type >= DMU_OST_NUMTYPES || + strchr(drrb->drr_toname, '@') == NULL) { + ra.err = EINVAL; + goto out; + } + + /* + * Process the begin in syncing context. + */ + if (drrb->drr_fromguid) { + /* incremental backup */ + + cp = strchr(tosnap, '@'); + *cp = '\0'; + dd = dsl_dir_open(tosnap, FTAG, NULL); + *cp = '@'; + if (dd == NULL) { + ra.err = ENOENT; + goto out; + } + + ra.err = dsl_dir_sync_task(dd, replay_incremental_sync, + drrb, 1<<20); + } else { + /* full backup */ + const char *tail; + + cp = strchr(tosnap, '@'); + *cp = '\0'; + dd = dsl_dir_open(tosnap, FTAG, &tail); + *cp = '@'; + if (dd == NULL) { + ra.err = ENOENT; + goto out; + } + if (tail == NULL) { + ra.err = EEXIST; + goto out; + } + + ra.err = dsl_dir_sync_task(dd, replay_full_sync, + drrb, 1<<20); + } + if (ra.err) + goto out; + + /* + * Open the objset we are modifying. + */ + + cp = strchr(tosnap, '@'); + *cp = '\0'; + ra.err = dmu_objset_open(tosnap, DMU_OST_ANY, + DS_MODE_PRIMARY | DS_MODE_RESTORE, &os); + *cp = '@'; + ASSERT3U(ra.err, ==, 0); + + /* + * Read records and process them. + */ + while (ra.err == 0 && + NULL != (drr = restore_read(&ra, sizeof (*drr)))) { + if (issig(JUSTLOOKING)) { + ra.err = EINTR; + goto out; + } + + if (ra.byteswap) + backup_byteswap(drr); + + switch (drr->drr_type) { + case DRR_OBJECT: + { + /* + * We need to make a copy of the record header, + * because restore_{object,write} may need to + * restore_read(), which will invalidate drr. 
+ */ + struct drr_object drro = drr->drr_u.drr_object; + ra.err = restore_object(&ra, os, &drro); + break; + } + case DRR_FREEOBJECTS: + { + struct drr_freeobjects drrfo = + drr->drr_u.drr_freeobjects; + ra.err = restore_freeobjects(&ra, os, &drrfo); + break; + } + case DRR_WRITE: + { + struct drr_write drrw = drr->drr_u.drr_write; + ra.err = restore_write(&ra, os, &drrw); + break; + } + case DRR_FREE: + { + struct drr_free drrf = drr->drr_u.drr_free; + ra.err = restore_free(&ra, os, &drrf); + break; + } + case DRR_END: + /* Need to verify checksum. */ + /* + * dd may be the parent of the dd we are + * restoring into (eg. if it's a full backup). + */ + ra.err = dsl_dir_sync_task(dmu_objset_ds(os)-> + ds_dir, replay_end_sync, drrb, 1<<20); + goto out; + default: + ra.err = EINVAL; + goto out; + } + } + +out: + if (os) + dmu_objset_close(os); + + /* + * Make sure we don't rollback/destroy unless we actually + * processed the begin properly. 'os' will only be set if this + * is the case. + */ + if (ra.err && os && dd && tosnap && strchr(tosnap, '@')) { + /* + * rollback or destroy what we created, so we don't + * leave it in the restoring state. + */ + txg_wait_synced(dd->dd_pool, 0); + if (drrb->drr_fromguid) { + /* incremental: rollback to most recent snapshot */ + (void) dsl_dir_sync_task(dd, + dsl_dataset_rollback_sync, NULL, 0); + } else { + /* full: destroy whole fs */ + cp = strchr(tosnap, '@'); + *cp = '\0'; + cp = strchr(tosnap, '/'); + if (cp) { + (void) dsl_dir_sync_task(dd, + dsl_dir_destroy_sync, cp+1, 0); + } + cp = strchr(tosnap, '\0'); + *cp = '@'; + } + + } + + if (dd) + dsl_dir_close(dd, FTAG); + kmem_free(ra.buf, ra.bufsize); + if (sizep) + *sizep = ra.voff; + return (ra.err); +} + +/* + * Intent log support: sync the block at <os, object, offset> to disk. + * N.B. and XXX: the caller is responsible for serializing dmu_sync()s + * of the same block, and for making sure that the data isn't changing + * while dmu_sync() is writing it. 
+ * + * Return values: + * + * EALREADY: this txg has already been synced, so there's nothing to to. + * The caller should not log the write. + * + * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. + * The caller should not log the write. + * + * EINPROGRESS: the block is in the process of being synced by the + * usual mechanism (spa_sync()), so we can't sync it here. + * The caller should txg_wait_synced() and not log the write. + * + * EBUSY: another thread is trying to dmu_sync() the same dbuf. + * (This case cannot arise under the current locking rules.) + * The caller should txg_wait_synced() and not log the write. + * + * ESTALE: the block was dirtied or freed while we were writing it, + * so the data is no longer valid. + * The caller should txg_wait_synced() and not log the write. + * + * 0: success. Sets *bp to the blkptr just written, and sets + * *blkoff to the data's offset within that block. + * The caller should log this blkptr/blkoff in its lr_write_t. + */ +int +dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, + blkptr_t *bp, uint64_t txg) +{ + dsl_pool_t *dp = os->os->os_dsl_dataset->ds_dir->dd_pool; + tx_state_t *tx = &dp->dp_tx; + dmu_buf_impl_t *db; + blkptr_t *blk; + int err; + + ASSERT(RW_LOCK_HELD(&tx->tx_suspend)); + ASSERT(BP_IS_HOLE(bp)); + ASSERT(txg != 0); + + dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n", + txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg); + + /* + * If this txg already synced, there's nothing to do. + */ + if (txg <= tx->tx_synced_txg) { + /* + * If we're running ziltest, we need the blkptr regardless. 
+ */ + if (txg > spa_freeze_txg(dp->dp_spa)) { + db = (dmu_buf_impl_t *)dmu_buf_hold(os, object, offset); + /* if db_blkptr == NULL, this was an empty write */ + if (db->db_blkptr) + *bp = *db->db_blkptr; /* structure assignment */ + else + bzero(bp, sizeof (blkptr_t)); + *blkoff = offset - db->db.db_offset; + ASSERT3U(*blkoff, <, db->db.db_size); + dmu_buf_rele((dmu_buf_t *)db); + return (0); + } + return (EALREADY); + } + + /* + * If this txg is in the middle of syncing, just wait for it. + */ + if (txg == tx->tx_syncing_txg) { + ASSERT(txg != tx->tx_open_txg); + return (EINPROGRESS); + } + + db = (dmu_buf_impl_t *)dmu_buf_hold(os, object, offset); + + mutex_enter(&db->db_mtx); + + /* + * If this dbuf isn't dirty, must have been free_range'd. + * There's no need to log writes to freed blocks, so we're done. + */ + if (!list_link_active(&db->db_dirty_node[txg&TXG_MASK])) { + mutex_exit(&db->db_mtx); + dmu_buf_rele((dmu_buf_t *)db); + return (ENOENT); + } + + blk = db->db_d.db_overridden_by[txg&TXG_MASK]; + + /* + * If we already did a dmu_sync() of this dbuf in this txg, + * free the old block before writing the new one. 
+ */ + if (blk != NULL) { + ASSERT(blk != IN_DMU_SYNC); + if (blk == IN_DMU_SYNC) { + mutex_exit(&db->db_mtx); + dmu_buf_rele((dmu_buf_t *)db); + return (EBUSY); + } + arc_release(db->db_d.db_data_old[txg&TXG_MASK], db); + if (!BP_IS_HOLE(blk)) { + (void) arc_free(NULL, os->os->os_spa, txg, blk, + NULL, NULL, ARC_WAIT); + } + kmem_free(blk, sizeof (blkptr_t)); + } + + db->db_d.db_overridden_by[txg&TXG_MASK] = IN_DMU_SYNC; + mutex_exit(&db->db_mtx); + + blk = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); + blk->blk_birth = 0; /* mark as invalid */ + + err = arc_write(NULL, os->os->os_spa, + zio_checksum_select(db->db_dnode->dn_checksum, os->os->os_checksum), + zio_compress_select(db->db_dnode->dn_compress, os->os->os_compress), + txg, blk, db->db_d.db_data_old[txg&TXG_MASK], NULL, NULL, + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT); + ASSERT(err == 0); + + if (!BP_IS_HOLE(blk)) { + blk->blk_fill = 1; + BP_SET_TYPE(blk, db->db_dnode->dn_type); + BP_SET_LEVEL(blk, 0); + } + + /* copy the block pointer back to caller */ + *bp = *blk; /* structure assignment */ + *blkoff = offset - db->db.db_offset; + ASSERT3U(*blkoff, <, db->db.db_size); + + mutex_enter(&db->db_mtx); + if (db->db_d.db_overridden_by[txg&TXG_MASK] != IN_DMU_SYNC) { + /* we were dirtied/freed during the sync */ + ASSERT3P(db->db_d.db_overridden_by[txg&TXG_MASK], ==, NULL); + arc_release(db->db_d.db_data_old[txg&TXG_MASK], db); + mutex_exit(&db->db_mtx); + dmu_buf_rele((dmu_buf_t *)db); + /* Note that this block does not free on disk until txg syncs */ + + /* + * XXX can we use ARC_NOWAIT here? + * XXX should we be ignoring the return code? 
+ */ + if (!BP_IS_HOLE(blk)) { + (void) arc_free(NULL, os->os->os_spa, txg, blk, + NULL, NULL, ARC_WAIT); + } + kmem_free(blk, sizeof (blkptr_t)); + return (ESTALE); + } + + db->db_d.db_overridden_by[txg&TXG_MASK] = blk; + mutex_exit(&db->db_mtx); + dmu_buf_rele((dmu_buf_t *)db); + ASSERT3U(txg, >, tx->tx_syncing_txg); + return (0); +} + +uint64_t +dmu_object_max_nonzero_offset(objset_t *os, uint64_t object) +{ + dnode_t *dn = dnode_hold(os->os, object, FTAG); + uint64_t rv = dnode_max_nonzero_offset(dn); + dnode_rele(dn, FTAG); + return (rv); +} + +int +dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, + dmu_tx_t *tx) +{ + dnode_t *dn = dnode_hold(os->os, object, FTAG); + int err = dnode_set_blksz(dn, size, ibs, tx); + dnode_rele(dn, FTAG); + return (err); +} + +void +dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, + dmu_tx_t *tx) +{ + dnode_t *dn = dnode_hold(os->os, object, FTAG); + ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); + dn->dn_checksum = checksum; + dnode_setdirty(dn, tx); + dnode_rele(dn, FTAG); +} + +void +dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, + dmu_tx_t *tx) +{ + dnode_t *dn = dnode_hold(os->os, object, FTAG); + ASSERT(compress < ZIO_COMPRESS_FUNCTIONS); + dn->dn_compress = compress; + dnode_setdirty(dn, tx); + dnode_rele(dn, FTAG); +} + +int +dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) +{ + dnode_t *dn; + int i, err; + + dn = dnode_hold(os->os, object, FTAG); + /* + * Sync any current changes before + * we go trundling through the block pointers. 
+ */ + for (i = 0; i < TXG_SIZE; i++) { + if (dn->dn_dirtyblksz[i]) + break; + } + if (i != TXG_SIZE) { + dnode_rele(dn, FTAG); + txg_wait_synced(dmu_objset_pool(os), 0); + dn = dnode_hold(os->os, object, FTAG); + } + + err = dnode_next_offset(dn, hole, off, 1, 1); + dnode_rele(dn, FTAG); + + return (err); +} + +void +dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) +{ + rw_enter(&dn->dn_struct_rwlock, RW_READER); + mutex_enter(&dn->dn_mtx); + + doi->doi_data_block_size = dn->dn_datablksz; + doi->doi_metadata_block_size = dn->dn_indblkshift ? + 1ULL << dn->dn_indblkshift : 0; + doi->doi_indirection = dn->dn_nlevels; + doi->doi_checksum = dn->dn_checksum; + doi->doi_compress = dn->dn_compress; + doi->doi_physical_blks = dn->dn_phys->dn_secphys; + doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid; + doi->doi_type = dn->dn_type; + doi->doi_bonus_size = dn->dn_bonuslen; + doi->doi_bonus_type = dn->dn_bonustype; + + mutex_exit(&dn->dn_mtx); + rw_exit(&dn->dn_struct_rwlock); +} + +/* + * Get information on a DMU object. + * If doi is NULL, just indicates whether the object exists. + */ +int +dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) +{ + dnode_t *dn = dnode_hold(os->os, object, FTAG); + + if (dn == NULL) + return (ENOENT); + + if (doi != NULL) + dmu_object_info_from_dnode(dn, doi); + + dnode_rele(dn, FTAG); + return (0); +} + +/* + * As above, but faster; can be used when you have a held dbuf in hand. + */ +void +dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi) +{ + dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi); +} + +/* + * Faster still when you only care about the size. + * This is specifically optimized for zfs_getattr(). 
+ */ +void +dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512) +{ + dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; + + *blksize = dn->dn_datablksz; + *nblk512 = dn->dn_phys->dn_secphys + 1; /* add 1 for dnode space */ +} + +void +byteswap_uint64_array(void *vbuf, size_t size) +{ + uint64_t *buf = vbuf; + size_t count = size >> 3; + int i; + + ASSERT((size & 7) == 0); + + for (i = 0; i < count; i++) + buf[i] = BSWAP_64(buf[i]); +} + +void +byteswap_uint32_array(void *vbuf, size_t size) +{ + uint32_t *buf = vbuf; + size_t count = size >> 2; + int i; + + ASSERT((size & 3) == 0); + + for (i = 0; i < count; i++) + buf[i] = BSWAP_32(buf[i]); +} + +void +byteswap_uint16_array(void *vbuf, size_t size) +{ + uint16_t *buf = vbuf; + size_t count = size >> 1; + int i; + + ASSERT((size & 1) == 0); + + for (i = 0; i < count; i++) + buf[i] = BSWAP_16(buf[i]); +} + +/* ARGSUSED */ +void +byteswap_uint8_array(void *vbuf, size_t size) +{ +} + +void +dmu_init(void) +{ + dbuf_init(); + dnode_init(); + arc_init(); +} + +void +dmu_fini(void) +{ + arc_fini(); + dnode_fini(); + dbuf_fini(); +} diff --git a/usr/src/uts/common/fs/zfs/dmu_object.c b/usr/src/uts/common/fs/zfs/dmu_object.c new file mode 100644 index 0000000000..d150d6c400 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/dmu_object.c @@ -0,0 +1,149 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_tx.h> +#include <sys/dnode.h> + +uint64_t +dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + objset_impl_t *osi = os->os; + uint64_t object; + uint64_t L2_dnode_count = DNODES_PER_BLOCK << + (osi->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT); + dnode_t *dn; + int restarted = B_FALSE; + + mutex_enter(&osi->os_obj_lock); + for (;;) { + object = osi->os_obj_next; + /* + * Each time we polish off an L2 bp worth of dnodes + * (2^13 objects), move to another L2 bp that's still + * reasonably sparse (at most 1/4 full). Look from the + * beginning once, but after that keep looking from here. + * If we can't find one, just keep going from here. + */ + if (P2PHASE(object, L2_dnode_count) == 0) { + uint64_t offset = restarted ? 
object << DNODE_SHIFT : 0; + int error = dnode_next_offset(osi->os_meta_dnode, + B_TRUE, &offset, 2, DNODES_PER_BLOCK >> 2); + restarted = B_TRUE; + if (error == 0) + object = offset >> DNODE_SHIFT; + } + osi->os_obj_next = ++object; + + dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG); + if (dn) + break; + + if (dmu_object_next(os, &object, B_TRUE) == 0) + osi->os_obj_next = object - 1; + } + + dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); + dnode_rele(dn, FTAG); + + mutex_exit(&osi->os_obj_lock); + + dmu_tx_add_new_object(tx, os, object); + return (object); +} + +int +dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + dnode_t *dn; + + if ((object & DMU_PRIVATE_OBJECT) && !dmu_tx_private_ok(tx)) + return (EBADF); + + dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG); + if (dn == NULL) + return (EEXIST); + dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); + dnode_rele(dn, FTAG); + + dmu_tx_add_new_object(tx, os, object); + return (0); +} + +int +dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + dnode_t *dn; + + if ((object & DMU_PRIVATE_OBJECT) && !dmu_tx_private_ok(tx)) + return (EBADF); + + dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, FTAG); + if (dn == NULL) + return (EBADF); + dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx); + dnode_rele(dn, FTAG); + + return (0); +} + +int +dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + dnode_t *dn; + + ASSERT(!(object & DMU_PRIVATE_OBJECT) || dmu_tx_private_ok(tx)); + + dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, FTAG); + if (dn == NULL) + return (ENOENT); + + ASSERT(dn->dn_type != DMU_OT_NONE); + dnode_free(dn, tx); + dnode_rele(dn, FTAG); + + return (0); +} + +int +dmu_object_next(objset_t *os, uint64_t 
*objectp, boolean_t hole) +{ + uint64_t offset = (*objectp + 1) << DNODE_SHIFT; + int error; + + error = dnode_next_offset(os->os->os_meta_dnode, + hole, &offset, 0, DNODES_PER_BLOCK); + + *objectp = offset >> DNODE_SHIFT; + + return (error); +} diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c new file mode 100644 index 0000000000..9bb621b9a1 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c @@ -0,0 +1,727 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_pool.h> +#include <sys/dnode.h> +#include <sys/dbuf.h> +#include <sys/dmu_tx.h> +#include <sys/zio_checksum.h> +#include <sys/zap.h> +#include <sys/zil.h> +#include <sys/dmu_impl.h> + + +spa_t * +dmu_objset_spa(objset_t *os) +{ + return (os->os->os_spa); +} + +zilog_t * +dmu_objset_zil(objset_t *os) +{ + return (os->os->os_zil); +} + +dsl_pool_t * +dmu_objset_pool(objset_t *os) +{ + dsl_dataset_t *ds; + + if ((ds = os->os->os_dsl_dataset) != NULL && ds->ds_dir) + return (ds->ds_dir->dd_pool); + else + return (spa_get_dsl(os->os->os_spa)); +} + +dsl_dataset_t * +dmu_objset_ds(objset_t *os) +{ + return (os->os->os_dsl_dataset); +} + +dmu_objset_type_t +dmu_objset_type(objset_t *os) +{ + return (os->os->os_phys->os_type); +} + +void +dmu_objset_name(objset_t *os, char *buf) +{ + dsl_dataset_name(os->os->os_dsl_dataset, buf); +} + +uint64_t +dmu_objset_id(objset_t *os) +{ + dsl_dataset_t *ds = os->os->os_dsl_dataset; + + return (ds ? ds->ds_object : 0); +} + +static void +checksum_changed_cb(void *arg, uint64_t newval) +{ + objset_impl_t *osi = arg; + + /* + * Inheritance should have been done by now. + */ + ASSERT(newval != ZIO_CHECKSUM_INHERIT); + + osi->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); +} + +static void +compression_changed_cb(void *arg, uint64_t newval) +{ + objset_impl_t *osi = arg; + + /* + * Inheritance and range checking should have been done by now. 
+ */ + ASSERT(newval != ZIO_COMPRESS_INHERIT); + + osi->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE); +} + +void +dmu_objset_byteswap(void *buf, size_t size) +{ + objset_phys_t *osp = buf; + + ASSERT(size == sizeof (objset_phys_t)); + dnode_byteswap(&osp->os_meta_dnode); + byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t)); + osp->os_type = BSWAP_64(osp->os_type); +} + +objset_impl_t * +dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp) +{ + objset_impl_t *winner, *osi; + int i, err, checksum; + + osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP); + osi->os.os = osi; + osi->os_dsl_dataset = ds; + osi->os_spa = spa; + if (bp) + osi->os_rootbp = *bp; + osi->os_phys = zio_buf_alloc(sizeof (objset_phys_t)); + if (!BP_IS_HOLE(&osi->os_rootbp)) { + dprintf_bp(&osi->os_rootbp, "reading %s", ""); + (void) arc_read(NULL, spa, &osi->os_rootbp, + dmu_ot[DMU_OT_OBJSET].ot_byteswap, + arc_bcopy_func, osi->os_phys, + ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT); + } else { + bzero(osi->os_phys, sizeof (objset_phys_t)); + } + osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header); + + /* + * Note: the changed_cb will be called once before the register + * func returns, thus changing the checksum/compression from the + * default (fletcher2/off). + */ + if (ds) { + err = dsl_prop_register(ds, "checksum", + checksum_changed_cb, osi); + ASSERT(err == 0); + + err = dsl_prop_register(ds, "compression", + compression_changed_cb, osi); + ASSERT(err == 0); + } else { + /* It's the meta-objset. */ + osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4; + osi->os_compress = ZIO_COMPRESS_LZJB; + } + + /* + * Metadata always gets compressed and checksummed. + * If the data checksum is multi-bit correctable, and it's not + * a ZBT-style checksum, then it's suitable for metadata as well. + * Otherwise, the metadata checksum defaults to fletcher4. 
+ */ + checksum = osi->os_checksum; + + if (zio_checksum_table[checksum].ci_correctable && + !zio_checksum_table[checksum].ci_zbt) + osi->os_md_checksum = checksum; + else + osi->os_md_checksum = ZIO_CHECKSUM_FLETCHER_4; + + osi->os_md_compress = ZIO_COMPRESS_LZJB; + + for (i = 0; i < TXG_SIZE; i++) { + list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t), + offsetof(dnode_t, dn_dirty_link[i])); + list_create(&osi->os_free_dnodes[i], sizeof (dnode_t), + offsetof(dnode_t, dn_dirty_link[i])); + } + list_create(&osi->os_dnodes, sizeof (dnode_t), + offsetof(dnode_t, dn_link)); + list_create(&osi->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_link)); + + osi->os_meta_dnode = dnode_special_open(osi, + &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT); + + if (ds != NULL) { + winner = dsl_dataset_set_user_ptr(ds, osi, dmu_objset_evict); + if (winner) { + dmu_objset_evict(ds, osi); + osi = winner; + } + } + + return (osi); +} + +/* called from zpl */ +int +dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, + objset_t **osp) +{ + dsl_dataset_t *ds; + int err; + objset_t *os; + objset_impl_t *osi; + + os = kmem_alloc(sizeof (objset_t), KM_SLEEP); + err = dsl_dataset_open(name, mode, os, &ds); + if (err) { + kmem_free(os, sizeof (objset_t)); + return (err); + } + + osi = dsl_dataset_get_user_ptr(ds); + if (osi == NULL) { + blkptr_t bp; + + dsl_dataset_get_blkptr(ds, &bp); + osi = dmu_objset_open_impl(dsl_dataset_get_spa(ds), ds, &bp); + } + + os->os = osi; + os->os_mode = mode; + + if (type != DMU_OST_ANY && type != os->os->os_phys->os_type) { + dmu_objset_close(os); + return (EINVAL); + } + *osp = os; + return (0); +} + +void +dmu_objset_close(objset_t *os) +{ + dsl_dataset_close(os->os->os_dsl_dataset, os->os_mode, os); + kmem_free(os, sizeof (objset_t)); +} + +void +dmu_objset_evict(dsl_dataset_t *ds, void *arg) +{ + objset_impl_t *osi = arg; + int err, i; + + for (i = 0; i < TXG_SIZE; i++) { + 
ASSERT(list_head(&osi->os_dirty_dnodes[i]) == NULL); + ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL); + } + + if (ds) { + err = dsl_prop_unregister(ds, "checksum", + checksum_changed_cb, osi); + ASSERT(err == 0); + + err = dsl_prop_unregister(ds, "compression", + compression_changed_cb, osi); + ASSERT(err == 0); + } + + ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode); + ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode); + ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL); + + dnode_special_close(osi->os_meta_dnode); + zil_free(osi->os_zil); + + zio_buf_free(osi->os_phys, sizeof (objset_phys_t)); + kmem_free(osi, sizeof (objset_impl_t)); +} + +/* called from dsl for meta-objset */ +objset_impl_t * +dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, dmu_objset_type_t type, + dmu_tx_t *tx) +{ + objset_impl_t *osi; + dnode_t *mdn; + + ASSERT(dmu_tx_is_syncing(tx)); + osi = dmu_objset_open_impl(spa, ds, NULL); + mdn = osi->os_meta_dnode; + + dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, + DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx); + + /* + * We don't want to have to increase the meta-dnode's nlevels + * later, because then we could do it in quescing context while + * we are also accessing it in open context. + * + * This precaution is not necessary for the MOS (ds == NULL), + * because the MOS is only updated in syncing context. + * This is most fortunate: the MOS is the only objset that + * needs to be synced multiple times as spa_sync() iterates + * to convergence, so minimizing its dn_nlevels matters. 
+ */ + if (ds != NULL) + mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] = + mdn->dn_nlevels = DN_META_DNODE_LEVELS; + + ASSERT(type != DMU_OST_NONE); + ASSERT(type != DMU_OST_ANY); + ASSERT(type < DMU_OST_NUMTYPES); + osi->os_phys->os_type = type; + + dsl_dataset_dirty(ds, tx); + + return (osi); +} + +struct oscarg { + void (*userfunc)(objset_t *os, void *arg, dmu_tx_t *tx); + void *userarg; + dsl_dataset_t *clone_parent; + const char *fullname; + const char *lastname; + dmu_objset_type_t type; +}; + +static int +dmu_objset_create_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) +{ + struct oscarg *oa = arg; + dsl_dataset_t *ds; + int err; + blkptr_t bp; + + ASSERT(dmu_tx_is_syncing(tx)); + + err = dsl_dataset_create_sync(dd, oa->fullname, oa->lastname, + oa->clone_parent, tx); + dprintf_dd(dd, "fn=%s ln=%s err=%d\n", + oa->fullname, oa->lastname, err); + if (err) + return (err); + + err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, oa->fullname, + DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds); + ASSERT3U(err, ==, 0); + dsl_dataset_get_blkptr(ds, &bp); + if (BP_IS_HOLE(&bp)) { + objset_impl_t *osi; + + /* This is an empty dmu_objset; not a clone. */ + osi = dmu_objset_create_impl(dsl_dataset_get_spa(ds), + ds, oa->type, tx); + + if (oa->userfunc) + oa->userfunc(&osi->os, oa->userarg, tx); + } + dsl_dataset_close(ds, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG); + + return (0); +} + +int +dmu_objset_create(const char *name, dmu_objset_type_t type, + objset_t *clone_parent, + void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg) +{ + dsl_dir_t *pds; + const char *tail; + int err = 0; + + pds = dsl_dir_open(name, FTAG, &tail); + if (pds == NULL) + return (ENOENT); + if (tail == NULL) { + dsl_dir_close(pds, FTAG); + return (EEXIST); + } + + dprintf("name=%s\n", name); + + if (tail[0] == '@') { + /* + * If we're creating a snapshot, make sure everything + * they might want is on disk. XXX Sketchy to know + * about snapshots here, better to put in DSL. 
+ */ + objset_t *os; + size_t plen = strchr(name, '@') - name + 1; + char *pbuf = kmem_alloc(plen, KM_SLEEP); + bcopy(name, pbuf, plen - 1); + pbuf[plen - 1] = '\0'; + + err = dmu_objset_open(pbuf, DMU_OST_ANY, DS_MODE_STANDARD, &os); + if (err == 0) { + err = zil_suspend(dmu_objset_zil(os)); + if (err == 0) { + err = dsl_dir_sync_task(pds, + dsl_dataset_snapshot_sync, + (void*)(tail+1), 16*1024); + zil_resume(dmu_objset_zil(os)); + } + dmu_objset_close(os); + } + kmem_free(pbuf, plen); + } else { + struct oscarg oa = { 0 }; + oa.userfunc = func; + oa.userarg = arg; + oa.fullname = name; + oa.lastname = tail; + oa.type = type; + if (clone_parent != NULL) { + /* + * You can't clone to a different type. + */ + if (clone_parent->os->os_phys->os_type != type) { + dsl_dir_close(pds, FTAG); + return (EINVAL); + } + oa.clone_parent = clone_parent->os->os_dsl_dataset; + } + err = dsl_dir_sync_task(pds, dmu_objset_create_sync, &oa, + 256*1024); + } + dsl_dir_close(pds, FTAG); + return (err); +} + +int +dmu_objset_destroy(const char *name) +{ + objset_t *os; + int error; + + /* + * If it looks like we'll be able to destroy it, and there's + * an unplayed replay log sitting around, destroy the log. + * It would be nicer to do this in dsl_dataset_destroy_sync(), + * but the replay log objset is modified in open context. + */ + error = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os); + if (error == 0) { + zil_destroy(dmu_objset_zil(os)); + dmu_objset_close(os); + } + + /* XXX uncache everything? */ + return (dsl_dataset_destroy(name)); +} + +int +dmu_objset_rollback(const char *name) +{ + int err; + objset_t *os; + + err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os); + if (err == 0) { + err = zil_suspend(dmu_objset_zil(os)); + if (err == 0) + zil_resume(dmu_objset_zil(os)); + dmu_objset_close(os); + if (err == 0) { + /* XXX uncache everything? 
*/ + err = dsl_dataset_rollback(name); + } + } + return (err); +} + +static void +dmu_objset_sync_dnodes(objset_impl_t *os, list_t *list, dmu_tx_t *tx) +{ + dnode_t *dn = list_head(list); + int level, err; + + for (level = 0; dn = list_head(list); level++) { + zio_t *zio; + zio = zio_root(os->os_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + + ASSERT3U(level, <=, DN_MAX_LEVELS); + + while (dn) { + dnode_t *next = list_next(list, dn); + + list_remove(list, dn); + if (dnode_sync(dn, level, zio, tx) == 0) { + /* + * This dnode requires syncing at higher + * levels; put it back onto the list. + */ + if (next) + list_insert_before(list, next, dn); + else + list_insert_tail(list, dn); + } + dn = next; + } + err = zio_wait(zio); + ASSERT(err == 0); + } +} + +/* ARGSUSED */ +static void +killer(zio_t *zio, arc_buf_t *abuf, void *arg) +{ + objset_impl_t *os = arg; + objset_phys_t *osphys = zio->io_data; + dnode_phys_t *dnp = &osphys->os_meta_dnode; + int i; + + ASSERT3U(zio->io_error, ==, 0); + + /* + * Update rootbp fill count. + */ + os->os_rootbp.blk_fill = 1; /* count the meta-dnode */ + for (i = 0; i < dnp->dn_nblkptr; i++) + os->os_rootbp.blk_fill += dnp->dn_blkptr[i].blk_fill; + + BP_SET_TYPE(zio->io_bp, DMU_OT_OBJSET); + BP_SET_LEVEL(zio->io_bp, 0); + + if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), + BP_IDENTITY(&zio->io_bp_orig))) { + dsl_dataset_block_kill(os->os_dsl_dataset, &zio->io_bp_orig, + os->os_synctx); + dsl_dataset_block_born(os->os_dsl_dataset, zio->io_bp, + os->os_synctx); + } +} + + +/* called from dsl */ +void +dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx) +{ + extern taskq_t *dbuf_tq; + int txgoff; + list_t *dirty_list; + int err; + arc_buf_t *abuf = + arc_buf_alloc(os->os_spa, sizeof (objset_phys_t), FTAG); + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(os->os_synctx == NULL); + /* XXX the write_done callback should really give us the tx... 
*/ + os->os_synctx = tx; + + dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); + + txgoff = tx->tx_txg & TXG_MASK; + + dmu_objset_sync_dnodes(os, &os->os_free_dnodes[txgoff], tx); + dmu_objset_sync_dnodes(os, &os->os_dirty_dnodes[txgoff], tx); + + /* + * Free intent log blocks up to this tx. + */ + zil_sync(os->os_zil, tx); + + /* + * Sync meta-dnode + */ + dirty_list = &os->os_dirty_dnodes[txgoff]; + ASSERT(list_head(dirty_list) == NULL); + list_insert_tail(dirty_list, os->os_meta_dnode); + dmu_objset_sync_dnodes(os, dirty_list, tx); + + /* + * Sync the root block. + */ + bcopy(os->os_phys, abuf->b_data, sizeof (objset_phys_t)); + err = arc_write(NULL, os->os_spa, os->os_md_checksum, + os->os_md_compress, tx->tx_txg, &os->os_rootbp, abuf, killer, os, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT); + ASSERT(err == 0); + arc_buf_free(abuf, FTAG); + + dsl_dataset_set_blkptr(os->os_dsl_dataset, &os->os_rootbp, tx); + + ASSERT3P(os->os_synctx, ==, tx); + taskq_wait(dbuf_tq); + os->os_synctx = NULL; +} + +void +dmu_objset_stats(objset_t *os, dmu_objset_stats_t *dds) +{ + if (os->os->os_dsl_dataset != NULL) { + dsl_dataset_stats(os->os->os_dsl_dataset, dds); + } else { + ASSERT(os->os->os_phys->os_type == DMU_OST_META); + bzero(dds, sizeof (*dds)); + } + dds->dds_type = os->os->os_phys->os_type; +} + +int +dmu_objset_is_snapshot(objset_t *os) +{ + if (os->os->os_dsl_dataset != NULL) + return (dsl_dataset_is_snapshot(os->os->os_dsl_dataset)); + else + return (B_FALSE); +} + +int +dmu_snapshot_list_next(objset_t *os, int namelen, char *name, + uint64_t *id, uint64_t *offp) +{ + dsl_dataset_t *ds = os->os->os_dsl_dataset; + zap_cursor_t cursor; + zap_attribute_t attr; + + if (ds->ds_phys->ds_snapnames_zapobj == 0) + return (ENOENT); + + zap_cursor_init_serialized(&cursor, + ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_snapnames_zapobj, *offp); + + if (zap_cursor_retrieve(&cursor, &attr) != 0) + return (ENOENT); + + if (strlen(attr.za_name) + 1 
> namelen) + return (ENAMETOOLONG); + + (void) strcpy(name, attr.za_name); + *id = attr.za_first_integer; + zap_cursor_advance(&cursor); + *offp = zap_cursor_serialize(&cursor); + + return (0); +} + +/* + * Find all objsets under name, and for each, call 'func(child_name, arg)'. + */ +void +dmu_objset_find(char *name, void func(char *, void *), void *arg, int flags) +{ + dsl_dir_t *dd; + objset_t *os; + uint64_t snapobj; + zap_cursor_t zc; + zap_attribute_t attr; + char *child; + int do_self; + + dd = dsl_dir_open(name, FTAG, NULL); + if (dd == NULL) + return; + + do_self = (dd->dd_phys->dd_head_dataset_obj != 0); + + /* + * Iterate over all children. + */ + if (dd->dd_phys->dd_child_dir_zapobj != 0) { + for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset, + dd->dd_phys->dd_child_dir_zapobj); + zap_cursor_retrieve(&zc, &attr) == 0; + (void) zap_cursor_advance(&zc)) { + ASSERT(attr.za_integer_length == sizeof (uint64_t)); + ASSERT(attr.za_num_integers == 1); + + /* + * No separating '/' because parent's name ends in /. + */ + child = kmem_alloc(MAXPATHLEN, KM_SLEEP); + /* XXX could probably just use name here */ + dsl_dir_name(dd, child); + (void) strcat(child, "/"); + (void) strcat(child, attr.za_name); + dmu_objset_find(child, func, arg, flags); + kmem_free(child, MAXPATHLEN); + } + } + + /* + * Iterate over all snapshots. 
+ */ + if ((flags & DS_FIND_SNAPSHOTS) && + dmu_objset_open(name, DMU_OST_ANY, + DS_MODE_STANDARD | DS_MODE_READONLY, &os) == 0) { + + snapobj = os->os->os_dsl_dataset->ds_phys->ds_snapnames_zapobj; + dmu_objset_close(os); + + for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset, snapobj); + zap_cursor_retrieve(&zc, &attr) == 0; + (void) zap_cursor_advance(&zc)) { + ASSERT(attr.za_integer_length == sizeof (uint64_t)); + ASSERT(attr.za_num_integers == 1); + + child = kmem_alloc(MAXPATHLEN, KM_SLEEP); + /* XXX could probably just use name here */ + dsl_dir_name(dd, child); + (void) strcat(child, "@"); + (void) strcat(child, attr.za_name); + func(child, arg); + kmem_free(child, MAXPATHLEN); + } + } + + dsl_dir_close(dd, FTAG); + + /* + * Apply to self if appropriate. + */ + if (do_self) + func(name, arg); +} diff --git a/usr/src/uts/common/fs/zfs/dmu_traverse.c b/usr/src/uts/common/fs/zfs/dmu_traverse.c new file mode 100644 index 0000000000..036e3965cf --- /dev/null +++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c @@ -0,0 +1,792 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 
+ * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_traverse.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_pool.h> +#include <sys/dnode.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu_impl.h> + +#define BP_SPAN_SHIFT(level, width) ((level) * (width)) + +#define BP_EQUAL(b1, b2) \ + (DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) && \ + (b1)->blk_birth == (b2)->blk_birth) + +/* + * Compare two bookmarks. + * + * For ADVANCE_PRE, the visitation order is: + * + * objset 0, 1, 2, ..., ZB_MAXOBJSET. + * object 0, 1, 2, ..., ZB_MAXOBJECT. + * blkoff 0, 1, 2, ... + * level ZB_MAXLEVEL, ..., 2, 1, 0. + * + * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid + * ordering vector is: + * + * < objset, object, blkoff, -level > + * + * For ADVANCE_POST, the starting offsets aren't sequential but ending + * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are. + * The visitation order is: + * + * objset 1, 2, ..., ZB_MAXOBJSET, 0. + * object 1, 2, ..., ZB_MAXOBJECT, 0. + * blkoff 1, 2, ... + * level 0, 1, 2, ..., ZB_MAXLEVEL. + * + * and thus a valid ordering vector is: + * + * < objset - 1, object - 1, blkoff, level > + * + * Both orderings can be expressed as: + * + * < objset + bias, object + bias, blkoff, level ^ bias > + * + * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST) + * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift). + * + * Special case: an objset's osphys is represented as level -1 of object 0. + * It is always either the very first or very last block we visit in an objset. + * Therefore, if either bookmark's level is -1, level alone determines order. + */ +static int +compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp, + int advance) +{ + int bias = (advance & ADVANCE_PRE) ? 
0 : -1; + uint64_t sblkoff, eblkoff; + int slevel, elevel, wshift; + + if (szb->zb_objset + bias < ezb->zb_objset + bias) + return (-1); + + if (szb->zb_objset + bias > ezb->zb_objset + bias) + return (1); + + slevel = szb->zb_level; + elevel = ezb->zb_level; + + if ((slevel | elevel) < 0) + return ((slevel ^ bias) - (elevel ^ bias)); + + if (szb->zb_object + bias < ezb->zb_object + bias) + return (-1); + + if (szb->zb_object + bias > ezb->zb_object + bias) + return (1); + + if (dnp == NULL) + return (0); + + wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; + + sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift); + eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift); + + if (sblkoff < eblkoff) + return (-1); + + if (sblkoff > eblkoff) + return (1); + + return ((elevel ^ bias) - (slevel ^ bias)); +} + +#define SET_BOOKMARK(zb, objset, object, level, blkid) \ +{ \ + (zb)->zb_objset = objset; \ + (zb)->zb_object = object; \ + (zb)->zb_level = level; \ + (zb)->zb_blkid = blkid; \ +} + +#define SET_BOOKMARK_LB(zb, level, blkid) \ +{ \ + (zb)->zb_level = level; \ + (zb)->zb_blkid = blkid; \ +} + +static int +advance_objset(zseg_t *zseg, uint64_t objset, int advance) +{ + zbookmark_t *zb = &zseg->seg_start; + + if (advance & ADVANCE_PRE) { + if (objset >= ZB_MAXOBJSET) + return (ERANGE); + SET_BOOKMARK(zb, objset, 0, -1, 0); + } else { + if (objset >= ZB_MAXOBJSET) + objset = 0; + SET_BOOKMARK(zb, objset, 1, 0, 0); + } + + if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) + return (ERANGE); + + return (EAGAIN); +} + +static int +advance_object(zseg_t *zseg, uint64_t object, int advance) +{ + zbookmark_t *zb = &zseg->seg_start; + + if (advance & ADVANCE_PRE) { + if (object >= ZB_MAXOBJECT) { + SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0); + } else { + SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0); + } + } else { + if (zb->zb_object == 0) { + SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0); + } else { + if (object >= ZB_MAXOBJECT) 
+ object = 0; + SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0); + } + } + + if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) + return (ERANGE); + + return (EAGAIN); +} + +static int +advance_from_osphys(zseg_t *zseg, int advance) +{ + zbookmark_t *zb = &zseg->seg_start; + + ASSERT(zb->zb_object == 0); + ASSERT(zb->zb_level == -1); + ASSERT(zb->zb_blkid == 0); + + if (advance & ADVANCE_PRE) { + SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0); + } else { + if (zb->zb_objset == 0) + return (ERANGE); + SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0); + } + + if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) + return (ERANGE); + + return (EAGAIN); +} + +static int +advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance) +{ + zbookmark_t *zb = &zseg->seg_start; + int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; + int maxlevel = dnp->dn_nlevels - 1; + int level = zb->zb_level; + uint64_t blkid = zb->zb_blkid; + + if (advance & ADVANCE_PRE) { + if (level > 0 && rc == 0) { + level--; + blkid <<= wshift; + } else { + blkid++; + + if ((blkid << BP_SPAN_SHIFT(level, wshift)) > + dnp->dn_maxblkid) + return (ERANGE); + + while (level < maxlevel) { + if (P2PHASE(blkid, 1ULL << wshift)) + break; + blkid >>= wshift; + level++; + } + } + } else { + if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) { + blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift); + level = 0; + } else { + blkid >>= wshift; + level++; + } + + while ((blkid << BP_SPAN_SHIFT(level, wshift)) > + dnp->dn_maxblkid) { + if (level == maxlevel) + return (ERANGE); + blkid >>= wshift; + level++; + } + } + SET_BOOKMARK_LB(zb, level, blkid); + + if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0) + return (ERANGE); + + return (EAGAIN); +} + +static int +traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc) +{ + /* + * Before we issue the callback, prune against maxtxg. + * + * We prune against mintxg before we get here because it's a big win. 
+ * If a given block was born in txg 37, then we know that the entire + * subtree below that block must have been born in txg 37 or earlier. + * We can therefore lop off huge branches of the tree as we go. + * + * There's no corresponding optimization for maxtxg because knowing + * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's + * children. In fact, the copy-on-write design of ZFS ensures that + * top-level blocks will pretty much always be new. + * + * Therefore, in the name of simplicity we don't prune against + * maxtxg until the last possible moment -- that being right now. + */ + if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg) + return (0); + + if (bc->bc_errno == 0) { + zbookmark_t *zb = &bc->bc_bookmark; + zbookmark_t *szb = &zseg->seg_start; + zbookmark_t *ezb = &zseg->seg_end; + zbookmark_t *lzb = &th->th_lastcb; + dnode_phys_t *dnp = bc->bc_dnode; + + /* + * Debugging: verify that the order we visit things + * agrees with the order defined by compare_bookmark(). 
+ */ + ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0); + ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0); + ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 || + lzb->zb_level == ZB_NO_LEVEL); + *lzb = *zb; + } + + th->th_callbacks++; + return (th->th_func(bc, th->th_spa, th->th_arg)); +} + +static int +traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp, + dnode_phys_t *dnp) +{ + zbookmark_t *zb = &bc->bc_bookmark; + int error; + + th->th_hits++; + + bc->bc_dnode = dnp; + bc->bc_errno = 0; + + if (BP_EQUAL(&bc->bc_blkptr, bp)) + return (0); + + bc->bc_blkptr = *bp; + + if (bc->bc_data == NULL) + return (0); + + if (BP_IS_HOLE(bp)) { + ASSERT(th->th_advance & ADVANCE_HOLES); + return (0); + } + + if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) { + error = EIO; + } else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) { + error = 0; + th->th_arc_hits++; + } else { + error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data, + BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, + th->th_zio_flags | ZIO_FLAG_DONT_CACHE)); + + if (BP_SHOULD_BYTESWAP(bp) && error == 0) + (zb->zb_level > 0 ? 
byteswap_uint64_array : + dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data, + BP_GET_LSIZE(bp)); + th->th_reads++; + } + + if (error) { + bc->bc_errno = error; + error = traverse_callback(th, NULL, bc); + ASSERT(error == EAGAIN || error == EINTR || error == ERESTART); + bc->bc_blkptr.blk_birth = -1ULL; + } + + dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n", + bc - &th->th_cache[0][0], error, + zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid); + + return (error); +} + +static int +find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth) +{ + zbookmark_t *zb = &zseg->seg_start; + traverse_blk_cache_t *bc; + blkptr_t *bp = dnp->dn_blkptr; + int i, first, level; + int nbp = dnp->dn_nblkptr; + int minlevel = zb->zb_level; + int maxlevel = dnp->dn_nlevels - 1; + int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; + int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift); + uint64_t blkid = zb->zb_blkid >> bp_shift; + int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE; + int rc; + + if (minlevel > maxlevel || blkid >= nbp) + return (ERANGE); + + for (level = maxlevel; level >= minlevel; level--) { + first = P2PHASE(blkid, 1ULL << wshift); + + for (i = first; i < nbp; i++) + if (bp[i].blk_birth > zseg->seg_mintxg || + BP_IS_HOLE(&bp[i]) && do_holes) + break; + + if (i != first) { + i--; + SET_BOOKMARK_LB(zb, level, blkid + (i - first)); + return (ENOTBLK); + } + + bc = &th->th_cache[depth][level]; + + SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object, + level, blkid); + + if (rc = traverse_read(th, bc, bp + i, dnp)) { + if (rc != EAGAIN) { + SET_BOOKMARK_LB(zb, level, blkid); + } + return (rc); + } + + if (BP_IS_HOLE(&bp[i])) { + SET_BOOKMARK_LB(zb, level, blkid); + th->th_lastcb.zb_level = ZB_NO_LEVEL; + return (0); + } + + nbp = 1 << wshift; + bp = bc->bc_data; + bp_shift -= wshift; + blkid = zb->zb_blkid >> bp_shift; + } + + return (0); +} + +static int +get_dnode(traverse_handle_t *th, uint64_t 
objset, dnode_phys_t *mdn, + uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth) +{ + zseg_t zseg; + zbookmark_t *zb = &zseg.seg_start; + uint64_t object = *objectp; + int i, rc; + + SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK); + SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID); + + zseg.seg_mintxg = txg; + zseg.seg_maxtxg = -1ULL; + + for (;;) { + rc = find_block(th, &zseg, mdn, depth); + + if (rc == EAGAIN || rc == EINTR || rc == ERANGE) + break; + + if (rc == 0 && zb->zb_level == 0) { + dnode_phys_t *dnp = th->th_cache[depth][0].bc_data; + for (i = 0; i < DNODES_PER_BLOCK; i++) { + object = (zb->zb_blkid * DNODES_PER_BLOCK) + i; + if (object >= *objectp && + dnp[i].dn_type != DMU_OT_NONE && + (type == -1 || dnp[i].dn_type == type)) { + *objectp = object; + *dnpp = &dnp[i]; + return (0); + } + } + } + + rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE); + + if (rc == ERANGE) + break; + } + + if (rc == ERANGE) + *objectp = ZB_MAXOBJECT; + + return (rc); +} + +static int +traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp) +{ + zbookmark_t *zb = &zseg->seg_start; + traverse_blk_cache_t *bc; + dnode_phys_t *dn, *dn_tmp; + int worklimit = 1000; + int rc; + + dprintf("<%llu, %llu, %d, %llx>\n", + zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid); + + bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1]; + dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode; + + SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0); + + rc = traverse_read(th, bc, mosbp, dn); + + if (rc) /* If we get ERESTART, we've got nowhere left to go */ + return (rc == ERESTART ? 
EINTR : rc); + + ASSERT(dn->dn_nlevels < ZB_MAXLEVEL); + + if (zb->zb_objset != 0) { + uint64_t objset = zb->zb_objset; + dsl_dataset_phys_t *dsp; + + rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0, + DMU_OT_DSL_OBJSET, ZB_MOS_CACHE); + + if (objset != zb->zb_objset) + rc = advance_objset(zseg, objset, th->th_advance); + + if (rc != 0) + return (rc); + + dsp = DN_BONUS(dn_tmp); + + bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]; + dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode; + + SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0); + + rc = traverse_read(th, bc, &dsp->ds_bp, dn); + + if (rc != 0) { + if (rc == ERESTART) + rc = advance_objset(zseg, zb->zb_objset + 1, + th->th_advance); + return (rc); + } + + if (th->th_advance & ADVANCE_PRUNE) + zseg->seg_mintxg = + MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg); + } + + if (zb->zb_level == -1) { + ASSERT(zb->zb_object == 0); + + if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) { + rc = traverse_callback(th, zseg, bc); + if (rc) { + ASSERT(rc == EINTR); + return (rc); + } + } + + return (advance_from_osphys(zseg, th->th_advance)); + } + + if (zb->zb_object != 0) { + uint64_t object = zb->zb_object; + + rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp, + zseg->seg_mintxg, -1, ZB_MDN_CACHE); + + if (object != zb->zb_object) + rc = advance_object(zseg, object, th->th_advance); + + if (rc != 0) + return (rc); + + dn = dn_tmp; + } + + if (zb->zb_level == ZB_MAXLEVEL) + zb->zb_level = dn->dn_nlevels - 1; + + for (;;) { + rc = find_block(th, zseg, dn, ZB_DN_CACHE); + + if (rc == EAGAIN || rc == EINTR || rc == ERANGE) + break; + + if (rc == 0) { + bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level]; + ASSERT(bc->bc_dnode == dn); + ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth); + rc = traverse_callback(th, zseg, bc); + if (rc) { + ASSERT(rc == EINTR); + return (rc); + } + if (BP_IS_HOLE(&bc->bc_blkptr)) { + ASSERT(th->th_advance & ADVANCE_HOLES); + rc = ENOTBLK; + } + } + + rc = advance_block(zseg, dn, rc, 
th->th_advance); + + if (rc == ERANGE) + break; + + /* + * Give spa_sync() a chance to run. + */ + if (spa_traverse_wanted(th->th_spa)) { + th->th_syncs++; + return (EAGAIN); + } + + if (--worklimit == 0) + return (EAGAIN); + } + + if (rc == ERANGE) + rc = advance_object(zseg, zb->zb_object + 1, th->th_advance); + + return (rc); +} + +/* + * It is the caller's responsibility to ensure that the dsl_dataset_t + * doesn't go away during traversal. + */ +int +traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance, + blkptr_cb_t func, void *arg) +{ + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; + traverse_handle_t *th; + int err; + + th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED); + + traverse_add_objset(th, txg_start, -1ULL, ds->ds_object); + + while ((err = traverse_more(th)) == EAGAIN) + continue; + + traverse_fini(th); + return (err); +} + +int +traverse_more(traverse_handle_t *th) +{ + zseg_t *zseg = list_head(&th->th_seglist); + uint64_t save_txg; /* XXX won't be necessary with real itinerary */ + krwlock_t *rw = spa_traverse_rwlock(th->th_spa); + blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa); + int rc; + + if (zseg == NULL) + return (0); + + th->th_restarts++; + + save_txg = zseg->seg_mintxg; + + if (!(th->th_advance & ADVANCE_NOLOCK)) + rw_enter(rw, RW_READER); + + rc = traverse_segment(th, zseg, mosbp); + ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR); + + if (!(th->th_advance & ADVANCE_NOLOCK)) + rw_exit(rw); + + zseg->seg_mintxg = save_txg; + + if (rc == ERANGE) { + list_remove(&th->th_seglist, zseg); + kmem_free(zseg, sizeof (*zseg)); + return (EAGAIN); + } + + return (rc); +} + +/* + * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves + * are not included. The blocks covered by this segment will all have + * mintxg < birth < maxtxg. 
+ */ +static void +traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, + uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid, + uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid) +{ + zseg_t *zseg; + + zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP); + + zseg->seg_mintxg = mintxg; + zseg->seg_maxtxg = maxtxg; + + zseg->seg_start.zb_objset = sobjset; + zseg->seg_start.zb_object = sobject; + zseg->seg_start.zb_level = slevel; + zseg->seg_start.zb_blkid = sblkid; + + zseg->seg_end.zb_objset = eobjset; + zseg->seg_end.zb_object = eobject; + zseg->seg_end.zb_level = elevel; + zseg->seg_end.zb_blkid = eblkid; + + list_insert_tail(&th->th_seglist, zseg); +} + +void +traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, + uint64_t objset, uint64_t object) +{ + if (th->th_advance & ADVANCE_PRE) + traverse_add_segment(th, mintxg, maxtxg, + objset, object, ZB_MAXLEVEL, 0, + objset, object, 0, ZB_MAXBLKID); + else + traverse_add_segment(th, mintxg, maxtxg, + objset, object, 0, 0, + objset, object, 0, ZB_MAXBLKID); +} + +void +traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, + uint64_t objset) +{ + if (th->th_advance & ADVANCE_PRE) + traverse_add_segment(th, mintxg, maxtxg, + objset, 0, -1, 0, + objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID); + else + traverse_add_segment(th, mintxg, maxtxg, + objset, 1, 0, 0, + objset, 0, -1, 0); +} + +void +traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg) +{ + if (th->th_advance & ADVANCE_PRE) + traverse_add_segment(th, mintxg, maxtxg, + 0, 0, -1, 0, + ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID); + else + traverse_add_segment(th, mintxg, maxtxg, + 1, 1, 0, 0, + 0, 0, -1, 0); +} + +traverse_handle_t * +traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance, + int zio_flags) +{ + traverse_handle_t *th; + int d, l; + + th = kmem_zalloc(sizeof (*th), KM_SLEEP); + + th->th_spa = spa; + th->th_func = func; + th->th_arg = 
arg; + th->th_advance = advance; + th->th_lastcb.zb_level = ZB_NO_LEVEL; + th->th_noread.zb_level = ZB_NO_LEVEL; + th->th_zio_flags = zio_flags; + + list_create(&th->th_seglist, sizeof (zseg_t), + offsetof(zseg_t, seg_node)); + + for (d = 0; d < ZB_DEPTH; d++) { + for (l = 0; l < ZB_MAXLEVEL; l++) { + if ((advance & ADVANCE_DATA) || + l != 0 || d != ZB_DN_CACHE) + th->th_cache[d][l].bc_data = + zio_buf_alloc(SPA_MAXBLOCKSIZE); + } + } + + return (th); +} + +void +traverse_fini(traverse_handle_t *th) +{ + int d, l; + zseg_t *zseg; + + for (d = 0; d < ZB_DEPTH; d++) + for (l = 0; l < ZB_MAXLEVEL; l++) + if (th->th_cache[d][l].bc_data != NULL) + zio_buf_free(th->th_cache[d][l].bc_data, + SPA_MAXBLOCKSIZE); + + while ((zseg = list_head(&th->th_seglist)) != NULL) { + list_remove(&th->th_seglist, zseg); + kmem_free(zseg, sizeof (*zseg)); + } + + list_destroy(&th->th_seglist); + + dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n", + th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks, + th->th_syncs, th->th_restarts); + + kmem_free(th, sizeof (*th)); +} diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c new file mode 100644 index 0000000000..5dd827e946 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c @@ -0,0 +1,801 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dbuf.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */ +#include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */ +#include <sys/dsl_pool.h> +#include <sys/zap_impl.h> /* for ZAP_BLOCK_SHIFT */ +#include <sys/spa.h> +#include <sys/zfs_context.h> + +#ifdef ZFS_DEBUG +int dmu_use_tx_debug_bufs = 1; +#endif + +dmu_tx_t * +dmu_tx_create_ds(dsl_dir_t *dd) +{ + dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); + tx->tx_dir = dd; + if (dd) + tx->tx_pool = dd->dd_pool; + list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), + offsetof(dmu_tx_hold_t, dth_node)); + refcount_create(&tx->tx_space_written); + refcount_create(&tx->tx_space_freed); + return (tx); +} + +dmu_tx_t * +dmu_tx_create(objset_t *os) +{ + dmu_tx_t *tx = dmu_tx_create_ds(os->os->os_dsl_dataset->ds_dir); + tx->tx_objset = os; + return (tx); +} + +dmu_tx_t * +dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) +{ + dmu_tx_t *tx = dmu_tx_create_ds(NULL); + + ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg); + tx->tx_pool = dp; + tx->tx_txg = txg; + tx->tx_anyobj = TRUE; + + return (tx); +} + +int +dmu_tx_is_syncing(dmu_tx_t *tx) +{ + return (tx->tx_anyobj); +} + +int +dmu_tx_private_ok(dmu_tx_t *tx) +{ + return (tx->tx_anyobj || tx->tx_privateobj); +} + +static void +dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, + enum dmu_tx_hold_type type, dmu_tx_hold_func_t func, + uint64_t arg1, uint64_t arg2) +{ + dmu_tx_hold_t *dth; + dnode_t *dn = NULL; + + if (object != 
DMU_NEW_OBJECT) { + dn = dnode_hold(os->os, object, tx); + + if (tx->tx_txg != 0) { + mutex_enter(&dn->dn_mtx); + /* + * dn->dn_assigned_txg == tx->tx_txg doesn't pose a + * problem, but there's no way for it to happen (for + * now, at least). + */ + ASSERT(dn->dn_assigned_txg == 0); + ASSERT(dn->dn_assigned_tx == NULL); + dn->dn_assigned_txg = tx->tx_txg; + dn->dn_assigned_tx = tx; + (void) refcount_add(&dn->dn_tx_holds, tx); + mutex_exit(&dn->dn_mtx); + } + } + + dth = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); + dth->dth_dnode = dn; + dth->dth_type = type; + dth->dth_func = func; + dth->dth_arg1 = arg1; + dth->dth_arg2 = arg2; + /* + * XXX Investigate using a different data structure to keep + * track of dnodes in a tx. Maybe array, since there will + * generally not be many entries? + */ + list_insert_tail(&tx->tx_holds, dth); +} + +void +dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object) +{ + /* + * If we're syncing, they can manipulate any object anyhow, and + * the hold on the dnode_t can cause problems. + */ + if (!dmu_tx_is_syncing(tx)) { + dmu_tx_hold_object_impl(tx, os, object, THT_NEWOBJECT, + NULL, 0, 0); + } +} + +/* ARGSUSED */ +static void +dmu_tx_count_write(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) +{ + uint64_t start, end, space; + int min_bs, max_bs, min_ibs, max_ibs, epbs, bits; + + if (len == 0) + return; + + min_bs = SPA_MINBLOCKSHIFT; + max_bs = SPA_MAXBLOCKSHIFT; + min_ibs = DN_MIN_INDBLKSHIFT; + max_ibs = DN_MAX_INDBLKSHIFT; + + /* + * If there's more than one block, the blocksize can't change, + * so we can make a more precise estimate. Alternatively, + * if the dnode's ibs is larger than max_ibs, always use that. + * This ensures that if we reduce DN_MAX_INDBLKSHIFT, + * the code will still work correctly on existing pools. 
+ */ + if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) { + min_ibs = max_ibs = dn->dn_indblkshift; + if (dn->dn_datablkshift != 0) + min_bs = max_bs = dn->dn_datablkshift; + } + + /* + * 'end' is the last thing we will access, not one past. + * This way we won't overflow when accessing the last byte. + */ + start = P2ALIGN(off, 1ULL << max_bs); + end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1; + space = end - start + 1; + + start >>= min_bs; + end >>= min_bs; + + epbs = min_ibs - SPA_BLKPTRSHIFT; + + /* + * The object contains at most 2^(64 - min_bs) blocks, + * and each indirect level maps 2^epbs. + */ + for (bits = 64 - min_bs; bits >= 0; bits -= epbs) { + start >>= epbs; + end >>= epbs; + /* + * If we increase the number of levels of indirection, + * we'll need new blkid=0 indirect blocks. If start == 0, + * we're already accounting for that blocks; and if end == 0, + * we can't increase the number of levels beyond that. + */ + if (start != 0 && end != 0) + space += 1ULL << max_ibs; + space += (end - start + 1) << max_ibs; + } + + ASSERT(space < 2 * DMU_MAX_ACCESS); + + tx->tx_space_towrite += space; +} + +static void +dmu_tx_count_dnode(dmu_tx_t *tx, dnode_t *dn) +{ + dnode_t *mdn = tx->tx_objset->os->os_meta_dnode; + uint64_t object = dn ? 
dn->dn_object : DN_MAX_OBJECT - 1; + uint64_t pre_write_space; + + ASSERT(object < DN_MAX_OBJECT); + pre_write_space = tx->tx_space_towrite; + dmu_tx_count_write(tx, mdn, object << DNODE_SHIFT, 1 << DNODE_SHIFT); + if (dn && dn->dn_dbuf->db_blkptr && + dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, + dn->dn_dbuf->db_blkptr->blk_birth, tx)) { + tx->tx_space_tooverwrite += + tx->tx_space_towrite - pre_write_space; + tx->tx_space_towrite = pre_write_space; + } +} + +/* ARGSUSED */ +static void +dmu_tx_hold_write_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) +{ + dmu_tx_count_write(tx, dn, off, len); + dmu_tx_count_dnode(tx, dn); +} + +void +dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) +{ + ASSERT(tx->tx_txg == 0); + ASSERT(len > 0 && len < DMU_MAX_ACCESS); + ASSERT(UINT64_MAX - off >= len - 1); + + dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_WRITE, + dmu_tx_hold_write_impl, off, len); +} + +static void +dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) +{ + uint64_t blkid, nblks; + uint64_t space = 0; + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; + + ASSERT(dn->dn_assigned_tx == tx || dn->dn_assigned_tx == NULL); + + if (dn->dn_datablkshift == 0) + return; + /* + * not that the dnode can change, since it isn't dirty, but + * dbuf_hold_impl() wants us to have the struct_rwlock. + * also need it to protect dn_maxblkid. 
+ */ + rw_enter(&dn->dn_struct_rwlock, RW_READER); + blkid = off >> dn->dn_datablkshift; + nblks = (off + len) >> dn->dn_datablkshift; + + if (blkid >= dn->dn_maxblkid) + goto out; + if (blkid + nblks > dn->dn_maxblkid) + nblks = dn->dn_maxblkid - blkid; + + /* don't bother after the 100,000 blocks */ + nblks = MIN(nblks, 128*1024); + + if (dn->dn_phys->dn_nlevels == 1) { + int i; + for (i = 0; i < nblks; i++) { + blkptr_t *bp = dn->dn_phys->dn_blkptr; + ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr); + bp += blkid + i; + if (dsl_dataset_block_freeable(ds, bp->blk_birth, tx)) { + dprintf_bp(bp, "can free old%s", ""); + space += BP_GET_ASIZE(bp); + } + } + goto out; + } + + while (nblks) { + dmu_buf_impl_t *dbuf; + int err, epbs, blkoff, tochk; + + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + blkoff = P2PHASE(blkid, 1<<epbs); + tochk = MIN((1<<epbs) - blkoff, nblks); + + err = dbuf_hold_impl(dn, 1, blkid >> epbs, TRUE, FTAG, &dbuf); + if (err == 0) { + int i; + blkptr_t *bp; + + dbuf_read_havestruct(dbuf); + + bp = dbuf->db.db_data; + bp += blkoff; + + for (i = 0; i < tochk; i++) { + if (dsl_dataset_block_freeable(ds, + bp[i].blk_birth, tx)) { + dprintf_bp(&bp[i], + "can free old%s", ""); + space += BP_GET_ASIZE(&bp[i]); + } + } + dbuf_remove_ref(dbuf, FTAG); + } else { + /* the indirect block is sparse */ + ASSERT(err == ENOENT); + } + + blkid += tochk; + nblks -= tochk; + } +out: + rw_exit(&dn->dn_struct_rwlock); + + tx->tx_space_tofree += space; +} + +static void +dmu_tx_hold_free_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) +{ + int dirty; + + /* first block */ + if (off != 0 /* || dn->dn_maxblkid == 0 */) + dmu_tx_count_write(tx, dn, off, 1); + /* last block */ + if (len != DMU_OBJECT_END) + dmu_tx_count_write(tx, dn, off+len, 1); + + dmu_tx_count_dnode(tx, dn); + + if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) + return; + if (len == DMU_OBJECT_END) + len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off; + + /* XXX locking */ + dirty = 
dn->dn_dirtyblksz[0] | dn->dn_dirtyblksz[1] | + dn->dn_dirtyblksz[2] | dn->dn_dirtyblksz[3]; + if (dn->dn_assigned_tx != NULL && !dirty) + dmu_tx_count_free(tx, dn, off, len); +} + +void +dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) +{ + ASSERT(tx->tx_txg == 0); + + dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_FREE, + dmu_tx_hold_free_impl, off, len); +} + +/* ARGSUSED */ +static void +dmu_tx_hold_zap_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t nops, uint64_t cops) +{ + uint64_t nblocks; + int epbs; + + dmu_tx_count_dnode(tx, dn); + + if (dn == NULL) { + /* + * Assuming that nops+cops is not super huge, we will be + * able to fit a new object's entries into one leaf + * block. So there will be at most 2 blocks total, + * including the header block. + */ + dmu_tx_count_write(tx, dn, 0, 2 << ZAP_BLOCK_SHIFT); + return; + } + + ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap); + + if (dn->dn_maxblkid == 0 && nops == 0) { + /* + * If there is only one block (i.e. this is a micro-zap) + * and we are only doing updates, the accounting is simple. + */ + if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, + dn->dn_phys->dn_blkptr[0].blk_birth, tx)) + tx->tx_space_tooverwrite += dn->dn_datablksz; + else + tx->tx_space_towrite += dn->dn_datablksz; + return; + } + + /* + * 3 blocks overwritten per op: target leaf, ptrtbl block, header block + * 3 new blocks written per op: new split leaf, 2 grown ptrtbl blocks + */ + dmu_tx_count_write(tx, dn, dn->dn_maxblkid * dn->dn_datablksz, + (nops * 6ULL + cops * 3ULL) << ZAP_BLOCK_SHIFT); + + /* + * If the modified blocks are scattered to the four winds, + * we'll have to modify an indirect twig for each. 
+ */ + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs) + tx->tx_space_towrite += + ((nops + cops) * 3ULL) << dn->dn_indblkshift; +} + +void +dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int ops) +{ + ASSERT(tx->tx_txg == 0); + + dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_ZAP, + dmu_tx_hold_zap_impl, (ops > 0?ops:0), (ops < 0?-ops:0)); +} + +void +dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) +{ + ASSERT(tx->tx_txg == 0); + + dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_BONUS, + dmu_tx_hold_write_impl, 0, 0); +} + + +/* ARGSUSED */ +static void +dmu_tx_hold_space_impl(dmu_tx_t *tx, dnode_t *dn, + uint64_t space, uint64_t unused) +{ + tx->tx_space_towrite += space; +} + +void +dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) +{ + ASSERT(tx->tx_txg == 0); + + dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, THT_SPACE, + dmu_tx_hold_space_impl, space, 0); +} + +int +dmu_tx_holds(dmu_tx_t *tx, uint64_t object) +{ + dmu_tx_hold_t *dth; + int holds = 0; + + /* + * By asserting that the tx is assigned, we're counting the + * number of dn_tx_holds, which is the same as the number of + * dn_holds. Otherwise, we'd be counting dn_holds, but + * dn_tx_holds could be 0. 
+ */ + ASSERT(tx->tx_txg != 0); + + /* if (tx->tx_anyobj == TRUE) */ + /* return (0); */ + + for (dth = list_head(&tx->tx_holds); dth; + dth = list_next(&tx->tx_holds, dth)) { + if (dth->dth_dnode && dth->dth_dnode->dn_object == object) + holds++; + } + + return (holds); +} + +void +dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) +{ +#ifdef ZFS_DEBUG + dmu_tx_hold_t *dth; + int match_object = FALSE, match_offset = FALSE; + dnode_t *dn = db->db_dnode; + + ASSERT(tx->tx_txg != 0); + ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os); + ASSERT3U(dn->dn_object, ==, db->db.db_object); + + if (tx->tx_anyobj) + return; + + /* XXX No checking on the meta dnode for now */ + if (db->db.db_object & DMU_PRIVATE_OBJECT) + return; + + for (dth = list_head(&tx->tx_holds); dth; + dth = list_next(&tx->tx_holds, dth)) { + ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg); + if (dth->dth_dnode == dn && dth->dth_type != THT_NEWOBJECT) + match_object = TRUE; + if (dth->dth_dnode == NULL || dth->dth_dnode == dn) { + int datablkshift = dn->dn_datablkshift ? + dn->dn_datablkshift : SPA_MAXBLOCKSHIFT; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + int shift = datablkshift + epbs * db->db_level; + uint64_t beginblk = shift >= 64 ? 0 : + (dth->dth_arg1 >> shift); + uint64_t endblk = shift >= 64 ? 0 : + ((dth->dth_arg1 + dth->dth_arg2 - 1) >> shift); + uint64_t blkid = db->db_blkid; + + /* XXX dth_arg2 better not be zero... */ + + dprintf("found dth type %x beginblk=%llx endblk=%llx\n", + dth->dth_type, beginblk, endblk); + + switch (dth->dth_type) { + case THT_WRITE: + if (blkid >= beginblk && blkid <= endblk) + match_offset = TRUE; + /* + * We will let this hold work for the bonus + * buffer so that we don't need to hold it + * when creating a new object. + */ + if (blkid == DB_BONUS_BLKID) + match_offset = TRUE; + /* + * They might have to increase nlevels, + * thus dirtying the new TLIBs. 
Or the + * might have to change the block size, + * thus dirying the new lvl=0 blk=0. + */ + if (blkid == 0) + match_offset = TRUE; + break; + case THT_FREE: + if (blkid == beginblk && + (dth->dth_arg1 != 0 || + dn->dn_maxblkid == 0)) + match_offset = TRUE; + if (blkid == endblk && + dth->dth_arg2 != DMU_OBJECT_END) + match_offset = TRUE; + break; + case THT_BONUS: + if (blkid == DB_BONUS_BLKID) + match_offset = TRUE; + break; + case THT_ZAP: + match_offset = TRUE; + break; + case THT_NEWOBJECT: + match_object = TRUE; + break; + default: + ASSERT(!"bad dth_type"); + } + } + if (match_object && match_offset) + return; + } + panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", + (u_longlong_t)db->db.db_object, db->db_level, + (u_longlong_t)db->db_blkid); +#endif +} + +static int +dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how, dmu_tx_hold_t **last_dth) +{ + dmu_tx_hold_t *dth; + uint64_t lsize, asize, fsize; + + *last_dth = NULL; + + tx->tx_space_towrite = 0; + tx->tx_space_tofree = 0; + tx->tx_space_tooverwrite = 0; + tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); + + if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg) + return (ERESTART); + + for (dth = list_head(&tx->tx_holds); dth; + *last_dth = dth, dth = list_next(&tx->tx_holds, dth)) { + dnode_t *dn = dth->dth_dnode; + if (dn != NULL) { + mutex_enter(&dn->dn_mtx); + while (dn->dn_assigned_txg == tx->tx_txg - 1) { + if (txg_how != TXG_WAIT) { + mutex_exit(&dn->dn_mtx); + return (ERESTART); + } + cv_wait(&dn->dn_notxholds, &dn->dn_mtx); + } + if (dn->dn_assigned_txg == 0) { + ASSERT(dn->dn_assigned_tx == NULL); + dn->dn_assigned_txg = tx->tx_txg; + dn->dn_assigned_tx = tx; + } else { + ASSERT(dn->dn_assigned_txg == tx->tx_txg); + if (dn->dn_assigned_tx != tx) + dn->dn_assigned_tx = NULL; + } + (void) refcount_add(&dn->dn_tx_holds, tx); + mutex_exit(&dn->dn_mtx); + } + if (dth->dth_func) + dth->dth_func(tx, dn, dth->dth_arg1, dth->dth_arg2); + } + + /* + * Convert logical size to 
worst-case allocated size. + */ + fsize = spa_get_asize(tx->tx_pool->dp_spa, tx->tx_space_tooverwrite) + + tx->tx_space_tofree; + lsize = tx->tx_space_towrite + tx->tx_space_tooverwrite; + asize = spa_get_asize(tx->tx_pool->dp_spa, lsize); + tx->tx_space_towrite = asize; + + if (tx->tx_dir && asize != 0) { + int err = dsl_dir_tempreserve_space(tx->tx_dir, + lsize, asize, fsize, &tx->tx_tempreserve_cookie, tx); + if (err) + return (err); + } + + return (0); +} + +static uint64_t +dmu_tx_unassign(dmu_tx_t *tx, dmu_tx_hold_t *last_dth) +{ + uint64_t txg = tx->tx_txg; + dmu_tx_hold_t *dth; + + ASSERT(txg != 0); + + txg_rele_to_quiesce(&tx->tx_txgh); + + for (dth = last_dth; dth; dth = list_prev(&tx->tx_holds, dth)) { + dnode_t *dn = dth->dth_dnode; + + if (dn == NULL) + continue; + mutex_enter(&dn->dn_mtx); + ASSERT3U(dn->dn_assigned_txg, ==, txg); + + if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { + dn->dn_assigned_txg = 0; + dn->dn_assigned_tx = NULL; + cv_broadcast(&dn->dn_notxholds); + } + mutex_exit(&dn->dn_mtx); + } + + txg_rele_to_sync(&tx->tx_txgh); + + tx->tx_txg = 0; + return (txg); +} + +/* + * Assign tx to a transaction group. txg_how can be one of: + * + * (1) TXG_WAIT. If the current open txg is full, waits until there's + * a new one. This should be used when you're not holding locks. + * If will only fail if we're truly out of space (or over quota). + * + * (2) TXG_NOWAIT. If we can't assign into the current open txg without + * blocking, returns immediately with ERESTART. This should be used + * whenever you're holding locks. On an ERESTART error, the caller + * should drop locks, do a txg_wait_open(dp, 0), and try again. + * + * (3) A specific txg. Use this if you need to ensure that multiple + * transactions all sync in the same txg. Like TXG_NOWAIT, it + * returns ERESTART if it can't assign you into the requested txg. 
+ */ +int +dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) +{ + dmu_tx_hold_t *last_dth; + int err; + + ASSERT(tx->tx_txg == 0); + ASSERT(txg_how != 0); + ASSERT(!dsl_pool_sync_context(tx->tx_pool)); + ASSERT3U(tx->tx_space_towrite, ==, 0); + ASSERT3U(tx->tx_space_tofree, ==, 0); + + while ((err = dmu_tx_try_assign(tx, txg_how, &last_dth)) != 0) { + uint64_t txg = dmu_tx_unassign(tx, last_dth); + + if (err != ERESTART || txg_how != TXG_WAIT) + return (err); + + txg_wait_open(tx->tx_pool, txg + 1); + } + + txg_rele_to_quiesce(&tx->tx_txgh); + + return (0); +} + +void +dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta) +{ + if (tx->tx_dir == NULL || delta == 0) + return; + + if (delta > 0) { + ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=, + tx->tx_space_towrite); + (void) refcount_add_many(&tx->tx_space_written, delta, NULL); + } else { + (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL); + } +} + +void +dmu_tx_commit(dmu_tx_t *tx) +{ + dmu_tx_hold_t *dth; + + ASSERT(tx->tx_txg != 0); + + while (dth = list_head(&tx->tx_holds)) { + dnode_t *dn = dth->dth_dnode; + + list_remove(&tx->tx_holds, dth); + kmem_free(dth, sizeof (dmu_tx_hold_t)); + if (dn == NULL) + continue; + mutex_enter(&dn->dn_mtx); + ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); + + if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { + dn->dn_assigned_txg = 0; + dn->dn_assigned_tx = NULL; + cv_broadcast(&dn->dn_notxholds); + } + mutex_exit(&dn->dn_mtx); + dnode_rele(dn, tx); + } + + if (tx->tx_dir && tx->tx_space_towrite > 0) { + dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); + } + + if (tx->tx_anyobj == FALSE) + txg_rele_to_sync(&tx->tx_txgh); + dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n", + tx->tx_space_towrite, refcount_count(&tx->tx_space_written), + tx->tx_space_tofree, refcount_count(&tx->tx_space_freed)); + refcount_destroy_many(&tx->tx_space_written, + refcount_count(&tx->tx_space_written)); + refcount_destroy_many(&tx->tx_space_freed, + 
refcount_count(&tx->tx_space_freed)); +#ifdef ZFS_DEBUG + if (tx->tx_debug_buf) + kmem_free(tx->tx_debug_buf, 4096); +#endif + kmem_free(tx, sizeof (dmu_tx_t)); +} + +void +dmu_tx_abort(dmu_tx_t *tx) +{ + dmu_tx_hold_t *dth; + + ASSERT(tx->tx_txg == 0); + + while (dth = list_head(&tx->tx_holds)) { + dnode_t *dn = dth->dth_dnode; + + list_remove(&tx->tx_holds, dth); + kmem_free(dth, sizeof (dmu_tx_hold_t)); + if (dn != NULL) + dnode_rele(dn, tx); + } + refcount_destroy_many(&tx->tx_space_written, + refcount_count(&tx->tx_space_written)); + refcount_destroy_many(&tx->tx_space_freed, + refcount_count(&tx->tx_space_freed)); +#ifdef ZFS_DEBUG + if (tx->tx_debug_buf) + kmem_free(tx->tx_debug_buf, 4096); +#endif + kmem_free(tx, sizeof (dmu_tx_t)); +} + +uint64_t +dmu_tx_get_txg(dmu_tx_t *tx) +{ + ASSERT(tx->tx_txg != 0); + return (tx->tx_txg); +} diff --git a/usr/src/uts/common/fs/zfs/dmu_zfetch.c b/usr/src/uts/common/fs/zfs/dmu_zfetch.c new file mode 100644 index 0000000000..cfaeaf0674 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/dmu_zfetch.c @@ -0,0 +1,603 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. 
All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/dnode.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_zfetch.h> +#include <sys/dmu.h> +#include <sys/dbuf.h> + +/* + * I'm against tune-ables, but these should probably exist as tweakable globals + * until we can get this working the way we want it to. + */ + +/* max # of streams per zfetch */ +uint32_t zfetch_max_streams = 8; +/* min time before stream reclaim */ +uint32_t zfetch_min_sec_reap = 2; +/* max number of blocks to fetch at a time */ +uint32_t zfetch_block_cap = 32; +/* number of bytes in a array_read at which we stop prefetching (1Mb) */ +uint64_t zfetch_array_rd_sz = 1024 * 1024; + +/* forward decls for static routines */ +static int dmu_zfetch_colinear(zfetch_t *, zstream_t *); +static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *); +static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t); +static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t); +static int dmu_zfetch_find(zfetch_t *, zstream_t *); +static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *); +static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *); +static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *); +static void dmu_zfetch_stream_update(zfetch_t *, zstream_t *); +static int dmu_zfetch_streams_equal(zstream_t *, zstream_t *); + + +/* + * Given a zfetch structure and a zstream structure, determine whether the + * blocks to be read are part of a co-linear to a pair of existing prefetch + * streams. If a set is found, coalesce the streams, removing one, and + * configure the prefetch so it looks for a strided access pattern. + * + * If no co-linear streams are found, return NULL. 
+ */ +static int +dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh) +{ + zstream_t *z_walk; + zstream_t *z_comp; + + rw_enter(&zf->zf_rwlock, RW_WRITER); + + if (zh == NULL) { + rw_exit(&zf->zf_rwlock); + return (0); + } + + for (z_walk = list_head(&zf->zf_stream); z_walk; + z_walk = list_next(&zf->zf_stream, z_walk)) { + for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp; + z_comp = list_next(&zf->zf_stream, z_comp)) { + int64_t diff; + + if (z_walk->zst_len != z_walk->zst_stride || + z_comp->zst_len != z_comp->zst_stride) { + continue; + } + + diff = z_comp->zst_offset - z_walk->zst_offset; + if (z_comp->zst_offset + diff == zh->zst_offset) { + z_walk->zst_offset = zh->zst_offset; + z_walk->zst_direction = diff < 0 ? -1 : 1; + z_walk->zst_stride = + diff * z_walk->zst_direction; + z_walk->zst_ph_offset = + zh->zst_offset + z_walk->zst_stride; + dmu_zfetch_stream_remove(zf, z_comp); + mutex_destroy(&z_comp->zst_lock); + kmem_free(z_comp, sizeof (zstream_t)); + + dmu_zfetch_dofetch(zf, z_walk); + + rw_exit(&zf->zf_rwlock); + return (1); + } + + diff = z_walk->zst_offset - z_comp->zst_offset; + if (z_walk->zst_offset + diff == zh->zst_offset) { + z_walk->zst_offset = zh->zst_offset; + z_walk->zst_direction = diff < 0 ? -1 : 1; + z_walk->zst_stride = + diff * z_walk->zst_direction; + z_walk->zst_ph_offset = + zh->zst_offset + z_walk->zst_stride; + dmu_zfetch_stream_remove(zf, z_comp); + mutex_destroy(&z_comp->zst_lock); + kmem_free(z_comp, sizeof (zstream_t)); + + dmu_zfetch_dofetch(zf, z_walk); + + rw_exit(&zf->zf_rwlock); + return (1); + } + } + } + + rw_exit(&zf->zf_rwlock); + return (0); +} + +/* + * Given a zstream_t, determine the bounds of the prefetch. Then call the + * routine that actually prefetches the individual blocks. 
+ */ +static void +dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs) +{ + uint64_t prefetch_tail; + uint64_t prefetch_limit; + uint64_t prefetch_ofst; + uint64_t prefetch_len; + uint64_t blocks_fetched; + + zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len); + zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap); + + prefetch_tail = MAX((int64_t)zs->zst_ph_offset, + (int64_t)(zs->zst_offset + zs->zst_stride)); + /* + * XXX: use a faster division method? + */ + prefetch_limit = zs->zst_offset + zs->zst_len + + (zs->zst_cap * zs->zst_stride) / zs->zst_len; + + while (prefetch_tail < prefetch_limit) { + prefetch_ofst = zs->zst_offset + zs->zst_direction * + (prefetch_tail - zs->zst_offset); + + prefetch_len = zs->zst_len; + + /* + * Don't prefetch beyond the end of the file, if working + * backwards. + */ + if ((zs->zst_direction == ZFETCH_BACKWARD) && + (prefetch_ofst > prefetch_tail)) { + prefetch_len += prefetch_ofst; + prefetch_ofst = 0; + } + + /* don't prefetch more than we're supposed to */ + if (prefetch_len > zs->zst_len) + break; + + blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode, + prefetch_ofst, zs->zst_len); + + prefetch_tail += zs->zst_stride; + /* stop if we've run out of stuff to prefetch */ + if (blocks_fetched < zs->zst_len) + break; + } + zs->zst_ph_offset = prefetch_tail; + zs->zst_last = lbolt; +} + +/* + * This takes a pointer to a zfetch structure and a dnode. It performs the + * necessary setup for the zfetch structure, grokking data from the + * associated dnode. + */ +void +dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) +{ + if (zf == NULL) { + return; + } + + zf->zf_dnode = dno; + zf->zf_stream_cnt = 0; + zf->zf_alloc_fail = 0; + + list_create(&zf->zf_stream, sizeof (zstream_t), + offsetof(zstream_t, zst_node)); + + rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL); +} + +/* + * This function computes the actual size, in blocks, that can be prefetched, + * and fetches it. 
+ */ +static uint64_t +dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks) +{ + uint64_t fetchsz; + uint64_t i; + + fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks); + + for (i = 0; i < fetchsz; i++) { + dbuf_prefetch(dn, blkid + i); + } + + return (fetchsz); +} + +/* + * this function returns the number of blocks that would be prefetched, based + * upon the supplied dnode, blockid, and nblks. This is used so that we can + * update streams in place, and then prefetch with their old value after the + * fact. This way, we can delay the prefetch, but subsequent accesses to the + * stream won't result in the same data being prefetched multiple times. + */ +static uint64_t +dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks) +{ + uint64_t fetchsz; + + if (blkid > dn->dn_maxblkid) { + return (0); + } + + /* compute fetch size */ + if (blkid + nblks > dn->dn_maxblkid) { + fetchsz = dn->dn_maxblkid - blkid; + ASSERT(blkid + fetchsz <= dn->dn_maxblkid); + } else { + fetchsz = nblks; + } + + + return (fetchsz); +} + +/* + * given a zfetch and a zsearch structure, see if there is an associated zstream + * for this block read. If so, it starts a prefetch for the stream it + * located and returns true, otherwise it returns false + */ +static int +dmu_zfetch_find(zfetch_t *zf, zstream_t *zh) +{ + zstream_t *zs; + int64_t diff; + int rc = 0; + + if (zh == NULL) + return (0); + + /* + * XXX: This locking strategy is a bit coarse; however, it's impact has + * yet to be tested. If this turns out to be an issue, it can be + * modified in a number of different ways. 
+ */ + + rw_enter(&zf->zf_rwlock, RW_READER); +top: + + for (zs = list_head(&zf->zf_stream); zs; + zs = list_next(&zf->zf_stream, zs)) { + + + if (zs->zst_len == 0) { + /* bogus stream */ + continue; + } + + if (zh->zst_offset - zs->zst_offset < zs->zst_len) { + /* already fetched */ + rw_exit(&zf->zf_rwlock); + return (1); + } + + if (zh->zst_offset == zs->zst_offset + zs->zst_len) { + /* forward sequential access */ + + mutex_enter(&zs->zst_lock); + + if (zh->zst_offset != zs->zst_offset + zs->zst_len) { + mutex_exit(&zs->zst_lock); + goto top; + } + + zs->zst_len += zh->zst_len; + diff = zs->zst_len - zfetch_block_cap; + if (diff > 0) { + zs->zst_offset += diff; + zs->zst_len = zs->zst_len > diff ? + zs->zst_len - diff : 0; + } + zs->zst_direction = ZFETCH_FORWARD; + + break; + + } else if (zh->zst_offset == zs->zst_offset - zh->zst_len) { + /* backwards sequential access */ + + mutex_enter(&zs->zst_lock); + + if (zh->zst_offset != zs->zst_offset - zh->zst_len) { + mutex_exit(&zs->zst_lock); + goto top; + } + + zs->zst_offset = zs->zst_offset > zh->zst_len ? + zs->zst_offset - zh->zst_len : 0; + zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ? + zs->zst_ph_offset - zh->zst_len : 0; + zs->zst_len += zh->zst_len; + + diff = zs->zst_len - zfetch_block_cap; + if (diff > 0) { + zs->zst_ph_offset = zs->zst_ph_offset > diff ? + zs->zst_ph_offset - diff : 0; + zs->zst_len = zs->zst_len > diff ? 
+ zs->zst_len - diff : zs->zst_len; + } + zs->zst_direction = ZFETCH_BACKWARD; + + break; + + } else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride < + zs->zst_len) && (zs->zst_len != zs->zst_stride)) { + /* strided forward access */ + + mutex_enter(&zs->zst_lock); + + if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >= + zs->zst_len) || (zs->zst_len == zs->zst_stride)) { + mutex_exit(&zs->zst_lock); + goto top; + } + + zs->zst_offset += zs->zst_stride; + zs->zst_direction = ZFETCH_FORWARD; + + break; + + } else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride < + zs->zst_len) && (zs->zst_len != zs->zst_stride)) { + /* strided reverse access */ + + mutex_enter(&zs->zst_lock); + + if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >= + zs->zst_len) || (zs->zst_len == zs->zst_stride)) { + mutex_exit(&zs->zst_lock); + goto top; + } + + zs->zst_offset = zs->zst_offset > zs->zst_stride ? + zs->zst_offset - zs->zst_stride : 0; + zs->zst_ph_offset = (zs->zst_ph_offset > + (2 * zs->zst_stride)) ? + (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0; + zs->zst_direction = ZFETCH_BACKWARD; + + break; + } + } + + if (zs) { + rc = 1; + dmu_zfetch_dofetch(zf, zs); + mutex_exit(&zs->zst_lock); + } + + rw_exit(&zf->zf_rwlock); + return (rc); +} + +/* + * Clean-up state associated with a zfetch structure. This frees allocated + * structure members, empties the zf_stream tree, and generally makes things + * nice. This doesn't free the zfetch_t itself, that's left to the caller. 
+ */ +void +dmu_zfetch_rele(zfetch_t *zf) +{ + zstream_t *zs; + zstream_t *zs_next; + + ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock)); + + for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) { + zs_next = list_next(&zf->zf_stream, zs); + + list_remove(&zf->zf_stream, zs); + mutex_destroy(&zs->zst_lock); + kmem_free(zs, sizeof (zstream_t)); + } + list_destroy(&zf->zf_stream); + rw_destroy(&zf->zf_rwlock); + + zf->zf_dnode = NULL; +} + +/* + * Given a zfetch and zstream structure, insert the zstream structure into the + * AVL tree contained within the zfetch structure. Peform the appropriate + * book-keeping. It is possible that another thread has inserted a stream which + * matches one that we are about to insert, so we must be sure to check for this + * case. If one is found, return failure, and let the caller cleanup the + * duplicates. + */ +static int +dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs) +{ + zstream_t *zs_walk; + zstream_t *zs_next; + + ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); + + for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) { + zs_next = list_next(&zf->zf_stream, zs_walk); + + if (dmu_zfetch_streams_equal(zs_walk, zs)) { + return (0); + } + } + + list_insert_head(&zf->zf_stream, zs); + zf->zf_stream_cnt++; + + return (1); +} + + +/* + * Walk the list of zstreams in the given zfetch, find an old one (by time), and + * reclaim it for use by the caller. 
+ */ +static zstream_t * +dmu_zfetch_stream_reclaim(zfetch_t *zf) +{ + zstream_t *zs; + + rw_enter(&zf->zf_rwlock, RW_WRITER); + + for (zs = list_head(&zf->zf_stream); zs; + zs = list_next(&zf->zf_stream, zs)) { + + if (((lbolt - zs->zst_last) / hz) > zfetch_min_sec_reap) + break; + } + + if (zs) { + dmu_zfetch_stream_remove(zf, zs); + mutex_destroy(&zs->zst_lock); + bzero(zs, sizeof (zstream_t)); + } else { + zf->zf_alloc_fail++; + } + rw_exit(&zf->zf_rwlock); + + return (zs); +} + +/* + * Given a zfetch and zstream structure, remove the zstream structure from its + * container in the zfetch structure. Perform the appropriate book-keeping. + */ +static void +dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) +{ + ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); + + list_remove(&zf->zf_stream, zs); + zf->zf_stream_cnt--; +} + +static int +dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2) +{ + if (zs1->zst_offset != zs2->zst_offset) + return (0); + + if (zs1->zst_len != zs2->zst_len) + return (0); + + if (zs1->zst_stride != zs2->zst_stride) + return (0); + + if (zs1->zst_ph_offset != zs2->zst_ph_offset) + return (0); + + if (zs1->zst_cap != zs2->zst_cap) + return (0); + + if (zs1->zst_direction != zs2->zst_direction) + return (0); + + return (1); +} + +/* + * This is the prefetch entry point. It calls all of the other dmu_zfetch + * routines to create, delete, find, or operate upon prefetch streams. 
+ */ +void +dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size) +{ + zstream_t zst; + zstream_t *newstream; + int fetched; + int inserted; + unsigned int blkshft; + uint64_t blksz; + + /* files that aren't ln2 blocksz are only one block -- nothing to do */ + if (!zf->zf_dnode->dn_datablkshift) { + return; + } + + /* convert offset and size, into blockid and nblocks */ + blkshft = zf->zf_dnode->dn_datablkshift; + blksz = (1 << blkshft); + + bzero(&zst, sizeof (zstream_t)); + zst.zst_offset = offset >> blkshft; + zst.zst_len = (P2ROUNDUP(offset + size, blksz) - + P2ALIGN(offset, blksz)) >> blkshft; + + fetched = dmu_zfetch_find(zf, &zst); + if (!fetched) { + fetched = dmu_zfetch_colinear(zf, &zst); + } + + if (!fetched) { + newstream = dmu_zfetch_stream_reclaim(zf); + + /* + * we still couldn't find a stream, drop the lock, and allocate + * one if possible. Otherwise, give up and go home. + */ + if (newstream == NULL) { + uint64_t maxblocks; + uint32_t max_streams; + uint32_t cur_streams; + + cur_streams = zf->zf_stream_cnt; + maxblocks = zf->zf_dnode->dn_maxblkid; + + max_streams = MIN(zfetch_max_streams, + (maxblocks / zfetch_block_cap)); + if (max_streams == 0) { + max_streams++; + } + + if (cur_streams >= max_streams) { + return; + } + + newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP); + } + + newstream->zst_offset = zst.zst_offset; + newstream->zst_len = zst.zst_len; + newstream->zst_stride = zst.zst_len; + newstream->zst_ph_offset = zst.zst_len + zst.zst_offset; + newstream->zst_cap = zst.zst_len; + newstream->zst_direction = ZFETCH_FORWARD; + newstream->zst_last = lbolt; + + mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL); + + rw_enter(&zf->zf_rwlock, RW_WRITER); + inserted = dmu_zfetch_stream_insert(zf, newstream); + rw_exit(&zf->zf_rwlock); + + if (!inserted) { + mutex_destroy(&newstream->zst_lock); + kmem_free(newstream, sizeof (zstream_t)); + } + } +} diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c 
new file mode 100644 index 0000000000..6b25b35ab1 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/dnode.c @@ -0,0 +1,1304 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/dmu.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_dataset.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu_zfetch.h> + +static int free_range_compar(const void *node1, const void *node2); + +static kmem_cache_t *dnode_cache; + +static dnode_phys_t dnode_phys_zero; + +int zfs_default_bs = SPA_MINBLOCKSHIFT; +int zfs_default_ibs = DN_MAX_INDBLKSHIFT; + +/* ARGSUSED */ +static int +dnode_cons(void *arg, void *unused, int kmflag) +{ + int i; + dnode_t *dn = arg; + bzero(dn, sizeof (dnode_t)); + + rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL); + refcount_create(&dn->dn_holds); + refcount_create(&dn->dn_tx_holds); + + for (i = 0; i < TXG_SIZE; i++) { + avl_create(&dn->dn_ranges[i], free_range_compar, + sizeof (free_range_t), + offsetof(struct free_range, fr_node)); + list_create(&dn->dn_dirty_dbufs[i], + sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_dirty_node[i])); + } + + list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_link)); + + return (0); +} + +/* ARGSUSED */ +static void +dnode_dest(void *arg, void *unused) +{ + int i; + dnode_t *dn = arg; + + rw_destroy(&dn->dn_struct_rwlock); + mutex_destroy(&dn->dn_mtx); + mutex_destroy(&dn->dn_dbufs_mtx); + refcount_destroy(&dn->dn_holds); + refcount_destroy(&dn->dn_tx_holds); + + for (i = 0; i < TXG_SIZE; i++) { + avl_destroy(&dn->dn_ranges[i]); + list_destroy(&dn->dn_dirty_dbufs[i]); + } + + list_destroy(&dn->dn_dbufs); +} + +void +dnode_init(void) +{ + dnode_cache = kmem_cache_create("dnode_t", + sizeof (dnode_t), + 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0); +} + +void +dnode_fini(void) +{ + 
kmem_cache_destroy(dnode_cache); +} + + +void +dnode_verify(dnode_t *dn) +{ +#ifdef ZFS_DEBUG + int drop_struct_lock = FALSE; + + ASSERT(dn->dn_phys); + ASSERT(dn->dn_objset); + + ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); + + if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY)) + return; + + if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + drop_struct_lock = TRUE; + } + if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) { + int i; + ASSERT3U(dn->dn_indblkshift, >=, 0); + ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT); + if (dn->dn_datablkshift) { + ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT); + ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT); + ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz); + } + ASSERT3U(dn->dn_nlevels, <=, 30); + ASSERT3U(dn->dn_type, <=, DMU_OT_NUMTYPES); + ASSERT3U(dn->dn_nblkptr, >=, 1); + ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); + ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); + ASSERT3U(dn->dn_datablksz, ==, + dn->dn_datablkszsec << SPA_MINBLOCKSHIFT); + ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0); + ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) + + dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); + for (i = 0; i < TXG_SIZE; i++) { + ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels); + } + } + if (dn->dn_phys->dn_type != DMU_OT_NONE) + ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels); + ASSERT(IS_DNODE_DNODE(dn->dn_object) || dn->dn_dbuf); + if (dn->dn_dbuf != NULL) { + ASSERT3P(dn->dn_phys, ==, + (dnode_phys_t *)dn->dn_dbuf->db.db_data + + (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT))); + } + if (drop_struct_lock) + rw_exit(&dn->dn_struct_rwlock); +#endif +} + +void +dnode_byteswap(dnode_phys_t *dnp) +{ + uint64_t *buf64 = (void*)&dnp->dn_blkptr; + int i; + + if (dnp->dn_type == DMU_OT_NONE) { + bzero(dnp, sizeof (dnode_phys_t)); + return; + } + + dnp->dn_type = BSWAP_8(dnp->dn_type); + dnp->dn_indblkshift = 
BSWAP_8(dnp->dn_indblkshift); + dnp->dn_nlevels = BSWAP_8(dnp->dn_nlevels); + dnp->dn_nblkptr = BSWAP_8(dnp->dn_nblkptr); + dnp->dn_bonustype = BSWAP_8(dnp->dn_bonustype); + dnp->dn_checksum = BSWAP_8(dnp->dn_checksum); + dnp->dn_compress = BSWAP_8(dnp->dn_compress); + dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec); + dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen); + dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid); + dnp->dn_secphys = BSWAP_64(dnp->dn_secphys); + + /* + * dn_nblkptr is only one byte, so it's OK to read it in either + * byte order. We can't read dn_bouslen. + */ + ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT); + ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR); + for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++) + buf64[i] = BSWAP_64(buf64[i]); + + /* + * OK to check dn_bonuslen for zero, because it won't matter if + * we have the wrong byte order. This is necessary because the + * dnode dnode is smaller than a regular dnode. + */ + if (dnp->dn_bonuslen != 0) { + /* + * Note that the bonus length calculated here may be + * longer than the actual bonus buffer. This is because + * we always put the bonus buffer after the last block + * pointer (instead of packing it against the end of the + * dnode buffer). 
+ */ + int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t); + size_t len = DN_MAX_BONUSLEN - off; + dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len); + } +} + +void +dnode_buf_byteswap(void *vbuf, size_t size) +{ + dnode_phys_t *buf = vbuf; + int i; + + ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT)); + ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0); + + size >>= DNODE_SHIFT; + for (i = 0; i < size; i++) { + dnode_byteswap(buf); + buf++; + } +} + +static int +free_range_compar(const void *node1, const void *node2) +{ + const free_range_t *rp1 = node1; + const free_range_t *rp2 = node2; + + if (rp1->fr_blkid < rp2->fr_blkid) + return (-1); + else if (rp1->fr_blkid > rp2->fr_blkid) + return (1); + else return (0); +} + +static void +dnode_setdblksz(dnode_t *dn, int size) +{ + ASSERT3U(P2PHASE(size, SPA_MINBLOCKSIZE), ==, 0); + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(size, >=, SPA_MINBLOCKSIZE); + ASSERT3U(size >> SPA_MINBLOCKSHIFT, <, + 1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8)); + dn->dn_datablksz = size; + dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT; + dn->dn_datablkshift = ISP2(size) ? 
highbit(size - 1) : 0; +} + +static dnode_t * +dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, + uint64_t object) +{ + dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); + (void) dnode_cons(dn, NULL, 0); /* XXX */ + + dn->dn_objset = os; + dn->dn_object = object; + dn->dn_dbuf = db; + dn->dn_phys = dnp; + + if (dnp->dn_datablkszsec) + dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); + dn->dn_indblkshift = dnp->dn_indblkshift; + dn->dn_nlevels = dnp->dn_nlevels; + dn->dn_type = dnp->dn_type; + dn->dn_nblkptr = dnp->dn_nblkptr; + dn->dn_checksum = dnp->dn_checksum; + dn->dn_compress = dnp->dn_compress; + dn->dn_bonustype = dnp->dn_bonustype; + dn->dn_bonuslen = dnp->dn_bonuslen; + dn->dn_maxblkid = dnp->dn_maxblkid; + + dmu_zfetch_init(&dn->dn_zfetch, dn); + + ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); + mutex_enter(&os->os_lock); + list_insert_head(&os->os_dnodes, dn); + mutex_exit(&os->os_lock); + + return (dn); +} + +static void +dnode_destroy(dnode_t *dn) +{ + objset_impl_t *os = dn->dn_objset; + + mutex_enter(&os->os_lock); + list_remove(&os->os_dnodes, dn); + mutex_exit(&os->os_lock); + + if (dn->dn_dirtyctx_firstset) { + kmem_free(dn->dn_dirtyctx_firstset, 1); + dn->dn_dirtyctx_firstset = NULL; + } + dmu_zfetch_rele(&dn->dn_zfetch); + kmem_cache_free(dnode_cache, dn); +} + +void +dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + int i; + + if (blocksize == 0) + blocksize = 1 << zfs_default_bs; + + blocksize = MIN(MAX(blocksize, SPA_MINBLOCKSIZE), SPA_MAXBLOCKSIZE); + + if (ibs == 0) + ibs = zfs_default_ibs; + + ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT); + + dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset, + dn->dn_object, tx->tx_txg, blocksize, ibs); + + ASSERT(dn->dn_type == DMU_OT_NONE); + ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); + 
ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE); + ASSERT(ot != DMU_OT_NONE); + ASSERT3U(ot, <, DMU_OT_NUMTYPES); + ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || + (bonustype != DMU_OT_NONE && bonuslen != 0)); + ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); + ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); + ASSERT(dn->dn_type == DMU_OT_NONE); + ASSERT3U(dn->dn_maxblkid, ==, 0); + ASSERT3U(dn->dn_allocated_txg, ==, 0); + ASSERT3U(dn->dn_assigned_txg, ==, 0); + ASSERT(refcount_is_zero(&dn->dn_tx_holds)); + ASSERT3U(refcount_count(&dn->dn_holds), <=, 1); + ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); + + for (i = 0; i < TXG_SIZE; i++) { + ASSERT3U(dn->dn_next_nlevels[i], ==, 0); + ASSERT3U(dn->dn_next_indblkshift[i], ==, 0); + ASSERT3U(dn->dn_dirtyblksz[i], ==, 0); + ASSERT3P(list_head(&dn->dn_dirty_dbufs[i]), ==, NULL); + ASSERT3U(avl_numnodes(&dn->dn_ranges[i]), ==, 0); + } + + dn->dn_type = ot; + dnode_setdblksz(dn, blocksize); + dn->dn_indblkshift = ibs; + dn->dn_nlevels = 1; + dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + dn->dn_bonustype = bonustype; + dn->dn_bonuslen = bonuslen; + dn->dn_checksum = ZIO_CHECKSUM_INHERIT; + dn->dn_compress = ZIO_COMPRESS_INHERIT; + dn->dn_dirtyctx = 0; + + dn->dn_free_txg = 0; + if (dn->dn_dirtyctx_firstset) { + kmem_free(dn->dn_dirtyctx_firstset, 1); + dn->dn_dirtyctx_firstset = NULL; + } + + dn->dn_allocated_txg = tx->tx_txg; + dnode_setdirty(dn, tx); +} + +void +dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = NULL; + + ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); + ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0); + ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); + ASSERT(!(dn->dn_object & DMU_PRIVATE_OBJECT) || dmu_tx_private_ok(tx)); + ASSERT(tx->tx_txg != 0); + ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || + (bonustype != DMU_OT_NONE && bonuslen != 0)); + 
ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); + ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); + ASSERT(dn->dn_dirtyblksz[0] == 0); + ASSERT(dn->dn_dirtyblksz[1] == 0); + ASSERT(dn->dn_dirtyblksz[2] == 0); + ASSERT(dn->dn_dirtyblksz[3] == 0); + + /* + * XXX I should really have a generation number to tell if we + * need to do this... + */ + if (blocksize != dn->dn_datablksz || + dn->dn_bonustype != bonustype || dn->dn_bonuslen != bonuslen) { + /* free all old data */ + dnode_free_range(dn, 0, -1ULL, tx); + } + + /* change blocksize */ + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + dnode_setdblksz(dn, blocksize); + dnode_setdirty(dn, tx); + /* don't need dd_dirty_mtx, dnode is already dirty */ + ASSERT(dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] != 0); + dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] = blocksize; + rw_exit(&dn->dn_struct_rwlock); + + /* change type */ + dn->dn_type = ot; + + if (dn->dn_bonuslen != bonuslen) { + /* change bonus size */ + if (bonuslen == 0) + bonuslen = 1; /* XXX */ + db = dbuf_hold_bonus(dn, FTAG); + dbuf_read(db); + mutex_enter(&db->db_mtx); + ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen); + ASSERT(db->db.db_data != NULL); + db->db.db_size = bonuslen; + mutex_exit(&db->db_mtx); + dbuf_dirty(db, tx); + } + + /* change bonus size and type */ + mutex_enter(&dn->dn_mtx); + dn->dn_bonustype = bonustype; + dn->dn_bonuslen = bonuslen; + dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + dn->dn_checksum = ZIO_CHECKSUM_INHERIT; + dn->dn_compress = ZIO_COMPRESS_INHERIT; + ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); + + dn->dn_allocated_txg = tx->tx_txg; + mutex_exit(&dn->dn_mtx); + + if (db) + dbuf_remove_ref(db, FTAG); +} + +void +dnode_special_close(dnode_t *dn) +{ + dnode_destroy(dn); +} + +dnode_t * +dnode_special_open(objset_impl_t *os, dnode_phys_t *dnp, uint64_t object) +{ + dnode_t *dn = dnode_create(os, dnp, NULL, object); + dnode_verify(dn); + return (dn); +} + +static void +dnode_buf_pageout(dmu_buf_t *db, void *arg) +{ + dnode_t 
**children_dnodes = arg; + int i; + int epb = db->db_size >> DNODE_SHIFT; + + for (i = 0; i < epb; i++) { + dnode_t *dn = children_dnodes[i]; + int n; + + if (dn == NULL) + continue; +#ifdef ZFS_DEBUG + /* + * If there are holds on this dnode, then there should + * be holds on the dnode's containing dbuf as well; thus + * it wouldn't be eligable for eviction and this function + * would not have been called. + */ + ASSERT(refcount_is_zero(&dn->dn_holds)); + ASSERT(list_head(&dn->dn_dbufs) == NULL); + ASSERT(refcount_is_zero(&dn->dn_tx_holds)); + + for (n = 0; n < TXG_SIZE; n++) + ASSERT(dn->dn_dirtyblksz[n] == 0); +#endif + children_dnodes[i] = NULL; + dnode_destroy(dn); + } + kmem_free(children_dnodes, epb * sizeof (dnode_t *)); +} + +/* + * Returns held dnode if the object number is valid, NULL if not. + * Note that this will succeed even for free dnodes. + */ +dnode_t * +dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, void *ref) +{ + int epb, idx; + int drop_struct_lock = FALSE; + uint64_t blk; + dnode_t *mdn, *dn; + dmu_buf_impl_t *db; + dnode_t **children_dnodes; + + if (object == 0 || object >= DN_MAX_OBJECT) + return (NULL); + + mdn = os->os_meta_dnode; + + dnode_verify(mdn); + + if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) { + rw_enter(&mdn->dn_struct_rwlock, RW_READER); + drop_struct_lock = TRUE; + } + + blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t)); + + db = dbuf_hold(mdn, blk); + if (drop_struct_lock) + rw_exit(&mdn->dn_struct_rwlock); + dbuf_read(db); + + ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT); + epb = db->db.db_size >> DNODE_SHIFT; + + idx = object & (epb-1); + + children_dnodes = dmu_buf_get_user(&db->db); + if (children_dnodes == NULL) { + dnode_t **winner; + children_dnodes = kmem_zalloc(epb * sizeof (dnode_t *), + KM_SLEEP); + if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL, + dnode_buf_pageout)) { + kmem_free(children_dnodes, epb * sizeof (dnode_t *)); + children_dnodes = winner; + } + } + + if ((dn = 
children_dnodes[idx]) == NULL) { + dnode_t *winner; + dn = dnode_create(os, (dnode_phys_t *)db->db.db_data+idx, + db, object); + winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn); + if (winner != NULL) { + dnode_destroy(dn); + dn = winner; + } + } + + mutex_enter(&dn->dn_mtx); + if (dn->dn_free_txg || + ((flag & DNODE_MUST_BE_ALLOCATED) && dn->dn_type == DMU_OT_NONE) || + ((flag & DNODE_MUST_BE_FREE) && dn->dn_type != DMU_OT_NONE)) { + mutex_exit(&dn->dn_mtx); + dbuf_rele(db); + return (NULL); + } + mutex_exit(&dn->dn_mtx); + + if (refcount_add(&dn->dn_holds, ref) == 1) + dbuf_add_ref(db, dn); + + dnode_verify(dn); + ASSERT3P(dn->dn_dbuf, ==, db); + ASSERT3U(dn->dn_object, ==, object); + dbuf_rele(db); + + return (dn); +} + +/* + * Return held dnode if the object is allocated, NULL if not. + */ +dnode_t * +dnode_hold(objset_impl_t *os, uint64_t object, void *ref) +{ + return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, ref)); +} + +void +dnode_add_ref(dnode_t *dn, void *ref) +{ + ASSERT(refcount_count(&dn->dn_holds) > 0); + (void) refcount_add(&dn->dn_holds, ref); +} + +void +dnode_rele(dnode_t *dn, void *ref) +{ + uint64_t refs; + + refs = refcount_remove(&dn->dn_holds, ref); + /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ + if (refs == 0 && dn->dn_dbuf) + dbuf_remove_ref(dn->dn_dbuf, dn); +} + +void +dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) +{ + objset_impl_t *os = dn->dn_objset; + uint64_t txg = tx->tx_txg; + + if (IS_DNODE_DNODE(dn->dn_object)) + return; + + dnode_verify(dn); + +#ifdef ZFS_DEBUG + mutex_enter(&dn->dn_mtx); + ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg); + /* ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); */ + mutex_exit(&dn->dn_mtx); +#endif + + mutex_enter(&os->os_lock); + + /* + * If we are already marked dirty, we're done. 
+ */ + if (dn->dn_dirtyblksz[txg&TXG_MASK] > 0) { + mutex_exit(&os->os_lock); + return; + } + + ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs)); + ASSERT(dn->dn_datablksz != 0); + dn->dn_dirtyblksz[txg&TXG_MASK] = dn->dn_datablksz; + + dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", + dn->dn_object, txg); + + if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) { + list_insert_tail(&os->os_free_dnodes[txg&TXG_MASK], dn); + } else { + list_insert_tail(&os->os_dirty_dnodes[txg&TXG_MASK], dn); + } + + mutex_exit(&os->os_lock); + + /* + * The dnode maintains a hold on its containing dbuf as + * long as there are holds on it. Each instantiated child + * dbuf maintaines a hold on the dnode. When the last child + * drops its hold, the dnode will drop its hold on the + * containing dbuf. We add a "dirty hold" here so that the + * dnode will hang around after we finish processing its + * children. + */ + (void) refcount_add(&dn->dn_holds, (void *)(uintptr_t)tx->tx_txg); + + dbuf_dirty(dn->dn_dbuf, tx); + + dsl_dataset_dirty(os->os_dsl_dataset, tx); +} + +void +dnode_free(dnode_t *dn, dmu_tx_t *tx) +{ + dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg); + + /* we should be the only holder... hopefully */ + /* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */ + + mutex_enter(&dn->dn_mtx); + if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) { + mutex_exit(&dn->dn_mtx); + return; + } + dn->dn_free_txg = tx->tx_txg; + mutex_exit(&dn->dn_mtx); + + /* + * If the dnode is already dirty, it needs to be moved from + * the dirty list to the free list. 
+ */ + mutex_enter(&dn->dn_objset->os_lock); + if (dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] > 0) { + list_remove( + &dn->dn_objset->os_dirty_dnodes[tx->tx_txg&TXG_MASK], dn); + list_insert_tail( + &dn->dn_objset->os_free_dnodes[tx->tx_txg&TXG_MASK], dn); + mutex_exit(&dn->dn_objset->os_lock); + } else { + mutex_exit(&dn->dn_objset->os_lock); + dnode_setdirty(dn, tx); + } +} + +/* + * Try to change the block size for the indicated dnode. This can only + * succeed if there are no blocks allocated or dirty beyond first block + */ +int +dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db, *db_next; + int have_db0 = FALSE; + int err = ENOTSUP; + + if (size == 0) + size = SPA_MINBLOCKSIZE; + if (size > SPA_MAXBLOCKSIZE) + size = SPA_MAXBLOCKSIZE; + else + size = P2ROUNDUP(size, SPA_MINBLOCKSIZE); + + if (ibs == 0) + ibs = dn->dn_indblkshift; + + if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && + ibs == dn->dn_indblkshift) + return (0); + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + + /* Check for any allocated blocks beyond the first */ + if (dn->dn_phys->dn_maxblkid != 0) + goto end; + + /* + * Any buffers allocated for blocks beyond the first + * must be evictable/evicted, because they're the wrong size. + */ + mutex_enter(&dn->dn_dbufs_mtx); + /* + * Since we have the dn_dbufs_mtx, nothing can be + * removed from dn_dbufs. Since we have dn_struct_rwlock/w, + * nothing can be added to dn_dbufs. 
+ */ + for (db = list_head(&dn->dn_dbufs); db; db = db_next) { + db_next = list_next(&dn->dn_dbufs, db); + + if (db->db_blkid == 0) { + have_db0 = TRUE; + } else if (db->db_blkid != DB_BONUS_BLKID) { + mutex_exit(&dn->dn_dbufs_mtx); + goto end; + } + } + mutex_exit(&dn->dn_dbufs_mtx); + + /* Fast-track if there is no data in the file */ + if (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) && !have_db0) { + dnode_setdblksz(dn, size); + dn->dn_indblkshift = ibs; + dnode_setdirty(dn, tx); + /* don't need dd_dirty_mtx, dnode is already dirty */ + dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] = size; + dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; + rw_exit(&dn->dn_struct_rwlock); + return (0); + } + + /* obtain the old block */ + db = dbuf_hold(dn, 0); + + /* Not allowed to decrease the size if there is data present */ + if (size < db->db.db_size) { + dbuf_rele(db); + goto end; + } + + dbuf_new_size(db, size, tx); + + dnode_setdblksz(dn, size); + dn->dn_indblkshift = ibs; + /* don't need dd_dirty_mtx, dnode is already dirty */ + dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] = size; + dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; + dbuf_rele(db); + + err = 0; +end: + rw_exit(&dn->dn_struct_rwlock); + return (err); +} + +uint64_t +dnode_max_nonzero_offset(dnode_t *dn) +{ + if (dn->dn_phys->dn_maxblkid == 0 && + BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0])) + return (0); + else + return ((dn->dn_phys->dn_maxblkid+1) * dn->dn_datablksz); +} + +void +dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) +{ + uint64_t txgoff = tx->tx_txg & TXG_MASK; + int drop_struct_lock = FALSE; + int epbs, old_nlevels, new_nlevels; + uint64_t sz; + + if (blkid == DB_BONUS_BLKID) + return; + + if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + drop_struct_lock = TRUE; + } + + if (blkid > dn->dn_maxblkid) + dn->dn_maxblkid = blkid; + + /* + * Compute the number of levels necessary to support the + * new blkid. 
+ */ + new_nlevels = 1; + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + + for (sz = dn->dn_nblkptr; sz <= blkid && sz >= dn->dn_nblkptr; + sz <<= epbs) + new_nlevels++; + old_nlevels = dn->dn_nlevels; + + if (new_nlevels > dn->dn_next_nlevels[txgoff]) + dn->dn_next_nlevels[txgoff] = new_nlevels; + + if (new_nlevels > old_nlevels) { + dprintf("dn %p increasing nlevels from %u to %u\n", + dn, dn->dn_nlevels, new_nlevels); + dn->dn_nlevels = new_nlevels; + } + + /* + * Dirty the left indirects. + * Note: the caller should have just dnode_use_space()'d one + * data block's worth, so we could subtract that out of + * dn_inflight_data to determine if there is any dirty data + * besides this block. + * We don't strictly need to dirty them unless there's + * *something* in the object (eg. on disk or dirty)... + */ + if (new_nlevels > old_nlevels) { + dmu_buf_impl_t *db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); + dprintf("dn %p dirtying left indirects\n", dn); + dbuf_dirty(db, tx); + dbuf_remove_ref(db, FTAG); + } +#ifdef ZFS_DEBUG + else if (old_nlevels > 1 && new_nlevels > old_nlevels) { + dmu_buf_impl_t *db; + int i; + + for (i = 0; i < dn->dn_nblkptr; i++) { + db = dbuf_hold_level(dn, old_nlevels-1, i, FTAG); + ASSERT(! 
+ list_link_active(&db->db_dirty_node[txgoff])); + dbuf_remove_ref(db, FTAG); + } + } +#endif + + dprintf("dn %p done\n", dn); + +out: + if (drop_struct_lock) + rw_exit(&dn->dn_struct_rwlock); +} + +void +dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) +{ + avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK]; + avl_index_t where; + free_range_t *rp; + free_range_t rp_tofind; + uint64_t endblk = blkid + nblks; + + ASSERT(MUTEX_HELD(&dn->dn_mtx)); + ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */ + + dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", + blkid, nblks, tx->tx_txg); + rp_tofind.fr_blkid = blkid; + rp = avl_find(tree, &rp_tofind, &where); + if (rp == NULL) + rp = avl_nearest(tree, where, AVL_BEFORE); + if (rp == NULL) + rp = avl_nearest(tree, where, AVL_AFTER); + + while (rp && (rp->fr_blkid <= blkid + nblks)) { + uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks; + free_range_t *nrp = AVL_NEXT(tree, rp); + + if (blkid <= rp->fr_blkid && endblk >= fr_endblk) { + /* clear this entire range */ + avl_remove(tree, rp); + kmem_free(rp, sizeof (free_range_t)); + } else if (blkid <= rp->fr_blkid && + endblk > rp->fr_blkid && endblk < fr_endblk) { + /* clear the beginning of this range */ + rp->fr_blkid = endblk; + rp->fr_nblks = fr_endblk - endblk; + } else if (blkid > rp->fr_blkid && blkid < fr_endblk && + endblk >= fr_endblk) { + /* clear the end of this range */ + rp->fr_nblks = blkid - rp->fr_blkid; + } else if (blkid > rp->fr_blkid && endblk < fr_endblk) { + /* clear a chunk out of this range */ + free_range_t *new_rp = + kmem_alloc(sizeof (free_range_t), KM_SLEEP); + + new_rp->fr_blkid = endblk; + new_rp->fr_nblks = fr_endblk - endblk; + avl_insert_here(tree, new_rp, rp, AVL_AFTER); + rp->fr_nblks = blkid - rp->fr_blkid; + } + /* there may be no overlap */ + rp = nrp; + } +} + +void +dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db; + uint64_t start, objsize, blkid, 
nblks; + int blkshift, blksz, tail, head, epbs; + int trunc = FALSE; + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + blksz = dn->dn_datablksz; + blkshift = dn->dn_datablkshift; + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + + /* If the range is past the end of the file, this is a no-op */ + objsize = blksz * (dn->dn_maxblkid+1); + if (off >= objsize) + goto out; + if (len == -1ULL) { + len = UINT64_MAX - off; + trunc = TRUE; + } + + /* + * First, block align the region to free: + */ + if (dn->dn_maxblkid == 0) { + if (off == 0) { + head = 0; + } else { + head = blksz - off; + ASSERT3U(head, >, 0); + } + start = off; + } else { + ASSERT(ISP2(blksz)); + head = P2NPHASE(off, blksz); + start = P2PHASE(off, blksz); + } + /* zero out any partial block data at the start of the range */ + if (head) { + ASSERT3U(start + head, ==, blksz); + if (len < head) + head = len; + if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE, + FTAG, &db) == 0) { + caddr_t data; + + /* don't dirty if it isn't on disk and isn't dirty */ + if (db->db_dirtied || + (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { + rw_exit(&dn->dn_struct_rwlock); + dbuf_will_dirty(db, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + data = db->db.db_data; + bzero(data + start, head); + } + dbuf_remove_ref(db, FTAG); + } + off += head; + len -= head; + } + /* If the range was less than one block, we are done */ + if (len == 0) + goto out; + + /* If the remaining range is past the end of the file, we are done */ + if (off > dn->dn_maxblkid << blkshift) + goto out; + + if (off + len == UINT64_MAX) + tail = 0; + else + tail = P2PHASE(len, blksz); + + ASSERT3U(P2PHASE(off, blksz), ==, 0); + /* zero out any partial block data at the end of the range */ + if (tail) { + if (len < tail) + tail = len; + if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), + TRUE, FTAG, &db) == 0) { + /* don't dirty if it isn't on disk and isn't dirty */ + if (db->db_dirtied || + (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) 
{ + rw_exit(&dn->dn_struct_rwlock); + dbuf_will_dirty(db, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + bzero(db->db.db_data, tail); + } + dbuf_remove_ref(db, FTAG); + } + len -= tail; + } + /* If the range did not include a full block, we are done */ + if (len == 0) + goto out; + + /* dirty the left indirects */ + if (dn->dn_nlevels > 1 && off != 0) { + db = dbuf_hold_level(dn, 1, + (off - head) >> (blkshift + epbs), FTAG); + dbuf_will_dirty(db, tx); + dbuf_remove_ref(db, FTAG); + } + + /* dirty the right indirects */ + if (dn->dn_nlevels > 1 && !trunc) { + db = dbuf_hold_level(dn, 1, + (off + len + tail - 1) >> (blkshift + epbs), FTAG); + dbuf_will_dirty(db, tx); + dbuf_remove_ref(db, FTAG); + } + + /* + * Finally, add this range to the dnode range list, we + * will finish up this free operation in the syncing phase. + */ + ASSERT(IS_P2ALIGNED(off, 1<<blkshift)); + ASSERT(off + len == UINT64_MAX || IS_P2ALIGNED(len, 1<<blkshift)); + blkid = off >> blkshift; + nblks = len >> blkshift; + + if (trunc) + dn->dn_maxblkid = (blkid ? 
blkid - 1 : 0); + + mutex_enter(&dn->dn_mtx); + dnode_clear_range(dn, blkid, nblks, tx); + { + free_range_t *rp, *found; + avl_index_t where; + avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK]; + + /* Add new range to dn_ranges */ + rp = kmem_alloc(sizeof (free_range_t), KM_SLEEP); + rp->fr_blkid = blkid; + rp->fr_nblks = nblks; + found = avl_find(tree, rp, &where); + ASSERT(found == NULL); + avl_insert(tree, rp, where); + dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", + blkid, nblks, tx->tx_txg); + } + mutex_exit(&dn->dn_mtx); + + dbuf_free_range(dn, blkid, nblks, tx); + dnode_setdirty(dn, tx); +out: + rw_exit(&dn->dn_struct_rwlock); +} + +/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */ +uint64_t +dnode_block_freed(dnode_t *dn, uint64_t blkid) +{ + free_range_t range_tofind; + void *dp = spa_get_dsl(dn->dn_objset->os_spa); + int i; + + if (blkid == DB_BONUS_BLKID) + return (FALSE); + + /* + * If we're in the process of opening the pool, dp will not be + * set yet, but there shouldn't be anything dirty. + */ + if (dp == NULL) + return (FALSE); + + if (dn->dn_free_txg) + return (TRUE); + + /* + * If dn_datablkshift is not set, then there's only a single + * block, in which case there will never be a free range so it + * won't matter. 
+ */ + range_tofind.fr_blkid = blkid; + mutex_enter(&dn->dn_mtx); + for (i = 0; i < TXG_SIZE; i++) { + free_range_t *range_found; + avl_index_t idx; + + range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx); + if (range_found) { + ASSERT(range_found->fr_nblks > 0); + break; + } + range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE); + if (range_found && + range_found->fr_blkid + range_found->fr_nblks > blkid) + break; + } + mutex_exit(&dn->dn_mtx); + return (i < TXG_SIZE); +} + +/* call from syncing context when we actually write/free space for this dnode */ +void +dnode_diduse_space(dnode_t *dn, int64_t space) +{ + uint64_t sectors; + + dprintf_dnode(dn, "dn=%p dnp=%p secphys=%llu space=%lld\n", + dn, dn->dn_phys, + (u_longlong_t)dn->dn_phys->dn_secphys, + (longlong_t)space); + + ASSERT(P2PHASE(space, 1<<DEV_BSHIFT) == 0); + + mutex_enter(&dn->dn_mtx); + if (space > 0) { + sectors = space >> DEV_BSHIFT; + ASSERT3U(dn->dn_phys->dn_secphys + sectors, >=, + dn->dn_phys->dn_secphys); + dn->dn_phys->dn_secphys += sectors; + } else { + sectors = -space >> DEV_BSHIFT; + ASSERT3U(dn->dn_phys->dn_secphys, >=, sectors); + dn->dn_phys->dn_secphys -= sectors; + } + mutex_exit(&dn->dn_mtx); +} + +/* + * Call when we think we're going to write/free space in open context. + * Be conservative (ie. OK to write less than this or free more than + * this, but don't write more or free less). 
+ */ +void +dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) +{ + objset_impl_t *os = dn->dn_objset; + dsl_dataset_t *ds = os->os_dsl_dataset; + + if (space > 0) + space = spa_get_asize(os->os_spa, space); + + if (ds) + dsl_dir_willuse_space(ds->ds_dir, space, tx); + + dmu_tx_willuse_space(tx, space); +} + +static int +dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, + int lvl, uint64_t blkfill) +{ + dmu_buf_impl_t *db = NULL; + void *data = NULL; + uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + uint64_t epb = 1ULL << epbs; + uint64_t minfill, maxfill; + int i, error, span; + + dprintf("probing object %llu offset %llx level %d of %u\n", + dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels); + + if (lvl == dn->dn_phys->dn_nlevels) { + error = 0; + epb = dn->dn_phys->dn_nblkptr; + data = dn->dn_phys->dn_blkptr; + } else { + uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl); + error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db); + if (error) { + if (error == ENOENT) + return (hole ? 0 : ESRCH); + return (error); + } + dbuf_read_havestruct(db); + data = db->db.db_data; + } + + if (lvl == 0) { + dnode_phys_t *dnp = data; + span = DNODE_SHIFT; + ASSERT(dn->dn_type == DMU_OT_DNODE); + + for (i = (*offset >> span) & (blkfill - 1); i < blkfill; i++) { + if (!dnp[i].dn_type == hole) + break; + *offset += 1ULL << span; + } + if (i == blkfill) + error = ESRCH; + } else { + blkptr_t *bp = data; + span = (lvl - 1) * epbs + dn->dn_datablkshift; + minfill = 0; + maxfill = blkfill << ((lvl - 1) * epbs); + + if (hole) + maxfill--; + else + minfill++; + + for (i = (*offset >> span) & ((1ULL << epbs) - 1); + i < epb; i++) { + if (bp[i].blk_fill >= minfill && + bp[i].blk_fill <= maxfill) + break; + *offset += 1ULL << span; + } + if (i >= epb) + error = ESRCH; + } + + if (db) + dbuf_remove_ref(db, FTAG); + + return (error); +} + +/* + * Find the next hole, data, or sparse region at or after *offset. 
+ * The value 'blkfill' tells us how many items we expect to find + * in an L0 data block; this value is 1 for normal objects, + * DNODES_PER_BLOCK for the meta dnode, and some fraction of + * DNODES_PER_BLOCK when searching for sparse regions thereof. + * Examples: + * + * dnode_next_offset(dn, hole, offset, 1, 1); + * Finds the next hole/data in a file. + * Used in dmu_offset_next(). + * + * dnode_next_offset(mdn, hole, offset, 0, DNODES_PER_BLOCK); + * Finds the next free/allocated dnode an objset's meta-dnode. + * Used in dmu_object_next(). + * + * dnode_next_offset(mdn, TRUE, offset, 2, DNODES_PER_BLOCK >> 2); + * Finds the next L2 meta-dnode bp that's at most 1/4 full. + * Used in dmu_object_alloc(). + */ +int +dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *offset, + int minlvl, uint64_t blkfill) +{ + int lvl, maxlvl; + int error = 0; + uint64_t initial_offset = *offset; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + + if (dn->dn_phys->dn_nlevels == 0) { + rw_exit(&dn->dn_struct_rwlock); + return (ESRCH); + } + + if (dn->dn_datablkshift == 0) { + if (*offset < dn->dn_datablksz) { + if (hole) + *offset = dn->dn_datablksz; + } else { + error = ESRCH; + } + rw_exit(&dn->dn_struct_rwlock); + return (error); + } + + maxlvl = dn->dn_phys->dn_nlevels; + + for (lvl = minlvl; lvl <= maxlvl; lvl++) { + error = dnode_next_offset_level(dn, hole, offset, lvl, blkfill); + if (error == 0) + break; + } + + while (--lvl >= minlvl && error == 0) + error = dnode_next_offset_level(dn, hole, offset, lvl, blkfill); + + rw_exit(&dn->dn_struct_rwlock); + + if (initial_offset > *offset) + return (ESRCH); + + return (error); +} diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c new file mode 100644 index 0000000000..56fc3e19ae --- /dev/null +++ b/usr/src/uts/common/fs/zfs/dnode_sync.c @@ -0,0 +1,560 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and 
Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dataset.h> +#include <sys/spa.h> +#include <sys/zio.h> + + +static void +dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db; + int i; + uint64_t txg = tx->tx_txg; + + ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); + ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); + /* this dnode can't be paged out because it's dirty */ + + db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG); + for (i = 0; i < dn->dn_phys->dn_nblkptr; i++) + if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i])) + break; + if (i != dn->dn_phys->dn_nblkptr) { + ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK])); + + dbuf_read_havestruct(db); + arc_release(db->db_buf, db); + /* copy dnode's block pointers to new indirect block */ + ASSERT3U(sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr, <=, + db->db.db_size); + bcopy(dn->dn_phys->dn_blkptr, db->db.db_data, + sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr); + } + + 
dn->dn_phys->dn_nlevels += 1; + dprintf("os=%p obj=%llu, increase to %d\n", + dn->dn_objset, dn->dn_object, + dn->dn_phys->dn_nlevels); + + /* set dbuf's parent pointers to new indirect buf */ + for (i = 0; i < dn->dn_phys->dn_nblkptr; i++) { + dmu_buf_impl_t *child = + dbuf_find(dn, dn->dn_phys->dn_nlevels-2, i); + if (child == NULL) + continue; + if (child->db_dnode == NULL) { + mutex_exit(&child->db_mtx); + continue; + } + + if (child->db_parent == NULL || + child->db_parent == dn->dn_dbuf) { + dprintf_dbuf_bp(child, child->db_blkptr, + "changing db_blkptr to new indirect %s", ""); + child->db_parent = db; + dbuf_add_ref(db, child); + if (db->db.db_data) { + child->db_blkptr = + (blkptr_t *)db->db.db_data + i; + } else { + child->db_blkptr = NULL; + } + dprintf_dbuf_bp(child, child->db_blkptr, + "changed db_blkptr to new indirect %s", ""); + } + ASSERT3P(child->db_parent, ==, db); + + mutex_exit(&child->db_mtx); + } + + bzero(dn->dn_phys->dn_blkptr, + sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr); + + dbuf_remove_ref(db, FTAG); +} + +static void +free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) +{ + objset_impl_t *os = dn->dn_objset; + uint64_t bytesfreed = 0; + int i; + + dprintf("os=%p obj=%llx num=%d\n", os, dn->dn_object, num); + + for (i = 0; i < num; i++, bp++) { + if (BP_IS_HOLE(bp)) + continue; + + bytesfreed += BP_GET_ASIZE(bp); + ASSERT3U(bytesfreed >> DEV_BSHIFT, <=, dn->dn_phys->dn_secphys); + dsl_dataset_block_kill(os->os_dsl_dataset, bp, tx); + } + dnode_diduse_space(dn, -bytesfreed); +} + +static void +free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) +{ +#ifdef ZFS_DEBUG + int off, num; + int i, err, epbs; + uint64_t txg = tx->tx_txg; + + epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + off = start - (db->db_blkid * 1<<epbs); + num = end - start + 1; + + ASSERT3U(off, >=, 0); + ASSERT3U(num, >=, 0); + ASSERT3U(db->db_level, >, 0); + ASSERT3U(db->db.db_size, ==, 
1<<db->db_dnode->dn_phys->dn_indblkshift); + ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT); + ASSERT(db->db_blkptr != NULL); + + for (i = off; i < off+num; i++) { + uint64_t *buf; + int j; + dmu_buf_impl_t *child; + + ASSERT(db->db_level == 1); + + rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); + err = dbuf_hold_impl(db->db_dnode, db->db_level-1, + (db->db_blkid << epbs) + i, TRUE, FTAG, &child); + rw_exit(&db->db_dnode->dn_struct_rwlock); + if (err == ENOENT) + continue; + ASSERT(err == 0); + ASSERT(child->db_level == 0); + ASSERT(!list_link_active(&child->db_dirty_node[txg&TXG_MASK])); + + /* db_data_old better be zeroed */ + if (child->db_d.db_data_old[txg & TXG_MASK]) { + buf = (child->db_d.db_data_old[txg & TXG_MASK])->b_data; + for (j = 0; j < child->db.db_size >> 3; j++) { + if (buf[j] != 0) { + panic("freed data not zero: " + "child=%p i=%d off=%d num=%d\n", + child, i, off, num); + } + } + } + + /* + * db_data better be zeroed unless it's dirty in a + * future txg. 
+ */ + mutex_enter(&child->db_mtx); + buf = child->db.db_data; + if (buf != NULL && child->db_state != DB_FILL && + !list_link_active(&child->db_dirty_node + [(txg+1) & TXG_MASK]) && + !list_link_active(&child->db_dirty_node + [(txg+2) & TXG_MASK])) { + for (j = 0; j < child->db.db_size >> 3; j++) { + if (buf[j] != 0) { + panic("freed data not zero: " + "child=%p i=%d off=%d num=%d\n", + child, i, off, num); + } + } + } + mutex_exit(&child->db_mtx); + + dbuf_remove_ref(child, FTAG); + } +#endif +} + +static int +free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, + dmu_tx_t *tx) +{ + dnode_t *dn = db->db_dnode; + blkptr_t *bp; + dmu_buf_impl_t *subdb; + uint64_t start, end, dbstart, dbend, i; + int epbs, shift, err; + int txg_index = tx->tx_txg&TXG_MASK; + int all = TRUE; + + dbuf_read(db); + arc_release(db->db_buf, db); + bp = (blkptr_t *)db->db.db_data; + + epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + shift = (db->db_level - 1) * epbs; + dbstart = db->db_blkid << epbs; + start = blkid >> shift; + if (dbstart < start) { + bp += start - dbstart; + all = FALSE; + } else { + start = dbstart; + } + dbend = ((db->db_blkid + 1) << epbs) - 1; + end = (blkid + nblks - 1) >> shift; + if (dbend <= end) + end = dbend; + else if (all) + all = trunc; + ASSERT3U(start, <=, end); + + if (db->db_level == 1) { + free_verify(db, start, end, tx); + free_blocks(dn, bp, end-start+1, tx); + ASSERT(all || list_link_active(&db->db_dirty_node[txg_index])); + return (all); + } + + for (i = start; i <= end; i++, bp++) { + if (BP_IS_HOLE(bp)) + continue; + rw_enter(&dn->dn_struct_rwlock, RW_READER); + err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb); + ASSERT3U(err, ==, 0); + rw_exit(&dn->dn_struct_rwlock); + + if (free_children(subdb, blkid, nblks, trunc, tx)) { + ASSERT3P(subdb->db_blkptr, ==, bp); + free_blocks(dn, bp, 1, tx); + } + dbuf_remove_ref(subdb, FTAG); + } +#ifdef ZFS_DEBUG + bp -= (end-start)+1; + for (i = start; i <= 
end; i++, bp++) { + if (i == start && blkid != 0) + continue; + else if (i == end && !trunc) + continue; + ASSERT3U(bp->blk_birth, ==, 0); + } +#endif + ASSERT(all || list_link_active(&db->db_dirty_node[txg_index])); + return (all); +} + +/* + * free_range: Traverse the indicated range of the provided file + * and "free" all the blocks contained there. + */ +static void +dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) +{ + blkptr_t *bp = dn->dn_phys->dn_blkptr; + dmu_buf_impl_t *db; + int trunc, start, end, shift, i, err; + int dnlevel = dn->dn_phys->dn_nlevels; + + if (blkid > dn->dn_phys->dn_maxblkid) + return; + + ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX); + trunc = blkid + nblks > dn->dn_phys->dn_maxblkid; + if (trunc) + nblks = dn->dn_phys->dn_maxblkid - blkid + 1; + + /* There are no indirect blocks in the object */ + if (dnlevel == 1) { + if (blkid >= dn->dn_phys->dn_nblkptr) { + /* this range was never made persistent */ + return; + } + ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr); + free_blocks(dn, bp + blkid, nblks, tx); + if (trunc) { + uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * + (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); + dn->dn_phys->dn_maxblkid = (blkid ? 
blkid - 1 : 0); + ASSERT(off < dn->dn_phys->dn_maxblkid || + dn->dn_phys->dn_maxblkid == 0 || + dnode_next_offset(dn, FALSE, &off, 1, 1) == ESRCH); + } + return; + } + + shift = (dnlevel - 1) * (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT); + start = blkid >> shift; + ASSERT(start < dn->dn_phys->dn_nblkptr); + end = (blkid + nblks - 1) >> shift; + bp += start; + for (i = start; i <= end; i++, bp++) { + if (BP_IS_HOLE(bp)) + continue; + rw_enter(&dn->dn_struct_rwlock, RW_READER); + err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db); + ASSERT3U(err, ==, 0); + rw_exit(&dn->dn_struct_rwlock); + + if (free_children(db, blkid, nblks, trunc, tx)) { + ASSERT3P(db->db_blkptr, ==, bp); + free_blocks(dn, bp, 1, tx); + } + dbuf_remove_ref(db, FTAG); + } + if (trunc) { + uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * + (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); + dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0); + ASSERT(off < dn->dn_phys->dn_maxblkid || + dn->dn_phys->dn_maxblkid == 0 || + dnode_next_offset(dn, FALSE, &off, 1, 1) == ESRCH); + } +} + +static int +dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db; + int txgoff = tx->tx_txg & TXG_MASK; + + ASSERT(dmu_tx_is_syncing(tx)); + + /* Undirty all buffers */ + while (db = list_head(&dn->dn_dirty_dbufs[txgoff])) { + mutex_enter(&db->db_mtx); + /* XXX - use dbuf_undirty()? */ + list_remove(&dn->dn_dirty_dbufs[txgoff], db); + if (db->db_level == 0) { + ASSERT3P(db->db_d.db_data_old[txgoff], ==, db->db_buf); + if (db->db_d.db_overridden_by[txgoff]) + dbuf_unoverride(db, tx->tx_txg); + db->db_d.db_data_old[txgoff] = NULL; + } + db->db_dirtycnt -= 1; + mutex_exit(&db->db_mtx); + dbuf_remove_ref(db, (void *)(uintptr_t)tx->tx_txg); + } + + ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); + + /* Undirty next bits */ + dn->dn_next_nlevels[txgoff] = 0; + dn->dn_next_indblkshift[txgoff] = 0; + + /* free up all the blocks in the file. 
*/ + dbuf_free_range(dn, 0, -1, tx); + dnode_sync_free_range(dn, 0, dn->dn_phys->dn_maxblkid+1, tx); + ASSERT3U(dn->dn_phys->dn_secphys, ==, 0); + + /* + * All dbufs should be gone, since all holds are gone... + */ + ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); + + /* ASSERT(blkptrs are zero); */ + ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); + ASSERT(dn->dn_type != DMU_OT_NONE); + + ASSERT(dn->dn_free_txg > 0); + if (dn->dn_allocated_txg != dn->dn_free_txg) + dbuf_will_dirty(dn->dn_dbuf, tx); + bzero(dn->dn_phys, sizeof (dnode_phys_t)); + + mutex_enter(&dn->dn_mtx); + dn->dn_type = DMU_OT_NONE; + dn->dn_dirtyblksz[txgoff] = 0; + dn->dn_maxblkid = 0; + dn->dn_allocated_txg = 0; + mutex_exit(&dn->dn_mtx); + + ASSERT(!IS_DNODE_DNODE(dn->dn_object)); + + dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); + /* + * Now that we've released our hold, the dnode may + * be evicted, so we musn't access it. + */ + return (1); +} + +/* + * Write out the dnode's dirty buffers at the specified level. + * This may create more dirty buffers at the next level up. + * + * NOTE: The dnode is kept in memory by being dirty. Once the + * dirty bit is cleared, it may be evicted. Beware of this! + */ +int +dnode_sync(dnode_t *dn, int level, zio_t *zio, dmu_tx_t *tx) +{ + free_range_t *rp; + int txgoff = tx->tx_txg & TXG_MASK; + dnode_phys_t *dnp = dn->dn_phys; + + /* ASSERT(dn->dn_objset->dd_snapshot == NULL); */ + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(IS_DNODE_DNODE(dn->dn_object) || + dn->dn_dirtyblksz[txgoff] > 0); + + ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); + dnode_verify(dn); + /* + * Make sure the dbuf for the dn_phys is released before we modify it. 
+ */ + if (dn->dn_dbuf) + arc_release(dn->dn_dbuf->db_buf, dn->dn_dbuf); + + mutex_enter(&dn->dn_mtx); + if (dn->dn_allocated_txg == tx->tx_txg) { + /* The dnode is newly allocated or reallocated */ + if (dnp->dn_type == DMU_OT_NONE) { + /* this is a first alloc, not a realloc */ + /* XXX shouldn't the phys already be zeroed? */ + bzero(dnp, DNODE_CORE_SIZE); + dnp->dn_datablkszsec = dn->dn_datablkszsec; + dnp->dn_indblkshift = dn->dn_indblkshift; + dnp->dn_nlevels = 1; + } + + if (dn->dn_nblkptr > dnp->dn_nblkptr) { + /* zero the new blkptrs we are gaining */ + bzero(dnp->dn_blkptr + dnp->dn_nblkptr, + sizeof (blkptr_t) * + (dn->dn_nblkptr - dnp->dn_nblkptr)); + } + dnp->dn_type = dn->dn_type; + dnp->dn_bonustype = dn->dn_bonustype; + dnp->dn_bonuslen = dn->dn_bonuslen; + dnp->dn_nblkptr = dn->dn_nblkptr; + } + + if (dn->dn_dirtyblksz[txgoff]) { + ASSERT(P2PHASE(dn->dn_dirtyblksz[txgoff], + SPA_MINBLOCKSIZE) == 0); + dnp->dn_datablkszsec = + dn->dn_dirtyblksz[txgoff] >> SPA_MINBLOCKSHIFT; + } + + if (dn->dn_next_indblkshift[txgoff]) { + ASSERT(dnp->dn_nlevels == 1); + dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff]; + dn->dn_next_indblkshift[txgoff] = 0; + } + + /* + * Just take the live (open-context) values for checksum and compress. + * Strictly speaking it's a future leak, but nothing bad happens if we + * start using the new checksum or compress algorithm a little early. 
+ */ + dnp->dn_checksum = dn->dn_checksum; + dnp->dn_compress = dn->dn_compress; + + mutex_exit(&dn->dn_mtx); + + /* process all the "freed" ranges in the file */ + if (dn->dn_free_txg == 0 || dn->dn_free_txg > tx->tx_txg) { + for (rp = avl_first(&dn->dn_ranges[txgoff]); rp != NULL; + rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp)) + dnode_sync_free_range(dn, + rp->fr_blkid, rp->fr_nblks, tx); + } + mutex_enter(&dn->dn_mtx); + for (rp = avl_first(&dn->dn_ranges[txgoff]); rp; ) { + free_range_t *last = rp; + rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp); + avl_remove(&dn->dn_ranges[txgoff], last); + kmem_free(last, sizeof (free_range_t)); + } + mutex_exit(&dn->dn_mtx); + + if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) { + ASSERT3U(level, ==, 0); + return (dnode_sync_free(dn, tx)); + } + + if (dn->dn_next_nlevels[txgoff]) { + int new_lvl = dn->dn_next_nlevels[txgoff]; + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + while (new_lvl > dnp->dn_nlevels) + dnode_increase_indirection(dn, tx); + rw_exit(&dn->dn_struct_rwlock); + dn->dn_next_nlevels[txgoff] = 0; + } + + if (level == dnp->dn_nlevels) { + uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * + (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); + + /* we've already synced out all data and indirect blocks */ + /* there are no more dirty dbufs under this dnode */ + ASSERT3P(list_head(&dn->dn_dirty_dbufs[txgoff]), ==, NULL); + ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= tx->tx_txg); + + /* XXX this is expensive. remove once 6343073 is closed. */ + /* NB: the "off < maxblkid" is to catch overflow */ + /* + * NB: if blocksize is changing, we could get confused, + * so only bother if there are multiple blocks and thus + * it can't be changing. 
+ */ + ASSERT(off < dn->dn_phys->dn_maxblkid || + dn->dn_phys->dn_maxblkid == 0 || + dnode_next_offset(dn, FALSE, &off, 1, 1) == ESRCH); + + dn->dn_dirtyblksz[txgoff] = 0; + + + if (!IS_DNODE_DNODE(dn->dn_object)) { + dbuf_will_dirty(dn->dn_dbuf, tx); + dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); + } + + /* + * Now that we've dropped the reference, the dnode may + * be evicted, so we musn't access it. + */ + return (1); + } else { + dmu_buf_impl_t *db, *db_next; + list_t *list = &dn->dn_dirty_dbufs[txgoff]; + /* + * Iterate over the list, removing and sync'ing dbufs + * which are on the level we want, and leaving others. + */ + for (db = list_head(list); db; db = db_next) { + db_next = list_next(list, db); + if (db->db_level == level) { + list_remove(list, db); + dbuf_sync(db, zio, tx); + } + } + return (0); + } +} diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c new file mode 100644 index 0000000000..ab8dcfc3e3 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c @@ -0,0 +1,1463 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 
+ * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu_objset.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dmu_traverse.h> +#include <sys/dmu_tx.h> +#include <sys/arc.h> +#include <sys/zio.h> +#include <sys/zap.h> +#include <sys/unique.h> +#include <sys/zfs_context.h> + +#define DOS_REF_MAX (1ULL << 62) + +#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE + +#define BP_GET_UCSIZE(bp) \ + ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \ + BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)); + +/* + * We use weighted reference counts to express the various forms of exclusion + * between different open modes. A STANDARD open is 1 point, an EXCLUSIVE open + * is DOS_REF_MAX, and a PRIMARY open is little more than half of an EXCLUSIVE. + * This makes the exclusion logic simple: the total refcnt for all opens cannot + * exceed DOS_REF_MAX. For example, EXCLUSIVE opens are exclusive because their + * weight (DOS_REF_MAX) consumes the entire refcnt space. PRIMARY opens consume + * just over half of the refcnt space, so there can't be more than one, but it + * can peacefully coexist with any number of STANDARD opens. 
+ */ +static uint64_t ds_refcnt_weight[DS_MODE_LEVELS] = { + 0, /* DOS_MODE_NONE - invalid */ + 1, /* DOS_MODE_STANDARD - unlimited number */ + (DOS_REF_MAX >> 1) + 1, /* DOS_MODE_PRIMARY - only one of these */ + DOS_REF_MAX /* DOS_MODE_EXCLUSIVE - no other opens */ +}; + + +void +dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) +{ + int used = BP_GET_ASIZE(bp); + int compressed = BP_GET_PSIZE(bp); + int uncompressed = BP_GET_UCSIZE(bp); + + dprintf_bp(bp, "born, ds=%p\n", ds); + + ASSERT(dmu_tx_is_syncing(tx)); + /* It could have been compressed away to nothing */ + if (BP_IS_HOLE(bp)) + return; + ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE); + ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES); + if (ds == NULL) { + /* + * Account for the meta-objset space in its placeholder + * dsl_dir. + */ + ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */ + dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, + used, compressed, uncompressed, tx); + dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); + return; + } + dmu_buf_will_dirty(ds->ds_dbuf, tx); + mutex_enter(&ds->ds_lock); + ds->ds_phys->ds_used_bytes += used; + ds->ds_phys->ds_compressed_bytes += compressed; + ds->ds_phys->ds_uncompressed_bytes += uncompressed; + ds->ds_phys->ds_unique_bytes += used; + mutex_exit(&ds->ds_lock); + dsl_dir_diduse_space(ds->ds_dir, + used, compressed, uncompressed, tx); +} + +void +dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) +{ + int used = BP_GET_ASIZE(bp); + int compressed = BP_GET_PSIZE(bp); + int uncompressed = BP_GET_UCSIZE(bp); + + ASSERT(dmu_tx_is_syncing(tx)); + if (BP_IS_HOLE(bp)) + return; + + ASSERT(used > 0); + if (ds == NULL) { + /* + * Account for the meta-objset space in its placeholder + * dataset. + */ + /* XXX this can fail, what do we do when it does? 
*/ + (void) arc_free(NULL, tx->tx_pool->dp_spa, + tx->tx_txg, bp, NULL, NULL, ARC_WAIT); + bzero(bp, sizeof (blkptr_t)); + + dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, + -used, -compressed, -uncompressed, tx); + dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); + return; + } + ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + + if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) { + dprintf_bp(bp, "freeing: %s", ""); + /* XXX check return code? */ + (void) arc_free(NULL, tx->tx_pool->dp_spa, + tx->tx_txg, bp, NULL, NULL, ARC_WAIT); + + mutex_enter(&ds->ds_lock); + /* XXX unique_bytes is not accurate for head datasets */ + /* ASSERT3U(ds->ds_phys->ds_unique_bytes, >=, used); */ + ds->ds_phys->ds_unique_bytes -= used; + mutex_exit(&ds->ds_lock); + dsl_dir_diduse_space(ds->ds_dir, + -used, -compressed, -uncompressed, tx); + } else { + dprintf_bp(bp, "putting on dead list: %s", ""); + bplist_enqueue(&ds->ds_deadlist, bp, tx); + /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ + if (ds->ds_phys->ds_prev_snap_obj != 0) { + ASSERT3U(ds->ds_prev->ds_object, ==, + ds->ds_phys->ds_prev_snap_obj); + ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0); + if (ds->ds_prev->ds_phys->ds_next_snap_obj == + ds->ds_object && + bp->blk_birth > + ds->ds_prev->ds_phys->ds_prev_snap_txg) { + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + mutex_enter(&ds->ds_prev->ds_lock); + ds->ds_prev->ds_phys->ds_unique_bytes += + used; + mutex_exit(&ds->ds_prev->ds_lock); + } + } + } + bzero(bp, sizeof (blkptr_t)); + mutex_enter(&ds->ds_lock); + ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used); + ds->ds_phys->ds_used_bytes -= used; + ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed); + ds->ds_phys->ds_compressed_bytes -= compressed; + ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed); + ds->ds_phys->ds_uncompressed_bytes -= uncompressed; + mutex_exit(&ds->ds_lock); +} + +int +dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t 
blk_birth, dmu_tx_t *tx) +{ + uint64_t prev_snap_txg; + dsl_dir_t *dd; + /* ASSERT that it is not a snapshot */ + if (ds == NULL) + return (TRUE); + /* + * The snapshot creation could fail, but that would cause an + * incorrect FALSE return, which would only result in an + * overestimation of the amount of space that an operation would + * consume, which is OK. + * + * There's also a small window where we could miss a pending + * snapshot, because we could set the sync task in the quiescing + * phase. So this should only be used as a guess. + */ + dd = ds->ds_dir; + mutex_enter(&dd->dd_lock); + if (dd->dd_sync_func == dsl_dataset_snapshot_sync && + dd->dd_sync_txg < tx->tx_txg) + prev_snap_txg = dd->dd_sync_txg; + else + prev_snap_txg = ds->ds_phys->ds_prev_snap_txg; + mutex_exit(&dd->dd_lock); + return (blk_birth > prev_snap_txg); +} + +/* ARGSUSED */ +static void +dsl_dataset_evict(dmu_buf_t *db, void *dsv) +{ + dsl_dataset_t *ds = dsv; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + /* open_refcount == DOS_REF_MAX when deleting */ + ASSERT(ds->ds_open_refcount == 0 || + ds->ds_open_refcount == DOS_REF_MAX); + + dprintf_ds(ds, "evicting %s\n", ""); + + unique_remove(ds->ds_phys->ds_fsid_guid); + + if (ds->ds_user_ptr != NULL) + ds->ds_user_evict_func(ds, ds->ds_user_ptr); + + if (ds->ds_prev) { + dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds); + ds->ds_prev = NULL; + } + + bplist_close(&ds->ds_deadlist); + dsl_dir_close(ds->ds_dir, ds); + + if (list_link_active(&ds->ds_synced_link)) + list_remove(&dp->dp_synced_objsets, ds); + + kmem_free(ds, sizeof (dsl_dataset_t)); +} + +static void +dsl_dataset_get_snapname(dsl_dataset_t *ds) +{ + dsl_dataset_phys_t *headphys; + int err; + dmu_buf_t *headdbuf; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + + if (ds->ds_snapname[0]) + return; + if (ds->ds_phys->ds_next_snap_obj == 0) + return; + + headdbuf = dmu_bonus_hold_tag(mos, + ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG); + 
dmu_buf_read(headdbuf); + headphys = headdbuf->db_data; + err = zap_value_search(dp->dp_meta_objset, + headphys->ds_snapnames_zapobj, ds->ds_object, ds->ds_snapname); + ASSERT(err == 0); + dmu_buf_rele_tag(headdbuf, FTAG); +} + +dsl_dataset_t * +dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, + int mode, void *tag) +{ + uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)]; + objset_t *mos = dp->dp_meta_objset; + dmu_buf_t *dbuf; + dsl_dataset_t *ds; + + ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || + dsl_pool_sync_context(dp)); + + dbuf = dmu_bonus_hold_tag(mos, dsobj, tag); + dmu_buf_read(dbuf); + ds = dmu_buf_get_user(dbuf); + if (ds == NULL) { + dsl_dataset_t *winner; + + ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); + ds->ds_dbuf = dbuf; + ds->ds_object = dsobj; + ds->ds_phys = dbuf->db_data; + ds->ds_dir = dsl_dir_open_obj(dp, + ds->ds_phys->ds_dir_obj, NULL, ds); + + bplist_open(&ds->ds_deadlist, + mos, ds->ds_phys->ds_deadlist_obj); + + if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) { + ds->ds_snapname[0] = '\0'; + if (ds->ds_phys->ds_prev_snap_obj) { + ds->ds_prev = + dsl_dataset_open_obj(dp, + ds->ds_phys->ds_prev_snap_obj, NULL, + DS_MODE_NONE, ds); + } + } else { + if (snapname) { +#ifdef ZFS_DEBUG + dsl_dataset_phys_t *headphys; + int err; + dmu_buf_t *headdbuf = dmu_bonus_hold_tag(mos, + ds->ds_dir->dd_phys-> + dd_head_dataset_obj, FTAG); + dmu_buf_read(headdbuf); + headphys = headdbuf->db_data; + uint64_t foundobj; + err = zap_lookup(dp->dp_meta_objset, + headphys->ds_snapnames_zapobj, + snapname, sizeof (foundobj), 1, &foundobj); + ASSERT3U(err, ==, 0); + ASSERT3U(foundobj, ==, dsobj); + dmu_buf_rele_tag(headdbuf, FTAG); +#endif + (void) strcat(ds->ds_snapname, snapname); + } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) { + dsl_dataset_get_snapname(ds); + } + } + + winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys, + dsl_dataset_evict); + if (winner) { + bplist_close(&ds->ds_deadlist); + if (ds->ds_prev) { + 
dsl_dataset_close(ds->ds_prev, + DS_MODE_NONE, ds); + } + dsl_dir_close(ds->ds_dir, ds); + kmem_free(ds, sizeof (dsl_dataset_t)); + ds = winner; + } else { + uint64_t new = + unique_insert(ds->ds_phys->ds_fsid_guid); + if (new != ds->ds_phys->ds_fsid_guid) { + /* XXX it won't necessarily be synced... */ + ds->ds_phys->ds_fsid_guid = new; + } + } + } + ASSERT3P(ds->ds_dbuf, ==, dbuf); + ASSERT3P(ds->ds_phys, ==, dbuf->db_data); + + mutex_enter(&ds->ds_lock); + if ((DS_MODE_LEVEL(mode) == DS_MODE_PRIMARY && + ds->ds_phys->ds_restoring && !DS_MODE_IS_RESTORE(mode)) || + (ds->ds_open_refcount + weight > DOS_REF_MAX)) { + mutex_exit(&ds->ds_lock); + dsl_dataset_close(ds, DS_MODE_NONE, tag); + return (NULL); + } + ds->ds_open_refcount += weight; + mutex_exit(&ds->ds_lock); + + return (ds); +} + +int +dsl_dataset_open_spa(spa_t *spa, const char *name, int mode, + void *tag, dsl_dataset_t **dsp) +{ + dsl_dir_t *dd; + dsl_pool_t *dp; + const char *tail; + uint64_t obj; + dsl_dataset_t *ds = NULL; + int err = 0; + + dd = dsl_dir_open_spa(spa, name, FTAG, &tail); + if (dd == NULL) + return (ENOENT); + + dp = dd->dd_pool; + obj = dd->dd_phys->dd_head_dataset_obj; + rw_enter(&dp->dp_config_rwlock, RW_READER); + if (obj == 0) { + /* A dataset with no associated objset */ + err = ENOENT; + goto out; + } + + if (tail != NULL) { + objset_t *mos = dp->dp_meta_objset; + + ds = dsl_dataset_open_obj(dp, obj, NULL, DS_MODE_NONE, tag); + obj = ds->ds_phys->ds_snapnames_zapobj; + dsl_dataset_close(ds, DS_MODE_NONE, tag); + ds = NULL; + + if (tail[0] != '@') { + err = ENOENT; + goto out; + } + tail++; + + /* Look for a snapshot */ + if (!DS_MODE_IS_READONLY(mode)) { + err = EROFS; + goto out; + } + dprintf("looking for snapshot '%s'\n", tail); + err = zap_lookup(mos, obj, tail, 8, 1, &obj); + if (err) + goto out; + } + ds = dsl_dataset_open_obj(dp, obj, tail, mode, tag); + if (ds == NULL) + err = EBUSY; + +out: + rw_exit(&dp->dp_config_rwlock); + dsl_dir_close(dd, FTAG); + + ASSERT3U((err 
== 0), ==, (ds != NULL)); + /* ASSERT(ds == NULL || strcmp(name, ds->ds_name) == 0); */ + + *dsp = ds; + return (err); +} + +int +dsl_dataset_open(const char *name, int mode, void *tag, dsl_dataset_t **dsp) +{ + return (dsl_dataset_open_spa(NULL, name, mode, tag, dsp)); +} + +void +dsl_dataset_name(dsl_dataset_t *ds, char *name) +{ + if (ds == NULL) { + (void) strcpy(name, "mos"); + } else { + dsl_dir_name(ds->ds_dir, name); + dsl_dataset_get_snapname(ds); + if (ds->ds_snapname[0]) { + (void) strcat(name, "@"); + if (!MUTEX_HELD(&ds->ds_lock)) { + /* + * We use a "recursive" mutex so that we + * can call dprintf_ds() with ds_lock held. + */ + mutex_enter(&ds->ds_lock); + (void) strcat(name, ds->ds_snapname); + mutex_exit(&ds->ds_lock); + } else { + (void) strcat(name, ds->ds_snapname); + } + } + } +} + +void +dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag) +{ + uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)]; + mutex_enter(&ds->ds_lock); + ASSERT3U(ds->ds_open_refcount, >=, weight); + ds->ds_open_refcount -= weight; + dprintf_ds(ds, "closing mode %u refcount now 0x%llx\n", + mode, ds->ds_open_refcount); + mutex_exit(&ds->ds_lock); + + dmu_buf_rele_tag(ds->ds_dbuf, tag); +} + +void +dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx) +{ + objset_t *mos = dp->dp_meta_objset; + dmu_buf_t *dbuf; + dsl_dataset_phys_t *dsphys; + dsl_dataset_t *ds; + uint64_t dsobj; + dsl_dir_t *dd; + + dsl_dir_create_root(mos, ddobjp, tx); + dd = dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG); + ASSERT(dd != NULL); + + dsobj = dmu_object_alloc(mos, DMU_OT_DSL_OBJSET, 0, + DMU_OT_DSL_OBJSET, sizeof (dsl_dataset_phys_t), tx); + dbuf = dmu_bonus_hold(mos, dsobj); + dmu_buf_will_dirty(dbuf, tx); + dsphys = dbuf->db_data; + dsphys->ds_dir_obj = dd->dd_object; + dsphys->ds_fsid_guid = unique_create(); + (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, + sizeof (dsphys->ds_guid)); + dsphys->ds_snapnames_zapobj = + zap_create(mos, 
DMU_OT_DSL_OBJSET_SNAP_MAP, DMU_OT_NONE, 0, tx); + dsphys->ds_creation_time = gethrestime_sec(); + dsphys->ds_creation_txg = tx->tx_txg; + dsphys->ds_deadlist_obj = + bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); + dmu_buf_rele(dbuf); + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + dd->dd_phys->dd_head_dataset_obj = dsobj; + dsl_dir_close(dd, FTAG); + + ds = dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG); + (void) dmu_objset_create_impl(dp->dp_spa, ds, DMU_OST_ZFS, tx); + dsl_dataset_close(ds, DS_MODE_NONE, FTAG); +} + +int +dsl_dataset_create_sync(dsl_dir_t *pds, const char *fullname, + const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx) +{ + int err; + dsl_pool_t *dp = pds->dd_pool; + dmu_buf_t *dbuf; + dsl_dataset_phys_t *dsphys; + uint64_t dsobj; + objset_t *mos = dp->dp_meta_objset; + dsl_dir_t *dd; + + if (clone_parent != NULL) { + /* + * You can't clone across pools. + */ + if (clone_parent->ds_dir->dd_pool != dp) + return (EXDEV); + + /* + * You can only clone snapshots, not the head datasets. 
+ */ + if (clone_parent->ds_phys->ds_num_children == 0) + return (EINVAL); + } + + ASSERT(lastname[0] != '@'); + ASSERT(dmu_tx_is_syncing(tx)); + + err = dsl_dir_create_sync(pds, lastname, tx); + if (err) + return (err); + dd = dsl_dir_open_spa(dp->dp_spa, fullname, FTAG, NULL); + ASSERT(dd != NULL); + + /* This is the point of no (unsuccessful) return */ + + dsobj = dmu_object_alloc(mos, DMU_OT_DSL_OBJSET, 0, + DMU_OT_DSL_OBJSET, sizeof (dsl_dataset_phys_t), tx); + dbuf = dmu_bonus_hold(mos, dsobj); + dmu_buf_will_dirty(dbuf, tx); + dsphys = dbuf->db_data; + dsphys->ds_dir_obj = dd->dd_object; + dsphys->ds_fsid_guid = unique_create(); + unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */ + (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, + sizeof (dsphys->ds_guid)); + dsphys->ds_snapnames_zapobj = + zap_create(mos, DMU_OT_DSL_OBJSET_SNAP_MAP, DMU_OT_NONE, 0, tx); + dsphys->ds_creation_time = gethrestime_sec(); + dsphys->ds_creation_txg = tx->tx_txg; + dsphys->ds_deadlist_obj = + bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); + if (clone_parent) { + dsphys->ds_prev_snap_obj = clone_parent->ds_object; + dsphys->ds_prev_snap_txg = + clone_parent->ds_phys->ds_creation_txg; + dsphys->ds_used_bytes = + clone_parent->ds_phys->ds_used_bytes; + dsphys->ds_compressed_bytes = + clone_parent->ds_phys->ds_compressed_bytes; + dsphys->ds_uncompressed_bytes = + clone_parent->ds_phys->ds_uncompressed_bytes; + dsphys->ds_bp = clone_parent->ds_phys->ds_bp; + + dmu_buf_will_dirty(clone_parent->ds_dbuf, tx); + clone_parent->ds_phys->ds_num_children++; + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + dd->dd_phys->dd_clone_parent_obj = clone_parent->ds_object; + } + dmu_buf_rele(dbuf); + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + dd->dd_phys->dd_head_dataset_obj = dsobj; + dsl_dir_close(dd, FTAG); + + return (0); +} + + +int +dsl_dataset_destroy(const char *name) +{ + int err; + dsl_pool_t *dp; + dsl_dir_t *dd; + const char *tail; + + dd = dsl_dir_open(name, FTAG, &tail); + if 
(dd == NULL) + return (ENOENT); + + dp = dd->dd_pool; + if (tail != NULL) { + if (tail[0] != '@') { + dsl_dir_close(dd, FTAG); + return (ENOENT); + } + tail++; + /* Just blow away the snapshot */ + do { + txg_wait_synced(dp, 0); + err = dsl_dir_sync_task(dd, + dsl_dataset_destroy_sync, (void*)tail, 0); + } while (err == EAGAIN); + dsl_dir_close(dd, FTAG); + } else { + char buf[MAXNAMELEN]; + char *cp; + + dsl_dir_t *pds; + if (dd->dd_phys->dd_parent_obj == 0) { + dsl_dir_close(dd, FTAG); + return (EINVAL); + } + /* + * Make sure it's not dirty before we destroy it. + */ + txg_wait_synced(dd->dd_pool, 0); + /* + * Blow away the dsl_dir + head dataset. + * dsl_dir_destroy_sync() will call + * dsl_dataset_destroy_sync() to destroy the head dataset. + */ + rw_enter(&dp->dp_config_rwlock, RW_READER); + pds = dsl_dir_open_obj(dd->dd_pool, + dd->dd_phys->dd_parent_obj, NULL, FTAG); + dsl_dir_close(dd, FTAG); + rw_exit(&dp->dp_config_rwlock); + + (void) strcpy(buf, name); + cp = strrchr(buf, '/') + 1; + ASSERT(cp[0] != '\0'); + do { + txg_wait_synced(dp, 0); + err = dsl_dir_sync_task(pds, + dsl_dir_destroy_sync, cp, 0); + } while (err == EAGAIN); + dsl_dir_close(pds, FTAG); + } + + return (err); +} + +int +dsl_dataset_rollback(const char *name) +{ + int err; + dsl_dir_t *dd; + const char *tail; + + dd = dsl_dir_open(name, FTAG, &tail); + if (dd == NULL) + return (ENOENT); + + if (tail != NULL) { + dsl_dir_close(dd, FTAG); + return (EINVAL); + } + do { + txg_wait_synced(dd->dd_pool, 0); + err = dsl_dir_sync_task(dd, + dsl_dataset_rollback_sync, NULL, 0); + } while (err == EAGAIN); + dsl_dir_close(dd, FTAG); + + return (err); +} + +void * +dsl_dataset_set_user_ptr(dsl_dataset_t *ds, + void *p, dsl_dataset_evict_func_t func) +{ + void *old; + + mutex_enter(&ds->ds_lock); + old = ds->ds_user_ptr; + if (old == NULL) { + ds->ds_user_ptr = p; + ds->ds_user_evict_func = func; + } + mutex_exit(&ds->ds_lock); + return (old); +} + +void * +dsl_dataset_get_user_ptr(dsl_dataset_t *ds) 
+{ + return (ds->ds_user_ptr); +} + + +void +dsl_dataset_get_blkptr(dsl_dataset_t *ds, blkptr_t *bp) +{ + *bp = ds->ds_phys->ds_bp; +} + +void +dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + /* If it's the meta-objset, set dp_meta_rootbp */ + if (ds == NULL) { + tx->tx_pool->dp_meta_rootbp = *bp; + } else { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_bp = *bp; + } +} + +spa_t * +dsl_dataset_get_spa(dsl_dataset_t *ds) +{ + return (ds->ds_dir->dd_pool->dp_spa); +} + +void +dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp; + + if (ds == NULL) /* this is the meta-objset */ + return; + + ASSERT(ds->ds_user_ptr != NULL); + ASSERT(ds->ds_phys->ds_next_snap_obj == 0); + + dp = ds->ds_dir->dd_pool; + + if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) { + /* up the hold count until we can be written out */ + dmu_buf_add_ref(ds->ds_dbuf, ds); + } +} + +struct killarg { + uint64_t *usedp; + uint64_t *compressedp; + uint64_t *uncompressedp; + zio_t *zio; + dmu_tx_t *tx; +}; + +static int +kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg) +{ + struct killarg *ka = arg; + blkptr_t *bp = &bc->bc_blkptr; + + ASSERT3U(bc->bc_errno, ==, 0); + + /* + * Since this callback is not called concurrently, no lock is + * needed on the accounting values. + */ + *ka->usedp += BP_GET_ASIZE(bp); + *ka->compressedp += BP_GET_PSIZE(bp); + *ka->uncompressedp += BP_GET_UCSIZE(bp); + /* XXX check for EIO? 
*/ + (void) arc_free(ka->zio, spa, ka->tx->tx_txg, bp, NULL, NULL, + ARC_NOWAIT); + return (0); +} + +/* ARGSUSED */ +int +dsl_dataset_rollback_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) +{ + objset_t *mos = dd->dd_pool->dp_meta_objset; + dsl_dataset_t *ds; + + if (dd->dd_phys->dd_head_dataset_obj == 0) + return (EINVAL); + ds = dsl_dataset_open_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG); + + if (ds->ds_phys->ds_prev_snap_txg == 0) { + /* + * There's no previous snapshot. I suppose we could + * roll it back to being empty (and re-initialize the + * upper (ZPL) layer). But for now there's no way to do + * this via the user interface. + */ + dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + return (EINVAL); + } + + mutex_enter(&ds->ds_lock); + if (ds->ds_open_refcount > 0) { + mutex_exit(&ds->ds_lock); + dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + return (EBUSY); + } + + /* + * If we made changes this txg, traverse_dsl_dataset won't find + * them. Try again. + */ + if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) { + mutex_exit(&ds->ds_lock); + dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + return (EAGAIN); + } + + /* THE POINT OF NO (unsuccessful) RETURN */ + ds->ds_open_refcount = DOS_REF_MAX; + mutex_exit(&ds->ds_lock); + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + + /* Zero out the deadlist. 
*/ + dprintf("old deadlist obj = %llx\n", ds->ds_phys->ds_deadlist_obj); + bplist_close(&ds->ds_deadlist); + bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx); + ds->ds_phys->ds_deadlist_obj = + bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); + bplist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); + dprintf("new deadlist obj = %llx\n", ds->ds_phys->ds_deadlist_obj); + + { + /* Free blkptrs that we gave birth to */ + zio_t *zio; + uint64_t used = 0, compressed = 0, uncompressed = 0; + struct killarg ka; + + zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, + ZIO_FLAG_MUSTSUCCEED); + ka.usedp = &used; + ka.compressedp = &compressed; + ka.uncompressedp = &uncompressed; + ka.zio = zio; + ka.tx = tx; + (void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg, + ADVANCE_POST, kill_blkptr, &ka); + (void) zio_wait(zio); + + dsl_dir_diduse_space(dd, + -used, -compressed, -uncompressed, tx); + } + + /* Change our contents to that of the prev snapshot (finally!) */ + ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj); + ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp; + ds->ds_phys->ds_used_bytes = ds->ds_prev->ds_phys->ds_used_bytes; + ds->ds_phys->ds_compressed_bytes = + ds->ds_prev->ds_phys->ds_compressed_bytes; + ds->ds_phys->ds_uncompressed_bytes = + ds->ds_prev->ds_phys->ds_uncompressed_bytes; + ds->ds_phys->ds_restoring = ds->ds_prev->ds_phys->ds_restoring; + ds->ds_phys->ds_unique_bytes = 0; + + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + ds->ds_prev->ds_phys->ds_unique_bytes = 0; + + dprintf("new deadlist obj = %llx\n", ds->ds_phys->ds_deadlist_obj); + ds->ds_open_refcount = 0; + dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + + return (0); +} + +int +dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) +{ + const char *snapname = arg; + uint64_t used = 0, compressed = 0, uncompressed = 0; + blkptr_t bp; + zio_t *zio; + int err; + int after_branch_point = FALSE; + int drop_lock = FALSE; + dsl_pool_t *dp = dd->dd_pool; 
+ objset_t *mos = dp->dp_meta_objset; + dsl_dataset_t *ds, *ds_prev = NULL; + uint64_t obj; + + if (dd->dd_phys->dd_head_dataset_obj == 0) + return (EINVAL); + + if (!RW_WRITE_HELD(&dp->dp_config_rwlock)) { + rw_enter(&dp->dp_config_rwlock, RW_WRITER); + drop_lock = TRUE; + } + + ds = dsl_dataset_open_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, NULL, + snapname ? DS_MODE_NONE : DS_MODE_EXCLUSIVE, FTAG); + + if (snapname) { + err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj, + snapname, 8, 1, &obj); + dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + if (err) { + if (drop_lock) + rw_exit(&dp->dp_config_rwlock); + return (err); + } + + ds = dsl_dataset_open_obj(dd->dd_pool, obj, NULL, + DS_MODE_EXCLUSIVE, FTAG); + } + if (ds == NULL) { + if (drop_lock) + rw_exit(&dp->dp_config_rwlock); + return (EBUSY); + } + + obj = ds->ds_object; + + /* Can't delete a branch point. */ + if (ds->ds_phys->ds_num_children > 1) { + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + if (drop_lock) + rw_exit(&dp->dp_config_rwlock); + return (EINVAL); + } + + /* + * Can't delete a head dataset if there are snapshots of it. + * (Except if the only snapshots are from the branch we cloned + * from.) + */ + if (ds->ds_prev != NULL && + ds->ds_prev->ds_phys->ds_next_snap_obj == obj) { + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + if (drop_lock) + rw_exit(&dp->dp_config_rwlock); + return (EINVAL); + } + + /* + * If we made changes this txg, traverse_dsl_dataset won't find + * them. Try again. 
+ */ + if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) { + mutex_exit(&ds->ds_lock); + dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + return (EAGAIN); + } + + /* THE POINT OF NO (unsuccessful) RETURN */ + + if (ds->ds_phys->ds_prev_snap_obj != 0) { + if (ds->ds_prev) { + ds_prev = ds->ds_prev; + } else { + ds_prev = dsl_dataset_open_obj(dd->dd_pool, + ds->ds_phys->ds_prev_snap_obj, NULL, + DS_MODE_NONE, FTAG); + } + after_branch_point = + (ds_prev->ds_phys->ds_next_snap_obj != obj); + + dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); + if (after_branch_point && + ds->ds_phys->ds_next_snap_obj == 0) { + /* This clone is toast. */ + ASSERT(ds_prev->ds_phys->ds_num_children > 1); + ds_prev->ds_phys->ds_num_children--; + } else if (!after_branch_point) { + ds_prev->ds_phys->ds_next_snap_obj = + ds->ds_phys->ds_next_snap_obj; + } + } + + ASSERT3P(tx->tx_pool, ==, dd->dd_pool); + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + + if (ds->ds_phys->ds_next_snap_obj != 0) { + dsl_dataset_t *ds_next; + uint64_t itor = 0; + + spa_scrub_restart(dp->dp_spa, tx->tx_txg); + + ds_next = dsl_dataset_open_obj(dd->dd_pool, + ds->ds_phys->ds_next_snap_obj, NULL, DS_MODE_NONE, FTAG); + ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); + + dmu_buf_will_dirty(ds_next->ds_dbuf, tx); + ds_next->ds_phys->ds_prev_snap_obj = + ds->ds_phys->ds_prev_snap_obj; + ds_next->ds_phys->ds_prev_snap_txg = + ds->ds_phys->ds_prev_snap_txg; + ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, + ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0); + + /* + * Transfer to our deadlist (which will become next's + * new deadlist) any entries from next's current + * deadlist which were born before prev, and free the + * other entries. 
+ * + * XXX we're doing this long task with the config lock held + */ + while (bplist_iterate(&ds_next->ds_deadlist, &itor, + &bp) == 0) { + if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) { + bplist_enqueue(&ds->ds_deadlist, &bp, tx); + if (ds_prev && !after_branch_point && + bp.blk_birth > + ds_prev->ds_phys->ds_prev_snap_txg) { + ds_prev->ds_phys->ds_unique_bytes += + BP_GET_ASIZE(&bp); + } + } else { + used += BP_GET_ASIZE(&bp); + compressed += BP_GET_PSIZE(&bp); + uncompressed += BP_GET_UCSIZE(&bp); + /* XXX check return value? */ + (void) arc_free(zio, dp->dp_spa, tx->tx_txg, + &bp, NULL, NULL, ARC_NOWAIT); + } + } + + /* free next's deadlist */ + bplist_close(&ds_next->ds_deadlist); + bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx); + + /* set next's deadlist to our deadlist */ + ds_next->ds_phys->ds_deadlist_obj = + ds->ds_phys->ds_deadlist_obj; + bplist_open(&ds_next->ds_deadlist, mos, + ds_next->ds_phys->ds_deadlist_obj); + ds->ds_phys->ds_deadlist_obj = 0; + + if (ds_next->ds_phys->ds_next_snap_obj != 0) { + /* + * Update next's unique to include blocks which + * were previously shared by only this snapshot + * and it. Those blocks will be born after the + * prev snap and before this snap, and will have + * died after the next snap and before the one + * after that (ie. be on the snap after next's + * deadlist). 
+ * + * XXX we're doing this long task with the + * config lock held + */ + dsl_dataset_t *ds_after_next; + + ds_after_next = dsl_dataset_open_obj(dd->dd_pool, + ds_next->ds_phys->ds_next_snap_obj, NULL, + DS_MODE_NONE, FTAG); + itor = 0; + while (bplist_iterate(&ds_after_next->ds_deadlist, + &itor, &bp) == 0) { + if (bp.blk_birth > + ds->ds_phys->ds_prev_snap_txg && + bp.blk_birth <= + ds->ds_phys->ds_creation_txg) { + ds_next->ds_phys->ds_unique_bytes += + BP_GET_ASIZE(&bp); + } + } + + dsl_dataset_close(ds_after_next, DS_MODE_NONE, FTAG); + ASSERT3P(ds_next->ds_prev, ==, NULL); + } else { + /* + * It would be nice to update the head dataset's + * unique. To do so we would have to traverse + * it for blocks born after ds_prev, which is + * pretty expensive just to maintain something + * for debugging purposes. + */ + ASSERT3P(ds_next->ds_prev, ==, ds); + dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE, + ds_next); + if (ds_prev) { + ds_next->ds_prev = dsl_dataset_open_obj( + dd->dd_pool, ds->ds_phys->ds_prev_snap_obj, + NULL, DS_MODE_NONE, ds_next); + } else { + ds_next->ds_prev = NULL; + } + } + dsl_dataset_close(ds_next, DS_MODE_NONE, FTAG); + + /* + * NB: unique_bytes is not accurate for head objsets + * because we don't update it when we delete the most + * recent snapshot -- see above comment. + */ + ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes); + } else { + /* + * There's no next snapshot, so this is a head dataset. + * Destroy the deadlist. Unless it's a clone, the + * deadlist should be empty. (If it's a clone, it's + * safe to ignore the deadlist contents.) 
+ */ + struct killarg ka; + + ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist)); + bplist_close(&ds->ds_deadlist); + bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx); + ds->ds_phys->ds_deadlist_obj = 0; + + /* + * Free everything that we point to (that's born after + * the previous snapshot, if we are a clone) + * + * XXX we're doing this long task with the config lock held + */ + ka.usedp = &used; + ka.compressedp = &compressed; + ka.uncompressedp = &uncompressed; + ka.zio = zio; + ka.tx = tx; + err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg, + ADVANCE_POST, kill_blkptr, &ka); + ASSERT3U(err, ==, 0); + } + + err = zio_wait(zio); + ASSERT3U(err, ==, 0); + + dsl_dir_diduse_space(dd, -used, -compressed, -uncompressed, tx); + + if (ds->ds_phys->ds_snapnames_zapobj) { + err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx); + ASSERT(err == 0); + } + + if (dd->dd_phys->dd_head_dataset_obj == ds->ds_object) { + /* Erase the link in the dataset */ + dmu_buf_will_dirty(dd->dd_dbuf, tx); + dd->dd_phys->dd_head_dataset_obj = 0; + /* + * dsl_dir_sync_destroy() called us, they'll destroy + * the dataset. + */ + } else { + /* remove from snapshot namespace */ + dsl_dataset_t *ds_head; + ds_head = dsl_dataset_open_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG); +#ifdef ZFS_DEBUG + { + uint64_t val; + err = zap_lookup(mos, + ds_head->ds_phys->ds_snapnames_zapobj, + snapname, 8, 1, &val); + ASSERT3U(err, ==, 0); + ASSERT3U(val, ==, obj); + } +#endif + err = zap_remove(mos, ds_head->ds_phys->ds_snapnames_zapobj, + snapname, tx); + ASSERT(err == 0); + dsl_dataset_close(ds_head, DS_MODE_NONE, FTAG); + } + + if (ds_prev && ds->ds_prev != ds_prev) + dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG); + + err = dmu_object_free(mos, obj, tx); + ASSERT(err == 0); + + /* + * Close the objset with mode NONE, thus leaving it with + * DOS_REF_MAX set, so that noone can access it. 
+ */ + dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + + if (drop_lock) + rw_exit(&dp->dp_config_rwlock); + return (0); +} + +int +dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) +{ + const char *snapname = arg; + dsl_pool_t *dp = dd->dd_pool; + dmu_buf_t *dbuf; + dsl_dataset_phys_t *dsphys; + uint64_t dsobj, value; + objset_t *mos = dp->dp_meta_objset; + dsl_dataset_t *ds; + int err; + + ASSERT(dmu_tx_is_syncing(tx)); + + if (dd->dd_phys->dd_head_dataset_obj == 0) + return (EINVAL); + ds = dsl_dataset_open_obj(dp, dd->dd_phys->dd_head_dataset_obj, NULL, + DS_MODE_NONE, FTAG); + + err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj, + snapname, 8, 1, &value); + if (err == 0) { + dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + return (EEXIST); + } + ASSERT(err == ENOENT); + + /* The point of no (unsuccessful) return */ + + dprintf_dd(dd, "taking snapshot %s in txg %llu\n", + snapname, tx->tx_txg); + + spa_scrub_restart(dp->dp_spa, tx->tx_txg); + + rw_enter(&dp->dp_config_rwlock, RW_WRITER); + + dsobj = dmu_object_alloc(mos, DMU_OT_DSL_OBJSET, 0, + DMU_OT_DSL_OBJSET, sizeof (dsl_dataset_phys_t), tx); + dbuf = dmu_bonus_hold(mos, dsobj); + dmu_buf_will_dirty(dbuf, tx); + dsphys = dbuf->db_data; + dsphys->ds_dir_obj = dd->dd_object; + dsphys->ds_fsid_guid = unique_create(); + unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */ + (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, + sizeof (dsphys->ds_guid)); + dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; + dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg; + dsphys->ds_next_snap_obj = ds->ds_object; + dsphys->ds_num_children = 1; + dsphys->ds_creation_time = gethrestime_sec(); + dsphys->ds_creation_txg = tx->tx_txg; + dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; + dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes; + dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes; + dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes; + 
dsphys->ds_restoring = ds->ds_phys->ds_restoring; + dsphys->ds_bp = ds->ds_phys->ds_bp; + dmu_buf_rele(dbuf); + + if (ds->ds_phys->ds_prev_snap_obj != 0) { + dsl_dataset_t *ds_prev; + + ds_prev = dsl_dataset_open_obj(dp, + ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_NONE, FTAG); + ASSERT(ds_prev->ds_phys->ds_next_snap_obj == + ds->ds_object || + ds_prev->ds_phys->ds_num_children > 1); + if (ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { + dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); + ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, + ds_prev->ds_phys->ds_creation_txg); + ds_prev->ds_phys->ds_next_snap_obj = dsobj; + } + dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG); + } else { + ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, 0); + } + + bplist_close(&ds->ds_deadlist); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, dsphys->ds_creation_txg); + ds->ds_phys->ds_prev_snap_obj = dsobj; + ds->ds_phys->ds_prev_snap_txg = dsphys->ds_creation_txg; + ds->ds_phys->ds_unique_bytes = 0; + ds->ds_phys->ds_deadlist_obj = + bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); + bplist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); + + dprintf("snap '%s' -> obj %llu\n", snapname, dsobj); + err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, + snapname, 8, 1, &dsobj, tx); + ASSERT(err == 0); + + if (ds->ds_prev) + dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds); + ds->ds_prev = dsl_dataset_open_obj(dp, + ds->ds_phys->ds_prev_snap_obj, snapname, DS_MODE_NONE, ds); + + rw_exit(&dp->dp_config_rwlock); + dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + + return (0); +} + +void +dsl_dataset_sync(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(ds->ds_user_ptr != NULL); + ASSERT(ds->ds_phys->ds_next_snap_obj == 0); + + dmu_objset_sync(ds->ds_user_ptr, tx); + dsl_dir_dirty(ds->ds_dir, tx); + bplist_close(&ds->ds_deadlist); + + dmu_buf_remove_ref(ds->ds_dbuf, ds); +} + +void +dsl_dataset_stats(dsl_dataset_t *ds, 
dmu_objset_stats_t *dds) +{ + /* fill in properties crap */ + dsl_dir_stats(ds->ds_dir, dds); + + if (ds->ds_phys->ds_num_children != 0) { + dds->dds_is_snapshot = TRUE; + dds->dds_num_clones = ds->ds_phys->ds_num_children - 1; + } + + dds->dds_last_txg = ds->ds_phys->ds_bp.blk_birth; + + dds->dds_objects_used = ds->ds_phys->ds_bp.blk_fill; + dds->dds_objects_avail = DN_MAX_OBJECT - dds->dds_objects_used; + + /* We override the dataset's creation time... they should be the same */ + dds->dds_creation_time = ds->ds_phys->ds_creation_time; + dds->dds_creation_txg = ds->ds_phys->ds_creation_txg; + dds->dds_space_refd = ds->ds_phys->ds_used_bytes; + dds->dds_fsid_guid = ds->ds_phys->ds_fsid_guid; + dds->dds_guid = ds->ds_phys->ds_guid; + + if (ds->ds_phys->ds_next_snap_obj) { + /* + * This is a snapshot; override the dd's space used with + * our unique space + */ + dds->dds_space_used = ds->ds_phys->ds_unique_bytes; + dds->dds_compressed_bytes = + ds->ds_phys->ds_compressed_bytes; + dds->dds_uncompressed_bytes = + ds->ds_phys->ds_uncompressed_bytes; + } + + dds->dds_objset_obj = ds->ds_object; +} + +dsl_pool_t * +dsl_dataset_pool(dsl_dataset_t *ds) +{ + return (ds->ds_dir->dd_pool); +} + +struct osrenamearg { + const char *oldname; + const char *newname; +}; + +static int +dsl_dataset_snapshot_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) +{ + struct osrenamearg *ora = arg; + objset_t *mos = dd->dd_pool->dp_meta_objset; + dsl_dir_t *nds; + const char *tail; + int err; + dsl_dataset_t *snds, *fsds; + uint64_t val; + + err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, ora->oldname, + DS_MODE_READONLY | DS_MODE_STANDARD, FTAG, &snds); + if (err) + return (err); + + if (snds->ds_dir != dd) { + dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG); + return (EINVAL); + } + + /* better be changing a snapshot */ + if (snds->ds_phys->ds_next_snap_obj == 0) { + dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG); + return (EINVAL); + } + + /* new fs better exist */ + nds = 
dsl_dir_open_spa(dd->dd_pool->dp_spa, ora->newname, FTAG, &tail); + if (nds == NULL) { + dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG); + return (ENOENT); + } + + dsl_dir_close(nds, FTAG); + + /* new name better be in same fs */ + if (nds != dd) { + dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG); + return (EINVAL); + } + + /* new name better be a snapshot */ + if (tail == NULL || tail[0] != '@') { + dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG); + return (EINVAL); + } + + tail++; + + fsds = dsl_dataset_open_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG); + + /* new name better not be in use */ + err = zap_lookup(mos, fsds->ds_phys->ds_snapnames_zapobj, + tail, 8, 1, &val); + if (err != ENOENT) { + if (err == 0) + err = EEXIST; + dsl_dataset_close(fsds, DS_MODE_NONE, FTAG); + dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG); + return (EEXIST); + } + + /* The point of no (unsuccessful) return */ + + rw_enter(&dd->dd_pool->dp_config_rwlock, RW_WRITER); + dsl_dataset_get_snapname(snds); + err = zap_remove(mos, fsds->ds_phys->ds_snapnames_zapobj, + snds->ds_snapname, tx); + ASSERT3U(err, ==, 0); + mutex_enter(&snds->ds_lock); + (void) strcpy(snds->ds_snapname, tail); + mutex_exit(&snds->ds_lock); + err = zap_add(mos, fsds->ds_phys->ds_snapnames_zapobj, + snds->ds_snapname, 8, 1, &snds->ds_object, tx); + ASSERT3U(err, ==, 0); + rw_exit(&dd->dd_pool->dp_config_rwlock); + + dsl_dataset_close(fsds, DS_MODE_NONE, FTAG); + dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG); + return (0); +} + +#pragma weak dmu_objset_rename = dsl_dataset_rename +int +dsl_dataset_rename(const char *osname, const char *newname) +{ + dsl_dir_t *dd; + const char *tail; + struct osrenamearg ora; + int err; + + dd = dsl_dir_open(osname, FTAG, &tail); + if (dd == NULL) + return (ENOENT); + if (tail == NULL) { + err = dsl_dir_sync_task(dd, + dsl_dir_rename_sync, (void*)newname, 1<<12); + dsl_dir_close(dd, FTAG); + return (err); + } + if (tail[0] != '@') { + /* the 
name ended in a nonexistant component */ + dsl_dir_close(dd, FTAG); + return (ENOENT); + } + + ora.oldname = osname; + ora.newname = newname; + + err = dsl_dir_sync_task(dd, + dsl_dataset_snapshot_rename_sync, &ora, 1<<12); + dsl_dir_close(dd, FTAG); + return (err); +} diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c new file mode 100644 index 0000000000..3b0d32de70 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/dsl_dir.c @@ -0,0 +1,1217 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/spa.h> +#include <sys/zap.h> +#include <sys/zio.h> +#include <sys/arc.h> +#include "zfs_namecheck.h" + +static uint64_t dsl_dir_space_accounted(dsl_dir_t *dd); +static uint64_t dsl_dir_estimated_space(dsl_dir_t *dd); +static int dsl_dir_set_reservation_sync(dsl_dir_t *dd, + void *arg, dmu_tx_t *tx); +static uint64_t dsl_dir_space_available(dsl_dir_t *dd, + dsl_dir_t *ancestor, int64_t delta, int ondiskonly); + + +/* ARGSUSED */ +static void +dsl_dir_evict(dmu_buf_t *db, void *arg) +{ + dsl_dir_t *dd = arg; + dsl_pool_t *dp = dd->dd_pool; + int t; + + for (t = 0; t < TXG_SIZE; t++) { + ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); + ASSERT(dd->dd_tempreserved[t] == 0); + ASSERT(dd->dd_space_towrite[t] == 0); + } + + ASSERT3U(dd->dd_used_bytes, ==, dd->dd_phys->dd_used_bytes); + + ASSERT(dd->dd_sync_txg == 0); + + if (dd->dd_parent) + dsl_dir_close(dd->dd_parent, dd); + + spa_close(dd->dd_pool->dp_spa, dd); + + /* + * The props callback list should be empty since they hold the + * dir open. 
+ */ + list_destroy(&dd->dd_prop_cbs); + kmem_free(dd, sizeof (dsl_dir_t)); +} + +dsl_dir_t * +dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, + const char *tail, void *tag) +{ + dmu_buf_t *dbuf; + dsl_dir_t *dd; + + ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || + dsl_pool_sync_context(dp)); + + dbuf = dmu_bonus_hold_tag(dp->dp_meta_objset, ddobj, tag); + dmu_buf_read(dbuf); + dd = dmu_buf_get_user(dbuf); +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(dbuf, &doi); + ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DATASET); + } +#endif + /* XXX assert bonus buffer size is correct */ + if (dd == NULL) { + dsl_dir_t *winner; + int err; + + dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP); + dd->dd_object = ddobj; + dd->dd_dbuf = dbuf; + dd->dd_pool = dp; + dd->dd_phys = dbuf->db_data; + dd->dd_used_bytes = dd->dd_phys->dd_used_bytes; + + list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t), + offsetof(dsl_prop_cb_record_t, cbr_node)); + + if (dd->dd_phys->dd_parent_obj) { + dd->dd_parent = dsl_dir_open_obj(dp, + dd->dd_phys->dd_parent_obj, NULL, dd); + if (tail) { +#ifdef ZFS_DEBUG + uint64_t foundobj; + + err = zap_lookup(dp->dp_meta_objset, + dd->dd_parent->dd_phys-> + dd_child_dir_zapobj, + tail, sizeof (foundobj), 1, &foundobj); + ASSERT3U(err, ==, 0); + ASSERT3U(foundobj, ==, ddobj); +#endif + (void) strcpy(dd->dd_myname, tail); + } else { + err = zap_value_search(dp->dp_meta_objset, + dd->dd_parent->dd_phys-> + dd_child_dir_zapobj, + ddobj, dd->dd_myname); + /* + * The caller should be protecting this ddobj + * from being deleted concurrently + */ + ASSERT(err == 0); + } + } else { + (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); + } + + winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys, + dsl_dir_evict); + if (winner) { + if (dd->dd_parent) + dsl_dir_close(dd->dd_parent, dd); + kmem_free(dd, sizeof (dsl_dir_t)); + dd = winner; + } else { + spa_open_ref(dp->dp_spa, dd); + } + } + + /* + * The dsl_dir_t has both open-to-close and 
instantiate-to-evict + * holds on the spa. We need the open-to-close holds because + * otherwise the spa_refcnt wouldn't change when we open a + * dir which the spa also has open, so we could incorrectly + * think it was OK to unload/export/destroy the pool. We need + * the instantiate-to-evict hold because the dsl_dir_t has a + * pointer to the dd_pool, which has a pointer to the spa_t. + */ + spa_open_ref(dp->dp_spa, tag); + ASSERT3P(dd->dd_pool, ==, dp); + ASSERT3U(dd->dd_object, ==, ddobj); + ASSERT3P(dd->dd_dbuf, ==, dbuf); + return (dd); +} + +void +dsl_dir_close(dsl_dir_t *dd, void *tag) +{ + dprintf_dd(dd, "%s\n", ""); + spa_close(dd->dd_pool->dp_spa, tag); + dmu_buf_rele_tag(dd->dd_dbuf, tag); +} + +/* buf must be long enough (MAXNAMELEN should do) */ +void +dsl_dir_name(dsl_dir_t *dd, char *buf) +{ + if (dd->dd_parent) { + dsl_dir_name(dd->dd_parent, buf); + (void) strcat(buf, "/"); + } else { + buf[0] = '\0'; + } + if (!MUTEX_HELD(&dd->dd_lock)) { + /* + * recursive mutex so that we can use + * dprintf_dd() with dd_lock held + */ + mutex_enter(&dd->dd_lock); + (void) strcat(buf, dd->dd_myname); + mutex_exit(&dd->dd_lock); + } else { + (void) strcat(buf, dd->dd_myname); + } +} + +int +dsl_dir_is_private(dsl_dir_t *dd) +{ + int rv = FALSE; + + if (dd->dd_parent && dsl_dir_is_private(dd->dd_parent)) + rv = TRUE; + if (dataset_name_hidden(dd->dd_myname)) + rv = TRUE; + return (rv); +} + + +static int +getcomponent(const char *path, char *component, const char **nextp) +{ + char *p; + if (path == NULL) + return (NULL); + /* This would be a good place to reserve some namespace... */ + p = strpbrk(path, "/@"); + if (p && (p[1] == '/' || p[1] == '@')) { + /* two separators in a row */ + return (EINVAL); + } + if (p == NULL || p == path) { + /* + * if the first thing is an @ or /, it had better be an + * @ and it had better not have any more ats or slashes, + * and it had better have something after the @. 
+ */ + if (p != NULL && + (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) + return (EINVAL); + if (strlen(path) >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strcpy(component, path); + p = NULL; + } else if (p[0] == '/') { + if (p-path >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strncpy(component, path, p - path); + component[p-path] = '\0'; + p++; + } else if (p[0] == '@') { + /* + * if the next separator is an @, there better not be + * any more slashes. + */ + if (strchr(path, '/')) + return (EINVAL); + if (p-path >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strncpy(component, path, p - path); + component[p-path] = '\0'; + } else { + ASSERT(!"invalid p"); + } + *nextp = p; + return (0); +} + +/* + * same as dsl_open_dir, ignore the first component of name and use the + * spa instead + */ +dsl_dir_t * +dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp) +{ + char buf[MAXNAMELEN]; + const char *next, *nextnext = NULL; + int err; + dsl_dir_t *dd; + dsl_pool_t *dp; + uint64_t ddobj; + int openedspa = FALSE; + + dprintf("%s\n", name); + + if (name == NULL) + return (NULL); + err = getcomponent(name, buf, &next); + if (err) + return (NULL); + if (spa == NULL) { + err = spa_open(buf, &spa, FTAG); + if (err) { + dprintf("spa_open(%s) failed\n", buf); + return (NULL); + } + openedspa = TRUE; + + /* XXX this assertion belongs in spa_open */ + ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa))); + } + + dp = spa_get_dsl(spa); + + rw_enter(&dp->dp_config_rwlock, RW_READER); + dd = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag); + while (next != NULL) { + dsl_dir_t *child_ds; + err = getcomponent(next, buf, &nextnext); + if (err) { + dsl_dir_close(dd, tag); + if (openedspa) + spa_close(spa, FTAG); + return (NULL); + } + ASSERT(next[0] != '\0'); + if (next[0] == '@') + break; + if (dd->dd_phys->dd_child_dir_zapobj == 0) + break; + dprintf("looking up %s in obj%lld\n", + buf, dd->dd_phys->dd_child_dir_zapobj); + + err = 
zap_lookup(dp->dp_meta_objset, + dd->dd_phys->dd_child_dir_zapobj, + buf, sizeof (ddobj), 1, &ddobj); + if (err == ENOENT) { + break; + } + ASSERT(err == 0); + + child_ds = dsl_dir_open_obj(dp, ddobj, buf, tag); + dsl_dir_close(dd, tag); + dd = child_ds; + next = nextnext; + } + rw_exit(&dp->dp_config_rwlock); + + /* + * It's an error if there's more than one component left, or + * tailp==NULL and there's any component left. + */ + if (next != NULL && + (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { + /* bad path name */ + dsl_dir_close(dd, tag); + dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); + next = NULL; + dd = NULL; + } + if (tailp) + *tailp = next; + if (openedspa) + spa_close(spa, FTAG); + return (dd); +} + +/* + * Return the dsl_dir_t, and possibly the last component which couldn't + * be found in *tail. Return NULL if the path is bogus, or if + * tail==NULL and we couldn't parse the whole name. (*tail)[0] == '@' + * means that the last component is a snapshot. + */ +dsl_dir_t * +dsl_dir_open(const char *name, void *tag, const char **tailp) +{ + return (dsl_dir_open_spa(NULL, name, tag, tailp)); +} + +int +dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx) +{ + objset_t *mos = pds->dd_pool->dp_meta_objset; + uint64_t ddobj; + dsl_dir_phys_t *dsphys; + dmu_buf_t *dbuf; + int err; + + ASSERT(dmu_tx_is_syncing(tx)); + + if (pds->dd_phys->dd_child_dir_zapobj == 0) { + dmu_buf_will_dirty(pds->dd_dbuf, tx); + pds->dd_phys->dd_child_dir_zapobj = zap_create(mos, + DMU_OT_DSL_DATASET_CHILD_MAP, DMU_OT_NONE, 0, tx); + } + + rw_enter(&pds->dd_pool->dp_config_rwlock, RW_WRITER); + err = zap_lookup(mos, pds->dd_phys->dd_child_dir_zapobj, + name, sizeof (uint64_t), 1, &ddobj); + if (err != ENOENT) { + rw_exit(&pds->dd_pool->dp_config_rwlock); + return (err ? 
err : EEXIST); + } + + ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, + DMU_OT_DSL_DATASET, sizeof (dsl_dir_phys_t), tx); + err = zap_add(mos, pds->dd_phys->dd_child_dir_zapobj, + name, sizeof (uint64_t), 1, &ddobj, tx); + ASSERT3U(err, ==, 0); + dprintf("dataset_create: zap_add %s->%lld to %lld returned %d\n", + name, ddobj, pds->dd_phys->dd_child_dir_zapobj, err); + + dbuf = dmu_bonus_hold(mos, ddobj); + dmu_buf_will_dirty(dbuf, tx); + dsphys = dbuf->db_data; + + dsphys->dd_creation_time = gethrestime_sec(); + dsphys->dd_parent_obj = pds->dd_object; + dsphys->dd_props_zapobj = zap_create(mos, + DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); + dsphys->dd_child_dir_zapobj = zap_create(mos, + DMU_OT_DSL_DATASET_CHILD_MAP, DMU_OT_NONE, 0, tx); + dmu_buf_rele(dbuf); + + rw_exit(&pds->dd_pool->dp_config_rwlock); + + return (0); +} + +int +dsl_dir_destroy_sync(dsl_dir_t *pds, void *arg, dmu_tx_t *tx) +{ + const char *name = arg; + dsl_dir_t *dd = NULL; + dsl_pool_t *dp = pds->dd_pool; + objset_t *mos = dp->dp_meta_objset; + uint64_t val, obj, child_zapobj, props_zapobj; + int t, err; + + rw_enter(&dp->dp_config_rwlock, RW_WRITER); + + err = zap_lookup(mos, pds->dd_phys->dd_child_dir_zapobj, name, + 8, 1, &obj); + if (err) + goto out; + + dd = dsl_dir_open_obj(dp, obj, name, FTAG); + ASSERT3U(dd->dd_phys->dd_parent_obj, ==, pds->dd_object); + + if (dmu_buf_refcount(dd->dd_dbuf) > 1) { + err = EBUSY; + goto out; + } + + for (t = 0; t < TXG_SIZE; t++) { + /* + * if they were dirty, they'd also be open. + * dp_config_rwlock ensures that it stays that way. 
+ */ + ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); + } + + child_zapobj = dd->dd_phys->dd_child_dir_zapobj; + props_zapobj = dd->dd_phys->dd_props_zapobj; + + if (child_zapobj != 0) { + uint64_t count; + err = EEXIST; + (void) zap_count(mos, child_zapobj, &count); + if (count != 0) + goto out; + } + + if (dd->dd_phys->dd_head_dataset_obj != 0) { + err = dsl_dataset_destroy_sync(dd, NULL, tx); + if (err) + goto out; + } + ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); + + /* The point of no (unsuccessful) return */ + + /* Make sure parent's used gets updated */ + val = 0; + err = dsl_dir_set_reservation_sync(dd, &val, tx); + ASSERT(err == 0); + ASSERT3U(dd->dd_used_bytes, ==, 0); + ASSERT3U(dd->dd_phys->dd_reserved, ==, 0); + dsl_dir_close(dd, FTAG); + dd = NULL; + + err = dmu_object_free(mos, obj, tx); + ASSERT(err == 0); + + if (child_zapobj) + err = zap_destroy(mos, child_zapobj, tx); + ASSERT(err == 0); + + if (props_zapobj) + err = zap_destroy(mos, props_zapobj, tx); + ASSERT(err == 0); + + err = zap_remove(mos, pds->dd_phys->dd_child_dir_zapobj, name, tx); + ASSERT(err == 0); + +out: + rw_exit(&dp->dp_config_rwlock); + if (dd) + dsl_dir_close(dd, FTAG); + + return (err); +} + +void +dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx) +{ + dsl_dir_phys_t *dsp; + dmu_buf_t *dbuf; + int error; + + *ddobjp = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, + DMU_OT_DSL_DATASET, sizeof (dsl_dir_phys_t), tx); + + error = zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, + sizeof (uint64_t), 1, ddobjp, tx); + ASSERT3U(error, ==, 0); + + dbuf = dmu_bonus_hold(mos, *ddobjp); + dmu_buf_will_dirty(dbuf, tx); + dsp = dbuf->db_data; + + dsp->dd_creation_time = gethrestime_sec(); + dsp->dd_props_zapobj = zap_create(mos, + DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); + dsp->dd_child_dir_zapobj = zap_create(mos, + DMU_OT_DSL_DATASET_CHILD_MAP, DMU_OT_NONE, 0, tx); + + dmu_buf_rele(dbuf); +} + +void +dsl_dir_stats(dsl_dir_t *dd, dmu_objset_stats_t 
*dds) +{ + bzero(dds, sizeof (dmu_objset_stats_t)); + + dds->dds_dir_obj = dd->dd_object; + dds->dds_available = dsl_dir_space_available(dd, NULL, 0, TRUE); + + mutex_enter(&dd->dd_lock); + dds->dds_space_used = dd->dd_used_bytes; + dds->dds_compressed_bytes = dd->dd_phys->dd_compressed_bytes; + dds->dds_uncompressed_bytes = dd->dd_phys->dd_uncompressed_bytes; + dds->dds_quota = dd->dd_phys->dd_quota; + dds->dds_reserved = dd->dd_phys->dd_reserved; + mutex_exit(&dd->dd_lock); + + dds->dds_creation_time = dd->dd_phys->dd_creation_time; + + dds->dds_is_placeholder = (dd->dd_phys->dd_head_dataset_obj == 0); + + if (dd->dd_phys->dd_clone_parent_obj) { + dsl_dataset_t *ds; + + rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); + ds = dsl_dataset_open_obj(dd->dd_pool, + dd->dd_phys->dd_clone_parent_obj, NULL, DS_MODE_NONE, FTAG); + dsl_dataset_name(ds, dds->dds_clone_of); + dds->dds_clone_of_obj = dd->dd_phys->dd_clone_parent_obj; + dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + rw_exit(&dd->dd_pool->dp_config_rwlock); + } + + VERIFY(dsl_prop_get_ds_integer(dd, "checksum", + &dds->dds_checksum, dds->dds_checksum_setpoint) == 0); + + VERIFY(dsl_prop_get_ds_integer(dd, "compression", + &dds->dds_compression, dds->dds_compression_setpoint) == 0); + + VERIFY(dsl_prop_get_ds_integer(dd, "zoned", + &dds->dds_zoned, dds->dds_zoned_setpoint) == 0); + + spa_altroot(dd->dd_pool->dp_spa, dds->dds_altroot, + sizeof (dds->dds_altroot)); +} + +int +dsl_dir_sync_task(dsl_dir_t *dd, + int (*func)(dsl_dir_t *, void*, dmu_tx_t *), void *arg, uint64_t space) +{ + dmu_tx_t *tx; + dsl_pool_t *dp = dd->dd_pool; + int err = 0; + uint64_t txg; + + dprintf_dd(dd, "func=%p space=%llu\n", func, space); + +again: + tx = dmu_tx_create_ds(dd); + dmu_tx_hold_space(tx, space); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err == ENOSPC || err == EDQUOT) { + dsl_dir_t *rds; + /* + * They can get their space from either this dd, or the + * root dd. 
+ */ + for (rds = dd; rds->dd_parent; rds = rds->dd_parent) + continue; + dmu_tx_abort(tx); + tx = dmu_tx_create_ds(rds); + dmu_tx_hold_space(tx, space); + err = dmu_tx_assign(tx, TXG_WAIT); + } + if (err) { + dmu_tx_abort(tx); + return (err); + } + + txg = dmu_tx_get_txg(tx); + mutex_enter(&dd->dd_lock); + if (dd->dd_sync_txg != 0) { + mutex_exit(&dd->dd_lock); + dmu_tx_commit(tx); + txg_wait_synced(dp, 0); + goto again; + } + + /* We're good to go */ + + dd->dd_sync_txg = txg; + dd->dd_sync_func = func; + dd->dd_sync_arg = arg; + + mutex_exit(&dd->dd_lock); + + dsl_dir_dirty(dd, tx); + dmu_tx_commit(tx); + + txg_wait_synced(dp, txg); + + mutex_enter(&dd->dd_lock); + ASSERT(dd->dd_sync_txg == txg); + ASSERT(dd->dd_sync_func == NULL); + err = dd->dd_sync_err; + dd->dd_sync_txg = 0; + mutex_exit(&dd->dd_lock); + + return (err); +} + +void +dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dd->dd_pool; + + ASSERT(dd->dd_phys); + + if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) { + /* up the hold count until we can be written out */ + dmu_buf_add_ref(dd->dd_dbuf, dd); + } +} + +static int64_t +parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta) +{ + uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved); + uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved); + return (new_accounted - old_accounted); +} + +void +dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) +{ + if (dd->dd_sync_txg == tx->tx_txg && dd->dd_sync_func) { + dd->dd_sync_err = dd->dd_sync_func(dd, dd->dd_sync_arg, tx); + dd->dd_sync_func = NULL; + } + + ASSERT(dmu_tx_is_syncing(tx)); + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + + mutex_enter(&dd->dd_lock); + ASSERT3U(dd->dd_tempreserved[tx->tx_txg&TXG_MASK], ==, 0); + dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, + dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024); + dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0; + dd->dd_phys->dd_used_bytes = dd->dd_used_bytes; + mutex_exit(&dd->dd_lock); + + /* 
release the hold from dsl_dir_dirty */ + dmu_buf_remove_ref(dd->dd_dbuf, dd); +} + +static uint64_t +dsl_dir_estimated_space(dsl_dir_t *dd) +{ + int64_t space; + int i; + + ASSERT(MUTEX_HELD(&dd->dd_lock)); + + space = dd->dd_used_bytes; + ASSERT(space >= 0); + for (i = 0; i < TXG_SIZE; i++) { + space += dd->dd_space_towrite[i&TXG_MASK]; + ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0); + } + return (space); +} + +/* + * How much space would dd have available if ancestor had delta applied + * to it? If ondiskonly is set, we're only interested in what's + * on-disk, not estimated pending changes. + */ +static uint64_t +dsl_dir_space_available(dsl_dir_t *dd, + dsl_dir_t *ancestor, int64_t delta, int ondiskonly) +{ + uint64_t parentspace, myspace, quota, used; + + /* + * If there are no restrictions otherwise, assume we have + * unlimited space available. + */ + quota = UINT64_MAX; + parentspace = UINT64_MAX; + + if (dd->dd_parent != NULL) { + parentspace = dsl_dir_space_available(dd->dd_parent, + ancestor, delta, ondiskonly); + } + + mutex_enter(&dd->dd_lock); + if (dd->dd_phys->dd_quota != 0) + quota = dd->dd_phys->dd_quota; + if (ondiskonly) { + used = dd->dd_used_bytes; + } else { + used = dsl_dir_estimated_space(dd); + } + if (dd == ancestor) + used += delta; + + if (dd->dd_parent == NULL) { + uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE); + quota = MIN(quota, poolsize); + } + + if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) { + /* + * We have some space reserved, in addition to what our + * parent gave us. + */ + parentspace += dd->dd_phys->dd_reserved - used; + } + + if (used > quota) { + /* over quota */ + myspace = 0; +#ifdef ZFS_DEBUG + { + /* + * While it's OK to be a little over quota, if + * we think we are using more space than there + * is in the pool (which is already 6% more than + * dsl_pool_adjustedsize()), something is very + * wrong. 
+ */ + uint64_t space = spa_get_space(dd->dd_pool->dp_spa); + ASSERT3U(used, <=, space); + } +#endif + } else { + /* + * the lesser of parent's space and the space + * left in our quota + */ + myspace = MIN(parentspace, quota - used); + } + + mutex_exit(&dd->dd_lock); + + return (myspace); +} + +struct tempreserve { + list_node_t tr_node; + dsl_dir_t *tr_ds; + uint64_t tr_size; +}; + +/* + * Reserve space in this dsl_dir, to be used in this tx's txg. + * After the space has been dirtied (and thus + * dsl_dir_willuse_space() has been called), the reservation should + * be canceled, using dsl_dir_tempreserve_clear(). + */ +static int +dsl_dir_tempreserve_impl(dsl_dir_t *dd, + uint64_t asize, boolean_t netfree, list_t *tr_list, dmu_tx_t *tx) +{ + uint64_t txg = tx->tx_txg; + uint64_t est_used, quota, parent_rsrv; + int edquot = EDQUOT; + int txgidx = txg & TXG_MASK; + int i; + struct tempreserve *tr; + + ASSERT3U(txg, !=, 0); + + mutex_enter(&dd->dd_lock); + /* + * Check against the dsl_dir's quota. We don't add in the delta + * when checking for over-quota because they get one free hit. + */ + est_used = dsl_dir_estimated_space(dd); + for (i = 0; i < TXG_SIZE; i++) + est_used += dd->dd_tempreserved[i]; + + quota = UINT64_MAX; + + if (dd->dd_phys->dd_quota) + quota = dd->dd_phys->dd_quota; + + /* + * If this transaction will result in a net free of space, we want + * to let it through, but we have to be careful: the space that it + * frees won't become available until *after* this txg syncs. + * Therefore, to ensure that it's possible to remove files from + * a full pool without inducing transient overcommits, we throttle + * netfree transactions against a quota that is slightly larger, + * but still within the pool's allocation slop. In cases where + * we're very close to full, this will allow a steady trickle of + * removes to get through. 
+ */ + if (dd->dd_parent == NULL) { + uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); + if (poolsize < quota) { + quota = poolsize; + edquot = ENOSPC; + } + } else if (netfree) { + quota = UINT64_MAX; + } + + /* + * If they are requesting more space, and our current estimate + * is over quota. They get to try again unless the actual + * on-disk is over quota. + */ + if (asize > 0 && est_used > quota) { + if (dd->dd_used_bytes < quota) + edquot = ERESTART; + dprintf_dd(dd, "failing: used=%lluK est_used = %lluK " + "quota=%lluK tr=%lluK err=%d\n", + dd->dd_used_bytes>>10, est_used>>10, + quota>>10, asize>>10, edquot); + mutex_exit(&dd->dd_lock); + return (edquot); + } + + /* We need to up our estimated delta before dropping dd_lock */ + dd->dd_tempreserved[txgidx] += asize; + + parent_rsrv = parent_delta(dd, est_used, asize); + mutex_exit(&dd->dd_lock); + + tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP); + tr->tr_ds = dd; + tr->tr_size = asize; + list_insert_tail(tr_list, tr); + + /* see if it's OK with our parent */ + if (dd->dd_parent && parent_rsrv) { + return (dsl_dir_tempreserve_impl(dd->dd_parent, + parent_rsrv, netfree, tr_list, tx)); + } else { + return (0); + } +} + +/* + * Reserve space in this dsl_dir, to be used in this tx's txg. + * After the space has been dirtied (and thus + * dsl_dir_willuse_space() has been called), the reservation should + * be canceled, using dsl_dir_tempreserve_clear(). 
+ */ +int +dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, + uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx) +{ + int err = 0; + list_t *tr_list; + + tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); + list_create(tr_list, sizeof (struct tempreserve), + offsetof(struct tempreserve, tr_node)); + + err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, + tr_list, tx); + + if (err == 0) { + struct tempreserve *tr; + + err = arc_tempreserve_space(lsize); + if (err == 0) { + tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP); + tr->tr_ds = NULL; + tr->tr_size = lsize; + list_insert_tail(tr_list, tr); + } + } + + if (err) + dsl_dir_tempreserve_clear(tr_list, tx); + else + *tr_cookiep = tr_list; + return (err); +} + +/* + * Clear a temporary reservation that we previously made with + * dsl_dir_tempreserve_space(). + */ +void +dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) +{ + int txgidx = tx->tx_txg & TXG_MASK; + list_t *tr_list = tr_cookie; + struct tempreserve *tr; + + ASSERT3U(tx->tx_txg, !=, 0); + + while (tr = list_head(tr_list)) { + if (tr->tr_ds == NULL) { + arc_tempreserve_clear(tr->tr_size); + } else { + mutex_enter(&tr->tr_ds->dd_lock); + ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, + tr->tr_size); + tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size; + mutex_exit(&tr->tr_ds->dd_lock); + } + list_remove(tr_list, tr); + kmem_free(tr, sizeof (struct tempreserve)); + } + + kmem_free(tr_list, sizeof (list_t)); +} + +/* + * Call in open context when we think we're going to write/free space, + * eg. when dirtying data. Be conservative (ie. OK to write less than + * this or free more than this, but don't write more or free less). 
+ */ +void +dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) +{ + int64_t parent_space; + uint64_t est_used; + + mutex_enter(&dd->dd_lock); + if (space > 0) + dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; + + est_used = dsl_dir_estimated_space(dd); + parent_space = parent_delta(dd, est_used, space); + mutex_exit(&dd->dd_lock); + + /* Make sure that we clean up dd_space_to* */ + dsl_dir_dirty(dd, tx); + + /* XXX this is potentially expensive and unnecessary... */ + if (parent_space && dd->dd_parent) + dsl_dir_willuse_space(dd->dd_parent, parent_space, tx); +} + +/* call from syncing context when we actually write/free space for this dd */ +void +dsl_dir_diduse_space(dsl_dir_t *dd, + int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) +{ + int64_t accounted_delta; + + ASSERT(dmu_tx_is_syncing(tx)); + + dsl_dir_dirty(dd, tx); + + mutex_enter(&dd->dd_lock); + accounted_delta = parent_delta(dd, dd->dd_used_bytes, used); + ASSERT(used >= 0 || dd->dd_used_bytes >= -used); + ASSERT(compressed >= 0 || + dd->dd_phys->dd_compressed_bytes >= -compressed); + ASSERT(uncompressed >= 0 || + dd->dd_phys->dd_uncompressed_bytes >= -uncompressed); + dd->dd_used_bytes += used; + if (used > 0) + dd->dd_space_towrite[tx->tx_txg & TXG_MASK] -= used; + dd->dd_phys->dd_uncompressed_bytes += uncompressed; + dd->dd_phys->dd_compressed_bytes += compressed; + mutex_exit(&dd->dd_lock); + + if (dd->dd_parent != NULL) { + dsl_dir_diduse_space(dd->dd_parent, + accounted_delta, compressed, uncompressed, tx); + } +} + +static int +dsl_dir_set_quota_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) +{ + uint64_t *quotap = arg; + uint64_t new_quota = *quotap; + int err = 0; + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + + mutex_enter(&dd->dd_lock); + if (new_quota != 0 && (new_quota < dd->dd_phys->dd_reserved || + new_quota < dsl_dir_estimated_space(dd))) { + err = ENOSPC; + } else { + dd->dd_phys->dd_quota = new_quota; + } + mutex_exit(&dd->dd_lock); + return (err); 
+} + +int +dsl_dir_set_quota(const char *ddname, uint64_t quota) +{ + dsl_dir_t *dd; + int err; + + dd = dsl_dir_open(ddname, FTAG, NULL); + if (dd == NULL) + return (ENOENT); + /* + * If someone removes a file, then tries to set the quota, we + * want to make sure the file freeing takes effect. + */ + txg_wait_open(dd->dd_pool, 0); + + err = dsl_dir_sync_task(dd, dsl_dir_set_quota_sync, "a, 0); + dsl_dir_close(dd, FTAG); + return (err); +} + +static int +dsl_dir_set_reservation_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) +{ + uint64_t *reservationp = arg; + uint64_t new_reservation = *reservationp; + uint64_t used, avail; + int64_t delta; + + if (new_reservation > INT64_MAX) + return (EOVERFLOW); + + mutex_enter(&dd->dd_lock); + used = dd->dd_used_bytes; + delta = MAX(used, new_reservation) - + MAX(used, dd->dd_phys->dd_reserved); + mutex_exit(&dd->dd_lock); + + if (dd->dd_parent) { + avail = dsl_dir_space_available(dd->dd_parent, + NULL, 0, FALSE); + } else { + avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; + } + + if (delta > 0 && delta > avail) + return (ENOSPC); + if (delta > 0 && dd->dd_phys->dd_quota > 0 && + new_reservation > dd->dd_phys->dd_quota) + return (ENOSPC); + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + dd->dd_phys->dd_reserved = new_reservation; + + if (dd->dd_parent != NULL) { + /* Roll up this additional usage into our ancestors */ + dsl_dir_diduse_space(dd->dd_parent, delta, 0, 0, tx); + } + return (0); +} + +int +dsl_dir_set_reservation(const char *ddname, uint64_t reservation) +{ + dsl_dir_t *dd; + int err; + + dd = dsl_dir_open(ddname, FTAG, NULL); + if (dd == NULL) + return (ENOENT); + err = dsl_dir_sync_task(dd, + dsl_dir_set_reservation_sync, &reservation, 0); + dsl_dir_close(dd, FTAG); + return (err); +} + +static dsl_dir_t * +closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2) +{ + for (; ds1; ds1 = ds1->dd_parent) { + dsl_dir_t *dd; + for (dd = ds2; dd; dd = dd->dd_parent) { + if (ds1 == dd) + return (dd); + } + } + 
return (NULL); +} + +/* + * If delta is applied to dd, how much of that delta would be applied to + * ancestor? Syncing context only. + */ +static int64_t +would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) +{ + if (dd == ancestor) + return (delta); + + mutex_enter(&dd->dd_lock); + delta = parent_delta(dd, dd->dd_used_bytes, delta); + mutex_exit(&dd->dd_lock); + return (would_change(dd->dd_parent, delta, ancestor)); +} + +int +dsl_dir_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) +{ + const char *newname = arg; + dsl_pool_t *dp = dd->dd_pool; + objset_t *mos = dp->dp_meta_objset; + dsl_dir_t *newpds; + const char *tail; + int err, len; + + /* can't rename to different pool */ + len = strlen(dp->dp_root_dir->dd_myname); + if (strncmp(dp->dp_root_dir->dd_myname, newname, len != 0) || + newname[len] != '/') { + return (ENXIO); + } + + newpds = dsl_dir_open_spa(dp->dp_spa, newname, FTAG, &tail); + + /* new parent should exist */ + if (newpds == NULL) + return (ENOENT); + + /* new name should not already exist */ + if (tail == NULL) { + dsl_dir_close(newpds, FTAG); + return (EEXIST); + } + + rw_enter(&dp->dp_config_rwlock, RW_WRITER); + + /* There should be 2 references: the open and the dirty */ + if (dmu_buf_refcount(dd->dd_dbuf) > 2) { + rw_exit(&dp->dp_config_rwlock); + dsl_dir_close(newpds, FTAG); + return (EBUSY); + } + + if (newpds != dd->dd_parent) { + dsl_dir_t *ancestor; + int64_t adelta; + uint64_t myspace, avail; + + ancestor = closest_common_ancestor(dd, newpds); + + /* no rename into our descendent */ + if (ancestor == dd) { + dsl_dir_close(newpds, FTAG); + rw_exit(&dp->dp_config_rwlock); + return (EINVAL); + } + + myspace = MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved); + adelta = would_change(dd->dd_parent, -myspace, ancestor); + avail = dsl_dir_space_available(newpds, + ancestor, adelta, FALSE); + if (avail < myspace) { + dsl_dir_close(newpds, FTAG); + rw_exit(&dp->dp_config_rwlock); + return (ENOSPC); + } + + /* The point of no 
(unsuccessful) return */ + + dsl_dir_diduse_space(dd->dd_parent, -myspace, + -dd->dd_phys->dd_compressed_bytes, + -dd->dd_phys->dd_uncompressed_bytes, tx); + dsl_dir_diduse_space(newpds, myspace, + dd->dd_phys->dd_compressed_bytes, + dd->dd_phys->dd_uncompressed_bytes, tx); + } + + /* The point of no (unsuccessful) return */ + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + + /* remove from old parent zapobj */ + err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj, + dd->dd_myname, tx); + ASSERT3U(err, ==, 0); + + (void) strcpy(dd->dd_myname, tail); + dsl_dir_close(dd->dd_parent, dd); + dd->dd_phys->dd_parent_obj = newpds->dd_object; + dd->dd_parent = dsl_dir_open_obj(dd->dd_pool, + newpds->dd_object, NULL, dd); + + /* add to new parent zapobj */ + err = zap_add(mos, newpds->dd_phys->dd_child_dir_zapobj, + dd->dd_myname, 8, 1, &dd->dd_object, tx); + ASSERT3U(err, ==, 0); + + dsl_dir_close(newpds, FTAG); + rw_exit(&dp->dp_config_rwlock); + return (0); +} diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c new file mode 100644 index 0000000000..5b71ccfaa9 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -0,0 +1,233 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/zap.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>

/* internal reserved dir name */
#define	MOS_DIR_NAME "$MOS"

/*
 * Look up the reserved "$MOS" child of the root dir and open it.
 * Asserts success; presumably only called when the entry is known to
 * exist (pool open/create) — TODO confirm.
 */
static dsl_dir_t *
dsl_pool_open_mos_dir(dsl_pool_t *dp)
{
	uint64_t obj;
	int err;

	err = zap_lookup(dp->dp_meta_objset,
	    dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
	    MOS_DIR_NAME, sizeof (obj), 1, &obj);
	ASSERT3U(err, ==, 0);

	return (dsl_dir_open_obj(dp, obj, MOS_DIR_NAME, dp));
}

/*
 * Allocate and initialize the in-core pool state shared by
 * dsl_pool_open() and dsl_pool_create(): copy the root blkptr,
 * start txg machinery, and set up the dirty/synced lists.
 */
static dsl_pool_t *
dsl_pool_open_impl(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp;
	blkptr_t *bp = spa_get_rootblkptr(spa);

	dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
	dp->dp_spa = spa;
	dp->dp_meta_rootbp = *bp;
	txg_init(dp, txg);

	txg_list_create(&dp->dp_dirty_datasets,
	    offsetof(dsl_dataset_t, ds_dirty_link));
	txg_list_create(&dp->dp_dirty_dirs,
	    offsetof(dsl_dir_t, dd_dirty_link));
	list_create(&dp->dp_synced_objsets, sizeof (dsl_dataset_t),
	    offsetof(dsl_dataset_t, ds_synced_link));

	return (dp);
}

/*
 * Open an existing pool: open the meta-objset from the root blkptr,
 * find the root dataset's dir object in the pool directory, and open
 * the root and $MOS dirs.  Lookup failures are asserted away.
 */
dsl_pool_t *
dsl_pool_open(spa_t *spa, uint64_t txg)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);

	dp->dp_meta_objset =
	    &dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp)->os;

	rw_enter(&dp->dp_config_rwlock, RW_READER);
	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
	    &dp->dp_root_dir_obj);
	ASSERT3U(err, ==, 0);

	dp->dp_root_dir = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp);
	dp->dp_mos_dir = dsl_pool_open_mos_dir(dp);
	rw_exit(&dp->dp_config_rwlock);

	return (dp);
}

/*
 * Tear down a pool opened/created above: release dir holds, evict the
 * meta-objset, destroy the txg lists, and free the pool struct.
 */
void
dsl_pool_close(dsl_pool_t *dp)
{
	/* drop our reference from dsl_pool_open() */
	dsl_dir_close(dp->dp_mos_dir, dp);
	dsl_dir_close(dp->dp_root_dir, dp);

	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
	dmu_objset_evict(NULL, dp->dp_meta_objset->os);

	txg_list_destroy(&dp->dp_dirty_datasets);
	txg_list_destroy(&dp->dp_dirty_dirs);
	list_destroy(&dp->dp_synced_objsets);

	arc_flush();
	txg_fini(dp);
	kmem_free(dp, sizeof (dsl_pool_t));
}

/*
 * Create a brand-new pool: make the meta-objset, the pool directory
 * ZAP, the root dataset/dir, and the $MOS dir, all in an assigned tx.
 */
dsl_pool_t *
dsl_pool_create(spa_t *spa, uint64_t txg)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
	dp->dp_meta_objset = &dmu_objset_create_impl(spa,
	    NULL, DMU_OST_META, tx)->os;

	/* create the pool directory */
	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
	ASSERT3U(err, ==, 0);

	/* create and open the root dir */
	dsl_dataset_create_root(dp, &dp->dp_root_dir_obj, tx);
	dp->dp_root_dir = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp);

	/* create and open the meta-objset dir */
	err = dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME,
	    tx);
	ASSERT3U(err, ==, 0);
	dp->dp_mos_dir = dsl_pool_open_mos_dir(dp);

	dmu_tx_commit(tx);

	return (dp);
}

/*
 * Sync all dirty datasets and dirs for txg to disk, then sync the
 * meta-objset itself if it became dirty, updating the root blkptr.
 */
void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
	dmu_tx_t *tx;
	objset_impl_t *mosi = dp->dp_meta_objset->os;

	tx = dmu_tx_create_assigned(dp, txg);

	do {
		dsl_dir_t *dd;
		dsl_dataset_t *ds;

		while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
			if (!list_link_active(&ds->ds_synced_link))
				list_insert_tail(&dp->dp_synced_objsets, ds);
			dsl_dataset_sync(ds, tx);
		}
		while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
			dsl_dir_sync(dd, tx);
		/*
		 * We need to loop since dsl_dir_sync() could create a
		 * new (dirty) objset.
		 * XXX - isn't this taken care of by the spa's sync to
		 * convergence loop?
		 */
	} while (!txg_list_empty(&dp->dp_dirty_datasets, txg));

	if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
	    list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) {
		dmu_objset_sync(mosi, tx);
		dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
	}

	dmu_tx_commit(tx);
}

/*
 * Drain the synced-objsets list, cleaning each dataset's ZIL.
 * NOTE(review): asserts ds_user_ptr != NULL — presumably every synced
 * dataset here has an objset user attached; verify against callers.
 */
void
dsl_pool_zil_clean(dsl_pool_t *dp)
{
	dsl_dataset_t *ds;

	while (ds = list_head(&dp->dp_synced_objsets)) {
		list_remove(&dp->dp_synced_objsets, ds);
		ASSERT(ds->ds_user_ptr != NULL);
		zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil);
	}
}

/*
 * Return nonzero if the current thread is operating in syncing context
 * (or the pool has no root blkptr yet, i.e. is being created).
 */
int
dsl_pool_sync_context(dsl_pool_t *dp)
{
	/*
	 * Yeah, this is cheesy.  But the SPA needs some way to let
	 * the sync threads invoke spa_open() and spa_close() while
	 * it holds the namespace lock.  I'm certainly open to better
	 * ideas for how to determine whether the current thread is
	 * operating on behalf of spa_sync().  This works for now.
	 */
	return (curthread == dp->dp_tx.tx_sync_thread ||
	    BP_IS_HOLE(&dp->dp_meta_rootbp));
}

/*
 * Pool size minus a slop reservation for allocation efficiency.
 */
uint64_t
dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
{
	uint64_t space, resv;

	/*
	 * Reserve about 1% (1/128), or at least 16MB, for allocation
	 * efficiency.
	 * XXX The intent log is not accounted for, so it must fit
	 * within this slop.
	 *
	 * If we're trying to assess whether it's OK to do a free,
	 * cut the reservation in half to allow forward progress
	 * (e.g. make it possible to rm(1) files from a full pool).
	 */
	space = spa_get_space(dp->dp_spa);
	resv = MAX(space >> 7, SPA_MINDEVSIZE >> 2);
	if (netfree)
		resv >>= 1;

	return (space - resv);
}
diff --git a/usr/src/uts/common/fs/zfs/dsl_prop.c b/usr/src/uts/common/fs/zfs/dsl_prop.c
new file mode 100644
index 0000000000..bd54263507
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dsl_prop.c
@@ -0,0 +1,367 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/spa.h>
#include <sys/zio_checksum.h> /* for the default checksum value */
#include <sys/zap.h>
#include <sys/fs/zfs.h>

#include "zfs_prop.h"

/*
 * Fill buf with the default value of the named property.
 * Returns ENOENT for unknown/readonly props, EOVERFLOW if buf's
 * element size/count (intsz/numint) doesn't fit the prop's type.
 */
static int
dodefault(const char *propname, int intsz, int numint, void *buf)
{
	zfs_prop_t prop;

	if ((prop = zfs_name_to_prop(propname)) == ZFS_PROP_INVAL ||
	    zfs_prop_readonly(prop))
		return (ENOENT);

	if (zfs_prop_get_type(prop) == prop_type_string) {
		if (intsz != 1)
			return (EOVERFLOW);
		zfs_prop_default_string(prop, buf, numint);
	} else {
		if (intsz != 8 || numint < 1)
			return (EOVERFLOW);

		*(uint64_t *)buf = zfs_prop_default_numeric(prop);
	}

	return (0);
}

/*
 * Walk up the dir tree from ddobj looking for the first explicit
 * setting of propname; fall back to the prop's default.  If setpoint
 * is non-NULL it receives the name of the dir where the value was
 * found (empty string if defaulted).  Caller holds dp_config_rwlock.
 */
static int
dsl_prop_get_impl(dsl_pool_t *dp, uint64_t ddobj, const char *propname,
    int intsz, int numint, void *buf, char *setpoint)
{
	int err = 0;
	objset_t *mos = dp->dp_meta_objset;

	if (setpoint)
		setpoint[0] = '\0';

	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock));

	while (ddobj != 0) {
		dsl_dir_t *dd = dsl_dir_open_obj(dp, ddobj, NULL, FTAG);
		err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj,
		    propname, intsz, numint, buf);
		if (err != ENOENT) {
			if (setpoint)
				dsl_dir_name(dd, setpoint);
			dsl_dir_close(dd, FTAG);
			break;
		}
		ASSERT3U(err, ==, ENOENT);
		ddobj = dd->dd_phys->dd_parent_obj;
		dsl_dir_close(dd, FTAG);
	}
	if (err == ENOENT)
		err = dodefault(propname, intsz, numint, buf);

	return (err);
}

/*
 * Register interest in the named property.  We'll call the callback
 * once to notify it of the current property value, and again each time
 * the property changes, until this callback is unregistered.
 *
 * Return 0 on success, errno if the prop is not an integer value.
 */
int
dsl_prop_register(dsl_dataset_t *ds, const char *propname,
    dsl_prop_changed_cb_t *callback, void *cbarg)
{
	dsl_dir_t *dd;
	uint64_t value;
	dsl_prop_cb_record_t *cbr;
	int err;

	dd = ds->ds_dir;

	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);

	err = dsl_prop_get_impl(dd->dd_pool, dd->dd_object, propname,
	    8, 1, &value, NULL);
	if (err == ENOENT) {
		err = 0;
		value = DSL_PROP_VALUE_UNDEFINED;
	}
	if (err != 0) {
		rw_exit(&dd->dd_pool->dp_config_rwlock);
		return (err);
	}

	/* record the callback on the dir's list, then fire it once */
	cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP);
	cbr->cbr_propname = kmem_alloc(strlen(propname)+1, KM_SLEEP);
	(void) strcpy((char *)cbr->cbr_propname, propname);
	cbr->cbr_func = callback;
	cbr->cbr_arg = cbarg;
	mutex_enter(&dd->dd_lock);
	list_insert_head(&dd->dd_prop_cbs, cbr);
	mutex_exit(&dd->dd_lock);

	cbr->cbr_func(cbr->cbr_arg, value);

	/* extra hold on the dir, tagged by cbr; released in unregister */
	(void) dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, cbr);
	rw_exit(&dd->dd_pool->dp_config_rwlock);
	/* Leave dataset open until this callback is unregistered */
	return (0);
}

/*
 * Get a property for an already-open dir, taking the config lock.
 */
int
dsl_prop_get_ds(dsl_dir_t *dd, const char *propname,
    int intsz, int numints, void *buf, char *setpoint)
{
	int err;

	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
	err = dsl_prop_get_impl(dd->dd_pool, dd->dd_object,
	    propname, intsz, numints, buf, setpoint);
	rw_exit(&dd->dd_pool->dp_config_rwlock);

	return (err);
}

/*
 * Get a property by dataset name.  A trailing component is tolerated
 * only if it is a snapshot name ("@..."); otherwise ENOENT.
 */
int
dsl_prop_get(const char *ddname, const char *propname,
    int intsz, int numints, void *buf, char *setpoint)
{
	dsl_dir_t *dd;
	const char *tail;
	int err;

	dd = dsl_dir_open(ddname, FTAG, &tail);
	if (dd == NULL)
		return (ENOENT);
	if (tail && tail[0] != '@') {
		dsl_dir_close(dd, FTAG);
		return (ENOENT);
	}

	err = dsl_prop_get_ds(dd, propname, intsz, numints, buf, setpoint);

	dsl_dir_close(dd, FTAG);
	return (err);
}

/*
 * Return 0 on success, ENOENT if ddname is invalid, EOVERFLOW if
 * valuelen not big enough.
 */
int
dsl_prop_get_string(const char *ddname, const char *propname,
    char *value, int valuelen, char *setpoint)
{
	return (dsl_prop_get(ddname, propname, 1, valuelen, value, setpoint));
}

/*
 * Get the current property value.  It may have changed by the time this
 * function returns, so it is NOT safe to follow up with
 * dsl_prop_register() and assume that the value has not changed in
 * between.
 *
 * Return 0 on success, ENOENT if ddname is invalid.
 */
int
dsl_prop_get_integer(const char *ddname, const char *propname,
    uint64_t *valuep, char *setpoint)
{
	return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint));
}

/* Integer-prop convenience wrapper for an already-open dir. */
int
dsl_prop_get_ds_integer(dsl_dir_t *dd, const char *propname,
    uint64_t *valuep, char *setpoint)
{
	return (dsl_prop_get_ds(dd, propname, 8, 1, valuep, setpoint));
}

/*
 * Unregister this callback.  Return 0 on success, ENOENT if ddname is
 * invalid, ENOMSG if no matching callback registered.
 */
int
dsl_prop_unregister(dsl_dataset_t *ds, const char *propname,
    dsl_prop_changed_cb_t *callback, void *cbarg)
{
	dsl_dir_t *dd;
	dsl_prop_cb_record_t *cbr;

	dd = ds->ds_dir;

	mutex_enter(&dd->dd_lock);
	for (cbr = list_head(&dd->dd_prop_cbs);
	    cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
		if (strcmp(cbr->cbr_propname, propname) == 0 &&
		    cbr->cbr_func == callback &&
		    cbr->cbr_arg == cbarg)
			break;
	}

	if (cbr == NULL) {
		mutex_exit(&dd->dd_lock);
		return (ENOMSG);
	}

	list_remove(&dd->dd_prop_cbs, cbr);
	mutex_exit(&dd->dd_lock);
	kmem_free((void*)cbr->cbr_propname, strlen(cbr->cbr_propname)+1);
	kmem_free(cbr, sizeof (dsl_prop_cb_record_t));

	/* Clean up from dsl_prop_register */
	dsl_dir_close(dd, cbr);
	return (0);
}

/*
 * Notify all registered callbacks under ddobj (recursively) that
 * propname's inherited value is now "value".  Recursion stops at any
 * dir that sets the prop locally.  Caller holds dp_config_rwlock as
 * writer.
 */
static void
dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
    const char *propname, uint64_t value, int first)
{
	dsl_dir_t *dd;
	dsl_prop_cb_record_t *cbr;
	objset_t *mos = dp->dp_meta_objset;
	int err;

	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
	dd = dsl_dir_open_obj(dp, ddobj, NULL, FTAG);

	if (!first) {
		/*
		 * If the prop is set here, then this change is not
		 * being inherited here or below; stop the recursion.
		 */
		err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname,
		    8, 1, &value);
		if (err == 0) {
			dsl_dir_close(dd, FTAG);
			return;
		}
		ASSERT3U(err, ==, ENOENT);
	}

	mutex_enter(&dd->dd_lock);
	for (cbr = list_head(&dd->dd_prop_cbs);
	    cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
		if (strcmp(cbr->cbr_propname, propname) == 0) {
			cbr->cbr_func(cbr->cbr_arg, value);
		}
	}
	mutex_exit(&dd->dd_lock);

	if (dd->dd_phys->dd_child_dir_zapobj) {
		zap_cursor_t zc;
		zap_attribute_t za;

		for (zap_cursor_init(&zc, mos,
		    dd->dd_phys->dd_child_dir_zapobj);
		    zap_cursor_retrieve(&zc, &za) == 0;
		    zap_cursor_advance(&zc)) {
			/* XXX recursion could blow stack; esp. za! */
			dsl_prop_changed_notify(dp, za.za_first_integer,
			    propname, value, FALSE);
		}
	}
	dsl_dir_close(dd, FTAG);
}

/* argument bundle for dsl_prop_set_sync() */
struct prop_set_arg {
	const char *name;
	int intsz;
	int numints;
	const void *buf;
};

/*
 * Sync-task body for dsl_prop_set(): update (or, when numints == 0,
 * remove) the prop in the dir's props ZAP, then notify callbacks of
 * the new effective integer value.
 */
static int
dsl_prop_set_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
{
	struct prop_set_arg *psa = arg;
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	uint64_t zapobj = dd->dd_phys->dd_props_zapobj;
	uint64_t intval;
	int err, isint;

	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_WRITER);

	isint = (dodefault(psa->name, 8, 1, &intval) == 0);

	if (psa->numints == 0) {
		/* removal: the inherited value becomes effective */
		err = zap_remove(mos, zapobj, psa->name, tx);
		if (err == ENOENT) /* that's fine. */
			err = 0;
		if (err == 0 && isint) {
			err = dsl_prop_get_impl(dd->dd_pool,
			    dd->dd_phys->dd_parent_obj, psa->name,
			    8, 1, &intval, NULL);
		}
	} else {
		err = zap_update(mos, zapobj, psa->name,
		    psa->intsz, psa->numints, psa->buf, tx);
		if (isint)
			intval = *(uint64_t *)psa->buf;
	}

	if (err == 0 && isint) {
		dsl_prop_changed_notify(dd->dd_pool,
		    dd->dd_object, psa->name, intval, TRUE);
	}
	rw_exit(&dd->dd_pool->dp_config_rwlock);

	return (err);
}

/*
 * Set (or remove, when numints == 0) a property on the named dir via
 * a sync task.  Returns ENOENT if ddname is invalid.
 */
int
dsl_prop_set(const char *ddname, const char *propname,
    int intsz, int numints, const void *buf)
{
	dsl_dir_t *dd;
	int err;
	struct prop_set_arg psa;

	dd = dsl_dir_open(ddname, FTAG, NULL);
	if (dd == NULL)
		return (ENOENT);

	psa.name = propname;
	psa.intsz = intsz;
	psa.numints = numints;
	psa.buf = buf;
	err = dsl_dir_sync_task(dd, dsl_prop_set_sync, &psa, 0);

	dsl_dir_close(dd, FTAG);

	return (err);
}
diff --git a/usr/src/uts/common/fs/zfs/fletcher.c b/usr/src/uts/common/fs/zfs/fletcher.c
new file mode 100644
index 0000000000..03186d1387
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/fletcher.c
@@ -0,0 +1,100 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/byteorder.h> +#include <sys/spa.h> + +void +fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + const uint64_t *ip = buf; + const uint64_t *ipend = ip + (size / sizeof (uint64_t)); + uint64_t a0, b0, a1, b1; + + for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { + a0 += ip[0]; + a1 += ip[1]; + b0 += a0; + b1 += a1; + } + + ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); +} + +void +fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + const uint64_t *ip = buf; + const uint64_t *ipend = ip + (size / sizeof (uint64_t)); + uint64_t a0, b0, a1, b1; + + for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { + a0 += BSWAP_64(ip[0]); + a1 += BSWAP_64(ip[1]); + b0 += a0; + b1 += a1; + } + + ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); +} + +void +fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + + for (a = b = c = d = 0; ip < ipend; ip++) { + a += ip[0]; + b += a; + c += b; + d += c; + } + + ZIO_SET_CHECKSUM(zcp, a, b, c, d); +} + +void +fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + + for (a = b = c = d = 0; ip < ipend; ip++) { + a += BSWAP_32(ip[0]); + b += a; + c += b; + d += c; + } + + ZIO_SET_CHECKSUM(zcp, a, b, c, d); +} diff --git a/usr/src/uts/common/fs/zfs/lzjb.c b/usr/src/uts/common/fs/zfs/lzjb.c new file mode 100644 index 
0000000000..5979a55ef7
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/lzjb.c
@@ -0,0 +1,125 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This is stolen from common/os/compress.c and will be removed once
 * our changes have made it into the on10 source base.
 *
 * In particular, we are adding the "feature" that compress() can
 * take a destination buffer size and return -1 if the data will not
 * compress to d_len or less.
 */

#include <sys/types.h>

#define	MATCH_BITS	6
#define	MATCH_MIN	3
#define	MATCH_MAX	((1 << MATCH_BITS) + (MATCH_MIN - 1))
#define	OFFSET_MASK	((1 << (16 - MATCH_BITS)) - 1)
#define	LEMPEL_SIZE	256

/*
 * LZJB-compress s_len bytes from s_start into d_start.  Returns the
 * compressed length, or s_len if the data does not fit in d_len
 * (when d_len == s_len the input is copied through verbatim).
 * The lempel[] hash table is deliberately left uninitialized: stale
 * entries only cause missed/false matches, which the byte-compare
 * below makes harmless.
 */
size_t
lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len)
{
	uchar_t *src = s_start;
	uchar_t *dst = d_start;
	uchar_t *cpy, *copymap;
	int copymask = 1 << (NBBY - 1);
	int mlen, offset;
	uint16_t *hp;
	uint16_t lempel[LEMPEL_SIZE];	/* uninitialized; see above */

	while (src < (uchar_t *)s_start + s_len) {
		if ((copymask <<= 1) == (1 << NBBY)) {
			/* output nearly full: bail out (or copy through) */
			if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) {
				if (d_len != s_len)
					return (s_len);
				mlen = s_len;
				for (src = s_start, dst = d_start; mlen; mlen--)
					*dst++ = *src++;
				return (s_len);
			}
			copymask = 1;
			copymap = dst;
			*dst++ = 0;
		}
		if (src > (uchar_t *)s_start + s_len - MATCH_MAX) {
			*dst++ = *src++;
			continue;
		}
		hp = &lempel[((src[0] + 13) ^ (src[1] - 13) ^ src[2]) &
		    (LEMPEL_SIZE - 1)];
		offset = (intptr_t)(src - *hp) & OFFSET_MASK;
		*hp = (uint16_t)(uintptr_t)src;
		cpy = src - offset;
		if (cpy >= (uchar_t *)s_start && cpy != src &&
		    src[0] == cpy[0] && src[1] == cpy[1] && src[2] == cpy[2]) {
			/* emit a (length, offset) back-reference */
			*copymap |= copymask;
			for (mlen = MATCH_MIN; mlen < MATCH_MAX; mlen++)
				if (src[mlen] != cpy[mlen])
					break;
			*dst++ = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) |
			    (offset >> NBBY);
			*dst++ = (uchar_t)offset;
			src += mlen;
		} else {
			*dst++ = *src++;
		}
	}
	return (dst - (uchar_t *)d_start);
}

/*
 * Decompress into d_start, stopping after d_len output bytes.
 * Returns 0 on success, -1 if a back-reference points before the
 * start of the output buffer (corrupt input).
 */
/*ARGSUSED*/
int
lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len)
{
	uchar_t *src = s_start;
	uchar_t *dst = d_start;
	uchar_t *d_end = (uchar_t *)d_start + d_len;
	uchar_t *cpy, copymap;
	int copymask = 1 << (NBBY - 1);

	while (dst < d_end) {
		if ((copymask <<= 1) == (1 << NBBY)) {
			copymask = 1;
			copymap = *src++;
		}
		if (copymap & copymask) {
			int mlen = (src[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN;
			int offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK;
			src += 2;
			if ((cpy = dst - offset) < (uchar_t *)d_start)
				return (-1);
			while (--mlen >= 0 && dst < d_end)
				*dst++ = *cpy++;
		} else {
			*dst++ = *src++;
		}
	}
	return (0);
}
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
new file mode 100644
index 0000000000..9d682e4990
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -0,0 +1,796 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>

/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
/* Allocate an empty metaslab class (no groups on the rotor yet). */
metaslab_class_t *
metaslab_class_create(void)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_rotor = NULL;

	return (mc);
}

/* Remove and destroy every group on the rotor, then free the class. */
void
metaslab_class_destroy(metaslab_class_t *mc)
{
	metaslab_group_t *mg;

	while ((mg = mc->mc_rotor) != NULL) {
		metaslab_class_remove(mc, mg);
		metaslab_group_destroy(mg);
	}

	kmem_free(mc, sizeof (metaslab_class_t));
}

/*
 * Insert mg into the class's circular doubly-linked rotor list and
 * make it the current rotor position.
 */
void
metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg)
{
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(mg->mg_class == NULL);

	if ((mgprev = mc->mc_rotor) == NULL) {
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
	mg->mg_class = mc;
}

/* Unlink mg from the class's rotor list (inverse of metaslab_class_add). */
void
metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
{
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(mg->mg_class == mc);

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
	mg->mg_class = NULL;
}

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
/*
 * AVL comparator: order metaslabs by descending weight, breaking ties
 * by ascending start offset so entries are always unique.
 */
static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = x1;
	const metaslab_t *m2 = x2;

	if (m1->ms_weight < m2->ms_weight)
		return (1);
	if (m1->ms_weight > m2->ms_weight)
		return (-1);

	/*
	 * If the weights are identical, use the offset to force uniqueness.
	 */
	if (m1->ms_map.sm_start < m2->ms_map.sm_start)
		return (-1);
	if (m1->ms_map.sm_start > m2->ms_map.sm_start)
		return (1);

	ASSERT3P(m1, ==, m2);

	return (0);
}

/* Allocate a group for vdev vd and add it to class mc's rotor. */
metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_aliquot = 2ULL << 20;		/* XXX -- tweak me */
	mg->mg_vd = vd;
	metaslab_class_add(mc, mg);

	return (mg);
}

/* Free a group; presumably already empty and off the rotor — TODO confirm. */
void
metaslab_group_destroy(metaslab_group_t *mg)
{
	avl_destroy(&mg->mg_metaslab_tree);
	mutex_destroy(&mg->mg_lock);
	kmem_free(mg, sizeof (metaslab_group_t));
}

/* Add msp to mg's weight-sorted tree with the given initial weight. */
void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == NULL);
	msp->ms_group = mg;
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

/* Remove msp from its group's tree. */
void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}

/* Re-sort msp within the tree after a weight change (remove/re-add). */
void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

/*
 * ==========================================================================
 * Metaslabs
 * ==========================================================================
 */
/*
 * Create a metaslab covering [start, start + size) of mg's vdev, backed
 * by on-disk space map smo, and hook it up for txg.
 */
void
metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, metaslab_t **mspp,
    uint64_t start, uint64_t size, uint64_t txg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_t *msp;
	int fm;

	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);

	msp->ms_smo = smo;

	space_map_create(&msp->ms_map, start, size, vd->vdev_ashift,
	    &msp->ms_lock);

	for (fm = 0; fm < TXG_SIZE; fm++) {
		space_map_create(&msp->ms_allocmap[fm], start, size,
		    vd->vdev_ashift, &msp->ms_lock);
		space_map_create(&msp->ms_freemap[fm], start, size,
		    vd->vdev_ashift, &msp->ms_lock);
	}

	/*
	 * If we're opening an existing pool (txg == 0) or creating
	 * a new one (txg == TXG_INITIAL), all space is available now.
	 * If we're adding space to an existing pool, the new space
	 * does not become available until after this txg has synced.
	 * We enforce this by assigning an initial weight of 0 to new space.
	 *
	 * (Transactional allocations for this txg would actually be OK;
	 * it's intent log allocations that cause trouble.  If we wrote
	 * a log block in this txg and lost power, the log replay would be
	 * based on the DVA translations that had been synced in txg - 1.
	 * Those translations would not include this metaslab's vdev.)
	 */
	metaslab_group_add(mg, msp, txg > TXG_INITIAL ? 0 : size);

	if (txg == 0) {
		/*
		 * We're opening the pool.  Make the metaslab's
		 * free space available immediately.
		 */
		vdev_space_update(vd, size, smo->smo_alloc);
		metaslab_sync_done(msp, 0);
	} else {
		/*
		 * We're adding a new metaslab to an already-open pool.
		 * Declare all of the metaslab's space to be free.
		 *
		 * Note that older transaction groups cannot allocate
		 * from this metaslab until its existence is committed,
		 * because we set ms_last_alloc to the current txg.
		 */
		smo->smo_alloc = 0;
		msp->ms_usable_space = size;
		mutex_enter(&msp->ms_lock);
		space_map_add(&msp->ms_map, start, size);
		msp->ms_map_incore = 1;
		mutex_exit(&msp->ms_lock);

		/* XXX -- we'll need a call to picker_init here */
		msp->ms_dirty[txg & TXG_MASK] |= MSD_ADD;
		msp->ms_last_alloc = txg;
		vdev_dirty(vd, VDD_ADD, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, txg);
	}

	*mspp = msp;
}

/*
 * Tear down a metaslab: return its space accounting to the vdev,
 * detach it from its group, and free all of its space maps.
 */
void
metaslab_fini(metaslab_t *msp)
{
	int fm;
	metaslab_group_t *mg = msp->ms_group;

	vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
	    -msp->ms_smo->smo_alloc);

	metaslab_group_remove(mg, msp);

	/* XXX -- we'll need a call to picker_fini here */

	mutex_enter(&msp->ms_lock);

	space_map_vacate(&msp->ms_map, NULL, NULL);
	msp->ms_map_incore = 0;
	space_map_destroy(&msp->ms_map);

	for (fm = 0; fm < TXG_SIZE; fm++) {
		space_map_destroy(&msp->ms_allocmap[fm]);
		space_map_destroy(&msp->ms_freemap[fm]);
	}

	mutex_exit(&msp->ms_lock);

	kmem_free(msp, sizeof (metaslab_t));
}

/*
 * Write a metaslab to disk in the context of the specified transaction group.
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *os = spa->spa_meta_objset;
	space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
	space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_obj_t *smo = msp->ms_smo;
	uint8_t *dirty = &msp->ms_dirty[txg & TXG_MASK];
	uint64_t alloc_delta;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	dprintf("%s offset %llx\n", vdev_description(vd), msp->ms_map.sm_start);

	mutex_enter(&msp->ms_lock);

	if (*dirty & MSD_ADD)
		vdev_space_update(vd, msp->ms_map.sm_size, 0);

	if (*dirty & (MSD_ALLOC | MSD_FREE)) {
		tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

		/*
		 * First sync for this metaslab: create its space map
		 * object and record it in the vdev's ms_array.
		 */
		if (smo->smo_object == 0) {
			ASSERT(smo->smo_objsize == 0);
			ASSERT(smo->smo_alloc == 0);
			smo->smo_object = dmu_object_alloc(os,
			    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
			    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
			ASSERT(smo->smo_object != 0);
			dmu_write(os, vd->vdev_ms_array, sizeof (uint64_t) *
			    (msp->ms_map.sm_start >> vd->vdev_ms_shift),
			    sizeof (uint64_t), &smo->smo_object, tx);
		}

		alloc_delta = allocmap->sm_space - freemap->sm_space;
		vdev_space_update(vd, 0, alloc_delta);
		smo->smo_alloc += alloc_delta;

		if (msp->ms_last_alloc == txg && msp->ms_map.sm_space == 0 &&
		    (*dirty & MSD_CONDENSE) == 0) {
			space_map_t *sm = &msp->ms_map;
			space_map_t *tsm;
			int i;

			ASSERT(msp->ms_map_incore);

			space_map_merge(freemap, freed_map);
			space_map_vacate(allocmap, NULL, NULL);

			/*
			 * Write out the current state of the allocation
			 * world.  The current metaslab is full, minus
			 * stuff that's been freed this txg (freed_map),
			 * minus allocations from txgs in the future.
			 */
			space_map_add(sm, sm->sm_start, sm->sm_size);
			for (i = 1; i < TXG_CONCURRENT_STATES; i++) {
				tsm = &msp->ms_allocmap[(txg + i) & TXG_MASK];
				space_map_iterate(tsm, space_map_remove, sm);
			}
			space_map_iterate(freed_map, space_map_remove, sm);

			space_map_write(sm, smo, os, tx);

			ASSERT(sm->sm_space == 0);
			ASSERT(freemap->sm_space == 0);
			ASSERT(allocmap->sm_space == 0);

			*dirty |= MSD_CONDENSE;
		} else {
			/* Append this txg's allocs and frees incrementally. */
			space_map_sync(allocmap, NULL, smo, SM_ALLOC, os, tx);
			space_map_sync(freemap, freed_map, smo, SM_FREE,
			    os, tx);
		}

		/* Push the updated space map header out via the bonus buf. */
		db = dmu_bonus_hold(os, smo->smo_object);
		dmu_buf_will_dirty(db, tx);
		ASSERT3U(db->db_size, ==, sizeof (*smo));
		bcopy(smo, db->db_data, db->db_size);
		dmu_buf_rele(db);

		dmu_tx_commit(tx);
	}

	*dirty &= ~(MSD_ALLOC | MSD_FREE | MSD_ADD);

	mutex_exit(&msp->ms_lock);

	/* Revisit this metaslab when the txg is fully synced. */
	(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
}

/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
	uint64_t weight;
	uint8_t *dirty = &msp->ms_dirty[txg & TXG_MASK];
	space_map_obj_t *smo = msp->ms_smo;

	dprintf("%s offset %llx txg %llu\n",
	    vdev_description(msp->ms_group->mg_vd), msp->ms_map.sm_start, txg);

	mutex_enter(&msp->ms_lock);

	ASSERT3U((*dirty & (MSD_ALLOC | MSD_FREE | MSD_ADD)), ==, 0);

	msp->ms_usable_space = msp->ms_map.sm_size - smo->smo_alloc;
	msp->ms_usable_end = smo->smo_objsize;

	weight = msp->ms_usable_space;

	if (txg != 0) {
		space_map_t *freed_map =
		    &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];

		/* XXX -- we'll need a call to picker_fini here */

		/* If we're empty, don't bother sticking around */
		if (msp->ms_usable_space == 0) {
			space_map_vacate(&msp->ms_map, NULL, NULL);
			msp->ms_map_incore = 0;
			ASSERT3U(freed_map->sm_space, ==, 0);
			weight = 0;
		} else {
			/* Add the freed blocks to the available space map */
			if (msp->ms_map_incore)
				space_map_merge(freed_map, &msp->ms_map);
			else
				space_map_vacate(freed_map, NULL, NULL);
			/* In-core maps get a weight bonus of one map size. */
			weight += msp->ms_map.sm_size;
		}

		if (msp->ms_last_alloc == txg)
			/* Safe to use for allocation now */
			msp->ms_last_alloc = 0;

		*dirty = 0;
	}

	mutex_exit(&msp->ms_lock);

	metaslab_group_sort(msp->ms_group, msp, weight);
}

/*
 * The first-fit block picker.  No picker_init or picker_fini,
 * this is just an experiment to see how it feels to separate out
 * the block selection policy from the map updates.
 * Note: the 'cursor' argument is a form of PPD.
 */
static uint64_t
metaslab_pick_block(space_map_t *sm, uint64_t size, uint64_t *cursor)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t align = size & -size;	/* largest power of 2 dividing size */
	space_seg_t *ss, ssearch;
	avl_index_t where;
	int tried_once = 0;

again:
	ssearch.ss_start = *cursor;
	ssearch.ss_end = *cursor + size;

	ss = avl_find(t, &ssearch, &where);
	if (ss == NULL)
		ss = avl_nearest(t, where, AVL_AFTER);

	while (ss != NULL) {
		uint64_t offset = P2ROUNDUP(ss->ss_start, align);

		if (offset + size <= ss->ss_end) {
			*cursor = offset + size;
			return (offset);
		}
		ss = AVL_NEXT(t, ss);
	}

	/* If we couldn't find a block after cursor, search again */
	if (tried_once == 0) {
		tried_once = 1;
		*cursor = 0;
		goto again;
	}

	/* No free segment can satisfy the request. */
	return (-1ULL);
}

/*
 * Carve 'size' bytes out of the in-core free map and record the
 * allocation in this txg's allocmap.  Caller must hold ms_lock and
 * have the map loaded.  Returns the offset, or -1ULL on failure.
 */
static uint64_t
metaslab_getblock(metaslab_t *msp, uint64_t size, uint64_t txg)
{
	space_map_t *sm = &msp->ms_map;
	vdev_t *vd = msp->ms_group->mg_vd;
	uint64_t offset;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(msp->ms_map_incore);
	ASSERT(sm->sm_space != 0);
	ASSERT(P2PHASE(size, 1ULL << vd->vdev_ashift) == 0);

	/* One cursor per power-of-2 alignment class, indexed above ashift. */
	offset = metaslab_pick_block(sm, size,
	    &msp->ms_map_cursor[highbit(size & -size) - vd->vdev_ashift - 1]);
	if (offset != -1ULL) {
		space_map_remove(sm, offset, size);
		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
	}
	return (offset);
}

/*
 * Intent log support: upon opening the pool after a crash, notify the SPA
 * of blocks that the intent log has allocated for immediate write, but
 * which are still considered free by the SPA because the last transaction
 * group didn't commit yet.
 */
int
metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	objset_t *os = spa->spa_meta_objset;
	vdev_t *vd;
	metaslab_t *msp;
	space_map_t *sm;
	space_map_obj_t *smo;
	int error;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL)
		return (ENXIO);

	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
		return (ENXIO);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
	sm = &msp->ms_map;
	smo = msp->ms_smo;

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	/* Fault in the free map from disk if it isn't resident yet. */
	if (msp->ms_map_incore == 0) {
		error = space_map_load(sm, smo, SM_FREE, os,
		    msp->ms_usable_end, sm->sm_size - msp->ms_usable_space);
		ASSERT(error == 0);
		if (error) {
			mutex_exit(&msp->ms_lock);
			return (error);
		}
		msp->ms_map_incore = 1;
		/* XXX -- we'll need a call to picker_init here */
		bzero(msp->ms_map_cursor, sizeof (msp->ms_map_cursor));
	}

	space_map_remove(sm, offset, size);
	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);

	if ((msp->ms_dirty[txg & TXG_MASK] & MSD_ALLOC) == 0) {
		msp->ms_dirty[txg & TXG_MASK] |= MSD_ALLOC;
		msp->ms_last_alloc = txg;
		vdev_dirty(vd, VDD_ALLOC, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, txg);
	}

	mutex_exit(&msp->ms_lock);

	return (0);
}

/*
 * Decide whether 'msp' can satisfy an allocation of 'size' bytes in 'txg'.
 */
static int
metaslab_usable(metaslab_t *msp, uint64_t size, uint64_t txg)
{
	/*
	 * Enforce segregation across transaction groups.
	 */
	/* XXX -- We should probably not assume we know what ms_weight means */
	if (msp->ms_last_alloc == txg)
		return (msp->ms_map.sm_space >= size && msp->ms_weight >= size);

	if (msp->ms_last_alloc != 0)
		return (0);

	if (msp->ms_map.sm_space >= size && msp->ms_weight >= size)
		return (1);

	/* XXX -- the weight test should be in terms of MINFREE */
	return (msp->ms_usable_space >= size && msp->ms_weight >= size);
}

/*
 * Walk the group's weight-sorted tree and return the first usable metaslab.
 */
static metaslab_t *
metaslab_pick(metaslab_group_t *mg, uint64_t size, uint64_t txg)
{
	metaslab_t *msp;
	avl_tree_t *t = &mg->mg_metaslab_tree;

	mutex_enter(&mg->mg_lock);
	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp))
		if (metaslab_usable(msp, size, txg))
			break;
	mutex_exit(&mg->mg_lock);

	return (msp);
}

/*
 * Allocate 'size' bytes from some metaslab in group 'mg'; on success
 * return the metaslab and store the offset in *offp, else return NULL.
 */
static metaslab_t *
metaslab_group_alloc(spa_t *spa, metaslab_group_t *mg, uint64_t size,
    uint64_t *offp, uint64_t txg)
{
	metaslab_t *msp;
	int error;

	while ((msp = metaslab_pick(mg, size, txg)) != NULL) {
		space_map_obj_t *smo = msp->ms_smo;
		mutex_enter(&msp->ms_lock);
		/* Re-check under the lock; the pick was made unlocked. */
		if (!metaslab_usable(msp, size, txg)) {
			mutex_exit(&msp->ms_lock);
			continue;
		}
		if (msp->ms_map_incore == 0) {
			error = space_map_load(&msp->ms_map, smo, SM_FREE,
			    spa->spa_meta_objset, msp->ms_usable_end,
			    msp->ms_map.sm_size - msp->ms_usable_space);
			ASSERT(error == 0);
			if (error) {
				mutex_exit(&msp->ms_lock);
				metaslab_group_sort(mg, msp, 0);
				continue;
			}
			msp->ms_map_incore = 1;
			/* XXX -- we'll need a call to picker_init here */
			bzero(msp->ms_map_cursor, sizeof (msp->ms_map_cursor));
		}
		*offp = metaslab_getblock(msp, size, txg);
		if (*offp != -1ULL) {
			if ((msp->ms_dirty[txg & TXG_MASK] & MSD_ALLOC) == 0) {
				vdev_t *vd = mg->mg_vd;
				msp->ms_dirty[txg & TXG_MASK] |= MSD_ALLOC;
				msp->ms_last_alloc = txg;
				vdev_dirty(vd, VDD_ALLOC, txg);
				(void) txg_list_add(&vd->vdev_ms_list,
				    msp, txg);
			}
			mutex_exit(&msp->ms_lock);
			return (msp);
		}
		mutex_exit(&msp->ms_lock);

		/*
		 * This metaslab couldn't fit 'size'; demote its weight so
		 * metaslab_pick() won't keep returning it for this size.
		 */
		metaslab_group_sort(msp->ms_group, msp, size - 1);
	}

	return (NULL);
}

/*
 * Allocate a block for the specified i/o.
 */
int
metaslab_alloc(spa_t *spa, uint64_t psize, dva_t *dva, uint64_t txg)
{
	metaslab_t *msp;
	metaslab_group_t *mg, *rotor;
	metaslab_class_t *mc;
	vdev_t *vd;
	uint64_t offset = -1ULL;
	uint64_t asize;

	mc = spa_metaslab_class_select(spa);

	/*
	 * Start at the rotor and loop through all mgs until we find something.
	 * Note that there's no locking on mc_rotor or mc_allocated because
	 * nothing actually breaks if we miss a few updates -- we just won't
	 * allocate quite as evenly.  It all balances out over time.
	 */
	mg = rotor = mc->mc_rotor;
	do {
		vd = mg->mg_vd;
		asize = vdev_psize_to_asize(vd, psize);
		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

		msp = metaslab_group_alloc(spa, mg, asize, &offset, txg);
		if (msp != NULL) {
			ASSERT(offset != -1ULL);

			/*
			 * If we've just selected this metaslab group,
			 * figure out whether the corresponding vdev is
			 * over- or under-used relative to the pool,
			 * and set an allocation bias to even it out.
			 */
			if (mc->mc_allocated == 0) {
				vdev_stat_t *vs = &vd->vdev_stat;
				uint64_t alloc, space;
				int64_t vu, su;

				alloc = spa_get_alloc(spa);
				space = spa_get_space(spa);

				/*
				 * Determine percent used in units of 0..1024.
				 * (This is just to avoid floating point.)
				 */
				vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
				su = (alloc << 10) / (space + 1);

				/*
				 * Bias by at most +/- 25% of the aliquot.
				 */
				mg->mg_bias = ((su - vu) *
				    (int64_t)mg->mg_aliquot) / (1024 * 4);

				dprintf("bias = %lld\n", mg->mg_bias);
			}

			if (atomic_add_64_nv(&mc->mc_allocated, asize) >=
			    mg->mg_aliquot + mg->mg_bias) {
				mc->mc_rotor = mg->mg_next;
				mc->mc_allocated = 0;
			}

			DVA_SET_VDEV(dva, vd->vdev_id);
			DVA_SET_OFFSET(dva, offset);
			DVA_SET_GANG(dva, 0);
			DVA_SET_ASIZE(dva, asize);

			return (0);
		}
		mc->mc_rotor = mg->mg_next;
		mc->mc_allocated = 0;
	} while ((mg = mg->mg_next) != rotor);

	dprintf("spa=%p, psize=%llu, txg=%llu: no\n", spa, psize, txg);

	DVA_SET_VDEV(dva, 0);
	DVA_SET_OFFSET(dva, 0);
	DVA_SET_GANG(dva, 0);

	return (ENOSPC);
}

/*
 * Free the block represented by DVA in the context of the specified
 * transaction group.
 */
void
metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;
	metaslab_t *msp;

	if (txg > spa_freeze_txg(spa))
		return;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
		cmn_err(CE_WARN, "metaslab_free(): bad vdev %llu",
		    (u_longlong_t)vdev);
		ASSERT(0);
		return;
	}

	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
		cmn_err(CE_WARN, "metaslab_free(): bad offset %llu",
		    (u_longlong_t)offset);
		ASSERT(0);
		return;
	}

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	if ((msp->ms_dirty[txg & TXG_MASK] & MSD_FREE) == 0) {
		msp->ms_dirty[txg & TXG_MASK] |= MSD_FREE;
		vdev_dirty(vd, VDD_FREE, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, txg);
	}

	space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);

	mutex_exit(&msp->ms_lock);
}
diff --git a/usr/src/uts/common/fs/zfs/refcount.c b/usr/src/uts/common/fs/zfs/refcount.c new file mode 100644 index 0000000000..411ed46e13 --- /dev/null +++ 
b/usr/src/uts/common/fs/zfs/refcount.c @@ -0,0 +1,194 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/refcount.h>

/*
 * Tracked reference counts: each hold is recorded as a (holder, number)
 * reference_t so mismatched releases can be caught.  Compiled only for
 * DEBUG kernels and userland builds.
 */
#if defined(DEBUG) || !defined(_KERNEL)

#ifdef _KERNEL
int reference_tracking_enable = FALSE; /* runs out of memory too easily */
#else
int reference_tracking_enable = TRUE;
#endif
int reference_history = 4; /* tunable */

static kmem_cache_t *reference_cache;
static kmem_cache_t *reference_history_cache;

void
refcount_init(void)
{
	reference_cache = kmem_cache_create("reference_cache",
	    sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	reference_history_cache = kmem_cache_create("reference_history_cache",
	    sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
}

void
refcount_fini(void)
{
	kmem_cache_destroy(reference_cache);
	kmem_cache_destroy(reference_history_cache);
}

void
refcount_create(refcount_t *rc)
{
	list_create(&rc->rc_list, sizeof (reference_t),
	    offsetof(reference_t, ref_link));
	list_create(&rc->rc_removed, sizeof (reference_t),
	    offsetof(reference_t, ref_link));
	mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL);
}

/*
 * Destroy 'rc', asserting that exactly 'number' references remain
 * outstanding.  Frees both the live list and the removal history.
 */
void
refcount_destroy_many(refcount_t *rc, uint64_t number)
{
	reference_t *ref;

	ASSERT(rc->rc_count == number);
	while (ref = list_head(&rc->rc_list)) {
		list_remove(&rc->rc_list, ref);
		kmem_cache_free(reference_cache, ref);
	}
	list_destroy(&rc->rc_list);

	while (ref = list_head(&rc->rc_removed)) {
		list_remove(&rc->rc_removed, ref);
		kmem_cache_free(reference_history_cache, ref->ref_removed);
		kmem_cache_free(reference_cache, ref);
	}
	list_destroy(&rc->rc_removed);
	mutex_destroy(&rc->rc_mtx);
}

void
refcount_destroy(refcount_t *rc)
{
	refcount_destroy_many(rc, 0);
}

int
refcount_is_zero(refcount_t *rc)
{
	ASSERT(rc->rc_count >= 0);
	return (rc->rc_count == 0);
}

int64_t
refcount_count(refcount_t *rc)
{
	ASSERT(rc->rc_count >= 0);
	return (rc->rc_count);
}

/*
 * Add 'number' holds on behalf of 'holder'; returns the new count.
 * The reference_t is allocated before taking rc_mtx (KM_SLEEP may block).
 */
int64_t
refcount_add_many(refcount_t *rc, uint64_t number, void *holder)
{
	reference_t *ref;
	int64_t count;

	if (reference_tracking_enable) {
		ref = kmem_cache_alloc(reference_cache, KM_SLEEP);
		ref->ref_holder = holder;
		ref->ref_number = number;
	}
	mutex_enter(&rc->rc_mtx);
	ASSERT(rc->rc_count >= 0);
	if (reference_tracking_enable)
		list_insert_head(&rc->rc_list, ref);
	rc->rc_count += number;
	count = rc->rc_count;
	mutex_exit(&rc->rc_mtx);

	return (count);
}

int64_t
refcount_add(refcount_t *rc, void *holder)
{
	return (refcount_add_many(rc, 1, holder));
}

/*
 * Release the hold (holder, number); panics if no matching hold exists.
 * Released references are kept on rc_removed, bounded by reference_history.
 */
int64_t
refcount_remove_many(refcount_t *rc, uint64_t number, void *holder)
{
	reference_t *ref;
	int64_t count;

	mutex_enter(&rc->rc_mtx);
	ASSERT(rc->rc_count >= number);

	if (!reference_tracking_enable) {
		rc->rc_count -= number;
		count = rc->rc_count;
		mutex_exit(&rc->rc_mtx);
		return (count);
	}

	for (ref = list_head(&rc->rc_list); ref;
	    ref = list_next(&rc->rc_list, ref)) {
		if (ref->ref_holder == holder && ref->ref_number == number) {
			list_remove(&rc->rc_list, ref);
			if (reference_history > 0) {
				ref->ref_removed =
				    kmem_cache_alloc(reference_history_cache,
				    KM_SLEEP);
				list_insert_head(&rc->rc_removed, ref);
				rc->rc_removed_count++;
				if (rc->rc_removed_count >= reference_history) {
					/* Evict the oldest history entry. */
					ref = list_tail(&rc->rc_removed);
					list_remove(&rc->rc_removed, ref);
					kmem_cache_free(reference_history_cache,
					    ref->ref_removed);
					kmem_cache_free(reference_cache, ref);
					rc->rc_removed_count--;
				}
			} else {
				kmem_cache_free(reference_cache, ref);
			}
			rc->rc_count -= number;
			count = rc->rc_count;
			mutex_exit(&rc->rc_mtx);
			return (count);
		}
	}
	panic("No such hold %p on refcount %llx", holder,
	    (u_longlong_t)(uintptr_t)rc);
	return (-1);
}

int64_t
refcount_remove(refcount_t *rc, void *holder)
{
	return (refcount_remove_many(rc, 1, holder));
}

#endif
diff --git a/usr/src/uts/common/fs/zfs/sha256.c b/usr/src/uts/common/fs/zfs/sha256.c new file mode 100644 index 0000000000..ce5c26131a --- 
/dev/null +++ b/usr/src/uts/common/fs/zfs/sha256.c @@ -0,0 +1,131 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> + +/* + * SHA-256 checksum, as specified in FIPS 180-2, available at: + * http://csrc.nist.gov/cryptval + * + * This is a very compact implementation of SHA-256. + * It is designed to be simple and portable, not to be fast. + */ + +/* + * The literal definitions according to FIPS180-2 would be: + * + * Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z))) + * Maj(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z))) + * + * We use logical equivalents which require one less op. 
+ */ +#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) +#define Maj(x, y, z) (((x) & (y)) ^ ((z) & ((x) ^ (y)))) +#define Rot32(x, s) (((x) >> s) | ((x) << (32 - s))) +#define SIGMA0(x) (Rot32(x, 2) ^ Rot32(x, 13) ^ Rot32(x, 22)) +#define SIGMA1(x) (Rot32(x, 6) ^ Rot32(x, 11) ^ Rot32(x, 25)) +#define sigma0(x) (Rot32(x, 7) ^ Rot32(x, 18) ^ ((x) >> 3)) +#define sigma1(x) (Rot32(x, 17) ^ Rot32(x, 19) ^ ((x) >> 10)) + +static const uint32_t SHA256_K[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +static void +SHA256Transform(uint32_t *H, const uint8_t *cp) +{ + uint32_t a, b, c, d, e, f, g, h, t, T1, T2, W[64]; + + for (t = 0; t < 16; t++, cp += 4) + W[t] = (cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | cp[3]; + + for (t = 16; t < 64; t++) + W[t] = sigma1(W[t - 2]) + W[t - 7] + + sigma0(W[t - 15]) + W[t - 16]; + + a = H[0]; b = H[1]; c = H[2]; d = H[3]; + e = H[4]; f = H[5]; g = H[6]; h = H[7]; + + for (t = 0; t < 64; t++) { + T1 = h + SIGMA1(e) + Ch(e, f, g) + SHA256_K[t] + W[t]; + T2 = SIGMA0(a) + Maj(a, b, c); + h = g; g = f; f = e; e = d + T1; + d = c; c = b; b = a; a = T1 + T2; + } + + H[0] += a; H[1] += b; H[2] += c; H[3] += d; + H[4] += e; H[5] += f; H[6] += g; H[7] += h; +} + +void +zio_checksum_SHA256(const 
void *buf, uint64_t size, zio_cksum_t *zcp) +{ + uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; + uint8_t pad[128]; + int padsize = size & 63; + int i; + + for (i = 0; i < size - padsize; i += 64) + SHA256Transform(H, (uint8_t *)buf + i); + + for (i = 0; i < padsize; i++) + pad[i] = ((uint8_t *)buf)[i]; + + for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++) + pad[padsize] = 0; + + for (i = 0; i < 8; i++) + pad[padsize++] = (size << 3) >> (56 - 8 * i); + + for (i = 0; i < padsize; i += 64) + SHA256Transform(H, pad + i); + + ZIO_SET_CHECKSUM(zcp, + (uint64_t)H[0] << 32 | H[1], + (uint64_t)H[2] << 32 | H[3], + (uint64_t)H[4] << 32 | H[5], + (uint64_t)H[6] << 32 | H[7]); +} diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c new file mode 100644 index 0000000000..43112d9319 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -0,0 +1,1784 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>

static uint32_t spa_active_count;

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();

	spa->spa_vdev_retry_taskq = taskq_create("spa_vdev_retry",
	    4, maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);

	/* One issue and one interrupt taskq per zio type. */
	for (t = 0; t < ZIO_TYPES; t++) {
		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
	}

	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);

	rw_destroy(&spa->spa_traverse_lock);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	taskq_destroy(spa->spa_vdev_retry_taskq);
	spa->spa_vdev_retry_taskq = NULL;

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static vdev_t *
spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	vdev_t *vd;

	if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL)
		return (NULL);

	if (vd->vdev_ops->vdev_op_leaf)
		return (vd);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(vd);
		return (NULL);
	}

	/* Recurse; a failed child tears down the whole subtree. */
	for (c = 0; c < children; c++) {
		if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) {
			vdev_free(vd);
			return (NULL);
		}
	}

	return (vd);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
	spa_config_enter(spa, RW_WRITER);
	spa_config_exit(spa);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev) {
		vdev_free(spa->spa_root_vdev);
		spa->spa_root_vdev = NULL;
	}
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.  The 'readonly' flag will prevent us
 * from writing any updated state to disk, and can be use when testing a pool
 * for import.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t pool_guid;
	zio_t *zio;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
		return (EINVAL);

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if (import && spa_guid_exists(pool_guid, 0))
		return (EEXIST);

	/*
	 * Parse the configuration into a vdev tree.
	 */
	spa_config_enter(spa, RW_WRITER);
	rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa);

	if (rvd == NULL)
		return (EINVAL);

	spa->spa_root_vdev = rvd;
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	if (vdev_open(rvd) != 0)
		return (ENXIO);

	/*
	 * Find the best uberblock.
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		dprintf("ub_txg is zero\n");
		return (ENXIO);
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		rvd->vdev_state = VDEV_STATE_CANT_OPEN;
		rvd->vdev_stat.vs_aux = VDEV_AUX_BAD_GUID_SUM;
		dprintf("vdev_guid_sum %llx != ub_guid_sum %llx\n",
		    rvd->vdev_guid_sum, ub->ub_guid_sum);
		return (ENXIO);
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	spa->spa_dsl_pool = dsl_pool_open(spa, spa->spa_first_txg);
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	VERIFY(zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) == 0);

	/*
	 * If we were handed a config from outside the MOS (e.g. import),
	 * re-read the authoritative config stored in the MOS and restart
	 * the load with it (tail recursion with mosconfig == B_TRUE).
	 */
	if (!mosconfig) {
		dmu_buf_t *db;
		char *packed = NULL;
		size_t nvsize = 0;
		nvlist_t *newconfig = NULL;

		db = dmu_bonus_hold(spa->spa_meta_objset,
		    spa->spa_config_object);
		dmu_buf_read(db);
		nvsize = *(uint64_t *)db->db_data;
		dmu_buf_rele(db);

		packed = kmem_alloc(nvsize, KM_SLEEP);
		error = dmu_read_canfail(spa->spa_meta_objset,
		    spa->spa_config_object, 0, nvsize, packed);
		if (error == 0)
			error = nvlist_unpack(packed, nvsize, &newconfig, 0);
		kmem_free(packed, nvsize);

		if (error)
			return (ENXIO);

		spa_config_set(spa, newconfig);

		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa);

		return (spa_load(spa, newconfig, readonly, import, B_TRUE));
	}

	VERIFY(zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) == 0);

	/*
	 * Load the vdev state for all top level vdevs.
	 */
	if ((error = vdev_load(rvd, import)) != 0)
		return (error);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
+ */ + spa_config_enter(spa, RW_WRITER); + vdev_dtl_reassess(rvd, 0, 0, B_FALSE); + spa_config_exit(spa); + + /* + * Check the state of the root vdev. If it can't be opened, it + * indicates one or more toplevel vdevs are faulted. + */ + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) + return (ENXIO); + + /* + * Claim log blocks that haven't been committed yet, and update all + * top-level vdevs to sync any config changes found in vdev_load(). + * This must all happen in a single txg. + */ + if ((spa_mode & FWRITE) && !readonly) { + dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), + spa_first_txg(spa)); + dmu_objset_find(spa->spa_name, zil_claim, tx, 0); + vdev_config_dirty(rvd); + dmu_tx_commit(tx); + + spa->spa_sync_on = B_TRUE; + txg_sync_start(spa->spa_dsl_pool); + + /* + * Wait for all claims to sync. + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + } + + return (0); +} + +/* + * Pool Open/Import + * + * The import case is identical to an open except that the configuration is sent + * down from userland, instead of grabbed from the configuration cache. For the + * case of an open, the pool configuration will exist in the + * POOL_STATE_UNITIALIZED state. + * + * The stats information (gen/count/ustats) is used to gather vdev statistics at + * the same time open the pool, without having to keep around the spa_t in some + * ambiguous state. + */ +static int +spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) +{ + spa_t *spa; + int error; + int loaded = B_FALSE; + int locked = B_FALSE; + + *spapp = NULL; + + /* + * As disgusting as this is, we need to support recursive calls to this + * function because dsl_dir_open() is called during spa_load(), and ends + * up calling spa_open() again. The real fix is to figure out how to + * avoid dsl_dir_open() calling this in the first place. 
+ */ + if (mutex_owner(&spa_namespace_lock) != curthread) { + mutex_enter(&spa_namespace_lock); + locked = B_TRUE; + } + + if ((spa = spa_lookup(pool)) == NULL) { + if (locked) + mutex_exit(&spa_namespace_lock); + return (ENOENT); + } + if (spa->spa_state == POOL_STATE_UNINITIALIZED) { + + spa_activate(spa); + + error = spa_load(spa, spa->spa_config, + B_FALSE, B_FALSE, B_FALSE); + + if (error == EBADF) { + /* + * If vdev_load() returns EBADF, it indicates that one + * of the vdevs indicates that the pool has been + * exported or destroyed. If this is the case, the + * config cache is out of sync and we should remove the + * pool from the namespace. + */ + spa_unload(spa); + spa_deactivate(spa); + spa_remove(spa); + spa_config_sync(); + if (locked) + mutex_exit(&spa_namespace_lock); + return (ENOENT); + } if (error) { + /* + * We can't open the pool, but we still have useful + * information: the state of each vdev after the + * attempted vdev_open(). Return this to the user. + */ + if (config != NULL && spa->spa_root_vdev != NULL) + *config = spa_config_generate(spa, NULL, -1ULL, + B_TRUE); + spa_unload(spa); + spa_deactivate(spa); + if (locked) + mutex_exit(&spa_namespace_lock); + *spapp = NULL; + return (error); + } + + loaded = B_TRUE; + } + + spa_open_ref(spa, tag); + if (locked) + mutex_exit(&spa_namespace_lock); + + *spapp = spa; + + if (config != NULL) { + spa_config_enter(spa, RW_READER); + *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + spa_config_exit(spa); + } + + /* + * If we just loaded the pool, resilver anything that's out of date. 
+ */ + if (loaded && (spa_mode & FWRITE)) + VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + + return (0); +} + +int +spa_open(const char *name, spa_t **spapp, void *tag) +{ + return (spa_open_common(name, spapp, tag, NULL)); +} + +int +spa_get_stats(const char *name, nvlist_t **config) +{ + int error; + spa_t *spa; + + *config = NULL; + error = spa_open_common(name, &spa, FTAG, config); + + if (spa != NULL) + spa_close(spa, FTAG); + + return (error); +} + +/* + * Pool Creation + */ +int +spa_create(const char *pool, nvlist_t *nvroot, char *altroot) +{ + spa_t *spa; + dsl_pool_t *dp; + dmu_tx_t *tx; + int error; + uint64_t txg = TXG_INITIAL; + + /* + * If this pool already exists, return failure. + */ + mutex_enter(&spa_namespace_lock); + if (spa_lookup(pool) != NULL) { + mutex_exit(&spa_namespace_lock); + return (EEXIST); + } + spa = spa_add(pool); + + /* + * Allocate a new spa_t structure. + */ + spa_activate(spa); + + spa->spa_uberblock.ub_txg = txg - 1; + spa->spa_ubsync = spa->spa_uberblock; + + error = spa_vdev_add(spa, nvroot); + + if (error) { + spa_unload(spa); + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + return (error); + } + + if (altroot != NULL) { + spa->spa_root = spa_strdup(altroot); + atomic_add_32(&spa_active_count, 1); + } + + spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); + spa->spa_meta_objset = dp->dp_meta_objset; + + tx = dmu_tx_create_assigned(dp, txg); + + /* + * Create the pool config object. + */ + spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, + DMU_OT_PACKED_NVLIST, 1 << 14, + DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); + + VERIFY(zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, + sizeof (uint64_t), 1, &spa->spa_config_object, tx) == 0); + + /* + * Create the deferred-free bplist object. Turn off compression + * because sync-to-convergence takes longer if the blocksize + * keeps changing. 
+ */ + spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, + 1 << 14, tx); + dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, + ZIO_COMPRESS_OFF, tx); + + VERIFY(zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, + sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) == 0); + + dmu_tx_commit(tx); + + spa->spa_sync_on = B_TRUE; + txg_sync_start(spa->spa_dsl_pool); + + /* + * We explicitly wait for the first transaction to complete so that our + * bean counters are appropriately updated. + */ + txg_wait_synced(spa->spa_dsl_pool, txg); + + spa_config_sync(); + + mutex_exit(&spa_namespace_lock); + + return (0); +} + +/* + * Import the given pool into the system. We set up the necessary spa_t and + * then call spa_load() to do the dirty work. + */ +int +spa_import(const char *pool, nvlist_t *config, char *altroot) +{ + spa_t *spa; + int error; + + if (!(spa_mode & FWRITE)) + return (EROFS); + + /* + * If a pool with this name exists, return failure. + */ + mutex_enter(&spa_namespace_lock); + if (spa_lookup(pool) != NULL) { + mutex_exit(&spa_namespace_lock); + return (EEXIST); + } + + /* + * Create an initialize the spa structure + */ + spa = spa_add(pool); + spa_activate(spa); + + /* + * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig + * so that we don't try to open the pool if the config is damaged. + */ + error = spa_load(spa, config, B_FALSE, B_TRUE, B_TRUE); + + if (error) { + spa_unload(spa); + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + return (error); + } + + /* + * Set the alternate root, if there is one. + */ + if (altroot != NULL) { + atomic_add_32(&spa_active_count, 1); + spa->spa_root = spa_strdup(altroot); + } + + /* + * Initialize the config based on the in-core state. + */ + config = spa_config_generate(spa, NULL, spa_last_synced_txg(spa), 0); + + spa_config_set(spa, config); + + /* + * Sync the configuration cache. 
+ */ + spa_config_sync(); + + mutex_exit(&spa_namespace_lock); + + /* + * Resilver anything that's out of date. + */ + if (spa_mode & FWRITE) + VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + + return (0); +} + +/* + * This (illegal) pool name is used when temporarily importing a spa_t in order + * to get the vdev stats associated with the imported devices. + */ +#define TRYIMPORT_NAME "$import" + +nvlist_t * +spa_tryimport(nvlist_t *tryconfig) +{ + nvlist_t *config = NULL; + char *poolname; + spa_t *spa; + uint64_t state; + + if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) + return (NULL); + + if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) + return (NULL); + + mutex_enter(&spa_namespace_lock); + spa = spa_add(TRYIMPORT_NAME); + + ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); + + /* + * Initialize the spa_t structure. + */ + spa_activate(spa); + + /* + * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig + * so we don't try to open the pool if the config is damaged. + */ + (void) spa_load(spa, tryconfig, B_TRUE, B_TRUE, B_TRUE); + + /* + * If 'tryconfig' was at least parsable, return the current config. + */ + if (spa->spa_root_vdev != NULL) { + config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, + poolname) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, + state) == 0); + } + + spa_unload(spa); + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + + return (config); +} + +/* + * Pool export/destroy + * + * The act of destroying or exporting a pool is very simple. We make sure there + * is no more pending I/O and any references to the pool are gone. Then, we + * update the pool state and sync all the labels to disk, removing the + * configuration from the cache afterwards. 
+ */ +static int +spa_export_common(char *pool, int new_state) +{ + spa_t *spa; + + if (!(spa_mode & FWRITE)) + return (EROFS); + + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(pool)) == NULL) { + mutex_exit(&spa_namespace_lock); + return (ENOENT); + } + + /* + * The pool will be in core if it's openable, + * in which case we can modify its state. + */ + if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { + /* + * Objsets may be open only because they're dirty, so we + * have to force it to sync before checking spa_refcnt. + */ + spa_scrub_suspend(spa); + txg_wait_synced(spa->spa_dsl_pool, 0); + + if (!spa_refcount_zero(spa)) { + spa_scrub_resume(spa); + mutex_exit(&spa_namespace_lock); + return (EBUSY); + } + + /* + * Update the pool state. + */ + spa->spa_state = new_state; + + spa_scrub_resume(spa); + VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); + + if (spa->spa_root != NULL) + atomic_add_32(&spa_active_count, -1); + + /* + * We want this to be reflected on every label, + * so mark them all dirty. spa_unload() will do the + * final sync that pushes these changes out. + */ + vdev_config_dirty(spa->spa_root_vdev); + } + + if (spa->spa_state != POOL_STATE_UNINITIALIZED) { + spa_unload(spa); + spa_deactivate(spa); + } + + spa_remove(spa); + spa_config_sync(); + mutex_exit(&spa_namespace_lock); + + return (0); +} + +/* + * Destroy a storage pool. + */ +int +spa_destroy(char *pool) +{ + return (spa_export_common(pool, POOL_STATE_DESTROYED)); +} + +/* + * Export a storage pool. + */ +int +spa_export(char *pool) +{ + return (spa_export_common(pool, POOL_STATE_EXPORTED)); +} + +/* + * ========================================================================== + * Device manipulation + * ========================================================================== + */ + +/* + * Add capacity to a storage pool. 
+ */ +int +spa_vdev_add(spa_t *spa, nvlist_t *nvroot) +{ + uint64_t txg; + int c, error; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *vd; + + txg = spa_vdev_enter(spa); + + vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); + + if (vd == NULL) + return (spa_vdev_exit(spa, vd, txg, EINVAL)); + + if (rvd == NULL) /* spa_create() */ + spa->spa_root_vdev = rvd = vd; + + if ((error = vdev_create(vd, txg)) != 0) + return (spa_vdev_exit(spa, vd, txg, error)); + + /* + * Transfer each top-level vdev from the temporary root + * to the spa's root and initialize its metaslabs. + */ + for (c = 0; c < vd->vdev_children; c++) { + vdev_t *tvd = vd->vdev_child[c]; + if (vd != rvd) { + vdev_remove_child(vd, tvd); + tvd->vdev_id = rvd->vdev_children; + vdev_add_child(rvd, tvd); + } + vdev_init(tvd, txg); + vdev_config_dirty(tvd); + } + + /* + * Update the config based on the new in-core state. + */ + spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0)); + + return (spa_vdev_exit(spa, vd, txg, 0)); +} + +/* + * Attach a device to a mirror. The arguments are the path to any device + * in the mirror, and the nvroot for the new device. If the path specifies + * a device that is not mirrored, we automatically insert the mirror vdev. + * + * If 'replacing' is specified, the new device is intended to replace the + * existing device; in this case the two devices are made into their own + * mirror using the 'replacing' vdev, which is functionally idendical to + * the mirror vdev (it actually reuses all the same ops) but has a few + * extra rules: you can't attach to it after it's been created, and upon + * completion of resilvering, the first disk (the one being replaced) + * is automatically detached. + */ +int +spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing) +{ + uint64_t txg, open_txg; + int error; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; + vdev_ops_t *pvops = replacing ? 
&vdev_replacing_ops : &vdev_mirror_ops; + + txg = spa_vdev_enter(spa); + + oldvd = vdev_lookup_by_path(rvd, path); + + if (oldvd == NULL) + return (spa_vdev_exit(spa, NULL, txg, ENODEV)); + + pvd = oldvd->vdev_parent; + + /* + * The parent must be a mirror or the root, unless we're replacing; + * in that case, the parent can be anything but another replacing vdev. + */ + if (pvd->vdev_ops != &vdev_mirror_ops && + pvd->vdev_ops != &vdev_root_ops && + (!replacing || pvd->vdev_ops == &vdev_replacing_ops)) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); + + if (newrootvd == NULL || newrootvd->vdev_children != 1) + return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); + + newvd = newrootvd->vdev_child[0]; + + if (!newvd->vdev_ops->vdev_op_leaf) + return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); + + if ((error = vdev_create(newrootvd, txg)) != 0) + return (spa_vdev_exit(spa, newrootvd, txg, error)); + + if (newvd->vdev_psize < oldvd->vdev_psize) + return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); + + if (newvd->vdev_ashift != oldvd->vdev_ashift && oldvd->vdev_ashift != 0) + return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); + + /* + * If this is an in-place replacement, update oldvd's path and devid + * to make it distinguishable from newvd, and unopenable from now on. + */ + if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { + spa_strfree(oldvd->vdev_path); + oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, + KM_SLEEP); + (void) sprintf(oldvd->vdev_path, "%s/%s", + newvd->vdev_path, "old"); + if (oldvd->vdev_devid != NULL) { + spa_strfree(oldvd->vdev_devid); + oldvd->vdev_devid = NULL; + } + } + + /* + * If the parent is not a mirror, or if we're replacing, + * insert the new mirror/replacing vdev above oldvd. 
+ */ + if (pvd->vdev_ops != pvops) + pvd = vdev_add_parent(oldvd, pvops); + + ASSERT(pvd->vdev_top->vdev_parent == rvd); + ASSERT(pvd->vdev_ops == pvops); + ASSERT(oldvd->vdev_parent == pvd); + + /* + * Extract the new device from its root and add it to pvd. + */ + vdev_remove_child(newrootvd, newvd); + newvd->vdev_id = pvd->vdev_children; + vdev_add_child(pvd, newvd); + + tvd = newvd->vdev_top; + ASSERT(pvd->vdev_top == tvd); + ASSERT(tvd->vdev_parent == rvd); + + /* + * Update the config based on the new in-core state. + */ + spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0)); + + vdev_config_dirty(tvd); + + /* + * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate + * upward when spa_vdev_exit() calls vdev_dtl_reassess(). + */ + open_txg = txg + TXG_CONCURRENT_STATES - 1; + + mutex_enter(&newvd->vdev_dtl_lock); + space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, + open_txg - TXG_INITIAL + 1); + mutex_exit(&newvd->vdev_dtl_lock); + + /* + * Mark newvd's DTL dirty in this txg. + */ + vdev_dirty(tvd, VDD_DTL, txg); + (void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg); + + dprintf("attached %s, replacing=%d\n", path, replacing); + + (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); + + /* + * Kick off a resilver to update newvd. + */ + VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + + return (0); +} + +/* + * Detach a device from a mirror or replacing vdev. + * If 'replace_done' is specified, only detach if the parent + * is a replacing vdev. 
+ */ +int +spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done) +{ + uint64_t txg; + int c, t, error; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *vd, *pvd, *cvd, *tvd; + + txg = spa_vdev_enter(spa); + + vd = vdev_lookup_by_path(rvd, path); + + if (vd == NULL) + return (spa_vdev_exit(spa, NULL, txg, ENODEV)); + + if (guid != 0 && vd->vdev_guid != guid) + return (spa_vdev_exit(spa, NULL, txg, ENODEV)); + + pvd = vd->vdev_parent; + + /* + * If replace_done is specified, only remove this device if it's + * the first child of a replacing vdev. + */ + if (replace_done && + (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops)) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + /* + * Only mirror and replacing vdevs support detach. + */ + if (pvd->vdev_ops != &vdev_replacing_ops && + pvd->vdev_ops != &vdev_mirror_ops) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + /* + * If there's only one replica, you can't detach it. + */ + if (pvd->vdev_children <= 1) + return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + + /* + * If all siblings have non-empty DTLs, this device may have the only + * valid copy of the data, which means we cannot safely detach it. + * + * XXX -- as in the vdev_offline() case, we really want a more + * precise DTL check. + */ + for (c = 0; c < pvd->vdev_children; c++) { + uint64_t dirty; + + cvd = pvd->vdev_child[c]; + if (cvd == vd) + continue; + if (vdev_is_dead(cvd)) + continue; + mutex_enter(&cvd->vdev_dtl_lock); + dirty = cvd->vdev_dtl_map.sm_space | + cvd->vdev_dtl_scrub.sm_space; + mutex_exit(&cvd->vdev_dtl_lock); + if (!dirty) + break; + } + if (c == pvd->vdev_children) + return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + + /* + * Erase the disk labels so the disk can be used for other things. + * This must be done after all other error cases are handled, + * but before we disembowel vd (so we can still do I/O to it). 
+ * But if we can't do it, don't treat the error as fatal -- + * it may be that the unwritability of the disk is the reason + * it's being detached! + */ + error = vdev_label_init(vd, 0); + if (error) + dprintf("unable to erase labels on %s\n", vdev_description(vd)); + + /* + * Remove vd from its parent and compact the parent's children. + */ + vdev_remove_child(pvd, vd); + vdev_compact_children(pvd); + + /* + * Remember one of the remaining children so we can get tvd below. + */ + cvd = pvd->vdev_child[0]; + + /* + * If the parent mirror/replacing vdev only has one child, + * the parent is no longer needed. Remove it from the tree. + */ + if (pvd->vdev_children == 1) + vdev_remove_parent(cvd); + + /* + * We don't set tvd until now because the parent we just removed + * may have been the previous top-level vdev. + */ + tvd = cvd->vdev_top; + ASSERT(tvd->vdev_parent == rvd); + + /* + * Reopen this top-level vdev to reassess health after detach. + */ + vdev_reopen(tvd, NULL); + + /* + * If the device we just detached was smaller than the others, + * it may be possible to add metaslabs (i.e. grow the pool). + */ + vdev_metaslab_init(tvd, txg); + + /* + * Update the config based on the new in-core state. + */ + spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0)); + + vdev_config_dirty(tvd); + + /* + * Mark vd's DTL as dirty in this txg. + * vdev_dtl_sync() will see that vd->vdev_detached is set + * and free vd's DTL object in syncing context. + * But first make sure we're not on any *other* txg's DTL list, + * to prevent vd from being accessed after it's freed. + */ + vdev_dirty(tvd, VDD_DTL, txg); + vd->vdev_detached = B_TRUE; + for (t = 0; t < TXG_SIZE; t++) + (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); + (void) txg_list_add(&tvd->vdev_dtl_list, vd, txg); + + dprintf("detached %s\n", path); + + return (spa_vdev_exit(spa, vd, txg, 0)); +} + +/* + * If there are any replacing vdevs that have finished replacing, detach them. 
+ * We can't hold the config lock across detaches, so we lock the config, + * build a list of candidates, unlock the config, and try each candidate. + */ +typedef struct vdev_detach_link { + char *vdl_path; + uint64_t vdl_guid; + list_node_t vdl_node; +} vdev_detach_link_t; + +static void +spa_vdev_replace_done_make_list(list_t *l, vdev_t *vd) +{ + int c; + + for (c = 0; c < vd->vdev_children; c++) + spa_vdev_replace_done_make_list(l, vd->vdev_child[c]); + + if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { + vdev_t *cvd0 = vd->vdev_child[0]; + vdev_t *cvd1 = vd->vdev_child[1]; + vdev_detach_link_t *vdl; + int dirty1; + + mutex_enter(&cvd1->vdev_dtl_lock); + dirty1 = cvd1->vdev_dtl_map.sm_space | + cvd1->vdev_dtl_scrub.sm_space; + mutex_exit(&cvd1->vdev_dtl_lock); + + if (!dirty1) { + vdl = kmem_zalloc(sizeof (*vdl), KM_SLEEP); + vdl->vdl_path = spa_strdup(cvd0->vdev_path); + vdl->vdl_guid = cvd0->vdev_guid; + list_insert_tail(l, vdl); + } + } +} + +void +spa_vdev_replace_done(spa_t *spa) +{ + vdev_detach_link_t *vdl; + list_t vdlist; + + list_create(&vdlist, sizeof (vdev_detach_link_t), + offsetof(vdev_detach_link_t, vdl_node)); + + spa_config_enter(spa, RW_READER); + spa_vdev_replace_done_make_list(&vdlist, spa->spa_root_vdev); + spa_config_exit(spa); + + while ((vdl = list_head(&vdlist)) != NULL) { + list_remove(&vdlist, vdl); + (void) spa_vdev_detach(spa, vdl->vdl_path, vdl->vdl_guid, + B_TRUE); + spa_strfree(vdl->vdl_path); + kmem_free(vdl, sizeof (*vdl)); + } + + list_destroy(&vdlist); +} + +/* + * ========================================================================== + * SPA Scrubbing + * ========================================================================== + */ + +static int spa_scrub_locked(spa_t *, pool_scrub_type_t, boolean_t); + +static void +spa_scrub_io_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + + zio_buf_free(zio->io_data, zio->io_size); + + mutex_enter(&spa->spa_scrub_lock); + if (zio->io_error) + 
spa->spa_scrub_errors++; + if (--spa->spa_scrub_inflight == 0) + cv_broadcast(&spa->spa_scrub_io_cv); + mutex_exit(&spa->spa_scrub_lock); + + if (zio->io_error) { + vdev_t *vd = zio->io_vd; + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_scrub_errors++; + mutex_exit(&vd->vdev_stat_lock); + } +} + +static void +spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags) +{ + size_t size = BP_GET_LSIZE(bp); + void *data = zio_buf_alloc(size); + + mutex_enter(&spa->spa_scrub_lock); + spa->spa_scrub_inflight++; + mutex_exit(&spa->spa_scrub_lock); + + zio_nowait(zio_read(NULL, spa, bp, data, size, + spa_scrub_io_done, NULL, priority, flags)); +} + +/* ARGSUSED */ +static int +spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) +{ + blkptr_t *bp = &bc->bc_blkptr; + vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0])); + + if (bc->bc_errno || vd == NULL) { + /* + * We can't scrub this block, but we can continue to scrub + * the rest of the pool. Note the error and move along. + */ + mutex_enter(&spa->spa_scrub_lock); + spa->spa_scrub_errors++; + mutex_exit(&spa->spa_scrub_lock); + + if (vd != NULL) { + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_scrub_errors++; + mutex_exit(&vd->vdev_stat_lock); + } + + return (ERESTART); + } + + ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); + + /* + * Keep track of how much data we've examined so that + * zpool(1M) status can make useful progress reports. + */ + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp); + mutex_exit(&vd->vdev_stat_lock); + + if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { + if (DVA_GET_GANG(&bp->blk_dva[0])) { + /* + * Gang members may be spread across multiple vdevs, + * so the best we can do is look at the pool-wide DTL. + * XXX -- it would be better to change our allocation + * policy to ensure that this can't happen. 
+ */ + vd = spa->spa_root_vdev; + } + if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) { + spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_RESILVER); + } + } else { + spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SCRUB); + } + + return (0); +} + +static void +spa_scrub_thread(spa_t *spa) +{ + callb_cpr_t cprinfo; + traverse_handle_t *th = spa->spa_scrub_th; + vdev_t *rvd = spa->spa_root_vdev; + pool_scrub_type_t scrub_type = spa->spa_scrub_type; + int error = 0; + boolean_t complete; + + CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); + + spa_config_enter(spa, RW_WRITER); + vdev_reopen(rvd, NULL); /* purge all vdev caches */ + vdev_config_dirty(rvd); /* rewrite all disk labels */ + vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); + spa_config_exit(spa); + + mutex_enter(&spa->spa_scrub_lock); + spa->spa_scrub_errors = 0; + spa->spa_scrub_active = 1; + + while (!spa->spa_scrub_stop) { + CALLB_CPR_SAFE_BEGIN(&cprinfo); + while (spa->spa_scrub_suspend) { + spa->spa_scrub_active = 0; + cv_broadcast(&spa->spa_scrub_cv); + cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); + spa->spa_scrub_active = 1; + } + CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); + + if (spa->spa_scrub_restart_txg != 0) + break; + + mutex_exit(&spa->spa_scrub_lock); + error = traverse_more(th); + mutex_enter(&spa->spa_scrub_lock); + if (error != EAGAIN) + break; + } + + while (spa->spa_scrub_inflight) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + + if (spa->spa_scrub_restart_txg != 0) + error = ERESTART; + + spa->spa_scrub_active = 0; + cv_broadcast(&spa->spa_scrub_cv); + + /* + * If the traverse completed, and there were no errors, + * then the scrub was completely successful. 
+ */ + complete = (error == 0 && spa->spa_scrub_errors == 0); + + dprintf("scrub to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", + spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", + error, spa->spa_scrub_errors, spa->spa_scrub_stop); + + mutex_exit(&spa->spa_scrub_lock); + + /* + * If the scrub/resilver completed, update all DTLs to reflect this. + * Whether it succeeded or not, vacate all temporary scrub DTLs. + */ + spa_config_enter(spa, RW_WRITER); + vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, + complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); + spa_config_exit(spa); + + spa_vdev_replace_done(spa); + + spa_config_enter(spa, RW_READER); + vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); + spa_config_exit(spa); + + mutex_enter(&spa->spa_scrub_lock); + + spa->spa_scrub_type = POOL_SCRUB_NONE; + spa->spa_scrub_active = 0; + spa->spa_scrub_thread = NULL; + + cv_broadcast(&spa->spa_scrub_cv); + + /* + * If we were told to restart, our final act is to start a new scrub. + */ + if (error == ERESTART) + VERIFY(spa_scrub_locked(spa, scrub_type, B_TRUE) == 0); + + CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ + thread_exit(); +} + +void +spa_scrub_suspend(spa_t *spa) +{ + mutex_enter(&spa->spa_scrub_lock); + spa->spa_scrub_suspend++; + while (spa->spa_scrub_active) { + cv_broadcast(&spa->spa_scrub_cv); + cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); + } + while (spa->spa_scrub_inflight) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + mutex_exit(&spa->spa_scrub_lock); +} + +void +spa_scrub_resume(spa_t *spa) +{ + mutex_enter(&spa->spa_scrub_lock); + ASSERT(spa->spa_scrub_suspend != 0); + if (--spa->spa_scrub_suspend == 0) + cv_broadcast(&spa->spa_scrub_cv); + mutex_exit(&spa->spa_scrub_lock); +} + +void +spa_scrub_restart(spa_t *spa, uint64_t txg) +{ + /* + * Something happened (e.g. snapshot create/delete) that means + * we must restart any in-progress scrubs. The itinerary will + * fix this properly. 
+ */ + mutex_enter(&spa->spa_scrub_lock); + spa->spa_scrub_restart_txg = txg; + mutex_exit(&spa->spa_scrub_lock); +} + +static int +spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force) +{ + space_seg_t *ss; + uint64_t mintxg, maxtxg; + vdev_t *rvd = spa->spa_root_vdev; + int advance = 0; + + if ((uint_t)type >= POOL_SCRUB_TYPES) + return (ENOTSUP); + + /* + * If there's a scrub or resilver already in progress, stop it. + */ + while (spa->spa_scrub_thread != NULL) { + /* + * Don't stop a resilver unless forced. + */ + if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) + return (EBUSY); + + spa->spa_scrub_stop = 1; + cv_broadcast(&spa->spa_scrub_cv); + cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); + } + + /* + * Terminate the previous traverse. + */ + if (spa->spa_scrub_th != NULL) { + traverse_fini(spa->spa_scrub_th); + spa->spa_scrub_th = NULL; + } + + spa->spa_scrub_stop = 0; + spa->spa_scrub_type = type; + spa->spa_scrub_restart_txg = 0; + + mintxg = TXG_INITIAL - 1; + maxtxg = spa_last_synced_txg(spa) + 1; + + switch (type) { + + case POOL_SCRUB_NONE: + break; + + case POOL_SCRUB_RESILVER: + /* + * Determine the resilvering boundaries. + * + * Note: (mintxg, maxtxg) is an open interval, + * i.e. mintxg and maxtxg themselves are not included. + * + * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 + * so we don't claim to resilver a txg that's still changing. + */ + mutex_enter(&rvd->vdev_dtl_lock); + ss = avl_first(&rvd->vdev_dtl_map.sm_root); + mintxg = ss ? ss->ss_start - 1 : 0; + ss = avl_last(&rvd->vdev_dtl_map.sm_root); + maxtxg = ss ? ss->ss_end : 0; + maxtxg = MIN(maxtxg, spa_last_synced_txg(spa) + 1); + mutex_exit(&rvd->vdev_dtl_lock); + + advance = ADVANCE_PRE | ADVANCE_PRUNE; + break; + + case POOL_SCRUB_EVERYTHING: + /* + * A scrub is like a resilver, but not pruned by DTL. 
+ */ + advance = ADVANCE_PRE; + break; + } + + if (mintxg != 0 && maxtxg != 0 && type != POOL_SCRUB_NONE) { + spa->spa_scrub_maxtxg = maxtxg; + spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, + advance, ZIO_FLAG_CANFAIL); + traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); + spa->spa_scrub_thread = thread_create(NULL, 0, + spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); + } + + return (0); +} + +int +spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) +{ + int error; + traverse_handle_t *th; + + mutex_enter(&spa->spa_scrub_lock); + error = spa_scrub_locked(spa, type, force); + th = spa->spa_scrub_th; + mutex_exit(&spa->spa_scrub_lock); + + if (th == NULL && type != POOL_SCRUB_NONE) + spa_vdev_replace_done(spa); + + return (error); +} + +/* + * ========================================================================== + * SPA syncing routines + * ========================================================================== + */ + +static void +spa_sync_deferred_frees(spa_t *spa, uint64_t txg) +{ + bplist_t *bpl = &spa->spa_sync_bplist; + dmu_tx_t *tx; + blkptr_t blk; + uint64_t itor = 0; + zio_t *zio; + int error; + uint8_t c = 1; + + zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); + + while (bplist_iterate(bpl, &itor, &blk) == 0) + zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); + + error = zio_wait(zio); + ASSERT3U(error, ==, 0); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + bplist_vacate(bpl, tx); + + /* + * Pre-dirty the first block so we sync to convergence faster. + * (Usually only the first block is needed.) 
+ */ + dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); + dmu_tx_commit(tx); +} + +static void +spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) +{ + nvlist_t *config; + char *packed = NULL; + size_t nvsize = 0; + dmu_buf_t *db; + + if (list_is_empty(&spa->spa_dirty_list)) + return; + + config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); + + spa_config_set(spa, config); + + VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0); + + packed = kmem_alloc(nvsize, KM_SLEEP); + + VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, 0) == 0); + + dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize, + packed, tx); + + kmem_free(packed, nvsize); + + db = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object); + dmu_buf_will_dirty(db, tx); + *(uint64_t *)db->db_data = nvsize; + dmu_buf_rele(db); +} + +/* + * Sync the specified transaction group. New blocks may be dirtied as + * part of the process, so we iterate until it converges. + */ +void +spa_sync(spa_t *spa, uint64_t txg) +{ + dsl_pool_t *dp = spa->spa_dsl_pool; + objset_t *mos = spa->spa_meta_objset; + bplist_t *bpl = &spa->spa_sync_bplist; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *vd; + dmu_tx_t *tx; + int dirty_vdevs; + + /* + * Lock out configuration changes. + */ + spa_config_enter(spa, RW_READER); + + spa->spa_syncing_txg = txg; + spa->spa_sync_pass = 0; + + bplist_open(bpl, mos, spa->spa_sync_bplist_obj); + + /* + * If anything has changed in this txg, push the deferred frees + * from the previous txg. If not, leave them alone so that we + * don't generate work on an otherwise idle system. + */ + if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || + !txg_list_empty(&dp->dp_dirty_dirs, txg)) + spa_sync_deferred_frees(spa, txg); + + /* + * Iterate to convergence. 
+ */ + do { + spa->spa_sync_pass++; + + tx = dmu_tx_create_assigned(dp, txg); + spa_sync_config_object(spa, tx); + dmu_tx_commit(tx); + + dsl_pool_sync(dp, txg); + + dirty_vdevs = 0; + while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { + vdev_sync(vd, txg); + dirty_vdevs++; + } + + tx = dmu_tx_create_assigned(dp, txg); + bplist_sync(bpl, tx); + dmu_tx_commit(tx); + + } while (dirty_vdevs); + + bplist_close(bpl); + + dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); + + /* + * Rewrite the vdev configuration (which includes the uberblock) + * to commit the transaction group. + */ + while (spa_sync_labels(spa, txg)) { + dprintf("waiting for devices to heal\n"); + delay(hz); + vdev_reopen(rvd, NULL); + } + + /* + * Make a stable copy of the fully synced uberblock. + * We use this as the root for pool traversals. + */ + spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ + + spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ + + rw_enter(&spa->spa_traverse_lock, RW_WRITER); + spa->spa_traverse_wanted = 0; + spa->spa_ubsync = spa->spa_uberblock; + rw_exit(&spa->spa_traverse_lock); + + spa_scrub_resume(spa); /* resume scrub with new ubsync */ + + /* + * Clean up the ZIL records for the synced txg. + */ + dsl_pool_zil_clean(dp); + + /* + * Update usable space statistics. + */ + while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) + vdev_sync_done(vd, txg); + + /* + * It had better be the case that we didn't dirty anything + * since spa_sync_labels(). + */ + ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); + ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); + ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); + ASSERT(bpl->bpl_queue == NULL); + + spa_config_exit(spa); +} + +/* + * Sync all pools. We don't want to hold the namespace lock across these + * operations, so we take a reference on the spa_t and drop the lock during the + * sync. 
+ */ +void +spa_sync_allpools(void) +{ + spa_t *spa = NULL; + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa)) != NULL) { + if (spa_state(spa) != POOL_STATE_ACTIVE) + continue; + spa_open_ref(spa, FTAG); + mutex_exit(&spa_namespace_lock); + txg_wait_synced(spa_get_dsl(spa), 0); + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + } + mutex_exit(&spa_namespace_lock); +} + +/* + * ========================================================================== + * Miscellaneous routines + * ========================================================================== + */ + +int +spa_busy(void) +{ + return (spa_active_count != 0); +} + +/* + * Remove all pools in the system. + */ +void +spa_evict_all(void) +{ + spa_t *spa; + + /* + * Remove all cached state. All pools should be closed now, + * so every spa in the AVL tree should be unreferenced. + */ + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(NULL)) != NULL) { + /* + * Stop all scrub and resilver activity. spa_scrub() needs to + * wait for the scrub thread, which may do a detach and sync the + * configs, which needs spa_namespace_lock. Drop the lock while + * maintaining a hold on the spa_t. + */ + spa_open_ref(spa, FTAG); + mutex_exit(&spa_namespace_lock); + VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + + if (spa->spa_state != POOL_STATE_UNINITIALIZED) { + spa_unload(spa); + spa_deactivate(spa); + } + spa_remove(spa); + } + mutex_exit(&spa_namespace_lock); +} diff --git a/usr/src/uts/common/fs/zfs/spa_config.c b/usr/src/uts/common/fs/zfs/spa_config.c new file mode 100644 index 0000000000..abcd67ddb9 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/spa_config.c @@ -0,0 +1,308 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). 
You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/nvpair.h> +#include <sys/uio.h> +#include <sys/fs/zfs.h> +#include <sys/vdev_impl.h> +#include <sys/zfs_ioctl.h> + +/* + * Pool configuration repository. + * + * The configuration for all pools, in addition to being stored on disk, is + * stored in /kernel/drv/zpool.cache as a packed nvlist. The kernel maintains + * this list as pools are created, destroyed, or modified. + * + * We have a single nvlist which holds all the configuration information. When + * the module loads, we read this information from the cache and populate the + * SPA namespace. This namespace is maintained independently in spa.c. + * Whenever the namespace is modified, or the configuration of a pool is + * changed, we call spa_config_sync(), which walks through all the active pools + * and writes the configuration to disk. + */ + +static uint64_t spa_config_generation = 1; + +/* + * This can be overridden in userland to preserve an alternate namespace for + * userland pools when doing testing. 
+ */ +const char *spa_config_dir = ZPOOL_CACHE_DIR; + +/* + * Called when the module is first loaded, this routine loads the configuration + * file into the SPA namespace. It does not actually open or load the pools; it + * only populates the namespace. + */ +void +spa_config_load(void) +{ + vnode_t *vp; + void *buf = NULL; + vattr_t vattr; + ssize_t resid; + nvlist_t *nvlist, *child; + nvpair_t *nvpair; + spa_t *spa; + char pathname[128]; + + /* + * Open the configuration file. + */ + (void) snprintf(pathname, sizeof (pathname), "./%s/%s", spa_config_dir, + ZPOOL_CACHE_FILE); + if (vn_openat(pathname, UIO_SYSSPACE, FREAD | FOFFMAX, 0, &vp, 0, 0, + rootdir) != 0) + return; + + /* + * Read the nvlist from the file. + */ + if (VOP_GETATTR(vp, &vattr, 0, kcred) != 0) + goto out; + + buf = kmem_alloc(vattr.va_size, KM_SLEEP); + + if (vn_rdwr(UIO_READ, vp, buf, vattr.va_size, 0, UIO_SYSSPACE, + 0, RLIM64_INFINITY, kcred, &resid) != 0) + goto out; + + if (resid != 0) + goto out; + + /* + * Unpack the nvlist. + */ + if (nvlist_unpack(buf, vattr.va_size, &nvlist, KM_SLEEP) != 0) + goto out; + + /* + * Iterate over all elements in the nvlist, creating a new spa_t for + * each one with the specified configuration. + */ + mutex_enter(&spa_namespace_lock); + nvpair = NULL; + while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) { + + if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) + continue; + + VERIFY(nvpair_value_nvlist(nvpair, &child) == 0); + + if (spa_lookup(nvpair_name(nvpair)) != NULL) + continue; + spa = spa_add(nvpair_name(nvpair)); + + /* + * We blindly duplicate the configuration here. If it's + * invalid, we will catch it when the pool is first opened. + */ + VERIFY(nvlist_dup(child, &spa->spa_config, 0) == 0); + } + mutex_exit(&spa_namespace_lock); + + nvlist_free(nvlist); + +out: + if (buf != NULL) + kmem_free(buf, vattr.va_size); + + (void) VOP_CLOSE(vp, FREAD | FOFFMAX, 1, 0, kcred); + VN_RELE(vp); +} + +/* + * Synchronize all pools to disk. 
This must be called with the namespace lock + * held. + */ +void +spa_config_sync(void) +{ + spa_t *spa = NULL; + nvlist_t *config; + size_t buflen; + char *buf; + vnode_t *vp; + int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX; + char pathname[128]; + char pathname2[128]; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0); + + /* + * Add all known pools to the configuration list, ignoring those with + * alternate root paths. + */ + spa = NULL; + while ((spa = spa_next(spa)) != NULL) { + mutex_enter(&spa->spa_config_cache_lock); + if (spa->spa_config && spa->spa_name && spa->spa_root == NULL) + VERIFY(nvlist_add_nvlist(config, spa->spa_name, + spa->spa_config) == 0); + mutex_exit(&spa->spa_config_cache_lock); + } + + /* + * Pack the configuration into a buffer. + */ + VERIFY(nvlist_size(config, &buflen, NV_ENCODE_XDR) == 0); + + buf = kmem_alloc(buflen, KM_SLEEP); + + VERIFY(nvlist_pack(config, &buf, &buflen, NV_ENCODE_XDR, 0) == 0); + + /* + * Write the configuration to disk. We need to do the traditional + * 'write to temporary file, sync, move over original' to make sure we + * always have a consistent view of the data. + */ + (void) snprintf(pathname, sizeof (pathname), "%s/%s", spa_config_dir, + ZPOOL_CACHE_TMP); + + if (vn_open(pathname, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0) != 0) + goto out; + + if (vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE, + 0, RLIM64_INFINITY, kcred, NULL) == 0 && + VOP_FSYNC(vp, FSYNC, kcred) == 0) { + (void) snprintf(pathname2, sizeof (pathname2), "%s/%s", + spa_config_dir, ZPOOL_CACHE_FILE); + (void) vn_rename(pathname, pathname2, UIO_SYSSPACE); + } + + (void) VOP_CLOSE(vp, oflags, 1, 0, kcred); + VN_RELE(vp); + +out: + (void) vn_remove(pathname, UIO_SYSSPACE, RMFILE); + spa_config_generation++; + + kmem_free(buf, buflen); + nvlist_free(config); +} + +/* + * Sigh. 
Inside a local zone, we don't have access to /kernel/drv/zpool.cache, + * and we don't want to allow the local zone to see all the pools anyway. + * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration + * information for all pool visible within the zone. + */ +nvlist_t * +spa_all_configs(uint64_t *generation) +{ + nvlist_t *pools; + spa_t *spa; + + if (*generation == spa_config_generation) + return (NULL); + + VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, 0) == 0); + + spa = NULL; + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa)) != NULL) { + if (INGLOBALZONE(curproc) || + zone_dataset_visible(spa_name(spa), NULL)) { + mutex_enter(&spa->spa_config_cache_lock); + VERIFY(nvlist_add_nvlist(pools, spa_name(spa), + spa->spa_config) == 0); + mutex_exit(&spa->spa_config_cache_lock); + } + } + mutex_exit(&spa_namespace_lock); + + *generation = spa_config_generation; + + return (pools); +} + +void +spa_config_set(spa_t *spa, nvlist_t *config) +{ + mutex_enter(&spa->spa_config_cache_lock); + if (spa->spa_config != NULL) + nvlist_free(spa->spa_config); + spa->spa_config = config; + mutex_exit(&spa->spa_config_cache_lock); +} + +/* + * Generate the pool's configuration based on the current in-core state. + * We infer whether to generate a complete config or just one top-level config + * based on whether vd is the root vdev. + */ +nvlist_t * +spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) +{ + nvlist_t *config, *nvroot; + vdev_t *rvd = spa->spa_root_vdev; + + if (vd == NULL) + vd = rvd; + + /* + * If txg is -1, report the current value of spa->spa_config_txg. + * If txg is any other non-zero value, update spa->spa_config_txg. 
+ */ + if (txg == -1ULL) + txg = spa->spa_config_txg; + else if (txg != 0 && vd == rvd) + spa->spa_config_txg = txg; + + VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0); + + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, + UBERBLOCK_VERSION) == 0); + VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, + spa_name(spa)) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, + spa_state(spa)) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, + txg) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, + spa_guid(spa)) == 0); + + if (vd != rvd) { + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID, + vd->vdev_top->vdev_guid) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_GUID, + vd->vdev_guid) == 0); + vd = vd->vdev_top; /* label contains top config */ + } + + nvroot = vdev_config_generate(vd, getstats); + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); + nvlist_free(nvroot); + + return (config); +} diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c new file mode 100644 index 0000000000..c1b6017509 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -0,0 +1,848 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa_impl.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> +#include <sys/zio_compress.h> +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/zap.h> +#include <sys/zil.h> +#include <sys/vdev_impl.h> +#include <sys/metaslab.h> +#include <sys/uberblock_impl.h> +#include <sys/txg.h> +#include <sys/avl.h> +#include <sys/unique.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/fs/zfs.h> + +/* + * SPA locking + * + * There are four basic locks for managing spa_t structures: + * + * spa_namespace_lock (global mutex) + * + * This lock must be acquired to do any of the following: + * + * - Lookup a spa_t by name + * - Add or remove a spa_t from the namespace + * - Increase spa_refcount from non-zero + * - Check if spa_refcount is zero + * - Rename a spa_t + * - Held for the duration of create/destroy/import/export + * + * It does not need to handle recursion. A create or destroy may + * reference objects (files or zvols) in other pools, but by + * definition they must have an existing reference, and will never need + * to lookup a spa_t by name. + * + * spa_refcount (per-spa refcount_t protected by mutex) + * + * This reference count keep track of any active users of the spa_t. The + * spa_t cannot be destroyed or freed while this is non-zero. Internally, + * the refcount is never really 'zero' - opening a pool implicitly keeps + * some references in the DMU. Internally we check against SPA_MINREF, but + * present the image of a zero/non-zero value to consumers. 
+ * + * spa_config_lock (per-spa crazy rwlock) + * + * This SPA special is a recursive rwlock, capable of being acquired from + * asynchronous threads. It has protects the spa_t from config changes, + * and must be held in the following circumstances: + * + * - RW_READER to perform I/O to the spa + * - RW_WRITER to change the vdev config + * + * spa_config_cache_lock (per-spa mutex) + * + * This mutex prevents the spa_config nvlist from being updated. No + * other locks are required to obtain this lock, although implicitly you + * must have the namespace lock or non-zero refcount to have any kind + * of spa_t pointer at all. + * + * spa_vdev_lock (global mutex) + * + * This special lock is a global mutex used to serialize attempts to + * access devices through ZFS. It makes sure that we do not try to add + * a single vdev to multiple pools at the same time. It must be held + * when adding or removing a device from the pool. + * + * + * The locking order is fairly straightforward: + * + * spa_namespace_lock -> spa_refcount + * + * The namespace lock must be acquired to increase the refcount from 0 + * or to check if it is zero. + * + * spa_refcount -> spa_config_lock + * + * There must be at least one valid reference on the spa_t to acquire + * the config lock. + * + * spa_vdev_lock -> spa_config_lock + * + * There are no locks required for spa_vdev_lock, but it must be + * acquired before spa_config_lock. + * + * + * The spa_namespace_lock and spa_config_cache_lock can be acquired directly and + * are globally visible. + * + * The namespace is manipulated using the following functions, all which require + * the spa_namespace_lock to be held. + * + * spa_lookup() Lookup a spa_t by name. + * + * spa_add() Create a new spa_t in the namespace. + * + * spa_remove() Remove a spa_t from the namespace. This also + * frees up any memory associated with the spa_t. + * + * spa_next() Returns the next spa_t in the system, or the + * first if NULL is passed. 
+ * + * spa_evict_all() Shutdown and remove all spa_t structures in + * the system. + * + * + * The spa_refcount is manipulated using the following functions: + * + * spa_open_ref() Adds a reference to the given spa_t. Must be + * called with spa_namespace_lock held if the + * refcount is currently zero. + * + * spa_close() Remove a reference from the spa_t. This will + * not free the spa_t or remove it from the + * namespace. No locking is required. + * + * spa_refcount_zero() Returns true if the refcount is currently + * zero. Must be called with spa_namespace_lock + * held. + * + * The spa_config_lock is manipulated using the following functions: + * + * spa_config_enter() Acquire the config lock as RW_READER or + * RW_WRITER. At least one reference on the spa_t + * must exist. + * + * spa_config_exit() Release the config lock. + * + * spa_config_held() Returns true if the config lock is currently + * held in the given state. + * + * The spa_vdev_lock, while acquired directly, is hidden by the following + * functions, which imply additional semantics that must be followed: + * + * spa_vdev_enter() Acquire the vdev lock and the config lock for + * writing. + * + * spa_vdev_exit() Release the config lock, wait for all I/O + * to complete, release the vdev lock, and sync + * the updated configs to the cache. + * + * The spa_name() function also requires either the spa_namespace_lock + * or the spa_config_lock, as both are needed to do a rename. spa_rename() is + * also implemented within this file since is requires manipulation of the + * namespace. 
+ */ + +static avl_tree_t spa_namespace_avl; +kmutex_t spa_namespace_lock; +static kcondvar_t spa_namespace_cv; + +kmem_cache_t *spa_buffer_pool; +int spa_mode; + +#ifdef ZFS_DEBUG +int zfs_flags = ~0; +#else +int zfs_flags = 0; +#endif + +static kmutex_t spa_vdev_lock; + +#define SPA_MINREF 5 /* spa_refcnt for an open-but-idle pool */ + +/* + * ========================================================================== + * SPA namespace functions + * ========================================================================== + */ + +/* + * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. + * Returns NULL if no matching spa_t is found. + */ +spa_t * +spa_lookup(const char *name) +{ + spa_t search, *spa; + avl_index_t where; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + search.spa_name = (char *)name; + spa = avl_find(&spa_namespace_avl, &search, &where); + + return (spa); +} + +/* + * Create an uninitialized spa_t with the given name. Requires + * spa_namespace_lock. The caller must ensure that the spa_t doesn't already + * exist by calling spa_lookup() first. + */ +spa_t * +spa_add(const char *name) +{ + spa_t *spa; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); + + spa->spa_name = spa_strdup(name); + spa->spa_state = POOL_STATE_UNINITIALIZED; + spa->spa_freeze_txg = UINT64_MAX; + + refcount_create(&spa->spa_refcount); + + avl_add(&spa_namespace_avl, spa); + + return (spa); +} + +/* + * Removes a spa_t from the namespace, freeing up any memory used. Requires + * spa_namespace_lock. This is called only after the spa_t has been closed and + * deactivated. 
+ */ +void +spa_remove(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); + ASSERT(spa->spa_scrub_thread == NULL); + + avl_remove(&spa_namespace_avl, spa); + cv_broadcast(&spa_namespace_cv); + + if (spa->spa_root) + spa_strfree(spa->spa_root); + + if (spa->spa_name) + spa_strfree(spa->spa_name); + + spa_config_set(spa, NULL); + + refcount_destroy(&spa->spa_refcount); + + kmem_free(spa, sizeof (spa_t)); +} + +/* + * Given a pool, return the next pool in the namespace, or NULL if there is + * none. If 'prev' is NULL, return the first pool. + */ +spa_t * +spa_next(spa_t *prev) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + if (prev) + return (AVL_NEXT(&spa_namespace_avl, prev)); + else + return (avl_first(&spa_namespace_avl)); +} + +/* + * ========================================================================== + * SPA refcount functions + * ========================================================================== + */ + +/* + * Add a reference to the given spa_t. Must have at least one reference, or + * have the namespace lock held. + */ +void +spa_open_ref(spa_t *spa, void *tag) +{ + ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF || + MUTEX_HELD(&spa_namespace_lock)); + + (void) refcount_add(&spa->spa_refcount, tag); +} + +/* + * Remove a reference to the given spa_t. Must have at least one reference, or + * have the namespace lock held. + */ +void +spa_close(spa_t *spa, void *tag) +{ + ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF || + MUTEX_HELD(&spa_namespace_lock)); + + (void) refcount_remove(&spa->spa_refcount, tag); +} + +/* + * Check to see if the spa refcount is zero. Must be called with + * spa_namespace_lock held. 
We really compare against SPA_MINREF, which is the + * number of references acquired when opening a pool + */ +boolean_t +spa_refcount_zero(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + return (refcount_count(&spa->spa_refcount) == SPA_MINREF); +} + +/* + * ========================================================================== + * SPA config locking + * ========================================================================== + */ + +/* + * Acquire the config lock. The config lock is a special rwlock that allows for + * recursive enters. Because these enters come from the same thread as well as + * asynchronous threads working on behalf of the owner, we must unilaterally + * allow all reads access as long at least one reader is held (even if a write + * is requested). This has the side effect of write starvation, but write locks + * are extremely rare, and a solution to this problem would be significantly + * more complex (if even possible). + * + * We would like to assert that the namespace lock isn't held, but this is a + * valid use during create. + */ +void +spa_config_enter(spa_t *spa, krw_t rw) +{ + spa_config_lock_t *scl = &spa->spa_config_lock; + + mutex_enter(&scl->scl_lock); + + if (scl->scl_writer != curthread) { + if (rw == RW_READER) { + while (scl->scl_writer != NULL) + cv_wait(&scl->scl_cv, &scl->scl_lock); + } else { + while (scl->scl_writer != NULL || scl->scl_count > 0) + cv_wait(&scl->scl_cv, &scl->scl_lock); + scl->scl_writer = curthread; + } + } + + scl->scl_count++; + + mutex_exit(&scl->scl_lock); +} + +/* + * Release the spa config lock, notifying any waiters in the process. 
+ */ +void +spa_config_exit(spa_t *spa) +{ + spa_config_lock_t *scl = &spa->spa_config_lock; + + mutex_enter(&scl->scl_lock); + + ASSERT(scl->scl_count > 0); + if (--scl->scl_count == 0) { + cv_broadcast(&scl->scl_cv); + scl->scl_writer = NULL; /* OK in either case */ + } + + mutex_exit(&scl->scl_lock); +} + +/* + * Returns true if the config lock is held in the given manner. + */ +boolean_t +spa_config_held(spa_t *spa, krw_t rw) +{ + spa_config_lock_t *scl = &spa->spa_config_lock; + boolean_t held; + + mutex_enter(&scl->scl_lock); + if (rw == RW_WRITER) + held = (scl->scl_writer == curthread); + else + held = (scl->scl_count != 0); + mutex_exit(&scl->scl_lock); + + return (held); +} + +/* + * ========================================================================== + * SPA vdev locking + * ========================================================================== + */ + +/* + * Lock the given spa_t for the purpose of adding or removing a vdev. This + * grabs the global spa_vdev_lock as well as the spa config lock for writing. + * It returns the next transaction group for the spa_t. + */ +uint64_t +spa_vdev_enter(spa_t *spa) +{ + mutex_enter(&spa_vdev_lock); + + spa_config_enter(spa, RW_WRITER); + + return (spa_last_synced_txg(spa) + 1); +} + +/* + * Unlock the spa_t after adding or removing a vdev. Besides undoing the + * locking of spa_vdev_enter(), we also want make sure the transactions have + * synced to disk, and then update the global configuration cache with the new + * information. + */ +int +spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) +{ + vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); + + spa_config_exit(spa); + + if (vd == spa->spa_root_vdev) { /* spa_create() */ + mutex_exit(&spa_vdev_lock); + return (error); + } + + /* + * Note: this txg_wait_synced() is important because it ensures + * that there won't be more than one config change per txg. + * This allows us to use the txg as the generation number. 
+ */ + if (error == 0) + txg_wait_synced(spa->spa_dsl_pool, txg); + + mutex_exit(&spa_vdev_lock); + + if (vd != NULL) { + ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0); + vdev_free(vd); + } + + /* + * If we're in the middle of export or destroy, don't sync the + * config -- it will do that anyway, and we deadlock if we try. + */ + if (error == 0 && spa->spa_state == POOL_STATE_ACTIVE) { + mutex_enter(&spa_namespace_lock); + spa_config_sync(); + mutex_exit(&spa_namespace_lock); + } + + return (error); +} + +/* + * ========================================================================== + * Miscellaneous functions + * ========================================================================== + */ + +/* + * Rename a spa_t. + */ +int +spa_rename(const char *name, const char *newname) +{ + spa_t *spa; + int err; + + /* + * Lookup the spa_t and grab the config lock for writing. We need to + * actually open the pool so that we can sync out the necessary labels. + * It's OK to call spa_open() with the namespace lock held because we + * alllow recursive calls for other reasons. + */ + mutex_enter(&spa_namespace_lock); + if ((err = spa_open(name, &spa, FTAG)) != 0) { + mutex_exit(&spa_namespace_lock); + return (err); + } + + spa_config_enter(spa, RW_WRITER); + + avl_remove(&spa_namespace_avl, spa); + spa_strfree(spa->spa_name); + spa->spa_name = spa_strdup(newname); + avl_add(&spa_namespace_avl, spa); + + /* + * Sync all labels to disk with the new names by marking the root vdev + * dirty and waiting for it to sync. It will pick up the new pool name + * during the sync. + */ + vdev_config_dirty(spa->spa_root_vdev); + + spa_config_exit(spa); + + txg_wait_synced(spa->spa_dsl_pool, 0); + + /* + * Sync the updated config cache. 
+ */ + spa_config_set(spa, + spa_config_generate(spa, NULL, spa_last_synced_txg(spa), 0)); + spa_config_sync(); + + spa_close(spa, FTAG); + + mutex_exit(&spa_namespace_lock); + + return (0); +} + + +/* + * Determine whether a pool with given pool_guid exists. If device_guid is + * non-zero, determine whether the pool exists *and* contains a device with the + * specified device_guid. + */ +boolean_t +spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) +{ + spa_t *spa; + avl_tree_t *t = &spa_namespace_avl; + boolean_t locked = B_FALSE; + + if (mutex_owner(&spa_namespace_lock) != curthread) { + mutex_enter(&spa_namespace_lock); + locked = B_TRUE; + } + + for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { + if (spa->spa_state == POOL_STATE_UNINITIALIZED) + continue; + if (spa->spa_root_vdev == NULL) + continue; + if (spa_guid(spa) == pool_guid && (device_guid == 0 || + vdev_lookup_by_guid(spa->spa_root_vdev, device_guid))) + break; + } + + if (locked) + mutex_exit(&spa_namespace_lock); + + return (spa != NULL); +} + +char * +spa_strdup(const char *s) +{ + size_t len; + char *new; + + len = strlen(s); + new = kmem_alloc(len + 1, KM_SLEEP); + bcopy(s, new, len); + new[len] = '\0'; + + return (new); +} + +void +spa_strfree(char *s) +{ + kmem_free(s, strlen(s) + 1); +} + +uint64_t +spa_get_random(uint64_t range) +{ + uint64_t r; + + ASSERT(range != 0); + + (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t)); + + return (r % range); +} + +void +sprintf_blkptr(char *buf, blkptr_t *bp) +{ + /* XXBP - Need to see if we want all DVAs or not */ + dva_t *dva = BP_IDENTITY(bp); + + if (bp == NULL) { + (void) sprintf(buf, "<NULL>"); + return; + } + + if (BP_IS_HOLE(bp)) { + (void) sprintf(buf, "<hole>"); + return; + } + + (void) sprintf(buf, "[L%llu %s] vdev=%llu offset=%llx " + "size=%llxL/%llxP/%llxA %s %s %s %s", + (u_longlong_t)BP_GET_LEVEL(bp), + dmu_ot[BP_GET_TYPE(bp)].ot_name, + (u_longlong_t)DVA_GET_VDEV(dva), + 
(u_longlong_t)DVA_GET_OFFSET(dva), + (u_longlong_t)BP_GET_LSIZE(bp), + (u_longlong_t)BP_GET_PSIZE(bp), + (u_longlong_t)DVA_GET_ASIZE(dva), + zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name, + zio_compress_table[BP_GET_COMPRESS(bp)].ci_name, + BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", + DVA_GET_GANG(dva) == 0 ? "contiguous" : "gang"); + + (void) sprintf(buf + strlen(buf), " birth=%llu fill=%llu" + " cksum=%llx:%llx:%llx:%llx", + (u_longlong_t)bp->blk_birth, + (u_longlong_t)bp->blk_fill, + (u_longlong_t)bp->blk_cksum.zc_word[0], + (u_longlong_t)bp->blk_cksum.zc_word[1], + (u_longlong_t)bp->blk_cksum.zc_word[2], + (u_longlong_t)bp->blk_cksum.zc_word[3]); +} + +void +spa_freeze(spa_t *spa) +{ + uint64_t freeze_txg = 0; + + spa_config_enter(spa, RW_WRITER); + if (spa->spa_freeze_txg == UINT64_MAX) { + freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE; + spa->spa_freeze_txg = freeze_txg; + } + spa_config_exit(spa); + if (freeze_txg != 0) + txg_wait_synced(spa_get_dsl(spa), freeze_txg); +} + +/* + * ========================================================================== + * Accessor functions + * ========================================================================== + */ + +krwlock_t * +spa_traverse_rwlock(spa_t *spa) +{ + return (&spa->spa_traverse_lock); +} + +int +spa_traverse_wanted(spa_t *spa) +{ + return (spa->spa_traverse_wanted); +} + +dsl_pool_t * +spa_get_dsl(spa_t *spa) +{ + return (spa->spa_dsl_pool); +} + +blkptr_t * +spa_get_rootblkptr(spa_t *spa) +{ + return (&spa->spa_ubsync.ub_rootbp); +} + +void +spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp) +{ + spa->spa_uberblock.ub_rootbp = *bp; +} + +void +spa_altroot(spa_t *spa, char *buf, size_t buflen) +{ + if (spa->spa_root == NULL) + buf[0] = '\0'; + else + (void) strncpy(buf, spa->spa_root, buflen); +} + +int +spa_sync_pass(spa_t *spa) +{ + return (spa->spa_sync_pass); +} + +char * +spa_name(spa_t *spa) +{ + /* + * Accessing the name requires holding either the namespace lock or the + * config lock, 
both of which are required to do a rename. + */ + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa_config_held(spa, RW_READER) || spa_config_held(spa, RW_WRITER)); + + return (spa->spa_name); +} + +uint64_t +spa_guid(spa_t *spa) +{ + return (spa->spa_root_vdev->vdev_guid); +} + +uint64_t +spa_last_synced_txg(spa_t *spa) +{ + return (spa->spa_ubsync.ub_txg); +} + +uint64_t +spa_first_txg(spa_t *spa) +{ + return (spa->spa_first_txg); +} + +int +spa_state(spa_t *spa) +{ + return (spa->spa_state); +} + +uint64_t +spa_freeze_txg(spa_t *spa) +{ + return (spa->spa_freeze_txg); +} + +/* + * In the future, this may select among different metaslab classes + * depending on the zdp. For now, there's no such distinction. + */ +metaslab_class_t * +spa_metaslab_class_select(spa_t *spa) +{ + return (spa->spa_normal_class); +} + +/* + * Return pool-wide allocated space. + */ +uint64_t +spa_get_alloc(spa_t *spa) +{ + return (spa->spa_root_vdev->vdev_stat.vs_alloc); +} + +/* + * Return pool-wide allocated space. + */ +uint64_t +spa_get_space(spa_t *spa) +{ + return (spa->spa_root_vdev->vdev_stat.vs_space); +} + +/* ARGSUSED */ +uint64_t +spa_get_asize(spa_t *spa, uint64_t lsize) +{ + /* + * For now, the worst case is 512-byte RAID-Z blocks, in which + * case the space requirement is exactly 2x; so just assume that. 
+ */ + return (lsize << 1); +} + +/* + * ========================================================================== + * Initialization and Termination + * ========================================================================== + */ + +static int +spa_name_compare(const void *a1, const void *a2) +{ + const spa_t *s1 = a1; + const spa_t *s2 = a2; + int s; + + s = strcmp(s1->spa_name, s2->spa_name); + if (s > 0) + return (1); + if (s < 0) + return (-1); + return (0); +} + +void +spa_init(int mode) +{ + mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL); + + avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t), + offsetof(spa_t, spa_avl)); + + spa_mode = mode; + + refcount_init(); + unique_init(); + zio_init(); + dmu_init(); + zil_init(); + spa_config_load(); +} + +void +spa_fini(void) +{ + spa_evict_all(); + + zil_fini(); + dmu_fini(); + zio_fini(); + refcount_fini(); + + avl_destroy(&spa_namespace_avl); + + cv_destroy(&spa_namespace_cv); + mutex_destroy(&spa_namespace_lock); +} diff --git a/usr/src/uts/common/fs/zfs/space_map.c b/usr/src/uts/common/fs/zfs/space_map.c new file mode 100644 index 0000000000..25f66bf94b --- /dev/null +++ b/usr/src/uts/common/fs/zfs/space_map.c @@ -0,0 +1,406 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/dmu.h> +#include <sys/space_map.h> + +/* + * Space map routines. + * NOTE: caller is responsible for all locking. + */ +static int +space_map_seg_compare(const void *x1, const void *x2) +{ + const space_seg_t *s1 = x1; + const space_seg_t *s2 = x2; + + if (s1->ss_start < s2->ss_start) { + if (s1->ss_end > s2->ss_start) + return (0); + return (-1); + } + if (s1->ss_start > s2->ss_start) { + if (s1->ss_start < s2->ss_end) + return (0); + return (1); + } + return (0); +} + +void +space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint64_t shift, + kmutex_t *lp) +{ + avl_create(&sm->sm_root, space_map_seg_compare, + sizeof (space_seg_t), offsetof(struct space_seg, ss_node)); + sm->sm_start = start; + sm->sm_end = start + size; + sm->sm_size = size; + sm->sm_shift = shift; + sm->sm_space = 0; + sm->sm_lock = lp; +} + +void +space_map_destroy(space_map_t *sm) +{ + VERIFY3U(sm->sm_space, ==, 0); + avl_destroy(&sm->sm_root); +} + +void +space_map_add(space_map_t *sm, uint64_t start, uint64_t size) +{ + avl_index_t where; + space_seg_t ssearch, *ss_before, *ss_after, *ss; + uint64_t end = start + size; + int merge_before, merge_after; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + VERIFY(size != 0); + VERIFY3U(start, >=, sm->sm_start); + VERIFY3U(end, <=, sm->sm_end); + VERIFY(sm->sm_space + size <= sm->sm_size); + VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); + VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); + + ssearch.ss_start = start; + ssearch.ss_end = end; + ss = avl_find(&sm->sm_root, &ssearch, &where); + + 
/* Make sure we don't overlap with either of our neighbors */ + VERIFY(ss == NULL); + + ss_before = avl_nearest(&sm->sm_root, where, AVL_BEFORE); + ss_after = avl_nearest(&sm->sm_root, where, AVL_AFTER); + + merge_before = (ss_before != NULL && ss_before->ss_end == start); + merge_after = (ss_after != NULL && ss_after->ss_start == end); + + if (merge_before && merge_after) { + avl_remove(&sm->sm_root, ss_before); + ss_after->ss_start = ss_before->ss_start; + kmem_free(ss_before, sizeof (*ss_before)); + } else if (merge_before) { + ss_before->ss_end = end; + } else if (merge_after) { + ss_after->ss_start = start; + } else { + ss = kmem_alloc(sizeof (*ss), KM_SLEEP); + ss->ss_start = start; + ss->ss_end = end; + avl_insert(&sm->sm_root, ss, where); + } + + sm->sm_space += size; +} + +void +space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) +{ + avl_index_t where; + space_seg_t ssearch, *ss, *newseg; + uint64_t end = start + size; + int left_over, right_over; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + VERIFY(size != 0); + VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); + VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); + + ssearch.ss_start = start; + ssearch.ss_end = end; + ss = avl_find(&sm->sm_root, &ssearch, &where); + + /* Make sure we completely overlap with someone */ + VERIFY(ss != NULL); + VERIFY3U(ss->ss_start, <=, start); + VERIFY3U(ss->ss_end, >=, end); + VERIFY(sm->sm_space - size <= sm->sm_size); + + left_over = (ss->ss_start != start); + right_over = (ss->ss_end != end); + + if (left_over && right_over) { + newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP); + newseg->ss_start = end; + newseg->ss_end = ss->ss_end; + ss->ss_end = start; + avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER); + } else if (left_over) { + ss->ss_end = start; + } else if (right_over) { + ss->ss_start = end; + } else { + avl_remove(&sm->sm_root, ss); + kmem_free(ss, sizeof (*ss)); + } + + sm->sm_space -= size; +} + +int +space_map_contains(space_map_t *sm, uint64_t 
start, uint64_t size) +{ + avl_index_t where; + space_seg_t ssearch, *ss; + uint64_t end = start + size; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + VERIFY(size != 0); + VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); + VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); + + ssearch.ss_start = start; + ssearch.ss_end = end; + ss = avl_find(&sm->sm_root, &ssearch, &where); + + return (ss != NULL && ss->ss_start <= start && ss->ss_end >= end); +} + +void +space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest) +{ + space_seg_t *ss; + void *cookie = NULL; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + + while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) { + if (func != NULL) + func(mdest, ss->ss_start, ss->ss_end - ss->ss_start); + kmem_free(ss, sizeof (*ss)); + } + sm->sm_space = 0; +} + +void +space_map_iterate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest) +{ + space_seg_t *ss; + + for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) + func(mdest, ss->ss_start, ss->ss_end - ss->ss_start); +} + +void +space_map_merge(space_map_t *src, space_map_t *dest) +{ + space_map_vacate(src, space_map_add, dest); +} + +void +space_map_excise(space_map_t *sm, uint64_t start, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + avl_index_t where; + space_seg_t *ss, search; + uint64_t end = start + size; + uint64_t rm_start, rm_end; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + + search.ss_start = start; + search.ss_end = start; + + for (;;) { + ss = avl_find(t, &search, &where); + + if (ss == NULL) + ss = avl_nearest(t, where, AVL_AFTER); + + if (ss == NULL || ss->ss_start >= end) + break; + + rm_start = MAX(ss->ss_start, start); + rm_end = MIN(ss->ss_end, end); + + space_map_remove(sm, rm_start, rm_end - rm_start); + } +} + +/* + * Replace smd with the union of smd and sms. 
+ */ +void +space_map_union(space_map_t *smd, space_map_t *sms) +{ + avl_tree_t *t = &sms->sm_root; + space_seg_t *ss; + + ASSERT(MUTEX_HELD(smd->sm_lock)); + + /* + * For each source segment, remove any intersections with the + * destination, then add the source segment to the destination. + */ + for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) { + space_map_excise(smd, ss->ss_start, ss->ss_end - ss->ss_start); + space_map_add(smd, ss->ss_start, ss->ss_end - ss->ss_start); + } +} + +int +space_map_load(space_map_t *sm, space_map_obj_t *smo, uint8_t maptype, + objset_t *os, uint64_t end, uint64_t space) +{ + uint64_t *entry, *entry_map, *entry_map_end; + uint64_t bufsize, size, offset; + uint64_t mapstart = sm->sm_start; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + VERIFY3U(sm->sm_space, ==, 0); + + bufsize = MIN(end, SPACE_MAP_CHUNKSIZE); + entry_map = kmem_alloc(bufsize, KM_SLEEP); + + if (maptype == SM_FREE) { + space_map_add(sm, sm->sm_start, sm->sm_size); + space = sm->sm_size - space; + } + + for (offset = 0; offset < end; offset += bufsize) { + size = MIN(end - offset, bufsize); + VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0); + VERIFY(size != 0); + + dprintf("object=%llu offset=%llx size=%llx\n", + smo->smo_object, offset, size); + dmu_read(os, smo->smo_object, offset, size, entry_map); + + entry_map_end = entry_map + (size / sizeof (uint64_t)); + for (entry = entry_map; entry < entry_map_end; entry++) { + uint64_t e = *entry; + + if (SM_DEBUG_DECODE(e)) /* Skip debug entries */ + continue; + + (SM_TYPE_DECODE(e) == maptype ? 
+ space_map_add : space_map_remove)(sm, + (SM_OFFSET_DECODE(e) << sm->sm_shift) + mapstart, + SM_RUN_DECODE(e) << sm->sm_shift); + } + } + VERIFY3U(sm->sm_space, ==, space); + + kmem_free(entry_map, bufsize); + + return (0); +} + +void +space_map_sync(space_map_t *sm, space_map_t *dest, space_map_obj_t *smo, + uint8_t maptype, objset_t *os, dmu_tx_t *tx) +{ + spa_t *spa = dmu_objset_spa(os); + void *cookie = NULL; + space_seg_t *ss; + uint64_t bufsize, start, size, run_len; + uint64_t *entry, *entry_map, *entry_map_end; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + + if (sm->sm_space == 0) + return; + + dprintf("object %4llu, txg %llu, pass %d, %c, count %lu, space %llx\n", + smo->smo_object, dmu_tx_get_txg(tx), spa_sync_pass(spa), + maptype == SM_ALLOC ? 'A' : 'F', avl_numnodes(&sm->sm_root), + sm->sm_space); + + bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t); + bufsize = MIN(bufsize, SPACE_MAP_CHUNKSIZE); + entry_map = kmem_alloc(bufsize, KM_SLEEP); + entry_map_end = entry_map + (bufsize / sizeof (uint64_t)); + entry = entry_map; + + *entry++ = SM_DEBUG_ENCODE(1) | + SM_DEBUG_ACTION_ENCODE(maptype) | + SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) | + SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx)); + + while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) { + size = ss->ss_end - ss->ss_start; + start = (ss->ss_start - sm->sm_start) >> sm->sm_shift; + + if (dest) + space_map_add(dest, ss->ss_start, size); + + sm->sm_space -= size; + size >>= sm->sm_shift; + + while (size) { + run_len = MIN(size, SM_RUN_MAX); + + if (entry == entry_map_end) { + dmu_write(os, smo->smo_object, smo->smo_objsize, + bufsize, entry_map, tx); + smo->smo_objsize += bufsize; + entry = entry_map; + } + + *entry++ = SM_OFFSET_ENCODE(start) | + SM_TYPE_ENCODE(maptype) | + SM_RUN_ENCODE(run_len); + + start += run_len; + size -= run_len; + } + kmem_free(ss, sizeof (*ss)); + } + + if (entry != entry_map) { + size = (entry - entry_map) * sizeof (uint64_t); + dmu_write(os, 
smo->smo_object, smo->smo_objsize, + size, entry_map, tx); + smo->smo_objsize += size; + } + + kmem_free(entry_map, bufsize); + + VERIFY3U(sm->sm_space, ==, 0); +} + +void +space_map_write(space_map_t *sm, space_map_obj_t *smo, objset_t *os, + dmu_tx_t *tx) +{ + uint64_t oldsize = smo->smo_objsize; + + dmu_free_range(os, smo->smo_object, 0, smo->smo_objsize, tx); + + smo->smo_objsize = 0; + + VERIFY3U(sm->sm_space, ==, smo->smo_alloc); + space_map_sync(sm, NULL, smo, SM_ALLOC, os, tx); + + dprintf("write sm object %llu from %llu to %llu bytes in txg %llu\n", + smo->smo_object, oldsize, smo->smo_objsize, dmu_tx_get_txg(tx)); +} diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h new file mode 100644 index 0000000000..b11cd42b6d --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/arc.h @@ -0,0 +1,90 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_ARC_H +#define _SYS_ARC_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/zio.h> + +typedef struct arc_buf_hdr arc_buf_hdr_t; +typedef struct arc_buf arc_buf_t; +typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private); +typedef void arc_byteswap_func_t(void *buf, size_t size); + +/* generic arc_done_func_t's which you can use */ +arc_done_func_t arc_bcopy_func; +arc_done_func_t arc_getbuf_func; + +struct arc_buf { + arc_buf_hdr_t *b_hdr; + arc_buf_t *b_next; + void *b_data; +}; + +/* + * These are the flags we pass into calls to the arc + */ +#define ARC_WAIT (1 << 1) /* perform I/O synchronously */ +#define ARC_NOWAIT (1 << 2) /* perform I/O asynchronously */ +#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */ + +arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag); +void arc_buf_free(arc_buf_t *buf, void *tag); +int arc_buf_size(arc_buf_t *buf); +void arc_release(arc_buf_t *buf, void *tag); +int arc_released(arc_buf_t *buf); + +int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, + arc_done_func_t *done, void *private, int priority, int flags, + uint32_t arc_flags); +int arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, + uint64_t txg, blkptr_t *bp, arc_buf_t *buf, + arc_done_func_t *done, void *private, int priority, int flags, + uint32_t arc_flags); +int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + zio_done_func_t *done, void *private, uint32_t arc_flags); +int arc_tryread(spa_t *spa, blkptr_t *bp, void *data); + +void arc_flush(void); +void arc_tempreserve_clear(uint64_t tempreserve); +int arc_tempreserve_space(uint64_t tempreserve); + +void arc_init(void); +void arc_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ARC_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/bplist.h b/usr/src/uts/common/fs/zfs/sys/bplist.h new file mode 100644 index 0000000000..0933cb977b 
--- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/bplist.h @@ -0,0 +1,83 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_BPLIST_H +#define _SYS_BPLIST_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct bplist_phys { + /* + * This is the bonus buffer for the dead lists. The object's + * contents is an array of bpl_entries blkptr_t's, representing + * a total of bpl_bytes physical space. 
+ */ + uint64_t bpl_entries; + uint64_t bpl_bytes; +} bplist_phys_t; + +typedef struct bplist_q { + blkptr_t bpq_blk; + void *bpq_next; +} bplist_q_t; + +typedef struct bplist { + kmutex_t bpl_lock; + objset_t *bpl_mos; + uint64_t bpl_object; + int bpl_blockshift; + int bpl_bpshift; + bplist_q_t *bpl_queue; + bplist_phys_t *bpl_phys; + dmu_buf_t *bpl_dbuf; + dmu_buf_t *bpl_cached_dbuf; +} bplist_t; + +extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx); +extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx); +extern void bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object); +extern void bplist_close(bplist_t *bpl); +extern boolean_t bplist_empty(bplist_t *bpl); +extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp); +extern void bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx); +extern void bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp); +extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx); +extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BPLIST_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h new file mode 100644 index 0000000000..3cf45f5985 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h @@ -0,0 +1,302 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DBUF_H +#define _SYS_DBUF_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/zio.h> +#include <sys/arc.h> +#include <sys/zfs_context.h> +#include <sys/refcount.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define DB_BONUS_BLKID (-1ULL) +#define IN_DMU_SYNC ((blkptr_t *)-1) + +/* + * define flags for dbuf_read and friends + */ + +#define DB_RF_MUST_SUCCEED 0 +#define DB_RF_CANFAIL (1 << 1) +#define DB_RF_HAVESTRUCT (1 << 2) +#define DB_RF_NOPREFETCH (1 << 3) + +/* + * The state transition diagram for dbufs looks like: + * + * +----> READ ----+ + * | | + * | V + * (alloc)-->UNCACHED CACHED-->(free) + * | ^ + * | | + * +----> FILL ----+ + */ +typedef enum dbuf_states { + DB_UNCACHED, + DB_FILL, + DB_READ, + DB_CACHED +} dbuf_states_t; + +struct objset_impl; +struct dnode; +struct dmu_tx; + +/* + * level = 0 means the user data + * level = 1 means the single indirect block + * etc. + */ + +#define LIST_LINK_INACTIVE(link) \ + ((link)->list_next == NULL && (link)->list_prev == NULL) + +typedef struct dmu_buf_impl { + /* + * The following members are immutable, with the exception of + * db.db_data, which is protected by db_mtx. + */ + + /* the publicly visible structure */ + dmu_buf_t db; + + /* the objset we belong to */ + struct objset_impl *db_objset; + + /* + * the dnode we belong to (NULL when evicted) + */ + struct dnode *db_dnode; + + /* + * our parent buffer; if the dnode points to us directly, + * db_parent == db_dnode->dn_dbuf + * only accessed by sync thread ??? 
+ * (NULL when evicted) + */ + struct dmu_buf_impl *db_parent; + + /* + * link for hash table of all dmu_buf_impl_t's + */ + struct dmu_buf_impl *db_hash_next; + + /* our block number */ + uint64_t db_blkid; + + /* + * Pointer to the blkptr_t which points to us. May be NULL if we + * don't have one yet. (NULL when evicted) + */ + blkptr_t *db_blkptr; + + /* + * Our indirection level. Data buffers have db_level==0. + * Indirect buffers which point to data buffers have + * db_level==1. etc. Buffers which contain dnodes have + * db_level==0, since the dnodes are stored in a file. + */ + uint8_t db_level; + + /* db_mtx protects the members below */ + kmutex_t db_mtx; + + /* + * Current state of the buffer + */ + dbuf_states_t db_state; + + /* + * Refcount accessed by dmu_buf_{hold,rele}. + * If nonzero, the buffer can't be destroyed. + * Protected by db_mtx. + */ + refcount_t db_holds; + + /* buffer holding our data */ + arc_buf_t *db_buf; + + kcondvar_t db_changed; + arc_buf_t *db_data_pending; + + /* + * Last time (transaction group) this buffer was dirtied. + */ + uint64_t db_dirtied; + + /* + * If dd_dnode != NULL, our link on the owner dnodes's dn_dbufs list. + * Protected by its dn_mtx. + */ + list_node_t db_link; + + /* Our link on dn_dirty_dbufs[txg] */ + list_node_t db_dirty_node[TXG_SIZE]; + uint8_t db_dirtycnt; + + /* + * Data which is unique to data (leaf) blocks: + */ + struct { + /* stuff we store for the user (see dmu_buf_set_user) */ + void *db_user_ptr; + void **db_user_data_ptr_ptr; + dmu_buf_evict_func_t *db_evict_func; + uint8_t db_immediate_evict; + uint8_t db_freed_in_flight; + + /* + * db_data_old[txg&TXG_MASK] is set when we + * dirty the buffer, so that we can retain the + * pointer even if it gets COW'd in a subsequent + * transaction group. + * + * If the buffer is dirty in any txg, it can't + * be destroyed. + */ + /* + * XXX Protected by db_mtx and dn_dirty_mtx. 
+ * db_mtx must be held to read db_dirty[], and + * both db_mtx and dn_dirty_mtx must be held to + * modify (dirty or clean). db_mtx must be held + * before dn_dirty_mtx. + */ + arc_buf_t *db_data_old[TXG_SIZE]; + blkptr_t *db_overridden_by[TXG_SIZE]; + } db_d; +} dmu_buf_impl_t; + +/* Note: the dbuf hash table is exposed only for the mdb module */ +#define DBUF_MUTEXES 256 +#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)]) +typedef struct dbuf_hash_table { + uint64_t hash_table_mask; + dmu_buf_impl_t **hash_table; + kmutex_t hash_mutexes[DBUF_MUTEXES]; +} dbuf_hash_table_t; + + +uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset); + +dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data); + +dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid); +dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, + void *tag); +dmu_buf_impl_t *dbuf_hold_bonus(struct dnode *dn, void *tag); +int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create, + void *tag, dmu_buf_impl_t **dbp); + +void dbuf_prefetch(struct dnode *dn, uint64_t blkid); + +void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); +void dbuf_remove_ref(dmu_buf_impl_t *db, void *tag); +uint64_t dbuf_refcount(dmu_buf_impl_t *db); + +void dbuf_rele(dmu_buf_impl_t *db); + +dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid); + +void dbuf_read(dmu_buf_impl_t *db); +int dbuf_read_canfail(dmu_buf_impl_t *db); +void dbuf_read_havestruct(dmu_buf_impl_t *db); +void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); +void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +void dbuf_will_fill(dmu_buf_impl_t *db, dmu_tx_t *tx); +void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx); +void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); +void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); +void dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); + +void dbuf_evict(dmu_buf_impl_t *db); + +void 
dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +void dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx); +void dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg); + +void dbuf_free_range(struct dnode *dn, uint64_t blkid, uint64_t nblks, + struct dmu_tx *); + +void dbuf_downgrade(dmu_buf_impl_t *db, int evicting); +void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); + +void dbuf_init(void); +void dbuf_fini(void); + +#ifdef ZFS_DEBUG + +/* + * There should be a ## between the string literal and fmt, to make it + * clear that we're joining two strings together, but that piece of shit + * gcc doesn't support that preprocessor token. + */ +#define dprintf_dbuf(dbuf, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char __db_buf[32]; \ + uint64_t __db_obj = (dbuf)->db.db_object; \ + if (__db_obj == DMU_META_DNODE_OBJECT) \ + (void) strcpy(__db_buf, "mdn"); \ + else \ + (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ + (u_longlong_t)__db_obj); \ + dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \ + "obj=%s lvl=%u blkid=%lld " fmt, \ + __db_buf, (dbuf)->db_level, \ + (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \ + } \ +_NOTE(CONSTCOND) } while (0) + +#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char __blkbuf[200]; \ + sprintf_blkptr(__blkbuf, bp); \ + dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \ + } \ +_NOTE(CONSTCOND) } while (0) + +#else + +#define dprintf_dbuf(db, fmt, ...) +#define dprintf_dbuf_bp(db, bp, fmt, ...) + +#endif + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DBUF_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h new file mode 100644 index 0000000000..f51ab89a90 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h @@ -0,0 +1,635 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). 
You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DMU_H +#define _SYS_DMU_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * This file describes the interface that the DMU provides for its + * consumers. + * + * The DMU also interacts with the SPA. That interface is described in + * dmu_spa.h. 
+ */ + +#include <sys/inttypes.h> +#include <sys/types.h> +#include <sys/param.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct uio; +struct vnode; +struct spa; +struct zilog; +struct zio; +struct blkptr; +struct zap_cursor; +struct dsl_dataset; +struct dsl_pool; +struct dnode; +struct drr_begin; +struct drr_end; + +typedef struct objset objset_t; +typedef struct dmu_tx dmu_tx_t; +typedef struct dsl_dir dsl_dir_t; + +typedef enum dmu_object_type { + DMU_OT_NONE, + /* general: */ + DMU_OT_OBJECT_DIRECTORY, /* ZAP */ + DMU_OT_OBJECT_ARRAY, /* UINT64 */ + DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ + DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ + DMU_OT_BPLIST, /* UINT64 */ + DMU_OT_BPLIST_HDR, /* UINT64 */ + /* spa: */ + DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ + DMU_OT_SPACE_MAP, /* UINT64 */ + /* zil: */ + DMU_OT_INTENT_LOG, /* UINT64 */ + /* dmu: */ + DMU_OT_DNODE, /* DNODE */ + DMU_OT_OBJSET, /* OBJSET */ + /* dsl: */ + DMU_OT_DSL_DATASET, /* UINT64 */ + DMU_OT_DSL_DATASET_CHILD_MAP, /* ZAP */ + DMU_OT_DSL_OBJSET_SNAP_MAP, /* ZAP */ + DMU_OT_DSL_PROPS, /* ZAP */ + DMU_OT_DSL_OBJSET, /* UINT64 */ + /* zpl: */ + DMU_OT_ZNODE, /* ZNODE */ + DMU_OT_ACL, /* ACL */ + DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */ + DMU_OT_DIRECTORY_CONTENTS, /* ZAP */ + DMU_OT_MASTER_NODE, /* ZAP */ + DMU_OT_DELETE_QUEUE, /* ZAP */ + /* zvol: */ + DMU_OT_ZVOL, /* UINT8 */ + DMU_OT_ZVOL_PROP, /* ZAP */ + /* other; for testing only! */ + DMU_OT_PLAIN_OTHER, /* UINT8 */ + DMU_OT_UINT64_OTHER, /* UINT64 */ + DMU_OT_ZAP_OTHER, /* ZAP */ + + DMU_OT_NUMTYPES +} dmu_object_type_t; + +typedef enum dmu_objset_type { + DMU_OST_NONE, + DMU_OST_META, + DMU_OST_ZFS, + DMU_OST_ZVOL, + DMU_OST_OTHER, /* For testing only! */ + DMU_OST_ANY, /* Be careful! 
*/ + DMU_OST_NUMTYPES +} dmu_objset_type_t; + +void byteswap_uint64_array(void *buf, size_t size); +void byteswap_uint32_array(void *buf, size_t size); +void byteswap_uint16_array(void *buf, size_t size); +void byteswap_uint8_array(void *buf, size_t size); +void zap_byteswap(void *buf, size_t size); +void zfs_acl_byteswap(void *buf, size_t size); +void zfs_znode_byteswap(void *buf, size_t size); + +#define DS_MODE_NONE 0 /* invalid, to aid debugging */ +#define DS_MODE_STANDARD 1 /* normal access, no special needs */ +#define DS_MODE_PRIMARY 2 /* the "main" access, e.g. a mount */ +#define DS_MODE_EXCLUSIVE 3 /* exclusive access, e.g. to destroy */ +#define DS_MODE_LEVELS 4 +#define DS_MODE_LEVEL(x) ((x) & (DS_MODE_LEVELS - 1)) +#define DS_MODE_READONLY 0x8 +#define DS_MODE_IS_READONLY(x) ((x) & DS_MODE_READONLY) +#define DS_MODE_RESTORE 0x10 +#define DS_MODE_IS_RESTORE(x) ((x) & DS_MODE_RESTORE) + +#define DS_FIND_SNAPSHOTS 0x01 + +/* + * The maximum number of bytes that can be accessed as part of one + * operation, including metadata. + */ +#define DMU_MAX_ACCESS (10<<20) /* 10MB */ + +/* + * Public routines to create, destroy, open, and close objsets. 
+ */ +int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, + objset_t **osp); +void dmu_objset_close(objset_t *os); +int dmu_objset_create(const char *name, dmu_objset_type_t type, + objset_t *clone_parent, + void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg); +int dmu_objset_destroy(const char *name); +int dmu_objset_rollback(const char *name); +int dmu_objset_rename(const char *name, const char *newname); +void dmu_objset_set_quota(objset_t *os, uint64_t quota); +uint64_t dmu_objset_get_quota(objset_t *os); +int dmu_objset_request_reservation(objset_t *os, uint64_t reservation); +void dmu_objset_find(char *name, void func(char *, void *), void *arg, + int flags); +void dmu_objset_byteswap(void *buf, size_t size); + +typedef struct dmu_buf { + uint64_t db_object; /* object that this buffer is part of */ + uint64_t db_offset; /* byte offset in this object */ + uint64_t db_size; /* size of buffer in bytes */ + void *db_data; /* data in buffer */ +} dmu_buf_t; + +typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); + +/* + * Callback function to perform byte swapping on a block. + */ +typedef void dmu_byteswap_func_t(void *buf, size_t size); + +#define DMU_POOL_DIRECTORY_OBJECT 1 +#define DMU_POOL_CONFIG "config" +#define DMU_POOL_ROOT_DATASET "root_dataset" +#define DMU_POOL_SYNC_BPLIST "sync_bplist" + +/* + * Allocate an object from this objset. The range of object numbers + * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode. + * + * The transaction must be assigned to a txg. The newly allocated + * object will be "held" in the transaction (ie. you can modify the + * newly allocated object in this transaction). + * + * dmu_object_alloc() chooses an object and returns it in *objectp. + * + * dmu_object_claim() allocates a specific object number. If that + * number is already allocated, it fails and returns EEXIST. + * + * Return 0 on success, or ENOSPC or EEXIST as specified above. 
+ */ +uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); +int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); +int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); + +/* + * Free an object from this objset. + * + * The object's data will be freed as well (ie. you don't need to call + * dmu_free(object, 0, -1, tx)). + * + * The object need not be held in the transaction. + * + * If there are any holds on this object's buffers (via dmu_buf_hold()), + * or tx holds on the object (via dmu_tx_hold_object()), you can not + * free it; it fails and returns EBUSY. + * + * If the object is not allocated, it fails and returns ENOENT. + * + * Return 0 on success, or EBUSY or ENOENT as specified above. + */ +int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx); + +/* + * Find the next allocated or free object. + * + * The objectp parameter is in-out. It will be updated to be the next + * object which is allocated. + * + * XXX Can only be called on a objset with no dirty data. + * + * Returns 0 on success, or ENOENT if there are no more objects. + */ +int dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole); + +/* + * Set the data blocksize for an object. + * + * The object cannot have any blocks allcated beyond the first. If + * the first block is allocated already, the new size must be greater + * than the current block size. If these conditions are not met, + * ENOTSUP will be returned. + * + * Returns 0 on success, or EBUSY if there are any holds on the object + * contents, or ENOTSUP as described above. + */ +int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, + int ibs, dmu_tx_t *tx); + +/* + * Set the checksum property on a dnode. 
The new checksum algorithm will + * apply to all newly written blocks; existing blocks will not be affected. + */ +void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, + dmu_tx_t *tx); + +/* + * Set the compress property on a dnode. The new compression algorithm will + * apply to all newly written blocks; existing blocks will not be affected. + */ +void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, + dmu_tx_t *tx); + +/* + * The bonus data is accessed more or less like a regular buffer. + * You must dmu_bonus_hold() to get the buffer, which will give you a + * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus + * data. As with any normal buffer, you must call dmu_buf_read() to + * read db_data, dmu_buf_will_dirty() before modifying it, and the + * object must be held in an assigned transaction before calling + * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus + * buffer as well. You must release your hold with dmu_buf_rele(). + */ +dmu_buf_t *dmu_bonus_hold(objset_t *os, uint64_t object); +dmu_buf_t *dmu_bonus_hold_tag(objset_t *os, uint64_t object, void *tag); +int dmu_bonus_max(void); + +/* + * Obtain the DMU buffer from the specified object which contains the + * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so + * that it will remain in memory. You must release the hold with + * dmu_buf_rele(). You musn't access the dmu_buf_t after releasing your + * hold. You must have a hold on any dmu_buf_t* you pass to the DMU. + * + * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill + * on the returned buffer before reading or writing the buffer's + * db_data. The comments for those routines describe what particular + * operations are valid after calling them. + * + * The object number must be a valid, allocated object number. 
+ */ +dmu_buf_t *dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset); +void dmu_buf_add_ref(dmu_buf_t *db, void* tag); +void dmu_buf_remove_ref(dmu_buf_t *db, void* tag); +void dmu_buf_rele(dmu_buf_t *db); +void dmu_buf_rele_tag(dmu_buf_t *db, void *tag); +uint64_t dmu_buf_refcount(dmu_buf_t *db); + +/* + * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a + * range of an object. A pointer to an array of dmu_buf_t*'s is + * returned (in *dbpp). + * + * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and + * frees the array. The hold on the array of buffers MUST be released + * with dmu_buf_rele_array. You can NOT release the hold on each buffer + * individually with dmu_buf_rele. + */ +dmu_buf_t **dmu_buf_hold_array(objset_t *os, uint64_t object, + uint64_t offset, uint64_t length, int *numbufs); +void dmu_buf_rele_array(dmu_buf_t **, int numbufs); + +/* + * Returns NULL on success, or the existing user ptr if it's already + * been set. + * + * user_ptr is for use by the user and can be obtained via dmu_buf_get_user(). + * + * user_data_ptr_ptr should be NULL, or a pointer to a pointer which + * will be set to db->db_data when you are allowed to access it. Note + * that db->db_data (the pointer) can change when you do dmu_buf_read(), + * dmu_buf_tryupgrade(), dmu_buf_will_dirty(), or dmu_buf_will_fill(). + * *user_data_ptr_ptr will be set to the new value when it changes. + * + * If non-NULL, pageout func will be called when this buffer is being + * excised from the cache, so that you can clean up the data structure + * pointed to by user_ptr. + * + * dmu_evict_user() will call the pageout func for all buffers in a + * objset with a given pageout func. + */ +void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr, + dmu_buf_evict_func_t *pageout_func); +/* + * set_user_ie is the same as set_user, but request immediate eviction + * when hold count goes to zero. 
+ */ +void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr, + void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func); +void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, + void *user_ptr, void *user_data_ptr_ptr, + dmu_buf_evict_func_t *pageout_func); +void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func); + +void dmu_buf_hold_data(dmu_buf_t *db); +void dmu_buf_rele_data(dmu_buf_t *db); + +/* + * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set. + */ +void *dmu_buf_get_user(dmu_buf_t *db); + +/* + * Indicate that you are going to read the buffer's data (db_data). + * + * This routine will read the data from disk if necessary. + * + * These routines will return 0 on success, or an errno if there is a + * nonrecoverable I/O error. + */ +void dmu_buf_read(dmu_buf_t *db); +int dmu_buf_read_canfail(dmu_buf_t *db); +void dmu_buf_read_array(dmu_buf_t **dbp, int numbufs); +int dmu_buf_read_array_canfail(dmu_buf_t **dbp, int numbufs); + +/* + * Indicate that you are going to modify the buffer's data (db_data). + * + * The transaction (tx) must be assigned to a txg (ie. you've called + * dmu_tx_assign()). The buffer's object must be held in the tx + * (ie. you've called dmu_tx_hold_object(tx, db->db_object)). + */ +void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); + +/* + * Indicate that you are going to modify the entire contents of the + * buffer's data ("fill" it). + * + * This routine is the same as dmu_buf_will_dirty, except that it won't + * read the contents off the disk, so the contents may be uninitialized + * and you must overwrite it. + * + * The transaction (tx) must be assigned to a txg (ie. you've called + * dmu_tx_assign()). The buffer's object must be held in the tx (ie. + * you've called dmu_tx_hold_object(tx, db->db_object)). 
+ */ +/* void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); */ + +/* + * You must create a transaction, then hold the objects which you will + * (or might) modify as part of this transaction. Then you must assign + * the transaction to a transaction group. Once the transaction has + * been assigned, you can modify buffers which belong to held objects as + * part of this transaction. You can't modify buffers before the + * transaction has been assigned; you can't modify buffers which don't + * belong to objects which this transaction holds; you can't hold + * objects once the transaction has been assigned. You may hold an + * object which you are going to free (with dmu_object_free()), but you + * don't have to. + * + * You can abort the transaction before it has been assigned. + * + * Note that you may hold buffers (with dmu_buf_hold) at any time, + * regardless of transaction state. + */ + +#define DMU_NEW_OBJECT (-1ULL) +#define DMU_OBJECT_END (-1ULL) + +dmu_tx_t *dmu_tx_create(objset_t *os); +void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); +void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, + uint64_t len); +void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int ops); +void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); +void dmu_tx_abort(dmu_tx_t *tx); +int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); +void dmu_tx_commit(dmu_tx_t *tx); + +/* + * Free up the data blocks for a defined range of a file. If size is + * zero, the range from offset to end-of-file is freed. + */ +void dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size, dmu_tx_t *tx); + +/* + * Convenience functions. + * + * Canfail routines will return 0 on success, or an errno if there is a + * nonrecoverable I/O error. 
+ */ +void dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + void *buf); +int dmu_read_canfail(objset_t *dd, uint64_t object, uint64_t offset, + uint64_t size, void *buf); +void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx); +int dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + struct uio *uio, dmu_tx_t *tx); + +/* + * Asynchronously try to read in the data. + */ +void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, + uint64_t len); + +typedef struct dmu_object_info { + /* All sizes are in bytes. */ + uint32_t doi_data_block_size; + uint32_t doi_metadata_block_size; + uint64_t doi_bonus_size; + dmu_object_type_t doi_type; + dmu_object_type_t doi_bonus_type; + uint8_t doi_indirection; /* 2 = dnode->indirect->data */ + uint8_t doi_checksum; + uint8_t doi_compress; + uint8_t doi_pad[5]; + /* Values below are number of 512-byte blocks. */ + uint64_t doi_physical_blks; /* data + metadata */ + uint64_t doi_max_block_offset; +} dmu_object_info_t; + +typedef struct dmu_object_type_info { + dmu_byteswap_func_t *ot_byteswap; + boolean_t ot_metadata; + char *ot_name; +} dmu_object_type_info_t; + +extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; + +/* + * Get information on a DMU object. + * + * Return 0 on success or ENOENT if object is not allocated. + * + * If doi is NULL, just indicates whether the object exists. + */ +int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); +void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); +void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); +void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, + u_longlong_t *nblk512); + +/* + * Get the maximum nonzero offset in the object (ie. this offset and all + * offsets following are zero). 
+ * + * XXX Perhaps integrate this with dmu_object_info(), although that + * would then have to bring in the indirect blocks. + */ +uint64_t dmu_object_max_nonzero_offset(objset_t *os, uint64_t object); + +typedef struct dmu_objset_stats { + dmu_objset_type_t dds_type; + uint8_t dds_is_snapshot; + uint8_t dds_is_placeholder; + uint8_t dds_pad[2]; + + uint64_t dds_creation_time; + uint64_t dds_creation_txg; + + char dds_clone_of[MAXNAMELEN]; + + /* How much data is there in this objset? */ + + /* + * Space referenced, taking into account pending writes and + * frees. Only relavent to filesystems and snapshots (not + * collections). + */ + uint64_t dds_space_refd; + + /* + * Space "used", taking into account pending writes and frees, and + * children's reservations (in bytes). This is the amount of + * space that will be freed if this and all dependent items are + * destroyed (eg. child datasets, objsets, and snapshots). So + * for snapshots, this is the amount of space unique to this + * snapshot. + */ + uint64_t dds_space_used; + + /* + * Compressed and uncompressed bytes consumed. Does not take + * into account reservations. Used for computing compression + * ratio. + */ + uint64_t dds_compressed_bytes; + uint64_t dds_uncompressed_bytes; + + /* + * The ds_fsid_guid is a 56-bit ID that can change to avoid + * collisions. The ds_guid is a 64-bit ID that will never + * change, so there is a small probability that it will collide. + */ + uint64_t dds_fsid_guid; + uint64_t dds_guid; + + uint64_t dds_objects_used; /* number of objects used */ + uint64_t dds_objects_avail; /* number of objects available */ + + uint64_t dds_num_clones; /* number of clones of this */ + + /* The dataset's administratively-set quota, in bytes. */ + uint64_t dds_quota; + + /* The dataset's administratively-set reservation, in bytes */ + uint64_t dds_reserved; + + /* + * The amount of additional space that this dataset can consume. + * Takes into account quotas & reservations. 
+ * (Assuming that no other datasets consume it first.) + */ + uint64_t dds_available; + + /* + * Various properties. + */ + uint64_t dds_compression; + uint64_t dds_checksum; + uint64_t dds_zoned; + char dds_compression_setpoint[MAXNAMELEN]; + char dds_checksum_setpoint[MAXNAMELEN]; + char dds_zoned_setpoint[MAXNAMELEN]; + char dds_altroot[MAXPATHLEN]; + + /* The following are for debugging purposes only */ + uint64_t dds_last_txg; + uint64_t dds_dir_obj; + uint64_t dds_objset_obj; + uint64_t dds_clone_of_obj; +} dmu_objset_stats_t; + +/* + * Get stats on a dataset. + */ +void dmu_objset_stats(objset_t *os, dmu_objset_stats_t *dds); + +int dmu_objset_is_snapshot(objset_t *os); + +extern struct spa *dmu_objset_spa(objset_t *os); +extern struct zilog *dmu_objset_zil(objset_t *os); +extern struct dsl_pool *dmu_objset_pool(objset_t *os); +extern struct dsl_dataset *dmu_objset_ds(objset_t *os); +extern void dmu_objset_name(objset_t *os, char *buf); +extern dmu_objset_type_t dmu_objset_type(objset_t *os); +extern uint64_t dmu_objset_id(objset_t *os); +extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, + uint64_t *id, uint64_t *offp); + +/* + * Return the txg number for the given assigned transaction. + */ +uint64_t dmu_tx_get_txg(dmu_tx_t *tx); /* XXX */ + +/* + * Synchronous write. + * On success returns 0 and fills in the blk pointed at by bp. + * Note that while the data covered by this function will be on stable + * storage when the function returns this new data does not become a + * permanent part of the file until the associated transaction commits. + */ +int dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, + struct blkptr *bp, uint64_t txg); + +/* + * Find the next hole or data block in file starting at *off + * Return found offset in *off. Return ESRCH for end of file. + */ +int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, + uint64_t *off); + +/* + * Initial setup and final teardown. 
+ */ +extern void dmu_init(void); +extern void dmu_fini(void); + +typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp, + uint64_t object, uint64_t offset, int len); +void dmu_traverse_objset(objset_t *os, uint64_t txg_start, + dmu_traverse_cb_t cb, void *arg); + +int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp); +int dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep, + struct vnode *vp, uint64_t voffset); + +/* CRC64 table */ +#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ +extern uint64_t zfs_crc64_table[256]; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h new file mode 100644 index 0000000000..b6e8b62ec2 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h @@ -0,0 +1,230 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_DMU_IMPL_H +#define _SYS_DMU_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/txg_impl.h> +#include <sys/zio.h> +#include <sys/dnode.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This is the locking strategy for the DMU. Numbers in parenthesis are + * cases that use that lock order, referenced below: + * + * ARC is self-contained + * bplist is self-contained + * refcount is self-contained + * txg is self-contained (hopefully!) + * zst_lock + * zf_rwlock + * + * XXX try to improve evicting path? + * + * dp_config_rwlock > os_obj_lock > dn_struct_rwlock > + * dn_dbufs_mtx > hash_mutexes > db_mtx > leafs + * + * dp_config_rwlock + * must be held before: everything + * protects dd namespace changes + * protects property changes globally + * held from: + * dsl_dir_open/r: + * dsl_dir_create_sync/w: + * dsl_dir_sync_destroy/w: + * dsl_dir_rename_sync/w: + * dsl_prop_changed_notify/r: + * + * os_obj_lock + * must be held before: + * everything except dp_config_rwlock + * protects os_obj_next + * held from: + * dmu_object_alloc: dn_dbufs_mtx, db_mtx, hash_mutexes, dn_struct_rwlock + * + * dn_struct_rwlock + * must be held before: + * everything except dp_config_rwlock and os_obj_lock + * protects structure of dnode (eg. nlevels) + * db_blkptr can change when syncing out change to nlevels + * dn_maxblkid + * dn_nlevels + * dn_*blksz* + * phys nlevels, maxblkid, physical blkptr_t's (?) + * held from: + * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch + * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz) + * dmu_tx_count_free: + * dbuf_read_impl: db_mtx, dmu_zfetch() + * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch() + * dbuf_new_size: db_mtx + * dbuf_dirty: db_mtx + * dbuf_findbp: (callers, phys? - the real need) + * dbuf_create: dn_dbufs_mtx, hash_mutexes, db_mtx (phys?) 
+ * dbuf_prefetch: dn_dirty_mtx, hash_mutexes, db_mtx, dn_dbufs_mtx + * dbuf_hold_impl: hash_mutexes, db_mtx, dn_dbufs_mtx, dbuf_findbp() + * dnode_sync/w (increase_indirection): db_mtx (phys) + * dnode_set_blksz/w: dn_dbufs_mtx (dn_*blksz*) + * dnode_new_blkid/w: (dn_maxblkid) + * dnode_free_range/w: dn_dirty_mtx (dn_maxblkid) + * dnode_next_offset: (phys) + * + * dn_dbufs_mtx + * must be held before: + * db_mtx, hash_mutexes + * protects: + * dn_dbufs + * dn_evicted + * held from: + * dmu_evict_user: db_mtx (dn_dbufs) + * dbuf_free_range: db_mtx (dn_dbufs) + * dbuf_remove_ref: db_mtx, callees: + * dbuf_hash_remove: hash_mutexes, db_mtx + * dbuf_create: hash_mutexes, db_mtx (dn_dbufs) + * dnode_set_blksz: (dn_dbufs) + * + * hash_mutexes (global) + * must be held before: + * db_mtx + * protects dbuf_hash_table (global) and db_hash_next + * held from: + * dbuf_find: db_mtx + * dbuf_hash_insert: db_mtx + * dbuf_hash_remove: db_mtx + * + * db_mtx (meta-leaf) + * must be held before: + * dn_mtx, dn_dirty_mtx, dd_lock (leaf mutexes) + * protects: + * db_state + * db_holds + * db_buf + * db_changed + * db_data_pending + * db_dirtied + * db_link + * db_dirty_node (??) 
+ * db_dirtycnt + * db_d.* + * db.* + * held from: + * dbuf_dirty: dn_mtx, dn_dirty_mtx + * dbuf_dirty->dsl_dir_willuse_space: dd_lock + * dbuf_dirty->dbuf_new_block->dsl_dataset_block_freeable: dd_lock + * dbuf_undirty: dn_dirty_mtx (db_d) + * dbuf_write_done: dn_dirty_mtx (db_state) + * dbuf_* + * dmu_buf_update_user: none (db_d) + * dmu_evict_user: none (db_d) (maybe can eliminate) + * dbuf_find: none (db_holds) + * dbuf_hash_insert: none (db_holds) + * dmu_buf_read_array_impl: none (db_state, db_changed) + * dmu_sync: none (db_dirty_node, db_d) + * dnode_reallocate: none (db) + * + * dn_mtx (leaf) + * protects: + * dn_dirty_dbufs + * dn_ranges + * phys accounting + * dn_allocated_txg + * dn_free_txg + * dn_assigned_txg + * dd_assigned_tx + * dn_notxholds + * dn_dirtyctx + * dn_dirtyctx_firstset + * (dn_phys copy fields?) + * (dn_phys contents?) + * held from: + * dnode_* + * dbuf_dirty: none + * dbuf_sync: none (phys accounting) + * dbuf_undirty: none (dn_ranges, dn_dirty_dbufs) + * dbuf_write_done: none (phys accounting) + * dmu_object_info_from_dnode: none (accounting) + * dmu_tx_commit: none + * dmu_tx_hold_object_impl: none + * dmu_tx_try_assign: dn_notxholds(cv) + * dmu_tx_unassign: none + * + * dd_lock (leaf) + * protects: + * dd_prop_cbs + * dd_sync_* + * dd_used_bytes + * dd_tempreserved + * dd_space_towrite + * dd_myname + * dd_phys accounting? 
+ * held from: + * dsl_dir_* + * dsl_prop_changed_notify: none (dd_prop_cbs) + * dsl_prop_register: none (dd_prop_cbs) + * dsl_prop_unregister: none (dd_prop_cbs) + * dsl_dataset_block_freeable: none (dd_sync_*) + * + * os_lock (leaf) + * protects: + * os_dirty_dnodes + * os_free_dnodes + * os_dnodes + * os_downgraded_dbufs + * dn_dirtyblksz + * dn_dirty_link + * held from: + * dnode_create: none (os_dnodes) + * dnode_destroy: none (os_dnodes) + * dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes) + * dnode_free: none (dn_dirtyblksz, os_*_dnodes) + * + * ds_lock (leaf) + * protects: + * ds_user_ptr + * ds_user_evice_func + * ds_open_refcount + * ds_snapname + * ds_phys accounting + * held from: + * dsl_dataset_* + * + */ + +struct objset; +struct dmu_pool; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_IMPL_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h new file mode 100644 index 0000000000..d0a77fcfb9 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h @@ -0,0 +1,122 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 
+ * Use is subject to license terms. + */ + +#ifndef _SYS_DMU_OBJSET_H +#define _SYS_DMU_OBJSET_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/zfs_context.h> +#include <sys/dnode.h> +#include <sys/zio.h> +#include <sys/zil.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_dataset; +struct dmu_tx; +struct objset_impl; + +typedef struct objset_phys { + dnode_phys_t os_meta_dnode; + zil_header_t os_zil_header; + uint64_t os_type; + char os_pad[1024 - sizeof (dnode_phys_t) - sizeof (zil_header_t) - + sizeof (uint64_t)]; +} objset_phys_t; + +struct objset { + struct objset_impl *os; + int os_mode; +}; + +typedef struct objset_impl { + /* Immutable: */ + struct dsl_dataset *os_dsl_dataset; + spa_t *os_spa; + objset_phys_t *os_phys; + dnode_t *os_meta_dnode; + zilog_t *os_zil; + objset_t os; + uint8_t os_checksum; /* can change, under dsl_dir's locks */ + uint8_t os_compress; /* can change, under dsl_dir's locks */ + uint8_t os_md_checksum; + uint8_t os_md_compress; + + /* no lock needed: */ + struct dmu_tx *os_synctx; /* XXX sketchy */ + blkptr_t os_rootbp; + + /* Protected by os_obj_lock */ + kmutex_t os_obj_lock; + uint64_t os_obj_next; + + /* Protected by os_lock */ + kmutex_t os_lock; + list_t os_dirty_dnodes[TXG_SIZE]; + list_t os_free_dnodes[TXG_SIZE]; + list_t os_dnodes; + list_t os_downgraded_dbufs; +} objset_impl_t; + +#define DMU_PRIVATE_OBJECT (1ULL << 63) + +#define DMU_META_DNODE_OBJECT (1ULL << 63) + +/* XXX rename this to DMU_IS_DNODE_OBJECT? 
*/ +#define IS_DNODE_DNODE(object) ((object) == DMU_META_DNODE_OBJECT) + +/* called from zpl */ +int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, + objset_t **osp); +void dmu_objset_close(objset_t *os); +int dmu_objset_create(const char *name, dmu_objset_type_t type, + objset_t *clone_parent, + void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg); +int dmu_objset_destroy(const char *name); +int dmu_objset_rollback(const char *name); +void dmu_objset_stats(objset_t *os, dmu_objset_stats_t *dds); +void dmu_objset_find(char *name, void func(char *, void *), void *arg, + int flags); +void dmu_objset_byteswap(void *buf, size_t size); + +/* called from dsl */ +void dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx); +objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, + dmu_objset_type_t type, dmu_tx_t *tx); +objset_impl_t *dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, + blkptr_t *bp); +void dmu_objset_evict(struct dsl_dataset *ds, void *arg); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_OBJSET_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h new file mode 100644 index 0000000000..7087912e00 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h @@ -0,0 +1,125 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DMU_TRAVERSE_H +#define _SYS_DMU_TRAVERSE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu.h> +#include <sys/dnode.h> +#include <sys/arc.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ADVANCE_POST 0 /* post-order traversal */ +#define ADVANCE_PRE 0x01 /* pre-order traversal */ +#define ADVANCE_PRUNE 0x02 /* prune by prev snapshot birth time */ +#define ADVANCE_DATA 0x04 /* read user data blocks */ +#define ADVANCE_HOLES 0x08 /* visit holes */ +#define ADVANCE_NOLOCK 0x10 /* Don't grab SPA sync lock */ + +#define ZB_NO_LEVEL -2 +#define ZB_MAXLEVEL 32 /* Next power of 2 >= DN_MAX_LEVELS */ +#define ZB_MAXBLKID (1ULL << 62) +#define ZB_MAXOBJSET (1ULL << 62) +#define ZB_MAXOBJECT (1ULL << 62) + +#define ZB_MOS_CACHE 0 +#define ZB_MDN_CACHE 1 +#define ZB_DN_CACHE 2 +#define ZB_DEPTH 3 + +typedef struct zbookmark { + uint64_t zb_objset; + uint64_t zb_object; + int zb_level; + uint64_t zb_blkid; +} zbookmark_t; + +typedef struct zseg { + uint64_t seg_mintxg; + uint64_t seg_maxtxg; + zbookmark_t seg_start; + zbookmark_t seg_end; + list_node_t seg_node; +} zseg_t; + +typedef struct traverse_blk_cache { + zbookmark_t bc_bookmark; + blkptr_t bc_blkptr; + void *bc_data; + dnode_phys_t *bc_dnode; + int bc_errno; + int bc_pad1; + uint64_t bc_pad2; +} traverse_blk_cache_t; + +typedef int (blkptr_cb_t)(traverse_blk_cache_t *bc, spa_t *spa, void *arg); + +struct traverse_handle { + spa_t *th_spa; + blkptr_cb_t *th_func; + void *th_arg; + int th_advance; + int th_zio_flags; + list_t th_seglist; + traverse_blk_cache_t 
th_cache[ZB_DEPTH][ZB_MAXLEVEL]; + uint64_t th_hits; + uint64_t th_arc_hits; + uint64_t th_reads; + uint64_t th_callbacks; + uint64_t th_syncs; + uint64_t th_restarts; + zbookmark_t th_noread; + zbookmark_t th_lastcb; +}; + +int traverse_dsl_dataset(struct dsl_dataset *ds, uint64_t txg_start, + int advance, blkptr_cb_t func, void *arg); + +traverse_handle_t *traverse_init(spa_t *spa, blkptr_cb_t *func, void *arg, + int advance, int zio_flags); +void traverse_fini(traverse_handle_t *th); + +void traverse_add_dnode(traverse_handle_t *th, + uint64_t mintxg, uint64_t maxtxg, uint64_t objset, uint64_t object); +void traverse_add_objset(traverse_handle_t *th, + uint64_t mintxg, uint64_t maxtxg, uint64_t objset); +void traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg); + +int traverse_more(traverse_handle_t *th); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_TRAVERSE_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_tx.h b/usr/src/uts/common/fs/zfs/sys/dmu_tx.h new file mode 100644 index 0000000000..5d2f1127ce --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/dmu_tx.h @@ -0,0 +1,154 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DMU_TX_H +#define _SYS_DMU_TX_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/inttypes.h> +#include <sys/dmu.h> +#include <sys/txg.h> +#include <sys/refcount.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dmu_buf_impl; +struct dnode_link; +struct dsl_pool; +struct dnode; +struct dsl_dir; + +struct dmu_tx { + /* + * No synchronization is needed because a tx can only be handled + * by one thread. + */ + list_t tx_holds; /* list of dmu_tx_hold_t */ + objset_t *tx_objset; + struct dsl_dir *tx_dir; + struct dsl_pool *tx_pool; + uint64_t tx_txg; + txg_handle_t tx_txgh; + uint64_t tx_space_towrite; + refcount_t tx_space_written; + uint64_t tx_space_tofree; + refcount_t tx_space_freed; + uint64_t tx_space_tooverwrite; + void *tx_tempreserve_cookie; + uint8_t tx_anyobj; + uint8_t tx_privateobj; +#ifdef ZFS_DEBUG + char *tx_debug_buf; + int tx_debug_len; +#endif +}; + +enum dmu_tx_hold_type { + THT_NEWOBJECT, + THT_WRITE, + THT_BONUS, + THT_FREE, + THT_ZAP, + THT_SPACE, + THT_NUMTYPES +}; + +typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, + uint64_t arg1, uint64_t arg2); + + +typedef struct dmu_tx_hold { + list_node_t dth_node; + struct dnode *dth_dnode; + enum dmu_tx_hold_type dth_type; + dmu_tx_hold_func_t dth_func; + uint64_t dth_arg1; + uint64_t dth_arg2; + /* XXX track what the actual estimates were for this hold */ +} dmu_tx_hold_t; + + +/* + * These routines are defined in dmu.h, and are called by the user. 
+ */ +dmu_tx_t *dmu_tx_create(objset_t *dd); +int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); +void dmu_tx_commit(dmu_tx_t *tx); +void dmu_tx_abort(dmu_tx_t *tx); +uint64_t dmu_tx_get_txg(dmu_tx_t *tx); + +/* + * These routines are defined in dmu_spa.h, and are called by the SPA. + */ +extern dmu_tx_t *dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg); + +/* + * These routines are only called by the DMU. + */ +dmu_tx_t *dmu_tx_create_ds(dsl_dir_t *dd); +int dmu_tx_is_syncing(dmu_tx_t *tx); +int dmu_tx_private_ok(dmu_tx_t *tx); +void dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object); +void dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta); +void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db); +int dmu_tx_holds(dmu_tx_t *tx, uint64_t object); +void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space); + +#ifdef ZFS_DEBUG + +extern int dmu_use_tx_debug_bufs; + +#define dprintf_tx(tx, fmt, ...) \ + if (dmu_use_tx_debug_bufs) \ + do { \ + char *__bufp; \ + int __len; \ + if (tx->tx_debug_buf == NULL) { \ + __bufp = kmem_zalloc(4096, KM_SLEEP); \ + tx->tx_debug_buf = __bufp; \ + tx->tx_debug_len = __len = 4096; \ + } else { \ + __len = tx->tx_debug_len; \ + __bufp = &tx->tx_debug_buf[4096-__len]; \ + } \ + tx->tx_debug_len -= snprintf(__bufp, __len, fmt, __VA_ARGS__); \ +_NOTE(CONSTCOND) } while (0); \ + else dprintf(fmt, __VA_ARGS__) + +#else + +#define dprintf_tx(tx, fmt, ...) + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_TX_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h b/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h new file mode 100644 index 0000000000..35466d6874 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h @@ -0,0 +1,76 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _DFETCH_H +#define _DFETCH_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern uint64_t zfetch_array_rd_sz; + +struct dnode; /* so we can reference dnode */ + +typedef enum zfetch_dirn { + ZFETCH_FORWARD = 1, /* prefetch increasing block numbers */ + ZFETCH_BACKWARD = -1 /* prefetch decreasing block numbers */ +} zfetch_dirn_t; + +typedef struct zstream { + uint64_t zst_offset; /* offset of starting block in range */ + uint64_t zst_len; /* length of range, in blocks */ + zfetch_dirn_t zst_direction; /* direction of prefetch */ + uint64_t zst_stride; /* length of stride, in blocks */ + uint64_t zst_ph_offset; /* prefetch offset, in blocks */ + uint64_t zst_cap; /* prefetch limit (cap), in blocks */ + kmutex_t zst_lock; /* protects stream */ + clock_t zst_last; /* lbolt of last prefetch */ + avl_node_t zst_node; /* embed avl node here */ +} zstream_t; + +typedef struct zfetch { + krwlock_t zf_rwlock; /* protects zfetch structure */ + list_t zf_stream; /* AVL tree of zstream_t's */ + struct dnode *zf_dnode; /* dnode that owns this zfetch */ + uint32_t zf_stream_cnt; /* # of active streams */ + uint64_t zf_alloc_fail; /* # of failed attempts to alloc strm */ +} zfetch_t; + 
+void dmu_zfetch_init(zfetch_t *, struct dnode *); +void dmu_zfetch_rele(zfetch_t *); +void dmu_zfetch(zfetch_t *, uint64_t, uint64_t); + + +#ifdef __cplusplus +} +#endif + +#endif /* _DFETCH_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/dnode.h b/usr/src/uts/common/fs/zfs/sys/dnode.h new file mode 100644 index 0000000000..2a5ef92b52 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/dnode.h @@ -0,0 +1,301 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DNODE_H +#define _SYS_DNODE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/avl.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/refcount.h> +#include <sys/dmu_zfetch.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Flags. + */ +#define DNODE_MUST_BE_ALLOCATED 1 +#define DNODE_MUST_BE_FREE 2 + +/* + * Fixed constants. 
+ */ +#define DNODE_SHIFT 9 /* 512 bytes */ +#define DN_MIN_INDBLKSHIFT 10 /* 1k */ +#define DN_MAX_INDBLKSHIFT 14 /* 16k */ +#define DNODE_BLOCK_SHIFT 14 /* 16k */ +#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */ +#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */ +#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */ + +/* + * Derived constants. + */ +#define DNODE_SIZE (1 << DNODE_SHIFT) +#define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT) +#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT)) + +#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) +#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) +#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT) + +#define DN_META_DNODE_LEVELS \ + (1 + (DN_MAX_OBJECT_SHIFT - DNODE_SHIFT + SPA_BLKPTRSHIFT - \ + DNODES_PER_BLOCK_SHIFT) / DNODES_PER_LEVEL_SHIFT) + +/* The +2 here is a cheesy way to round up */ +#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \ + (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT))) + +#define DN_MAX_OBJECT \ + ((uint64_t)DN_MAX_NBLKPTR << (DNODES_PER_BLOCK_SHIFT + \ + (DN_META_DNODE_LEVELS - 1) * DNODES_PER_LEVEL_SHIFT)) + +#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \ + (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t)))) + +#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift)) + +struct dmu_buf_impl; +struct objset_impl; +struct zio; + +enum dnode_dirtycontext { + DN_UNDIRTIED, + DN_DIRTY_OPEN, + DN_DIRTY_SYNC +}; + +typedef struct dnode_phys { + uint8_t dn_type; /* dmu_object_type_t */ + uint8_t dn_indblkshift; /* ln2(indirect block size) */ + uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */ + uint8_t dn_nblkptr; /* length of dn_blkptr */ + uint8_t dn_bonustype; /* type of data in bonus buffer */ + uint8_t dn_checksum; /* ZIO_CHECKSUM type */ + uint8_t dn_compress; /* ZIO_COMPRESS type */ + uint8_t dn_pad1[1]; + uint16_t dn_datablkszsec; /* data 
block size in 512b sectors */ + uint16_t dn_bonuslen; /* length of dn_bonus */ + uint8_t dn_pad2[4]; + + /* accounting is protected by dn_dirty_mtx */ + uint64_t dn_maxblkid; /* largest allocated block ID */ + uint64_t dn_secphys; /* 512b sectors of disk space used */ + + uint64_t dn_pad3[4]; + + blkptr_t dn_blkptr[1]; + uint8_t dn_bonus[DN_MAX_BONUSLEN]; +} dnode_phys_t; + +typedef struct dnode { + /* + * lock ordering: + * + * db_mtx > dn_dirty_mtx + * dbuf_syncdone + * + * dn_struct_rwlock/r > dn_dirty_mtx + * dmu_object_info + * + * dn_struct_rwlock/r > db_mtx > dn_dirty_mtx + * dbuf_dirty + * dbuf_setdirty + * + * dn_struct_rwlock/w > db_mtx > dn_mtx + * dnode_increase_indirection -> dbuf_find + * dbuf_hold_impl + * dnode_set_bonus + * + * dn_struct_rwlock/w > dn_mtx + * dnode_increase_indirection + * + * dn_dirty_mtx > dn_mtx + * dnode_buf_pageout + * + * db_mtx > dn_mtx + * dbuf_create + */ + + /* + * dn_struct_rwlock protects the structure of the dnode. + * In particular, it protects the number of levels of indirection. + */ + krwlock_t dn_struct_rwlock; + + /* + * Our link on dataset's dd_dnodes list. + * Protected by dd_accounting_mtx. + */ + list_node_t dn_link; + + /* immutable: */ + struct objset_impl *dn_objset; + uint64_t dn_object; + struct dmu_buf_impl *dn_dbuf; + dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */ + + /* + * Copies of stuff in dn_phys. They're valid here even before + * the dnode is first synced. + */ + dmu_object_type_t dn_type; /* object type (immutable) */ + uint8_t dn_bonustype; /* bonus type (immutable) */ + uint16_t dn_bonuslen; /* bonus length (immutable) */ + uint8_t dn_nblkptr; /* number of blkptrs (immutable) */ + uint8_t dn_datablkshift; /* zero if blksz not power of 2! 
*/ + uint32_t dn_datablksz; /* in bytes */ + uint16_t dn_datablkszsec; /* in 512b sectors */ + + uint8_t dn_checksum; /* ZIO_CHECKSUM type */ + uint8_t dn_compress; /* ZIO_COMPRESS type */ + + /* + * The following are kept up-to-date in the *open* context, the syncing + * context should only pay attention to the dn_next_* values. + */ + uint8_t dn_nlevels; + uint8_t dn_indblkshift; + + uint8_t dn_next_nlevels[TXG_SIZE]; + uint8_t dn_next_indblkshift[TXG_SIZE]; + + /* protected by os_lock: */ + uint32_t dn_dirtyblksz[TXG_SIZE]; /* dirty block size in bytes */ + list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */ + + /* protected by dn_mtx: */ + kmutex_t dn_mtx; + list_t dn_dirty_dbufs[TXG_SIZE]; + uint64_t dn_maxblkid; + avl_tree_t dn_ranges[TXG_SIZE]; + uint64_t dn_allocated_txg; + uint64_t dn_free_txg; + uint64_t dn_assigned_txg; + struct dmu_tx *dn_assigned_tx; /* if only one tx cares */ + kcondvar_t dn_notxholds; + enum dnode_dirtycontext dn_dirtyctx; + uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */ + + /* protected by own devices */ + refcount_t dn_tx_holds; + refcount_t dn_holds; + + kmutex_t dn_dbufs_mtx; + list_t dn_dbufs; /* linked list of descendent dbuf_t's */ + kcondvar_t dn_evicted; /* a child dbuf has been evicted */ + + /* + * Performance hack: whenever we have a hold on the bonus buffer of a + * ZAP object, we will also have a hold on db0. This will keep the + * meta-data for a micro-zap object cached as long as the znode for the + * object is in the znode cache. 
+ */ + struct dmu_buf_impl *dn_db0; + + /* holds prefetch structure */ + struct zfetch dn_zfetch; +} dnode_t; + +typedef struct free_range { + avl_node_t fr_node; + uint64_t fr_blkid; + uint64_t fr_nblks; +} free_range_t; + +dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp, + uint64_t object); +void dnode_special_close(dnode_t *dn); + +dnode_t *dnode_hold(struct objset_impl *dd, uint64_t object, void *ref); +dnode_t *dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag, + void *ref); +void dnode_add_ref(dnode_t *dn, void *ref); +void dnode_rele(dnode_t *dn, void *ref); +void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); +int dnode_sync(dnode_t *dn, int level, struct zio *zio, dmu_tx_t *tx); +void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +void dnode_free(dnode_t *dn, dmu_tx_t *tx); +void dnode_byteswap(dnode_phys_t *dnp); +void dnode_buf_byteswap(void *buf, size_t size); +void dnode_verify(dnode_t *dn); +int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx); +uint64_t dnode_current_max_length(dnode_t *dn); +uint64_t dnode_max_nonzero_offset(dnode_t *dn); +void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx); +void dnode_clear_range(dnode_t *dn, uint64_t blkid, + uint64_t nblks, dmu_tx_t *tx); +void dnode_diduse_space(dnode_t *dn, int64_t space); +void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx); +void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx); +uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid); +void dnode_init(void); +void dnode_fini(void); +int dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *off, int minlvl, + uint64_t blkfill); + +#ifdef ZFS_DEBUG + +/* + * There should be a ## between the string literal and fmt, to make 
it + * clear that we're joining two strings together, but that piece of shit + * gcc doesn't support that preprocessor token. + */ +#define dprintf_dnode(dn, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char __db_buf[32]; \ + uint64_t __db_obj = (dn)->dn_object; \ + if (__db_obj == DMU_META_DNODE_OBJECT) \ + (void) strcpy(__db_buf, "mdn"); \ + else \ + (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ + (u_longlong_t)__db_obj);\ + dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \ + __db_buf, __VA_ARGS__); \ + } \ +_NOTE(CONSTCOND) } while (0) + +#else + +#define dprintf_dnode(db, fmt, ...) + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DNODE_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h new file mode 100644 index 0000000000..e56c8a67d9 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h @@ -0,0 +1,164 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_DSL_DATASET_H +#define _SYS_DSL_DATASET_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/bplist.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_dataset; +struct dsl_dir; +struct dsl_pool; + +typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *); + +typedef struct dsl_dataset_phys { + uint64_t ds_dir_obj; + uint64_t ds_prev_snap_obj; + uint64_t ds_prev_snap_txg; + uint64_t ds_next_snap_obj; + uint64_t ds_snapnames_zapobj; /* zap obj of snaps; ==0 for snaps */ + uint64_t ds_num_children; /* clone/snap children; ==0 for head */ + uint64_t ds_creation_time; /* seconds since 1970 */ + uint64_t ds_creation_txg; + uint64_t ds_deadlist_obj; + uint64_t ds_used_bytes; + uint64_t ds_compressed_bytes; + uint64_t ds_uncompressed_bytes; + uint64_t ds_unique_bytes; /* only relavent to snapshots */ + /* + * The ds_fsid_guid is a 56-bit ID that can change to avoid + * collisions. The ds_guid is a 64-bit ID that will never + * change, so there is a small probability that it will collide. + */ + uint64_t ds_fsid_guid; + uint64_t ds_guid; + uint64_t ds_restoring; /* boolean */ + blkptr_t ds_bp; + uint64_t ds_pad[8]; /* pad out to 256 bytes for good measure */ +} dsl_dataset_phys_t; + +typedef struct dsl_dataset { + /* Immutable: */ + struct dsl_dir *ds_dir; + dsl_dataset_phys_t *ds_phys; + dmu_buf_t *ds_dbuf; + uint64_t ds_object; + + /* only used in syncing context: */ + struct dsl_dataset *ds_prev; /* only valid for non-snapshots */ + + /* has internal locking: */ + bplist_t ds_deadlist; + + /* protected by lock on pool's dp_dirty_datasets list */ + txg_node_t ds_dirty_link; + list_node_t ds_synced_link; + + /* + * ds_phys->ds_<accounting> is also protected by ds_lock. 
+ * Protected by ds_lock: + */ + kmutex_t ds_lock; + void *ds_user_ptr; + dsl_dataset_evict_func_t *ds_user_evict_func; + uint64_t ds_open_refcount; + + /* Protected by ds_lock; keep at end of struct for better locality */ + char ds_snapname[MAXNAMELEN]; +} dsl_dataset_t; + +#define dsl_dataset_is_snapshot(ds) \ + ((ds)->ds_phys->ds_num_children != 0) + +int dsl_dataset_open_spa(spa_t *spa, const char *name, int mode, + void *tag, dsl_dataset_t **dsp); +int dsl_dataset_open(const char *name, int mode, void *tag, + dsl_dataset_t **dsp); +dsl_dataset_t *dsl_dataset_open_obj(struct dsl_pool *dp, uint64_t dsobj, + const char *tail, int mode, void *tag); +void dsl_dataset_name(dsl_dataset_t *ds, char *name); +void dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag); +int dsl_dataset_create_sync(dsl_dir_t *pds, const char *fullname, + const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx); +int dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx); +int dsl_dataset_destroy(const char *name); +int dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx); +int dsl_dataset_rollback(const char *name); +int dsl_dataset_rollback_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx); +int dsl_dataset_rename(const char *name, const char *newname); + +void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds, + void *p, dsl_dataset_evict_func_t func); +void *dsl_dataset_get_user_ptr(dsl_dataset_t *ds); + +void dsl_dataset_get_blkptr(dsl_dataset_t *ds, blkptr_t *bp); +void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); + +spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds); + +void dsl_dataset_sync(dsl_dataset_t *os, dmu_tx_t *tx); + +void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); +void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); +int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth, + dmu_tx_t *tx); + +void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx); +void 
dsl_dataset_stats(dsl_dataset_t *os, dmu_objset_stats_t *dds); +struct dsl_pool *dsl_dataset_pool(dsl_dataset_t *ds); + +void dsl_dataset_create_root(struct dsl_pool *dp, uint64_t *ddobjp, + dmu_tx_t *tx); + +#ifdef ZFS_DEBUG +#define dprintf_ds(ds, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char *__ds_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); \ + dsl_dataset_name(ds, __ds_name); \ + dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \ + kmem_free(__ds_name, MAXNAMELEN); \ + } \ +_NOTE(CONSTCOND) } while (0) +#else +#define dprintf_ds(dd, fmt, ...) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_DATASET_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h new file mode 100644 index 0000000000..0499d731e6 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h @@ -0,0 +1,143 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_DSL_DIR_H +#define _SYS_DSL_DIR_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dsl_pool.h> +#include <sys/refcount.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_dataset; + +typedef struct dsl_dir_phys { + uint64_t dd_creation_time; + uint64_t dd_head_dataset_obj; + uint64_t dd_parent_obj; + uint64_t dd_clone_parent_obj; + uint64_t dd_child_dir_zapobj; + /* + * how much space our children are accounting for; for leaf + * datasets, == physical space used by fs + snaps + */ + uint64_t dd_used_bytes; + uint64_t dd_compressed_bytes; + uint64_t dd_uncompressed_bytes; + /* Administrative quota setting */ + uint64_t dd_quota; + /* Administrative reservation setting */ + uint64_t dd_reserved; + uint64_t dd_props_zapobj; + uint64_t dd_pad[21]; /* pad out to 256 bytes for good measure */ +} dsl_dir_phys_t; + +struct dsl_dir { + /* These are immutable; no lock needed: */ + uint64_t dd_object; + dsl_dir_phys_t *dd_phys; + dmu_buf_t *dd_dbuf; + dsl_pool_t *dd_pool; + + /* protected by lock on pool's dp_dirty_dirs list */ + txg_node_t dd_dirty_link; + + /* protected by dp_config_rwlock */ + dsl_dir_t *dd_parent; + + /* Protected by dd_lock */ + kmutex_t dd_lock; + list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */ + /* Thing to do when we sync */ + uint64_t dd_sync_txg; + int (*dd_sync_func)(dsl_dir_t *dd, void *arg, dmu_tx_t *tx); + void *dd_sync_arg; + int dd_sync_err; + + /* Accounting */ + /* reflects any changes to dd_phys->dd_used_bytes made this syncing */ + int64_t dd_used_bytes; + /* int64_t dd_compressed_bytes; */ + /* int64_t dd_uncompressed_bytes; */ + /* gross estimate of space used by in-flight tx's */ + uint64_t dd_tempreserved[TXG_SIZE]; + /* amount of space we expect to write; == amount of dirty data */ + int64_t dd_space_towrite[TXG_SIZE]; + + /* protected by dd_lock; keep at end of struct for better locality */ + char dd_myname[MAXNAMELEN]; +}; + +void 
dsl_dir_close(dsl_dir_t *dd, void *tag); +dsl_dir_t *dsl_dir_open(const char *name, void *tag, const char **tail); +dsl_dir_t *dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, + const char **tailp); +dsl_dir_t *dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, + const char *tail, void *tag); +void dsl_dir_name(dsl_dir_t *dd, char *buf); +int dsl_dir_is_private(dsl_dir_t *dd); +int dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx); +void dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx); +int dsl_dir_destroy_sync(dsl_dir_t *pds, void *arg, dmu_tx_t *tx); +void dsl_dir_stats(dsl_dir_t *dd, dmu_objset_stats_t *dds); +void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx); +void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx); +int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem, + uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx); +void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx); +void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx); +void dsl_dir_diduse_space(dsl_dir_t *dd, + int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx); +int dsl_dir_sync_task(dsl_dir_t *dd, + int (*func)(dsl_dir_t *, void*, dmu_tx_t *), void *arg, uint64_t space); +int dsl_dir_set_quota(const char *ddname, uint64_t quota); +int dsl_dir_set_reservation(const char *ddname, uint64_t reservation); +int dsl_dir_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx); + +#ifdef ZFS_DEBUG +#define dprintf_dd(dd, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char *__ds_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); \ + dsl_dir_name(dd, __ds_name); \ + dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \ + kmem_free(__ds_name, MAXNAMELEN); \ + } \ +_NOTE(CONSTCOND) } while (0) +#else +#define dprintf_dd(dd, fmt, ...) 
+#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_DIR_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h new file mode 100644 index 0000000000..4fca4548ad --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h @@ -0,0 +1,82 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_DSL_POOL_H +#define _SYS_DSL_POOL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/txg_impl.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct objset; +struct dsl_dir; + +typedef struct dsl_pool { + /* Immutable */ + spa_t *dp_spa; + struct objset *dp_meta_objset; + struct dsl_dir *dp_root_dir; + struct dsl_dir *dp_mos_dir; + uint64_t dp_root_dir_obj; + + /* No lock needed - sync context only */ + blkptr_t dp_meta_rootbp; + list_t dp_synced_objsets; + + /* Has its own locking */ + tx_state_t dp_tx; + txg_list_t dp_dirty_datasets; + txg_list_t dp_dirty_dirs; + + /* + * Protects administrative changes (properties, namespace) + * It is only held for write in syncing context. Therefore + * syncing context does not need to ever have it for read, since + * nobody else could possibly have it for write. + */ + krwlock_t dp_config_rwlock; +} dsl_pool_t; + +dsl_pool_t *dsl_pool_open(spa_t *spa, uint64_t txg); +void dsl_pool_close(dsl_pool_t *dp); +dsl_pool_t *dsl_pool_create(spa_t *spa, uint64_t txg); +void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg); +void dsl_pool_zil_clean(dsl_pool_t *dp); +int dsl_pool_sync_context(dsl_pool_t *dp); +uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_POOL_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_prop.h b/usr/src/uts/common/fs/zfs/sys/dsl_prop.h new file mode 100644 index 0000000000..ea810b03ab --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/dsl_prop.h @@ -0,0 +1,75 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DSL_PROP_H +#define _SYS_DSL_PROP_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dsl_pool.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_dataset; + +/* The callback func may not call into the DMU or DSL! */ +typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval); + +#define DSL_PROP_VALUE_UNDEFINED (-1ULL) + +typedef struct dsl_prop_cb_record { + list_node_t cbr_node; /* link on dd_prop_cbs */ + const char *cbr_propname; + dsl_prop_changed_cb_t *cbr_func; + void *cbr_arg; +} dsl_prop_cb_record_t; + +int dsl_prop_register(struct dsl_dataset *ds, const char *propname, + dsl_prop_changed_cb_t *callback, void *cbarg); +int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname, + dsl_prop_changed_cb_t *callback, void *cbarg); + +int dsl_prop_get(const char *ddname, const char *propname, + int intsz, int numints, void *buf, char *setpoint); +int dsl_prop_get_string(const char *ddname, const char *propname, + char *value, int valuelen, char *setpoint); +int dsl_prop_get_integer(const char *ddname, const char *propname, + uint64_t *valuep, char *setpoint); +int dsl_prop_get_ds_integer(dsl_dir_t *dd, const char *propname, + uint64_t *valuep, char *setpoint); + +int dsl_prop_set(const char *ddname, const char *propname, + int intsz, int numints, const void *buf); + +#ifdef 
__cplusplus +} +#endif + +#endif /* _SYS_DSL_PROP_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h new file mode 100644 index 0000000000..e592b388fd --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h @@ -0,0 +1,73 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_METASLAB_H +#define _SYS_METASLAB_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/space_map.h> +#include <sys/txg.h> +#include <sys/zio.h> +#include <sys/avl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct metaslab_class metaslab_class_t; +typedef struct metaslab_group metaslab_group_t; + +extern void metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, + metaslab_t **mspp, uint64_t offset, uint64_t size, uint64_t txg); +extern void metaslab_fini(metaslab_t *msp); +extern void metaslab_sync(metaslab_t *msp, uint64_t txg); +extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg); + +extern int metaslab_alloc(spa_t *spa, uint64_t size, dva_t *dva, uint64_t txg); +extern void metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg); +extern int metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg); + +extern metaslab_class_t *metaslab_class_create(void); +extern void metaslab_class_destroy(metaslab_class_t *mc); +extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg); +extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg); + +extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc, + vdev_t *vd); +extern void metaslab_group_destroy(metaslab_group_t *mg); +extern void metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp, + uint64_t weight); +extern void metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp); +extern void metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, + uint64_t weight); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_METASLAB_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h new file mode 100644 index 0000000000..5b1e388727 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h @@ -0,0 +1,125 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution 
License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_METASLAB_IMPL_H +#define _SYS_METASLAB_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/metaslab.h> +#include <sys/space_map.h> +#include <sys/vdev.h> +#include <sys/txg.h> +#include <sys/avl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct metaslab_class { + metaslab_group_t *mc_rotor; + uint64_t mc_allocated; +}; + +struct metaslab_group { + kmutex_t mg_lock; + avl_tree_t mg_metaslab_tree; + uint64_t mg_aliquot; + int64_t mg_bias; + metaslab_class_t *mg_class; + vdev_t *mg_vd; + metaslab_group_t *mg_prev; + metaslab_group_t *mg_next; +}; + +/* + * Each metaslab's free block list is kept in its own DMU object in the + * metaslab freelist dataset. To minimize space consumption, the list + * is circular. + * + * Allocations and frees can happen in multiple transaction groups at + * the same time, which makes it a bit challening to keep the metaslab + * consistent. For example, we cannot allow frees from different + * transaction groups to be interleaved in the metaslab's free block list. 
+ * + * We address this in several ways: + * + * We don't allow allocations from the same metaslab in concurrent + * transaction groups. metaslab_alloc() enforces this by checking + * the ms_last_alloc field, which specifies the last txg in which + * the metaslab was used for allocations. + * + * We can't segregate frees this way because we can't choose which + * DVAs someone wants to free. So we keep separate in-core freelists + * for each active transaction group. This in-core data is only + * written to the metaslab's on-disk freelist in metaslab_sync(), + * which solves the interleave problem: we only append frees from + * the syncing txg to the on-disk freelist, so the appends all occur + * in txg order. + * + * We cannot allow a block which was freed in a given txg to be + * allocated again until that txg has closed; otherwise, if we + * failed to sync that txg and had to roll back to txg - 1, + * changes in txg + 1 could have overwritten the data. Therefore, + * we partition the free blocks into "available" and "limbo" states. + * A block is available if the txg in which it was freed has closed; + * until then, the block is in limbo. Each time metaslab_sync() runs, + * if first adds any limbo blocks to the avail list, clears the limbo + * list, and starts writing the new limbo blocks (i.e. the ones that + * were freed in the syncing txg). + */ + +struct metaslab { + kmutex_t ms_lock; /* metaslab lock */ + space_map_obj_t *ms_smo; /* space map object */ + uint64_t ms_last_alloc; /* txg of last alloc */ + uint64_t ms_usable_end; /* end of free_obj at last sync */ + uint64_t ms_usable_space; /* usable space at last sync */ + metaslab_group_t *ms_group; /* metaslab group */ + avl_node_t ms_group_node; /* node in metaslab group tree */ + uint64_t ms_weight; /* weight vs. 
others in group */ + uint8_t ms_dirty[TXG_SIZE]; /* per-txg dirty flags */ + space_map_t ms_allocmap[TXG_SIZE]; /* allocated this txg */ + space_map_t ms_freemap[TXG_SIZE]; /* freed this txg */ + txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ + space_map_t ms_map; /* in-core free space map */ + uint8_t ms_map_incore; /* space map contents are valid */ + uint64_t ms_map_cursor[SPA_ASIZEBITS]; /* XXX -- PPD */ +}; + +/* + * ms_dirty[] flags + */ +#define MSD_ALLOC 0x01 /* allocated from in this txg */ +#define MSD_FREE 0x02 /* freed to in this txg */ +#define MSD_ADD 0x04 /* added to the pool in this txg */ +#define MSD_CONDENSE 0x08 /* condensed in this txg */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_METASLAB_IMPL_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/refcount.h b/usr/src/uts/common/fs/zfs/sys/refcount.h new file mode 100644 index 0000000000..f9fffd2443 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/refcount.h @@ -0,0 +1,105 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_REFCOUNT_H +#define _SYS_REFCOUNT_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/inttypes.h> +#include <sys/list.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * If the reference is held only by the calling function and not any + * particular object, use FTAG (which is a string) for the holder_tag. + * Otherwise, use the object that holds the reference. + */ +#define FTAG ((void*)__func__) + +#if defined(DEBUG) || !defined(_KERNEL) +typedef struct reference { + list_node_t ref_link; + void *ref_holder; + uint64_t ref_number; + uint8_t *ref_removed; +} reference_t; + +typedef struct refcount { + kmutex_t rc_mtx; + list_t rc_list; + list_t rc_removed; + int64_t rc_count; + int64_t rc_removed_count; +} refcount_t; + +/* Note: refcount_t should be initialized to zero before use. */ + +void refcount_create(refcount_t *rc); +void refcount_destroy(refcount_t *rc); +void refcount_destroy_many(refcount_t *rc, uint64_t number); +int refcount_is_zero(refcount_t *rc); +int64_t refcount_count(refcount_t *rc); +int64_t refcount_add(refcount_t *rc, void *holder_tag); +int64_t refcount_remove(refcount_t *rc, void *holder_tag); +int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag); +int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag); + +void refcount_init(void); +void refcount_fini(void); + +#else /* DEBUG */ + +typedef struct refcount { + uint64_t rc_count; +} refcount_t; + +#define refcount_create(rc) ((rc)->rc_count = 0) +#define refcount_destroy(rc) ((rc)->rc_count = 0) +#define refcount_destroy_many(rc, number) ((rc)->rc_count = 0) +#define refcount_is_zero(rc) ((rc)->rc_count == 0) +#define refcount_count(rc) ((rc)->rc_count) +#define refcount_add(rc, holder) atomic_add_64_nv(&(rc)->rc_count, 1) +#define refcount_remove(rc, holder) atomic_add_64_nv(&(rc)->rc_count, -1) +#define refcount_add_many(rc, number, holder) \ + atomic_add_64_nv(&(rc)->rc_count, 
number) +#define refcount_remove_many(rc, number, holder) \ + atomic_add_64_nv(&(rc)->rc_count, -number) + +#define refcount_init() +#define refcount_fini() + +#endif /* DEBUG */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_REFCOUNT_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h new file mode 100644 index 0000000000..9bf0f89d49 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -0,0 +1,406 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SPA_H +#define _SYS_SPA_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/avl.h> +#include <sys/zfs_context.h> +#include <sys/nvpair.h> +#include <sys/sysmacros.h> +#include <sys/types.h> +#include <sys/fs/zfs.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Forward references that lots of things need. 
+ */ +typedef struct spa spa_t; +typedef struct vdev vdev_t; +typedef struct metaslab metaslab_t; +typedef struct zilog zilog_t; +typedef struct traverse_handle traverse_handle_t; +struct dsl_pool; + +/* + * General-purpose 32-bit and 64-bit bitfield encodings. + */ +#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len)) +#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len)) +#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low)) +#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low)) + +#define BF32_GET(x, low, len) BF32_DECODE(x, low, len) +#define BF64_GET(x, low, len) BF64_DECODE(x, low, len) + +#define BF32_SET(x, low, len, val) \ + ((x) ^= BF32_ENCODE((x >> low) ^ val, low, len)) +#define BF64_SET(x, low, len, val) \ + ((x) ^= BF64_ENCODE((x >> low) ^ val, low, len)) + +#define BF32_GET_SB(x, low, len, shift, bias) \ + ((BF32_GET(x, low, len) + (bias)) << (shift)) +#define BF64_GET_SB(x, low, len, shift, bias) \ + ((BF64_GET(x, low, len) + (bias)) << (shift)) + +#define BF32_SET_SB(x, low, len, shift, bias, val) \ + BF32_SET(x, low, len, ((val) >> (shift)) - (bias)) +#define BF64_SET_SB(x, low, len, shift, bias, val) \ + BF64_SET(x, low, len, ((val) >> (shift)) - (bias)) + +/* + * We currently support nine block sizes, from 512 bytes to 128K. + * We could go higher, but the benefits are near-zero and the cost + * of COWing a giant block to modify one byte would become excessive. + */ +#define SPA_MINBLOCKSHIFT 9 +#define SPA_MAXBLOCKSHIFT 17 +#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) +#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) + +#define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1) + +/* + * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. 
+ * The ASIZE encoding should be at least 64 times larger (6 more bits) + * to support up to 4-way RAID-Z mirror mode with worst-case gang block + * overhead, three DVAs per bp, plus one more bit in case we do anything + * else that expands the ASIZE. + */ +#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */ +#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ +#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ + +/* + * All SPA data is represented by 128-bit data virtual addresses (DVAs). + * The members of the dva_t should be considered opaque outside the SPA. + */ +typedef struct dva { + uint64_t dva_word[2]; +} dva_t; + +/* + * Each block has a 256-bit checksum -- strong enough for cryptographic hashes. + */ +typedef struct zio_cksum { + uint64_t zc_word[4]; +} zio_cksum_t; + +/* + * Each block is described by its DVAs, time of birth, checksum, etc. + * The word-by-word, bit-by-bit layout of the blkptr is as follows: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | vdev1 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 1 |G| offset1 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 2 | vdev2 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 3 |G| offset2 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 4 | vdev3 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 5 |G| offset3 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 6 |E| lvl | type | cksum | comp | PSIZE | LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 7 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 8 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 9 | padding | + * 
+-------+-------+-------+-------+-------+-------+-------+-------+ + * a | birth txg | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * b | fill count | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * c | checksum[0] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * d | checksum[1] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * e | checksum[2] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * f | checksum[3] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Legend: + * + * vdev virtual device ID + * offset offset into virtual device + * LSIZE logical size + * PSIZE physical size (after compression) + * ASIZE allocated size (including RAID-Z parity and gang block headers) + * GRID RAID-Z layout information (reserved for future use) + * cksum checksum function + * comp compression function + * G gang block indicator + * E endianness + * type DMU object type + * lvl level of indirection + * birth txg transaction group in which the block was born + * fill count number of non-zero blocks under this bp + * checksum[4] 256-bit checksum of the data this bp describes + */ +typedef struct blkptr { + dva_t blk_dva[3]; /* 128-bit Data Virtual Address */ + uint64_t blk_prop; /* size, compression, type, etc */ + uint64_t blk_pad[3]; /* Extra space for the future */ + uint64_t blk_birth; /* transaction group at birth */ + uint64_t blk_fill; /* fill count */ + zio_cksum_t blk_cksum; /* 256-bit checksum */ +} blkptr_t; + +#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ +#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ + +/* + * Macros to get and set fields in a bp or DVA. 
+ */ +#define DVA_GET_ASIZE(dva) \ + BF64_GET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0) +#define DVA_SET_ASIZE(dva, x) \ + BF64_SET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0, x) + +#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) +#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) + +#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32) +#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x) + +#define DVA_GET_OFFSET(dva) \ + BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0) +#define DVA_SET_OFFSET(dva, x) \ + BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x) + +#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1) +#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) + +#define BP_GET_LSIZE(bp) \ + (BP_IS_HOLE(bp) ? 0 : \ + BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)) +#define BP_SET_LSIZE(bp, x) \ + BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x) + +#define BP_GET_PSIZE(bp) \ + BF64_GET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1) +#define BP_SET_PSIZE(bp, x) \ + BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x) + +#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8) +#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x) + +#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8) +#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x) + +#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) +#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) + +#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) +#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) + +#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1)) +#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) + +#define BP_GET_ASIZE(bp) \ + (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ + DVA_GET_ASIZE(&(bp)->blk_dva[2])) + +#define 
DVA_EQUAL(dva1, dva2) \ + ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ + (dva1)->dva_word[0] == (dva2)->dva_word[0]) + +#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0) + +#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \ +{ \ + (zcp)->zc_word[0] = w0; \ + (zcp)->zc_word[1] = w1; \ + (zcp)->zc_word[2] = w2; \ + (zcp)->zc_word[3] = w3; \ +} + +#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0) + +#define BP_IDENTITY(bp) (&(bp)->blk_dva[0]) + +#define BP_ZERO(bp) \ +{ \ + (bp)->blk_dva[0].dva_word[0] = 0; \ + (bp)->blk_dva[0].dva_word[1] = 0; \ + (bp)->blk_dva[1].dva_word[0] = 0; \ + (bp)->blk_dva[1].dva_word[1] = 0; \ + (bp)->blk_dva[2].dva_word[0] = 0; \ + (bp)->blk_dva[2].dva_word[1] = 0; \ + (bp)->blk_prop = 0; \ + (bp)->blk_pad[0] = 0; \ + (bp)->blk_pad[1] = 0; \ + (bp)->blk_pad[2] = 0; \ + (bp)->blk_birth = 0; \ + (bp)->blk_fill = 0; \ + ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ +} + +/* + * Note: the byteorder is either 0 or -1, both of which are palindromes. + * This simplifies the endianness handling a bit. 
+ */ +#ifdef _BIG_ENDIAN +#define ZFS_HOST_BYTEORDER (0ULL) +#else +#define ZFS_HOST_BYTEORDER (-1ULL) +#endif + +#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER) + +#include <sys/dmu.h> + +/* + * Routines found in spa.c + */ + +/* state manipulation functions */ +extern int spa_open(const char *pool, spa_t **, void *tag); +extern int spa_get_stats(const char *pool, nvlist_t **config); +extern int spa_create(const char *pool, nvlist_t *config, char *altroot); +extern int spa_import(const char *pool, nvlist_t *config, char *altroot); +extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); +extern int spa_destroy(char *pool); +extern int spa_export(char *pool); + +/* device manipulation */ +extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); +extern int spa_vdev_add_unlocked(spa_t *spa, nvlist_t *nvroot); +extern int spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, + int replacing); +extern int spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, + int replace_done); +extern void spa_vdev_replace_done(spa_t *spa); + +/* scrubbing */ +extern int spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force); +extern void spa_scrub_suspend(spa_t *spa); +extern void spa_scrub_resume(spa_t *spa); +extern void spa_scrub_restart(spa_t *spa, uint64_t txg); + +/* spa syncing */ +extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ +extern void spa_sync_allpools(void); + +/* + * SPA configuration functions in spa_config.c + */ +extern void spa_config_sync(void); +extern void spa_config_load(void); +extern nvlist_t *spa_all_configs(uint64_t *); +extern void spa_config_set(spa_t *spa, nvlist_t *config); +extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, + int getstats); + +/* + * Miscellaneous SPA routines in spa_misc.c + */ + +/* Namespace manipulation */ +extern spa_t *spa_lookup(const char *name); +extern spa_t *spa_add(const char *name); +extern void spa_remove(spa_t *spa); +extern 
spa_t *spa_next(spa_t *prev); + +/* Refcount functions */ +extern void spa_open_ref(spa_t *spa, void *tag); +extern void spa_close(spa_t *spa, void *tag); +extern boolean_t spa_refcount_zero(spa_t *spa); + +/* Pool configuration lock */ +extern void spa_config_enter(spa_t *spa, krw_t rw); +extern void spa_config_exit(spa_t *spa); +extern boolean_t spa_config_held(spa_t *spa, krw_t rw); + +/* Pool vdev add/remove lock */ +extern uint64_t spa_vdev_enter(spa_t *spa); +extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); + +/* Accessor functions */ +extern krwlock_t *spa_traverse_rwlock(spa_t *spa); +extern int spa_traverse_wanted(spa_t *spa); +extern struct dsl_pool *spa_get_dsl(spa_t *spa); +extern blkptr_t *spa_get_rootblkptr(spa_t *spa); +extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); +extern void spa_altroot(spa_t *, char *, size_t); +extern int spa_sync_pass(spa_t *spa); +extern char *spa_name(spa_t *spa); +extern uint64_t spa_guid(spa_t *spa); +extern uint64_t spa_last_synced_txg(spa_t *spa); +extern uint64_t spa_first_txg(spa_t *spa); +extern int spa_state(spa_t *spa); +extern uint64_t spa_freeze_txg(spa_t *spa); +struct metaslab_class; +extern struct metaslab_class *spa_metaslab_class_select(spa_t *spa); +extern uint64_t spa_get_alloc(spa_t *spa); +extern uint64_t spa_get_space(spa_t *spa); +extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize); +extern int spa_busy(void); + +/* Miscellaneous support routines */ +extern int spa_rename(const char *oldname, const char *newname); +extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); +extern char *spa_strdup(const char *); +extern void spa_strfree(char *); +extern uint64_t spa_get_random(uint64_t range); +extern void sprintf_blkptr(char *buf, blkptr_t *bp); +extern void spa_freeze(spa_t *spa); +extern void spa_evict_all(void); + +/* Initialization and termination */ +extern void spa_init(int flags); +extern void spa_fini(void); + +#ifdef ZFS_DEBUG 
+#define dprintf_bp(bp, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char __blkbuf[200]; \ + sprintf_blkptr(__blkbuf, (bp)); \ + dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \ + } \ +_NOTE(CONSTCOND) } while (0) +#else +#define dprintf_bp(bp, fmt, ...) +#endif + +extern int spa_mode; /* mode, e.g. FREAD | FWRITE */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPA_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h new file mode 100644 index 0000000000..0fcef6c48b --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h @@ -0,0 +1,118 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_SPA_IMPL_H +#define _SYS_SPA_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/vdev.h> +#include <sys/metaslab.h> +#include <sys/dmu.h> +#include <sys/dsl_pool.h> +#include <sys/uberblock_impl.h> +#include <sys/zfs_context.h> +#include <sys/avl.h> +#include <sys/refcount.h> +#include <sys/bplist.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct spa_config_lock { + kmutex_t scl_lock; + uint64_t scl_count; + kthread_t *scl_writer; + kcondvar_t scl_cv; +} spa_config_lock_t; + +struct spa { + /* + * Fields protected by spa_namespace_lock. + */ + char *spa_name; + avl_node_t spa_avl; + int spa_anon; + nvlist_t *spa_config; + uint64_t spa_config_txg; /* txg of last config change */ + spa_config_lock_t spa_config_lock; /* configuration changes */ + kmutex_t spa_config_cache_lock; /* for spa_config RW_READER */ + int spa_sync_pass; /* iterate-to-convergence */ + int spa_state; /* pool state */ + uint8_t spa_minref; /* min refcnt of open pool */ + uint8_t spa_traverse_wanted; /* traverse lock wanted */ + taskq_t *spa_vdev_retry_taskq; + taskq_t *spa_zio_issue_taskq[ZIO_TYPES]; + taskq_t *spa_zio_intr_taskq[ZIO_TYPES]; + dsl_pool_t *spa_dsl_pool; + metaslab_class_t *spa_normal_class; /* normal data class */ + uint64_t spa_first_txg; /* first txg after spa_open() */ + uint64_t spa_freeze_txg; /* freeze pool at this txg */ + objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */ + txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ + vdev_t *spa_root_vdev; /* top-level vdev container */ + list_t spa_dirty_list; /* vdevs with dirty labels */ + uint64_t spa_config_object; /* MOS object for pool config */ + uint64_t spa_syncing_txg; /* txg currently syncing */ + uint64_t spa_sync_bplist_obj; /* object for deferred frees */ + bplist_t spa_sync_bplist; /* deferred-free bplist */ + krwlock_t spa_traverse_lock; /* traverse vs. 
spa_sync() */ + uberblock_t spa_ubsync; /* last synced uberblock */ + uberblock_t spa_uberblock; /* current uberblock */ + kmutex_t spa_scrub_lock; /* resilver/scrub lock */ + kthread_t *spa_scrub_thread; /* scrub/resilver thread */ + traverse_handle_t *spa_scrub_th; /* scrub traverse handle */ + uint64_t spa_scrub_restart_txg; /* need to restart */ + uint64_t spa_scrub_maxtxg; /* max txg we'll scrub */ + uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */ + uint64_t spa_scrub_errors; /* scrub I/O error count */ + kcondvar_t spa_scrub_cv; /* scrub thread state change */ + kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ + uint8_t spa_scrub_stop; /* tell scrubber to stop */ + uint8_t spa_scrub_suspend; /* tell scrubber to suspend */ + uint8_t spa_scrub_active; /* active or suspended? */ + uint8_t spa_scrub_type; /* type of scrub we're doing */ + int spa_sync_on; /* sync threads are running */ + char *spa_root; /* alternate root directory */ + kmutex_t spa_uberblock_lock; /* vdev_uberblock_load_done() */ + /* + * spa_refcnt must be the last element because it changes size based on + * compilation options. In order for the MDB module to function + * correctly, the other fields must remain in the same location. + */ + refcount_t spa_refcount; /* number of opens */ +}; + +extern const char *spa_config_dir; +extern kmutex_t spa_namespace_lock; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPA_IMPL_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/space_map.h b/usr/src/uts/common/fs/zfs/sys/space_map.h new file mode 100644 index 0000000000..9f0cf83c9a --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/space_map.h @@ -0,0 +1,144 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SPACE_MAP_H +#define _SYS_SPACE_MAP_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/avl.h> +#include <sys/dmu.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct space_map { + avl_tree_t sm_root; /* Root of the AVL tree */ + uint64_t sm_start; /* Start of map (inclusive) */ + uint64_t sm_end; /* End of map (exclusive) */ + uint64_t sm_size; /* Size of map (end - start) */ + uint64_t sm_shift; /* Unit shift */ + uint64_t sm_space; /* Sum of all segments in the map */ + kmutex_t *sm_lock; /* pointer to lock that protects map */ +} space_map_t; + +typedef struct space_seg { + avl_node_t ss_node; /* AVL node */ + uint64_t ss_start; /* starting offset of this segment */ + uint64_t ss_end; /* ending offset (non-inclusive) */ +} space_seg_t; + +typedef struct space_map_obj { + uint64_t smo_object; /* on-disk space map object */ + uint64_t smo_objsize; /* size of the object */ + uint64_t smo_alloc; /* space allocated from the map */ +} space_map_obj_t; + +/* + * debug entry + * + * 1 3 10 50 + * ,---+--------+------------+---------------------------------. 
+ * | 1 | action | syncpass | txg (lower bits) | + * `---+--------+------------+---------------------------------' + * 63 62 60 59 50 49 0 + * + * + * + * non-debug entry + * + * 1 47 1 15 + * ,-----------------------------------------------------------. + * | 0 | offset (sm_shift units) | type | run | + * `-----------------------------------------------------------' + * 63 62 17 16 15 0 + */ + +/* All this stuff takes and returns bytes */ +#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, 15) + 1) +#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, 15) +#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1) +#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1) +#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, 47) +#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, 47) +#define SM_DEBUG_DECODE(x) BF64_DECODE(x, 63, 1) +#define SM_DEBUG_ENCODE(x) BF64_ENCODE(x, 63, 1) + +#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 3) +#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 3) + +#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10) +#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10) + +#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50) +#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50) + +#define SM_RUN_MAX SM_RUN_DECODE(~0ULL) + +#define SM_ALLOC 0x0 +#define SM_FREE 0x1 + +/* + * The data for a given space map can be kept on blocks of any size. + * Larger blocks entail fewer i/o operations, but they also cause the + * DMU to keep more data in-core, and also to waste more i/o bandwidth + * when only a few blocks have changed since the last transaction group. + * This could use a lot more research, but for now, set the freelist + * block size to 4k (2^12). 
+ */ +#define SPACE_MAP_BLOCKSHIFT 12 + +#define SPACE_MAP_CHUNKSIZE (1<<20) + +typedef void space_map_func_t(space_map_t *sm, uint64_t start, uint64_t size); + +extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size, + uint64_t shift, kmutex_t *lp); +extern void space_map_destroy(space_map_t *sm); +extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size); +extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size); +extern int space_map_contains(space_map_t *sm, uint64_t start, uint64_t size); +extern void space_map_vacate(space_map_t *sm, + space_map_func_t *func, space_map_t *mdest); +extern void space_map_iterate(space_map_t *sm, + space_map_func_t *func, space_map_t *mdest); +extern void space_map_merge(space_map_t *dest, space_map_t *src); +extern void space_map_excise(space_map_t *sm, uint64_t start, uint64_t size); +extern void space_map_union(space_map_t *smd, space_map_t *sms); + +extern int space_map_load(space_map_t *sm, space_map_obj_t *smo, + uint8_t maptype, objset_t *os, uint64_t end, uint64_t space); +extern void space_map_sync(space_map_t *sm, space_map_t *dest, + space_map_obj_t *smo, uint8_t maptype, objset_t *os, dmu_tx_t *tx); +extern void space_map_write(space_map_t *sm, space_map_obj_t *smo, + objset_t *os, dmu_tx_t *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPACE_MAP_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/txg.h b/usr/src/uts/common/fs/zfs/sys/txg.h new file mode 100644 index 0000000000..dae129c2e5 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/txg.h @@ -0,0 +1,120 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_TXG_H +#define _SYS_TXG_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define TXG_CONCURRENT_STATES 3 /* open, quiescing, syncing */ +#define TXG_SIZE 4 /* next power of 2 */ +#define TXG_MASK (TXG_SIZE - 1) /* mask for size */ +#define TXG_INITIAL TXG_SIZE /* initial txg */ +#define TXG_IDX (txg & TXG_MASK) + +#define TXG_WAIT 1ULL +#define TXG_NOWAIT 2ULL + +typedef struct tx_cpu tx_cpu_t; + +typedef struct txg_handle { + tx_cpu_t *th_cpu; + uint64_t th_txg; +} txg_handle_t; + +typedef struct txg_node { + struct txg_node *tn_next[TXG_SIZE]; + uint8_t tn_member[TXG_SIZE]; +} txg_node_t; + +typedef struct txg_list { + kmutex_t tl_lock; + size_t tl_offset; + txg_node_t *tl_head[TXG_SIZE]; +} txg_list_t; + +struct dsl_pool; + +extern void txg_init(struct dsl_pool *dp, uint64_t txg); +extern void txg_fini(struct dsl_pool *dp); +extern void txg_sync_start(struct dsl_pool *dp); +extern void txg_sync_stop(struct dsl_pool *dp); +extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp); +extern void txg_rele_to_quiesce(txg_handle_t *txghp); +extern void txg_rele_to_sync(txg_handle_t *txghp); +extern void txg_suspend(struct dsl_pool *dp); +extern void txg_resume(struct dsl_pool *dp); + +/* + * Wait until the given transaction group has finished syncing. 
+ * Try to make this happen as soon as possible (eg. kick off any + * necessary syncs immediately). If txg==0, wait for the currently open + * txg to finish syncing. + */ +extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg); + +/* + * Wait until the given transaction group, or one after it, is + * the open transaction group. Try to make this happen as soon + * as possible (eg. kick off any necessary syncs immediately). + * If txg == 0, wait for the next open txg. + */ +extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg); + +/* + * Returns TRUE if we are "backed up" waiting for the syncing + * transaction to complete; otherwise returns FALSE. + */ +extern int txg_stalled(struct dsl_pool *dp); + +/* + * Per-txg object lists. + */ + +#define TXG_CLEAN(txg) ((txg) - 1) + +extern void txg_list_create(txg_list_t *tl, size_t offset); +extern void txg_list_destroy(txg_list_t *tl); +extern int txg_list_empty(txg_list_t *tl, uint64_t txg); +extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg); +extern void *txg_list_remove(txg_list_t *tl, uint64_t txg); +extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg); +extern int txg_list_member(txg_list_t *tl, void *p, uint64_t txg); +extern void *txg_list_head(txg_list_t *tl, uint64_t txg); +extern void *txg_list_next(txg_list_t *tl, void *p, uint64_t txg); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_TXG_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/txg_impl.h b/usr/src/uts/common/fs/zfs/sys/txg_impl.h new file mode 100644 index 0000000000..45a138afaa --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/txg_impl.h @@ -0,0 +1,77 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_TXG_IMPL_H +#define _SYS_TXG_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/txg.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct tx_cpu { + kmutex_t tc_lock; + kcondvar_t tc_cv[TXG_SIZE]; + uint64_t tc_count[TXG_SIZE]; + char tc_pad[16]; +}; + +typedef struct tx_state { + tx_cpu_t *tx_cpu; /* protects right to enter txg */ + kmutex_t tx_sync_lock; /* protects tx_state_t */ + krwlock_t tx_suspend; + uint64_t tx_open_txg; /* currently open txg id */ + uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */ + uint64_t tx_syncing_txg; /* currently syncing txg id */ + uint64_t tx_synced_txg; /* last synced txg id */ + + uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */ + uint64_t tx_quiesce_txg_waiting; /* txg we're waiting to open */ + + kcondvar_t tx_sync_more_cv; + kcondvar_t tx_sync_done_cv; + kcondvar_t tx_quiesce_more_cv; + kcondvar_t tx_quiesce_done_cv; + kcondvar_t tx_timeout_exit_cv; + kcondvar_t tx_exit_cv; /* wait for all threads to exit */ + + uint8_t tx_threads; /* number of threads */ + uint8_t tx_exiting; /* set when we're exiting */ + + kthread_t *tx_sync_thread; + kthread_t *tx_quiesce_thread; + kthread_t *tx_timelimit_thread; +} tx_state_t; + +#ifdef __cplusplus +} 
+#endif + +#endif /* _SYS_TXG_IMPL_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/uberblock.h b/usr/src/uts/common/fs/zfs/sys/uberblock.h new file mode 100644 index 0000000000..93d936ae4b --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/uberblock.h @@ -0,0 +1,50 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_UBERBLOCK_H +#define _SYS_UBERBLOCK_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/vdev.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct uberblock uberblock_t; + +extern int uberblock_verify(uberblock_t *ub); +extern int uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_UBERBLOCK_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h b/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h new file mode 100644 index 0000000000..5bfcea097d --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h @@ -0,0 +1,76 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_UBERBLOCK_IMPL_H +#define _SYS_UBERBLOCK_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/uberblock.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The uberblock version is incremented whenever an incompatible on-disk + * format change is made to the SPA, DMU, or ZAP. 
+ * + * Note: the first two fields should never be moved. When a storage pool + * is opened, the uberblock must be read off the disk before the version + * can be checked. If the ub_version field is moved, we may not detect + * version mismatch. If the ub_magic field is moved, applications that + * expect the magic number in the first word won't work. + */ + +#define UBERBLOCK_SHIFT (10) +#define UBERBLOCK_SIZE (1ULL << UBERBLOCK_SHIFT) + +#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */ + +#define UBERBLOCK_VERSION 1ULL + +struct uberblock { + uint64_t ub_magic; /* UBERBLOCK_MAGIC */ + uint64_t ub_version; /* UBERBLOCK_VERSION */ + uint64_t ub_txg; /* txg of last sync */ + uint64_t ub_guid_sum; /* sum of all vdev guids */ + uint64_t ub_timestamp; /* UTC time of last sync */ + blkptr_t ub_rootbp; /* MOS objset_phys_t */ +}; + +typedef struct uberblock_phys { + uberblock_t ubp_uberblock; + char ubp_pad[UBERBLOCK_SIZE - sizeof (uberblock_t) - + sizeof (zio_block_tail_t)]; + zio_block_tail_t ubp_zbt; +} uberblock_phys_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_UBERBLOCK_IMPL_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/unique.h b/usr/src/uts/common/fs/zfs/sys/unique.h new file mode 100644 index 0000000000..c8c177e3ca --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/unique.h @@ -0,0 +1,56 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_UNIQUE_H +#define _SYS_UNIQUE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* The number of significant bits in each unique value. */ +#define UNIQUE_BITS 56 + +void unique_init(void); + +/* Return a new unique value. */ +uint64_t unique_create(void); + +/* Return a unique value, which equals the one passed in if possible. */ +uint64_t unique_insert(uint64_t value); + +/* Indicate that this value no longer needs to be uniquified against. */ +void unique_remove(uint64_t value); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_UNIQUE_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h new file mode 100644 index 0000000000..4113ff2ca6 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h @@ -0,0 +1,135 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VDEV_H +#define _SYS_VDEV_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu.h> +#include <sys/space_map.h> +#include <sys/fs/zfs.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Vdev knobs. + */ +typedef struct vdev_knob { + char *vk_name; /* knob name */ + char *vk_desc; /* knob description */ + uint64_t vk_min; /* minimum legal value */ + uint64_t vk_max; /* maximum legal value */ + uint64_t vk_default; /* default value */ + size_t vk_offset; /* offset into vdev_t */ +} vdev_knob_t; + +/* + * Fault injection modes. + */ +#define VDEV_FAULT_NONE 0 +#define VDEV_FAULT_RANDOM 1 +#define VDEV_FAULT_COUNT 2 + +extern int vdev_open(vdev_t *); +extern void vdev_close(vdev_t *); +extern int vdev_create(vdev_t *, uint64_t txg); +extern void vdev_init(vdev_t *, uint64_t txg); +extern void vdev_reopen(vdev_t *, zio_t **zq); + +extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); +extern vdev_t *vdev_lookup_by_path(vdev_t *vd, const char *path); +extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); +extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size); +extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size); +extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, + int scrub_done); + +extern const char *vdev_description(vdev_t *vd); + +extern void vdev_metaslab_init(vdev_t *vd, uint64_t txg); +extern void vdev_metaslab_fini(vdev_t *vd); + +extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); +extern void vdev_stat_update(zio_t *zio); +extern void vdev_scrub_stat_update(vdev_t 
*vd, pool_scrub_type_t type, + boolean_t complete); +extern void vdev_checksum_error(zio_t *zio, vdev_t *vd); +extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec); +extern void vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux); + +extern void vdev_space_update(vdev_t *vd, uint64_t space_delta, + uint64_t alloc_delta); + +extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); + +extern void vdev_io_start(zio_t *zio); +extern void vdev_io_done(zio_t *zio); + +extern int vdev_online(spa_t *spa, const char *path); +extern int vdev_offline(spa_t *spa, const char *path); + +extern int vdev_error_setup(spa_t *spa, const char *path, int mode, int mask, + uint64_t arg); +extern int vdev_error_inject(vdev_t *vd, zio_t *zio); +extern int vdev_is_dead(vdev_t *vd); + +extern void vdev_cache_init(vdev_t *vd); +extern void vdev_cache_fini(vdev_t *vd); +extern int vdev_cache_read(zio_t *zio); +extern void vdev_cache_write(zio_t *zio); + +extern void vdev_queue_init(vdev_t *vd); +extern void vdev_queue_fini(vdev_t *vd); +extern zio_t *vdev_queue_io(zio_t *zio); +extern void vdev_queue_io_done(zio_t *zio); + +extern vdev_knob_t *vdev_knob_next(vdev_knob_t *vk); + +extern void vdev_config_dirty(vdev_t *vd); +extern void vdev_config_clean(vdev_t *vd); + +extern nvlist_t *vdev_config_generate(vdev_t *vd, int getstats); + +/* + * Label routines + */ +struct uberblock; +extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset); +extern nvlist_t *vdev_label_read_config(vdev_t *vd); +extern void vdev_uberblock_load(zio_t *zio, vdev_t *vd, struct uberblock *ub); +int vdev_label_init(vdev_t *vd, uint64_t create_txg); +extern int spa_sync_labels(spa_t *spa, uint64_t txg); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_disk.h b/usr/src/uts/common/fs/zfs/sys/vdev_disk.h new file mode 100644 index 0000000000..95536a77db --- /dev/null +++ 
b/usr/src/uts/common/fs/zfs/sys/vdev_disk.h @@ -0,0 +1,52 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VDEV_DISK_H +#define _SYS_VDEV_DISK_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/vdev.h> +#ifdef _KERNEL +#include <sys/sunldi.h> +#include <sys/sunddi.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct vdev_disk { + ddi_devid_t vd_devid; + char *vd_minor; + ldi_handle_t vd_lh; +} vdev_disk_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_DISK_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_file.h b/usr/src/uts/common/fs/zfs/sys/vdev_file.h new file mode 100644 index 0000000000..cd49673577 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/vdev_file.h @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VDEV_FILE_H +#define _SYS_VDEV_FILE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/vdev.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct vdev_file { + vnode_t *vf_vnode; +} vdev_file_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_FILE_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h new file mode 100644 index 0000000000..4ae3467619 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -0,0 +1,287 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VDEV_IMPL_H +#define _SYS_VDEV_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/avl.h> +#include <sys/dmu.h> +#include <sys/metaslab.h> +#include <sys/nvpair.h> +#include <sys/space_map.h> +#include <sys/vdev.h> +#include <sys/dkio.h> +#include <sys/uberblock_impl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Virtual device descriptors. + * + * All storage pool operations go through the virtual device framework, + * which provides data replication and I/O scheduling. + */ + +/* + * Forward declarations that lots of things need. + */ +typedef struct vdev_queue vdev_queue_t; +typedef struct vdev_cache vdev_cache_t; +typedef struct vdev_cache_entry vdev_cache_entry_t; + +/* + * Virtual device operations + */ +typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *ashift); +typedef void vdev_close_func_t(vdev_t *vd); +typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); +typedef void vdev_io_start_func_t(zio_t *zio); +typedef void vdev_io_done_func_t(zio_t *zio); +typedef void vdev_state_change_func_t(vdev_t *vd, int, int); + +typedef struct vdev_ops { + vdev_open_func_t *vdev_op_open; + vdev_close_func_t *vdev_op_close; + vdev_asize_func_t *vdev_op_asize; + vdev_io_start_func_t *vdev_op_io_start; + vdev_io_done_func_t *vdev_op_io_done; + vdev_state_change_func_t *vdev_op_state_change; + char vdev_op_type[16]; + boolean_t vdev_op_leaf; +} vdev_ops_t; + +/* + * Virtual device properties + */ +struct vdev_cache_entry { + char *ve_data; + uint64_t ve_offset; + uint64_t ve_lastused; + avl_node_t ve_offset_node; + avl_node_t ve_lastused_node; + uint32_t ve_hits; + 
uint16_t ve_missed_update; + zio_t *ve_fill_io; +}; + +struct vdev_cache { + uint64_t vc_size; + uint64_t vc_bshift; + uint64_t vc_blocksize; + uint64_t vc_max; + avl_tree_t vc_offset_tree; + avl_tree_t vc_lastused_tree; + kmutex_t vc_lock; +}; + +struct vdev_queue { + uint64_t vq_min_pending; + uint64_t vq_max_pending; + uint64_t vq_agg_limit; + uint64_t vq_time_shift; + uint64_t vq_ramp_rate; + avl_tree_t vq_deadline_tree; + avl_tree_t vq_read_tree; + avl_tree_t vq_write_tree; + avl_tree_t vq_pending_tree; + kmutex_t vq_lock; +}; + +/* + * Virtual device descriptor + */ +struct vdev { + /* + * Common to all vdev types. + */ + uint64_t vdev_id; /* child number in vdev parent */ + uint64_t vdev_guid; /* unique ID for this vdev */ + uint64_t vdev_guid_sum; /* self guid + all child guids */ + uint64_t vdev_asize; /* allocatable device capacity */ + uint64_t vdev_ashift; /* block alignment shift */ + uint64_t vdev_state; /* see VDEV_STATE_* #defines */ + vdev_ops_t *vdev_ops; /* vdev operations */ + spa_t *vdev_spa; /* spa for this vdev */ + void *vdev_tsd; /* type-specific data */ + vdev_t *vdev_top; /* top-level vdev */ + vdev_t *vdev_parent; /* parent vdev */ + vdev_t **vdev_child; /* array of children */ + uint64_t vdev_children; /* number of children */ + space_map_t vdev_dtl_map; /* dirty time log in-core state */ + space_map_t vdev_dtl_scrub; /* DTL for scrub repair writes */ + vdev_stat_t vdev_stat; /* virtual device statistics */ + + /* + * Top-level vdev state. 
+ */ + uint64_t vdev_ms_array; /* metaslab array object */ + uint64_t vdev_ms_shift; /* metaslab size shift */ + uint64_t vdev_ms_count; /* number of metaslabs */ + metaslab_group_t *vdev_mg; /* metaslab group */ + metaslab_t **vdev_ms; /* metaslab array */ + space_map_obj_t *vdev_smo; /* metaslab space map array */ + txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ + txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ + txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ + uint8_t vdev_dirty[TXG_SIZE]; /* per-txg dirty flags */ + int vdev_is_dirty; /* on config dirty list? */ + list_node_t vdev_dirty_node; /* config dirty list */ + zio_t *vdev_io_retry; /* I/O retry list */ + list_t vdev_io_pending; /* I/O pending list */ + + /* + * Leaf vdev state. + */ + uint64_t vdev_psize; /* physical device capacity */ + space_map_obj_t vdev_dtl; /* dirty time log on-disk state */ + txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ + char *vdev_path; /* vdev path (if any) */ + char *vdev_devid; /* vdev devid (if any) */ + uint64_t vdev_fault_arg; /* fault injection paramater */ + int vdev_fault_mask; /* zio types to fault */ + uint8_t vdev_fault_mode; /* fault injection mode */ + uint8_t vdev_cache_active; /* vdev_cache and vdev_queue */ + uint8_t vdev_offline; /* device taken offline? */ + uint8_t vdev_detached; /* device detached? */ + vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ + vdev_cache_t vdev_cache; /* physical block cache */ + + /* + * For DTrace to work in userland (libzpool) context, these fields must + * remain at the end of the structure. DTrace will use the kernel's + * CTF definition for 'struct vdev', and since the size of a kmutex_t is + * larger in userland, the offsets for the rest fields would be + * incorrect. 
+ */ + kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */ + kmutex_t vdev_dirty_lock; /* vdev_dirty[] */ + kmutex_t vdev_io_lock; /* vdev_io_pending list */ + kcondvar_t vdev_io_cv; /* vdev_io_pending list empty? */ + kmutex_t vdev_stat_lock; /* vdev_stat */ +}; + +#define VDEV_SKIP_SIZE (8 << 10) +#define VDEV_BOOT_HEADER_SIZE (8 << 10) +#define VDEV_PHYS_SIZE (112 << 10) +#define VDEV_UBERBLOCKS ((128 << 10) >> UBERBLOCK_SHIFT) + +#define VDEV_BOOT_MAGIC 0x2f5b007b10c /* ZFS boot block */ +#define VDEV_BOOT_VERSION 1 /* version number */ + +typedef struct vdev_boot_header { + uint64_t vb_magic; /* VDEV_BOOT_MAGIC */ + uint64_t vb_version; /* VDEV_BOOT_VERSION */ + uint64_t vb_offset; /* start offset (bytes) */ + uint64_t vb_size; /* size (bytes) */ + char vb_pad[VDEV_BOOT_HEADER_SIZE - 4 * sizeof (uint64_t)]; +} vdev_boot_header_t; + +typedef struct vdev_phys { + char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)]; + zio_block_tail_t vp_zbt; +} vdev_phys_t; + +typedef struct vdev_label { + char vl_pad[VDEV_SKIP_SIZE]; /* 8K */ + vdev_boot_header_t vl_boot_header; /* 8K */ + vdev_phys_t vl_vdev_phys; /* 120K */ + uberblock_phys_t vl_uberblock[VDEV_UBERBLOCKS]; /* 128K */ +} vdev_label_t; /* 256K total */ + +/* + * Size and offset of embedded boot loader region on each label. + * The total size of the first two labels plus the boot area is 4MB. + */ +#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t)) +#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ + +/* + * vdev_dirty[] flags + */ +#define VDD_ALLOC 0x01 /* allocated from in this txg */ +#define VDD_FREE 0x02 /* freed to in this txg */ +#define VDD_ADD 0x04 /* added to the pool in this txg */ +#define VDD_DTL 0x08 /* dirty time log entry in this txg */ + +/* + * Size of label regions at the start and end of each leaf device. 
+ */ +#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE) +#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) +#define VDEV_LABELS 4 + +#define VDEV_ALLOC_LOAD 0 +#define VDEV_ALLOC_ADD 1 + +/* + * Allocate or free a vdev + */ +extern vdev_t *vdev_alloc(spa_t *spa, nvlist_t *config, vdev_t *parent, + uint_t id, int alloctype); +extern void vdev_free(vdev_t *vd); + +/* + * Add or remove children and parents + */ +extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd); +extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd); +extern void vdev_compact_children(vdev_t *pvd); +extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops); +extern void vdev_remove_parent(vdev_t *cvd); + +/* + * vdev sync load and sync + */ +extern int vdev_load(vdev_t *vd, int import); +extern void vdev_sync(vdev_t *vd, uint64_t txg); +extern void vdev_sync_done(vdev_t *vd, uint64_t txg); +extern void vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg); + +/* + * Available vdev types. + */ +extern vdev_ops_t vdev_root_ops; +extern vdev_ops_t vdev_mirror_ops; +extern vdev_ops_t vdev_replacing_ops; +extern vdev_ops_t vdev_raidz_ops; +extern vdev_ops_t vdev_disk_ops; +extern vdev_ops_t vdev_file_ops; +extern vdev_ops_t vdev_missing_ops; + +/* + * Common asize function + */ +extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_IMPL_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zap.h b/usr/src/uts/common/fs/zfs/sys/zap.h new file mode 100644 index 0000000000..94ad0ffebe --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zap.h @@ -0,0 +1,353 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZAP_H +#define _SYS_ZAP_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * ZAP - ZFS Attribute Processor + * + * The ZAP is a module which sits on top of the DMU (Data Managemnt + * Unit) and implements a higher-level storage primitive using DMU + * objects. Its primary consumer is the ZPL (ZFS Posix Layer). + * + * A "zapobj" is a DMU object which the ZAP uses to stores attributes. + * Users should use only zap routines to access a zapobj - they should + * not access the DMU object directly using DMU routines. + * + * The attributes stored in a zapobj are name-value pairs. The name is + * a zero-terminated string of up to 256 bytes (including terminating + * NULL). The value is an array of integers (whose length is limited + * only by the size of the zapobj). The integers may be 1, 2, 4, or 8 + * bytes long. Note that an 8-byte integer value can be used to store + * the location (object number) of another dmu object (which may be + * itself a zapobj). Note that you can use a zero-length attribute to + * store a single bit of information - the attribute is present or not. + * + * The ZAP routines are thread-safe. However, you must observe the + * DMU's restriction that a transaction may not be operated on + * concurrently. 
+ * + * Any of the routines that return an int may return an I/O error (EIO + * or ECHECKSUM). + * + * + * Implementation / Performance Notes: + * + * The ZAP is intended to operate most efficiently on attributes with + * short (23 bytes or less) names and short (23 bytes or less) values. + * The ZAP should be efficient enough so that the user does not need to + * cache these attributes. + * + * Using extremely long (~256 bytes or more) attribute names or values + * values will result in poor performance, due to the memcpy from the + * user's buffer into the ZAP object. This penalty can be avoided by + * creating an integer-type attribute to store an object number, and + * accessing that object using the DMU directly. + * + * The ZAP's locking scheme makes its routines thread-safe. Operations + * on different zapobjs will be processed concurrently. Operations on + * the same zapobj which only read data will be processed concurrently. + * Operations on the same zapobj which modify data will be processed + * concurrently when there are many attributes in the zapobj (because + * the ZAP uses per-block locking - more than 32 * (number of cpus) + * small attributes will suffice). + */ + +/* + * We're using zero-terminated byte strings (ie. ASCII or UTF-8 C + * strings) for the names of attributes, rather than a byte string + * bounded by an explicit length. If some day we want to support names + * in character sets which have embedded zeros (eg. UTF-16, UTF-32), + * we'll have to add routines for using length-bounded strings. + */ + +#include <sys/dmu.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Create a new zapobj with no attributes and return its object number. + */ +uint64_t zap_create(objset_t *ds, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); + +/* + * Create a new zapobj with no attributes from the given (unallocated) + * object number. 
+ */ +int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); + +/* + * The zapobj passed in must be a valid ZAP object for all of the + * following routines. + */ + +/* + * Destroy this zapobj and all its attributes. + * + * Frees the object number using dmu_object_free. + */ +int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx); + +/* + * Manipulate attributes. + * + * 'integer_size' is in bytes, and must be 1, 2, 4, or 8. + */ + +/* + * Retrieve the contents of the attribute with the given name. + * + * If the requested attribute does not exist, the call will fail and + * return ENOENT. + * + * If 'integer_size' is smaller than the attribute's integer size, the + * call will fail and return EINVAL. + * + * If 'integer_size' is equal to or larger than the attribute's integer + * size, the call will succeed and return 0. * When converting to a + * larger integer size, the integers will be treated as unsigned (ie. no + * sign-extension will be performed). + * + * 'num_integers' is the length (in integers) of 'buf'. + * + * If the attribute is longer than the buffer, as many integers as will + * fit will be transferred to 'buf'. If the entire attribute was not + * transferred, the call will return EOVERFLOW. + */ +int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf); + +/* + * Create an attribute with the given name and value. + * + * If an attribute with the given name already exists, the call will + * fail and return EEXIST. + */ +int zap_add(objset_t *ds, uint64_t zapobj, const char *name, + int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); + +/* + * Set the attribute with the given name to the given value. If an + * attribute with the given name does not exist, it will be created. If + * an attribute with the given name already exists, the previous value + * will be overwritten. 
The integer_size may be different from the + * existing attribute's integer size, in which case the attribute's + * integer size will be updated to the new value. + */ +int zap_update(objset_t *ds, uint64_t zapobj, const char *name, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); + +/* + * Get the length (in integers) and the integer size of the specified + * attribute. + * + * If the requested attribute does not exist, the call will fail and + * return ENOENT. + */ +int zap_length(objset_t *ds, uint64_t zapobj, const char *name, + uint64_t *integer_size, uint64_t *num_integers); + +/* + * Remove the specified attribute. + * + * If the specified attribute does not exist, the call will fail and + * return ENOENT. + */ +int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx); + +/* + * Returns (in *count) the number of attributes in the specified zap + * object. + */ +int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count); + + +/* + * Returns (in name) the name of the entry whose value + * (za_first_integer) is value, or ENOENT if not found. The string + * pointed to by name must be at least 256 bytes long. + */ +int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name); + +typedef struct zap_cursor { + /* This structure is opaque! */ + objset_t *zc_objset; + uint64_t zc_zapobj; + uint64_t zc_hash; + uint32_t zc_cd; +} zap_cursor_t; + +typedef struct { + int za_integer_length; + uint64_t za_num_integers; + uint64_t za_first_integer; /* no sign extension for <8byte ints */ + char za_name[MAXNAMELEN]; +} zap_attribute_t; + +/* + * The interface for listing all the attributes of a zapobj can be + * thought of as cursor moving down a list of the attributes one by + * one. The cookie returned by the zap_cursor_serialize routine is + * persistent across system calls (and across reboot, even). + */ + +/* + * Initialize a zap cursor, pointing to the "first" attribute of the + * zapobj. 
+ */ +void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj); + +/* + * Get the attribute currently pointed to by the cursor. Returns + * ENOENT if at the end of the attributes. + */ +int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za); + +/* + * Advance the cursor to the next attribute. + */ +void zap_cursor_advance(zap_cursor_t *zc); + +/* + * Get a persistent cookie pointing to the current position of the zap + * cursor. The low 4 bits in the cookie are always zero, and thus can + * be used to differentiate a serialized cookie from a different type + * of value. The cookie will be less than 2^32 as long as there are + * fewer than 2^22 (4.2 million) entries in the zap object. + */ +uint64_t zap_cursor_serialize(zap_cursor_t *zc); + +/* + * Initialize a zap cursor pointing to the position recorded by + * zap_cursor_serialize (in the "serialized" argument). You can also + * use a "serialized" argument of 0 to start at the beginning of the + * zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to + * zap_cursor_init(...).) + */ +void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds, + uint64_t zapobj, uint64_t serialized); + + +#define ZAP_HISTOGRAM_SIZE 10 + +typedef struct zap_stats { + /* + * Size of the pointer table (in number of entries). + * This is always a power of 2, or zero if it's a microzap. + * In general, it should be considerably greater than zs_num_leafs. + */ + uint64_t zs_ptrtbl_len; + + uint64_t zs_blocksize; /* size of zap blocks */ + + uint64_t zs_num_leafs; /* The number of leaf blocks */ + + uint64_t zs_num_entries; /* The number of zap entries */ + + /* + * The number of blocks used. Note that some blocks may be + * wasted because old ptrtbl's and large name/value blocks are + * not reused. (Although their space is reclaimed, we don't + * reuse those offsets in the object.) 
+ */ + uint64_t zs_num_blocks; + + /* The number of blocks used for large names or values */ + uint64_t zs_num_blocks_large; + + /* + * Histograms. For all histograms, the last index + * (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater + * than what can be represented. For example + * zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE-1] is the number + * of leafs with more than 45 entries. + */ + + /* + * zs_leafs_with_2n_pointers[n] is the number of leafs with + * 2^n pointers to it. + */ + uint64_t zs_leafs_with_2n_pointers[ZAP_HISTOGRAM_SIZE]; + + /* + * zs_leafs_with_n_chained[n] is the number of leafs with n + * chained blocks. zs_leafs_with_n_chained[0] (leafs with no + * chained blocks) should be very close to zs_num_leafs. + */ + uint64_t zs_leafs_with_n_chained[ZAP_HISTOGRAM_SIZE]; + + /* + * zs_blocks_with_n5_entries[n] is the number of leafs with + * [n*5, (n+1)*5) entries. In the current implementation, there + * can be at most 55 entries in any block, but there may be + * fewer if the name or value is large, or the block is not + * completely full. + */ + uint64_t zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE]; + + /* + * zs_blocks_n_tenths_full[n] is the number of leafs whose + * fullness is in the range [n/10, (n+1)/10). + */ + uint64_t zs_blocks_n_tenths_full[ZAP_HISTOGRAM_SIZE]; + + /* + * zs_entries_using_n_chunks[n] is the number of entries which + * consume n 24-byte chunks. (Note, large names/values only use + * one chunk, but contribute to zs_num_blocks_large.) + */ + uint64_t zs_entries_using_n_chunks[ZAP_HISTOGRAM_SIZE]; + + /* + * zs_buckets_with_n_entries[n] is the number of buckets (each + * leaf has 64 buckets) with n entries. + * zs_buckets_with_n_entries[1] should be very close to + * zs_num_entries. + */ + uint64_t zs_buckets_with_n_entries[ZAP_HISTOGRAM_SIZE]; +} zap_stats_t; + +/* + * Get statistics about a ZAP object. 
Note: you need to be aware of the + * internal implementation of the ZAP to correctly interpret some of the + * statistics. This interface shouldn't be relied on unless you really + * know what you're doing. + */ +int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZAP_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zap_impl.h b/usr/src/uts/common/fs/zfs/sys/zap_impl.h new file mode 100644 index 0000000000..6593e20a14 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zap_impl.h @@ -0,0 +1,190 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_ZAP_IMPL_H +#define _SYS_ZAP_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zap.h> +#include <sys/zfs_context.h> +#include <sys/avl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZAP_MAGIC 0x2F52AB2AB + +#define ZAP_BLOCK_SHIFT 17 + +#define ZAP_MAXCD (uint32_t)(-1) +#define ZAP_HASHBITS 28 +#define MZAP_ENT_LEN 64 +#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2) +#define MZAP_MAX_BLKSHIFT ZAP_BLOCK_SHIFT +#define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT) + +typedef struct mzap_ent_phys { + uint64_t mze_value; + uint32_t mze_cd; + uint16_t mze_pad; /* in case we want to chain them someday */ + char mze_name[MZAP_NAME_LEN]; +} mzap_ent_phys_t; + +typedef struct mzap_phys { + uint64_t mz_block_type; /* ZBT_MICRO */ + uint64_t mz_salt; + uint64_t mz_pad[6]; + mzap_ent_phys_t mz_chunk[1]; + /* actually variable size depending on block size */ +} mzap_phys_t; + +typedef struct mzap_ent { + avl_node_t mze_node; + int mze_chunkid; + uint64_t mze_hash; + mzap_ent_phys_t mze_phys; +} mzap_ent_t; + + +/* + * The (fat) zap is stored in one object. It is an array of + * 1<<ZAP_BLOCK_SHIFT byte blocks. The layout looks like one of: + * + * ptrtbl fits in first block: + * [zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ... + * + * ptrtbl too big for first block: + * [zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ... + * + */ + +struct dmu_buf; +struct zap_leaf; + +#define ZBT_LEAF ((1ULL << 63) + 0) +#define ZBT_HEADER ((1ULL << 63) + 1) +#define ZBT_MICRO ((1ULL << 63) + 3) +/* any other values are ptrtbl blocks */ + +/* 1/2 the block size */ +#define ZAP_PTRTBL_MIN_SHIFT (ZAP_BLOCK_SHIFT - 3 - 1) + +/* + * TAKE NOTE: + * If zap_phys_t is modified, zap_byteswap() must be modified. 
+ */ +typedef struct zap_phys { + uint64_t zap_block_type; /* ZBT_HEADER */ + uint64_t zap_magic; /* ZAP_MAGIC */ + + struct zap_table_phys { + uint64_t zt_blk; /* starting block number */ + uint64_t zt_numblks; /* number of blocks */ + uint64_t zt_shift; /* bits to index it */ + uint64_t zt_nextblk; /* next (larger) copy start block */ + uint64_t zt_blks_copied; /* number source blocks copied */ + } zap_ptrtbl; + + uint64_t zap_freeblk; /* the next free block */ + uint64_t zap_num_leafs; /* number of leafs */ + uint64_t zap_num_entries; /* number of entries */ + uint64_t zap_salt; /* salt to stir into hash function */ + uint64_t zap_pad[8181]; + uint64_t zap_leafs[1 << ZAP_PTRTBL_MIN_SHIFT]; +} zap_phys_t; + +typedef struct zap_table_phys zap_table_phys_t; + +typedef struct zap { + objset_t *zap_objset; + uint64_t zap_object; + struct dmu_buf *zap_dbuf; + krwlock_t zap_rwlock; + int zap_ismicro; + uint64_t zap_salt; + union { + struct { + zap_phys_t *zap_phys; + + /* + * zap_num_entries_mtx protects + * zap_num_entries + */ + kmutex_t zap_num_entries_mtx; + } zap_fat; + struct { + mzap_phys_t *zap_phys; + int16_t zap_num_entries; + int16_t zap_num_chunks; + int16_t zap_alloc_next; + avl_tree_t zap_avl; + } zap_micro; + } zap_u; +} zap_t; + +#define zap_f zap_u.zap_fat +#define zap_m zap_u.zap_micro + +uint64_t zap_hash(zap_t *zap, const char *name); +int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, + krw_t lti, int fatreader, zap_t **zapp); +void zap_unlockdir(zap_t *zap); +void zap_pageout(dmu_buf_t *db, void *vmzap); + +void zap_print(zap_t *); +struct zap_leaf *zap_create_leaf(zap_t *zd, dmu_tx_t *tx); +void zap_destroy_leaf(zap_t *zap, struct zap_leaf *l, dmu_tx_t *tx); +uint64_t zap_allocate_blocks(zap_t *zap, int nblocks, dmu_tx_t *tx); + +#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 
0 : ((hash) >> (64 - (n)))) + +void fzap_byteswap(void *buf, size_t size); +int fzap_count(zap_t *zap, uint64_t *count); +int fzap_lookup(zap_t *zap, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf); +int fzap_add(zap_t *zap, const char *name, + uint64_t integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); +int fzap_update(zap_t *zap, const char *name, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int fzap_length(zap_t *zap, const char *name, + uint64_t *integer_size, uint64_t *num_integers); +int fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx); +int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za); +void fzap_get_stats(zap_t *zap, zap_stats_t *zs); + +int fzap_add_cd(zap_t *zap, const char *name, + uint64_t integer_size, uint64_t num_integers, + const void *val, uint32_t cd, dmu_tx_t *tx, struct zap_leaf **lp); +void fzap_upgrade(zap_t *zap, dmu_tx_t *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZAP_IMPL_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zap_leaf.h b/usr/src/uts/common/fs/zfs/sys/zap_leaf.h new file mode 100644 index 0000000000..aee70ae633 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zap_leaf.h @@ -0,0 +1,204 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZAP_LEAF_H +#define _SYS_ZAP_LEAF_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +struct zap; + +#define ZAP_LEAF_MAGIC 0x2AB1EAF + +/* chunk size = 24 bytes */ + +#define ZAP_LEAF_NUMCHUNKS 5118 +#define ZAP_LEAF_ARRAY_BYTES 21 +#define ZAP_LEAF_HASH_SHIFT 12 +#define ZAP_LEAF_HASH_NUMENTRIES (1 << ZAP_LEAF_HASH_SHIFT) +#define ZAP_LLA_DATA_BYTES ((1 << ZAP_BLOCK_SHIFT) - 16) + +typedef enum zap_entry_type { + ZAP_LEAF_FREE = 253, + ZAP_LEAF_ENTRY = 252, + ZAP_LEAF_ARRAY = 251, + ZAP_LEAF_TYPE_MAX = 250 +} zap_entry_type_t; + +/* + * TAKE NOTE: + * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified. 
+ */ +typedef struct zap_leaf_phys { + struct zap_leaf_header { + uint64_t lhr_block_type; /* ZBT_LEAF */ + uint64_t lhr_next; /* next block in leaf chain */ + uint64_t lhr_prefix; + uint32_t lhr_magic; /* ZAP_LEAF_MAGIC */ + uint16_t lhr_nfree; /* number free chunks */ + uint16_t lhr_nentries; /* number of entries */ + uint16_t lhr_prefix_len; + +#define lh_block_type l_phys->l_hdr.lhr_block_type +#define lh_magic l_phys->l_hdr.lhr_magic +#define lh_next l_phys->l_hdr.lhr_next +#define lh_prefix l_phys->l_hdr.lhr_prefix +#define lh_nfree l_phys->l_hdr.lhr_nfree +#define lh_prefix_len l_phys->l_hdr.lhr_prefix_len +#define lh_nentries l_phys->l_hdr.lhr_nentries + +/* above is accessable to zap, below is zap_leaf private */ + + uint16_t lh_freelist; /* chunk head of free list */ + uint8_t lh_pad2[12]; + } l_hdr; /* 2 24-byte chunks */ + + uint16_t l_hash[ZAP_LEAF_HASH_NUMENTRIES]; + /* 170 24-byte chunks plus 16 bytes leftover space */ + + union zap_leaf_chunk { + struct zap_leaf_entry { + uint8_t le_type; /* always ZAP_LEAF_ENTRY */ + uint8_t le_int_size; /* size of ints */ + uint16_t le_next; /* next entry in hash chain */ + uint16_t le_name_chunk; /* first chunk of the name */ + uint16_t le_name_length; /* bytes in name, incl null */ + uint16_t le_value_chunk; /* first chunk of the value */ + uint16_t le_value_length; /* value length in ints */ + uint32_t le_cd; /* collision differentiator */ + uint64_t le_hash; /* hash value of the name */ + } l_entry; + struct zap_leaf_array { + uint8_t la_type; + uint8_t la_array[ZAP_LEAF_ARRAY_BYTES]; + uint16_t la_next; /* next blk or CHAIN_END */ + } l_array; + struct zap_leaf_free { + uint8_t lf_type; /* always ZAP_LEAF_FREE */ + uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES]; + uint16_t lf_next; /* next in free list, or CHAIN_END */ + } l_free; + } l_chunk[ZAP_LEAF_NUMCHUNKS]; +} zap_leaf_phys_t; + +typedef struct zap_leaf { + krwlock_t l_rwlock; /* only used on head of chain */ + uint64_t l_blkid; /* 1<<ZAP_BLOCK_SHIFT byte block 
off */ + struct zap_leaf *l_next; /* next in chain */ + dmu_buf_t *l_dbuf; + zap_leaf_phys_t *l_phys; +} zap_leaf_t; + + +typedef struct zap_entry_handle { + /* below is set by zap_leaf.c and is public to zap.c */ + uint64_t zeh_num_integers; + uint64_t zeh_hash; + uint32_t zeh_cd; + uint8_t zeh_integer_size; + + /* below is private to zap_leaf.c */ + uint16_t zeh_fakechunk; + uint16_t *zeh_chunkp; + zap_leaf_t *zeh_head_leaf; + zap_leaf_t *zeh_found_leaf; +} zap_entry_handle_t; + +/* + * Return a handle to the named entry, or ENOENT if not found. The hash + * value must equal zap_hash(name). + */ +extern int zap_leaf_lookup(zap_leaf_t *l, + const char *name, uint64_t h, zap_entry_handle_t *zeh); + +/* + * Return a handle to the entry with this hash+cd, or the entry with the + * next closest hash+cd. + */ +extern int zap_leaf_lookup_closest(zap_leaf_t *l, + uint64_t hash, uint32_t cd, zap_entry_handle_t *zeh); + +/* + * Read the first num_integers in the attribute. Integer size + * conversion will be done without sign extension. Return EINVAL if + * integer_size is too small. Return EOVERFLOW if there are more than + * num_integers in the attribute. + */ +extern int zap_entry_read(const zap_entry_handle_t *zeh, + uint8_t integer_size, uint64_t num_integers, void *buf); + +extern int zap_entry_read_name(const zap_entry_handle_t *zeh, + uint16_t buflen, char *buf); + +/* + * Replace the value of an existing entry. + * + * zap_entry_update may fail if it runs out of space (ENOSPC). + */ +extern int zap_entry_update(zap_entry_handle_t *zeh, + uint8_t integer_size, uint64_t num_integers, const void *buf); + +/* + * Remove an entry. + */ +extern void zap_entry_remove(zap_entry_handle_t *zeh); + +/* + * Create an entry. An equal entry must not exist, and this entry must + * belong in this leaf (according to its hash value). Fills in the + * entry handle on success. Returns 0 on success or ENOSPC on failure. 
+ */ +extern int zap_entry_create(zap_leaf_t *l, + const char *name, uint64_t h, uint32_t cd, + uint8_t integer_size, uint64_t num_integers, const void *buf, + zap_entry_handle_t *zeh); + +/* + * Other stuff. + */ + +extern void zap_leaf_init(zap_leaf_t *l); +extern void zap_leaf_byteswap(zap_leaf_phys_t *buf); + +extern zap_leaf_t *zap_leaf_split(struct zap *zap, zap_leaf_t *l, dmu_tx_t *tx); + +extern int zap_leaf_merge(zap_leaf_t *l, zap_leaf_t *sibling); + +extern zap_leaf_t *zap_leaf_chainmore(zap_leaf_t *l, zap_leaf_t *nl); + +extern int zap_leaf_advance(zap_leaf_t *l, zap_cursor_t *zc); + +extern void zap_stats_leaf(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZAP_LEAF_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_acl.h b/usr/src/uts/common/fs/zfs/sys/zfs_acl.h new file mode 100644 index 0000000000..0050316eb5 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zfs_acl.h @@ -0,0 +1,113 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_FS_ZFS_ACL_H +#define _SYS_FS_ZFS_ACL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef _KERNEL +#include <sys/isa_defs.h> +#include <sys/types32.h> +#endif +#include <sys/acl.h> +#include <sys/dmu.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct znode_phys; + +#define ACCESS_UNDETERMINED -1 + +#define ACE_SLOT_CNT 6 + +typedef struct zfs_znode_acl { + uint64_t z_acl_extern_obj; /* ext acl pieces */ + uint32_t z_acl_count; /* Number of ACEs */ + uint16_t z_acl_version; /* acl version */ + uint16_t z_acl_state; /* goop */ + ace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */ +} zfs_znode_acl_t; + +#define ACL_DATA_ALLOCED 0x1 + +/* + * Max ACL size is prepended deny for all entries + the + * canonical six tacked on * the end. + */ +#define MAX_ACL_SIZE (MAX_ACL_ENTRIES * 2 + 6) + +typedef struct zfs_acl { + int z_slots; /* number of allocated slots for ACEs */ + int z_acl_count; + uint_t z_state; + ace_t *z_acl; +} zfs_acl_t; + +#define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt)) + +/* + * Property values for acl_mode and acl_inherit. + * + * acl_mode can take discard, noallow, groupmask and passthrough. + * whereas acl_inherit has secure instead of groupmask. 
+ */ + +#define DISCARD 0 +#define NOALLOW 1 +#define GROUPMASK 2 +#define PASSTHROUGH 3 +#define SECURE 4 + +struct znode; + +#ifdef _KERNEL +void zfs_perm_init(struct znode *, struct znode *, int, vattr_t *, + dmu_tx_t *, cred_t *); +int zfs_getacl(struct znode *, vsecattr_t *, cred_t *); +int zfs_mode_update(struct znode *, uint64_t, dmu_tx_t *); +int zfs_setacl(struct znode *, vsecattr_t *, cred_t *); +void zfs_acl_rele(void *); +void zfs_ace_byteswap(ace_t *, int); +extern int zfs_zaccess(struct znode *, int, cred_t *); +extern int zfs_zaccess_rwx(struct znode *, mode_t, cred_t *); +extern int zfs_acl_access(struct znode *, int, cred_t *); +int zfs_acl_chmod_setattr(struct znode *, uint64_t, dmu_tx_t *); +int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *); +int zfs_zaccess_rename(struct znode *, struct znode *, + struct znode *, struct znode *, cred_t *cr); +int zfs_zaccess_v4_perm(struct znode *, int, cred_t *); +void zfs_acl_free(zfs_acl_t *); +zfs_acl_t *zfs_acl_node_read(struct znode *); + +#endif + +#ifdef __cplusplus +} +#endif +#endif /* _SYS_FS_ZFS_ACL_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_context.h b/usr/src/uts/common/fs/zfs/sys/zfs_context.h new file mode 100644 index 0000000000..2f0e3e792d --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zfs_context.h @@ -0,0 +1,71 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZFS_CONTEXT_H +#define _SYS_ZFS_CONTEXT_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/note.h> +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/atomic.h> +#include <sys/sysmacros.h> +#include <sys/bitmap.h> +#include <sys/cmn_err.h> +#include <sys/kmem.h> +#include <sys/taskq.h> +#include <sys/buf.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/cpuvar.h> +#include <sys/kobj.h> +#include <sys/conf.h> +#include <sys/disp.h> +#include <sys/debug.h> +#include <sys/random.h> +#include <sys/byteorder.h> +#include <sys/systm.h> +#include <sys/list.h> +#include <sys/uio.h> +#include <sys/dirent.h> +#include <sys/time.h> +#include <vm/seg_kmem.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/zfs_debug.h> + +#define CPU_SEQID (CPU->cpu_seqid) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_CONTEXT_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ctldir.h b/usr/src/uts/common/fs/zfs/sys/zfs_ctldir.h new file mode 100644 index 0000000000..78d82ccbe2 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zfs_ctldir.h @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _ZFS_CTLDIR_H +#define _ZFS_CTLDIR_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/pathname.h> +#include <sys/vnode.h> +#include <sys/zfs_vfsops.h> +#include <sys/zfs_znode.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZFS_CTLDIR_NAME ".zfs" + +#define zfs_has_ctldir(zdp) \ + ((zdp)->z_id == (zdp)->z_zfsvfs->z_root && \ + ((zdp)->z_zfsvfs->z_ctldir != NULL)) +#define zfs_show_ctldir(zdp) \ + (zfs_has_ctldir(zdp) && \ + ((zdp)->z_zfsvfs->z_show_ctldir)) + +void zfsctl_create(zfsvfs_t *); +void zfsctl_destroy(zfsvfs_t *); +vnode_t *zfsctl_root(znode_t *); +void zfsctl_init(void); +void zfsctl_fini(void); + +int zfsctl_rename_snapshot(const char *from, const char *to); +int zfsctl_destroy_snapshot(const char *snapname, int force); +int zfsctl_umount_snapshots(vfs_t *, int, cred_t *); + +int zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, + int flags, vnode_t *rdir, cred_t *cr); + +int zfsctl_make_fid(zfsvfs_t *zfsvfsp, uint64_t object, uint32_t gen, + fid_t *fidp); +int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp); + +#define ZFSCTL_INO_ROOT 0x1 +#define ZFSCTL_INO_SNAPDIR 0x2 + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_CTLDIR_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_debug.h b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h new file mode 100644 index 0000000000..07eb3d2da8 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h @@ -0,0 +1,73 @@ +/* + * CDDL HEADER START + * + * The 
contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZFS_DEBUG_H +#define _SYS_ZFS_DEBUG_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef TRUE +#define TRUE 1 +#endif + +#ifndef FALSE +#define FALSE 0 +#endif + +/* + * ZFS debugging + */ + +#if defined(DEBUG) || !defined(_KERNEL) +#define ZFS_DEBUG +#endif + +extern int zfs_flags; + +#define ZFS_DEBUG_DPRINTF 0x0001 +#define ZFS_DEBUG_DBUF_VERIFY 0x0002 +#define ZFS_DEBUG_DNODE_VERIFY 0x0004 +#define ZFS_DEBUG_SNAPNAMES 0x0008 + +#ifdef ZFS_DEBUG +extern void __dprintf(const char *file, const char *func, + int line, const char *fmt, ...); +#define dprintf(...) \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) \ + __dprintf(__FILE__, __func__, __LINE__, __VA_ARGS__) +#else +#define dprintf(...) 
((void)0) +#endif /* ZFS_DEBUG */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_DEBUG_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_dir.h b/usr/src/uts/common/fs/zfs/sys/zfs_dir.h new file mode 100644 index 0000000000..8ab760f618 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zfs_dir.h @@ -0,0 +1,70 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_FS_ZFS_DIR_H +#define _SYS_FS_ZFS_DIR_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/zfs_znode.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* zfs_dirent_lock() flags */ +#define ZNEW 0x0001 /* entry should not exist */ +#define ZEXISTS 0x0002 /* entry should exist */ +#define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */ +#define ZXATTR 0x0008 /* we want the xattr dir */ +#define ZRENAMING 0x0010 /* znode is being renamed */ + +/* mknode flags */ +#define IS_ROOT_NODE 0x01 /* create a root node */ +#define IS_XATTR 0x02 /* create an extended attribute node */ +#define IS_REPLAY 0x04 /* we are replaying intent log */ + +extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **, + int); +extern void zfs_dirent_unlock(zfs_dirlock_t *); +extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int); +extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int, int *); +extern int zfs_dirlook(znode_t *, char *, vnode_t **); +extern void zfs_mknode(znode_t *, vattr_t *, uint64_t *, + dmu_tx_t *, cred_t *, uint_t, znode_t **, int); +extern void zfs_rmnode(znode_t *); +extern boolean_t zfs_dirempty(znode_t *); +extern void zfs_dq_add(znode_t *, dmu_tx_t *); +extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr); +extern int zfs_get_xattrdir(znode_t *, vnode_t **, cred_t *); +extern int zfs_make_xattrdir(znode_t *, vattr_t *, vnode_t **, cred_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_DIR_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h new file mode 100644 index 0000000000..cbe8bbc5cb --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -0,0 +1,187 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). 
You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZFS_IOCTL_H +#define _SYS_ZFS_IOCTL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/cred.h> +#include <sys/dmu.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZFS_DRIVER_NAME "zfs" +#define ZFS_DS_TYPE "zfs" + +/* + * Property values for snapdir + */ +#define HIDDEN 0 +#define VISIBLE 1 + +typedef struct zfs_stats { + uint64_t zs_atime; + uint64_t zs_recordsize; + uint64_t zs_readonly; + uint64_t zs_devices; + uint64_t zs_exec; + uint64_t zs_setuid; + uint64_t zs_snapdir; + uint64_t zs_acl_mode; + uint64_t zs_acl_inherit; + char zs_mountpoint[MAXNAMELEN]; + char zs_atime_setpoint[MAXNAMELEN]; + char zs_recordsize_setpoint[MAXNAMELEN]; + char zs_readonly_setpoint[MAXNAMELEN]; + char zs_devices_setpoint[MAXNAMELEN]; + char zs_setuid_setpoint[MAXNAMELEN]; + char zs_exec_setpoint[MAXNAMELEN]; + char zs_mountpoint_setpoint[MAXNAMELEN]; + char zs_sharenfs[MAXNAMELEN]; + char zs_sharenfs_setpoint[MAXNAMELEN]; + char zs_snapdir_setpoint[MAXNAMELEN]; + char zs_acl_mode_setpoint[MAXNAMELEN]; + char zs_acl_inherit_setpoint[MAXNAMELEN]; +} zfs_stats_t; + +#define DMU_BACKUP_VERSION (1ULL) +#define DMU_BACKUP_MAGIC 0x2F5bacbacULL + +/* + * zfs ioctl command structure 
+ */ +typedef struct dmu_replay_record { + enum { + DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, + DRR_WRITE, DRR_FREE, DRR_END, + } drr_type; + uint32_t drr_pad; + union { + struct drr_begin { + uint64_t drr_magic; + uint64_t drr_version; + uint64_t drr_creation_time; + dmu_objset_type_t drr_type; + uint32_t drr_pad; + uint64_t drr_toguid; + uint64_t drr_fromguid; + char drr_toname[MAXNAMELEN]; + } drr_begin; + struct drr_end { + uint64_t drr_checksum; + } drr_end; + struct drr_object { + uint64_t drr_object; + dmu_object_type_t drr_type; + dmu_object_type_t drr_bonustype; + uint32_t drr_blksz; + uint32_t drr_bonuslen; + uint8_t drr_checksum; + uint8_t drr_compress; + uint8_t drr_pad[6]; + } drr_object; + struct drr_freeobjects { + uint64_t drr_firstobj; + uint64_t drr_numobjs; + } drr_freeobjects; + struct drr_write { + uint64_t drr_object; + dmu_object_type_t drr_type; + uint32_t drr_pad; + uint64_t drr_offset; + uint64_t drr_length; + } drr_write; + struct drr_free { + uint64_t drr_object; + uint64_t drr_offset; + uint64_t drr_length; + } drr_free; + } drr_u; +} dmu_replay_record_t; + +typedef struct zfs_cmd { + char zc_name[MAXNAMELEN]; + char zc_prop_name[MAXNAMELEN]; + char zc_prop_value[MAXPATHLEN]; + char zc_root[MAXPATHLEN]; + char zc_filename[MAXPATHLEN]; + uint32_t zc_intsz; + uint32_t zc_numints; + uint64_t zc_pool_guid; + uint64_t zc_config_src; /* really (char *) */ + uint64_t zc_config_src_size; + uint64_t zc_config_dst; /* really (char *) */ + uint64_t zc_config_dst_size; + uint64_t zc_cookie; + uint64_t zc_cred; + uint64_t zc_dev; + uint64_t zc_volsize; + uint64_t zc_volblocksize; + uint64_t zc_objset_type; + zfs_stats_t zc_zfs_stats; + dmu_object_info_t zc_object_info; + dmu_objset_stats_t zc_objset_stats; + struct drr_begin zc_begin_record; +} zfs_cmd_t; + +#ifdef _KERNEL + +extern dev_info_t *zfs_dip; + +extern int zfs_secpolicy_write(const char *dataset, const char *, cred_t *cr); +extern int zfs_busy(void); + +extern int 
zvol_check_volsize(zfs_cmd_t *zc); +extern int zvol_check_volblocksize(zfs_cmd_t *zc); +extern int zvol_get_stats(zfs_cmd_t *zc, objset_t *os); +extern void zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx); +extern int zvol_create_minor(zfs_cmd_t *zc); +extern int zvol_remove_minor(zfs_cmd_t *zc); +extern int zvol_set_volsize(zfs_cmd_t *zc); +extern int zvol_set_volblocksize(zfs_cmd_t *zc); +extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr); +extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr); +extern int zvol_strategy(buf_t *bp); +extern int zvol_read(dev_t dev, uio_t *uiop, cred_t *cr); +extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr); +extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr); +extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr); +extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, + int *rvalp); +extern int zvol_busy(void); +extern void zvol_init(void); +extern void zvol_fini(void); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_IOCTL_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h b/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h new file mode 100644 index 0000000000..cd0700f641 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h @@ -0,0 +1,116 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FS_ZFS_VFSOPS_H +#define _SYS_FS_ZFS_VFSOPS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/isa_defs.h> +#include <sys/types32.h> +#include <sys/list.h> +#include <sys/vfs.h> +#include <sys/zil.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct zfs_delete_list { + kmutex_t z_mutex; + kcondvar_t z_cv; + kcondvar_t z_quiesce_cv; + uint8_t z_drained; + uint8_t z_draining; + uint32_t z_thread_target; + uint32_t z_thread_count; + uint64_t z_znode_count; + list_t z_znodes; +} zfs_delete_t; + +typedef struct zfsvfs zfsvfs_t; + +struct zfsvfs { + vfs_t *z_vfs; /* generic fs struct */ + zfsvfs_t *z_parent; /* parent fs */ + objset_t *z_os; /* objset reference */ + uint64_t z_root; /* id of root znode */ + uint64_t z_dqueue; /* delete queue */ + uint64_t z_max_blksz; /* maximum block size for files */ + uint64_t z_assign; /* TXG_NOWAIT or set by zil_replay() */ + zilog_t *z_log; /* intent log pointer */ + uint_t z_acl_mode; /* acl chmod/mode behavior */ + uint_t z_acl_inherit; /* acl inheritance behavior */ + boolean_t z_atime; /* enable atimes mount option */ + boolean_t z_unmounted1; /* unmounted phase 1 */ + boolean_t z_unmounted2; /* unmounted phase 2 */ + uint32_t z_op_cnt; /* vnode/vfs operations ref count */ + krwlock_t z_um_lock; /* rw lock for umount phase 2 */ + zfs_delete_t z_delete_head; /* zfs delete list */ + list_t z_all_znodes; /* all vnodes in the fs */ + kmutex_t z_znodes_lock; /* lock for z_all_znodes */ + vnode_t *z_ctldir; /* .zfs directory pointer */ + boolean_t z_show_ctldir; /* expose .zfs in the root dir */ + boolean_t z_issnap; /* true if this is a snapshot */ +#define 
ZFS_OBJ_MTX_SZ 64 + kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ +}; + +/* + * The total file ID size is limited to 12 bytes (including the length + * field) in the NFSv2 protocol. For historical reasons, this same limit + * is currently being imposed by the Solaris NFSv3 implementation... + * although the protocol actually permits a maximum of 64 bytes. It will + * not be possible to expand beyond 12 bytes without abandoning support + * of NFSv2 and making some changes to the Solaris NFSv3 implementation. + * + * For the time being, we will partition up the available space as follows: + * 2 bytes fid length (required) + * 6 bytes object number (48 bits) + * 4 bytes generation number (32 bits) + * We reserve only 48 bits for the object number, as this is the limit + * currently defined and imposed by the DMU. + */ +typedef struct zfid_short { + uint16_t zf_len; + uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */ + uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ +} zfid_short_t; + +typedef struct zfid_long { + zfid_short_t z_fid; + uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ + uint8_t zf_setgen[4]; /* gen[i] = gen >> (8 * i) */ +} zfid_long_t; + +#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) +#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_VFSOPS_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h new file mode 100644 index 0000000000..d3f28df4cd --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h @@ -0,0 +1,283 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FS_ZFS_ZNODE_H +#define _SYS_FS_ZFS_ZNODE_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef _KERNEL +#include <sys/isa_defs.h> +#include <sys/types32.h> +#include <sys/list.h> +#include <sys/dmu.h> +#include <sys/zfs_vfsops.h> +#endif +#include <sys/zfs_acl.h> +#include <sys/zil.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Define special zfs pflags + */ +#define ZFS_XATTR 0x1 /* is an extended attribute */ +#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */ + +#define MASTER_NODE_OBJ 1 + +/* + * special attributes for master node. + */ + +#define ZFS_FSID "FSID" +#define ZFS_DELETE_QUEUE "DELETE_QUEUE" +#define ZFS_ROOT_OBJ "ROOT" +#define ZFS_VERSION_OBJ "VERSION" +#define ZFS_PROP_BLOCKPERPAGE "BLOCKPERPAGE" +#define ZFS_PROP_NOGROWBLOCKS "NOGROWBLOCKS" + +#define ZFS_FLAG_BLOCKPERPAGE 0x1 +#define ZFS_FLAG_NOGROWBLOCKS 0x2 + +/* + * ZFS version - rev'd whenever an incompatible on-disk format change + * occurs. Independent of SPA/DMU/ZAP versioning. + */ + +#define ZFS_VERSION 1ULL + +#define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE) + +/* Path component length */ +/* + * The generic fs code uses MAXNAMELEN to represent + * what the largest component length is. Unfortunately, + * this length includes the terminating NULL. 
ZFS needs + * to tell the users via pathconf() and statvfs() what the + * true maximum length of a component is, excluding the NULL. + */ +#define ZFS_MAXNAMELEN (MAXNAMELEN - 1) + +/* + * This is the persistent portion of the znode. It is stored + * in the "bonus buffer" of the file. Short symbolic links + * are also stored in the bonus buffer. + */ +typedef struct znode_phys { + uint64_t zp_atime[2]; /* 0 - last file access time */ + uint64_t zp_mtime[2]; /* 16 - last file modification time */ + uint64_t zp_ctime[2]; /* 32 - last file change time */ + uint64_t zp_crtime[2]; /* 48 - creation time */ + uint64_t zp_gen; /* 64 - generation (txg of creation) */ + uint64_t zp_mode; /* 72 - file mode bits */ + uint64_t zp_size; /* 80 - size of file */ + uint64_t zp_parent; /* 88 - directory parent (`..') */ + uint64_t zp_links; /* 96 - number of links to file */ + uint64_t zp_xattr; /* 104 - DMU object for xattrs */ + uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */ + uint64_t zp_flags; /* 120 - persistent flags */ + uint64_t zp_uid; /* 128 - file owner */ + uint64_t zp_gid; /* 136 - owning group */ + uint64_t zp_pad[4]; /* 144 - future */ + zfs_znode_acl_t zp_acl; /* 176 - 263 ACL */ + /* + * Data may pad out any remaining bytes in the znode buffer, eg: + * + * |<---------------------- dnode_phys (512) ------------------------>| + * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->| + * |<---- znode (264) ---->|<---- data (56) ---->| + * + * At present, we only use this space to store symbolic links. + */ +} znode_phys_t; + +/* + * Directory entry locks control access to directory entries. + * They are used to protect creates, deletes, and renames. + * Each directory znode has a mutex and a list of locked names. 
+ */ +#ifdef _KERNEL +typedef struct zfs_dirlock { + char *dl_name; /* directory entry being locked */ + uint32_t dl_sharecnt; /* 0 if exclusive, > 0 if shared */ + uint16_t dl_namesize; /* set if dl_name was allocated */ + kcondvar_t dl_cv; /* wait for entry to be unlocked */ + struct znode *dl_dzp; /* directory znode */ + struct zfs_dirlock *dl_next; /* next in z_dirlocks list */ +} zfs_dirlock_t; + +struct zcache_state; + +typedef struct znode { + struct zfsvfs *z_zfsvfs; + vnode_t *z_vnode; + list_node_t z_list_node; /* deleted znodes */ + uint64_t z_id; /* object ID for this znode */ + kmutex_t z_lock; /* znode modification lock */ + krwlock_t z_map_lock; /* page map lock */ + krwlock_t z_grow_lock; /* grow block size lock */ + krwlock_t z_append_lock; /* append-mode lock */ + zfs_dirlock_t *z_dirlocks; /* directory entry lock list */ + uint8_t z_active; /* znode is in use */ + uint8_t z_reap; /* reap file at last reference */ + uint8_t z_atime_dirty; /* atime needs to be synced */ + uint8_t z_dbuf_held; /* Is z_dbuf already held? */ + uint_t z_mapcnt; /* number of memory maps to file */ + uint_t z_blksz; /* block size in bytes */ + uint_t z_seq; /* modification sequence number */ + uint64_t z_last_itx; /* last ZIL itx on this znode */ + kmutex_t z_acl_lock; /* acl data lock */ + list_node_t z_link_node; /* all znodes in fs link */ + list_node_t z_zcache_node; + struct zcache_state *z_zcache_state; + uint64_t z_zcache_access; + + /* + * These are dmu managed fields. + */ + znode_phys_t *z_phys; /* pointer to persistent znode */ + dmu_buf_t *z_dbuf; /* buffer containing the z_phys */ +} znode_t; + +/* + * The grow_lock is only applicable to "regular" files. + * The parent_lock is only applicable to directories. 
+ */ +#define z_parent_lock z_grow_lock + +/* + * Convert between znode pointers and vnode pointers + */ +#define ZTOV(ZP) ((ZP)->z_vnode) +#define VTOZ(VP) ((znode_t *)(VP)->v_data) + +/* + * ZFS_ENTER() is called on entry to each ZFS vnode and vfs operation. + * ZFS_EXIT() must be called before exitting the vop. + */ +#define ZFS_ENTER(zfsvfs) \ + { \ + atomic_add_32(&(zfsvfs)->z_op_cnt, 1); \ + if ((zfsvfs)->z_unmounted1) { \ + ZFS_EXIT(zfsvfs); \ + return (EIO); \ + } \ + } +#define ZFS_EXIT(zfsvfs) atomic_add_32(&(zfsvfs)->z_op_cnt, -1) + +/* + * Macros for dealing with dmu_buf_hold + */ +#define ZFS_OBJ_HASH(obj_num) (obj_num & (ZFS_OBJ_MTX_SZ - 1)) +#define ZFS_OBJ_MUTEX(zp) \ + (&zp->z_zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(zp->z_id)]) +#define ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \ + mutex_enter(&zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]); + +#define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \ + mutex_exit(&zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]) + +/* + * Macros to encode/decode ZFS stored time values from/to struct timespec + */ +#define ZFS_TIME_ENCODE(tp, stmp) \ +{ \ + stmp[0] = (uint64_t)(tp)->tv_sec; \ + stmp[1] = (uint64_t)(tp)->tv_nsec; \ +} + +#define ZFS_TIME_DECODE(tp, stmp) \ +{ \ + (tp)->tv_sec = (time_t)stmp[0]; \ + (tp)->tv_nsec = (long)stmp[1]; \ +} + +/* + * Timestamp defines + */ +#define ACCESSED (AT_ATIME) +#define STATE_CHANGED (AT_CTIME) +#define CONTENT_MODIFIED (AT_MTIME | AT_CTIME) + +#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \ + if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \ + zfs_time_stamper(zp, ACCESSED, NULL) + +extern int zfs_init_fs(zfsvfs_t *, znode_t **, cred_t *); +extern void zfs_set_dataprop(objset_t *); +extern void zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx); +extern void zfs_time_stamper(znode_t *, uint_t, dmu_tx_t *); +extern void zfs_time_stamper_locked(znode_t *, uint_t, dmu_tx_t *); +extern int zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *); +extern int zfs_freesp(znode_t *, uint64_t, 
uint64_t, int, dmu_tx_t *, + cred_t *cr); +extern void zfs_znode_init(void); +extern void zfs_znode_fini(void); +extern znode_t *zfs_znode_alloc(zfsvfs_t *, dmu_buf_t *, uint64_t, int); +extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **); +extern void zfs_zinactive(znode_t *); +extern void zfs_znode_delete(znode_t *, dmu_tx_t *); +extern void zfs_znode_free(znode_t *); +extern int zfs_delete_thread_target(zfsvfs_t *zfsvfs, int nthreads); +extern void zfs_delete_wait_empty(zfsvfs_t *zfsvfs); +extern void zfs_zcache_flush(zfsvfs_t *zfsvf); +extern void zfs_remove_op_tables(); +extern int zfs_create_op_tables(); +extern int zfs_sync(vfs_t *vfsp, short flag, cred_t *cr); + +extern uint64_t zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *dzp, znode_t *zp, char *name); +extern uint64_t zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *dzp, char *name); +extern uint64_t zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *dzp, znode_t *zp, char *name); +extern uint64_t zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *dzp, znode_t *zp, char *name, char *link); +extern uint64_t zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp); +extern uint64_t zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, offset_t off, ssize_t len, int ioflag, uio_t *uio); +extern uint64_t zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, uint64_t off, uint64_t len); +extern uint64_t zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, vattr_t *vap, uint_t mask_applied); +extern uint64_t zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, int aclcnt, ace_t *z_ace); + +extern zil_get_data_t zfs_get_data; +extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; +extern int zfsfstype; + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_ZNODE_H 
*/ diff --git a/usr/src/uts/common/fs/zfs/sys/zil.h b/usr/src/uts/common/fs/zfs/sys/zil.h new file mode 100644 index 0000000000..a03dcc6bc9 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zil.h @@ -0,0 +1,242 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZIL_H +#define _SYS_ZIL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dmu.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Intent log format: + * + * Each objset has its own intent log. The log header (zil_header_t) + * for objset N's intent log is kept in the Nth object of the SPA's + * intent_log objset. The log header points to a chain of log blocks, + * each of which contains log records (i.e., transactions) followed by + * a log block trailer (zil_trailer_t). The format of a log record + * depends on the record (or transaction) type, but all records begin + * with a common structure that defines the type, length, and txg. 
+ */ + +/* + * Intent log header - this on disk structure holds fields to manage + * the log. All fields are 64 bit to easily handle cross architectures. + */ +typedef struct zil_header { + uint64_t zh_claim_txg; /* txg in which log blocks were claimed */ + uint64_t zh_replay_seq; /* highest replayed sequence number */ + blkptr_t zh_log; /* log chain */ + uint64_t zit_pad[6]; +} zil_header_t; + +/* + * Log block trailer - structure at the end of the header and each log block + * + * The zit_bt contains a zbt_cksum which for the intent log is + * the sequence number of this log block. A seq of 0 is invalid. + * The zbt_cksum is checked by the SPA against the sequence + * number passed in the blk_cksum field of the blkptr_t + */ +typedef struct zil_trailer { + uint64_t zit_pad; + blkptr_t zit_next_blk; /* next block in chain */ + uint64_t zit_nused; /* bytes in log block used */ + zio_block_tail_t zit_bt; /* block trailer */ +} zil_trailer_t; + +#define ZIL_MIN_BLKSZ 4096 +#define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE +#define ZIL_BLK_DATA_SZ(lwb) ((lwb)->lwb_sz - sizeof (zil_trailer_t)) + +/* + * Intent log transaction types and record structures + */ +#define TX_CREATE 1 /* Create file */ +#define TX_MKDIR 2 /* Make directory */ +#define TX_MKXATTR 3 /* Make XATTR directory */ +#define TX_SYMLINK 4 /* Create symbolic link to a file */ +#define TX_REMOVE 5 /* Remove file */ +#define TX_RMDIR 6 /* Remove directory */ +#define TX_LINK 7 /* Create hard link to a file */ +#define TX_RENAME 8 /* Rename a file */ +#define TX_WRITE 9 /* File write */ +#define TX_TRUNCATE 10 /* Truncate a file */ +#define TX_SETATTR 11 /* Set file attributes */ +#define TX_ACL 12 /* Set acl */ +#define TX_MAX_TYPE 13 /* Max transaction type */ + +/* + * Format of log records. + * The fields are carefully defined to allow them to be aligned + * and sized the same on sparc & intel architectures. + * Each log record has a common structure at the beginning. 
+ */ +typedef struct { /* common log record header */ + uint64_t lrc_txtype; /* intent log transaction type */ + uint64_t lrc_reclen; /* transaction record length */ + uint64_t lrc_txg; /* dmu transaction group number */ + uint64_t lrc_seq; /* intent log sequence number */ +} lr_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_doid; /* object id of directory */ + uint64_t lr_foid; /* object id of created file object */ + uint64_t lr_mode; /* mode of object */ + uint64_t lr_uid; /* uid of object */ + uint64_t lr_gid; /* gid of object */ + uint64_t lr_gen; /* generation (txg of creation) */ + uint64_t lr_crtime[2]; /* creation time */ + uint64_t lr_rdev; /* rdev of object to create */ + /* name of object to create follows this */ + /* for symlinks, link content follows name */ +} lr_create_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_doid; /* obj id of directory */ + /* name of object to remove follows this */ +} lr_remove_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_doid; /* obj id of directory */ + uint64_t lr_link_obj; /* obj id of link */ + /* name of object to link follows this */ +} lr_link_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_sdoid; /* obj id of source directory */ + uint64_t lr_tdoid; /* obj id of target directory */ + /* 2 strings: names of source and destination follow this */ +} lr_rename_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* file object to write */ + uint64_t lr_offset; /* offset to write to */ + uint64_t lr_length; /* user data length to write */ + uint64_t lr_blkoff; /* offset represented by lr_blkptr */ + blkptr_t lr_blkptr; /* spa block pointer for replay */ + /* write data will follow for small writes */ +} lr_write_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; 
/* object id of file to truncate */ + uint64_t lr_offset; /* offset to truncate from */ + uint64_t lr_length; /* length to truncate */ +} lr_truncate_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* file object to change attributes */ + uint64_t lr_mask; /* mask of attributes to set */ + uint64_t lr_mode; /* mode to set */ + uint64_t lr_uid; /* uid to set */ + uint64_t lr_gid; /* gid to set */ + uint64_t lr_size; /* size to set */ + uint64_t lr_atime[2]; /* access time */ + uint64_t lr_mtime[2]; /* modification time */ +} lr_setattr_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* obj id of file */ + uint64_t lr_aclcnt; /* number of acl entries */ + /* lr_aclcnt number of ace_t entries follow this */ +} lr_acl_t; + +/* + * ZIL structure definitions, interface function prototype and globals. + */ + +/* + * ZFS intent log transaction structure + */ +typedef struct itx { + list_node_t itx_node; /* linkage on zl_itx_list */ + void *itx_private; /* type-specific opaque data */ + uint8_t itx_data_copied; /* TX_WRITE only: write data already */ + /* copied into itx data buffer */ + lr_t itx_lr; /* common part of log record */ + /* followed by type-specific part of lr_xx_t and its immediate data */ +} itx_t; + +typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg, + uint64_t txg); +typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, + uint64_t txg); +typedef int zil_replay_func_t(); +typedef int zil_get_data_t(void *arg, lr_write_t *lr); + +extern void zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, + zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg); + +extern void zil_init(void); +extern void zil_fini(void); + +extern zilog_t *zil_alloc(objset_t *os, zil_header_t *zh_phys); +extern void zil_free(zilog_t *zilog); + +extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data); +extern void 
zil_close(zilog_t *zilog); + +extern void zil_replay(objset_t *os, void *arg, uint64_t *txgp, + zil_replay_func_t *replay_func[TX_MAX_TYPE], void (*rm_wait)(void *)); +extern void zil_destroy(zilog_t *zilog); + +extern itx_t *zil_itx_create(int txtype, size_t lrsize); +extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); + +extern void zil_commit(zilog_t *zilog, uint64_t seq, int ioflag); + +extern void zil_claim(char *osname, void *txarg); +extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx); +extern void zil_clean(zilog_t *zilog); + +extern int zil_suspend(zilog_t *zilog); +extern void zil_resume(zilog_t *zilog); + +extern int zil_disable; +extern int zil_always; +extern int zil_purge; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZIL_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zil_impl.h b/usr/src/uts/common/fs/zfs/sys/zil_impl.h new file mode 100644 index 0000000000..6286fc5aa3 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zil_impl.h @@ -0,0 +1,111 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_ZIL_IMPL_H +#define _SYS_ZIL_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zil.h> +#include <sys/dmu_objset.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum lwb_state_type { + UNWRITTEN, /* buffer yet to be written */ + SEQ_INCOMPLETE, /* buffer written, but there's an unwritten buffer in */ + /* the sequence before this */ + SEQ_COMPLETE, /* no unwritten buffers before this */ +} lwb_state_t; + +/* + * Log write buffer. + */ +typedef struct lwb { + zilog_t *lwb_zilog; /* back pointer to log struct */ + blkptr_t lwb_blk; /* on disk address of this log blk */ + int lwb_nused; /* # used bytes in buffer */ + int lwb_sz; /* size of block and buffer */ + char *lwb_buf; /* log write buffer */ + uint64_t lwb_max_txg; /* highest txg in this lwb */ + uint64_t lwb_seq; /* highest log record seq number */ + txg_handle_t lwb_txgh; /* txg handle for txg_exit() */ + list_node_t lwb_node; /* zilog->zl_lwb_list linkage */ + lwb_state_t lwb_state; /* buffer state */ +} lwb_t; + +/* + * [vdev, seq] element for use in flushing device write caches + */ +typedef struct zil_vdev { + uint64_t vdev; /* device written */ + uint64_t seq; /* itx sequence */ + list_node_t vdev_seq_node; /* zilog->zl_vdev_list linkage */ +} zil_vdev_t; + +/* + * Stable storage intent log management structure. One per dataset. 
+ */ +struct zilog { + kmutex_t zl_lock; /* protects most zilog_t fields */ + struct dsl_pool *zl_dmu_pool; /* DSL pool */ + spa_t *zl_spa; /* handle for read/write log */ + zil_header_t *zl_header; /* log header buffer */ + objset_t *zl_os; /* object set we're logging */ + zil_get_data_t *zl_get_data; /* callback to get object content */ + uint64_t zl_itx_seq; /* itx sequence number */ + uint64_t zl_ss_seq; /* last tx on stable storage */ + uint64_t zl_destroy_txg; /* txg of last zil_destroy() */ + uint64_t zl_replay_seq[TXG_SIZE]; /* seq of last replayed rec */ + uint32_t zl_suspend; /* log suspend count */ + kcondvar_t zl_cv_write; /* for waiting to write to log */ + kcondvar_t zl_cv_seq; /* for committing a sequence */ + uint8_t zl_stop_replay; /* don't replay any further */ + uint8_t zl_stop_sync; /* for debugging */ + uint8_t zl_writer; /* boolean: write setup in progress */ + uint8_t zl_log_error; /* boolean: log write error */ + list_t zl_itx_list; /* in-memory itx list */ + uint64_t zl_itx_list_sz; /* total size of records on list */ + uint64_t zl_prev_blk_sz; /* previous log block size */ + list_t zl_lwb_list; /* in-flight log write list */ + list_t zl_vdev_list; /* list of [vdev, seq] pairs */ + taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */ + avl_tree_t zl_dva_tree; /* track DVAs during log parse */ + kmutex_t zl_destroy_lock; /* serializes zil_destroy() calls */ +}; + +typedef struct zil_dva_node { + dva_t zn_dva; + avl_node_t zn_node; +} zil_dva_node_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZIL_IMPL_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h new file mode 100644 index 0000000000..5d3227e546 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -0,0 +1,298 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). 
You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _ZIO_H +#define _ZIO_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/avl.h> +#include <sys/dkio.h> +#include <sys/fs/zfs.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZBT_MAGIC 0x210da7ab10c7a11ULL /* zio data bloc tail */ + +typedef struct zio_block_tail { + uint64_t zbt_magic; /* for validation, endianness */ + zio_cksum_t zbt_cksum; /* 256-bit checksum */ +} zio_block_tail_t; + +/* + * Gang block headers are self-checksumming and contain an array + * of block pointers. + */ +#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE +#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \ + sizeof (zio_block_tail_t)) / sizeof (blkptr_t)) +#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \ + sizeof (zio_block_tail_t) - \ + (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ + sizeof (uint64_t)) + +#define ZIO_GET_DVA(zio) (&(zio)->io_bp->blk_dva[(zio)->io_dva_index]) +#define ZIO_GET_IOSIZE(zio) \ + (DVA_GET_GANG(ZIO_GET_DVA(zio)) ? 
\ + SPA_GANGBLOCKSIZE : BP_GET_PSIZE((zio)->io_bp)) + +typedef struct zio_gbh { + blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; + uint64_t zg_filler[SPA_GBH_FILLER]; + zio_block_tail_t zg_tail; +} zio_gbh_phys_t; + +enum zio_checksum { + ZIO_CHECKSUM_INHERIT = 0, + ZIO_CHECKSUM_ON, + ZIO_CHECKSUM_OFF, + ZIO_CHECKSUM_LABEL, + ZIO_CHECKSUM_GANG_HEADER, + ZIO_CHECKSUM_ZILOG, + ZIO_CHECKSUM_FLETCHER_2, + ZIO_CHECKSUM_FLETCHER_4, + ZIO_CHECKSUM_SHA256, + ZIO_CHECKSUM_FUNCTIONS +}; + +#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_2 +#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON + +enum zio_compress { + ZIO_COMPRESS_INHERIT = 0, + ZIO_COMPRESS_ON, + ZIO_COMPRESS_OFF, + ZIO_COMPRESS_LZJB, + ZIO_COMPRESS_FUNCTIONS +}; + +#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB +#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF + +#define ZIO_PRIORITY_NOW (zio_priority_table[0]) +#define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1]) +#define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2]) +#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[3]) +#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[4]) +#define ZIO_PRIORITY_FREE (zio_priority_table[5]) +#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[6]) +#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[7]) +#define ZIO_PRIORITY_RESILVER (zio_priority_table[8]) +#define ZIO_PRIORITY_SCRUB (zio_priority_table[9]) +#define ZIO_PRIORITY_TABLE_SIZE 10 + +#define ZIO_FLAG_MUSTSUCCEED 0x0000 +#define ZIO_FLAG_CANFAIL 0x0001 +#define ZIO_FLAG_FAILFAST 0x0002 +#define ZIO_FLAG_CONFIG_HELD 0x0004 + +#define ZIO_FLAG_DONT_CACHE 0x0010 +#define ZIO_FLAG_DONT_QUEUE 0x0020 +#define ZIO_FLAG_DONT_PROPAGATE 0x0040 +#define ZIO_FLAG_DONT_RETRY 0x0080 + +#define ZIO_FLAG_PHYSICAL 0x0100 +#define ZIO_FLAG_IO_BYPASS 0x0200 +#define ZIO_FLAG_IO_REPAIR 0x0400 +#define ZIO_FLAG_SPECULATIVE 0x0800 + +#define ZIO_FLAG_RESILVER 0x1000 +#define ZIO_FLAG_SCRUB 0x2000 + +#define ZIO_FLAG_GANG_INHERIT \ + (ZIO_FLAG_CANFAIL | \ + ZIO_FLAG_FAILFAST | \ + 
ZIO_FLAG_CONFIG_HELD | \ + ZIO_FLAG_DONT_RETRY | \ + ZIO_FLAG_IO_REPAIR | \ + ZIO_FLAG_SPECULATIVE | \ + ZIO_FLAG_RESILVER | \ + ZIO_FLAG_SCRUB) + +#define ZIO_FLAG_VDEV_INHERIT \ + (ZIO_FLAG_GANG_INHERIT | \ + ZIO_FLAG_DONT_CACHE | \ + ZIO_FLAG_PHYSICAL) + +/* + * We'll take the unused errno 'EBADE' (from the Convergent graveyard) + * to indicate checksum errors. + */ +#define ECKSUM EBADE + +typedef struct zio zio_t; +typedef void zio_done_func_t(zio_t *zio); +typedef struct zio_transform zio_transform_t; + +extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE]; +extern char *zio_type_name[ZIO_TYPES]; + +struct zio { + /* Core information about this I/O */ + zio_t *io_parent; + zio_t *io_root; + spa_t *io_spa; + int io_checksum; + int io_compress; + int io_dva_index; + uint64_t io_txg; + blkptr_t *io_bp; + blkptr_t io_bp_copy; + zio_t *io_child; + zio_t *io_sibling_prev; + zio_t *io_sibling_next; + zio_transform_t *io_transform_stack; + + /* Callback info */ + zio_done_func_t *io_done; + void *io_private; + blkptr_t io_bp_orig; + + /* Data represented by this I/O */ + void *io_data; + uint64_t io_size; + + /* Stuff for the vdev stack */ + vdev_t *io_vd; + void *io_vsd; + uint64_t io_offset; + uint64_t io_deadline; + uint64_t io_timestamp; + avl_node_t io_offset_node; + avl_node_t io_deadline_node; + avl_tree_t *io_vdev_tree; + zio_t *io_delegate_list; + zio_t *io_delegate_next; + zio_t *io_retry_next; + list_node_t io_pending; + + /* Internal pipeline state */ + int io_flags; + uint8_t io_type; + uint8_t io_stage; + uint8_t io_stalled; + uint8_t io_priority; + struct dk_callback io_dk_callback; + int io_cmd; + int io_retries; + int io_error; + uint32_t io_numerrors; + uint32_t io_pipeline; + uint32_t io_async_stages; + uint64_t io_children_notready; + uint64_t io_children_notdone; + void *io_waiter; + kmutex_t io_lock; + kcondvar_t io_cv; +}; + +extern zio_t *zio_null(zio_t *pio, spa_t *spa, + zio_done_func_t *done, void *private, int flags); + +extern zio_t 
*zio_root(spa_t *spa, + zio_done_func_t *done, void *private, int flags); + +extern zio_t *zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, + uint64_t size, zio_done_func_t *done, void *private, + int priority, int flags); + +extern zio_t *zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, + uint64_t txg, blkptr_t *bp, void *data, uint64_t size, + zio_done_func_t *done, void *private, int priority, int flags); + +extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, int checksum, + uint64_t txg, blkptr_t *bp, void *data, uint64_t size, + zio_done_func_t *done, void *private, int priority, int flags); + +extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + zio_done_func_t *done, void *private); + +extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + zio_done_func_t *done, void *private); + +extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, + zio_done_func_t *done, void *private, int priority, int flags); + +extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, + uint64_t size, void *data, int checksum, + zio_done_func_t *done, void *private, int priority, int flags); + +extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, + uint64_t size, void *data, int checksum, + zio_done_func_t *done, void *private, int priority, int flags); + +extern int zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, + blkptr_t *bp, uint64_t txg); +extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg); + +extern int zio_wait(zio_t *zio); +extern void zio_nowait(zio_t *zio); + +extern void *zio_buf_alloc(size_t size); +extern void zio_buf_free(void *buf, size_t size); + +/* + * Move an I/O to the next stage of the pipeline and execute that stage. + * There's no locking on io_stage because there's no legitimate way for + * multiple threads to be attempting to process the same I/O. 
+ */ +extern void zio_next_stage(zio_t *zio); +extern void zio_next_stage_async(zio_t *zio); +extern void zio_wait_children_done(zio_t *zio); + +/* + * Delegate I/O to a child vdev. + */ +extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, + uint64_t offset, void *data, uint64_t size, int type, int priority, + int flags, zio_done_func_t *done, void *private); + +extern void zio_vdev_io_bypass(zio_t *zio); +extern void zio_vdev_io_reissue(zio_t *zio); +extern void zio_vdev_io_redone(zio_t *zio); + +extern void zio_checksum_verified(zio_t *zio); +extern void zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp); + +extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent); +extern uint8_t zio_compress_select(uint8_t child, uint8_t parent); + +/* + * Initial setup and teardown. + */ +extern void zio_init(void); +extern void zio_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZIO_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h new file mode 100644 index 0000000000..ba3dc48d28 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZIO_CHECKSUM_H +#define _SYS_ZIO_CHECKSUM_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Signature for checksum functions. + */ +typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp); + +/* + * Information about each checksum function. + */ +typedef struct zio_checksum_info { + zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */ + int ci_correctable; /* number of correctable bits */ + int ci_zbt; /* uses zio block tail? */ + char *ci_name; /* descriptive name */ +} zio_checksum_info_t; + +extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; + +/* + * Checksum routines. + */ +extern zio_checksum_t fletcher_2_native; +extern zio_checksum_t fletcher_4_native; + +extern zio_checksum_t fletcher_2_byteswap; +extern zio_checksum_t fletcher_4_byteswap; + +extern zio_checksum_t zio_checksum_SHA256; + +extern void zio_checksum(uint_t checksum, zio_cksum_t *zcp, + void *data, uint64_t size); +extern int zio_checksum_error(zio_t *zio); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZIO_CHECKSUM_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zio_compress.h b/usr/src/uts/common/fs/zfs/sys/zio_compress.h new file mode 100644 index 0000000000..7eddf1e8d1 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zio_compress.h @@ -0,0 +1,75 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZIO_COMPRESS_H +#define _SYS_ZIO_COMPRESS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Common signature for all zio compress/decompress functions. + */ +typedef size_t zio_compress_func_t(void *src, void *dst, + size_t s_len, size_t d_len); +typedef int zio_decompress_func_t(void *src, void *dst, + size_t s_len, size_t d_len); + +/* + * Information about each compression function. + */ +typedef struct zio_compress_info { + zio_compress_func_t *ci_compress; + zio_decompress_func_t *ci_decompress; + char *ci_name; +} zio_compress_info_t; + +extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS]; + +/* + * Compression routines. + */ +extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len); +extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len); + +/* + * Compress and decompress data if necessary. 
+ */ +extern int zio_compress_data(int cpfunc, void *src, uint64_t srcsize, + void **destp, uint64_t *destsizep, uint64_t *destbufsizep); +extern int zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, + void *dest, uint64_t destsize); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZIO_COMPRESS_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zio_impl.h b/usr/src/uts/common/fs/zfs/sys/zio_impl.h new file mode 100644 index 0000000000..0b2b07de29 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zio_impl.h @@ -0,0 +1,208 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _ZIO_IMPL_H +#define _ZIO_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/zio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * I/O Groups: pipeline stage definitions. 
+ */ + +typedef enum zio_stage { + ZIO_STAGE_OPEN = 0, /* RWFCI */ + ZIO_STAGE_WAIT_CHILDREN_READY, /* RWFCI */ + + ZIO_STAGE_WRITE_COMPRESS, /* -W--- */ + ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */ + + ZIO_STAGE_GANG_PIPELINE, /* -WFC- */ + + ZIO_STAGE_GET_GANG_HEADER, /* -WFC- */ + ZIO_STAGE_REWRITE_GANG_MEMBERS, /* -W--- */ + ZIO_STAGE_FREE_GANG_MEMBERS, /* --F-- */ + ZIO_STAGE_CLAIM_GANG_MEMBERS, /* ---C- */ + + ZIO_STAGE_DVA_ALLOCATE, /* -W--- */ + ZIO_STAGE_DVA_FREE, /* --F-- */ + ZIO_STAGE_DVA_CLAIM, /* ---C- */ + + ZIO_STAGE_GANG_CHECKSUM_GENERATE, /* -W--- */ + + ZIO_STAGE_READY, /* RWFCI */ + + ZIO_STAGE_DVA_TRANSLATE, /* RW--- */ + + ZIO_STAGE_VDEV_IO_SETUP, /* RW--I */ + ZIO_STAGE_VDEV_IO_START, /* RW--I */ + ZIO_STAGE_VDEV_IO_DONE, /* RW--I */ + ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */ + + ZIO_STAGE_WAIT_CHILDREN_DONE, /* RWFCI */ + + ZIO_STAGE_CHECKSUM_VERIFY, /* R---- */ + ZIO_STAGE_READ_GANG_MEMBERS, /* R---- */ + ZIO_STAGE_READ_DECOMPRESS, /* R---- */ + + ZIO_STAGE_DONE /* RWFCI */ +} zio_stage_t; + +/* + * The stages for which there's some performance value in going async. + * When compression is enabled, ZIO_STAGE_WRITE_COMPRESS is ORed in as well. 
+ */ +#define ZIO_ASYNC_PIPELINE_STAGES \ + ((1U << ZIO_STAGE_CHECKSUM_GENERATE) | \ + (1U << ZIO_STAGE_VDEV_IO_DONE) | \ + (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \ + (1U << ZIO_STAGE_READ_DECOMPRESS)) + +#define ZIO_VDEV_IO_PIPELINE \ + ((1U << ZIO_STAGE_VDEV_IO_SETUP) | \ + (1U << ZIO_STAGE_VDEV_IO_START) | \ + (1U << ZIO_STAGE_VDEV_IO_DONE) | \ + (1U << ZIO_STAGE_VDEV_IO_ASSESS)) + +#define ZIO_READ_PHYS_PIPELINE \ + ((1U << ZIO_STAGE_OPEN) | \ + (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ + (1U << ZIO_STAGE_READY) | \ + ZIO_VDEV_IO_PIPELINE | \ + (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \ + (1U << ZIO_STAGE_DONE)) + +#define ZIO_READ_PIPELINE \ + ((1U << ZIO_STAGE_DVA_TRANSLATE) | \ + ZIO_READ_PHYS_PIPELINE) + +#define ZIO_WRITE_PHYS_PIPELINE \ + ((1U << ZIO_STAGE_OPEN) | \ + (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ + (1U << ZIO_STAGE_CHECKSUM_GENERATE) | \ + (1U << ZIO_STAGE_READY) | \ + ZIO_VDEV_IO_PIPELINE | \ + (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_DONE)) + +#define ZIO_WRITE_COMMON_PIPELINE \ + ((1U << ZIO_STAGE_DVA_TRANSLATE) | \ + ZIO_WRITE_PHYS_PIPELINE) + +#define ZIO_WRITE_PIPELINE \ + ((1U << ZIO_STAGE_WRITE_COMPRESS) | \ + ZIO_WRITE_COMMON_PIPELINE) + +#define ZIO_GANG_STAGES \ + ((1U << ZIO_STAGE_GET_GANG_HEADER) | \ + (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \ + (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \ + (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \ + (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \ + (1U << ZIO_STAGE_READ_GANG_MEMBERS)) + +#define ZIO_REWRITE_PIPELINE \ + ((1U << ZIO_STAGE_GANG_PIPELINE) | \ + (1U << ZIO_STAGE_GET_GANG_HEADER) | \ + (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \ + (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \ + ZIO_WRITE_COMMON_PIPELINE) + +#define ZIO_WRITE_ALLOCATE_PIPELINE \ + ((1U << ZIO_STAGE_DVA_ALLOCATE) | \ + ZIO_WRITE_COMMON_PIPELINE) + +#define ZIO_GANG_FREE_STAGES \ + ((1U << ZIO_STAGE_GET_GANG_HEADER) | \ + (1U << ZIO_STAGE_FREE_GANG_MEMBERS)) + +#define 
ZIO_FREE_PIPELINE \ + ((1U << ZIO_STAGE_OPEN) | \ + (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ + (1U << ZIO_STAGE_GANG_PIPELINE) | \ + (1U << ZIO_STAGE_GET_GANG_HEADER) | \ + (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \ + (1U << ZIO_STAGE_DVA_FREE) | \ + (1U << ZIO_STAGE_READY) | \ + (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_DONE)) + +#define ZIO_CLAIM_PIPELINE \ + ((1U << ZIO_STAGE_OPEN) | \ + (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ + (1U << ZIO_STAGE_GANG_PIPELINE) | \ + (1U << ZIO_STAGE_GET_GANG_HEADER) | \ + (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \ + (1U << ZIO_STAGE_DVA_CLAIM) | \ + (1U << ZIO_STAGE_READY) | \ + (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_DONE)) + +#define ZIO_IOCTL_PIPELINE \ + ((1U << ZIO_STAGE_OPEN) | \ + (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ + (1U << ZIO_STAGE_READY) | \ + ZIO_VDEV_IO_PIPELINE | \ + (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_DONE)) + +#define ZIO_WAIT_FOR_CHILDREN_PIPELINE \ + ((1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ + (1U << ZIO_STAGE_READY) | \ + (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_DONE)) + +#define ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE \ + ((1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_DONE)) + +#define ZIO_VDEV_CHILD_PIPELINE \ + (ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE | \ + ZIO_VDEV_IO_PIPELINE) + +#define ZIO_ERROR_PIPELINE_MASK \ + ZIO_WAIT_FOR_CHILDREN_PIPELINE + +struct zio_transform { + void *zt_data; + uint64_t zt_size; + uint64_t zt_bufsize; + zio_transform_t *zt_next; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _ZIO_IMPL_H */ diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c new file mode 100644 index 0000000000..81ab16cd3d --- /dev/null +++ b/usr/src/uts/common/fs/zfs/txg.c @@ -0,0 +1,583 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). 
You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/txg_impl.h> +#include <sys/dmu_impl.h> +#include <sys/dsl_pool.h> +#include <sys/callb.h> + +/* + * Pool-wide transaction groups. + */ + +static void txg_sync_thread(dsl_pool_t *dp); +static void txg_quiesce_thread(dsl_pool_t *dp); +static void txg_timelimit_thread(dsl_pool_t *dp); + +int txg_time = 5; /* max 5 seconds worth of delta per txg */ + +/* + * Prepare the txg subsystem. + */ +void +txg_init(dsl_pool_t *dp, uint64_t txg) +{ + tx_state_t *tx = &dp->dp_tx; + + bzero(tx, sizeof (tx_state_t)); + + tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP); + + rw_init(&tx->tx_suspend, NULL, RW_DEFAULT, NULL); + + tx->tx_open_txg = txg; +} + +/* + * Close down the txg subsystem. + */ +void +txg_fini(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + + ASSERT(tx->tx_threads == 0); + + rw_destroy(&tx->tx_suspend); + + kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); + + bzero(tx, sizeof (tx_state_t)); +} + +/* + * Start syncing transaction groups. 
+ */ +void +txg_sync_start(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + + mutex_enter(&tx->tx_sync_lock); + + dprintf("pool %p\n", dp); + + ASSERT(tx->tx_threads == 0); + + tx->tx_threads = 3; + + tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread, + dp, 0, &p0, TS_RUN, minclsyspri); + + tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread, + dp, 0, &p0, TS_RUN, minclsyspri); + + tx->tx_timelimit_thread = thread_create(NULL, 0, txg_timelimit_thread, + dp, 0, &p0, TS_RUN, minclsyspri); + + mutex_exit(&tx->tx_sync_lock); +} + +static void +txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr) +{ + CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG); + mutex_enter(&tx->tx_sync_lock); +} + +static void +txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp) +{ + ASSERT(*tpp != NULL); + *tpp = NULL; + tx->tx_threads--; + cv_broadcast(&tx->tx_exit_cv); + CALLB_CPR_EXIT(cpr); /* drops &tx->tx_sync_lock */ + thread_exit(); +} + +static void +txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, int secmax) +{ + CALLB_CPR_SAFE_BEGIN(cpr); + + if (secmax) + (void) cv_timedwait(cv, &tx->tx_sync_lock, lbolt + secmax * hz); + else + cv_wait(cv, &tx->tx_sync_lock); + + CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock); +} + +/* + * Stop syncing transaction groups. + */ +void +txg_sync_stop(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + + dprintf("pool %p\n", dp); + /* + * Finish off any work in progress. + */ + ASSERT(tx->tx_threads == 3); + txg_wait_synced(dp, 0); + + /* + * Wake all 3 sync threads (one per state) and wait for them to die. 
+ */ + mutex_enter(&tx->tx_sync_lock); + + ASSERT(tx->tx_threads == 3); + + tx->tx_exiting = 1; + + cv_broadcast(&tx->tx_quiesce_more_cv); + cv_broadcast(&tx->tx_quiesce_done_cv); + cv_broadcast(&tx->tx_sync_more_cv); + cv_broadcast(&tx->tx_timeout_exit_cv); + + while (tx->tx_threads != 0) + cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock); + + tx->tx_exiting = 0; + + mutex_exit(&tx->tx_sync_lock); +} + +uint64_t +txg_hold_open(dsl_pool_t *dp, txg_handle_t *th) +{ + tx_state_t *tx = &dp->dp_tx; + tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID]; + uint64_t txg; + + mutex_enter(&tc->tc_lock); + + txg = tx->tx_open_txg; + tc->tc_count[txg & TXG_MASK]++; + + th->th_cpu = tc; + th->th_txg = txg; + + return (txg); +} + +void +txg_rele_to_quiesce(txg_handle_t *th) +{ + tx_cpu_t *tc = th->th_cpu; + + mutex_exit(&tc->tc_lock); +} + +void +txg_rele_to_sync(txg_handle_t *th) +{ + tx_cpu_t *tc = th->th_cpu; + int g = th->th_txg & TXG_MASK; + + mutex_enter(&tc->tc_lock); + ASSERT(tc->tc_count[g] != 0); + if (--tc->tc_count[g] == 0) + cv_broadcast(&tc->tc_cv[g]); + mutex_exit(&tc->tc_lock); + + th->th_cpu = NULL; /* defensive */ +} + +static void +txg_quiesce(dsl_pool_t *dp, uint64_t txg) +{ + tx_state_t *tx = &dp->dp_tx; + int g = txg & TXG_MASK; + int c; + + /* + * Grab all tx_cpu locks so nobody else can get into this txg. + */ + for (c = 0; c < max_ncpus; c++) + mutex_enter(&tx->tx_cpu[c].tc_lock); + + ASSERT(txg == tx->tx_open_txg); + tx->tx_open_txg++; + + /* + * Now that we've incremented tx_open_txg, we can let threads + * enter the next transaction group. + */ + for (c = 0; c < max_ncpus; c++) + mutex_exit(&tx->tx_cpu[c].tc_lock); + + /* + * Quiesce the transaction group by waiting for everyone to txg_exit(). 
+ */ + for (c = 0; c < max_ncpus; c++) { + tx_cpu_t *tc = &tx->tx_cpu[c]; + mutex_enter(&tc->tc_lock); + while (tc->tc_count[g] != 0) + cv_wait(&tc->tc_cv[g], &tc->tc_lock); + mutex_exit(&tc->tc_lock); + } +} + +static void +txg_sync_thread(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + callb_cpr_t cpr; + + txg_thread_enter(tx, &cpr); + + for (;;) { + uint64_t txg; + + /* + * We sync when there's someone waiting on us, or the + * quiesce thread has handed off a txg to us. + */ + while (!tx->tx_exiting && + tx->tx_synced_txg >= tx->tx_sync_txg_waiting && + tx->tx_quiesced_txg == 0) { + dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", + tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); + txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, 0); + } + + /* + * Wait until the quiesce thread hands off a txg to us, + * prompting it to do so if necessary. + */ + while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) { + if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) + tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; + cv_broadcast(&tx->tx_quiesce_more_cv); + txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); + } + + if (tx->tx_exiting) + txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); + + rw_enter(&tx->tx_suspend, RW_WRITER); + + /* + * Consume the quiesced txg which has been handed off to + * us. This may cause the quiescing thread to now be + * able to quiesce another txg, so we must signal it. 
+ */ + txg = tx->tx_quiesced_txg; + tx->tx_quiesced_txg = 0; + tx->tx_syncing_txg = txg; + cv_broadcast(&tx->tx_quiesce_more_cv); + rw_exit(&tx->tx_suspend); + + dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", + txg, tx->tx_quiesce_txg_waiting, + tx->tx_sync_txg_waiting); + mutex_exit(&tx->tx_sync_lock); + spa_sync(dp->dp_spa, txg); + mutex_enter(&tx->tx_sync_lock); + rw_enter(&tx->tx_suspend, RW_WRITER); + tx->tx_synced_txg = txg; + tx->tx_syncing_txg = 0; + rw_exit(&tx->tx_suspend); + cv_broadcast(&tx->tx_sync_done_cv); + } +} + +static void +txg_quiesce_thread(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + callb_cpr_t cpr; + + txg_thread_enter(tx, &cpr); + + for (;;) { + uint64_t txg; + + /* + * We quiesce when there's someone waiting on us. + * However, we can only have one txg in "quiescing" or + * "quiesced, waiting to sync" state. So we wait until + * the "quiesced, waiting to sync" txg has been consumed + * by the sync thread. + */ + while (!tx->tx_exiting && + (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting || + tx->tx_quiesced_txg != 0)) + txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0); + + if (tx->tx_exiting) + txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread); + + txg = tx->tx_open_txg; + dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", + txg, tx->tx_quiesce_txg_waiting, + tx->tx_sync_txg_waiting); + mutex_exit(&tx->tx_sync_lock); + txg_quiesce(dp, txg); + mutex_enter(&tx->tx_sync_lock); + + /* + * Hand this txg off to the sync thread. 
+ */ + dprintf("quiesce done, handing off txg %llu\n", txg); + tx->tx_quiesced_txg = txg; + cv_broadcast(&tx->tx_sync_more_cv); + cv_broadcast(&tx->tx_quiesce_done_cv); + } +} + +void +txg_wait_synced(dsl_pool_t *dp, uint64_t txg) +{ + tx_state_t *tx = &dp->dp_tx; + + mutex_enter(&tx->tx_sync_lock); + ASSERT(tx->tx_threads == 3); + if (txg == 0) + txg = tx->tx_open_txg; + if (tx->tx_sync_txg_waiting < txg) + tx->tx_sync_txg_waiting = txg; + dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", + txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); + while (tx->tx_synced_txg < txg) { + dprintf("broadcasting sync more " + "tx_synced=%llu waiting=%llu dp=%p\n", + tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); + cv_broadcast(&tx->tx_sync_more_cv); + cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock); + } + mutex_exit(&tx->tx_sync_lock); +} + +void +txg_wait_open(dsl_pool_t *dp, uint64_t txg) +{ + tx_state_t *tx = &dp->dp_tx; + + mutex_enter(&tx->tx_sync_lock); + ASSERT(tx->tx_threads == 3); + if (txg == 0) + txg = tx->tx_open_txg + 1; + if (tx->tx_quiesce_txg_waiting < txg) + tx->tx_quiesce_txg_waiting = txg; + dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", + txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); + while (tx->tx_open_txg < txg) { + cv_broadcast(&tx->tx_quiesce_more_cv); + cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock); + } + mutex_exit(&tx->tx_sync_lock); +} + +static void +txg_timelimit_thread(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + callb_cpr_t cpr; + + txg_thread_enter(tx, &cpr); + + while (!tx->tx_exiting) { + uint64_t txg = tx->tx_open_txg + 1; + + txg_thread_wait(tx, &cpr, &tx->tx_timeout_exit_cv, txg_time); + + if (tx->tx_quiesce_txg_waiting < txg) + tx->tx_quiesce_txg_waiting = txg; + + while (!tx->tx_exiting && tx->tx_open_txg < txg) { + dprintf("pushing out %llu\n", txg); + cv_broadcast(&tx->tx_quiesce_more_cv); + txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); + } + } + txg_thread_exit(tx, &cpr, 
&tx->tx_timelimit_thread); +} + +int +txg_stalled(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg); +} + +void +txg_suspend(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + /* XXX some code paths suspend when they are already suspended! */ + rw_enter(&tx->tx_suspend, RW_READER); +} + +void +txg_resume(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + rw_exit(&tx->tx_suspend); +} + +/* + * Per-txg object lists. + */ +void +txg_list_create(txg_list_t *tl, size_t offset) +{ + int t; + + mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL); + + tl->tl_offset = offset; + + for (t = 0; t < TXG_SIZE; t++) + tl->tl_head[t] = NULL; +} + +void +txg_list_destroy(txg_list_t *tl) +{ + int t; + + for (t = 0; t < TXG_SIZE; t++) + ASSERT(txg_list_empty(tl, t)); + + mutex_destroy(&tl->tl_lock); +} + +int +txg_list_empty(txg_list_t *tl, uint64_t txg) +{ + return (tl->tl_head[txg & TXG_MASK] == NULL); +} + +/* + * Add an entry to the list. + * Returns 0 if it's a new entry, 1 if it's already there. + */ +int +txg_list_add(txg_list_t *tl, void *p, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); + int already_on_list; + + mutex_enter(&tl->tl_lock); + already_on_list = tn->tn_member[t]; + if (!already_on_list) { + tn->tn_member[t] = 1; + tn->tn_next[t] = tl->tl_head[t]; + tl->tl_head[t] = tn; + } + mutex_exit(&tl->tl_lock); + + return (already_on_list); +} + +/* + * Remove the head of the list and return it. + */ +void * +txg_list_remove(txg_list_t *tl, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn; + void *p = NULL; + + mutex_enter(&tl->tl_lock); + if ((tn = tl->tl_head[t]) != NULL) { + p = (char *)tn - tl->tl_offset; + tl->tl_head[t] = tn->tn_next[t]; + tn->tn_next[t] = NULL; + tn->tn_member[t] = 0; + } + mutex_exit(&tl->tl_lock); + + return (p); +} + +/* + * Remove a specific item from the list and return it. 
+ */ +void * +txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn, **tp; + + mutex_enter(&tl->tl_lock); + + for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) { + if ((char *)tn - tl->tl_offset == p) { + *tp = tn->tn_next[t]; + tn->tn_next[t] = NULL; + tn->tn_member[t] = 0; + mutex_exit(&tl->tl_lock); + return (p); + } + } + + mutex_exit(&tl->tl_lock); + + return (NULL); +} + +int +txg_list_member(txg_list_t *tl, void *p, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); + + return (tn->tn_member[t]); +} + +/* + * Walk a txg list -- only safe if you know it's not changing. + */ +void * +txg_list_head(txg_list_t *tl, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn = tl->tl_head[t]; + + return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); +} + +void * +txg_list_next(txg_list_t *tl, void *p, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); + + tn = tn->tn_next[t]; + + return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); +} diff --git a/usr/src/uts/common/fs/zfs/uberblock.c b/usr/src/uts/common/fs/zfs/uberblock.c new file mode 100644 index 0000000000..63bff0ae4b --- /dev/null +++ b/usr/src/uts/common/fs/zfs/uberblock.c @@ -0,0 +1,67 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/uberblock_impl.h> +#include <sys/vdev_impl.h> + +/* Keep the uberblock version in a varialbe so we can get at it with mdb */ +static uint64_t uberblock_version = UBERBLOCK_VERSION; + +int +uberblock_verify(uberblock_t *ub) +{ + if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) + byteswap_uint64_array(ub, sizeof (uberblock_t)); + + if (ub->ub_magic != UBERBLOCK_MAGIC) + return (EINVAL); + + if (ub->ub_version != UBERBLOCK_VERSION) + return (ENOTSUP); + + return (0); +} + +/* + * Update the uberblock and return a boolean value indicating whether + * anything changed in this transaction group. + */ +int +uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg) +{ + ASSERT(ub->ub_txg < txg); + + ub->ub_magic = UBERBLOCK_MAGIC; + ub->ub_version = UBERBLOCK_VERSION; + ub->ub_txg = txg; + ub->ub_guid_sum = rvd->vdev_guid_sum; + ub->ub_timestamp = gethrestime_sec(); + + return (ub->ub_rootbp.blk_birth == txg); +} diff --git a/usr/src/uts/common/fs/zfs/unique.c b/usr/src/uts/common/fs/zfs/unique.c new file mode 100644 index 0000000000..56fbddd78e --- /dev/null +++ b/usr/src/uts/common/fs/zfs/unique.c @@ -0,0 +1,107 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/avl.h> +#include <sys/unique.h> + +static avl_tree_t unique_avl; +static kmutex_t unique_mtx; + +typedef struct unique { + avl_node_t un_link; + uint64_t un_value; +} unique_t; + +#define UNIQUE_MASK ((1ULL << UNIQUE_BITS) - 1) + +static int +unique_compare(const void *a, const void *b) +{ + const unique_t *una = a; + const unique_t *unb = b; + + if (una->un_value < unb->un_value) + return (-1); + if (una->un_value > unb->un_value) + return (+1); + return (0); +} + +void +unique_init(void) +{ + avl_create(&unique_avl, unique_compare, + sizeof (unique_t), offsetof(unique_t, un_link)); +} + +uint64_t +unique_create(void) +{ + return (unique_insert(0)); +} + +uint64_t +unique_insert(uint64_t value) +{ + avl_index_t idx; + unique_t *un = kmem_alloc(sizeof (unique_t), KM_SLEEP); + + un->un_value = value; + + mutex_enter(&unique_mtx); + while (un->un_value == 0 || un->un_value & ~UNIQUE_MASK || + avl_find(&unique_avl, un, &idx)) { + mutex_exit(&unique_mtx); + (void) random_get_pseudo_bytes((void*)&un->un_value, + sizeof (un->un_value)); + un->un_value &= UNIQUE_MASK; + mutex_enter(&unique_mtx); + } + + avl_insert(&unique_avl, un, idx); + mutex_exit(&unique_mtx); + + return (un->un_value); +} + +void +unique_remove(uint64_t value) +{ + unique_t un_tofind; + unique_t *un; + + un_tofind.un_value = 
value; + mutex_enter(&unique_mtx); + un = avl_find(&unique_avl, &un_tofind, NULL); + if (un != NULL) { + avl_remove(&unique_avl, un); + kmem_free(un, sizeof (unique_t)); + } + mutex_exit(&unique_mtx); +} diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c new file mode 100644 index 0000000000..990c690bff --- /dev/null +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -0,0 +1,1738 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/vdev_impl.h> +#include <sys/uberblock_impl.h> +#include <sys/metaslab.h> +#include <sys/metaslab_impl.h> +#include <sys/space_map.h> +#include <sys/zio.h> +#include <sys/zap.h> +#include <sys/fs/zfs.h> + +/* + * Virtual device management. 
+ */ + +static vdev_ops_t *vdev_ops_table[] = { + &vdev_root_ops, + &vdev_raidz_ops, + &vdev_mirror_ops, + &vdev_replacing_ops, + &vdev_disk_ops, + &vdev_file_ops, + &vdev_missing_ops, + NULL +}; + +/* + * Given a vdev type, return the appropriate ops vector. + */ +static vdev_ops_t * +vdev_getops(const char *type) +{ + vdev_ops_t *ops, **opspp; + + for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) + if (strcmp(ops->vdev_op_type, type) == 0) + break; + + return (ops); +} + +/* + * Default asize function: return the MAX of psize with the asize of + * all children. This is what's used by anything other than RAID-Z. + */ +uint64_t +vdev_default_asize(vdev_t *vd, uint64_t psize) +{ + uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_ashift); + uint64_t csize; + uint64_t c; + + for (c = 0; c < vd->vdev_children; c++) { + csize = vdev_psize_to_asize(vd->vdev_child[c], psize); + asize = MAX(asize, csize); + } + + return (asize); +} + +vdev_t * +vdev_lookup_top(spa_t *spa, uint64_t vdev) +{ + vdev_t *rvd = spa->spa_root_vdev; + + if (vdev < rvd->vdev_children) + return (rvd->vdev_child[vdev]); + + return (NULL); +} + +vdev_t * +vdev_lookup_by_path(vdev_t *vd, const char *path) +{ + int c; + vdev_t *mvd; + + if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) + return (vd); + + for (c = 0; c < vd->vdev_children; c++) + if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != + NULL) + return (mvd); + + return (NULL); +} + +vdev_t * +vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) +{ + int c; + vdev_t *mvd; + + if (vd->vdev_children == 0 && vd->vdev_guid == guid) + return (vd); + + for (c = 0; c < vd->vdev_children; c++) + if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != + NULL) + return (mvd); + + return (NULL); +} + +void +vdev_add_child(vdev_t *pvd, vdev_t *cvd) +{ + size_t oldsize, newsize; + uint64_t id = cvd->vdev_id; + vdev_t **newchild; + + ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER)); + ASSERT(cvd->vdev_parent == NULL); + + 
cvd->vdev_parent = pvd; + + if (pvd == NULL) + return; + + ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); + + oldsize = pvd->vdev_children * sizeof (vdev_t *); + pvd->vdev_children = MAX(pvd->vdev_children, id + 1); + newsize = pvd->vdev_children * sizeof (vdev_t *); + + newchild = kmem_zalloc(newsize, KM_SLEEP); + if (pvd->vdev_child != NULL) { + bcopy(pvd->vdev_child, newchild, oldsize); + kmem_free(pvd->vdev_child, oldsize); + } + + pvd->vdev_child = newchild; + pvd->vdev_child[id] = cvd; + + cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); + ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); + + /* + * Walk up all ancestors to update guid sum. + */ + for (; pvd != NULL; pvd = pvd->vdev_parent) + pvd->vdev_guid_sum += cvd->vdev_guid_sum; +} + +void +vdev_remove_child(vdev_t *pvd, vdev_t *cvd) +{ + int c; + uint_t id = cvd->vdev_id; + + ASSERT(cvd->vdev_parent == pvd); + + if (pvd == NULL) + return; + + ASSERT(id < pvd->vdev_children); + ASSERT(pvd->vdev_child[id] == cvd); + + pvd->vdev_child[id] = NULL; + cvd->vdev_parent = NULL; + + for (c = 0; c < pvd->vdev_children; c++) + if (pvd->vdev_child[c]) + break; + + if (c == pvd->vdev_children) { + kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); + pvd->vdev_child = NULL; + pvd->vdev_children = 0; + } + + /* + * Walk up all ancestors to update guid sum. + */ + for (; pvd != NULL; pvd = pvd->vdev_parent) + pvd->vdev_guid_sum -= cvd->vdev_guid_sum; +} + +/* + * Remove any holes in the child array. 
+ */ +void +vdev_compact_children(vdev_t *pvd) +{ + vdev_t **newchild, *cvd; + int oldc = pvd->vdev_children; + int newc, c; + + ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER)); + + for (c = newc = 0; c < oldc; c++) + if (pvd->vdev_child[c]) + newc++; + + newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); + + for (c = newc = 0; c < oldc; c++) { + if ((cvd = pvd->vdev_child[c]) != NULL) { + newchild[newc] = cvd; + cvd->vdev_id = newc++; + } + } + + kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); + pvd->vdev_child = newchild; + pvd->vdev_children = newc; +} + +/* + * Allocate and minimally initialize a vdev_t. + */ +static vdev_t * +vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) +{ + vdev_t *vd; + + while (guid == 0) + guid = spa_get_random(-1ULL); + + vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); + + vd->vdev_spa = spa; + vd->vdev_id = id; + vd->vdev_guid = guid; + vd->vdev_guid_sum = guid; + vd->vdev_ops = ops; + vd->vdev_state = VDEV_STATE_CLOSED; + + mutex_init(&vd->vdev_io_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vd->vdev_io_cv, NULL, CV_DEFAULT, NULL); + list_create(&vd->vdev_io_pending, sizeof (zio_t), + offsetof(zio_t, io_pending)); + mutex_init(&vd->vdev_dirty_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); + space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock); + space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock); + txg_list_create(&vd->vdev_ms_list, + offsetof(struct metaslab, ms_txg_node)); + txg_list_create(&vd->vdev_dtl_list, + offsetof(struct vdev, vdev_dtl_node)); + vd->vdev_stat.vs_timestamp = gethrtime(); + + return (vd); +} + +/* + * Free a vdev_t that has been removed from service. 
+ */ +static void +vdev_free_common(vdev_t *vd) +{ + if (vd->vdev_path) + spa_strfree(vd->vdev_path); + if (vd->vdev_devid) + spa_strfree(vd->vdev_devid); + + txg_list_destroy(&vd->vdev_ms_list); + txg_list_destroy(&vd->vdev_dtl_list); + mutex_enter(&vd->vdev_dtl_lock); + space_map_vacate(&vd->vdev_dtl_map, NULL, NULL); + space_map_destroy(&vd->vdev_dtl_map); + space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); + space_map_destroy(&vd->vdev_dtl_scrub); + mutex_exit(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_dirty_lock); + list_destroy(&vd->vdev_io_pending); + mutex_destroy(&vd->vdev_io_lock); + cv_destroy(&vd->vdev_io_cv); + + kmem_free(vd, sizeof (vdev_t)); +} + +/* + * Allocate a new vdev. The 'alloctype' is used to control whether we are + * creating a new vdev or loading an existing one - the behavior is slightly + * different for each case. + */ +vdev_t * +vdev_alloc(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype) +{ + vdev_ops_t *ops; + char *type; + uint64_t guid = 0; + vdev_t *vd; + + ASSERT(spa_config_held(spa, RW_WRITER)); + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) + return (NULL); + + if ((ops = vdev_getops(type)) == NULL) + return (NULL); + + /* + * If this is a load, get the vdev guid from the nvlist. + * Otherwise, vdev_alloc_common() will generate one for us. + */ + if (alloctype == VDEV_ALLOC_LOAD) { + uint64_t label_id; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || + label_id != id) + return (NULL); + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) + return (NULL); + } + + vd = vdev_alloc_common(spa, id, guid, ops); + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) + vd->vdev_path = spa_strdup(vd->vdev_path); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) + vd->vdev_devid = spa_strdup(vd->vdev_devid); + + /* + * If we're a top-level vdev, try to load the allocation parameters. 
+ */ + if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, + &vd->vdev_ms_array); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, + &vd->vdev_ms_shift); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, + &vd->vdev_ashift); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, + &vd->vdev_asize); + } + + /* + * If we're a leaf vdev, try to load the DTL object. + */ + if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, + &vd->vdev_dtl.smo_object); + } + + /* + * Add ourselves to the parent's list of children. + */ + vdev_add_child(parent, vd); + + return (vd); +} + +void +vdev_free(vdev_t *vd) +{ + int c; + + /* + * vdev_free() implies closing the vdev first. This is simpler than + * trying to ensure complicated semantics for all callers. + */ + vdev_close(vd); + + /* + * It's possible to free a vdev that's been added to the dirty + * list when in the middle of spa_vdev_add(). Handle that case + * correctly here. + */ + if (vd->vdev_is_dirty) + vdev_config_clean(vd); + + /* + * Free all children. + */ + for (c = 0; c < vd->vdev_children; c++) + vdev_free(vd->vdev_child[c]); + + ASSERT(vd->vdev_child == NULL); + ASSERT(vd->vdev_guid_sum == vd->vdev_guid); + + /* + * Discard allocation state. + */ + if (vd == vd->vdev_top) + vdev_metaslab_fini(vd); + + ASSERT3U(vd->vdev_stat.vs_space, ==, 0); + ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0); + + /* + * Remove this vdev from its parent's child list. + */ + vdev_remove_child(vd->vdev_parent, vd); + + ASSERT(vd->vdev_parent == NULL); + + vdev_free_common(vd); +} + +/* + * Transfer top-level vdev state from svd to tvd. 
+ */ +static void +vdev_top_transfer(vdev_t *svd, vdev_t *tvd) +{ + spa_t *spa = svd->vdev_spa; + metaslab_t *msp; + vdev_t *vd; + int t; + + ASSERT(tvd == tvd->vdev_top); + + tvd->vdev_ms_array = svd->vdev_ms_array; + tvd->vdev_ms_shift = svd->vdev_ms_shift; + tvd->vdev_ms_count = svd->vdev_ms_count; + + svd->vdev_ms_array = 0; + svd->vdev_ms_shift = 0; + svd->vdev_ms_count = 0; + + tvd->vdev_mg = svd->vdev_mg; + tvd->vdev_mg->mg_vd = tvd; + tvd->vdev_ms = svd->vdev_ms; + tvd->vdev_smo = svd->vdev_smo; + + svd->vdev_mg = NULL; + svd->vdev_ms = NULL; + svd->vdev_smo = NULL; + + tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; + tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; + + svd->vdev_stat.vs_alloc = 0; + svd->vdev_stat.vs_space = 0; + + for (t = 0; t < TXG_SIZE; t++) { + while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) + (void) txg_list_add(&tvd->vdev_ms_list, msp, t); + while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) + (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); + if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) + (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); + tvd->vdev_dirty[t] = svd->vdev_dirty[t]; + svd->vdev_dirty[t] = 0; + } + + if (svd->vdev_is_dirty) { + vdev_config_clean(svd); + vdev_config_dirty(tvd); + } + + ASSERT(svd->vdev_io_retry == NULL); + ASSERT(list_is_empty(&svd->vdev_io_pending)); +} + +static void +vdev_top_update(vdev_t *tvd, vdev_t *vd) +{ + int c; + + if (vd == NULL) + return; + + vd->vdev_top = tvd; + + for (c = 0; c < vd->vdev_children; c++) + vdev_top_update(tvd, vd->vdev_child[c]); +} + +/* + * Add a mirror/replacing vdev above an existing vdev. 
+ */ +vdev_t * +vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) +{ + spa_t *spa = cvd->vdev_spa; + vdev_t *pvd = cvd->vdev_parent; + vdev_t *mvd; + + ASSERT(spa_config_held(spa, RW_WRITER)); + + mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); + vdev_remove_child(pvd, cvd); + vdev_add_child(pvd, mvd); + cvd->vdev_id = mvd->vdev_children; + vdev_add_child(mvd, cvd); + vdev_top_update(cvd->vdev_top, cvd->vdev_top); + + mvd->vdev_asize = cvd->vdev_asize; + mvd->vdev_ashift = cvd->vdev_ashift; + mvd->vdev_state = cvd->vdev_state; + + if (mvd == mvd->vdev_top) + vdev_top_transfer(cvd, mvd); + + return (mvd); +} + +/* + * Remove a 1-way mirror/replacing vdev from the tree. + */ +void +vdev_remove_parent(vdev_t *cvd) +{ + vdev_t *mvd = cvd->vdev_parent; + vdev_t *pvd = mvd->vdev_parent; + + ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER)); + + ASSERT(mvd->vdev_children == 1); + ASSERT(mvd->vdev_ops == &vdev_mirror_ops || + mvd->vdev_ops == &vdev_replacing_ops); + + vdev_remove_child(mvd, cvd); + vdev_remove_child(pvd, mvd); + cvd->vdev_id = mvd->vdev_id; + vdev_add_child(pvd, cvd); + vdev_top_update(cvd->vdev_top, cvd->vdev_top); + + if (cvd == cvd->vdev_top) + vdev_top_transfer(mvd, cvd); + + ASSERT(mvd->vdev_children == 0); + vdev_free(mvd); +} + +void +vdev_metaslab_init(vdev_t *vd, uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + metaslab_class_t *mc = spa_metaslab_class_select(spa); + uint64_t c; + uint64_t oldc = vd->vdev_ms_count; + uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; + space_map_obj_t *smo = vd->vdev_smo; + metaslab_t **mspp = vd->vdev_ms; + + dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc); + + ASSERT(oldc <= newc); + + vd->vdev_smo = kmem_zalloc(newc * sizeof (*smo), KM_SLEEP); + vd->vdev_ms = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); + vd->vdev_ms_count = newc; + + if (vd->vdev_mg == NULL) { + if (txg == 0) { + dmu_buf_t *db; + uint64_t *ms_array; + + ms_array = kmem_zalloc(newc * sizeof (uint64_t), + KM_SLEEP); + + 
dmu_read(spa->spa_meta_objset, vd->vdev_ms_array, + 0, newc * sizeof (uint64_t), ms_array); + + for (c = 0; c < newc; c++) { + if (ms_array[c] == 0) + continue; + db = dmu_bonus_hold(spa->spa_meta_objset, + ms_array[c]); + dmu_buf_read(db); + ASSERT3U(db->db_size, ==, sizeof (*smo)); + bcopy(db->db_data, &vd->vdev_smo[c], + db->db_size); + ASSERT3U(vd->vdev_smo[c].smo_object, ==, + ms_array[c]); + dmu_buf_rele(db); + } + kmem_free(ms_array, newc * sizeof (uint64_t)); + } + vd->vdev_mg = metaslab_group_create(mc, vd); + } + + for (c = 0; c < oldc; c++) { + vd->vdev_smo[c] = smo[c]; + vd->vdev_ms[c] = mspp[c]; + mspp[c]->ms_smo = &vd->vdev_smo[c]; + } + + for (c = oldc; c < newc; c++) + metaslab_init(vd->vdev_mg, &vd->vdev_smo[c], &vd->vdev_ms[c], + c << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg); + + if (oldc != 0) { + kmem_free(smo, oldc * sizeof (*smo)); + kmem_free(mspp, oldc * sizeof (*mspp)); + } + +} + +void +vdev_metaslab_fini(vdev_t *vd) +{ + uint64_t m; + uint64_t count = vd->vdev_ms_count; + + if (vd->vdev_ms != NULL) { + for (m = 0; m < count; m++) + metaslab_fini(vd->vdev_ms[m]); + kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); + vd->vdev_ms = NULL; + } + + if (vd->vdev_smo != NULL) { + kmem_free(vd->vdev_smo, count * sizeof (space_map_obj_t)); + vd->vdev_smo = NULL; + } +} + +/* + * Prepare a virtual device for access. 
+ */ +int +vdev_open(vdev_t *vd) +{ + int error; + vdev_knob_t *vk; + int c; + uint64_t osize = 0; + uint64_t asize, psize; + uint64_t ashift = -1ULL; + + ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || + vd->vdev_state == VDEV_STATE_CANT_OPEN || + vd->vdev_state == VDEV_STATE_OFFLINE); + + if (vd->vdev_fault_mode == VDEV_FAULT_COUNT) + vd->vdev_fault_arg >>= 1; + else + vd->vdev_fault_mode = VDEV_FAULT_NONE; + + vd->vdev_stat.vs_aux = VDEV_AUX_NONE; + + for (vk = vdev_knob_next(NULL); vk != NULL; vk = vdev_knob_next(vk)) { + uint64_t *valp = (uint64_t *)((char *)vd + vk->vk_offset); + + *valp = vk->vk_default; + *valp = MAX(*valp, vk->vk_min); + *valp = MIN(*valp, vk->vk_max); + } + + if (vd->vdev_ops->vdev_op_leaf) { + vdev_cache_init(vd); + vdev_queue_init(vd); + vd->vdev_cache_active = B_TRUE; + } + + if (vd->vdev_offline) { + ASSERT(vd->vdev_children == 0); + dprintf("OFFLINE: %s = ENXIO\n", vdev_description(vd)); + vd->vdev_state = VDEV_STATE_OFFLINE; + return (ENXIO); + } + + error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift); + + dprintf("%s = %d, osize %llu, state = %d\n", + vdev_description(vd), error, osize, vd->vdev_state); + + if (error) { + dprintf("%s in %s failed to open, error %d, aux %d\n", + vdev_description(vd), + vdev_description(vd->vdev_parent), + error, + vd->vdev_stat.vs_aux); + + vd->vdev_state = VDEV_STATE_CANT_OPEN; + return (error); + } + + vd->vdev_state = VDEV_STATE_HEALTHY; + + for (c = 0; c < vd->vdev_children; c++) + if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) + vd->vdev_state = VDEV_STATE_DEGRADED; + + osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); + + if (vd->vdev_children == 0) { + if (osize < SPA_MINDEVSIZE) { + vd->vdev_state = VDEV_STATE_CANT_OPEN; + vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL; + return (EOVERFLOW); + } + psize = osize; + asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); + } else { + if (osize < SPA_MINDEVSIZE - + (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { + 
vd->vdev_state = VDEV_STATE_CANT_OPEN; + vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL; + return (EOVERFLOW); + } + psize = 0; + asize = osize; + } + + vd->vdev_psize = psize; + + if (vd->vdev_asize == 0) { + /* + * This is the first-ever open, so use the computed values. + */ + vd->vdev_asize = asize; + vd->vdev_ashift = ashift; + } else { + /* + * Make sure the alignment requirement hasn't increased. + */ + if (ashift > vd->vdev_ashift) { + dprintf("%s: ashift grew\n", vdev_description(vd)); + vd->vdev_state = VDEV_STATE_CANT_OPEN; + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + /* + * Make sure the device hasn't shrunk. + */ + if (asize < vd->vdev_asize) { + dprintf("%s: device shrank\n", vdev_description(vd)); + vd->vdev_state = VDEV_STATE_CANT_OPEN; + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + /* + * If all children are healthy and the asize has increased, + * then we've experienced dynamic LUN growth. + */ + if (vd->vdev_state == VDEV_STATE_HEALTHY && + asize > vd->vdev_asize) { + dprintf("%s: device grew\n", vdev_description(vd)); + vd->vdev_asize = asize; + } + } + + return (0); +} + +/* + * Close a virtual device. 
+ */ +void +vdev_close(vdev_t *vd) +{ + ASSERT3P(list_head(&vd->vdev_io_pending), ==, NULL); + + vd->vdev_ops->vdev_op_close(vd); + + if (vd->vdev_cache_active) { + vdev_cache_fini(vd); + vdev_queue_fini(vd); + vd->vdev_cache_active = B_FALSE; + } + + if (vd->vdev_offline) + vd->vdev_state = VDEV_STATE_OFFLINE; + else + vd->vdev_state = VDEV_STATE_CLOSED; +} + +void +vdev_reopen(vdev_t *vd, zio_t **rq) +{ + vdev_t *rvd = vd->vdev_spa->spa_root_vdev; + int c; + + if (vd == rvd) { + ASSERT(rq == NULL); + for (c = 0; c < rvd->vdev_children; c++) + vdev_reopen(rvd->vdev_child[c], NULL); + return; + } + + /* only valid for top-level vdevs */ + ASSERT3P(vd, ==, vd->vdev_top); + + /* + * vdev_state can change when spa_config_lock is held as writer, + * or when it's held as reader and we're doing a vdev_reopen(). + * To handle the latter case, we grab rvd's io_lock to serialize + * reopens. This ensures that there's never more than one vdev + * state changer active at a time. + */ + mutex_enter(&rvd->vdev_io_lock); + + mutex_enter(&vd->vdev_io_lock); + while (list_head(&vd->vdev_io_pending) != NULL) + cv_wait(&vd->vdev_io_cv, &vd->vdev_io_lock); + vdev_close(vd); + (void) vdev_open(vd); + if (rq != NULL) { + *rq = vd->vdev_io_retry; + vd->vdev_io_retry = NULL; + } + mutex_exit(&vd->vdev_io_lock); + + /* + * Reassess root vdev's health. + */ + rvd->vdev_state = VDEV_STATE_HEALTHY; + for (c = 0; c < rvd->vdev_children; c++) { + uint64_t state = rvd->vdev_child[c]->vdev_state; + rvd->vdev_state = MIN(rvd->vdev_state, state); + } + + mutex_exit(&rvd->vdev_io_lock); +} + +int +vdev_create(vdev_t *vd, uint64_t txg) +{ + int error; + + /* + * Normally, partial opens (e.g. of a mirror) are allowed. + * For a create, however, we want to fail the request if + * there are any components we can't open. + */ + error = vdev_open(vd); + + if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { + vdev_close(vd); + return (error ? 
error : ENXIO); + } + + /* + * Recursively initialize all labels. + */ + if ((error = vdev_label_init(vd, txg)) != 0) { + vdev_close(vd); + return (error); + } + + return (0); +} + +/* + * The is the latter half of vdev_create(). It is distinct because it + * involves initiating transactions in order to do metaslab creation. + * For creation, we want to try to create all vdevs at once and then undo it + * if anything fails; this is much harder if we have pending transactions. + */ +void +vdev_init(vdev_t *vd, uint64_t txg) +{ + /* + * Aim for roughly 200 metaslabs per vdev. + */ + vd->vdev_ms_shift = highbit(vd->vdev_asize / 200); + vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); + + /* + * Initialize the vdev's metaslabs. + */ + vdev_metaslab_init(vd, txg); +} + +void +vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg) +{ + vdev_t *tvd = vd->vdev_top; + + mutex_enter(&tvd->vdev_dirty_lock); + if ((tvd->vdev_dirty[txg & TXG_MASK] & flags) != flags) { + tvd->vdev_dirty[txg & TXG_MASK] |= flags; + (void) txg_list_add(&tvd->vdev_spa->spa_vdev_txg_list, + tvd, txg); + } + mutex_exit(&tvd->vdev_dirty_lock); +} + +void +vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size) +{ + mutex_enter(sm->sm_lock); + if (!space_map_contains(sm, txg, size)) + space_map_add(sm, txg, size); + mutex_exit(sm->sm_lock); +} + +int +vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size) +{ + int dirty; + + /* + * Quick test without the lock -- covers the common case that + * there are no dirty time segments. + */ + if (sm->sm_space == 0) + return (0); + + mutex_enter(sm->sm_lock); + dirty = space_map_contains(sm, txg, size); + mutex_exit(sm->sm_lock); + + return (dirty); +} + +/* + * Reassess DTLs after a config change or scrub completion. 
+ */ +void +vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) +{ + int c; + + ASSERT(spa_config_held(vd->vdev_spa, RW_WRITER)); + + if (vd->vdev_children == 0) { + mutex_enter(&vd->vdev_dtl_lock); + /* + * We're successfully scrubbed everything up to scrub_txg. + * Therefore, excise all old DTLs up to that point, then + * fold in the DTLs for everything we couldn't scrub. + */ + if (scrub_txg != 0) { + space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg); + space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub); + } + if (scrub_done) + space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); + mutex_exit(&vd->vdev_dtl_lock); + if (txg != 0) { + vdev_t *tvd = vd->vdev_top; + vdev_dirty(tvd, VDD_DTL, txg); + (void) txg_list_add(&tvd->vdev_dtl_list, vd, txg); + } + return; + } + + mutex_enter(&vd->vdev_dtl_lock); + space_map_vacate(&vd->vdev_dtl_map, NULL, NULL); + space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); + mutex_exit(&vd->vdev_dtl_lock); + + for (c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done); + mutex_enter(&vd->vdev_dtl_lock); + space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map); + space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub); + mutex_exit(&vd->vdev_dtl_lock); + } +} + +static int +vdev_dtl_load(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + space_map_obj_t *smo = &vd->vdev_dtl; + dmu_buf_t *db; + int error; + + ASSERT(vd->vdev_children == 0); + + if (smo->smo_object == 0) + return (0); + + db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object); + dmu_buf_read(db); + ASSERT3U(db->db_size, ==, sizeof (*smo)); + bcopy(db->db_data, smo, db->db_size); + dmu_buf_rele(db); + + mutex_enter(&vd->vdev_dtl_lock); + error = space_map_load(&vd->vdev_dtl_map, smo, SM_ALLOC, + spa->spa_meta_objset, smo->smo_objsize, smo->smo_alloc); + mutex_exit(&vd->vdev_dtl_lock); + + return (error); +} + +void +vdev_dtl_sync(vdev_t *vd, uint64_t txg) +{ + spa_t 
*spa = vd->vdev_spa; + space_map_obj_t *smo = &vd->vdev_dtl; + space_map_t *sm = &vd->vdev_dtl_map; + space_map_t smsync; + kmutex_t smlock; + avl_tree_t *t = &sm->sm_root; + space_seg_t *ss; + dmu_buf_t *db; + dmu_tx_t *tx; + + dprintf("%s in txg %llu pass %d\n", + vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa)); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + + if (vd->vdev_detached) { + if (smo->smo_object != 0) { + int err = dmu_object_free(spa->spa_meta_objset, + smo->smo_object, tx); + ASSERT3U(err, ==, 0); + smo->smo_object = 0; + } + dmu_tx_commit(tx); + return; + } + + if (smo->smo_object == 0) { + ASSERT(smo->smo_objsize == 0); + ASSERT(smo->smo_alloc == 0); + smo->smo_object = dmu_object_alloc(spa->spa_meta_objset, + DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, + DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); + ASSERT(smo->smo_object != 0); + vdev_config_dirty(vd->vdev_top); + } + + dmu_free_range(spa->spa_meta_objset, smo->smo_object, + 0, smo->smo_objsize, tx); + + mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL); + + space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift, + &smlock); + + mutex_enter(&smlock); + + mutex_enter(&vd->vdev_dtl_lock); + for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) + space_map_add(&smsync, ss->ss_start, ss->ss_end - ss->ss_start); + mutex_exit(&vd->vdev_dtl_lock); + + smo->smo_objsize = 0; + smo->smo_alloc = smsync.sm_space; + + space_map_sync(&smsync, NULL, smo, SM_ALLOC, spa->spa_meta_objset, tx); + space_map_destroy(&smsync); + + mutex_exit(&smlock); + mutex_destroy(&smlock); + + db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object); + dmu_buf_will_dirty(db, tx); + ASSERT3U(db->db_size, ==, sizeof (*smo)); + bcopy(smo, db->db_data, db->db_size); + dmu_buf_rele(db); + + dmu_tx_commit(tx); +} + +int +vdev_load(vdev_t *vd, int import) +{ + spa_t *spa = vd->vdev_spa; + int c, error; + nvlist_t *label; + uint64_t guid, state; + + dprintf("loading %s\n", vdev_description(vd)); + 
+ /* + * Recursively load all children. + */ + for (c = 0; c < vd->vdev_children; c++) + if ((error = vdev_load(vd->vdev_child[c], import)) != 0) + return (error); + + /* + * If this is a leaf vdev, make sure its agrees with its disk labels. + */ + if (vd->vdev_ops->vdev_op_leaf) { + + if (vdev_is_dead(vd)) + return (0); + + /* + * XXX state transitions don't propagate to parent here. + * Also, merely setting the state isn't sufficient because + * it's not persistent; a vdev_reopen() would make us + * forget all about it. + */ + if ((label = vdev_label_read_config(vd)) == NULL) { + dprintf("can't load label config\n"); + vdev_set_state(vd, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + return (0); + } + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, + &guid) != 0 || guid != spa_guid(spa)) { + dprintf("bad or missing pool GUID (%llu)\n", guid); + vdev_set_state(vd, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + return (0); + } + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) || + guid != vd->vdev_guid) { + dprintf("bad or missing vdev guid (%llu != %llu)\n", + guid, vd->vdev_guid); + vdev_set_state(vd, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + return (0); + } + + /* + * If we find a vdev with a matching pool guid and vdev guid, + * but the pool state is not active, it indicates that the user + * exported or destroyed the pool without affecting the config + * cache (if / was mounted readonly, for example). In this + * case, immediately return EBADF so the caller can remove it + * from the config. 
+ */ + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, + &state)) { + dprintf("missing pool state\n"); + vdev_set_state(vd, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + return (0); + } + + if (state != POOL_STATE_ACTIVE && + (!import || state != POOL_STATE_EXPORTED)) { + dprintf("pool state not active (%llu)\n", state); + nvlist_free(label); + return (EBADF); + } + + nvlist_free(label); + } + + /* + * If this is a top-level vdev, make sure its allocation parameters + * exist and initialize its metaslabs. + */ + if (vd == vd->vdev_top) { + + if (vd->vdev_ms_array == 0 || + vd->vdev_ms_shift == 0 || + vd->vdev_ashift == 0 || + vd->vdev_asize == 0) { + vdev_set_state(vd, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + return (0); + } + + vdev_metaslab_init(vd, 0); + } + + /* + * If this is a leaf vdev, load its DTL. + */ + if (vd->vdev_ops->vdev_op_leaf) { + error = vdev_dtl_load(vd); + if (error) { + dprintf("can't load DTL for %s, error %d\n", + vdev_description(vd), error); + vdev_set_state(vd, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + return (0); + } + } + + return (0); +} + +void +vdev_sync_done(vdev_t *vd, uint64_t txg) +{ + metaslab_t *msp; + + dprintf("%s txg %llu\n", vdev_description(vd), txg); + + while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) + metaslab_sync_done(msp, txg); +} + +void +vdev_add_sync(vdev_t *vd, uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + + ASSERT(vd == vd->vdev_top); + + if (vd->vdev_ms_array == 0) + vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, + DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); + + ASSERT(vd->vdev_ms_array != 0); + + vdev_config_dirty(vd); + + dmu_tx_commit(tx); +} + +void +vdev_sync(vdev_t *vd, uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + vdev_t *lvd; + metaslab_t *msp; + uint8_t *dirtyp = &vd->vdev_dirty[txg & TXG_MASK]; + uint8_t dirty = *dirtyp; + + 
mutex_enter(&vd->vdev_dirty_lock); + *dirtyp &= ~(VDD_ALLOC | VDD_FREE | VDD_ADD | VDD_DTL); + mutex_exit(&vd->vdev_dirty_lock); + + dprintf("%s txg %llu pass %d\n", + vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa)); + + if (dirty & VDD_ADD) + vdev_add_sync(vd, txg); + + while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) + metaslab_sync(msp, txg); + + while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) + vdev_dtl_sync(lvd, txg); + + (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); +} + +uint64_t +vdev_psize_to_asize(vdev_t *vd, uint64_t psize) +{ + return (vd->vdev_ops->vdev_op_asize(vd, psize)); +} + +void +vdev_io_start(zio_t *zio) +{ + zio->io_vd->vdev_ops->vdev_op_io_start(zio); +} + +void +vdev_io_done(zio_t *zio) +{ + zio->io_vd->vdev_ops->vdev_op_io_done(zio); +} + +const char * +vdev_description(vdev_t *vd) +{ + if (vd == NULL || vd->vdev_ops == NULL) + return ("<unknown>"); + + if (vd->vdev_path != NULL) + return (vd->vdev_path); + + if (vd->vdev_parent == NULL) + return (spa_name(vd->vdev_spa)); + + return (vd->vdev_ops->vdev_op_type); +} + +int +vdev_online(spa_t *spa, const char *path) +{ + vdev_t *vd; + + spa_config_enter(spa, RW_WRITER); + + if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) { + spa_config_exit(spa); + return (ENODEV); + } + + dprintf("ONLINE: %s\n", vdev_description(vd)); + + vd->vdev_offline = B_FALSE; + + /* + * Clear the error counts. The idea is that you expect to see all + * zeroes when everything is working, so if you've just onlined a + * device, you don't want to keep hearing about errors from before. 
+ */ + vd->vdev_stat.vs_read_errors = 0; + vd->vdev_stat.vs_write_errors = 0; + vd->vdev_stat.vs_checksum_errors = 0; + + vdev_reopen(vd->vdev_top, NULL); + + spa_config_exit(spa); + + VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + + return (0); +} + +int +vdev_offline(spa_t *spa, const char *path) +{ + vdev_t *vd; + + spa_config_enter(spa, RW_WRITER); + + if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) { + spa_config_exit(spa); + return (ENODEV); + } + + dprintf("OFFLINE: %s\n", vdev_description(vd)); + + /* + * If this device's top-level vdev has a non-empty DTL, + * don't allow the device to be offlined. + * + * XXX -- we should make this more precise by allowing the offline + * as long as the remaining devices don't have any DTL holes. + */ + if (vd->vdev_top->vdev_dtl_map.sm_space != 0) { + spa_config_exit(spa); + return (EBUSY); + } + + /* + * Set this device to offline state and reopen its top-level vdev. + * If this action results in the top-level vdev becoming unusable, + * undo it and fail the request. 
+ */ + vd->vdev_offline = B_TRUE; + vdev_reopen(vd->vdev_top, NULL); + if (vdev_is_dead(vd->vdev_top)) { + vd->vdev_offline = B_FALSE; + vdev_reopen(vd->vdev_top, NULL); + spa_config_exit(spa); + return (EBUSY); + } + + spa_config_exit(spa); + + return (0); +} + +int +vdev_error_setup(spa_t *spa, const char *path, int mode, int mask, uint64_t arg) +{ + vdev_t *vd; + + spa_config_enter(spa, RW_WRITER); + + if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) { + spa_config_exit(spa); + return (ENODEV); + } + + vd->vdev_fault_mode = mode; + vd->vdev_fault_mask = mask; + vd->vdev_fault_arg = arg; + + spa_config_exit(spa); + + return (0); +} + +int +vdev_is_dead(vdev_t *vd) +{ + return (vd->vdev_state <= VDEV_STATE_CANT_OPEN); +} + +int +vdev_error_inject(vdev_t *vd, zio_t *zio) +{ + int error = 0; + + if (vd->vdev_fault_mode == VDEV_FAULT_NONE) + return (0); + + if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0) + return (0); + + switch (vd->vdev_fault_mode) { + case VDEV_FAULT_RANDOM: + if (spa_get_random(vd->vdev_fault_arg) == 0) + error = EIO; + break; + + case VDEV_FAULT_COUNT: + if ((int64_t)--vd->vdev_fault_arg <= 0) + vd->vdev_fault_mode = VDEV_FAULT_NONE; + error = EIO; + break; + } + + if (error != 0) { + dprintf("returning %d for type %d on %s state %d offset %llx\n", + error, zio->io_type, vdev_description(vd), + vd->vdev_state, zio->io_offset); + } + + return (error); +} + +/* + * Get statistics for the given vdev. + */ +void +vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) +{ + vdev_t *rvd = vd->vdev_spa->spa_root_vdev; + int c, t; + + mutex_enter(&vd->vdev_stat_lock); + bcopy(&vd->vdev_stat, vs, sizeof (*vs)); + vs->vs_timestamp = gethrtime() - vs->vs_timestamp; + vs->vs_state = vd->vdev_state; + mutex_exit(&vd->vdev_stat_lock); + + /* + * If we're getting stats on the root vdev, aggregate the I/O counts + * over all top-level vdevs (i.e. the direct children of the root). 
+ */ + if (vd == rvd) { + for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *cvd = rvd->vdev_child[c]; + vdev_stat_t *cvs = &cvd->vdev_stat; + + mutex_enter(&vd->vdev_stat_lock); + for (t = 0; t < ZIO_TYPES; t++) { + vs->vs_ops[t] += cvs->vs_ops[t]; + vs->vs_bytes[t] += cvs->vs_bytes[t]; + } + vs->vs_read_errors += cvs->vs_read_errors; + vs->vs_write_errors += cvs->vs_write_errors; + vs->vs_checksum_errors += cvs->vs_checksum_errors; + vs->vs_scrub_examined += cvs->vs_scrub_examined; + vs->vs_scrub_errors += cvs->vs_scrub_errors; + mutex_exit(&vd->vdev_stat_lock); + } + } +} + +void +vdev_stat_update(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_t *pvd; + uint64_t txg = zio->io_txg; + vdev_stat_t *vs = &vd->vdev_stat; + zio_type_t type = zio->io_type; + int flags = zio->io_flags; + + if (zio->io_error == 0) { + if (!(flags & ZIO_FLAG_IO_BYPASS)) { + mutex_enter(&vd->vdev_stat_lock); + vs->vs_ops[type]++; + vs->vs_bytes[type] += zio->io_size; + mutex_exit(&vd->vdev_stat_lock); + } + if ((flags & ZIO_FLAG_IO_REPAIR) && + zio->io_delegate_list == NULL) { + mutex_enter(&vd->vdev_stat_lock); + if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) + vs->vs_scrub_repaired += zio->io_size; + else + vs->vs_self_healed += zio->io_size; + mutex_exit(&vd->vdev_stat_lock); + } + return; + } + + if (flags & ZIO_FLAG_SPECULATIVE) + return; + + if (!vdev_is_dead(vd)) { + mutex_enter(&vd->vdev_stat_lock); + if (type == ZIO_TYPE_READ) { + if (zio->io_error == ECKSUM) + vs->vs_checksum_errors++; + else + vs->vs_read_errors++; + } + if (type == ZIO_TYPE_WRITE) + vs->vs_write_errors++; + mutex_exit(&vd->vdev_stat_lock); + } + + if (type == ZIO_TYPE_WRITE) { + if (txg == 0 || vd->vdev_children != 0) + return; + if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) { + ASSERT(flags & ZIO_FLAG_IO_REPAIR); + for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) + vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1); + } + if (!(flags & ZIO_FLAG_IO_REPAIR)) { + vdev_t *tvd = vd->vdev_top; + if 
(vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1)) + return; + vdev_dirty(tvd, VDD_DTL, txg); + (void) txg_list_add(&tvd->vdev_dtl_list, vd, txg); + for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) + vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1); + } + } +} + +void +vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) +{ + int c; + vdev_stat_t *vs = &vd->vdev_stat; + + for (c = 0; c < vd->vdev_children; c++) + vdev_scrub_stat_update(vd->vdev_child[c], type, complete); + + mutex_enter(&vd->vdev_stat_lock); + + if (type == POOL_SCRUB_NONE) { + /* + * Update completion and end time. Leave everything else alone + * so we can report what happened during the previous scrub. + */ + vs->vs_scrub_complete = complete; + vs->vs_scrub_end = gethrestime_sec(); + } else { + vs->vs_scrub_type = type; + vs->vs_scrub_complete = 0; + vs->vs_scrub_examined = 0; + vs->vs_scrub_repaired = 0; + vs->vs_scrub_errors = 0; + vs->vs_scrub_start = gethrestime_sec(); + vs->vs_scrub_end = 0; + } + + mutex_exit(&vd->vdev_stat_lock); +} + +/* + * Report checksum errors that a vdev that didn't realize it made. + * This can happen, for example, when RAID-Z combinatorial reconstruction + * infers that one of its components returned bad data. + */ +void +vdev_checksum_error(zio_t *zio, vdev_t *vd) +{ + dprintf_bp(zio->io_bp, "imputed checksum error on %s: ", + vdev_description(vd)); + + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); + } +} + +/* + * Update the in-core space usage stats for this vdev and the root vdev. 
+ */ +void +vdev_space_update(vdev_t *vd, uint64_t space_delta, uint64_t alloc_delta) +{ + ASSERT(vd == vd->vdev_top); + + do { + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_space += space_delta; + vd->vdev_stat.vs_alloc += alloc_delta; + mutex_exit(&vd->vdev_stat_lock); + } while ((vd = vd->vdev_parent) != NULL); +} + +/* + * Various knobs to tune a vdev. + */ +static vdev_knob_t vdev_knob[] = { + { + "cache_size", + "size of the read-ahead cache", + 0, + 1ULL << 30, + 10ULL << 20, + offsetof(struct vdev, vdev_cache.vc_size) + }, + { + "cache_bshift", + "log2 of cache blocksize", + SPA_MINBLOCKSHIFT, + SPA_MAXBLOCKSHIFT, + 16, + offsetof(struct vdev, vdev_cache.vc_bshift) + }, + { + "cache_max", + "largest block size to cache", + 0, + SPA_MAXBLOCKSIZE, + 1ULL << 14, + offsetof(struct vdev, vdev_cache.vc_max) + }, + { + "min_pending", + "minimum pending I/Os to the disk", + 1, + 10000, + 2, + offsetof(struct vdev, vdev_queue.vq_min_pending) + }, + { + "max_pending", + "maximum pending I/Os to the disk", + 1, + 10000, + 35, + offsetof(struct vdev, vdev_queue.vq_max_pending) + }, + { + "agg_limit", + "maximum size of aggregated I/Os", + 0, + SPA_MAXBLOCKSIZE, + SPA_MAXBLOCKSIZE, + offsetof(struct vdev, vdev_queue.vq_agg_limit) + }, + { + "time_shift", + "deadline = pri + (lbolt >> time_shift)", + 0, + 63, + 4, + offsetof(struct vdev, vdev_queue.vq_time_shift) + }, + { + "ramp_rate", + "exponential I/O issue ramp-up rate", + 1, + 10000, + 2, + offsetof(struct vdev, vdev_queue.vq_ramp_rate) + }, +}; + +vdev_knob_t * +vdev_knob_next(vdev_knob_t *vk) +{ + if (vk == NULL) + return (vdev_knob); + + if (++vk == vdev_knob + sizeof (vdev_knob) / sizeof (vdev_knob_t)) + return (NULL); + + return (vk); +} + +/* + * Mark a top-level vdev's config as dirty, placing it on the dirty list + * so that it will be written out next time the vdev configuration is synced. + * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. 
+ */ +void +vdev_config_dirty(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; + int c; + + if (vd == rvd) { + for (c = 0; c < rvd->vdev_children; c++) + vdev_config_dirty(rvd->vdev_child[c]); + } else { + ASSERT(vd == vd->vdev_top); + + if (!vd->vdev_is_dirty) { + list_insert_head(&spa->spa_dirty_list, vd); + vd->vdev_is_dirty = B_TRUE; + } + } +} + +void +vdev_config_clean(vdev_t *vd) +{ + ASSERT(vd->vdev_is_dirty); + + list_remove(&vd->vdev_spa->spa_dirty_list, vd); + vd->vdev_is_dirty = B_FALSE; +} + +/* + * Set a vdev's state, updating any parent's state as well. + */ +void +vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux) +{ + if (state == vd->vdev_state) + return; + + vd->vdev_state = state; + vd->vdev_stat.vs_aux = aux; + + if (vd->vdev_parent != NULL) { + int c; + int degraded = 0, faulted = 0; + vdev_t *parent, *child; + + parent = vd->vdev_parent; + for (c = 0; c < parent->vdev_children; c++) { + child = parent->vdev_child[c]; + if (child->vdev_state <= VDEV_STATE_CANT_OPEN) + faulted++; + else if (child->vdev_state == VDEV_STATE_DEGRADED) + degraded++; + } + + vd->vdev_parent->vdev_ops->vdev_op_state_change( + vd->vdev_parent, faulted, degraded); + } +} diff --git a/usr/src/uts/common/fs/zfs/vdev_cache.c b/usr/src/uts/common/fs/zfs/vdev_cache.c new file mode 100644 index 0000000000..e1e7c1a36f --- /dev/null +++ b/usr/src/uts/common/fs/zfs/vdev_cache.c @@ -0,0 +1,374 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> + +/* + * Virtual device read-ahead caching. + * + * This file implements a simple LRU read-ahead cache. When the DMU reads + * a given block, it will often want other, nearby blocks soon thereafter. + * We take advantage of this by reading a larger disk region and caching + * the result. In the best case, this can turn 256 back-to-back 512-byte + * reads into a single 128k read followed by 255 cache hits; this reduces + * latency dramatically. In the worst case, it can turn an isolated 512-byte + * read into a 128k read, which doesn't affect latency all that much but is + * terribly wasteful of bandwidth. A more intelligent version of the cache + * could keep track of access patterns and not do read-ahead unless it sees + * at least two temporally close I/Os to the same region. It could also + * take advantage of semantic information about the I/O. And it could use + * something faster than an AVL tree; that was chosen solely for convenience. + * + * There are five cache operations: allocate, fill, read, write, evict. + * + * (1) Allocate. This reserves a cache entry for the specified region. + * We separate the allocate and fill operations so that multiple threads + * don't generate I/O for the same cache miss. + * + * (2) Fill. 
When the I/O for a cache miss completes, the fill routine + * places the data in the previously allocated cache entry. + * + * (3) Read. Read data from the cache. + * + * (4) Write. Update cache contents after write completion. + * + * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry + * if the total cache size exceeds vc_size. + */ + +static int +vdev_cache_offset_compare(const void *a1, const void *a2) +{ + const vdev_cache_entry_t *ve1 = a1; + const vdev_cache_entry_t *ve2 = a2; + + if (ve1->ve_offset < ve2->ve_offset) + return (-1); + if (ve1->ve_offset > ve2->ve_offset) + return (1); + return (0); +} + +static int +vdev_cache_lastused_compare(const void *a1, const void *a2) +{ + const vdev_cache_entry_t *ve1 = a1; + const vdev_cache_entry_t *ve2 = a2; + + if (ve1->ve_lastused < ve2->ve_lastused) + return (-1); + if (ve1->ve_lastused > ve2->ve_lastused) + return (1); + + /* + * Among equally old entries, sort by offset to ensure uniqueness. + */ + return (vdev_cache_offset_compare(a1, a2)); +} + +/* + * Evict the specified entry from the cache. + */ +static void +vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) +{ + ASSERT(MUTEX_HELD(&vc->vc_lock)); + ASSERT(ve->ve_fill_io == NULL); + ASSERT(ve->ve_data != NULL); + + dprintf("evicting %p, off %llx, LRU %llu, age %lu, hits %u, stale %u\n", + vc, ve->ve_offset, ve->ve_lastused, lbolt - ve->ve_lastused, + ve->ve_hits, ve->ve_missed_update); + + avl_remove(&vc->vc_lastused_tree, ve); + avl_remove(&vc->vc_offset_tree, ve); + zio_buf_free(ve->ve_data, vc->vc_blocksize); + kmem_free(ve, sizeof (vdev_cache_entry_t)); +} + +/* + * Allocate an entry in the cache. At the point we don't have the data, + * we're just creating a placeholder so that multiple threads don't all + * go off and read the same blocks. 
+ */ +static vdev_cache_entry_t * +vdev_cache_allocate(zio_t *zio) +{ + vdev_cache_t *vc = &zio->io_vd->vdev_cache; + uint64_t offset = P2ALIGN(zio->io_offset, vc->vc_blocksize); + vdev_cache_entry_t *ve; + + ASSERT(MUTEX_HELD(&vc->vc_lock)); + + if (vc->vc_size == 0) + return (NULL); + + /* + * If adding a new entry would exceed the cache size, + * evict the oldest entry (LRU). + */ + if ((avl_numnodes(&vc->vc_lastused_tree) << vc->vc_bshift) > + vc->vc_size) { + ve = avl_first(&vc->vc_lastused_tree); + if (ve->ve_fill_io != NULL) { + dprintf("can't evict in %p, still filling\n", vc); + return (NULL); + } + ASSERT(ve->ve_hits != 0); + vdev_cache_evict(vc, ve); + } + + ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); + ve->ve_offset = offset; + ve->ve_lastused = lbolt; + ve->ve_data = zio_buf_alloc(vc->vc_blocksize); + + avl_add(&vc->vc_offset_tree, ve); + avl_add(&vc->vc_lastused_tree, ve); + + return (ve); +} + +static void +vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) +{ + uint64_t cache_phase = P2PHASE(zio->io_offset, vc->vc_blocksize); + + ASSERT(MUTEX_HELD(&vc->vc_lock)); + ASSERT(ve->ve_fill_io == NULL); + + if (ve->ve_lastused != lbolt) { + avl_remove(&vc->vc_lastused_tree, ve); + ve->ve_lastused = lbolt; + avl_add(&vc->vc_lastused_tree, ve); + } + + ve->ve_hits++; + bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size); +} + +/* + * Fill a previously allocated cache entry with data. + */ +static void +vdev_cache_fill(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_cache_t *vc = &vd->vdev_cache; + vdev_cache_entry_t *ve = zio->io_private; + zio_t *dio; + + ASSERT(zio->io_size == vc->vc_blocksize); + + /* + * Add data to the cache. 
+ */ + mutex_enter(&vc->vc_lock); + + ASSERT(ve->ve_fill_io == zio); + ASSERT(ve->ve_offset == zio->io_offset); + ASSERT(ve->ve_data == zio->io_data); + + ve->ve_fill_io = NULL; + + /* + * Even if this cache line was invalidated by a missed write update, + * any reads that were queued up before the missed update are still + * valid, so we can satisfy them from this line before we evict it. + */ + for (dio = zio->io_delegate_list; dio; dio = dio->io_delegate_next) + vdev_cache_hit(vc, ve, dio); + + if (zio->io_error || ve->ve_missed_update) + vdev_cache_evict(vc, ve); + + mutex_exit(&vc->vc_lock); + + while ((dio = zio->io_delegate_list) != NULL) { + zio->io_delegate_list = dio->io_delegate_next; + dio->io_delegate_next = NULL; + dio->io_error = zio->io_error; + zio_next_stage(dio); + } +} + +/* + * Read data from the cache. Returns 0 on cache hit, errno on a miss. + */ +int +vdev_cache_read(zio_t *zio) +{ + vdev_cache_t *vc = &zio->io_vd->vdev_cache; + vdev_cache_entry_t *ve, ve_search; + uint64_t cache_offset = P2ALIGN(zio->io_offset, vc->vc_blocksize); + uint64_t cache_phase = P2PHASE(zio->io_offset, vc->vc_blocksize); + zio_t *fio; + + ASSERT(zio->io_type == ZIO_TYPE_READ); + + if (zio->io_flags & ZIO_FLAG_DONT_CACHE) + return (EINVAL); + + if (zio->io_size > vc->vc_max) + return (EOVERFLOW); + + /* + * If the I/O straddles two or more cache blocks, don't cache it. 
+ */ + if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1, + vc->vc_blocksize)) + return (EXDEV); + + ASSERT(cache_phase + zio->io_size <= vc->vc_blocksize); + + mutex_enter(&vc->vc_lock); + + ve_search.ve_offset = cache_offset; + ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL); + + if (ve != NULL) { + if (ve->ve_missed_update) { + mutex_exit(&vc->vc_lock); + return (ESTALE); + } + + if ((fio = ve->ve_fill_io) != NULL) { + zio->io_delegate_next = fio->io_delegate_list; + fio->io_delegate_list = zio; + zio_vdev_io_bypass(zio); + mutex_exit(&vc->vc_lock); + return (0); + } + + vdev_cache_hit(vc, ve, zio); + zio_vdev_io_bypass(zio); + + mutex_exit(&vc->vc_lock); + zio_next_stage(zio); + return (0); + } + + ve = vdev_cache_allocate(zio); + + if (ve == NULL) { + mutex_exit(&vc->vc_lock); + return (ENOMEM); + } + + fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset, + ve->ve_data, vc->vc_blocksize, ZIO_TYPE_READ, + ZIO_PRIORITY_CACHE_FILL, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, + vdev_cache_fill, ve); + + ve->ve_fill_io = fio; + fio->io_delegate_list = zio; + zio_vdev_io_bypass(zio); + + mutex_exit(&vc->vc_lock); + zio_nowait(fio); + + return (0); +} + +/* + * Update cache contents upon write completion. 
+ */ +void +vdev_cache_write(zio_t *zio) +{ + vdev_cache_t *vc = &zio->io_vd->vdev_cache; + vdev_cache_entry_t *ve, ve_search; + uint64_t io_start = zio->io_offset; + uint64_t io_end = io_start + zio->io_size; + uint64_t min_offset = P2ALIGN(io_start, vc->vc_blocksize); + uint64_t max_offset = P2ROUNDUP(io_end, vc->vc_blocksize); + avl_index_t where; + + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + + mutex_enter(&vc->vc_lock); + + ve_search.ve_offset = min_offset; + ve = avl_find(&vc->vc_offset_tree, &ve_search, &where); + + if (ve == NULL) + ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER); + + while (ve != NULL && ve->ve_offset < max_offset) { + uint64_t start = MAX(ve->ve_offset, io_start); + uint64_t end = MIN(ve->ve_offset + vc->vc_blocksize, io_end); + + if (ve->ve_fill_io != NULL) { + ve->ve_missed_update = 1; + } else { + bcopy((char *)zio->io_data + start - io_start, + ve->ve_data + start - ve->ve_offset, end - start); + } + ve = AVL_NEXT(&vc->vc_offset_tree, ve); + } + mutex_exit(&vc->vc_lock); +} + +void +vdev_cache_init(vdev_t *vd) +{ + vdev_cache_t *vc = &vd->vdev_cache; + + mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL); + + avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare, + sizeof (vdev_cache_entry_t), + offsetof(struct vdev_cache_entry, ve_offset_node)); + + avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare, + sizeof (vdev_cache_entry_t), + offsetof(struct vdev_cache_entry, ve_lastused_node)); + + vc->vc_blocksize = 1ULL << vc->vc_bshift; +} + +void +vdev_cache_fini(vdev_t *vd) +{ + vdev_cache_t *vc = &vd->vdev_cache; + vdev_cache_entry_t *ve; + + mutex_enter(&vc->vc_lock); + while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) + vdev_cache_evict(vc, ve); + mutex_exit(&vc->vc_lock); + + avl_destroy(&vc->vc_offset_tree); + avl_destroy(&vc->vc_lastused_tree); + + mutex_destroy(&vc->vc_lock); +} diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c new file mode 100644 index 
0000000000..9255ecf03e --- /dev/null +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -0,0 +1,307 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_disk.h> +#include <sys/vdev_impl.h> +#include <sys/fs/zfs.h> +#include <sys/zio.h> +#include <sys/sunddi.h> + +/* + * Virtual device vector for disks. + */ + +extern ldi_ident_t zfs_li; + +typedef struct vdev_disk_buf { + buf_t vdb_buf; + zio_t *vdb_io; +} vdev_disk_buf_t; + +static int +vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +{ + vdev_disk_t *dvd; + int error; + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); + + /* + * When opening a disk device, we want to preserve the user's original + * intent. 
We always want to open the device by the path the user gave + * us, even if it is one of multiple paths to the save device. But we + * also want to be able to survive disks being removed/recabled. + * Therefore the sequence of opening devices is: + * + * 1. Try opening the device by path. + * + * a. First append "s0" to see if this is a whole disk + * b. Fall back to path otherwise + * + * 2. If the devid of the device matches the stored value, return + * success. + * + * 3. Otherwise, the device may have moved. Try opening the device + * by the devid instead. + * + */ + if (vd->vdev_devid != NULL) { + if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid, + &dvd->vd_minor) != 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + } + + error = EINVAL; /* presume failure */ + + if (vd->vdev_path != NULL) { + size_t len = strlen(vd->vdev_path) + 3; + char *buf = kmem_alloc(len, KM_SLEEP); + ddi_devid_t devid; + + (void) snprintf(buf, len, "%ss0", vd->vdev_path); + + /* + * Try whole disk first, then slice name. + */ + if ((error = ldi_open_by_name(buf, spa_mode, kcred, + &dvd->vd_lh, zfs_li)) != 0) + error = ldi_open_by_name(vd->vdev_path, + spa_mode, kcred, &dvd->vd_lh, zfs_li); + + kmem_free(buf, len); + + /* + * Compare the devid to the stored value. + */ + if (error == 0 && vd->vdev_devid != NULL && + ldi_get_devid(dvd->vd_lh, &devid) == 0) { + if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { + error = EINVAL; + (void) ldi_close(dvd->vd_lh, spa_mode, kcred); + dvd->vd_lh = NULL; + } + ddi_devid_free(devid); + } + } + + /* + * If we were unable to open by path, or the devid check fails, open by + * devid instead. + */ + if (error != 0 && vd->vdev_devid != NULL) + error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor, + spa_mode, kcred, &dvd->vd_lh, zfs_li); + + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } + + /* + * Determine the actual size of the device. 
+ */ + if (ldi_get_size(dvd->vd_lh, psize) != 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (EINVAL); + } + + *ashift = SPA_MINBLOCKSHIFT; + + return (0); +} + +static void +vdev_disk_close(vdev_t *vd) +{ + vdev_disk_t *dvd = vd->vdev_tsd; + + if (dvd == NULL) + return; + + dprintf("removing disk %s, devid %s\n", + vd->vdev_path ? vd->vdev_path : "<none>", + vd->vdev_devid ? vd->vdev_devid : "<none>"); + + if (dvd->vd_minor != NULL) + ddi_devid_str_free(dvd->vd_minor); + + if (dvd->vd_devid != NULL) + ddi_devid_free(dvd->vd_devid); + + if (dvd->vd_lh != NULL) + (void) ldi_close(dvd->vd_lh, spa_mode, kcred); + + kmem_free(dvd, sizeof (vdev_disk_t)); + vd->vdev_tsd = NULL; +} + +static void +vdev_disk_io_intr(buf_t *bp) +{ + vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp; + zio_t *zio = vdb->vdb_io; + + if ((zio->io_error = geterror(bp)) == 0 && bp->b_resid != 0) + zio->io_error = EIO; + + kmem_free(vdb, sizeof (vdev_disk_buf_t)); + + zio_next_stage_async(zio); +} + +static void +vdev_disk_ioctl_done(void *zio_arg, int error) +{ + zio_t *zio = zio_arg; + + zio->io_error = error; + + zio_next_stage_async(zio); +} + +static void +vdev_disk_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_disk_t *dvd = vd->vdev_tsd; + vdev_disk_buf_t *vdb; + buf_t *bp; + int flags, error; + + if (zio->io_type == ZIO_TYPE_IOCTL) { + zio_vdev_io_bypass(zio); + + /* XXPOLICY */ + if (vdev_is_dead(vd)) { + zio->io_error = ENXIO; + zio_next_stage_async(zio); + return; + } + + switch (zio->io_cmd) { + + case DKIOCFLUSHWRITECACHE: + + zio->io_dk_callback.dkc_callback = vdev_disk_ioctl_done; + zio->io_dk_callback.dkc_cookie = zio; + + error = ldi_ioctl(dvd->vd_lh, zio->io_cmd, + (uintptr_t)&zio->io_dk_callback, + FKIOCTL, kcred, NULL); + + if (error == 0) { + /* + * The ioctl will be done asychronously, + * and will call vdev_disk_ioctl_done() + * upon completion. 
+ */ + return; + } + zio->io_error = error; + break; + + default: + zio->io_error = ENOTSUP; + } + + zio_next_stage_async(zio); + return; + } + + if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) + return; + + if ((zio = vdev_queue_io(zio)) == NULL) + return; + + flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); + flags |= B_BUSY | B_NOCACHE; + if (zio->io_flags & ZIO_FLAG_FAILFAST) + flags |= B_FAILFAST; + + vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP); + + vdb->vdb_io = zio; + bp = &vdb->vdb_buf; + + bioinit(bp); + bp->b_flags = flags; + bp->b_bcount = zio->io_size; + bp->b_un.b_addr = zio->io_data; + bp->b_lblkno = lbtodb(zio->io_offset); + bp->b_bufsize = zio->io_size; + bp->b_iodone = (int (*)())vdev_disk_io_intr; + + /* XXPOLICY */ + error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio); + if (error) { + zio->io_error = error; + bioerror(bp, error); + bp->b_resid = bp->b_bcount; + bp->b_iodone(bp); + return; + } + + error = ldi_strategy(dvd->vd_lh, bp); + /* ldi_strategy() will return non-zero only on programming errors */ + ASSERT(error == 0); +} + +static void +vdev_disk_io_done(zio_t *zio) +{ + vdev_queue_io_done(zio); + + if (zio->io_type == ZIO_TYPE_WRITE) + vdev_cache_write(zio); + + zio_next_stage(zio); +} + +vdev_ops_t vdev_disk_ops = { + vdev_disk_open, + vdev_disk_close, + vdev_default_asize, + vdev_disk_io_start, + vdev_disk_io_done, + NULL, + VDEV_TYPE_DISK, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c new file mode 100644 index 0000000000..a789008e17 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/vdev_file.c @@ -0,0 +1,223 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_file.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#include <sys/fs/zfs.h> + +/* + * Virtual device vector for files. + */ + +static int +vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +{ + vdev_file_t *vf; + vnode_t *vp; + vattr_t vattr; + int error; + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); + +#ifdef _KERNEL + /* + * When using a file vdev in kernel context, the underlying filesystem + * will already be caching the data. Don't cache it again here. + */ + vd->vdev_cache.vc_size = 0; +#endif + + /* + * We always open the files from the root of the global zone, even if + * we're in a local zone. If the user has gotten to this point, the + * administrator has already decided that the pool should be available + * to local zone users, so the underlying devices should be as well. 
+ */ + ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); + error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, spa_mode | FOFFMAX, + 0, &vp, 0, 0, rootdir); + + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } + + vf->vf_vnode = vp; + +#ifdef _KERNEL + /* + * Make sure it's a regular file. + */ + if (vp->v_type != VREG) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (ENODEV); + } +#endif + + /* + * Determine the physical size of the file. + */ + vattr.va_mask = AT_SIZE; + error = VOP_GETATTR(vp, &vattr, 0, kcred); + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } + + *psize = vattr.va_size; + *ashift = SPA_MINBLOCKSHIFT; + + return (0); +} + +static void +vdev_file_close(vdev_t *vd) +{ + vdev_file_t *vf = vd->vdev_tsd; + + if (vf == NULL) + return; + + if (vf->vf_vnode != NULL) { + (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred); + (void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred); + VN_RELE(vf->vf_vnode); + } + + kmem_free(vf, sizeof (vdev_file_t)); + vd->vdev_tsd = NULL; +} + +static void +vdev_file_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_file_t *vf = vd->vdev_tsd; + ssize_t resid; + int error; + + if (zio->io_type == ZIO_TYPE_IOCTL) { + zio_vdev_io_bypass(zio); + + /* XXPOLICY */ + if (vdev_is_dead(vd)) { + zio->io_error = ENXIO; + zio_next_stage_async(zio); + return; + } + + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: + zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, + kcred); + dprintf("fsync(%s) = %d\n", vdev_description(vd), + zio->io_error); + break; + default: + zio->io_error = ENOTSUP; + } + + zio_next_stage_async(zio); + return; + } + + if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) + return; + + if ((zio = vdev_queue_io(zio)) == NULL) + return; + + /* XXPOLICY */ + error = vdev_is_dead(vd) ? 
ENXIO : vdev_error_inject(vd, zio); + if (error) { + zio->io_error = error; + zio_next_stage_async(zio); + return; + } + + zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? + UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data, + zio->io_size, zio->io_offset, UIO_SYSSPACE, + 0, RLIM64_INFINITY, kcred, &resid); + + if (resid != 0 && zio->io_error == 0) + zio->io_error = ENOSPC; + + zio_next_stage_async(zio); +} + +static void +vdev_file_io_done(zio_t *zio) +{ + vdev_queue_io_done(zio); + + if (zio->io_type == ZIO_TYPE_WRITE) + vdev_cache_write(zio); + + zio_next_stage(zio); +} + +vdev_ops_t vdev_file_ops = { + vdev_file_open, + vdev_file_close, + vdev_default_asize, + vdev_file_io_start, + vdev_file_io_done, + NULL, + VDEV_TYPE_FILE, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +/* + * From userland we access disks just like files. + */ +#ifndef _KERNEL + +vdev_ops_t vdev_disk_ops = { + vdev_file_open, + vdev_file_close, + vdev_default_asize, + vdev_file_io_start, + vdev_file_io_done, + NULL, + VDEV_TYPE_DISK, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +#endif diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c new file mode 100644 index 0000000000..6671a68fa9 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/vdev_label.c @@ -0,0 +1,848 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Virtual Device Labels + * --------------------- + * + * The vdev label serves several distinct purposes: + * + * 1. Uniquely identify this device as part of a ZFS pool and confirm its + * identity within the pool. + * + * 2. Verify that all the devices given in a configuration are present + * within the pool. + * + * 3. Determine the uberblock for the pool. + * + * 4. In case of an import operation, determine the configuration of the + * toplevel vdev of which it is a part. + * + * 5. If an import operation cannot find all the devices in the pool, + * provide enough information to the administrator to determine which + * devices are missing. + * + * It is important to note that while the kernel is responsible for writing the + * label, it only consumes the information in the first three cases. The + * latter information is only consumed in userland when determining the + * configuration to import a pool. + * + * + * Label Organization + * ------------------ + * + * Before describing the contents of the label, it's important to understand how + * the labels are written and updated with respect to the uberblock. + * + * When the pool configuration is altered, either because it was newly created + * or a device was added, we want to update all the labels such that we can deal + * with fatal failure at any point. To this end, each disk has two labels which + * are updated before and after the uberblock is synced. 
Assuming we have + * labels and an uberblock with the following transacation groups: + * + * L1 UB L2 + * +------+ +------+ +------+ + * | | | | | | + * | t10 | | t10 | | t10 | + * | | | | | | + * +------+ +------+ +------+ + * + * In this stable state, the labels and the uberblock were all updated within + * the same transaction group (10). Each label is mirrored and checksummed, so + * that we can detect when we fail partway through writing the label. + * + * In order to identify which labels are valid, the labels are written in the + * following manner: + * + * 1. For each vdev, update 'L1' to the new label + * 2. Update the uberblock + * 3. For each vdev, update 'L2' to the new label + * + * Given arbitrary failure, we can determine the correct label to use based on + * the transaction group. If we fail after updating L1 but before updating the + * UB, we will notice that L1's transaction group is greater than the uberblock, + * so L2 must be valid. If we fail after writing the uberblock but before + * writing L2, we will notice that L2's transaction group is less than L1, and + * therefore L1 is valid. + * + * Another added complexity is that not every label is updated when the config + * is synced. If we add a single device, we do not want to have to re-write + * every label for every device in the pool. This means that both L1 and L2 may + * be older than the pool uberblock, because the necessary information is stored + * on another vdev. + * + * + * On-disk Format + * -------------- + * + * The vdev label consists of two distinct parts, and is wrapped within the + * vdev_label_t structure. The label includes 8k of padding to permit legacy + * VTOC disk labels, but is otherwise ignored. + * + * The first half of the label is a packed nvlist which contains pool wide + * properties, per-vdev properties, and configuration information. It is + * described in more detail below. + * + * The latter half of the label consists of a redundant array of uberblocks. 
+ * These uberblocks are updated whenever a transaction group is committed, + * or when the configuration is updated. When a pool is loaded, we scan each + * vdev for the 'best' uberblock. + * + * + * Configuration Information + * ------------------------- + * + * The nvlist describing the pool and vdev contains the following elements: + * + * version ZFS on-disk version + * name Pool name + * state Pool state + * txg Transaction group in which this label was written + * pool_guid Unique identifier for this pool + * vdev_tree An nvlist describing vdev tree. + * + * Each leaf device label also contains the following: + * + * top_guid Unique ID for top-level vdev in which this is contained + * guid Unique ID for the leaf vdev + * + * The 'vs' configuration follows the format described in 'spa_config.c'. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/dmu.h> +#include <sys/zap.h> +#include <sys/vdev.h> +#include <sys/vdev_impl.h> +#include <sys/uberblock_impl.h> +#include <sys/metaslab.h> +#include <sys/zio.h> +#include <sys/fs/zfs.h> + +/* + * Basic routines to read and write from a vdev label. + * Used throughout the rest of this file. + */ +uint64_t +vdev_label_offset(uint64_t psize, int l, uint64_t offset) +{ + return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? 
+ 0 : psize - VDEV_LABELS * sizeof (vdev_label_t))); +} + +static void +vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, + uint64_t size, zio_done_func_t *done, void *private) +{ + ASSERT(vd->vdev_children == 0); + + zio_nowait(zio_read_phys(zio, vd, + vdev_label_offset(vd->vdev_psize, l, offset), + size, buf, ZIO_CHECKSUM_LABEL, done, private, + ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_DONT_RETRY)); +} + +static void +vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, + uint64_t size, zio_done_func_t *done, void *private) +{ + ASSERT(vd->vdev_children == 0); + + zio_nowait(zio_write_phys(zio, vd, + vdev_label_offset(vd->vdev_psize, l, offset), + size, buf, ZIO_CHECKSUM_LABEL, done, private, + ZIO_PRIORITY_SYNC_WRITE, + ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_DONT_RETRY)); +} + +/* + * Generate the nvlist representing this vdev's config. + */ +nvlist_t * +vdev_config_generate(vdev_t *vd, int getstats) +{ + nvlist_t *nv = NULL; + + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) == 0); + + VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, + vd->vdev_ops->vdev_op_type) == 0); + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id) == 0); + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); + + if (vd->vdev_path != NULL) + VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, + vd->vdev_path) == 0); + + if (vd->vdev_devid != NULL) + VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, + vd->vdev_devid) == 0); + + if (vd == vd->vdev_top) { + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, + vd->vdev_ms_array) == 0); + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, + vd->vdev_ms_shift) == 0); + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, + vd->vdev_ashift) == 0); + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, + vd->vdev_asize) == 0); + } + + if (vd->vdev_dtl.smo_object != 0) + VERIFY(nvlist_add_uint64(nv, 
ZPOOL_CONFIG_DTL, + vd->vdev_dtl.smo_object) == 0); + + if (getstats) { + vdev_stat_t vs; + vdev_get_stats(vd, &vs); + VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS, + (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0); + } + + if (!vd->vdev_ops->vdev_op_leaf) { + nvlist_t **child; + int c; + + child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), + KM_SLEEP); + + for (c = 0; c < vd->vdev_children; c++) + child[c] = vdev_config_generate(vd->vdev_child[c], + getstats); + + VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + child, vd->vdev_children) == 0); + + for (c = 0; c < vd->vdev_children; c++) + nvlist_free(child[c]); + + kmem_free(child, vd->vdev_children * sizeof (nvlist_t *)); + } + + return (nv); +} + +nvlist_t * +vdev_label_read_config(vdev_t *vd) +{ + nvlist_t *config = NULL; + vdev_phys_t *vp; + uint64_t version; + zio_t *zio; + int l; + + if (vdev_is_dead(vd)) + return (NULL); + + vp = zio_buf_alloc(sizeof (vdev_phys_t)); + + for (l = 0; l < VDEV_LABELS; l++) { + + zio = zio_root(vd->vdev_spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD); + + vdev_label_read(zio, vd, l, vp, + offsetof(vdev_label_t, vl_vdev_phys), + sizeof (vdev_phys_t), NULL, NULL); + + if (zio_wait(zio) == 0 && + nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist), + &config, 0) == 0 && + nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &version) == 0 && + version == UBERBLOCK_VERSION) + break; + + if (config != NULL) { + nvlist_free(config); + config = NULL; + } + } + + zio_buf_free(vp, sizeof (vdev_phys_t)); + + return (config); +} + +int +vdev_label_init(vdev_t *vd, uint64_t crtxg) +{ + spa_t *spa = vd->vdev_spa; + nvlist_t *label; + vdev_phys_t *vp; + vdev_boot_header_t *vb; + uberblock_phys_t *ubphys; + zio_t *zio; + int l, c, n; + char *buf; + size_t buflen; + int error; + + for (c = 0; c < vd->vdev_children; c++) + if ((error = vdev_label_init(vd->vdev_child[c], crtxg)) != 0) + return (error); + + if (!vd->vdev_ops->vdev_op_leaf) + 
return (0); + + /* + * Make sure each leaf device is writable, and zero its initial content. + * Along the way, also make sure that no leaf is already in use. + * Note that it's important to do this sequentially, not in parallel, + * so that we catch cases of multiple use of the same leaf vdev in + * the vdev we're creating -- e.g. mirroring a disk with itself. + */ + if (vdev_is_dead(vd)) + return (EIO); + + /* + * Check whether this device is already in use. + * Ignore the check if crtxg == 0, which we use for device removal. + */ + if (crtxg != 0 && (label = vdev_label_read_config(vd)) != NULL) { + uint64_t version, state, pool_guid, device_guid, txg; + uint64_t mycrtxg = 0; + + (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG, + &mycrtxg); + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, + &version) == 0 && version == UBERBLOCK_VERSION && + nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, + &state) == 0 && state == POOL_STATE_ACTIVE && + nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, + &pool_guid) == 0 && + nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, + &device_guid) == 0 && + spa_guid_exists(pool_guid, device_guid) && + nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, + &txg) == 0 && (txg != 0 || mycrtxg == crtxg)) { + dprintf("vdev %s in use, pool_state %d\n", + vdev_description(vd), state); + nvlist_free(label); + return (EBUSY); + } + nvlist_free(label); + } + + /* + * The device isn't in use, so initialize its label. + */ + vp = zio_buf_alloc(sizeof (vdev_phys_t)); + bzero(vp, sizeof (vdev_phys_t)); + + /* + * Generate a label describing the pool and our top-level vdev. + * We mark it as being from txg 0 to indicate that it's not + * really part of an active pool just yet. The labels will + * be written again with a meaningful txg by spa_sync(). + */ + label = spa_config_generate(spa, vd, 0ULL, 0); + + /* + * Add our creation time. 
This allows us to detect multiple vdev + * uses as described above, and automatically expires if we fail. + */ + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG, crtxg) == 0); + + buf = vp->vp_nvlist; + buflen = sizeof (vp->vp_nvlist); + + if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, 0) != 0) { + nvlist_free(label); + zio_buf_free(vp, sizeof (vdev_phys_t)); + return (EINVAL); + } + + /* + * Initialize boot block header. + */ + vb = zio_buf_alloc(sizeof (vdev_boot_header_t)); + bzero(vb, sizeof (vdev_boot_header_t)); + vb->vb_magic = VDEV_BOOT_MAGIC; + vb->vb_version = VDEV_BOOT_VERSION; + vb->vb_offset = VDEV_BOOT_OFFSET; + vb->vb_size = VDEV_BOOT_SIZE; + + /* + * Initialize uberblock template. + */ + ubphys = zio_buf_alloc(sizeof (uberblock_phys_t)); + bzero(ubphys, sizeof (uberblock_phys_t)); + ubphys->ubp_uberblock = spa->spa_uberblock; + ubphys->ubp_uberblock.ub_txg = 0; + + /* + * Write everything in parallel. + */ + zio = zio_root(spa, NULL, NULL, + ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); + + for (l = 0; l < VDEV_LABELS; l++) { + + vdev_label_write(zio, vd, l, vp, + offsetof(vdev_label_t, vl_vdev_phys), + sizeof (vdev_phys_t), NULL, NULL); + + vdev_label_write(zio, vd, l, vb, + offsetof(vdev_label_t, vl_boot_header), + sizeof (vdev_boot_header_t), NULL, NULL); + + for (n = 0; n < VDEV_UBERBLOCKS; n++) { + + vdev_label_write(zio, vd, l, ubphys, + offsetof(vdev_label_t, vl_uberblock[n]), + sizeof (uberblock_phys_t), NULL, NULL); + + } + } + + error = zio_wait(zio); + + nvlist_free(label); + zio_buf_free(ubphys, sizeof (uberblock_phys_t)); + zio_buf_free(vb, sizeof (vdev_boot_header_t)); + zio_buf_free(vp, sizeof (vdev_phys_t)); + + return (error); +} + +/* + * ========================================================================== + * uberblock load/sync + * ========================================================================== + */ + +/* + * Consider the following situation: txg is safely synced to disk. 
We've + * written the first uberblock for txg + 1, and then we lose power. When we + * come back up, we fail to see the uberblock for txg + 1 because, say, + * it was on a mirrored device and the replica to which we wrote txg + 1 + * is now offline. If we then make some changes and sync txg + 1, and then + * the missing replica comes back, then for a new seconds we'll have two + * conflicting uberblocks on disk with the same txg. The solution is simple: + * among uberblocks with equal txg, choose the one with the latest timestamp. + */ +static int +vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2) +{ + if (ub1->ub_txg < ub2->ub_txg) + return (-1); + if (ub1->ub_txg > ub2->ub_txg) + return (1); + + if (ub1->ub_timestamp < ub2->ub_timestamp) + return (-1); + if (ub1->ub_timestamp > ub2->ub_timestamp) + return (1); + + return (0); +} + +static void +vdev_uberblock_load_done(zio_t *zio) +{ + uberblock_phys_t *ubphys = zio->io_data; + uberblock_t *ub = &ubphys->ubp_uberblock; + uberblock_t *ubbest = zio->io_private; + spa_t *spa = zio->io_spa; + + ASSERT3U(zio->io_size, ==, sizeof (uberblock_phys_t)); + + if (uberblock_verify(ub) == 0) { + mutex_enter(&spa->spa_uberblock_lock); + if (vdev_uberblock_compare(ub, ubbest) > 0) + *ubbest = *ub; + mutex_exit(&spa->spa_uberblock_lock); + } + + zio_buf_free(zio->io_data, zio->io_size); +} + +void +vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest) +{ + int l, c, n; + + for (c = 0; c < vd->vdev_children; c++) + vdev_uberblock_load(zio, vd->vdev_child[c], ubbest); + + if (!vd->vdev_ops->vdev_op_leaf) + return; + + if (vdev_is_dead(vd)) + return; + + for (l = 0; l < VDEV_LABELS; l++) { + for (n = 0; n < VDEV_UBERBLOCKS; n++) { + vdev_label_read(zio, vd, l, + zio_buf_alloc(sizeof (uberblock_phys_t)), + offsetof(vdev_label_t, vl_uberblock[n]), + sizeof (uberblock_phys_t), + vdev_uberblock_load_done, ubbest); + } + } +} + +/* + * Write the uberblock to both labels of all leaves of the specified vdev. 
+ */ +static void +vdev_uberblock_sync_done(zio_t *zio) +{ + uint64_t *good_writes = zio->io_root->io_private; + + if (zio->io_error == 0) + atomic_add_64(good_writes, 1); +} + +static void +vdev_uberblock_sync(zio_t *zio, uberblock_phys_t *ubphys, vdev_t *vd, + uint64_t txg) +{ + int l, c, n; + + for (c = 0; c < vd->vdev_children; c++) + vdev_uberblock_sync(zio, ubphys, vd->vdev_child[c], txg); + + if (!vd->vdev_ops->vdev_op_leaf) + return; + + if (vdev_is_dead(vd)) + return; + + n = txg & (VDEV_UBERBLOCKS - 1); + + ASSERT(ubphys->ubp_uberblock.ub_txg == txg); + + for (l = 0; l < VDEV_LABELS; l++) + vdev_label_write(zio, vd, l, ubphys, + offsetof(vdev_label_t, vl_uberblock[n]), + sizeof (uberblock_phys_t), vdev_uberblock_sync_done, NULL); + + dprintf("vdev %s in txg %llu\n", vdev_description(vd), txg); +} + +static int +vdev_uberblock_sync_tree(spa_t *spa, uberblock_t *ub, vdev_t *uvd, uint64_t txg) +{ + uberblock_phys_t *ubphys; + uint64_t *good_writes; + zio_t *zio; + int error; + + ubphys = zio_buf_alloc(sizeof (uberblock_phys_t)); + bzero(ubphys, sizeof (uberblock_phys_t)); + ubphys->ubp_uberblock = *ub; + + good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); + + zio = zio_root(spa, NULL, good_writes, + ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); + + vdev_uberblock_sync(zio, ubphys, uvd, txg); + + error = zio_wait(zio); + + if (error && *good_writes != 0) { + dprintf("partial success: good_writes = %llu\n", *good_writes); + error = 0; + } + + /* + * It's possible to have no good writes and no error if every vdev is in + * the CANT_OPEN state. + */ + if (*good_writes == 0 && error == 0) + error = EIO; + + kmem_free(good_writes, sizeof (uint64_t)); + zio_buf_free(ubphys, sizeof (uberblock_phys_t)); + + return (error); +} + +/* + * Sync out an individual vdev. 
+ */ +static void +vdev_sync_label_done(zio_t *zio) +{ + uint64_t *good_writes = zio->io_root->io_private; + + if (zio->io_error == 0) + atomic_add_64(good_writes, 1); +} + +static void +vdev_sync_label(zio_t *zio, vdev_t *vd, int l, uint64_t txg) +{ + nvlist_t *label; + vdev_phys_t *vp; + char *buf; + size_t buflen; + int c; + + for (c = 0; c < vd->vdev_children; c++) + vdev_sync_label(zio, vd->vdev_child[c], l, txg); + + if (!vd->vdev_ops->vdev_op_leaf) + return; + + if (vdev_is_dead(vd)) + return; + + /* + * Generate a label describing the top-level config to which we belong. + */ + label = spa_config_generate(vd->vdev_spa, vd, txg, 0); + + vp = zio_buf_alloc(sizeof (vdev_phys_t)); + bzero(vp, sizeof (vdev_phys_t)); + + buf = vp->vp_nvlist; + buflen = sizeof (vp->vp_nvlist); + + if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, 0) == 0) + vdev_label_write(zio, vd, l, vp, + offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), + vdev_sync_label_done, NULL); + + zio_buf_free(vp, sizeof (vdev_phys_t)); + nvlist_free(label); + + dprintf("%s label %d txg %llu\n", vdev_description(vd), l, txg); +} + +static int +vdev_sync_labels(vdev_t *vd, int l, uint64_t txg) +{ + uint64_t *good_writes; + zio_t *zio; + int error; + + ASSERT(vd == vd->vdev_top); + + good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); + + zio = zio_root(vd->vdev_spa, NULL, good_writes, + ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); + + /* + * Recursively kick off writes to all labels. + */ + vdev_sync_label(zio, vd, l, txg); + + error = zio_wait(zio); + + if (error && *good_writes != 0) { + dprintf("partial success: good_writes = %llu\n", *good_writes); + error = 0; + } + + if (*good_writes == 0 && error == 0) + error = ENODEV; + + kmem_free(good_writes, sizeof (uint64_t)); + + return (error); +} + +/* + * Sync the entire vdev configuration. 
+ * + * The order of operations is carefully crafted to ensure that + * if the system panics or loses power at any time, the state on disk + * is still transactionally consistent. The in-line comments below + * describe the failure semantics at each stage. + * + * Moreover, it is designed to be idempotent: if spa_sync_labels() fails + * at any time, you can just call it again, and it will resume its work. + */ +int +spa_sync_labels(spa_t *spa, uint64_t txg) +{ + uberblock_t *ub = &spa->spa_uberblock; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *vd, *uvd; + zio_t *zio; + int c, l, error; + + ASSERT(ub->ub_txg <= txg); + + /* + * If this isn't a resync due to I/O errors, and nothing changed + * in this transaction group, and the vdev configuration hasn't changed, + * and this isn't an explicit sync-all, then there's nothing to do. + */ + if (ub->ub_txg < txg && uberblock_update(ub, rvd, txg) == B_FALSE && + list_is_empty(&spa->spa_dirty_list)) { + dprintf("nothing to sync in %s in txg %llu\n", + spa_name(spa), txg); + return (0); + } + + if (txg > spa_freeze_txg(spa)) + return (0); + + dprintf("syncing %s txg %llu\n", spa_name(spa), txg); + + /* + * Flush the write cache of every disk that's been written to + * in this transaction group. This ensures that all blocks + * written in this txg will be committed to stable storage + * before any uberblock that references them. + */ + zio = zio_root(spa, NULL, NULL, + ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); + for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd; + vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) { + zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE, + NULL, NULL, ZIO_PRIORITY_NOW, + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); + } + (void) zio_wait(zio); + + /* + * Sync out the even labels (L0, L2) for every dirty vdev. 
If the + * system dies in the middle of this process, that's OK: all of the + * even labels that made it to disk will be newer than any uberblock, + * and will therefore be considered invalid. The odd labels (L1, L3), + * which have not yet been touched, will still be valid. + */ + for (vd = list_head(&spa->spa_dirty_list); vd != NULL; + vd = list_next(&spa->spa_dirty_list, vd)) { + for (l = 0; l < VDEV_LABELS; l++) { + if (l & 1) + continue; + if ((error = vdev_sync_labels(vd, l, txg)) != 0) + return (error); + } + } + + /* + * Flush the new labels to disk. This ensures that all even-label + * updates are committed to stable storage before the uberblock update. + */ + zio = zio_root(spa, NULL, NULL, + ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); + for (vd = list_head(&spa->spa_dirty_list); vd != NULL; + vd = list_next(&spa->spa_dirty_list, vd)) { + zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE, + NULL, NULL, ZIO_PRIORITY_NOW, + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); + } + (void) zio_wait(zio); + + /* + * If there are any dirty vdevs, sync the uberblock to all vdevs. + * Otherwise, pick one top-level vdev at random. + */ + if (!list_is_empty(&spa->spa_dirty_list)) + uvd = rvd; + else + uvd = rvd->vdev_child[spa_get_random(rvd->vdev_children)]; + + /* + * Sync the uberblocks. If the system dies in the middle of this + * step, there are two cases to consider, and the on-disk state + * is consistent either way: + * + * (1) If none of the new uberblocks made it to disk, then the + * previous uberblock will be the newest, and the odd labels + * (which had not yet been touched) will be valid with respect + * to that uberblock. + * + * (2) If one or more new uberblocks made it to disk, then they + * will be the newest, and the even labels (which had all + * been successfully committed) will be valid with respect + * to the new uberblocks. 
+ */ + if ((error = vdev_uberblock_sync_tree(spa, ub, uvd, txg)) != 0) + return (error); + + /* + * Flush the uberblocks to disk. This ensures that the odd labels + * are no longer needed (because the new uberblocks and the even + * labels are safely on disk), so it is safe to overwrite them. + */ + (void) zio_wait(zio_ioctl(NULL, spa, uvd, DKIOCFLUSHWRITECACHE, + NULL, NULL, ZIO_PRIORITY_NOW, + ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); + + /* + * Sync out odd labels for every dirty vdev. If the system dies + * in the middle of this process, the even labels and the new + * uberblocks will suffice to open the pool. The next time + * the pool is opened, the first thing we'll do -- before any + * user data is modified -- is mark every vdev dirty so that + * all labels will be brought up to date. + */ + for (vd = list_head(&spa->spa_dirty_list); vd != NULL; + vd = list_next(&spa->spa_dirty_list, vd)) { + for (l = 0; l < VDEV_LABELS; l++) { + if ((l & 1) == 0) + continue; + if ((error = vdev_sync_labels(vd, l, txg)) != 0) + return (error); + } + } + + /* + * Flush the new labels to disk. This ensures that all odd-label + * updates are committed to stable storage before the next + * transaction group begins. + */ + zio = zio_root(spa, NULL, NULL, + ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); + for (vd = list_head(&spa->spa_dirty_list); vd != NULL; + vd = list_next(&spa->spa_dirty_list, vd)) { + zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE, + NULL, NULL, ZIO_PRIORITY_NOW, + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); + } + (void) zio_wait(zio); + + /* + * Clear the dirty list. 
+ */ + while (!list_is_empty(&spa->spa_dirty_list)) + vdev_config_clean(list_head(&spa->spa_dirty_list)); + +#ifdef DEBUG + for (c = 0; c < rvd->vdev_children; c++) { + ASSERT(rvd->vdev_child[c]->vdev_is_dirty == 0); + } +#endif + + return (0); +} diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c new file mode 100644 index 0000000000..45eb7ce78b --- /dev/null +++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c @@ -0,0 +1,414 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#include <sys/fs/zfs.h> + +/* + * Virtual device vector for mirroring. 
+ */ + +typedef struct mirror_map { + int mm_error; + short mm_tried; + short mm_skipped; +} mirror_map_t; + +static mirror_map_t * +vdev_mirror_map_alloc(zio_t *zio) +{ + zio->io_vsd = kmem_zalloc(zio->io_vd->vdev_children * + sizeof (mirror_map_t), KM_SLEEP); + return (zio->io_vsd); +} + +static void +vdev_mirror_map_free(zio_t *zio) +{ + kmem_free(zio->io_vsd, + zio->io_vd->vdev_children * sizeof (mirror_map_t)); + zio->io_vsd = NULL; +} + +static int +vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) +{ + vdev_t *cvd; + uint64_t c; + int numerrors = 0; + int ret, lasterror = 0; + + if (vd->vdev_children == 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + for (c = 0; c < vd->vdev_children; c++) { + cvd = vd->vdev_child[c]; + + if ((ret = vdev_open(cvd)) != 0) { + lasterror = ret; + numerrors++; + continue; + } + + *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; + *ashift = cvd->vdev_ashift; + } + + if (numerrors == vd->vdev_children) { + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (lasterror); + } + + return (0); +} + +static void +vdev_mirror_close(vdev_t *vd) +{ + uint64_t c; + + for (c = 0; c < vd->vdev_children; c++) + vdev_close(vd->vdev_child[c]); +} + +static void +vdev_mirror_child_done(zio_t *zio) +{ + mirror_map_t *mm = zio->io_private; + + mm->mm_error = zio->io_error; + mm->mm_tried = 1; + mm->mm_skipped = 0; +} + +static void +vdev_mirror_scrub_done(zio_t *zio) +{ + mirror_map_t *mm = zio->io_private; + + if (zio->io_error == 0) { + zio_t *pio = zio->io_parent; + mutex_enter(&pio->io_lock); + bcopy(zio->io_data, pio->io_data, pio->io_size); + mutex_exit(&pio->io_lock); + } + + zio_buf_free(zio->io_data, zio->io_size); + + mm->mm_error = zio->io_error; + mm->mm_tried = 1; + mm->mm_skipped = 0; +} + +/* + * Try to find a child whose DTL doesn't contain the block we want to read. + * If we can't, try the read on any vdev we haven't already tried. 
+ */ +static int +vdev_mirror_child_select(zio_t *zio) +{ + mirror_map_t *mm = zio->io_vsd; + vdev_t *vd = zio->io_vd; + vdev_t *cvd; + uint64_t txg = zio->io_txg; + int i, c; + + ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg); + + /* + * Select the child we'd like to read from absent any errors. + * The current policy is to alternate sides at 8M granularity. + * XXX -- investigate other policies for read distribution. + */ + c = (zio->io_offset >> (SPA_MAXBLOCKSHIFT + 6)) % vd->vdev_children; + + /* + * If this is a replacing vdev, always try child 0 (the source) first. + */ + if (vd->vdev_ops == &vdev_replacing_ops) + c = 0; + + /* + * Try to find a child whose DTL doesn't contain the block to read. + * If a child is known to be completely inaccessible (indicated by + * vdev_is_dead() returning B_TRUE), don't even try. + */ + for (i = 0; i < vd->vdev_children; i++, c++) { + if (c >= vd->vdev_children) + c = 0; + if (mm[c].mm_tried || mm[c].mm_skipped) + continue; + cvd = vd->vdev_child[c]; + if (vdev_is_dead(cvd)) { + mm[c].mm_error = ENXIO; + mm[c].mm_tried = 1; /* don't even try */ + mm[c].mm_skipped = 1; + continue; + } + if (!vdev_dtl_contains(&cvd->vdev_dtl_map, txg, 1)) + return (c); + mm[c].mm_error = ESTALE; + mm[c].mm_skipped = 1; + } + + /* + * Every device is either missing or has this txg in its DTL. + * If we don't have any sibling replicas to consult, look for + * any child we haven't already tried before giving up. + */ + if (vd == vd->vdev_top || vd->vdev_parent->vdev_children <= 1) { + for (c = 0; c < vd->vdev_children; c++) { + if (!mm[c].mm_tried) + return (c); + } + } + + /* + * Every child failed. There's no place left to look. 
+ */ + return (-1); +} + +static void +vdev_mirror_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + mirror_map_t *mm; + int c, children; + + mm = vdev_mirror_map_alloc(zio); + + if (zio->io_type == ZIO_TYPE_READ) { + if (zio->io_flags & ZIO_FLAG_SCRUB) { + /* + * For scrubbing reads we need to allocate a read + * buffer for each child and issue reads to all + * children. If any child succeeds, it will copy its + * data into zio->io_data in vdev_mirror_scrub_done. + */ + for (c = 0; c < vd->vdev_children; c++) { + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + vd->vdev_child[c], zio->io_offset, + zio_buf_alloc(zio->io_size), zio->io_size, + zio->io_type, zio->io_priority, + ZIO_FLAG_CANFAIL, vdev_mirror_scrub_done, + &mm[c])); + } + zio_wait_children_done(zio); + return; + } + /* + * For normal reads just pick one child. + */ + c = vdev_mirror_child_select(zio); + children = (c >= 0); + } else { + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + + /* + * If this is a resilvering I/O to a replacing vdev, + * only the last child should be written -- unless the + * first child happens to have a DTL entry here as well. + * All other writes go to all children. 
+ */ + if ((zio->io_flags & ZIO_FLAG_RESILVER) && + vd->vdev_ops == &vdev_replacing_ops && + !vdev_dtl_contains(&vd->vdev_child[0]->vdev_dtl_map, + zio->io_txg, 1)) { + c = vd->vdev_children - 1; + children = 1; + } else { + c = 0; + children = vd->vdev_children; + } + } + + while (children--) { + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + vd->vdev_child[c], zio->io_offset, zio->io_data, + zio->io_size, zio->io_type, zio->io_priority, + ZIO_FLAG_CANFAIL, vdev_mirror_child_done, &mm[c])); + c++; + } + + zio_wait_children_done(zio); +} + +static void +vdev_mirror_io_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_t *cvd; + mirror_map_t *mm = zio->io_vsd; + int c; + int good_copies = 0; + int unexpected_errors = 0; + + ASSERT(mm != NULL); + + zio->io_error = 0; + zio->io_numerrors = 0; + + for (c = 0; c < vd->vdev_children; c++) { + if (mm[c].mm_tried && mm[c].mm_error == 0) { + good_copies++; + continue; + } + + /* + * We preserve any EIOs because those may be worth retrying; + * whereas ECKSUM and ENXIO are more likely to be persistent. + */ + if (mm[c].mm_error) { + if (zio->io_error != EIO) + zio->io_error = mm[c].mm_error; + if (!mm[c].mm_skipped) + unexpected_errors++; + zio->io_numerrors++; + } + } + + if (zio->io_type == ZIO_TYPE_WRITE) { + /* + * XXX -- for now, treat partial writes as success. + */ + /* XXPOLICY */ + if (good_copies != 0) + zio->io_error = 0; + ASSERT(mm != NULL); + vdev_mirror_map_free(zio); + zio_next_stage(zio); + return; + } + + ASSERT(zio->io_type == ZIO_TYPE_READ); + + /* + * If we don't have a good copy yet, keep trying other children. 
+ */ + /* XXPOLICY */ + if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) { + ASSERT(c >= 0 && c < vd->vdev_children); + cvd = vd->vdev_child[c]; + dprintf("%s: retrying i/o (err=%d) on child %s\n", + vdev_description(zio->io_vd), zio->io_error, + vdev_description(cvd)); + zio->io_error = 0; + zio_vdev_io_redone(zio); + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, cvd, + zio->io_offset, zio->io_data, zio->io_size, + ZIO_TYPE_READ, zio->io_priority, ZIO_FLAG_CANFAIL, + vdev_mirror_child_done, &mm[c])); + zio_wait_children_done(zio); + return; + } + + /* XXPOLICY */ + if (good_copies) + zio->io_error = 0; + else + ASSERT(zio->io_error != 0); + + if (good_copies && (spa_mode & FWRITE) && + (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { + /* + * Use the good data we have in hand to repair damaged children. + */ + for (c = 0; c < vd->vdev_children; c++) { + /* + * Don't rewrite known good children. + * Not only is it unnecessary, it could + * actually be harmful: if the system lost + * power while rewriting the only good copy, + * there would be no good copies left! 
+ */ + cvd = vd->vdev_child[c]; + + if (mm[c].mm_error == 0) { + if (mm[c].mm_tried) + continue; + if (!vdev_dtl_contains(&cvd->vdev_dtl_map, + zio->io_txg, 1)) + continue; + mm[c].mm_error = ESTALE; + } + + dprintf("%s resilvered %s @ 0x%llx error %d\n", + vdev_description(vd), + vdev_description(cvd), + zio->io_offset, mm[c].mm_error); + + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, cvd, + zio->io_offset, zio->io_data, zio->io_size, + ZIO_TYPE_WRITE, zio->io_priority, + ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE, NULL, NULL)); + } + } + + vdev_mirror_map_free(zio); + zio_next_stage(zio); +} + +static void +vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) +{ + if (faulted == vd->vdev_children) + vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); + else if (degraded + faulted != 0) + vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + else + vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); +} + +vdev_ops_t vdev_mirror_ops = { + vdev_mirror_open, + vdev_mirror_close, + vdev_default_asize, + vdev_mirror_io_start, + vdev_mirror_io_done, + vdev_mirror_state_change, + VDEV_TYPE_MIRROR, /* name of this vdev type */ + B_FALSE /* not a leaf vdev */ +}; + +vdev_ops_t vdev_replacing_ops = { + vdev_mirror_open, + vdev_mirror_close, + vdev_default_asize, + vdev_mirror_io_start, + vdev_mirror_io_done, + vdev_mirror_state_change, + VDEV_TYPE_REPLACING, /* name of this vdev type */ + B_FALSE /* not a leaf vdev */ +}; diff --git a/usr/src/uts/common/fs/zfs/vdev_missing.c b/usr/src/uts/common/fs/zfs/vdev_missing.c new file mode 100644 index 0000000000..b35f4a5bcd --- /dev/null +++ b/usr/src/uts/common/fs/zfs/vdev_missing.c @@ -0,0 +1,89 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * The 'missing' vdev is a special vdev type used only during import. It + * signifies a placeholder in the root vdev for some vdev that we know is + * missing. We pass it down to the kernel to allow the rest of the + * configuration to be parsed and an attempt made to open all available devices. + * Because its GUID is always 0, we know that the guid sum will mismatch and we + * won't be able to open the pool anyway. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/fs/zfs.h> +#include <sys/zio.h> + +/* ARGSUSED */ +static int +vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +{ + /* + * Really this should just fail. But then the root vdev will be in the + * faulted state with VDEV_AUX_NO_REPLICAS, when what we really want is + * VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we + * will fail the GUID sum check before ever trying to open the pool. 
+ */ + *psize = SPA_MINDEVSIZE; + *ashift = SPA_MINBLOCKSHIFT; + return (0); +} + +/* ARGSUSED */ +static void +vdev_missing_close(vdev_t *vd) +{ +} + +/* ARGSUSED */ +static void +vdev_missing_io_start(zio_t *zio) +{ + zio->io_error = ENOTSUP; + zio_next_stage_async(zio); +} + +/* ARGSUSED */ +static void +vdev_missing_io_done(zio_t *zio) +{ + zio_next_stage(zio); +} + +vdev_ops_t vdev_missing_ops = { + vdev_missing_open, + vdev_missing_close, + vdev_default_asize, + vdev_missing_io_start, + vdev_missing_io_done, + NULL, + VDEV_TYPE_MISSING, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c new file mode 100644 index 0000000000..09831e1504 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -0,0 +1,286 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#include <sys/avl.h> + +/* + * Virtual device vector for disk I/O scheduling. + */ +int +vdev_queue_deadline_compare(const void *x1, const void *x2) +{ + const zio_t *z1 = x1; + const zio_t *z2 = x2; + + if (z1->io_deadline < z2->io_deadline) + return (-1); + if (z1->io_deadline > z2->io_deadline) + return (1); + + if (z1->io_offset < z2->io_offset) + return (-1); + if (z1->io_offset > z2->io_offset) + return (1); + + if (z1 < z2) + return (-1); + if (z1 > z2) + return (1); + + return (0); +} + +int +vdev_queue_offset_compare(const void *x1, const void *x2) +{ + const zio_t *z1 = x1; + const zio_t *z2 = x2; + + if (z1->io_offset < z2->io_offset) + return (-1); + if (z1->io_offset > z2->io_offset) + return (1); + + if (z1 < z2) + return (-1); + if (z1 > z2) + return (1); + + return (0); +} + +void +vdev_queue_init(vdev_t *vd) +{ + vdev_queue_t *vq = &vd->vdev_queue; + + mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); + + avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare, + sizeof (zio_t), offsetof(struct zio, io_deadline_node)); + + avl_create(&vq->vq_read_tree, vdev_queue_offset_compare, + sizeof (zio_t), offsetof(struct zio, io_offset_node)); + + avl_create(&vq->vq_write_tree, vdev_queue_offset_compare, + sizeof (zio_t), offsetof(struct zio, io_offset_node)); + + avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare, + sizeof (zio_t), offsetof(struct zio, io_offset_node)); +} + +void +vdev_queue_fini(vdev_t *vd) +{ + vdev_queue_t *vq = &vd->vdev_queue; + + avl_destroy(&vq->vq_deadline_tree); + avl_destroy(&vq->vq_read_tree); + avl_destroy(&vq->vq_write_tree); + avl_destroy(&vq->vq_pending_tree); + + mutex_destroy(&vq->vq_lock); +} + +static void +vdev_queue_agg_io_done(zio_t *aio) +{ + zio_t *dio; + uint64_t offset = 0; + + while ((dio = aio->io_delegate_list) != NULL) { + if (aio->io_type == 
ZIO_TYPE_READ) + bcopy((char *)aio->io_data + offset, dio->io_data, + dio->io_size); + offset += dio->io_size; + aio->io_delegate_list = dio->io_delegate_next; + dio->io_delegate_next = NULL; + dio->io_error = aio->io_error; + zio_next_stage(dio); + } + ASSERT3U(offset, ==, aio->io_size); + + zio_buf_free(aio->io_data, aio->io_size); +} + +#define IS_ADJACENT(io, nio) \ + ((io)->io_offset + (io)->io_size == (nio)->io_offset) + +typedef void zio_issue_func_t(zio_t *); + +static zio_t * +vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, + zio_issue_func_t **funcp) +{ + zio_t *fio, *lio, *aio, *dio; + avl_tree_t *tree; + uint64_t size; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + + *funcp = NULL; + + if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || + avl_numnodes(&vq->vq_deadline_tree) == 0) + return (NULL); + + fio = lio = avl_first(&vq->vq_deadline_tree); + + tree = fio->io_vdev_tree; + size = fio->io_size; + + while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) && + size + dio->io_size <= vq->vq_agg_limit) { + dio->io_delegate_next = fio; + fio = dio; + size += dio->io_size; + } + + while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) && + size + dio->io_size <= vq->vq_agg_limit) { + lio->io_delegate_next = dio; + lio = dio; + size += dio->io_size; + } + + if (fio != lio) { + char *buf = zio_buf_alloc(size); + uint64_t offset = 0; + int nagg = 0; + + ASSERT(size <= vq->vq_agg_limit); + + aio = zio_vdev_child_io(fio, NULL, fio->io_vd, + fio->io_offset, buf, size, fio->io_type, + ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE, + vdev_queue_agg_io_done, NULL); + + aio->io_delegate_list = fio; + + for (dio = fio; dio != NULL; dio = dio->io_delegate_next) { + ASSERT(dio->io_type == aio->io_type); + if (dio->io_type == ZIO_TYPE_WRITE) + bcopy(dio->io_data, buf + offset, dio->io_size); + offset += dio->io_size; + avl_remove(&vq->vq_deadline_tree, dio); + avl_remove(tree, dio); + 
zio_vdev_io_bypass(dio); + nagg++; + } + + ASSERT(offset == size); + + dprintf("%5s T=%llu off=%8llx agg=%3d " + "old=%5llx new=%5llx\n", + zio_type_name[fio->io_type], + fio->io_deadline, fio->io_offset, nagg, fio->io_size, size); + + avl_add(&vq->vq_pending_tree, aio); + + *funcp = zio_nowait; + return (aio); + } + + avl_remove(&vq->vq_deadline_tree, fio); + avl_remove(tree, fio); + + avl_add(&vq->vq_pending_tree, fio); + + *funcp = zio_next_stage; + + return (fio); +} + +zio_t * +vdev_queue_io(zio_t *zio) +{ + vdev_queue_t *vq = &zio->io_vd->vdev_queue; + zio_t *nio; + zio_issue_func_t *func; + + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + + if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) + return (zio); + + zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; + + if (zio->io_type == ZIO_TYPE_READ) + zio->io_vdev_tree = &vq->vq_read_tree; + else + zio->io_vdev_tree = &vq->vq_write_tree; + + mutex_enter(&vq->vq_lock); + + zio->io_deadline = (zio->io_timestamp >> vq->vq_time_shift) + + zio->io_priority; + + avl_add(&vq->vq_deadline_tree, zio); + avl_add(zio->io_vdev_tree, zio); + + nio = vdev_queue_io_to_issue(vq, vq->vq_min_pending, &func); + + mutex_exit(&vq->vq_lock); + + if (nio == NULL || func != zio_nowait) + return (nio); + + func(nio); + return (NULL); +} + +void +vdev_queue_io_done(zio_t *zio) +{ + vdev_queue_t *vq = &zio->io_vd->vdev_queue; + zio_t *nio; + zio_issue_func_t *func; + int i; + + mutex_enter(&vq->vq_lock); + + avl_remove(&vq->vq_pending_tree, zio); + + for (i = 0; i < vq->vq_ramp_rate; i++) { + nio = vdev_queue_io_to_issue(vq, vq->vq_max_pending, &func); + if (nio == NULL) + break; + mutex_exit(&vq->vq_lock); + if (func == zio_next_stage) + zio_vdev_io_reissue(nio); + func(nio); + mutex_enter(&vq->vq_lock); + } + + mutex_exit(&vq->vq_lock); +} diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c new file mode 100644 index 0000000000..54547a3c97 --- /dev/null +++ 
b/usr/src/uts/common/fs/zfs/vdev_raidz.c @@ -0,0 +1,599 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> +#include <sys/fs/zfs.h> + +/* + * Virtual device vector for RAID-Z. + */ + +/* + * We currently allow up to two-way replication (i.e. single-fault + * reconstruction) models in RAID-Z vdevs. The blocks in such vdevs + * must all be multiples of two times the leaf vdev blocksize. 
+ */ +#define VDEV_RAIDZ_ALIGN 2ULL + +typedef struct raidz_col { + uint64_t rc_col; + uint64_t rc_offset; + uint64_t rc_size; + void *rc_data; + int rc_error; + short rc_tried; + short rc_skipped; +} raidz_col_t; + +typedef struct raidz_map { + uint64_t rm_cols; + uint64_t rm_bigcols; + uint64_t rm_asize; + int rm_missing_child; + int rm_type; + int rm_firstdatacol; + raidz_col_t rm_col[1]; +} raidz_map_t; + +#define RAIDZ_SINGLE 0 +#define RAIDZ_PARITY 1 + +static raidz_map_t * +vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, + int raid_type) +{ + raidz_map_t *rm; + uint64_t b = zio->io_offset >> unit_shift; + uint64_t s = zio->io_size >> unit_shift; + uint64_t f = b % dcols; + uint64_t o = (b / dcols) << unit_shift; + uint64_t q, r, c, bc, col, acols, coff; + int firstdatacol; + + switch (raid_type) { + case RAIDZ_SINGLE: + q = s / dcols; + r = s - q * dcols; + bc = r; + firstdatacol = 0; + break; + case RAIDZ_PARITY: + q = s / (dcols - 1); + r = s - q * (dcols - 1); + bc = r + !!r; + firstdatacol = 1; + break; + } + + acols = (q == 0 ? 
bc : dcols); + + rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP); + + rm->rm_cols = acols; + rm->rm_bigcols = bc; + rm->rm_asize = 0; + rm->rm_missing_child = -1; + rm->rm_type = raid_type; + rm->rm_firstdatacol = firstdatacol; + + for (c = 0; c < acols; c++) { + col = f + c; + coff = o; + if (col >= dcols) { + col -= dcols; + coff += 1ULL << unit_shift; + } + rm->rm_col[c].rc_col = col; + rm->rm_col[c].rc_offset = coff; + rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift; + rm->rm_col[c].rc_data = NULL; + rm->rm_col[c].rc_error = 0; + rm->rm_col[c].rc_tried = 0; + rm->rm_col[c].rc_skipped = 0; + rm->rm_asize += rm->rm_col[c].rc_size; + } + + rm->rm_asize = P2ROUNDUP(rm->rm_asize, VDEV_RAIDZ_ALIGN << unit_shift); + + for (c = 0; c < rm->rm_firstdatacol; c++) + rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); + + rm->rm_col[c].rc_data = zio->io_data; + + for (c = c + 1; c < acols; c++) + rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + + rm->rm_col[c - 1].rc_size; + + if (raid_type == RAIDZ_PARITY) { + /* + * To prevent hot parity disks, switch the parity and data + * columns every 1MB. 
+ */ + ASSERT(rm->rm_cols >= 2); + ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); + + if (zio->io_offset & (1ULL << 20)) { + col = rm->rm_col[0].rc_col; + o = rm->rm_col[0].rc_offset; + rm->rm_col[0].rc_col = rm->rm_col[1].rc_col; + rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; + rm->rm_col[1].rc_col = col; + rm->rm_col[1].rc_offset = o; + } + } + + zio->io_vsd = rm; + return (rm); +} + +static void +vdev_raidz_map_free(zio_t *zio) +{ + raidz_map_t *rm = zio->io_vsd; + int c; + + for (c = 0; c < rm->rm_firstdatacol; c++) + zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); + + kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); + zio->io_vsd = NULL; +} + +static void +vdev_raidz_reconstruct(raidz_map_t *rm, int x) +{ + uint64_t *dst, *src, count, xsize, csize; + int i, c; + + for (c = 0; c < rm->rm_cols; c++) { + if (c == x) + continue; + src = rm->rm_col[c].rc_data; + dst = rm->rm_col[x].rc_data; + csize = rm->rm_col[c].rc_size; + xsize = rm->rm_col[x].rc_size; + count = MIN(csize, xsize) / sizeof (uint64_t); + if (c == !x) { + /* + * The initial copy happens at either c == 0 or c == 1. + * Both of these columns are 'big' columns, so we'll + * definitely initialize all of column x. 
+ */ + ASSERT3U(xsize, <=, csize); + for (i = 0; i < count; i++) + *dst++ = *src++; + } else { + for (i = 0; i < count; i++) + *dst++ ^= *src++; + } + } +} + +static int +vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) +{ + vdev_t *cvd; + int c, error; + int lasterror = 0; + int numerrors = 0; + + /* + * XXX -- minimum children should be raid-type-specific + */ + if (vd->vdev_children < 2) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + for (c = 0; c < vd->vdev_children; c++) { + cvd = vd->vdev_child[c]; + + if ((error = vdev_open(cvd)) != 0) { + lasterror = error; + numerrors++; + continue; + } + + *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; + *ashift = cvd->vdev_ashift; + } + + *asize *= vd->vdev_children; + + if (numerrors > 1) { + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (lasterror); + } + + return (0); +} + +static void +vdev_raidz_close(vdev_t *vd) +{ + int c; + + for (c = 0; c < vd->vdev_children; c++) + vdev_close(vd->vdev_child[c]); +} + +static uint64_t +vdev_raidz_asize(vdev_t *vd, uint64_t psize) +{ + uint64_t asize; + uint64_t cols = vd->vdev_children; + + /* + * These calculations assume RAIDZ_PARITY. 
+ */ + asize = psize >> vd->vdev_ashift; + asize += (asize + cols - 2) / (cols - 1); + asize = P2ROUNDUP(asize, VDEV_RAIDZ_ALIGN) << vd->vdev_ashift; + + return (asize); +} + +static void +vdev_raidz_child_done(zio_t *zio) +{ + raidz_col_t *rc = zio->io_private; + + rc->rc_error = zio->io_error; + rc->rc_tried = 1; + rc->rc_skipped = 0; +} + +static void +vdev_raidz_repair_done(zio_t *zio) +{ + zio_buf_free(zio->io_data, zio->io_size); +} + +static void +vdev_raidz_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_t *cvd; + blkptr_t *bp = zio->io_bp; + raidz_map_t *rm; + raidz_col_t *rc; + int c; + + rm = vdev_raidz_map_alloc(zio, vd->vdev_ashift, vd->vdev_children, + RAIDZ_PARITY); + + if (DVA_GET_GANG(ZIO_GET_DVA(zio))) { + ASSERT3U(rm->rm_asize, ==, + vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE)); + ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); + } else { + ASSERT3U(rm->rm_asize, ==, DVA_GET_ASIZE(ZIO_GET_DVA(zio))); + ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); + } + + if (zio->io_type == ZIO_TYPE_WRITE) { + + /* + * Generate RAID parity in virtual column 0. 
+ */ + vdev_raidz_reconstruct(rm, 0); + + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_col]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_data, rc->rc_size, + zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, + vdev_raidz_child_done, rc)); + } + zio_wait_children_done(zio); + return; + } + + ASSERT(zio->io_type == ZIO_TYPE_READ); + + for (c = rm->rm_cols - 1; c >= 0; c--) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_col]; + if (vdev_is_dead(cvd)) { + rm->rm_missing_child = c; + rc->rc_error = ENXIO; + rc->rc_tried = 1; /* don't even try */ + rc->rc_skipped = 1; + continue; + } + if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) { + rm->rm_missing_child = c; + rc->rc_error = ESTALE; + rc->rc_skipped = 1; + continue; + } + if (c >= rm->rm_firstdatacol || rm->rm_missing_child != -1 || + (zio->io_flags & ZIO_FLAG_SCRUB)) { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_data, rc->rc_size, + zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, + vdev_raidz_child_done, rc)); + } + } + + zio_wait_children_done(zio); +} + +static void +vdev_raidz_io_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_t *cvd; + raidz_map_t *rm = zio->io_vsd; + raidz_col_t *rc; + blkptr_t *bp = zio->io_bp; + int unexpected_errors = 0; + int c; + + ASSERT(bp != NULL); /* XXX need to add code to enforce this */ + + zio->io_error = 0; + zio->io_numerrors = 0; + + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + + /* + * We preserve any EIOs because those may be worth retrying; + * whereas ECKSUM and ENXIO are more likely to be persistent. + */ + if (rc->rc_error) { + if (zio->io_error != EIO) + zio->io_error = rc->rc_error; + if (!rc->rc_skipped) + unexpected_errors++; + zio->io_numerrors++; + } + } + + if (zio->io_type == ZIO_TYPE_WRITE) { + /* + * If this is not a failfast write, and we were able to + * write enough columns to reconstruct the data, good enough. 
+ */ + /* XXPOLICY */ + if (zio->io_numerrors <= rm->rm_firstdatacol && + !(zio->io_flags & ZIO_FLAG_FAILFAST)) + zio->io_error = 0; + + vdev_raidz_map_free(zio); + zio_next_stage(zio); + return; + } + + ASSERT(zio->io_type == ZIO_TYPE_READ); + + /* + * If there were no I/O errors, and the data checksums correctly, + * the read is complete. + */ + /* XXPOLICY */ + if (zio->io_numerrors == 0 && zio_checksum_error(zio) == 0) { + ASSERT(unexpected_errors == 0); + ASSERT(zio->io_error == 0); + + /* + * We know the data's good. If we read the parity, + * verify that it's good as well. If not, fix it. + */ + for (c = 0; c < rm->rm_firstdatacol; c++) { + void *orig; + rc = &rm->rm_col[c]; + if (!rc->rc_tried) + continue; + orig = zio_buf_alloc(rc->rc_size); + bcopy(rc->rc_data, orig, rc->rc_size); + vdev_raidz_reconstruct(rm, c); + if (bcmp(orig, rc->rc_data, rc->rc_size) != 0) { + vdev_checksum_error(zio, + vd->vdev_child[rc->rc_col]); + rc->rc_error = ECKSUM; + unexpected_errors++; + } + zio_buf_free(orig, rc->rc_size); + } + goto done; + } + + /* + * If there was exactly one I/O error, it's the one we expected, + * and the reconstructed data checksums, the read is complete. + * This happens when one child is offline and vdev_fault_assess() + * knows it, or when one child has stale data and the DTL knows it. + */ + if (zio->io_numerrors == 1 && (c = rm->rm_missing_child) != -1) { + rc = &rm->rm_col[c]; + ASSERT(unexpected_errors == 0); + ASSERT(rc->rc_error == ENXIO || rc->rc_error == ESTALE); + vdev_raidz_reconstruct(rm, c); + if (zio_checksum_error(zio) == 0) { + zio->io_error = 0; + goto done; + } + } + + /* + * This isn't a typical error -- either we got a read error or + * more than one child claimed a problem. Read every block we + * haven't already so we can try combinatorial reconstruction. 
+ */ + unexpected_errors = 1; + rm->rm_missing_child = -1; + + for (c = 0; c < rm->rm_cols; c++) + if (!rm->rm_col[c].rc_tried) + break; + + if (c != rm->rm_cols) { + zio->io_error = 0; + zio_vdev_io_redone(zio); + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + if (rc->rc_tried) + continue; + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[rc->rc_col], + rc->rc_offset, rc->rc_data, rc->rc_size, + zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, + vdev_raidz_child_done, rc)); + } + zio_wait_children_done(zio); + return; + } + + /* + * If there were more errors than parity disks, give up. + */ + if (zio->io_numerrors > rm->rm_firstdatacol) { + ASSERT(zio->io_error != 0); + goto done; + } + + /* + * The number of I/O errors is correctable. Correct them here. + */ + ASSERT(zio->io_numerrors <= rm->rm_firstdatacol); + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + ASSERT(rc->rc_tried); + if (rc->rc_error) { + vdev_raidz_reconstruct(rm, c); + if (zio_checksum_error(zio) == 0) + zio->io_error = 0; + else + zio->io_error = rc->rc_error; + goto done; + } + } + + /* + * There were no I/O errors, but the data doesn't checksum. + * Try all permutations to see if we can find one that does. + */ + ASSERT(zio->io_numerrors == 0); + for (c = 0; c < rm->rm_cols; c++) { + void *orig; + rc = &rm->rm_col[c]; + + orig = zio_buf_alloc(rc->rc_size); + bcopy(rc->rc_data, orig, rc->rc_size); + vdev_raidz_reconstruct(rm, c); + + if (zio_checksum_error(zio) == 0) { + zio_buf_free(orig, rc->rc_size); + zio->io_error = 0; + /* + * If this child didn't know that it returned bad data, + * inform it. + */ + if (rc->rc_tried && rc->rc_error == 0) + vdev_checksum_error(zio, + vd->vdev_child[rc->rc_col]); + rc->rc_error = ECKSUM; + goto done; + } + + bcopy(orig, rc->rc_data, rc->rc_size); + zio_buf_free(orig, rc->rc_size); + } + + /* + * All combinations failed to checksum. 
+ */ + zio->io_error = ECKSUM; + +done: + zio_checksum_verified(zio); + + if (zio->io_error == 0 && (spa_mode & FWRITE) && + (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { + /* + * Use the good data we have in hand to repair damaged children. + */ + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_col]; + + if (rc->rc_error) { + /* + * Make a copy of the data because we're + * going to free the RAID-Z map below. + */ + void *data = zio_buf_alloc(rc->rc_size); + bcopy(rc->rc_data, data, rc->rc_size); + + dprintf("%s resilvered %s @ 0x%llx error %d\n", + vdev_description(vd), + vdev_description(cvd), + zio->io_offset, rc->rc_error); + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, data, rc->rc_size, + ZIO_TYPE_WRITE, zio->io_priority, + ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE, + vdev_raidz_repair_done, NULL)); + } + } + } + + vdev_raidz_map_free(zio); + zio_next_stage(zio); +} + +static void +vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) +{ + if (faulted > 1) + vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); + else if (degraded + faulted != 0) + vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + else + vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); +} + +vdev_ops_t vdev_raidz_ops = { + vdev_raidz_open, + vdev_raidz_close, + vdev_raidz_asize, + vdev_raidz_io_start, + vdev_raidz_io_done, + vdev_raidz_state_change, + VDEV_TYPE_RAIDZ, /* name of this vdev type */ + B_FALSE /* not a leaf vdev */ +}; diff --git a/usr/src/uts/common/fs/zfs/vdev_root.c b/usr/src/uts/common/fs/zfs/vdev_root.c new file mode 100644 index 0000000000..4e44b5bb05 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/vdev_root.c @@ -0,0 +1,98 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). 
You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#include <sys/fs/zfs.h> + +/* + * Virtual device vector for the pool's root vdev. 
+ */ + +static int +vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) +{ + vdev_t *cvd; + int c, error; + int lasterror = 0; + + if (vd->vdev_children == 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + for (c = 0; c < vd->vdev_children; c++) { + cvd = vd->vdev_child[c]; + + if ((error = vdev_open(cvd)) != 0) { + lasterror = error; + continue; + } + + *asize += cvd->vdev_asize; + *ashift = MAX(*ashift, cvd->vdev_ashift); + } + + if (lasterror) + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + + return (lasterror); +} + +static void +vdev_root_close(vdev_t *vd) +{ + int c; + + for (c = 0; c < vd->vdev_children; c++) + vdev_close(vd->vdev_child[c]); +} + +static void +vdev_root_state_change(vdev_t *vd, int faulted, int degraded) +{ + if (faulted > 0) + vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); + else if (degraded != 0) + vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + else + vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); +} + +vdev_ops_t vdev_root_ops = { + vdev_root_open, + vdev_root_close, + vdev_default_asize, + NULL, /* io_start - not applicable to the root */ + NULL, /* io_done - not applicable to the root */ + vdev_root_state_change, + VDEV_TYPE_ROOT, /* name of this vdev type */ + B_FALSE /* not a leaf vdev */ +}; diff --git a/usr/src/uts/common/fs/zfs/zap.c b/usr/src/uts/common/fs/zfs/zap.c new file mode 100644 index 0000000000..1eddb9c250 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zap.c @@ -0,0 +1,1010 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + + +/* + * This file contains the top half of the zfs directory structure + * implementation. The bottom half is in zap_leaf.c. + * + * The zdir is an extendable hash data structure. There is a table of + * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are + * each a constant size and hold a variable number of directory entries. + * The buckets (aka "leaf nodes") are implemented in zap_leaf.c. + * + * The pointer table holds a power of 2 number of pointers. + * (1<<zap_t->zd_data->zd_phys->zd_prefix_len). 
The bucket pointed to + * by the pointer at index i in the table holds entries whose hash value + * has a zd_prefix_len - bit prefix + */ + +#include <sys/spa.h> +#include <sys/dmu.h> +#include <sys/zfs_context.h> +#include <sys/zap.h> +#include <sys/zap_impl.h> +#include <sys/zap_leaf.h> + +#define MIN_FREE (ZAP_LEAF_NUMCHUNKS*9/10) + +static void zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx); +static int zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx); +static zap_leaf_t *zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, + dmu_tx_t *tx, krw_t lt); +static void zap_put_leaf(zap_leaf_t *l); +static void zap_leaf_pageout(dmu_buf_t *db, void *vl); + + +void +fzap_byteswap(void *vbuf, size_t size) +{ + uint64_t block_type; + + ASSERT(size == (1<<ZAP_BLOCK_SHIFT)); + block_type = *(uint64_t *)vbuf; + + switch (block_type) { + case ZBT_LEAF: + case BSWAP_64(ZBT_LEAF): + zap_leaf_byteswap(vbuf); + return; + case ZBT_HEADER: + case BSWAP_64(ZBT_HEADER): + default: + /* it's a ptrtbl block */ + byteswap_uint64_array(vbuf, 1<<ZAP_BLOCK_SHIFT); + return; + } +} + +void +fzap_upgrade(zap_t *zap, dmu_tx_t *tx) +{ + dmu_buf_t *db; + zap_leaf_t *l; + int i; + zap_phys_t *zp; + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + zap->zap_ismicro = FALSE; + + (void) dmu_buf_update_user(zap->zap_dbuf, zap, zap, + &zap->zap_f.zap_phys, zap_pageout); + + mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); + + zp = zap->zap_f.zap_phys; + /* + * explicitly zero it since it might be coming from an + * initialized microzap + */ + ASSERT3U(sizeof (zap_phys_t), ==, zap->zap_dbuf->db_size); + bzero(zp, sizeof (zap_phys_t)); + zp->zap_block_type = ZBT_HEADER; + zp->zap_magic = ZAP_MAGIC; + + zp->zap_ptrtbl.zt_shift = ZAP_PTRTBL_MIN_SHIFT; + + zp->zap_freeblk = 2; /* block 1 will be the first leaf */ + zp->zap_num_leafs = 1; + zp->zap_num_entries = 0; + zp->zap_salt = zap->zap_salt; + + for (i = 0; i < (1<<ZAP_PTRTBL_MIN_SHIFT); i++) + zp->zap_leafs[i] = 1; /* block 1 will be the first leaf */ + + /* + * set 
up block 1 - the first leaf + */ + db = dmu_buf_hold(zap->zap_objset, zap->zap_object, + 1<<ZAP_BLOCK_SHIFT); + dmu_buf_will_dirty(db, tx); + + l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); + l->l_dbuf = db; + l->l_phys = db->db_data; + + zap_leaf_init(l); + + kmem_free(l, sizeof (zap_leaf_t)); + dmu_buf_rele(db); +} + +static int +zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx) +{ + if (RW_WRITE_HELD(&zap->zap_rwlock)) + return (1); + if (rw_tryupgrade(&zap->zap_rwlock)) { + dmu_buf_will_dirty(zap->zap_dbuf, tx); + return (1); + } + return (0); +} + +/* + * Generic routines for dealing with the pointer & cookie tables. + */ + +static void +zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, + void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n), + dmu_tx_t *tx) +{ + uint64_t b, newblk; + dmu_buf_t *db_old, *db_new; + int hepb = 1<<(ZAP_BLOCK_SHIFT-4); + /* hepb = half the number of entries in a block */ + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + ASSERT(tbl->zt_blk != 0); + ASSERT(tbl->zt_numblks > 0); + + if (tbl->zt_nextblk != 0) { + newblk = tbl->zt_nextblk; + } else { + newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2, tx); + tbl->zt_nextblk = newblk; + ASSERT3U(tbl->zt_blks_copied, ==, 0); + dmu_prefetch(zap->zap_objset, zap->zap_object, + tbl->zt_blk << ZAP_BLOCK_SHIFT, tbl->zt_numblks << + ZAP_BLOCK_SHIFT); + } + + /* + * Copy the ptrtbl from the old to new location, leaving the odd + * entries blank as we go. 
+ */ + + b = tbl->zt_blks_copied; + db_old = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_blk + b) << ZAP_BLOCK_SHIFT); + dmu_buf_read(db_old); + + /* first half of entries in old[b] go to new[2*b+0] */ + db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (newblk + 2*b+0) << ZAP_BLOCK_SHIFT); + dmu_buf_will_dirty(db_new, tx); + transfer_func(db_old->db_data, db_new->db_data, hepb); + dmu_buf_rele(db_new); + + /* second half of entries in old[b] go to new[2*b+1] */ + db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (newblk + 2*b+1) << ZAP_BLOCK_SHIFT); + dmu_buf_will_dirty(db_new, tx); + transfer_func((uint64_t *)db_old->db_data + hepb, + db_new->db_data, hepb); + dmu_buf_rele(db_new); + + dmu_buf_rele(db_old); + + tbl->zt_blks_copied++; + + dprintf("copied block %llu of %llu\n", + tbl->zt_blks_copied, tbl->zt_numblks); + + if (tbl->zt_blks_copied == tbl->zt_numblks) { + dmu_free_range(zap->zap_objset, zap->zap_object, + tbl->zt_blk << ZAP_BLOCK_SHIFT, + tbl->zt_numblks << ZAP_BLOCK_SHIFT, tx); + + tbl->zt_blk = newblk; + tbl->zt_numblks *= 2; + tbl->zt_shift++; + tbl->zt_nextblk = 0; + tbl->zt_blks_copied = 0; + + dprintf("finished; numblocks now %llu (%lluk entries)\n", + tbl->zt_numblks, 1<<(tbl->zt_shift-10)); + } +} + +static uint64_t +zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, + dmu_tx_t *tx) +{ + uint64_t blk, off, oldval; + dmu_buf_t *db; + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT(tbl->zt_blk != 0); + + dprintf("storing %llx at index %llx\n", val, idx); + + blk = idx >> (ZAP_BLOCK_SHIFT-3); + off = idx & ((1<<(ZAP_BLOCK_SHIFT-3))-1); + + db = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_blk + blk) << ZAP_BLOCK_SHIFT); + dmu_buf_will_dirty(db, tx); + oldval = ((uint64_t *)db->db_data)[off]; + ((uint64_t *)db->db_data)[off] = val; + dmu_buf_rele(db); + + if (tbl->zt_nextblk != 0) { + idx *= 2; + blk = idx >> (ZAP_BLOCK_SHIFT-3); + off = idx & ((1<<(ZAP_BLOCK_SHIFT-3))-1); + 
+ db = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_nextblk + blk) << ZAP_BLOCK_SHIFT); + dmu_buf_will_dirty(db, tx); + ((uint64_t *)db->db_data)[off] = val; + ((uint64_t *)db->db_data)[off+1] = val; + dmu_buf_rele(db); + } + + return (oldval); +} + +static uint64_t +zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx) +{ + uint64_t blk, off, val; + dmu_buf_t *db; + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + blk = idx >> (ZAP_BLOCK_SHIFT-3); + off = idx & ((1<<(ZAP_BLOCK_SHIFT-3))-1); + + db = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_blk + blk) << ZAP_BLOCK_SHIFT); + dmu_buf_read(db); + val = ((uint64_t *)db->db_data)[off]; + dmu_buf_rele(db); + return (val); +} + +/* + * Routines for growing the ptrtbl. + */ + +static void +zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n) +{ + int i; + for (i = 0; i < n; i++) { + uint64_t lb = src[i]; + dst[2*i+0] = lb; + dst[2*i+1] = lb; + } +} + +static void +zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) +{ + if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == 32) + return; + + if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { + /* + * The ptrtbl can no longer be contained in the + * header block. Give it its own entire block, which + * will quadruple the size of the ptrtbl. 
+ */ + uint64_t newblk; + dmu_buf_t *db_new; + + ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==, + ZAP_PTRTBL_MIN_SHIFT); + ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0); + + newblk = zap_allocate_blocks(zap, 1, tx); + db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object, + newblk << ZAP_BLOCK_SHIFT); + + dmu_buf_will_dirty(db_new, tx); + zap_ptrtbl_transfer(zap->zap_f.zap_phys->zap_leafs, + db_new->db_data, 1 << ZAP_PTRTBL_MIN_SHIFT); + dmu_buf_rele(db_new); + + zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk; + zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1; + zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++; + + ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==, + zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << + (ZAP_BLOCK_SHIFT-3)); + } else { + zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl, + zap_ptrtbl_transfer, tx); + } +} + +static void +zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx) +{ + dmu_buf_will_dirty(zap->zap_dbuf, tx); + mutex_enter(&zap->zap_f.zap_num_entries_mtx); + + ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta); + + zap->zap_f.zap_phys->zap_num_entries += delta; + + mutex_exit(&zap->zap_f.zap_num_entries_mtx); +} + +uint64_t +zap_allocate_blocks(zap_t *zap, int nblocks, dmu_tx_t *tx) +{ + uint64_t newblk; + ASSERT(tx != NULL); + if (!RW_WRITE_HELD(&zap->zap_rwlock)) { + dmu_buf_will_dirty(zap->zap_dbuf, tx); + } + newblk = atomic_add_64_nv(&zap->zap_f.zap_phys->zap_freeblk, nblocks) - + nblocks; + return (newblk); +} + + +/* + * This function doesn't increment zap_num_leafs because it's used to + * allocate a leaf chain, which doesn't count against zap_num_leafs. + * The directory must be held exclusively for this tx. 
+ */ +zap_leaf_t * +zap_create_leaf(zap_t *zap, dmu_tx_t *tx) +{ + void *winner; + zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP); + + ASSERT(tx != NULL); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + /* hence we already dirtied zap->zap_dbuf */ + + rw_init(&l->l_rwlock, 0, 0, 0); + rw_enter(&l->l_rwlock, RW_WRITER); + l->l_blkid = zap_allocate_blocks(zap, 1, tx); + l->l_next = NULL; + l->l_dbuf = NULL; + l->l_phys = NULL; + + l->l_dbuf = dmu_buf_hold(zap->zap_objset, zap->zap_object, + l->l_blkid << ZAP_BLOCK_SHIFT); + winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout); + ASSERT(winner == NULL); + dmu_buf_will_dirty(l->l_dbuf, tx); + + zap_leaf_init(l); + + return (l); +} + +/* ARGSUSED */ +void +zap_destroy_leaf(zap_t *zap, zap_leaf_t *l, dmu_tx_t *tx) +{ + /* uint64_t offset = l->l_blkid << ZAP_BLOCK_SHIFT; */ + rw_exit(&l->l_rwlock); + dmu_buf_rele(l->l_dbuf); + /* XXX there are still holds on this block, so we can't free it? */ + /* dmu_free_range(zap->zap_objset, zap->zap_object, */ + /* offset, 1<<ZAP_BLOCK_SHIFT, tx); */ +} + +int +fzap_count(zap_t *zap, uint64_t *count) +{ + ASSERT(!zap->zap_ismicro); + mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */ + *count = zap->zap_f.zap_phys->zap_num_entries; + mutex_exit(&zap->zap_f.zap_num_entries_mtx); + return (0); +} + +/* + * Routines for obtaining zap_leaf_t's + */ + +static void +zap_put_leaf(zap_leaf_t *l) +{ + zap_leaf_t *nl = l->l_next; + while (nl) { + zap_leaf_t *nnl = nl->l_next; + rw_exit(&nl->l_rwlock); + dmu_buf_rele(nl->l_dbuf); + nl = nnl; + } + rw_exit(&l->l_rwlock); + dmu_buf_rele(l->l_dbuf); +} + +_NOTE(ARGSUSED(0)) +static void +zap_leaf_pageout(dmu_buf_t *db, void *vl) +{ + zap_leaf_t *l = vl; + + rw_destroy(&l->l_rwlock); + kmem_free(l, sizeof (zap_leaf_t)); +} + +static zap_leaf_t * +zap_open_leaf(uint64_t blkid, dmu_buf_t *db) +{ + zap_leaf_t *l, *winner; + + ASSERT(blkid != 0); + + l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP); + 
rw_init(&l->l_rwlock, 0, 0, 0); + rw_enter(&l->l_rwlock, RW_WRITER); + l->l_blkid = blkid; + l->l_next = NULL; + l->l_dbuf = db; + l->l_phys = NULL; + + winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout); + + rw_exit(&l->l_rwlock); + if (winner != NULL) { + /* someone else set it first */ + zap_leaf_pageout(NULL, l); + l = winner; + } + + return (l); +} + +static zap_leaf_t * +zap_get_leaf_byblk_impl(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt) +{ + dmu_buf_t *db; + zap_leaf_t *l; + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + db = dmu_buf_hold(zap->zap_objset, zap->zap_object, + blkid << ZAP_BLOCK_SHIFT); + + ASSERT3U(db->db_object, ==, zap->zap_object); + ASSERT3U(db->db_offset, ==, blkid << ZAP_BLOCK_SHIFT); + ASSERT3U(db->db_size, ==, 1 << ZAP_BLOCK_SHIFT); + ASSERT(blkid != 0); + + dmu_buf_read(db); + l = dmu_buf_get_user(db); + + if (l == NULL) + l = zap_open_leaf(blkid, db); + + rw_enter(&l->l_rwlock, lt); + /* + * Must lock before dirtying, otherwise l->l_phys could change, + * causing ASSERT below to fail. 
+ */ + if (lt == RW_WRITER) + dmu_buf_will_dirty(db, tx); + ASSERT3U(l->l_blkid, ==, blkid); + ASSERT3P(l->l_dbuf, ==, db); + ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data); + ASSERT3U(l->lh_block_type, ==, ZBT_LEAF); + ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC); + + return (l); +} + +static zap_leaf_t * +zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt) +{ + zap_leaf_t *l, *nl; + + l = zap_get_leaf_byblk_impl(zap, blkid, tx, lt); + + nl = l; + while (nl->lh_next != 0) { + zap_leaf_t *nnl; + nnl = zap_get_leaf_byblk_impl(zap, nl->lh_next, tx, lt); + nl->l_next = nnl; + nl = nnl; + } + + return (l); +} + +static uint64_t +zap_idx_to_blk(zap_t *zap, uint64_t idx) +{ + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { + ASSERT3U(idx, <, + (1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift)); + return (zap->zap_f.zap_phys->zap_leafs[idx]); + } else { + return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl, + idx)); + } +} + +static void +zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) +{ + ASSERT(tx != NULL); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) { + zap->zap_f.zap_phys->zap_leafs[idx] = blk; + } else { + (void) zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl, + idx, blk, tx); + } +} + +static zap_leaf_t * +zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt) +{ + uint64_t idx; + zap_leaf_t *l; + + ASSERT(zap->zap_dbuf == NULL || + zap->zap_f.zap_phys == zap->zap_dbuf->db_data); + ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC); + idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); + l = zap_get_leaf_byblk(zap, zap_idx_to_blk(zap, idx), tx, lt); + + ASSERT3U(ZAP_HASH_IDX(h, l->lh_prefix_len), ==, l->lh_prefix); + + return (l); +} + + +static zap_leaf_t * +zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx) +{ + zap_leaf_t *nl; + int prefix_diff, i, err; + 
uint64_t sibling; + + ASSERT3U(l->lh_prefix_len, <=, + zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + ASSERT3U(ZAP_HASH_IDX(hash, l->lh_prefix_len), ==, l->lh_prefix); + + if (zap_tryupgradedir(zap, tx) == 0) { + /* failed to upgrade */ + int old_prefix_len = l->lh_prefix_len; + objset_t *os = zap->zap_objset; + uint64_t object = zap->zap_object; + + zap_put_leaf(l); + zap_unlockdir(zap); + err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, &zap); + ASSERT3U(err, ==, 0); + ASSERT(!zap->zap_ismicro); + l = zap_deref_leaf(zap, hash, tx, RW_WRITER); + + if (l->lh_prefix_len != old_prefix_len) + /* it split while our locks were down */ + return (l); + } + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + if (l->lh_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) { + /* There's only one pointer to us. Chain on another leaf blk. */ + (void) zap_leaf_chainmore(l, zap_create_leaf(zap, tx)); + dprintf("chaining leaf %x/%d\n", l->lh_prefix, + l->lh_prefix_len); + return (l); + } + + ASSERT3U(ZAP_HASH_IDX(hash, l->lh_prefix_len), ==, l->lh_prefix); + + /* There's more than one pointer to us. Split this leaf. 
*/ + nl = zap_leaf_split(zap, l, tx); + + /* set sibling pointers */ + prefix_diff = + zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - l->lh_prefix_len; + sibling = (ZAP_HASH_IDX(hash, l->lh_prefix_len) | 1) << prefix_diff; + for (i = 0; i < (1ULL<<prefix_diff); i++) { + ASSERT3U(zap_idx_to_blk(zap, sibling+i), ==, l->l_blkid); + zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx); + /* dprintf("set %d to %u %x\n", sibling+i, nl->l_blkid, nl); */ + } + + zap->zap_f.zap_phys->zap_num_leafs++; + + if (hash & (1ULL << (64 - l->lh_prefix_len))) { + /* we want the sibling */ + zap_put_leaf(l); + l = nl; + } else { + zap_put_leaf(nl); + } + + return (l); +} + +static void +zap_put_leaf_maybe_grow_ptrtbl(zap_t *zap, + zap_leaf_t *l, dmu_tx_t *tx) +{ + int shift, err; + +again: + shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift; + + if (l->lh_prefix_len == shift && + (l->l_next != NULL || l->lh_nfree < MIN_FREE)) { + /* this leaf will soon make us grow the pointer table */ + + if (zap_tryupgradedir(zap, tx) == 0) { + objset_t *os = zap->zap_objset; + uint64_t zapobj = zap->zap_object; + uint64_t blkid = l->l_blkid; + + zap_put_leaf(l); + zap_unlockdir(zap); + err = zap_lockdir(os, zapobj, tx, + RW_WRITER, FALSE, &zap); + ASSERT3U(err, ==, 0); + l = zap_get_leaf_byblk(zap, blkid, tx, RW_READER); + goto again; + } + + zap_put_leaf(l); + zap_grow_ptrtbl(zap, tx); + } else { + zap_put_leaf(l); + } +} + + +static int +fzap_checksize(uint64_t integer_size, uint64_t num_integers) +{ + /* Only integer sizes supported by C */ + switch (integer_size) { + case 1: + case 2: + case 4: + case 8: + break; + default: + return (EINVAL); + } + + /* Make sure we won't overflow */ + if (integer_size * num_integers < num_integers) + return (EINVAL); + if (integer_size * num_integers > DMU_MAX_ACCESS) + return (EINVAL); + + return (0); +} + +/* + * Routines for maniplulating attributes. 
+ */ +int +fzap_lookup(zap_t *zap, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf) +{ + zap_leaf_t *l; + int err; + uint64_t hash; + zap_entry_handle_t zeh; + + err = fzap_checksize(integer_size, num_integers); + if (err != 0) + return (err); + + hash = zap_hash(zap, name); + l = zap_deref_leaf(zap, hash, NULL, RW_READER); + err = zap_leaf_lookup(l, name, hash, &zeh); + if (err != 0) + goto out; + err = zap_entry_read(&zeh, integer_size, num_integers, buf); +out: + zap_put_leaf(l); + return (err); +} + +int +fzap_add_cd(zap_t *zap, const char *name, + uint64_t integer_size, uint64_t num_integers, + const void *val, uint32_t cd, dmu_tx_t *tx, zap_leaf_t **lp) +{ + zap_leaf_t *l; + uint64_t hash; + int err; + zap_entry_handle_t zeh; + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT(!zap->zap_ismicro); + ASSERT(fzap_checksize(integer_size, num_integers) == 0); + + hash = zap_hash(zap, name); + l = zap_deref_leaf(zap, hash, tx, RW_WRITER); +retry: + err = zap_leaf_lookup(l, name, hash, &zeh); + if (err == 0) { + err = EEXIST; + goto out; + } + ASSERT(err == ENOENT); + + /* XXX If this leaf is chained, split it if we can. 
*/ + err = zap_entry_create(l, name, hash, cd, + integer_size, num_integers, val, &zeh); + + if (err == 0) { + zap_increment_num_entries(zap, 1, tx); + } else if (err == EAGAIN) { + l = zap_expand_leaf(zap, l, hash, tx); + goto retry; + } + +out: + if (lp) + *lp = l; + else + zap_put_leaf(l); + return (err); +} + +int +fzap_add(zap_t *zap, const char *name, + uint64_t integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + int err; + zap_leaf_t *l; + + err = fzap_checksize(integer_size, num_integers); + if (err != 0) + return (err); + + err = fzap_add_cd(zap, name, integer_size, num_integers, + val, ZAP_MAXCD, tx, &l); + + zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx); + return (err); +} + +int +fzap_update(zap_t *zap, const char *name, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_leaf_t *l; + uint64_t hash; + int err, create; + zap_entry_handle_t zeh; + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + err = fzap_checksize(integer_size, num_integers); + if (err != 0) + return (err); + + hash = zap_hash(zap, name); + l = zap_deref_leaf(zap, hash, tx, RW_WRITER); +retry: + err = zap_leaf_lookup(l, name, hash, &zeh); + create = (err == ENOENT); + ASSERT(err == 0 || err == ENOENT); + + /* XXX If this leaf is chained, split it if we can. 
*/ + + if (create) { + err = zap_entry_create(l, name, hash, ZAP_MAXCD, + integer_size, num_integers, val, &zeh); + if (err == 0) + zap_increment_num_entries(zap, 1, tx); + } else { + err = zap_entry_update(&zeh, integer_size, num_integers, val); + } + + if (err == EAGAIN) { + l = zap_expand_leaf(zap, l, hash, tx); + goto retry; + } + + zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx); + return (err); +} + +int +fzap_length(zap_t *zap, const char *name, + uint64_t *integer_size, uint64_t *num_integers) +{ + zap_leaf_t *l; + int err; + uint64_t hash; + zap_entry_handle_t zeh; + + hash = zap_hash(zap, name); + l = zap_deref_leaf(zap, hash, NULL, RW_READER); + err = zap_leaf_lookup(l, name, hash, &zeh); + if (err != 0) + goto out; + + if (integer_size) + *integer_size = zeh.zeh_integer_size; + if (num_integers) + *num_integers = zeh.zeh_num_integers; +out: + zap_put_leaf(l); + return (err); +} + +int +fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx) +{ + zap_leaf_t *l; + uint64_t hash; + int err; + zap_entry_handle_t zeh; + + hash = zap_hash(zap, name); + l = zap_deref_leaf(zap, hash, tx, RW_WRITER); + err = zap_leaf_lookup(l, name, hash, &zeh); + if (err == 0) { + zap_entry_remove(&zeh); + zap_increment_num_entries(zap, -1, tx); + } + zap_put_leaf(l); + dprintf("fzap_remove: ds=%p obj=%llu name=%s err=%d\n", + zap->zap_objset, zap->zap_object, name, err); + return (err); +} + +int +zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name) +{ + zap_cursor_t zc; + zap_attribute_t *za; + int err; + + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + for (zap_cursor_init(&zc, os, zapobj); + (err = zap_cursor_retrieve(&zc, za)) == 0; + zap_cursor_advance(&zc)) { + if (za->za_first_integer == value) { + (void) strcpy(name, za->za_name); + break; + } + } + kmem_free(za, sizeof (zap_attribute_t)); + return (err); +} + + +/* + * Routines for iterating over the attributes. 
+ */ + +int +fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) +{ + int err = ENOENT; + zap_entry_handle_t zeh; + zap_leaf_t *l; + + /* retrieve the next entry at or after zc_hash/zc_cd */ + /* if no entry, return ENOENT */ + +again: + l = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER); + err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh); + + if (err == ENOENT) { + uint64_t nocare = (1ULL << (64 - l->lh_prefix_len)) - 1; + zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1; + zc->zc_cd = 0; + if (l->lh_prefix_len == 0 || zc->zc_hash == 0) { + zc->zc_hash = -1ULL; + } else { + zap_put_leaf(l); + goto again; + } + } + + if (err == 0) { + zc->zc_hash = zeh.zeh_hash; + zc->zc_cd = zeh.zeh_cd; + za->za_integer_length = zeh.zeh_integer_size; + za->za_num_integers = zeh.zeh_num_integers; + if (zeh.zeh_num_integers == 0) { + za->za_first_integer = 0; + } else { + err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer); + ASSERT(err == 0 || err == EOVERFLOW); + } + err = zap_entry_read_name(&zeh, + sizeof (za->za_name), za->za_name); + ASSERT(err == 0); + } + zap_put_leaf(l); + return (err); +} + + +static void +zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) +{ + int i; + uint64_t lastblk = 0; + + /* + * NB: if a leaf has more pointers than an entire ptrtbl block + * can hold, then it'll be accounted for more than once, since + * we won't have lastblk. 
+ */ + for (i = 0; i < len; i++) { + zap_leaf_t *l; + + if (tbl[i] == lastblk) + continue; + lastblk = tbl[i]; + + l = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER); + + zap_stats_leaf(zap, l, zs); + zap_put_leaf(l); + } +} + +void +fzap_get_stats(zap_t *zap, zap_stats_t *zs) +{ + zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift; + zs->zs_blocksize = 1ULL << ZAP_BLOCK_SHIFT; + zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs; + zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries; + zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk; + + if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { + /* the ptrtbl is entirely in the header block. */ + zap_stats_ptrtbl(zap, zap->zap_f.zap_phys->zap_leafs, + 1 << ZAP_PTRTBL_MIN_SHIFT, zs); + } else { + int b; + + dmu_prefetch(zap->zap_objset, zap->zap_object, + zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << ZAP_BLOCK_SHIFT, + zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << + ZAP_BLOCK_SHIFT); + + for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks; + b++) { + dmu_buf_t *db; + + db = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << + ZAP_BLOCK_SHIFT); + dmu_buf_read(db); + zap_stats_ptrtbl(zap, db->db_data, + 1<<(ZAP_BLOCK_SHIFT-3), zs); + dmu_buf_rele(db); + } + } +} diff --git a/usr/src/uts/common/fs/zfs/zap_leaf.c b/usr/src/uts/common/fs/zfs/zap_leaf.c new file mode 100644 index 0000000000..82b786d05a --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zap_leaf.c @@ -0,0 +1,883 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * The 512-byte leaf is broken into 32 16-byte chunks. + * chunk number n means l_chunk[n], even though the header precedes it. + * the names are stored null-terminated. + */ + +#include <sys/zfs_context.h> +#include <sys/zap.h> +#include <sys/zap_impl.h> +#include <sys/zap_leaf.h> +#include <sys/spa.h> +#include <sys/dmu.h> + +#define CHAIN_END 0xffff /* end of the chunk chain */ + +/* somewhat arbitrary, could go up to around 100k ... */ +#define MAX_ARRAY_BYTES (8<<10) + +#define NCHUNKS(bytes) (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES) + +/* + * XXX This will >> by a negative number when + * lh_prefix_len > 64-ZAP_LEAF_HASH_SHIFT. 
 */
/*
 * A leaf's hash table has ZAP_LEAF_HASH_NUMENTRIES buckets; an entry's
 * bucket index is taken from the hash bits immediately below the bits
 * consumed by the leaf's prefix (which already selected this leaf).
 */
#define	LEAF_HASH(l, h) \
	((ZAP_LEAF_HASH_NUMENTRIES-1) & \
	((h) >> (64 - ZAP_LEAF_HASH_SHIFT-(l)->lh_prefix_len)))

#define	LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])

/* #define MEMCHECK */


/*
 * Simple byte-at-a-time memset, used to initialize on-disk leaf
 * structures (e.g. filling the hash table with CHAIN_END bytes).
 */
static void
zap_memset(void *a, int c, size_t n)
{
	char *cp = a;
	char *cpend = cp + n;

	while (cp < cpend)
		*cp++ = c;
}

/*
 * Store an integer of the given width (1, 2, 4, or 8 bytes) at addr.
 * NOTE(review): assumes addr is suitably aligned for the width --
 * current callers pass caller-supplied value buffers; confirm for any
 * new caller.
 */
static void
stv(int len, void *addr, uint64_t value)
{
	switch (len) {
	case 1:
		*(uint8_t *)addr = value;
		return;
	case 2:
		*(uint16_t *)addr = value;
		return;
	case 4:
		*(uint32_t *)addr = value;
		return;
	case 8:
		*(uint64_t *)addr = value;
		return;
	}
	ASSERT(!"bad int len");
}

/*
 * Load an integer of the given width from addr; companion to stv().
 * Returns a poison pattern on an invalid width (debug builds assert
 * first).
 */
static uint64_t
ldv(int len, const void *addr)
{
	switch (len) {
	case 1:
		return (*(uint8_t *)addr);
	case 2:
		return (*(uint16_t *)addr);
	case 4:
		return (*(uint32_t *)addr);
	case 8:
		return (*(uint64_t *)addr);
	}
	ASSERT(!"bad int len");
	return (0xFEEDFACEDEADBEEF);
}

/*
 * Byteswap an on-disk leaf block in place: the header, the hash
 * table, and then each chunk according to its type (entry, free, or
 * array).  Array chunk payloads are raw bytes and need no swapping.
 */
void
zap_leaf_byteswap(zap_leaf_phys_t *buf)
{
	int i;

	buf->l_hdr.lhr_block_type = BSWAP_64(buf->l_hdr.lhr_block_type);
	buf->l_hdr.lhr_next = BSWAP_64(buf->l_hdr.lhr_next);
	buf->l_hdr.lhr_prefix = BSWAP_64(buf->l_hdr.lhr_prefix);
	buf->l_hdr.lhr_magic = BSWAP_32(buf->l_hdr.lhr_magic);
	buf->l_hdr.lhr_nfree = BSWAP_16(buf->l_hdr.lhr_nfree);
	buf->l_hdr.lhr_nentries = BSWAP_16(buf->l_hdr.lhr_nentries);
	buf->l_hdr.lhr_prefix_len = BSWAP_16(buf->l_hdr.lhr_prefix_len);
	buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist);

	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES; i++)
		buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);

	for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) {
		struct zap_leaf_entry *le;

		switch (buf->l_chunk[i].l_free.lf_type) {
		case ZAP_LEAF_ENTRY:
			le = &buf->l_chunk[i].l_entry;

			le->le_type = BSWAP_8(le->le_type);
			le->le_int_size = BSWAP_8(le->le_int_size);
			le->le_next = BSWAP_16(le->le_next);
			le->le_name_chunk = BSWAP_16(le->le_name_chunk);
			le->le_name_length = BSWAP_16(le->le_name_length);
			le->le_value_chunk = BSWAP_16(le->le_value_chunk);
			le->le_value_length = BSWAP_16(le->le_value_length);
			le->le_cd = BSWAP_32(le->le_cd);
			le->le_hash = BSWAP_64(le->le_hash);
			break;
		case ZAP_LEAF_FREE:
			buf->l_chunk[i].l_free.lf_type =
			    BSWAP_8(buf->l_chunk[i].l_free.lf_type);
			buf->l_chunk[i].l_free.lf_next =
			    BSWAP_16(buf->l_chunk[i].l_free.lf_next);
			break;
		case ZAP_LEAF_ARRAY:
			/* zap_leaf_array */
			buf->l_chunk[i].l_array.la_type =
			    BSWAP_8(buf->l_chunk[i].l_array.la_type);
			buf->l_chunk[i].l_array.la_next =
			    BSWAP_16(buf->l_chunk[i].l_array.la_next);
			/* la_array doesn't need swapping */
			break;
		default:
			ASSERT(!"bad leaf type");
		}
	}
}

/*
 * Initialize a freshly allocated leaf: empty hash table (all buckets
 * CHAIN_END), every chunk strung onto the freelist, header magic set.
 */
void
zap_leaf_init(zap_leaf_t *l)
{
	int i;

	ASSERT3U(sizeof (zap_leaf_phys_t), ==, l->l_dbuf->db_size);
	zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header));
	zap_memset(&l->l_phys->l_hash, CHAIN_END, sizeof (l->l_phys->l_hash));
	for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) {
		l->l_phys->l_chunk[i].l_free.lf_type = ZAP_LEAF_FREE;
		l->l_phys->l_chunk[i].l_free.lf_next = i+1;
	}
	l->l_phys->l_chunk[ZAP_LEAF_NUMCHUNKS-1].l_free.lf_next = CHAIN_END;
	l->lh_block_type = ZBT_LEAF;
	l->lh_magic = ZAP_LEAF_MAGIC;
	l->lh_nfree = ZAP_LEAF_NUMCHUNKS;
}

/*
 * Splice nl into the chain immediately after l, maintaining both the
 * in-core (l_next) and on-disk (lh_next blkid) links; nl inherits l's
 * hash prefix.  Returns nl for caller convenience.
 */
zap_leaf_t *
zap_leaf_chainmore(zap_leaf_t *l, zap_leaf_t *nl)
{
	nl->lh_prefix = l->lh_prefix;
	nl->lh_prefix_len = l->lh_prefix_len;
	nl->l_next = l->l_next;
	l->l_next = nl;
	nl->lh_next = l->lh_next;
	l->lh_next = nl->l_blkid;
	return (nl);
}

/*
 * Routines which manipulate leaf chunks (l_chunk[]).
 */

/*
 * Pop a chunk off the leaf's freelist.  Caller must have checked that
 * lh_nfree > 0.
 */
static uint16_t
zap_leaf_chunk_alloc(zap_leaf_t *l)
{
	int chunk;

	ASSERT(l->lh_nfree > 0);

	chunk = l->l_phys->l_hdr.lh_freelist;
	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
	ASSERT3U(l->l_phys->l_chunk[chunk].l_free.lf_type, ==, ZAP_LEAF_FREE);

	l->l_phys->l_hdr.lh_freelist = l->l_phys->l_chunk[chunk].l_free.lf_next;

#ifdef MEMCHECK
	zap_memset(&l->l_phys->l_chunk[chunk], 0xa1,
	    sizeof (l->l_phys->l_chunk[chunk]));
#endif

	l->lh_nfree--;

	return (chunk);
}

/*
 * Push a chunk back onto the head of the leaf's freelist.
 */
static void
zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
{
	struct zap_leaf_free *zlf = &l->l_phys->l_chunk[chunk].l_free;
	ASSERT3U(l->lh_nfree, <, ZAP_LEAF_NUMCHUNKS);
	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
	ASSERT(zlf->lf_type != ZAP_LEAF_FREE);

#ifdef MEMCHECK
	zap_memset(&l->l_phys->l_chunk[chunk], 0xf4,
	    sizeof (l->l_phys->l_chunk[chunk]));
#endif

	zlf->lf_type = ZAP_LEAF_FREE;
	zlf->lf_next = l->l_phys->l_hdr.lh_freelist;
	bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
	l->l_phys->l_hdr.lh_freelist = chunk;

	l->lh_nfree++;
}


/*
 * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
+ */ + +static uint16_t +zap_leaf_array_create(const zap_entry_handle_t *zeh, const char *buf, + int integer_size, int num_integers) +{ + uint16_t chunk_head; + uint16_t *chunkp = &chunk_head; + int byten = 0; + uint64_t value; + int shift = (integer_size-1)*8; + int len = num_integers; + zap_leaf_t *l = zeh->zeh_found_leaf; + + ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES); + + while (len > 0) { + uint16_t chunk = zap_leaf_chunk_alloc(l); + struct zap_leaf_array *la = &l->l_phys->l_chunk[chunk].l_array; + int i; + + la->la_type = ZAP_LEAF_ARRAY; + for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) { + if (byten == 0) + value = ldv(integer_size, buf); + la->la_array[i] = (value & (0xff << shift)) >> shift; + value <<= 8; + if (++byten == integer_size) { + byten = 0; + buf += integer_size; + if (--len == 0) + break; + } + } + + *chunkp = chunk; + chunkp = &la->la_next; + } + *chunkp = CHAIN_END; + + return (chunk_head); +} + +static void +zap_leaf_array_free(zap_entry_handle_t *zeh, uint16_t *chunkp) +{ + uint16_t chunk = *chunkp; + zap_leaf_t *l = zeh->zeh_found_leaf; + + *chunkp = CHAIN_END; + + while (chunk != CHAIN_END) { + int nextchunk = l->l_phys->l_chunk[chunk].l_array.la_next; + ASSERT3U(l->l_phys->l_chunk[chunk].l_array.la_type, ==, + ZAP_LEAF_ARRAY); + zap_leaf_chunk_free(l, chunk); + chunk = nextchunk; + } +} + +/* array_len and buf_len are in integers, not bytes */ +static void +zap_leaf_array_read(const zap_entry_handle_t *zeh, uint16_t chunk, + int array_int_len, int array_len, int buf_int_len, uint64_t buf_len, + char *buf) +{ + int len = MIN(array_len, buf_len); + int byten = 0; + uint64_t value = 0; + zap_leaf_t *l = zeh->zeh_found_leaf; + + ASSERT3U(array_int_len, <=, buf_int_len); + + while (len > 0) { + struct zap_leaf_array *la = &l->l_phys->l_chunk[chunk].l_array; + int i; + + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS); + for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) { + value = (value << 8) | la->la_array[i]; + byten++; + if (byten == 
array_int_len) { + stv(buf_int_len, buf, value); + byten = 0; + len--; + if (len == 0) + return; + buf += buf_int_len; + } + } + chunk = la->la_next; + } +} + +/* + * Only to be used on 8-bit arrays. + * array_len is actual len in bytes (not encoded le_value_length). + * buf is null-terminated. + */ +static int +zap_leaf_array_equal(const zap_entry_handle_t *zeh, int chunk, + int array_len, const char *buf) +{ + int bseen = 0; + zap_leaf_t *l = zeh->zeh_found_leaf; + + while (bseen < array_len) { + struct zap_leaf_array *la = &l->l_phys->l_chunk[chunk].l_array; + int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES); + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS); + if (bcmp(la->la_array, buf + bseen, toread)) + break; + chunk = la->la_next; + bseen += toread; + } + return (bseen == array_len); +} + +/* + * Routines which manipulate leaf entries. + */ + +int +zap_leaf_lookup(zap_leaf_t *l, + const char *name, uint64_t h, zap_entry_handle_t *zeh) +{ + uint16_t *chunkp; + struct zap_leaf_entry *le; + + zeh->zeh_head_leaf = l; + +again: + ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC); + + for (chunkp = LEAF_HASH_ENTPTR(l, h); + *chunkp != CHAIN_END; chunkp = &le->le_next) { + uint16_t chunk = *chunkp; + le = &l->l_phys->l_chunk[chunk].l_entry; + + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS); + ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY); + + if (le->le_hash != h) + continue; + + zeh->zeh_found_leaf = l; + if (zap_leaf_array_equal(zeh, le->le_name_chunk, + le->le_name_length, name)) { + zeh->zeh_num_integers = le->le_value_length; + zeh->zeh_integer_size = le->le_int_size; + zeh->zeh_cd = le->le_cd; + zeh->zeh_hash = le->le_hash; + zeh->zeh_chunkp = chunkp; + zeh->zeh_found_leaf = l; + return (0); + } + } + + if (l->l_next) { + l = l->l_next; + goto again; + } + + return (ENOENT); +} + +/* Return (h1,cd1 >= h2,cd2) */ +static int +hcd_gteq(uint64_t h1, uint32_t cd1, uint64_t h2, uint32_t cd2) +{ + if (h1 > h2) + return (TRUE); + if (h1 == h2 && cd1 >= cd2) + return (TRUE); + return 
(FALSE); +} + +int +zap_leaf_lookup_closest(zap_leaf_t *l, + uint64_t h, uint32_t cd, zap_entry_handle_t *zeh) +{ + uint16_t chunk; + uint64_t besth = -1ULL; + uint32_t bestcd = ZAP_MAXCD; + uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES-1; + uint16_t lh; + struct zap_leaf_entry *le; + + zeh->zeh_head_leaf = l; + +again: + ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC); + + for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) { + for (chunk = l->l_phys->l_hash[lh]; + chunk != CHAIN_END; chunk = le->le_next) { + le = &l->l_phys->l_chunk[chunk].l_entry; + + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS); + ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY); + + if (hcd_gteq(le->le_hash, le->le_cd, h, cd) && + hcd_gteq(besth, bestcd, le->le_hash, le->le_cd)) { + ASSERT3U(bestlh, >=, lh); + bestlh = lh; + besth = le->le_hash; + bestcd = le->le_cd; + + zeh->zeh_num_integers = le->le_value_length; + zeh->zeh_integer_size = le->le_int_size; + zeh->zeh_cd = le->le_cd; + zeh->zeh_hash = le->le_hash; + zeh->zeh_fakechunk = chunk; + zeh->zeh_chunkp = &zeh->zeh_fakechunk; + zeh->zeh_found_leaf = l; + } + } + } + + if (l->l_next) { + l = l->l_next; + goto again; + } + + return (bestcd == ZAP_MAXCD ? 
ENOENT : 0); +} + +int +zap_entry_read(const zap_entry_handle_t *zeh, + uint8_t integer_size, uint64_t num_integers, void *buf) +{ + struct zap_leaf_entry *le; + + le = &zeh->zeh_found_leaf->l_phys->l_chunk[*zeh->zeh_chunkp].l_entry; + ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY); + + if (le->le_int_size > integer_size) + return (EINVAL); + + zap_leaf_array_read(zeh, le->le_value_chunk, le->le_int_size, + le->le_value_length, integer_size, num_integers, buf); + + if (zeh->zeh_num_integers > num_integers) + return (EOVERFLOW); + return (0); + +} + +int +zap_entry_read_name(const zap_entry_handle_t *zeh, uint16_t buflen, char *buf) +{ + struct zap_leaf_entry *le; + + le = &zeh->zeh_found_leaf->l_phys->l_chunk[*zeh->zeh_chunkp].l_entry; + ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY); + + zap_leaf_array_read(zeh, le->le_name_chunk, 1, + le->le_name_length, 1, buflen, buf); + if (le->le_name_length > buflen) + return (EOVERFLOW); + return (0); +} + +int +zap_entry_update(zap_entry_handle_t *zeh, + uint8_t integer_size, uint64_t num_integers, const void *buf) +{ + int delta_chunks; + struct zap_leaf_entry *le; + le = &zeh->zeh_found_leaf->l_phys->l_chunk[*zeh->zeh_chunkp].l_entry; + + delta_chunks = NCHUNKS(num_integers * integer_size) - + NCHUNKS(le->le_value_length * le->le_int_size); + + if (zeh->zeh_found_leaf->lh_nfree < delta_chunks) + return (EAGAIN); + + /* + * We should search other chained leaves (via + * zap_entry_remove,create?) otherwise returning EAGAIN will + * just send us into an infinite loop if we have to chain + * another leaf block, rather than being able to split this + * block. + */ + + zap_leaf_array_free(zeh, &le->le_value_chunk); + le->le_value_chunk = + zap_leaf_array_create(zeh, buf, integer_size, num_integers); + le->le_value_length = (num_integers*integer_size > MAX_ARRAY_BYTES) ? 
+ (MAX_ARRAY_BYTES + 1) : (num_integers); + le->le_int_size = integer_size; + return (0); +} + +void +zap_entry_remove(zap_entry_handle_t *zeh) +{ + uint16_t entry_chunk; + struct zap_leaf_entry *le; + zap_leaf_t *l = zeh->zeh_found_leaf; + + ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk); + + entry_chunk = *zeh->zeh_chunkp; + le = &l->l_phys->l_chunk[entry_chunk].l_entry; + ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY); + + zap_leaf_array_free(zeh, &le->le_name_chunk); + zap_leaf_array_free(zeh, &le->le_value_chunk); + + *zeh->zeh_chunkp = le->le_next; + zap_leaf_chunk_free(l, entry_chunk); + + l->lh_nentries--; +} + +int +zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd, + uint8_t integer_size, uint64_t num_integers, const void *buf, + zap_entry_handle_t *zeh) +{ + uint16_t chunk; + uint16_t *chunkp; + struct zap_leaf_entry *le; + uint64_t namelen, valuelen; + int numchunks; + + valuelen = integer_size * num_integers; + namelen = strlen(name) + 1; + ASSERT(namelen >= 2); + + zeh->zeh_head_leaf = l; + + if (namelen > MAXNAMELEN) + return (ENAMETOOLONG); + /* find the first leaf in the chain that has sufficient free space */ + numchunks = 1 + NCHUNKS(namelen) + NCHUNKS(valuelen); + if (numchunks > ZAP_LEAF_NUMCHUNKS) + return (E2BIG); + + if (cd == ZAP_MAXCD) { + for (cd = 0; cd < ZAP_MAXCD; cd++) { + zap_leaf_t *ll; + for (ll = l; ll; ll = ll->l_next) { + for (chunk = *LEAF_HASH_ENTPTR(ll, h); + chunk != CHAIN_END; chunk = le->le_next) { + le = &ll->l_phys->l_chunk + [chunk].l_entry; + if (le->le_hash == h && + le->le_cd == cd) { + break; + } + } + /* + * if this cd is in use, no need to + * check more chained leafs + */ + if (chunk != CHAIN_END) + break; + } + /* If this cd is not in use, we are good. */ + if (chunk == CHAIN_END) + break; + } + /* If we tried all the cd's, we lose. 
*/ + if (cd == ZAP_MAXCD) + return (ENOSPC); + } + + for (; l; l = l->l_next) + if (l->lh_nfree >= numchunks) + break; + if (l == NULL) + return (EAGAIN); + + zeh->zeh_found_leaf = l; + + /* make the entry */ + chunk = zap_leaf_chunk_alloc(l); + le = &l->l_phys->l_chunk[chunk].l_entry; + le->le_type = ZAP_LEAF_ENTRY; + le->le_name_chunk = zap_leaf_array_create(zeh, name, 1, namelen); + le->le_name_length = namelen; + le->le_value_chunk = + zap_leaf_array_create(zeh, buf, integer_size, num_integers); + le->le_value_length = (num_integers*integer_size > MAX_ARRAY_BYTES) ? + (MAX_ARRAY_BYTES + 1) : (num_integers); + le->le_int_size = integer_size; + le->le_hash = h; + le->le_cd = cd; + + /* link it into the hash chain */ + chunkp = LEAF_HASH_ENTPTR(l, h); + le->le_next = *chunkp; + *chunkp = chunk; + + l->lh_nentries++; + + zeh->zeh_num_integers = num_integers; + zeh->zeh_integer_size = le->le_int_size; + zeh->zeh_cd = le->le_cd; + zeh->zeh_hash = le->le_hash; + zeh->zeh_chunkp = chunkp; + + return (0); +} + +/* + * Routines for transferring entries between leafs. 
+ */ + +static void +zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry) +{ + struct zap_leaf_entry *le = &l->l_phys->l_chunk[entry].l_entry; + uint16_t *ptr = LEAF_HASH_ENTPTR(l, le->le_hash); + le->le_next = *ptr; + *ptr = entry; +} + +static void +zap_leaf_rehash_entries(zap_leaf_t *l) +{ + int i; + + if (l->lh_nentries == 0) + return; + + /* break existing hash chains */ + zap_memset(l->l_phys->l_hash, CHAIN_END, sizeof (l->l_phys->l_hash)); + + for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) { + struct zap_leaf_entry *le = &l->l_phys->l_chunk[i].l_entry; + if (le->le_type != ZAP_LEAF_ENTRY) + continue; + zap_leaf_rehash_entry(l, i); + } +} + +static uint16_t +zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) +{ + uint16_t new_chunk; + uint16_t *nchunkp = &new_chunk; + + while (chunk != CHAIN_END) { + uint16_t nchunk = zap_leaf_chunk_alloc(nl); + struct zap_leaf_array *nla = + &nl->l_phys->l_chunk[nchunk].l_array; + struct zap_leaf_array *la = + &l->l_phys->l_chunk[chunk].l_array; + int nextchunk = la->la_next; + + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS); + ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS); + + *nla = *la; + + zap_leaf_chunk_free(l, chunk); + chunk = nextchunk; + *nchunkp = nchunk; + nchunkp = &nla->la_next; + } + *nchunkp = CHAIN_END; + return (new_chunk); +} + +static void +zap_leaf_transfer_entry(zap_t *zap, zap_leaf_t *l, int entry, zap_leaf_t *nhl, + dmu_tx_t *tx) +{ + zap_leaf_t *nl; + struct zap_leaf_entry *le, *nle; + uint16_t chunk, nchunks; + + le = &l->l_phys->l_chunk[entry].l_entry; + ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY); + + /* find a leaf in the destination leaf chain with enough free space */ + nchunks = 1 + NCHUNKS(le->le_name_length) + + NCHUNKS(le->le_value_length * le->le_int_size); + for (nl = nhl; nl; nl = nl->l_next) + if (nl->lh_nfree >= nchunks) + break; + if (nl == NULL) { + nl = zap_leaf_chainmore(nhl, zap_create_leaf(zap, tx)); + dprintf("transfer_entry: chaining leaf %x/%d\n", + nl->lh_prefix, 
nl->lh_prefix_len); + } + + chunk = zap_leaf_chunk_alloc(nl); + nle = &nl->l_phys->l_chunk[chunk].l_entry; + *nle = *le; + + zap_leaf_rehash_entry(nl, chunk); + + nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl); + nle->le_value_chunk = + zap_leaf_transfer_array(l, le->le_value_chunk, nl); + + zap_leaf_chunk_free(l, entry); + + l->lh_nentries--; + nl->lh_nentries++; +} + +/* + * Transfer entries whose hash bit 'bit' is 1 to nl1, and 0 to nl0. + * Ignore leaf chaining in source (l), but chain in destinations. + * We'll re-chain all the entries in l as we go along. + */ +static void +zap_leaf_transfer_entries(zap_t *zap, zap_leaf_t *l, + zap_leaf_t *nl0, zap_leaf_t *nl1, int bit, dmu_tx_t *tx) +{ + int i; + + ASSERT(bit < 64 && bit >= 0); + /* break existing hash chains */ + zap_memset(l->l_phys->l_hash, CHAIN_END, sizeof (l->l_phys->l_hash)); + + if (nl0 != l) + zap_leaf_rehash_entries(nl0); + if (nl1 != nl0) + zap_leaf_rehash_entries(nl1); + + for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) { + struct zap_leaf_entry *le = &l->l_phys->l_chunk[i].l_entry; + if (le->le_type != ZAP_LEAF_ENTRY) + continue; + + /* + * We could find entries via hashtable instead. That + * would be O(hashents+numents) rather than + * O(numblks+numents), but this accesses memory more + * sequentially, and when we're called, the block is + * usually pretty full. 
+ */ + + if (le->le_hash & (1ULL << bit)) { + zap_leaf_transfer_entry(zap, l, i, nl1, tx); + } else { + if (nl0 == l) + zap_leaf_rehash_entry(l, i); + else + zap_leaf_transfer_entry(zap, l, i, nl0, tx); + } + } + +} + +/* + * nl will contain the entries whose hash prefix ends in 1 + * handles leaf chaining + */ +zap_leaf_t * +zap_leaf_split(zap_t *zap, zap_leaf_t *hl, dmu_tx_t *tx) +{ + zap_leaf_t *l = hl; + int bit = 64 - 1 - hl->lh_prefix_len; + zap_leaf_t *nl = zap_create_leaf(zap, tx); + + /* set new prefix and prefix_len */ + hl->lh_prefix <<= 1; + hl->lh_prefix_len++; + nl->lh_prefix = hl->lh_prefix | 1; + nl->lh_prefix_len = hl->lh_prefix_len; + + /* transfer odd entries from first leaf in hl chain to nl */ + zap_leaf_transfer_entries(zap, hl, hl, nl, bit, tx); + + /* take rest of chain off hl */ + l = hl->l_next; + hl->l_next = NULL; + hl->lh_next = 0; + + /* transfer even entries from hl chain back to hl, odd entries to nl */ + while (l) { + zap_leaf_t *next = l->l_next; + zap_leaf_transfer_entries(zap, l, hl, nl, bit, tx); + zap_destroy_leaf(zap, l, tx); + l = next; + } + + return (nl); +} + +void +zap_stats_leaf(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) +{ + int n, nchained = 0; + + n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - l->lh_prefix_len; + n = MIN(n, ZAP_HISTOGRAM_SIZE-1); + zs->zs_leafs_with_2n_pointers[n]++; + + do { + int i; + + n = l->lh_nentries/5; + n = MIN(n, ZAP_HISTOGRAM_SIZE-1); + zs->zs_blocks_with_n5_entries[n]++; + + n = ((1<<ZAP_BLOCK_SHIFT) - + l->lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 / + (1<<ZAP_BLOCK_SHIFT); + n = MIN(n, ZAP_HISTOGRAM_SIZE-1); + zs->zs_blocks_n_tenths_full[n]++; + + for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES; i++) { + int nentries = 0; + int chunk = l->l_phys->l_hash[i]; + + while (chunk != CHAIN_END) { + struct zap_leaf_entry *le = + &l->l_phys->l_chunk[chunk].l_entry; + + n = 1 + NCHUNKS(le->le_name_length) + + NCHUNKS(le->le_value_length * + le->le_int_size); + n = MIN(n, ZAP_HISTOGRAM_SIZE-1); + 
zs->zs_entries_using_n_chunks[n]++; + + chunk = le->le_next; + nentries++; + } + + n = nentries; + n = MIN(n, ZAP_HISTOGRAM_SIZE-1); + zs->zs_buckets_with_n_entries[n]++; + } + + nchained++; + l = l->l_next; + } while (l); + + n = nchained-1; + n = MIN(n, ZAP_HISTOGRAM_SIZE-1); + zs->zs_leafs_with_n_chained[n]++; +} diff --git a/usr/src/uts/common/fs/zfs/zap_micro.c b/usr/src/uts/common/fs/zfs/zap_micro.c new file mode 100644 index 0000000000..998b67c50f --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zap_micro.c @@ -0,0 +1,823 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/dmu.h> +#include <sys/zfs_context.h> +#include <sys/zap.h> +#include <sys/zap_impl.h> +#include <sys/avl.h> + + +static uint64_t mzap_write_cookie(zap_t *zap, uint64_t cookie, + uint64_t entptr); +static void mzap_upgrade(zap_t *zap, dmu_tx_t *tx); + + +static void +mzap_byteswap(mzap_phys_t *buf, size_t size) +{ + int i, max; + buf->mz_block_type = BSWAP_64(buf->mz_block_type); + buf->mz_salt = BSWAP_64(buf->mz_salt); + max = (size / MZAP_ENT_LEN) - 1; + for (i = 0; i < max; i++) { + buf->mz_chunk[i].mze_value = + BSWAP_64(buf->mz_chunk[i].mze_value); + buf->mz_chunk[i].mze_cd = + BSWAP_32(buf->mz_chunk[i].mze_cd); + } +} + +void +zap_byteswap(void *buf, size_t size) +{ + uint64_t block_type; + + block_type = *(uint64_t *)buf; + + switch (block_type) { + case ZBT_MICRO: + case BSWAP_64(ZBT_MICRO): + /* ASSERT(magic == ZAP_LEAF_MAGIC); */ + mzap_byteswap(buf, size); + return; + default: + ASSERT(size == (1<<ZAP_BLOCK_SHIFT)); + fzap_byteswap(buf, size); + return; + } +} + +static int +mze_compare(const void *arg1, const void *arg2) +{ + const mzap_ent_t *mze1 = arg1; + const mzap_ent_t *mze2 = arg2; + + if (mze1->mze_hash > mze2->mze_hash) + return (+1); + if (mze1->mze_hash < mze2->mze_hash) + return (-1); + if (mze1->mze_phys.mze_cd > mze2->mze_phys.mze_cd) + return (+1); + if (mze1->mze_phys.mze_cd < mze2->mze_phys.mze_cd) + return (-1); + return (0); +} + +static void +mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep) +{ + mzap_ent_t *mze; + + ASSERT(zap->zap_ismicro); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + ASSERT(mzep->mze_cd < ZAP_MAXCD); + ASSERT3U(zap_hash(zap, mzep->mze_name), ==, hash); + + mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); + mze->mze_chunkid = chunkid; + mze->mze_hash = hash; + mze->mze_phys = *mzep; + avl_add(&zap->zap_m.zap_avl, mze); +} + +static mzap_ent_t * +mze_find(zap_t *zap, const char *name, uint64_t hash) +{ + 
mzap_ent_t mze_tofind; + mzap_ent_t *mze; + avl_index_t idx; + avl_tree_t *avl = &zap->zap_m.zap_avl; + + ASSERT(zap->zap_ismicro); + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT3U(zap_hash(zap, name), ==, hash); + + if (strlen(name) >= sizeof (mze_tofind.mze_phys.mze_name)) + return (NULL); + + mze_tofind.mze_hash = hash; + mze_tofind.mze_phys.mze_cd = 0; + + mze = avl_find(avl, &mze_tofind, &idx); + if (mze == NULL) + mze = avl_nearest(avl, idx, AVL_AFTER); + for (; mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { + if (strcmp(name, mze->mze_phys.mze_name) == 0) + return (mze); + } + return (NULL); +} + +static uint32_t +mze_find_unused_cd(zap_t *zap, uint64_t hash) +{ + mzap_ent_t mze_tofind; + mzap_ent_t *mze; + avl_index_t idx; + avl_tree_t *avl = &zap->zap_m.zap_avl; + uint32_t cd; + + ASSERT(zap->zap_ismicro); + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + mze_tofind.mze_hash = hash; + mze_tofind.mze_phys.mze_cd = 0; + + cd = 0; + for (mze = avl_find(avl, &mze_tofind, &idx); + mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { + if (mze->mze_phys.mze_cd != cd) + break; + cd++; + } + + return (cd); +} + +static void +mze_remove(zap_t *zap, mzap_ent_t *mze) +{ + ASSERT(zap->zap_ismicro); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + avl_remove(&zap->zap_m.zap_avl, mze); + kmem_free(mze, sizeof (mzap_ent_t)); +} + +static void +mze_destroy(zap_t *zap) +{ + mzap_ent_t *mze; + void *avlcookie = NULL; + + while (mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie)) + kmem_free(mze, sizeof (mzap_ent_t)); + avl_destroy(&zap->zap_m.zap_avl); +} + +static zap_t * +mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) +{ + zap_t *winner; + zap_t *zap; + int i; + + ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); + + zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); + rw_init(&zap->zap_rwlock, 0, 0, 0); + rw_enter(&zap->zap_rwlock, RW_WRITER); + zap->zap_objset = os; + zap->zap_object = obj; + zap->zap_dbuf = db; + + if (((uint64_t 
*)db->db_data)[0] != ZBT_MICRO) { + mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); + } else { + zap->zap_ismicro = TRUE; + } + + /* + * Make sure that zap_ismicro is set before we let others see + * it, because zap_lockdir() checks zap_ismicro without the lock + * held. + */ + winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_pageout); + + if (winner != NULL) { + kmem_free(zap, sizeof (zap_t)); + return (winner); + } + + if (zap->zap_ismicro) { + zap->zap_salt = zap->zap_m.zap_phys->mz_salt; + zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; + avl_create(&zap->zap_m.zap_avl, mze_compare, + sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node)); + + for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { + mzap_ent_phys_t *mze = + &zap->zap_m.zap_phys->mz_chunk[i]; + if (mze->mze_name[0]) { + zap->zap_m.zap_num_entries++; + mze_insert(zap, i, + zap_hash(zap, mze->mze_name), mze); + } + } + } else { + zap->zap_salt = zap->zap_f.zap_phys->zap_salt; + } + rw_exit(&zap->zap_rwlock); + return (zap); +} + +int +zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, + krw_t lti, int fatreader, zap_t **zapp) +{ + zap_t *zap; + dmu_buf_t *db; + krw_t lt; + int err; + + *zapp = NULL; + + db = dmu_buf_hold(os, obj, 0); + +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(db, &doi); + ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); + } +#endif + + /* + * The zap can deal with EIO here, but its callers don't yet, so + * spare them by doing a mustsucceed read. + */ + dmu_buf_read(db); + + zap = dmu_buf_get_user(db); + if (zap == NULL) + zap = mzap_open(os, obj, db); + + /* + * We're checking zap_ismicro without the lock held, in order to + * tell what type of lock we want. Once we have some sort of + * lock, see if it really is the right type. In practice this + * can only be different if it was upgraded from micro to fat, + * and micro wanted WRITER but fat only needs READER. + */ + lt = (!zap->zap_ismicro && fatreader) ? 
RW_READER : lti; + rw_enter(&zap->zap_rwlock, lt); + if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) { + /* it was upgraded, now we only need reader */ + ASSERT(lt == RW_WRITER); + ASSERT(RW_READER == + (!zap->zap_ismicro && fatreader) ? RW_READER : lti); + rw_downgrade(&zap->zap_rwlock); + lt = RW_READER; + } + + zap->zap_objset = os; + + if (lt == RW_WRITER) + dmu_buf_will_dirty(db, tx); + + ASSERT3P(zap->zap_dbuf, ==, db); + + ASSERT(!zap->zap_ismicro || + zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); + if (zap->zap_ismicro && tx && + zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { + uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; + if (newsz > MZAP_MAX_BLKSZ) { + dprintf("upgrading obj %llu: num_entries=%u\n", + obj, zap->zap_m.zap_num_entries); + mzap_upgrade(zap, tx); + *zapp = zap; + return (0); + } + err = dmu_object_set_blocksize(os, obj, newsz, 0, tx); + ASSERT3U(err, ==, 0); + zap->zap_m.zap_num_chunks = + db->db_size / MZAP_ENT_LEN - 1; + } + + *zapp = zap; + return (0); +} + +void +zap_unlockdir(zap_t *zap) +{ + rw_exit(&zap->zap_rwlock); + dmu_buf_rele(zap->zap_dbuf); +} + +static void +mzap_upgrade(zap_t *zap, dmu_tx_t *tx) +{ + mzap_phys_t *mzp; + int i, sz, nchunks, err; + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + sz = zap->zap_dbuf->db_size; + mzp = kmem_alloc(sz, KM_SLEEP); + bcopy(zap->zap_dbuf->db_data, mzp, sz); + nchunks = zap->zap_m.zap_num_chunks; + + err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, + 1ULL << ZAP_BLOCK_SHIFT, 0, tx); + ASSERT(err == 0); + + dprintf("upgrading obj=%llu with %u chunks\n", + zap->zap_object, nchunks); + mze_destroy(zap); + + fzap_upgrade(zap, tx); + + for (i = 0; i < nchunks; i++) { + int err; + mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; + if (mze->mze_name[0] == 0) + continue; + dprintf("adding %s=%llu\n", + mze->mze_name, mze->mze_value); + err = fzap_add_cd(zap, + mze->mze_name, 8, 1, &mze->mze_value, + mze->mze_cd, tx, NULL); + ASSERT3U(err, ==, 
0); + } + kmem_free(mzp, sz); +} + +uint64_t +zap_hash(zap_t *zap, const char *name) +{ + const uint8_t *cp; + uint8_t c; + uint64_t crc = zap->zap_salt; + + ASSERT(crc != 0); + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++) + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; + + /* + * Only use 28 bits, since we need 4 bits in the cookie for the + * collision differentiator. We MUST use the high bits, since + * those are the onces that we first pay attention to when + * chosing the bucket. + */ + crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); + + return (crc); +} + + +static void +mzap_create_impl(objset_t *os, uint64_t obj, dmu_tx_t *tx) +{ + dmu_buf_t *db; + mzap_phys_t *zp; + + db = dmu_buf_hold(os, obj, 0); + +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(db, &doi); + ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); + } +#endif + + dmu_buf_will_dirty(db, tx); + zp = db->db_data; + zp->mz_block_type = ZBT_MICRO; + zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; + ASSERT(zp->mz_salt != 0); + dmu_buf_rele(db); +} + +int +zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + int err; + + err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx); + if (err != 0) + return (err); + mzap_create_impl(os, obj, tx); + return (0); +} + +uint64_t +zap_create(objset_t *os, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); + + mzap_create_impl(os, obj, tx); + return (obj); +} + +int +zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) +{ + /* + * dmu_object_free will free the object number and free the + * data. Freeing the data will cause our pageout function to be + * called, which will destroy our data (zap_leaf_t's and zap_t). 
+ */ + + return (dmu_object_free(os, zapobj, tx)); +} + +_NOTE(ARGSUSED(0)) +void +zap_pageout(dmu_buf_t *db, void *vmzap) +{ + zap_t *zap = vmzap; + + rw_destroy(&zap->zap_rwlock); + + if (zap->zap_ismicro) { + mze_destroy(zap); + } + + kmem_free(zap, sizeof (zap_t)); +} + + +int +zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) +{ + zap_t *zap; + int err; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap); + if (err) + return (err); + if (!zap->zap_ismicro) { + err = fzap_count(zap, count); + } else { + *count = zap->zap_m.zap_num_entries; + } + zap_unlockdir(zap); + return (err); +} + +/* + * Routines for maniplulating attributes. + */ + +int +zap_lookup(objset_t *os, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf) +{ + zap_t *zap; + int err; + mzap_ent_t *mze; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap); + if (err) + return (err); + if (!zap->zap_ismicro) { + err = fzap_lookup(zap, name, + integer_size, num_integers, buf); + } else { + mze = mze_find(zap, name, zap_hash(zap, name)); + if (mze == NULL) { + err = ENOENT; + } else { + if (num_integers < 1) + err = EOVERFLOW; + else if (integer_size != 8) + err = EINVAL; + else + *(uint64_t *)buf = mze->mze_phys.mze_value; + } + } + zap_unlockdir(zap); + return (err); +} + +int +zap_length(objset_t *os, uint64_t zapobj, const char *name, + uint64_t *integer_size, uint64_t *num_integers) +{ + zap_t *zap; + int err; + mzap_ent_t *mze; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap); + if (err) + return (err); + if (!zap->zap_ismicro) { + err = fzap_length(zap, name, integer_size, num_integers); + } else { + mze = mze_find(zap, name, zap_hash(zap, name)); + if (mze == NULL) { + err = ENOENT; + } else { + if (integer_size) + *integer_size = 8; + if (num_integers) + *num_integers = 1; + } + } + zap_unlockdir(zap); + return (err); +} + +static void +mzap_addent(zap_t *zap, const char *name, uint64_t hash, uint64_t value) 
+{ + int i; + int start = zap->zap_m.zap_alloc_next; + uint32_t cd; + + dprintf("obj=%llu %s=%llu\n", zap->zap_object, name, value); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + +#ifdef ZFS_DEBUG + for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { + mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; + ASSERT(strcmp(name, mze->mze_name) != 0); + } +#endif + + cd = mze_find_unused_cd(zap, hash); + /* given the limited size of the microzap, this can't happen */ + ASSERT(cd != ZAP_MAXCD); + +again: + for (i = start; i < zap->zap_m.zap_num_chunks; i++) { + mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; + if (mze->mze_name[0] == 0) { + mze->mze_value = value; + mze->mze_cd = cd; + (void) strcpy(mze->mze_name, name); + zap->zap_m.zap_num_entries++; + zap->zap_m.zap_alloc_next = i+1; + if (zap->zap_m.zap_alloc_next == + zap->zap_m.zap_num_chunks) + zap->zap_m.zap_alloc_next = 0; + mze_insert(zap, i, hash, mze); + return; + } + } + if (start != 0) { + start = 0; + goto again; + } + ASSERT(!"out of entries!"); +} + +int +zap_add(objset_t *os, uint64_t zapobj, const char *name, + int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + mzap_ent_t *mze; + const uint64_t *intval = val; + uint64_t hash; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap); + if (err) + return (err); + if (!zap->zap_ismicro) { + err = fzap_add(zap, name, integer_size, num_integers, val, tx); + } else if (integer_size != 8 || num_integers != 1 || + strlen(name) >= MZAP_NAME_LEN) { + dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", + zapobj, integer_size, num_integers, name); + mzap_upgrade(zap, tx); + err = fzap_add(zap, name, integer_size, num_integers, val, tx); + } else { + hash = zap_hash(zap, name); + mze = mze_find(zap, name, hash); + if (mze != NULL) { + err = EEXIST; + } else { + mzap_addent(zap, name, hash, *intval); + } + } + zap_unlockdir(zap); + return (err); +} + +int +zap_update(objset_t *os, 
uint64_t zapobj, const char *name, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + mzap_ent_t *mze; + const uint64_t *intval = val; + uint64_t hash; + int err; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap); + if (err) + return (err); + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + if (!zap->zap_ismicro) { + err = fzap_update(zap, name, + integer_size, num_integers, val, tx); + } else if (integer_size != 8 || num_integers != 1 || + strlen(name) >= MZAP_NAME_LEN) { + dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", + zapobj, integer_size, num_integers, name); + mzap_upgrade(zap, tx); + err = fzap_update(zap, name, + integer_size, num_integers, val, tx); + } else { + hash = zap_hash(zap, name); + mze = mze_find(zap, name, hash); + if (mze != NULL) { + mze->mze_phys.mze_value = *intval; + zap->zap_m.zap_phys->mz_chunk + [mze->mze_chunkid].mze_value = *intval; + } else { + mzap_addent(zap, name, hash, *intval); + } + } + zap_unlockdir(zap); + return (0); +} + +int +zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + mzap_ent_t *mze; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap); + if (err) + return (err); + if (!zap->zap_ismicro) { + err = fzap_remove(zap, name, tx); + } else { + mze = mze_find(zap, name, zap_hash(zap, name)); + if (mze == NULL) { + dprintf("fail: %s\n", name); + err = ENOENT; + } else { + dprintf("success: %s\n", name); + zap->zap_m.zap_num_entries--; + bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid], + sizeof (mzap_ent_phys_t)); + mze_remove(zap, mze); + } + } + zap_unlockdir(zap); + return (err); +} + + +/* + * Routines for iterating over the attributes. 
+ */ + +void +zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) +{ + zc->zc_objset = os; + zc->zc_zapobj = zapobj; + zc->zc_hash = 0; + zc->zc_cd = 0; +} + +/* + * We want to keep the high 32 bits of the cursor zero if we can, so + * that 32-bit programs can access this. So use a small hash value so + * we can fit 4 bits of cd into the 32-bit cursor. + * + * [ 4 zero bits | 32-bit collision differentiator | 28-bit hash value ] + */ +void +zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, + uint64_t serialized) +{ + zc->zc_objset = os; + zc->zc_zapobj = zapobj; + if (serialized == -1ULL) { + zc->zc_hash = -1ULL; + zc->zc_cd = 0; + } else { + zc->zc_hash = serialized << (64-ZAP_HASHBITS); + zc->zc_cd = serialized >> ZAP_HASHBITS; + if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */ + zc->zc_cd = 0; + } +} + +uint64_t +zap_cursor_serialize(zap_cursor_t *zc) +{ + if (zc->zc_hash == -1ULL) + return (-1ULL); + ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0); + ASSERT(zc->zc_cd < ZAP_MAXCD); + return ((zc->zc_hash >> (64-ZAP_HASHBITS)) | + ((uint64_t)zc->zc_cd << ZAP_HASHBITS)); +} + +int +zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) +{ + zap_t *zap; + int err; + avl_index_t idx; + mzap_ent_t mze_tofind; + mzap_ent_t *mze; + + if (zc->zc_hash == -1ULL) + return (ENOENT); + + err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, + RW_READER, TRUE, &zap); + if (err) + return (err); + if (!zap->zap_ismicro) { + err = fzap_cursor_retrieve(zap, zc, za); + } else { + err = ENOENT; + + mze_tofind.mze_hash = zc->zc_hash; + mze_tofind.mze_phys.mze_cd = zc->zc_cd; + + mze = avl_find(&zap->zap_m.zap_avl, &mze_tofind, &idx); + ASSERT(mze == NULL || 0 == bcmp(&mze->mze_phys, + &zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid], + sizeof (mze->mze_phys))); + if (mze == NULL) + mze = avl_nearest(&zap->zap_m.zap_avl, idx, AVL_AFTER); + + if (mze) { + za->za_integer_length = 8; + za->za_num_integers = 1; + za->za_first_integer = 
mze->mze_phys.mze_value; + (void) strcpy(za->za_name, mze->mze_phys.mze_name); + zc->zc_hash = mze->mze_hash; + zc->zc_cd = mze->mze_phys.mze_cd; + err = 0; + } else { + zc->zc_hash = -1ULL; + } + } + zap_unlockdir(zap); + return (err); +} + +void +zap_cursor_advance(zap_cursor_t *zc) +{ + if (zc->zc_hash == -1ULL) + return; + zc->zc_cd++; + if (zc->zc_cd >= ZAP_MAXCD) { + zc->zc_cd = 0; + zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS); + if (zc->zc_hash == 0) /* EOF */ + zc->zc_hash = -1ULL; + } +} + +int +zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) +{ + int err; + zap_t *zap; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap); + if (err) + return (err); + + bzero(zs, sizeof (zap_stats_t)); + + if (zap->zap_ismicro) { + zs->zs_blocksize = zap->zap_dbuf->db_size; + zs->zs_num_entries = zap->zap_m.zap_num_entries; + zs->zs_num_blocks = 1; + } else { + fzap_get_stats(zap, zs); + } + zap_unlockdir(zap); + return (0); +} diff --git a/usr/src/uts/common/fs/zfs/zfs.conf b/usr/src/uts/common/fs/zfs/zfs.conf new file mode 100644 index 0000000000..09881909b8 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs.conf @@ -0,0 +1,28 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2005 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# +name="zfs" parent="pseudo"; diff --git a/usr/src/uts/common/fs/zfs/zfs_acl.c b/usr/src/uts/common/fs/zfs/zfs_acl.c new file mode 100644 index 0000000000..960de720d1 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs_acl.c @@ -0,0 +1,1537 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/resource.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/errno.h> +#include <sys/unistd.h> +#include <sys/fs/zfs.h> +#include <sys/mode.h> +#include <sys/policy.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_acl.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_vfsops.h> +#include <sys/dmu.h> +#include <sys/zap.h> +#include <util/qsort.h> +#include "fs/fs_subr.h" +#include <acl/acl_common.h> + +#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE +#define DENY ACE_ACCESS_DENIED_ACE_TYPE + +#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) +#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ + ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE) +#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) +#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) +#define WRITE_MASK (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS| \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|ACE_WRITE_OWNER) + +#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ + ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE) + +#define SECURE_NO_INHERIT (ACE_WRITE_ACL|ACE_WRITE_OWNER) + +#define OGE_PAD 6 /* traditional owner/group/everyone ACES */ + +static int zfs_ace_can_use(znode_t *zp, ace_t *); + +static zfs_acl_t * +zfs_acl_alloc(int slots) +{ + zfs_acl_t *aclp; + + aclp = kmem_zalloc(sizeof (zfs_acl_t), 
KM_SLEEP); + if (slots != 0) { + aclp->z_acl = kmem_alloc(ZFS_ACL_SIZE(slots), KM_SLEEP); + aclp->z_acl_count = 0; + aclp->z_state = ACL_DATA_ALLOCED; + } else { + aclp->z_state = 0; + } + aclp->z_slots = slots; + return (aclp); +} + +void +zfs_acl_free(zfs_acl_t *aclp) +{ + if (aclp->z_state == ACL_DATA_ALLOCED) { + kmem_free(aclp->z_acl, ZFS_ACL_SIZE(aclp->z_slots)); + } + kmem_free(aclp, sizeof (zfs_acl_t)); +} + +static uint32_t +zfs_v4_to_unix(uint32_t access_mask) +{ + uint32_t new_mask = 0; + + if (access_mask & (ACE_READ_DATA | ACE_LIST_DIRECTORY)) + new_mask |= S_IROTH; + if (access_mask & (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_ADD_FILE)) + new_mask |= S_IWOTH; + if (access_mask & (ACE_EXECUTE|ACE_READ_NAMED_ATTRS)) + new_mask |= S_IXOTH; + + return (new_mask); +} + +/* + * Convert unix access mask to v4 access mask + */ +static uint32_t +zfs_unix_to_v4(uint32_t access_mask) +{ + uint32_t new_mask = 0; + + if (access_mask & 01) + new_mask |= (ACE_EXECUTE); + if (access_mask & 02) { + new_mask |= (ACE_WRITE_DATA); + } if (access_mask & 04) { + new_mask |= ACE_READ_DATA; + } + return (new_mask); +} + +static void +zfs_set_ace(ace_t *zacep, uint32_t access_mask, int access_type, + uid_t uid, int entry_type) +{ + zacep->a_access_mask = access_mask; + zacep->a_type = access_type; + zacep->a_who = uid; + zacep->a_flags = entry_type; +} + +static uint64_t +zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp) +{ + int i; + int entry_type; + mode_t mode = (zp->z_phys->zp_mode & + (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); + mode_t seen = 0; + ace_t *acep; + + for (i = 0, acep = aclp->z_acl; + i != aclp->z_acl_count; i++, acep++) { + entry_type = (acep->a_flags & 0xf040); + if (entry_type == ACE_OWNER) { + if ((acep->a_access_mask & ACE_READ_DATA) && + (!(seen & S_IRUSR))) { + seen |= S_IRUSR; + if (acep->a_type == ALLOW) { + mode |= S_IRUSR; + } + } + if ((acep->a_access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWUSR))) { + seen |= S_IWUSR; + if (acep->a_type == ALLOW) { + 
mode |= S_IWUSR; + } + } + if ((acep->a_access_mask & ACE_EXECUTE) && + (!(seen & S_IXUSR))) { + seen |= S_IXUSR; + if (acep->a_type == ALLOW) { + mode |= S_IXUSR; + } + } + } else if (entry_type == OWNING_GROUP) { + if ((acep->a_access_mask & ACE_READ_DATA) && + (!(seen & S_IRGRP))) { + seen |= S_IRGRP; + if (acep->a_type == ALLOW) { + mode |= S_IRGRP; + } + } + if ((acep->a_access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWGRP))) { + seen |= S_IWGRP; + if (acep->a_type == ALLOW) { + mode |= S_IWGRP; + } + } + if ((acep->a_access_mask & ACE_EXECUTE) && + (!(seen & S_IXGRP))) { + seen |= S_IXGRP; + if (acep->a_type == ALLOW) { + mode |= S_IXGRP; + } + } + } else if (entry_type == ACE_EVERYONE) { + if ((acep->a_access_mask & ACE_READ_DATA)) { + if (!(seen & S_IRUSR)) { + seen |= S_IRUSR; + if (acep->a_type == ALLOW) { + mode |= S_IRUSR; + } + } + if (!(seen & S_IRGRP)) { + seen |= S_IRGRP; + if (acep->a_type == ALLOW) { + mode |= S_IRGRP; + } + } + if (!(seen & S_IROTH)) { + seen |= S_IROTH; + if (acep->a_type == ALLOW) { + mode |= S_IROTH; + } + } + } + if ((acep->a_access_mask & ACE_WRITE_DATA)) { + if (!(seen & S_IWUSR)) { + seen |= S_IWUSR; + if (acep->a_type == ALLOW) { + mode |= S_IWUSR; + } + } + if (!(seen & S_IWGRP)) { + seen |= S_IWGRP; + if (acep->a_type == ALLOW) { + mode |= S_IWGRP; + } + } + if (!(seen & S_IWOTH)) { + seen |= S_IWOTH; + if (acep->a_type == ALLOW) { + mode |= S_IWOTH; + } + } + } + if ((acep->a_access_mask & ACE_EXECUTE)) { + if (!(seen & S_IXUSR)) { + seen |= S_IXUSR; + if (acep->a_type == ALLOW) { + mode |= S_IXUSR; + } + } + if (!(seen & S_IXGRP)) { + seen |= S_IXGRP; + if (acep->a_type == ALLOW) { + mode |= S_IXGRP; + } + } + if (!(seen & S_IXOTH)) { + seen |= S_IXOTH; + if (acep->a_type == ALLOW) { + mode |= S_IXOTH; + } + } + } + } + } + return (mode); +} + +static zfs_acl_t * +zfs_acl_node_read_internal(znode_t *zp) +{ + zfs_acl_t *aclp; + + aclp = zfs_acl_alloc(0); + aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count; + 
aclp->z_acl = &zp->z_phys->zp_acl.z_ace_data[0]; + + return (aclp); +} + +/* + * Read an external acl object. + */ +zfs_acl_t * +zfs_acl_node_read(znode_t *zp) +{ + uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj; + zfs_acl_t *aclp; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + + if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) + return (zfs_acl_node_read_internal(zp)); + + aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_count); + + dmu_read(zp->z_zfsvfs->z_os, extacl, 0, + ZFS_ACL_SIZE(zp->z_phys->zp_acl.z_acl_count), aclp->z_acl); + + aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count; + + return (aclp); +} + +static boolean_t +zfs_acl_valid(znode_t *zp, ace_t *uace, int aclcnt, int *inherit) +{ + ace_t *acep; + int i; + + *inherit = 0; + + if (aclcnt > MAX_ACL_ENTRIES || aclcnt <= 0) { + return (B_FALSE); + } + + for (i = 0, acep = uace; i != aclcnt; i++, acep++) { + + /* + * first check type of entry + */ + + switch (acep->a_flags & 0xf040) { + case ACE_OWNER: + acep->a_who = -1; + break; + case (ACE_IDENTIFIER_GROUP | ACE_GROUP): + case ACE_IDENTIFIER_GROUP: + if (acep->a_flags & ACE_GROUP) { + acep->a_who = -1; + } + break; + case ACE_EVERYONE: + acep->a_who = -1; + break; + } + + /* + * next check inheritance level flags + */ + + if (acep->a_type != ALLOW && acep->a_type != DENY) + return (B_FALSE); + + /* + * Only directories should have inheritance flags. + */ + if (ZTOV(zp)->v_type != VDIR && (acep->a_flags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE| + ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE))) { + return (B_FALSE); + } + + if (acep->a_flags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)) + *inherit = 1; + + if (acep->a_flags & + (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { + if ((acep->a_flags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE)) == 0) { + return (B_FALSE); + } + } + } + + return (B_TRUE); +} +/* + * common code for setting acl's. 
 *
 * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl.
 * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's
 * already checked the acl and knows whether to inherit.
 */
int
zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, dmu_tx_t *tx, int *ihp)
{
	int inherit = 0;
	int error;
	znode_phys_t *zphys = zp->z_phys;
	zfs_znode_acl_t *zacl = &zphys->zp_acl;
	uint32_t acl_phys_size = ZFS_ACL_SIZE(aclp->z_acl_count);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	uint64_t aoid = zphys->zp_acl.z_acl_extern_obj;

	/* Caller must hold both the znode lock and the ACL lock. */
	ASSERT(MUTEX_HELD(&zp->z_lock));
	ASSERT(MUTEX_HELD(&zp->z_acl_lock));

	if (ihp)
		inherit = *ihp;		/* already determined by caller */
	else if (!zfs_acl_valid(zp, aclp->z_acl,
	    aclp->z_acl_count, &inherit)) {
		return (EINVAL);
	}

	dmu_buf_will_dirty(zp->z_dbuf, tx);

	/*
	 * Will ACL fit internally?
	 * If not, it is written to a separate DMU_OT_ACL object; the znode
	 * records that object's id in z_acl_extern_obj.
	 */
	if (aclp->z_acl_count > ACE_SLOT_CNT) {
		if (aoid == 0) {
			aoid = dmu_object_alloc(zfsvfs->z_os,
			    DMU_OT_ACL, acl_phys_size, DMU_OT_NONE, 0, tx);
		} else {
			/* Reuse the existing external object, resized. */
			(void) dmu_object_set_blocksize(zfsvfs->z_os, aoid,
			    acl_phys_size, 0, tx);
		}
		zphys->zp_acl.z_acl_extern_obj = aoid;
		zphys->zp_acl.z_acl_count = aclp->z_acl_count;
		dmu_write(zfsvfs->z_os, aoid, 0,
		    acl_phys_size, aclp->z_acl, tx);
	} else {
		/*
		 * Migrating back embedded?
		 * The ACL now fits in the znode, so free the external object.
		 */
		if (zphys->zp_acl.z_acl_extern_obj) {
			error = dmu_object_free(zfsvfs->z_os,
			    zp->z_phys->zp_acl.z_acl_extern_obj, tx);
			if (error)
				return (error);
			zphys->zp_acl.z_acl_extern_obj = 0;
		}
		bcopy(aclp->z_acl, zacl->z_ace_data,
		    aclp->z_acl_count * sizeof (ace_t));
		zacl->z_acl_count = aclp->z_acl_count;
	}
	/* Record whether any ACE is inheritable by new children. */
	if (inherit)
		zp->z_phys->zp_flags |= ZFS_INHERIT_ACE;
	else
		zp->z_phys->zp_flags &= ~ZFS_INHERIT_ACE;

	/* Keep the cached POSIX mode bits consistent with the new ACL. */
	zphys->zp_mode = zfs_mode_compute(zp, aclp);
	zfs_time_stamper_locked(zp, STATE_CHANGED, tx);

	return (0);
}

/*
 * Create space for slots_needed ACEs to be append
 * to aclp.
 */
static void
zfs_acl_append(zfs_acl_t *aclp, int slots_needed)
{
	ace_t *newacep;
	ace_t *oldaclp;
	int slot_cnt;
	int slots_left = aclp->z_slots - aclp->z_acl_count;

	if (aclp->z_state == ACL_DATA_ALLOCED)
		ASSERT(aclp->z_slots >= aclp->z_acl_count);
	/*
	 * Reallocate when there is not enough room, or when the ACE array
	 * is not owned by this zfs_acl_t (e.g. it points into the znode's
	 * embedded ACL) and therefore must not be grown in place.
	 */
	if (slots_left < slots_needed || aclp->z_state != ACL_DATA_ALLOCED) {
		slot_cnt = aclp->z_slots +  1 + (slots_needed - slots_left);
		newacep = kmem_alloc(ZFS_ACL_SIZE(slot_cnt), KM_SLEEP);
		bcopy(aclp->z_acl, newacep,
		    ZFS_ACL_SIZE(aclp->z_acl_count));
		oldaclp = aclp->z_acl;
		/* Only free the old array if this aclp allocated it. */
		if (aclp->z_state == ACL_DATA_ALLOCED)
			kmem_free(oldaclp, ZFS_ACL_SIZE(aclp->z_slots));
		aclp->z_acl = newacep;
		aclp->z_slots = slot_cnt;
		aclp->z_state = ACL_DATA_ALLOCED;
	}
}

/*
 * Remove "slot" ACE from aclp
 */
static void
zfs_ace_remove(zfs_acl_t *aclp, int slot)
{
	/* Shift the trailing ACEs down over the removed slot. */
	if (aclp->z_acl_count > 1) {
		(void) memmove(&aclp->z_acl[slot],
		    &aclp->z_acl[slot +1], sizeof (ace_t) *
		    (--aclp->z_acl_count - slot));
	} else
		aclp->z_acl_count--;
}

/*
 * Update access mask for prepended ACE
 *
 * This applies the "groupmask" value for aclmode property.
+ */ +static void +zfs_acl_prepend_fixup(ace_t *acep, ace_t *origacep, mode_t mode, uid_t owner) +{ + + int rmask, wmask, xmask; + int user_ace; + + user_ace = (!(acep->a_flags & + (ACE_OWNER|ACE_GROUP|ACE_IDENTIFIER_GROUP))); + + if (user_ace && (acep->a_who == owner)) { + rmask = S_IRUSR; + wmask = S_IWUSR; + xmask = S_IXUSR; + } else { + rmask = S_IRGRP; + wmask = S_IWGRP; + xmask = S_IXGRP; + } + + if (origacep->a_access_mask & ACE_READ_DATA) { + if (mode & rmask) + acep->a_access_mask &= ~ACE_READ_DATA; + else + acep->a_access_mask |= ACE_READ_DATA; + } + + if (origacep->a_access_mask & ACE_WRITE_DATA) { + if (mode & wmask) + acep->a_access_mask &= ~ACE_WRITE_DATA; + else + acep->a_access_mask |= ACE_WRITE_DATA; + } + + if (origacep->a_access_mask & ACE_APPEND_DATA) { + if (mode & wmask) + acep->a_access_mask &= ~ACE_APPEND_DATA; + else + acep->a_access_mask |= ACE_APPEND_DATA; + } + + if (origacep->a_access_mask & ACE_EXECUTE) { + if (mode & xmask) + acep->a_access_mask &= ~ACE_EXECUTE; + else + acep->a_access_mask |= ACE_EXECUTE; + } +} + +/* + * Apply mode to canonical six ACEs. + */ +static void +zfs_acl_fixup_canonical_six(zfs_acl_t *aclp, mode_t mode) +{ + int cnt; + ace_t *acep; + + cnt = aclp->z_acl_count -1; + acep = aclp->z_acl; + + /* + * Fixup final ACEs to match the mode + */ + + ASSERT(cnt >= 5); + adjust_ace_pair(&acep[cnt - 1], mode); /* everyone@ */ + adjust_ace_pair(&acep[cnt - 3], (mode & 0070) >> 3); /* group@ */ + adjust_ace_pair(&acep[cnt - 5], (mode & 0700) >> 6); /* owner@ */ +} + + +static int +zfs_acl_ace_match(ace_t *acep, int allow_deny, int type, int mask) +{ + return (acep->a_access_mask == mask && acep->a_type == allow_deny && + ((acep->a_flags & 0xf040) == type)); +} + +/* + * Can prepended ACE be reused? 
+ */ +static int +zfs_reuse_deny(ace_t *acep, int i) +{ + int okay_masks; + + if (i < 1) + return (B_FALSE); + + if (acep[i-1].a_type != DENY) + return (B_FALSE); + + if (acep[i-1].a_flags != (acep[i].a_flags & ACE_IDENTIFIER_GROUP)) + return (B_FALSE); + + okay_masks = (acep[i].a_access_mask & OKAY_MASK_BITS); + + if (acep[i-1].a_access_mask & ~okay_masks) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * Create space to prepend an ACE + */ +static void +zfs_acl_prepend(zfs_acl_t *aclp, int i) +{ + ace_t *oldaclp = NULL; + ace_t *to, *from; + int slots_left = aclp->z_slots - aclp->z_acl_count; + int oldslots; + int need_free = 0; + + if (aclp->z_state == ACL_DATA_ALLOCED) + ASSERT(aclp->z_slots >= aclp->z_acl_count); + + if (slots_left == 0 || aclp->z_state != ACL_DATA_ALLOCED) { + + to = kmem_alloc(ZFS_ACL_SIZE(aclp->z_acl_count + + OGE_PAD), KM_SLEEP); + if (aclp->z_state == ACL_DATA_ALLOCED) + need_free++; + from = aclp->z_acl; + oldaclp = aclp->z_acl; + (void) memmove(to, from, + sizeof (ace_t) * aclp->z_acl_count); + aclp->z_state = ACL_DATA_ALLOCED; + } else { + from = aclp->z_acl; + to = aclp->z_acl; + } + + + (void) memmove(&to[i + 1], &from[i], + sizeof (ace_t) * (aclp->z_acl_count - i)); + + if (oldaclp) { + aclp->z_acl = to; + oldslots = aclp->z_slots; + aclp->z_slots = aclp->z_acl_count + OGE_PAD; + if (need_free) + kmem_free(oldaclp, ZFS_ACL_SIZE(oldslots)); + } + +} + +/* + * Prepend deny ACE + */ +static void +zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, int i, + mode_t mode) +{ + ace_t *acep; + + zfs_acl_prepend(aclp, i); + + acep = aclp->z_acl; + zfs_set_ace(&acep[i], 0, DENY, acep[i + 1].a_who, + (acep[i + 1].a_flags & 0xf040)); + zfs_acl_prepend_fixup(&acep[i], &acep[i+1], mode, zp->z_phys->zp_uid); + aclp->z_acl_count++; +} + +/* + * Split an inherited ACE into inherit_only ACE + * and original ACE with inheritance flags stripped off. 
+ */ +static void +zfs_acl_split_ace(zfs_acl_t *aclp, int i) +{ + ace_t *acep = aclp->z_acl; + + zfs_acl_prepend(aclp, i); + acep = aclp->z_acl; + acep[i] = acep[i + 1]; + acep[i].a_flags |= ACE_INHERIT_ONLY_ACE; + acep[i + 1].a_flags &= ~ALL_INHERIT; + aclp->z_acl_count++; +} + +/* + * Are ACES started at index i, the canonical six ACES? + */ +static int +zfs_have_canonical_six(zfs_acl_t *aclp, int i) +{ + ace_t *acep = aclp->z_acl; + + if ((zfs_acl_ace_match(&acep[i], + DENY, ACE_OWNER, 0) && + zfs_acl_ace_match(&acep[i + 1], ALLOW, ACE_OWNER, + OWNER_ALLOW_MASK) && zfs_acl_ace_match(&acep[i + 2], + DENY, OWNING_GROUP, 0) && zfs_acl_ace_match(&acep[i + 3], + ALLOW, OWNING_GROUP, 0) && zfs_acl_ace_match(&acep[i + 4], + DENY, ACE_EVERYONE, EVERYONE_DENY_MASK) && + zfs_acl_ace_match(&acep[i + 5], ALLOW, ACE_EVERYONE, + EVERYONE_ALLOW_MASK))) { + return (1); + } else { + return (0); + } +} + +/* + * Apply step 1g, to group entries + * + * Need to deal with corner case where group may have + * greater permissions than owner. If so then limit + * group permissions, based on what extra permissions + * group has. 
+ */ +static void +zfs_fixup_group_entries(ace_t *acep, mode_t mode) +{ + mode_t extramode = (mode >> 3) & 07; + mode_t ownermode = (mode >> 6); + + if (acep[0].a_flags & ACE_IDENTIFIER_GROUP) { + + extramode &= ~ownermode; + + if (extramode) { + if (extramode & 04) { + acep[0].a_access_mask &= ~ACE_READ_DATA; + acep[1].a_access_mask &= ~ACE_READ_DATA; + } + if (extramode & 02) { + acep[0].a_access_mask &= + ~(ACE_WRITE_DATA|ACE_APPEND_DATA); + acep[1].a_access_mask &= + ~(ACE_WRITE_DATA|ACE_APPEND_DATA); + } + if (extramode & 01) { + acep[0].a_access_mask &= ~ACE_EXECUTE; + acep[1].a_access_mask &= ~ACE_EXECUTE; + } + } + } +} + +/* + * Apply the chmod algorithm as described + * in PSARC/2002/240 + */ +static int +zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp, + dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + ace_t *acep; + int i; + int error; + int entry_type; + int reuse_deny; + int need_canonical_six = 1; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + ASSERT(MUTEX_HELD(&zp->z_lock)); + + i = 0; + while (i < aclp->z_acl_count) { + acep = aclp->z_acl; + entry_type = (acep[i].a_flags & 0xf040); + + if ((acep[i].a_type != ALLOW && acep[i].a_type != DENY) || + (acep[i].a_flags & ACE_INHERIT_ONLY_ACE)) { + i++; + continue; + } + + + if (zfsvfs->z_acl_mode == DISCARD) { + zfs_ace_remove(aclp, i); + continue; + } + + /* + * Need to split ace into two? + */ + if ((acep[i].a_flags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE)) && + (!(acep[i].a_flags & ACE_INHERIT_ONLY_ACE))) { + zfs_acl_split_ace(aclp, i); + i++; + continue; + } + + if (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE || + (entry_type == OWNING_GROUP)) { + acep[i].a_access_mask &= ~OGE_CLEAR; + i++; + continue; + + } else { + if (acep[i].a_type == ALLOW) { + + /* + * Check preceding ACE if any, to see + * if we need to prepend a DENY ACE. + * This is only applicable when the acl_mode + * property == groupmask. 
+ */ + if (zfsvfs->z_acl_mode == GROUPMASK) { + + reuse_deny = zfs_reuse_deny(acep, i); + + if (reuse_deny == B_FALSE) { + zfs_acl_prepend_deny(zp, aclp, + i, mode); + i++; + acep = aclp->z_acl; + } else { + zfs_acl_prepend_fixup( + &acep[i - 1], + &acep[i], mode, + zp->z_phys->zp_uid); + } + zfs_fixup_group_entries(&acep[i - 1], + mode); + } + } + i++; + } + } + + /* + * Check out last six aces, if we have six. + */ + + if (aclp->z_acl_count >= 6) { + i = aclp->z_acl_count - 6; + + if (zfs_have_canonical_six(aclp, i)) { + need_canonical_six = 0; + } + } + + if (need_canonical_six) { + + zfs_acl_append(aclp, 6); + i = aclp->z_acl_count; + acep = aclp->z_acl; + zfs_set_ace(&acep[i++], 0, DENY, -1, ACE_OWNER); + zfs_set_ace(&acep[i++], OWNER_ALLOW_MASK, ALLOW, -1, ACE_OWNER); + zfs_set_ace(&acep[i++], 0, DENY, -1, OWNING_GROUP); + zfs_set_ace(&acep[i++], 0, ALLOW, -1, OWNING_GROUP); + zfs_set_ace(&acep[i++], EVERYONE_DENY_MASK, + DENY, -1, ACE_EVERYONE); + zfs_set_ace(&acep[i++], EVERYONE_ALLOW_MASK, + ALLOW, -1, ACE_EVERYONE); + aclp->z_acl_count += 6; + } + + zfs_acl_fixup_canonical_six(aclp, mode); + + zp->z_phys->zp_mode = mode; + error = zfs_aclset_common(zp, aclp, tx, NULL); + return (error); +} + + +int +zfs_acl_chmod_setattr(znode_t *zp, uint64_t mode, dmu_tx_t *tx) +{ + zfs_acl_t *aclp; + int error; + + ASSERT(MUTEX_HELD(&zp->z_lock)); + mutex_enter(&zp->z_acl_lock); + aclp = zfs_acl_node_read(zp); + error = zfs_acl_chmod(zp, mode, aclp, tx); + mutex_exit(&zp->z_acl_lock); + zfs_acl_free(aclp); + return (error); +} + +/* + * strip off write_owner and write_acl + */ +static void +zfs_securemode_update(zfsvfs_t *zfsvfs, ace_t *acep) +{ + if ((zfsvfs->z_acl_inherit == SECURE) && + acep->a_type == ALLOW) + acep->a_access_mask &= ~SECURE_NO_INHERIT; +} + +/* + * inherit inheritable ACEs from parent + */ +static zfs_acl_t * +zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + ace_t *pacep; + ace_t *acep; + int ace_cnt = 0; + int 
pace_cnt; + int i, j; + zfs_acl_t *aclp = NULL; + + i = j = 0; + pace_cnt = paclp->z_acl_count; + pacep = paclp->z_acl; + if (zfsvfs->z_acl_inherit != DISCARD) { + for (i = 0; i != pace_cnt; i++) { + + if (zfsvfs->z_acl_inherit == NOALLOW && + pacep[i].a_type == ALLOW) + continue; + + if (zfs_ace_can_use(zp, &pacep[i])) { + ace_cnt++; + if (!(pacep[i].a_flags & + ACE_NO_PROPAGATE_INHERIT_ACE)) + ace_cnt++; + } + } + } + + aclp = zfs_acl_alloc(ace_cnt + OGE_PAD); + if (ace_cnt && zfsvfs->z_acl_inherit != DISCARD) { + acep = aclp->z_acl; + pacep = paclp->z_acl; + for (i = 0; i != pace_cnt; i++) { + + if (zfsvfs->z_acl_inherit == NOALLOW && + pacep[i].a_type == ALLOW) + continue; + + if (zfs_ace_can_use(zp, &pacep[i])) { + /* + * Now create entry for inherited ace + */ + acep[j] = pacep[i]; + + if (pacep[i].a_flags & + ACE_NO_PROPAGATE_INHERIT_ACE) { + acep[j].a_flags &= ~ALL_INHERIT; + j++; + continue; + } + + if (pacep[i].a_type != ALLOW && + pacep[i].a_type != DENY) { + zfs_securemode_update(zfsvfs, &acep[j]); + j++; + continue; + } + + if (ZTOV(zp)->v_type != VDIR) { + acep[j].a_flags &= ~ALL_INHERIT; + zfs_securemode_update(zfsvfs, &acep[j]); + j++; + continue; + } + + ASSERT(ZTOV(zp)->v_type == VDIR); + + /* + * If we are inheriting an ACE targeted for + * only files, then leave the inherit_only + * one for future propagation. + */ + if ((acep[j].a_flags & (ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE)) != + ACE_FILE_INHERIT_ACE) + acep[j].a_flags &= + ~ACE_INHERIT_ONLY_ACE; + + zfs_securemode_update(zfsvfs, &acep[j]); + j++; + } + } + } + aclp->z_acl_count = j; + ASSERT(aclp->z_slots >= aclp->z_acl_count); + + return (aclp); +} + +/* + * Create file system object initial permissions + * including inheritable ACEs. 
+ */ +void +zfs_perm_init(znode_t *zp, znode_t *parent, int flag, + vattr_t *vap, dmu_tx_t *tx, cred_t *cr) +{ + uint64_t mode; + uid_t uid; + gid_t gid; + int error; + int pull_down; + zfs_acl_t *aclp, *paclp; + + mode = MAKEIMODE(vap->va_type, vap->va_mode); + + /* + * Determine uid and gid. + */ + if ((flag & (IS_ROOT_NODE | IS_REPLAY)) || + ((flag & IS_XATTR) && (vap->va_type == VDIR))) { + uid = vap->va_uid; + gid = vap->va_gid; + } else { + uid = crgetuid(cr); + if ((vap->va_mask & AT_GID) && + ((vap->va_gid == parent->z_phys->zp_gid) || + groupmember(vap->va_gid, cr) || + secpolicy_vnode_create_gid(cr))) + gid = vap->va_gid; + else + gid = (parent->z_phys->zp_mode & S_ISGID) ? + parent->z_phys->zp_gid : crgetgid(cr); + } + + /* + * If we're creating a directory, and the parent directory has the + * set-GID bit set, set in on the new directory. + * Otherwise, if the user is neither privileged nor a member of the + * file's new group, clear the file's set-GID bit. + */ + + if ((parent->z_phys->zp_mode & S_ISGID) && (vap->va_type == VDIR)) + mode |= S_ISGID; + else { + if ((mode & S_ISGID) && + secpolicy_vnode_setids_setgids(cr, gid) != 0) + mode &= ~S_ISGID; + } + + zp->z_phys->zp_uid = uid; + zp->z_phys->zp_gid = gid; + zp->z_phys->zp_mode = mode; + + mutex_enter(&parent->z_lock); + pull_down = (parent->z_phys->zp_flags & ZFS_INHERIT_ACE); + if (pull_down) { + mutex_enter(&parent->z_acl_lock); + paclp = zfs_acl_node_read(parent); + mutex_exit(&parent->z_acl_lock); + aclp = zfs_acl_inherit(zp, paclp); + zfs_acl_free(paclp); + } else { + aclp = zfs_acl_alloc(6); + } + mutex_exit(&parent->z_lock); + mutex_enter(&zp->z_lock); + mutex_enter(&zp->z_acl_lock); + error = zfs_acl_chmod(zp, mode, aclp, tx); + mutex_exit(&zp->z_lock); + mutex_exit(&zp->z_acl_lock); + ASSERT3U(error, ==, 0); + zfs_acl_free(aclp); +} + +/* + * Can use be used for inheritance + */ +static int +zfs_ace_can_use(znode_t *zp, ace_t *acep) +{ + int vtype = ZTOV(zp)->v_type; + + int iflags = 
(acep->a_flags & 0xf); + + if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) + return (1); + + else if (iflags & ACE_FILE_INHERIT_ACE) + return (1); + + return (0); +} + +/* + * Retrieve a files ACL + */ +int +zfs_getacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr) +{ + zfs_acl_t *aclp; + ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); + int error; + + if (error = zfs_zaccess(zp, ACE_READ_ACL, cr)) { + /* + * If owner of file then allow reading of the + * ACL. + */ + if (crgetuid(cr) != zp->z_phys->zp_uid) + return (error); + } + + if (mask == 0) + return (ENOSYS); + + mutex_enter(&zp->z_acl_lock); + + aclp = zfs_acl_node_read(zp); + + if (mask & VSA_ACECNT) { + vsecp->vsa_aclcnt = aclp->z_acl_count; + } + + if (mask & VSA_ACE) { + vsecp->vsa_aclentp = kmem_alloc(aclp->z_acl_count * + sizeof (ace_t), KM_SLEEP); + bcopy(aclp->z_acl, vsecp->vsa_aclentp, + aclp->z_acl_count * sizeof (ace_t)); + } + + mutex_exit(&zp->z_acl_lock); + + zfs_acl_free(aclp); + + return (0); +} + +/* + * Set a files ACL + */ +int +zfs_setacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + ace_t *acep = vsecp->vsa_aclentp; + int aclcnt = vsecp->vsa_aclcnt; + ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); + dmu_tx_t *tx; + int error; + int inherit; + zfs_acl_t *aclp; + uint64_t seq = 0; + + if (mask == 0) + return (EINVAL); + + if (!zfs_acl_valid(zp, acep, aclcnt, &inherit)) + return (EINVAL); +top: + error = zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr); + if (error == EACCES || error == ACCESS_UNDETERMINED) { + if ((error = secpolicy_vnode_setdac(cr, + zp->z_phys->zp_uid)) != 0) { + return (error); + } + } else if (error) { + return (error == EROFS ? 
		    error : EPERM);
	}

	mutex_enter(&zp->z_lock);
	mutex_enter(&zp->z_acl_lock);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);

	/* Reserve write space for the external ACL object, if one is needed. */
	if (zp->z_phys->zp_acl.z_acl_extern_obj) {
		dmu_tx_hold_write(tx, zp->z_phys->zp_acl.z_acl_extern_obj,
		    0, ZFS_ACL_SIZE(aclcnt));
	} else if (aclcnt > ACE_SLOT_CNT) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, ZFS_ACL_SIZE(aclcnt));
	}

	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		dmu_tx_abort(tx);

		mutex_exit(&zp->z_acl_lock);
		mutex_exit(&zp->z_lock);

		/*
		 * Transaction group was full; wait for the next open txg
		 * and retry from the permission check.
		 */
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
			goto top;
		}
		return (error);
	}

	aclp = zfs_acl_alloc(aclcnt);
	bcopy(acep, aclp->z_acl, sizeof (ace_t) * aclcnt);
	aclp->z_acl_count = aclcnt;
	/* ACL was validated above, so zfs_aclset_common() cannot fail. */
	error = zfs_aclset_common(zp, aclp, tx, &inherit);
	ASSERT(error == 0);

	zfs_acl_free(aclp);
	seq = zfs_log_acl(zilog, tx, TX_ACL, zp, aclcnt, acep);
	dmu_tx_commit(tx);
done:
	/* NOTE(review): no goto targets "done:" — unused label (warning). */
	mutex_exit(&zp->z_acl_lock);
	mutex_exit(&zp->z_lock);

	zil_commit(zilog, seq, 0);

	return (error);
}

/*
 * Evaluate one ACE against the modes still wanted.
 * Returns 0 once all wanted bits are granted, EACCES on an applicable
 * DENY, and ACCESS_UNDETERMINED if this ACE does not settle the request.
 * working_mode accumulates bits granted so far across ACEs.
 */
static int
zfs_ace_access(ace_t *zacep, int mode_wanted, int *working_mode)
{
	if ((*working_mode & mode_wanted) == mode_wanted) {
		return (0);
	}

	if (zacep->a_access_mask & mode_wanted) {
		if (zacep->a_type == ALLOW) {
			*working_mode |= (mode_wanted & zacep->a_access_mask);
			if ((*working_mode & mode_wanted) == mode_wanted)
				return (0);
		} else if (zacep->a_type == DENY) {
			return (EACCES);
		}
	}

	/*
	 * haven't been specifically denied at this point
	 * so return UNDETERMINED.
+ */ + + return (ACCESS_UNDETERMINED); +} + + +static int +zfs_zaccess_common(znode_t *zp, int v4_mode, int *working_mode, cred_t *cr) +{ + zfs_acl_t *aclp; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + ace_t *zacep; + gid_t gid; + int mode_wanted = v4_mode; + int cnt; + int i; + int access_deny = ACCESS_UNDETERMINED; + uint_t entry_type; + uid_t uid = crgetuid(cr); + + *working_mode = 0; + + if (zfsvfs->z_assign >= TXG_INITIAL) /* ZIL replay */ + return (0); + + if ((v4_mode & WRITE_MASK) && + (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && + (!IS_DEVVP(ZTOV(zp)))) { + return (EROFS); + } + + mutex_enter(&zp->z_acl_lock); + + aclp = zfs_acl_node_read(zp); + + zacep = aclp->z_acl; + cnt = aclp->z_acl_count; + + for (i = 0; i != cnt; i++) { + + if (zacep[i].a_flags & ACE_INHERIT_ONLY_ACE) + continue; + + entry_type = (zacep[i].a_flags & 0xf040); + switch (entry_type) { + case ACE_OWNER: + if (uid == zp->z_phys->zp_uid) { + access_deny = zfs_ace_access(&zacep[i], + mode_wanted, working_mode); + } + break; + case (ACE_IDENTIFIER_GROUP | ACE_GROUP): + case ACE_IDENTIFIER_GROUP: + /* + * Owning group gid is in znode not ACL + */ + if (entry_type == (ACE_IDENTIFIER_GROUP | ACE_GROUP)) + gid = zp->z_phys->zp_gid; + else + gid = zacep[i].a_who; + + if (groupmember(gid, cr)) { + access_deny = zfs_ace_access(&zacep[i], + mode_wanted, working_mode); + } + break; + case ACE_EVERYONE: + access_deny = zfs_ace_access(&zacep[i], + mode_wanted, working_mode); + break; + + /* USER Entry */ + default: + if (entry_type == 0) { + if (uid == zacep[i].a_who) { + access_deny = zfs_ace_access(&zacep[i], + mode_wanted, working_mode); + } + break; + } + zfs_acl_free(aclp); + mutex_exit(&zp->z_acl_lock); + return (EIO); + } + + if (access_deny != ACCESS_UNDETERMINED) + break; + + } + + mutex_exit(&zp->z_acl_lock); + zfs_acl_free(aclp); + + return (access_deny); +} + + +/* + * Determine whether Access should be granted/denied, invoking least + * priv subsytem when a deny is determined. 
+ */ +int +zfs_zaccess(znode_t *zp, int mode, cred_t *cr) +{ + int working_mode = 0; + int error; + int is_attr; + znode_t *xzp; + znode_t *check_zp = zp; + + is_attr = ((zp->z_phys->zp_flags & ZFS_XATTR) && + (ZTOV(zp)->v_type == VDIR)); + + /* + * If attribute then validate against base file + */ + if (is_attr) { + if ((error = zfs_zget(zp->z_zfsvfs, + zp->z_phys->zp_parent, &xzp)) != 0) { + return (error); + } + check_zp = xzp; + /* + * fixup mode to map to xattr perms + */ + + if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) { + mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); + mode |= ACE_WRITE_NAMED_ATTRS; + } + + if (mode & (ACE_READ_DATA|ACE_EXECUTE)) { + mode &= ~(ACE_READ_DATA|ACE_EXECUTE); + mode |= ACE_READ_NAMED_ATTRS; + } + } + + error = zfs_zaccess_common(check_zp, mode, &working_mode, cr); + + if (error == EROFS) { + if (is_attr) + VN_RELE(ZTOV(xzp)); + return (error); + } + + if (error || (working_mode != mode)) { + error = secpolicy_vnode_access(cr, ZTOV(check_zp), + check_zp->z_phys->zp_uid, ~zfs_v4_to_unix(working_mode)); + } + + if (is_attr) + VN_RELE(ZTOV(xzp)); + + return (error); +} + +/* + * Special zaccess function to check for special nfsv4 perm. + * doesn't call secpolicy_vnode_access() for failure, since that + * would probably be the wrong policy function to call. + * instead its up to the caller to handle that situation. + */ + +int +zfs_zaccess_v4_perm(znode_t *zp, int mode, cred_t *cr) +{ + int working_mode = 0; + return (zfs_zaccess_common(zp, mode, &working_mode, cr)); +} + +/* + * Translate tradition unix VREAD/VWRITE/VEXEC mode into + * native ACL format and call zfs_zaccess() + */ +int +zfs_zaccess_rwx(znode_t *zp, mode_t mode, cred_t *cr) +{ + int v4_mode = zfs_unix_to_v4(mode >> 6); + + return (zfs_zaccess(zp, v4_mode, cr)); +} + +/* + * Determine whether Access should be granted/deny, without + * consulting least priv subsystem. + * + * + * The following chart is the recommended NFSv4 enforcement for + * ability to delete an object. 
+ * + * ------------------------------------------------------- + * | Parent Dir | Target Object Permissions | + * | permissions | | + * ------------------------------------------------------- + * | | ACL Allows | ACL Denies| Delete | + * | | Delete | Delete | unspecified| + * ------------------------------------------------------- + * | ACL Allows | Permit | Permit | Permit | + * | DELETE_CHILD | | + * ------------------------------------------------------- + * | ACL Denies | Permit | Deny | Deny | + * | DELETE_CHILD | | | | + * ------------------------------------------------------- + * | ACL specifies | | | | + * | only allow | Permit | Permit | Permit | + * | write and | | | | + * | execute | | | | + * ------------------------------------------------------- + * | ACL denies | | | | + * | write and | Permit | Deny | Deny | + * | execute | | | | + * ------------------------------------------------------- + * ^ + * | + * No search privilege, can't even look up file? + * + */ +int +zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) +{ + int dzp_working_mode = 0; + int zp_working_mode = 0; + int dzp_error, zp_error; + + /* + * Arghh, this check is going to require a couple of questions + * to be asked. We want specific DELETE permissions to + * take precedence over WRITE/EXECUTE. We don't + * want an ACL such as this to mess us up. + * user:sloar:write_data:deny,user:sloar:delete:allow + * + * However, deny permissions may ultimately be overridden + * by secpolicy_vnode_access(). 
+ */ + + dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD, + &dzp_working_mode, cr); + zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, cr); + + if (dzp_error == EROFS || zp_error == EROFS) + return (dzp_error); + + /* + * First handle the first row + */ + if (dzp_working_mode & ACE_DELETE_CHILD) + return (0); + + /* + * Second row + */ + + if (zp_working_mode & ACE_DELETE) + return (0); + + /* + * Third Row + */ + + dzp_error = zfs_zaccess_common(dzp, ACE_WRITE_DATA|ACE_EXECUTE, + &dzp_working_mode, cr); + + if (dzp_error == EROFS) + return (dzp_error); + + if (dzp_working_mode & (ACE_WRITE_DATA|ACE_EXECUTE)) + return (0); + + /* + * Fourth Row + */ + + if (((dzp_working_mode & (ACE_WRITE_DATA|ACE_EXECUTE)) == 0) && + (zp_working_mode & ACE_DELETE)) + return (0); + + return (secpolicy_vnode_access(cr, ZTOV(zp), dzp->z_phys->zp_uid, + S_IWRITE|S_IEXEC)); +} + +int +zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, + znode_t *tzp, cred_t *cr) +{ + int add_perm; + int error; + + add_perm = (ZTOV(szp)->v_type == VDIR) ? + ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE; + + /* + * Rename permissions are combination of delete permission + + * add file/subdir permission. + */ + + /* + * first make sure we do the delete portion. + * + * If that succeeds then check for add_file/add_subdir permissions + */ + + if (error = zfs_zaccess_delete(sdzp, szp, cr)) + return (error); + + /* + * If we have a tzp, see if we can delete it? 
+ */ + if (tzp) { + if (error = zfs_zaccess_delete(tdzp, tzp, cr)) + return (error); + } + + /* + * Now check for add permissions + */ + if (error = zfs_zaccess(sdzp, add_perm, cr)) + return (error); + + error = zfs_sticky_remove_access(sdzp, szp, cr); + + return (error); +} diff --git a/usr/src/uts/common/fs/zfs/zfs_byteswap.c b/usr/src/uts/common/fs/zfs/zfs_byteswap.c new file mode 100644 index 0000000000..e1e857aa44 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs_byteswap.c @@ -0,0 +1,99 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/vfs.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_acl.h> + +void +zfs_ace_byteswap(ace_t *ace, int ace_cnt) +{ + int i; + + for (i = 0; i != ace_cnt; i++, ace++) { + ace->a_who = BSWAP_32(ace->a_who); + ace->a_access_mask = BSWAP_32(ace->a_access_mask); + ace->a_flags = BSWAP_16(ace->a_flags); + ace->a_type = BSWAP_16(ace->a_type); + } +} + +/* ARGSUSED */ +void +zfs_acl_byteswap(void *buf, size_t size) +{ + int cnt; + + /* + * Arggh, since we don't know how many ACEs are in + * the array, we have to swap the entire block + */ + + cnt = size / sizeof (ace_t); + + zfs_ace_byteswap((ace_t *)buf, cnt); +} + +void +zfs_znode_byteswap(void *buf, size_t size) +{ + znode_phys_t *zp = buf; + + ASSERT(size >= sizeof (znode_phys_t)); + + zp->zp_crtime[0] = BSWAP_64(zp->zp_crtime[0]); + zp->zp_crtime[1] = BSWAP_64(zp->zp_crtime[1]); + zp->zp_atime[0] = BSWAP_64(zp->zp_atime[0]); + zp->zp_atime[1] = BSWAP_64(zp->zp_atime[1]); + zp->zp_mtime[0] = BSWAP_64(zp->zp_mtime[0]); + zp->zp_mtime[1] = BSWAP_64(zp->zp_mtime[1]); + zp->zp_ctime[0] = BSWAP_64(zp->zp_ctime[0]); + zp->zp_ctime[1] = BSWAP_64(zp->zp_ctime[1]); + zp->zp_gen = BSWAP_64(zp->zp_gen); + zp->zp_mode = BSWAP_64(zp->zp_mode); + zp->zp_size = BSWAP_64(zp->zp_size); + zp->zp_parent = BSWAP_64(zp->zp_parent); + zp->zp_links = BSWAP_64(zp->zp_links); + zp->zp_xattr = BSWAP_64(zp->zp_xattr); + zp->zp_rdev = BSWAP_64(zp->zp_rdev); + zp->zp_flags = BSWAP_64(zp->zp_flags); + zp->zp_uid = BSWAP_64(zp->zp_uid); + zp->zp_gid = BSWAP_64(zp->zp_gid); + zp->zp_pad[0] = BSWAP_64(zp->zp_pad[0]); + zp->zp_pad[1] = BSWAP_64(zp->zp_pad[1]); + zp->zp_pad[2] = BSWAP_64(zp->zp_pad[2]); + zp->zp_pad[3] = BSWAP_64(zp->zp_pad[3]); + + zp->zp_acl.z_acl_extern_obj = BSWAP_64(zp->zp_acl.z_acl_extern_obj); + zp->zp_acl.z_acl_count = BSWAP_32(zp->zp_acl.z_acl_count); + zp->zp_acl.z_acl_version = BSWAP_16(zp->zp_acl.z_acl_version); 
+ zp->zp_acl.z_acl_state = BSWAP_16(zp->zp_acl.z_acl_state); + zfs_ace_byteswap(&zp->zp_acl.z_ace_data[0], ACE_SLOT_CNT); +} diff --git a/usr/src/uts/common/fs/zfs/zfs_ctldir.c b/usr/src/uts/common/fs/zfs/zfs_ctldir.c new file mode 100644 index 0000000000..229b042c4a --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs_ctldir.c @@ -0,0 +1,936 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * ZFS control directory (a.k.a. ".zfs") + * + * This directory provides a common location for all ZFS meta-objects. + * Currently, this is only the 'snapshot' directory, but this may expand in the + * future. The elements are built using the GFS primitives, as the hierarchy + * does not actually exist on disk. + * + * For 'snapshot', we don't want to have all snapshots always mounted, because + * this would take up a huge amount of space in /etc/mnttab. 
We have three + * types of objects: + * + * ctldir ------> snapshotdir -------> snapshot + * | + * | + * V + * mounted fs + * + * The 'snapshot' node contains just enough information to lookup '..' and act + * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we + * perform an automount of the underlying filesystem and return the + * corresponding vnode. + * + * All mounts are handled automatically by the kernel, but unmounts are + * (currently) handled from user land. The main reason is that there is no + * reliable way to auto-unmount the filesystem when it's "no longer in use". + * When the user unmounts a filesystem, we call zfsctl_unmount(), which + * unmounts any snapshots within the snapshot directory. + */ + +#include <fs/fs_subr.h> +#include <sys/zfs_ctldir.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_vfsops.h> +#include <sys/gfs.h> +#include <sys/stat.h> +#include <sys/dmu.h> +#include <sys/mount.h> + +typedef struct { + char *se_name; + vnode_t *se_root; + avl_node_t se_node; +} zfs_snapentry_t; + +static int +snapentry_compare(const void *a, const void *b) +{ + const zfs_snapentry_t *sa = a; + const zfs_snapentry_t *sb = b; + int ret = strcmp(sa->se_name, sb->se_name); + + if (ret < 0) + return (-1); + else if (ret > 0) + return (1); + else + return (0); +} + +vnodeops_t *zfsctl_ops_root; +vnodeops_t *zfsctl_ops_snapdir; +vnodeops_t *zfsctl_ops_snapshot; + +static const fs_operation_def_t zfsctl_tops_root[]; +static const fs_operation_def_t zfsctl_tops_snapdir[]; +static const fs_operation_def_t zfsctl_tops_snapshot[]; + +static vnode_t *zfsctl_mknode_snapdir(vnode_t *); +static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset); + +static gfs_opsvec_t zfsctl_opsvec[] = { + { ".zfs", zfsctl_tops_root, &zfsctl_ops_root }, + { ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir }, + { ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot }, + { NULL } +}; + +typedef struct zfsctl_node { + gfs_dir_t 
zc_gfs_private; + uint64_t zc_id; +} zfsctl_node_t; + +typedef struct zfsctl_snapdir { + zfsctl_node_t sd_node; + kmutex_t sd_lock; + avl_tree_t sd_snaps; +} zfsctl_snapdir_t; + +/* + * Root directory elements. We have only a single static entry, 'snapshot'. + */ +static gfs_dirent_t zfsctl_root_entries[] = { + { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE }, + { NULL } +}; + +/* include . and .. in the calculation */ +#define NROOT_ENTRIES ((sizeof (zfsctl_root_entries) / \ + sizeof (gfs_dirent_t)) + 1) + + +/* + * Initialize the various GFS pieces we'll need to create and manipulate .zfs + * directories. This is called from the ZFS init routine, and initializes the + * vnode ops vectors that we'll be using. + */ +void +zfsctl_init(void) +{ + VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0); +} + +void +zfsctl_fini(void) +{ + /* + * Remove vfsctl vnode ops + */ + if (zfsctl_ops_root) + vn_freevnodeops(zfsctl_ops_root); + if (zfsctl_ops_snapdir) + vn_freevnodeops(zfsctl_ops_snapdir); + if (zfsctl_ops_snapshot) + vn_freevnodeops(zfsctl_ops_snapshot); + + zfsctl_ops_root = NULL; + zfsctl_ops_snapdir = NULL; + zfsctl_ops_snapshot = NULL; +} + +/* + * Return the inode number associated with the 'snapshot' directory. + */ +/* ARGSUSED */ +static ino64_t +zfsctl_root_inode_cb(vnode_t *vp, int index) +{ + ASSERT(index == 0); + return (ZFSCTL_INO_SNAPDIR); +} + +/* + * Create the '.zfs' directory. This directory is cached as part of the VFS + * structure. This results in a hold on the vfs_t. The code in zfs_umount() + * therefore checks against a vfs_count of 2 instead of 1. This reference + * is removed when the ctldir is destroyed in the unmount. 
+ */ +void +zfsctl_create(zfsvfs_t *zfsvfs) +{ + vnode_t *vp; + zfsctl_node_t *zcp; + + ASSERT(zfsvfs->z_ctldir == NULL); + + vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs, + zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries, + zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL); + zcp = vp->v_data; + zcp->zc_id = ZFSCTL_INO_ROOT; + + /* + * We're only faking the fact that we have a root of a filesystem for + * the sake of the GFS interfaces. Undo the flag manipulation it did + * for us. + */ + vp->v_flag &= ~(VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT); + + zfsvfs->z_ctldir = vp; +} + +/* + * Destroy the '.zfs' directory. Only called when the filesystem is + * unmounted, and there are no more references. Release the vnode, + * which will release the hold on the vfs structure. + */ +void +zfsctl_destroy(zfsvfs_t *zfsvfs) +{ + ASSERT(zfsvfs->z_ctldir->v_count == 1); + VN_RELE(zfsvfs->z_ctldir); + zfsvfs->z_ctldir = NULL; +} + +/* + * Given a root znode, retrieve the associated .zfs directory. + * Add a hold to the vnode and return it. + */ +vnode_t * +zfsctl_root(znode_t *zp) +{ + ASSERT(zfs_has_ctldir(zp)); + VN_HOLD(zp->z_zfsvfs->z_ctldir); + return (zp->z_zfsvfs->z_ctldir); +} + +/* + * Common open routine. Disallow any write access. + */ +/* ARGSUSED */ +static int +zfsctl_common_open(vnode_t **vpp, int flags, cred_t *cr) +{ + if (flags & FWRITE) + return (EACCES); + + return (0); +} + +/* + * Common close routine. Nothing to do here. + */ +/* ARGSUSED */ +static int +zfsctl_common_close(vnode_t *vpp, int flags, int count, offset_t off, + cred_t *cr) +{ + return (0); +} + +/* + * Common access routine. Disallow writes. + */ +/* ARGSUSED */ +static int +zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr) +{ + if (mode & VWRITE) + return (EACCES); + + return (0); +} + +/* + * Common getattr function. Fill in basic information. 
+ */ +static void +zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) +{ + timestruc_t now; + + vap->va_uid = 0; + vap->va_gid = 0; + vap->va_rdev = 0; + /* + * We are a purly virtual object, so we have no + * blocksize or allocated blocks. + */ + vap->va_blksize = 0; + vap->va_nblocks = 0; + vap->va_seq = 0; + vap->va_fsid = vp->v_vfsp->vfs_dev; + vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | + S_IROTH | S_IXOTH; + vap->va_type = VDIR; + /* + * We live in the now. + */ + gethrestime(&now); + vap->va_mtime = vap->va_ctime = vap->va_atime = now; +} + +static int +zfsctl_common_fid(vnode_t *vp, fid_t *fidp) +{ + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + zfsctl_node_t *zcp = vp->v_data; + uint64_t object = zcp->zc_id; + zfid_short_t *zfid; + int i; + + ZFS_ENTER(zfsvfs); + + if (fidp->fid_len < SHORT_FID_LEN) { + fidp->fid_len = SHORT_FID_LEN; + return (ENOSPC); + } + + zfid = (zfid_short_t *)fidp; + + zfid->zf_len = SHORT_FID_LEN; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); + + /* .zfs znodes always have a generation number of 0 */ + for (i = 0; i < sizeof (zfid->zf_gen); i++) + zfid->zf_gen[i] = 0; + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * .zfs inode namespace + * + * We need to generate unique inode numbers for all files and directories + * within the .zfs pseudo-filesystem. We use the following scheme: + * + * ENTRY ZFSCTL_INODE + * .zfs 1 + * .zfs/snapshot 2 + * .zfs/snapshot/<snap> objectid(snap) + */ + +#define ZFSCTL_INO_SNAP(id) (id) + +/* + * Get root directory attributes. + */ +/* ARGSUSED */ +static int +zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) +{ + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + + ZFS_ENTER(zfsvfs); + vap->va_nodeid = ZFSCTL_INO_ROOT; + vap->va_nlink = vap->va_size = NROOT_ENTRIES; + + zfsctl_common_getattr(vp, vap); + ZFS_EXIT(zfsvfs); + + return (0); +} + +/* + * Special case the handling of "..". 
+ */ +/* ARGSUSED */ +int +zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, + int flags, vnode_t *rdir, cred_t *cr) +{ + zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; + int err; + + ZFS_ENTER(zfsvfs); + + if (strcmp(nm, "..") == 0) { + err = VFS_ROOT(dvp->v_vfsp, vpp); + } else { + err = gfs_dir_lookup(dvp, nm, vpp); + } + + ZFS_EXIT(zfsvfs); + + return (err); +} + +static const fs_operation_def_t zfsctl_tops_root[] = { + { VOPNAME_OPEN, zfsctl_common_open }, + { VOPNAME_CLOSE, zfsctl_common_close }, + { VOPNAME_IOCTL, fs_inval }, + { VOPNAME_GETATTR, zfsctl_root_getattr }, + { VOPNAME_ACCESS, zfsctl_common_access }, + { VOPNAME_READDIR, gfs_vop_readdir }, + { VOPNAME_LOOKUP, zfsctl_root_lookup }, + { VOPNAME_SEEK, fs_seek }, + { VOPNAME_INACTIVE, (fs_generic_func_p) gfs_vop_inactive }, + { VOPNAME_FID, zfsctl_common_fid }, + { NULL } +}; + +static int +zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) +{ + objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; + + dmu_objset_name(os, zname); + (void) strcat(zname, "@"); + if (strlen(zname) + strlen(name) >= len) + return (ENAMETOOLONG); + (void) strcat(zname, name); + return (0); +} + +static int +zfsctl_unmount_snap(vnode_t *dvp, const char *name, int force, cred_t *cr) +{ + zfsctl_snapdir_t *sdp = dvp->v_data; + zfs_snapentry_t search, *sep; + avl_index_t where; + int err; + + ASSERT(MUTEX_HELD(&sdp->sd_lock)); + + search.se_name = (char *)name; + if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) + return (ENOENT); + + ASSERT(vn_ismntpt(sep->se_root)); + + /* this will be dropped by dounmount() */ + if ((err = vn_vfswlock(sep->se_root)) != 0) + return (err); + + VN_HOLD(sep->se_root); + if ((err = dounmount(vn_mountedvfs(sep->se_root), force, kcred)) != 0) + return (err); + ASSERT(sep->se_root->v_count == 1); + gfs_vop_inactive(sep->se_root, cr); + + avl_remove(&sdp->sd_snaps, sep); + kmem_free(sep->se_name, strlen(sep->se_name) + 1); + 
kmem_free(sep, sizeof (zfs_snapentry_t)); + + return (0); +} + + +static int +zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm) +{ + avl_index_t where; + vfs_t *vfsp; + refstr_t *pathref; + char newpath[MAXNAMELEN]; + const char *oldpath; + char *tail; + int err; + + ASSERT(MUTEX_HELD(&sdp->sd_lock)); + ASSERT(sep != NULL); + + vfsp = vn_mountedvfs(sep->se_root); + ASSERT(vfsp != NULL); + + if (err = vfs_lock(vfsp)) + return (err); + + /* + * Change the name in the AVL tree. + */ + avl_remove(&sdp->sd_snaps, sep); + kmem_free(sep->se_name, strlen(sep->se_name) + 1); + sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP); + (void) strcpy(sep->se_name, nm); + VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL); + avl_insert(&sdp->sd_snaps, sep, where); + + /* + * Change the current mountpoint info: + * - update the tail of the mntpoint path + * - update the tail of the resource path + */ + pathref = vfs_getmntpoint(vfsp); + oldpath = refstr_value(pathref); + VERIFY((tail = strrchr(oldpath, '/')) != NULL); + ASSERT((tail - oldpath) + strlen(nm) + 2 < MAXNAMELEN); + (void) strncpy(newpath, oldpath, tail - oldpath + 1); + (void) strcat(newpath, nm); + refstr_rele(pathref); + vfs_setmntpoint(vfsp, newpath); + + pathref = vfs_getresource(vfsp); + oldpath = refstr_value(pathref); + VERIFY((tail = strrchr(oldpath, '@')) != NULL); + ASSERT((tail - oldpath) + strlen(nm) + 2 < MAXNAMELEN); + (void) strncpy(newpath, oldpath, tail - oldpath + 1); + (void) strcat(newpath, nm); + refstr_rele(pathref); + vfs_setresource(vfsp, newpath); + + vfs_unlock(vfsp); + return (0); +} + +static int +zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, + cred_t *cr) +{ + zfsctl_snapdir_t *sdp = sdvp->v_data; + zfs_snapentry_t search, *sep; + avl_index_t where; + char from[MAXNAMELEN], to[MAXNAMELEN]; + int err; + + VERIFY(zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from) == 0); + err = zfs_secpolicy_write(from, NULL, cr); + if (err) + return 
(err); + + /* + * Cannot move snapshots out of the snapdir. + */ + if (sdvp != tdvp) + return (EINVAL); + + if (strcmp(snm, tnm) == 0) + return (0); + + mutex_enter(&sdp->sd_lock); + + search.se_name = (char *)snm; + if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) { + err = zfsctl_rename_snap(sdp, sep, tnm); + if (err) { + mutex_exit(&sdp->sd_lock); + return (err); + } + } + + + VERIFY(zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to) == 0); + err = dmu_objset_rename(from, to); + + mutex_exit(&sdp->sd_lock); + + return (err); +} + +/* ARGSUSED */ +static int +zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr) +{ + zfsctl_snapdir_t *sdp = dvp->v_data; + char snapname[MAXNAMELEN]; + int err; + + VERIFY(zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname) == 0); + err = zfs_secpolicy_write(snapname, NULL, cr); + if (err) + return (err); + + mutex_enter(&sdp->sd_lock); + + err = zfsctl_unmount_snap(dvp, name, 0, cr); + if (err) { + mutex_exit(&sdp->sd_lock); + return (err); + } + + err = dmu_objset_destroy(snapname); + + mutex_exit(&sdp->sd_lock); + + return (err); +} + +/* + * Lookup entry point for the 'snapshot' directory. Try to open the + * snapshot if it exist, creating the pseudo filesystem vnode as necessary. + * Perform a mount of the associated dataset on top of the vnode. 
+ */ +/* ARGSUSED */ +static int +zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, + int flags, vnode_t *rdir, cred_t *cr) +{ + zfsctl_snapdir_t *sdp = dvp->v_data; + objset_t *snap; + char snapname[MAXNAMELEN]; + char *mountpoint; + zfs_snapentry_t *sep, search; + struct mounta margs; + vfs_t *vfsp; + size_t mountpoint_len; + avl_index_t where; + zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; + int err; + + ASSERT(dvp->v_type == VDIR); + + if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) + return (0); + + /* + * If we get a recursive call, that means we got called + * from the domount() code while it was trying to look up the + * spec (which looks like a local path for zfs). We need to + * add some flag to domount() to tell it not to do this lookup. + */ + if (MUTEX_HELD(&sdp->sd_lock)) + return (ENOENT); + + ZFS_ENTER(zfsvfs); + + mutex_enter(&sdp->sd_lock); + search.se_name = (char *)nm; + if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) { + *vpp = sep->se_root; + VN_HOLD(*vpp); + /* + * If the snapshot was unmounted behind our backs, remount it. + */ + if (!vn_ismntpt(*vpp)) + goto domount; + VERIFY(traverse(vpp) == 0); + mutex_exit(&sdp->sd_lock); + ZFS_EXIT(zfsvfs); + return (0); + } + + /* + * The requested snapshot is not currently mounted, look it up. 
+ */ + VERIFY(zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname) == 0); + if (dmu_objset_open(snapname, DMU_OST_ZFS, + DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) { + mutex_exit(&sdp->sd_lock); + ZFS_EXIT(zfsvfs); + return (ENOENT); + } + + sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP); + sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP); + (void) strcpy(sep->se_name, nm); + *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap)); + avl_insert(&sdp->sd_snaps, sep, where); + + dmu_objset_close(snap); +domount: + mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) + + strlen("/.zfs/snapshot/") + strlen(nm) + 1; + mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP); + (void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s", + refstr_value(dvp->v_vfsp->vfs_mntpt), nm); + + margs.spec = snapname; + margs.dir = mountpoint; + margs.flags = MS_SYSSPACE | MS_NOMNTTAB; + margs.fstype = "zfs"; + margs.dataptr = NULL; + margs.datalen = 0; + margs.optptr = NULL; + margs.optlen = 0; + + err = domount("zfs", &margs, *vpp, kcred, &vfsp); + ASSERT3U(err, ==, 0); + + kmem_free(mountpoint, mountpoint_len); + + VFS_RELE(vfsp); + + /* + * Fix up the root vnode. 
+ */ + VERIFY(traverse(vpp) == 0); + ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs); + VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs; + (*vpp)->v_vfsp = zfsvfs->z_vfs; + (*vpp)->v_flag &= ~VROOT; + mutex_exit(&sdp->sd_lock); + ZFS_EXIT(zfsvfs); + + return (0); +} + +/* ARGSUSED */ +static int +zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp, + offset_t *offp, offset_t *nextp, void *data) +{ + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + char snapname[MAXNAMELEN]; + uint64_t id, cookie; + + ZFS_ENTER(zfsvfs); + + cookie = *offp; + if (dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id, + &cookie) == ENOENT) { + *eofp = 1; + ZFS_EXIT(zfsvfs); + return (0); + } + + (void) strcpy(dp->d_name, snapname); + dp->d_ino = ZFSCTL_INO_SNAP(id); + *nextp = cookie; + + ZFS_EXIT(zfsvfs); + + return (0); +} + +vnode_t * +zfsctl_mknode_snapdir(vnode_t *pvp) +{ + vnode_t *vp; + zfsctl_snapdir_t *sdp; + + vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp, + zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN, + zfsctl_snapdir_readdir_cb, NULL); + sdp = vp->v_data; + sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR; + mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&sdp->sd_snaps, snapentry_compare, + sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node)); + return (vp); +} + +/* ARGSUSED */ +static int +zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) +{ + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + zfsctl_snapdir_t *sdp = vp->v_data; + + ZFS_ENTER(zfsvfs); + zfsctl_common_getattr(vp, vap); + vap->va_nodeid = gfs_file_inode(vp); + vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2; + ZFS_EXIT(zfsvfs); + + return (0); +} + +static void +zfsctl_snapdir_inactive(vnode_t *vp, cred_t *cr) +{ + zfsctl_snapdir_t *sdp = vp->v_data; + + ASSERT(avl_numnodes(&sdp->sd_snaps) == 0); + mutex_destroy(&sdp->sd_lock); + avl_destroy(&sdp->sd_snaps); + gfs_vop_inactive(vp, cr); +} + +static const fs_operation_def_t zfsctl_tops_snapdir[] = { 
+ { VOPNAME_OPEN, zfsctl_common_open }, + { VOPNAME_CLOSE, zfsctl_common_close }, + { VOPNAME_IOCTL, fs_inval }, + { VOPNAME_GETATTR, zfsctl_snapdir_getattr }, + { VOPNAME_ACCESS, zfsctl_common_access }, + { VOPNAME_RENAME, zfsctl_snapdir_rename }, + { VOPNAME_RMDIR, zfsctl_snapdir_remove }, + { VOPNAME_READDIR, gfs_vop_readdir }, + { VOPNAME_LOOKUP, zfsctl_snapdir_lookup }, + { VOPNAME_SEEK, fs_seek }, + { VOPNAME_INACTIVE, (fs_generic_func_p) zfsctl_snapdir_inactive }, + { VOPNAME_FID, zfsctl_common_fid }, + { NULL } +}; + +static vnode_t * +zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset) +{ + vnode_t *vp; + zfsctl_node_t *zcp; + + vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, + zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL); + zcp = vp->v_data; + zcp->zc_id = objset; + + return (vp); +} + +static void +zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr) +{ + zfsctl_snapdir_t *sdp; + zfs_snapentry_t *sep, *next; + vnode_t *dvp; + + VERIFY(gfs_dir_lookup(vp, "..", &dvp) == 0); + sdp = dvp->v_data; + + mutex_enter(&sdp->sd_lock); + + if (vp->v_count > 1) { + mutex_exit(&sdp->sd_lock); + return; + } + ASSERT(!vn_ismntpt(vp)); + + sep = avl_first(&sdp->sd_snaps); + while (sep != NULL) { + next = AVL_NEXT(&sdp->sd_snaps, sep); + + if (sep->se_root == vp) { + avl_remove(&sdp->sd_snaps, sep); + kmem_free(sep->se_name, strlen(sep->se_name) + 1); + kmem_free(sep, sizeof (zfs_snapentry_t)); + break; + } + sep = next; + } + ASSERT(sep != NULL); + + mutex_exit(&sdp->sd_lock); + VN_RELE(dvp); + + gfs_vop_inactive(vp, cr); +} + + +/* + * These VP's should never see the light of day. They should always + * be covered. 
+ */ +static const fs_operation_def_t zfsctl_tops_snapshot[] = { + VOPNAME_INACTIVE, (fs_generic_func_p) zfsctl_snapshot_inactive, + NULL, NULL +}; + +int +zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + vnode_t *dvp, *vp; + zfsctl_snapdir_t *sdp; + zfsctl_node_t *zcp; + zfs_snapentry_t *sep; + int error; + + ASSERT(zfsvfs->z_ctldir != NULL); + error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, + NULL, 0, NULL, kcred); + if (error != 0) + return (error); + sdp = dvp->v_data; + + mutex_enter(&sdp->sd_lock); + sep = avl_first(&sdp->sd_snaps); + while (sep != NULL) { + vp = sep->se_root; + zcp = vp->v_data; + if (zcp->zc_id == objsetid) + break; + + sep = AVL_NEXT(&sdp->sd_snaps, sep); + } + + if (sep != NULL) { + VN_HOLD(vp); + error = traverse(&vp); + if (error == 0) + *zfsvfsp = VTOZ(vp)->z_zfsvfs; + VN_RELE(vp); + } else { + error = EINVAL; + } + + mutex_exit(&sdp->sd_lock); + VN_RELE(dvp); + + return (error); +} + +/* + * Unmount any snapshots for the given filesystem. This is called from + * zfs_umount() - if we have a ctldir, then go through and unmount all the + * snapshots. + */ +int +zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + vnode_t *dvp, *svp; + zfsctl_snapdir_t *sdp; + zfs_snapentry_t *sep, *next; + int error; + + ASSERT(zfsvfs->z_ctldir != NULL); + error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, + NULL, 0, NULL, cr); + if (error != 0) + return (error); + sdp = dvp->v_data; + + mutex_enter(&sdp->sd_lock); + + sep = avl_first(&sdp->sd_snaps); + while (sep != NULL) { + svp = sep->se_root; + next = AVL_NEXT(&sdp->sd_snaps, sep); + + /* + * If this snapshot is not mounted, then it must + * have just been unmounted by somebody else, and + * will be cleaned up by zfsctl_snapdir_inactive(). 
+ */ + if (vn_ismntpt(svp)) { + if ((error = vn_vfswlock(svp)) != 0) + goto out; + + VN_HOLD(svp); + error = dounmount(vn_mountedvfs(svp), fflags, cr); + if (error) { + VN_RELE(svp); + goto out; + } + + avl_remove(&sdp->sd_snaps, sep); + kmem_free(sep->se_name, strlen(sep->se_name) + 1); + kmem_free(sep, sizeof (zfs_snapentry_t)); + + /* + * We can't use VN_RELE(), as that will try to + * invoke zfsctl_snapdir_inactive(), and that + * would lead to an attempt to re-grab the sd_lock. + */ + ASSERT3U(svp->v_count, ==, 1); + gfs_vop_inactive(svp, cr); + } + sep = next; + } +out: + mutex_exit(&sdp->sd_lock); + VN_RELE(dvp); + + return (error); +} diff --git a/usr/src/uts/common/fs/zfs/zfs_dir.c b/usr/src/uts/common/fs/zfs/zfs_dir.c new file mode 100644 index 0000000000..6df89ad0c4 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs_dir.c @@ -0,0 +1,853 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/resource.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/mode.h> +#include <sys/kmem.h> +#include <sys/uio.h> +#include <sys/pathname.h> +#include <sys/cmn_err.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/unistd.h> +#include <sys/random.h> +#include <sys/policy.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_acl.h> +#include <sys/fs/zfs.h> +#include "fs/fs_subr.h" +#include <sys/zap.h> +#include <sys/dmu.h> +#include <sys/atomic.h> +#include <sys/zfs_ctldir.h> + +/* + * Lock a directory entry. A dirlock on <dzp, name> protects that name + * in dzp's directory zap object. As long as you hold a dirlock, you can + * assume two things: (1) dzp cannot be reaped, and (2) no other thread + * can change the zap entry for (i.e. link or unlink) this name. + * + * Input arguments: + * dzp - znode for directory + * name - name of entry to lock + * flag - ZNEW: if the entry already exists, fail with EEXIST. + * ZEXISTS: if the entry does not exist, fail with ENOENT. + * ZSHARED: allow concurrent access with other ZSHARED callers. + * ZXATTR: we want dzp's xattr directory + * + * Output arguments: + * zpp - pointer to the znode for the entry (NULL if there isn't one) + * dlpp - pointer to the dirlock for this entry (NULL on error) + * + * Return value: 0 on success or errno on failure. + * + * NOTE: Always checks for, and rejects, '.' and '..'. + */ +int +zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, + int flag) +{ + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zfs_dirlock_t *dl; + uint64_t zoid; + int error; + + *zpp = NULL; + *dlpp = NULL; + + /* + * Verify that we are not trying to lock '.', '..', or '.zfs' + */ + if (name[0] == '.' && + (name[1] == '\0' || (name[1] == '.' 
&& name[2] == '\0')) || + zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) + return (EEXIST); + + /* + * Wait until there are no locks on this name. + */ + mutex_enter(&dzp->z_lock); + for (;;) { + if (dzp->z_reap) { + mutex_exit(&dzp->z_lock); + return (ENOENT); + } + for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) + if (strcmp(name, dl->dl_name) == 0) + break; + if (dl == NULL) { + /* + * Allocate a new dirlock and add it to the list. + */ + dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP); + cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL); + dl->dl_name = name; + dl->dl_sharecnt = 0; + dl->dl_namesize = 0; + dl->dl_dzp = dzp; + dl->dl_next = dzp->z_dirlocks; + dzp->z_dirlocks = dl; + break; + } + if ((flag & ZSHARED) && dl->dl_sharecnt != 0) + break; + cv_wait(&dl->dl_cv, &dzp->z_lock); + } + + if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) { + /* + * We're the second shared reference to dl. Make a copy of + * dl_name in case the first thread goes away before we do. + * Note that we initialize the new name before storing its + * pointer into dl_name, because the first thread may load + * dl->dl_name at any time. He'll either see the old value, + * which is his, or the new shared copy; either is OK. + */ + dl->dl_namesize = strlen(dl->dl_name) + 1; + name = kmem_alloc(dl->dl_namesize, KM_SLEEP); + bcopy(dl->dl_name, name, dl->dl_namesize); + dl->dl_name = name; + } + + mutex_exit(&dzp->z_lock); + + /* + * We have a dirlock on the name. (Note that it is the dirlock, + * not the dzp's z_lock, that protects the name in the zap object.) + * See if there's an object by this name; if so, put a hold on it. + */ + if (flag & ZXATTR) { + zoid = dzp->z_phys->zp_xattr; + error = (zoid == 0 ? 
ENOENT : 0); + } else { + error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, &zoid); + } + if (error) { + if (error != ENOENT || (flag & ZEXISTS)) { + zfs_dirent_unlock(dl); + return (error); + } + } else { + if (flag & ZNEW) { + zfs_dirent_unlock(dl); + return (EEXIST); + } + error = zfs_zget(zfsvfs, zoid, zpp); + if (error) { + zfs_dirent_unlock(dl); + return (error); + } + } + + *dlpp = dl; + + return (0); +} + +/* + * Unlock this directory entry and wake anyone who was waiting for it. + */ +void +zfs_dirent_unlock(zfs_dirlock_t *dl) +{ + znode_t *dzp = dl->dl_dzp; + zfs_dirlock_t **prev_dl, *cur_dl; + + mutex_enter(&dzp->z_lock); + if (dl->dl_sharecnt > 1) { + dl->dl_sharecnt--; + mutex_exit(&dzp->z_lock); + return; + } + prev_dl = &dzp->z_dirlocks; + while ((cur_dl = *prev_dl) != dl) + prev_dl = &cur_dl->dl_next; + *prev_dl = dl->dl_next; + cv_broadcast(&dl->dl_cv); + mutex_exit(&dzp->z_lock); + + if (dl->dl_namesize != 0) + kmem_free(dl->dl_name, dl->dl_namesize); + cv_destroy(&dl->dl_cv); + kmem_free(dl, sizeof (*dl)); +} + +/* + * Look up an entry in a directory. + * + * NOTE: '.' and '..' are handled as special cases because + * no directory entries are actually stored for them. If this is + * the root of a filesystem, then '.zfs' is also treated as a + * special pseudo-directory. + */ +int +zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp) +{ + zfs_dirlock_t *dl; + znode_t *zp; + int error = 0; + + if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { + *vpp = ZTOV(dzp); + VN_HOLD(*vpp); + } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + /* + * If we are a snapshot mounted under .zfs, return + * the vp for the snapshot directory. 
+ */ + if (zfsvfs->z_parent != zfsvfs) { + error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, + "snapshot", vpp, NULL, 0, NULL, kcred); + return (error); + } + rw_enter(&dzp->z_parent_lock, RW_READER); + error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp); + if (error == 0) + *vpp = ZTOV(zp); + rw_exit(&dzp->z_parent_lock); + } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) { + *vpp = zfsctl_root(dzp); + } else { + error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS | ZSHARED); + if (error == 0) { + *vpp = ZTOV(zp); + zfs_dirent_unlock(dl); + } + } + + return (error); +} + +static char * +zfs_dq_hexname(char namebuf[17], uint64_t x) +{ + char *name = &namebuf[16]; + const char digits[16] = "0123456789abcdef"; + + *name = '\0'; + do { + *--name = digits[x & 0xf]; + x >>= 4; + } while (x != 0); + + return (name); +} + +void +zfs_dq_add(znode_t *zp, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + char obj_name[17]; + int error; + + ASSERT(zp->z_reap); + ASSERT3U(zp->z_phys->zp_links, ==, 0); + + error = zap_add(zfsvfs->z_os, zfsvfs->z_dqueue, + zfs_dq_hexname(obj_name, zp->z_id), 8, 1, &zp->z_id, tx); + ASSERT3U(error, ==, 0); +} + +/* + * Delete the entire contents of a directory. Return a count + * of the number of entries that could not be deleted. + * + * NOTE: this function assumes that the directory is inactive, + * so there is no need to lock its entries before deletion. + * Also, it assumes the directory contents is *only* regular + * files. 
+ */ +static int +zfs_purgedir(znode_t *dzp) +{ + zap_cursor_t zc; + zap_attribute_t zap; + znode_t *xzp; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zfs_dirlock_t dl; + int skipped = 0; + int error; + + ASSERT(dzp->z_active == 0); + + for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); + (error = zap_cursor_retrieve(&zc, &zap)) == 0; + zap_cursor_advance(&zc)) { + error = zfs_zget(zfsvfs, zap.za_first_integer, &xzp); + ASSERT3U(error, ==, 0); + + ASSERT((ZTOV(xzp)->v_type == VREG) || + (ZTOV(xzp)->v_type == VLNK)); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, dzp->z_id); + dmu_tx_hold_zap(tx, dzp->z_id, -1); + dmu_tx_hold_bonus(tx, xzp->z_id); + dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + VN_RELE(ZTOV(xzp)); + skipped += 1; + continue; + } + bzero(&dl, sizeof (dl)); + dl.dl_dzp = dzp; + dl.dl_name = zap.za_name; + + error = zfs_link_destroy(&dl, xzp, tx, 0, NULL); + ASSERT3U(error, ==, 0); + dmu_tx_commit(tx); + + VN_RELE(ZTOV(xzp)); + } + ASSERT(error == ENOENT); + return (skipped); +} + +/* + * Special function to requeue the znodes for deletion that were + * in progress when we either crashed or umounted the file system. + */ +static void +zfs_drain_dq(zfsvfs_t *zfsvfs) +{ + zap_cursor_t zc; + zap_attribute_t zap; + dmu_object_info_t doi; + znode_t *zp; + int error; + + /* + * Interate over the contents of the delete queue. + */ + for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_dqueue); + zap_cursor_retrieve(&zc, &zap) == 0; + zap_cursor_advance(&zc)) { + + /* + * Need some helpers? 
+ */ + if (zfs_delete_thread_target(zfsvfs, -1) != 0) + return; + + /* + * See what kind of object we have in queue + */ + + error = dmu_object_info(zfsvfs->z_os, + zap.za_first_integer, &doi); + if (error != 0) + continue; + + ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) || + (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS)); + /* + * We need to re-mark these queue entries for reaping, + * so we pull them back into core and set zp->z_reap. + */ + error = zfs_zget(zfsvfs, zap.za_first_integer, &zp); + + /* + * We may pick up znodes that are already marked for reaping. + * This could happen during the purge of an extended attribute + * directory. All we need to do is skip over them, since they + * are already in the system to be processed by the taskq. + */ + if (error != 0) { + continue; + } + zp->z_reap = 1; + VN_RELE(ZTOV(zp)); + break; + } +} + +void +zfs_delete_thread(void *arg) +{ + zfsvfs_t *zfsvfs = arg; + zfs_delete_t *zd = &zfsvfs->z_delete_head; + znode_t *zp; + callb_cpr_t cprinfo; + + CALLB_CPR_INIT(&cprinfo, &zd->z_mutex, callb_generic_cpr, "zfs_delete"); + + mutex_enter(&zd->z_mutex); + + if (!zd->z_drained && !zd->z_draining) { + zd->z_draining = B_TRUE; + mutex_exit(&zd->z_mutex); + zfs_drain_dq(zfsvfs); + mutex_enter(&zd->z_mutex); + zd->z_draining = B_FALSE; + zd->z_drained = B_TRUE; + cv_broadcast(&zd->z_quiesce_cv); + } + + while (zd->z_thread_count <= zd->z_thread_target) { + zp = list_head(&zd->z_znodes); + if (zp == NULL) { + ASSERT(zd->z_znode_count == 0); + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(&zd->z_cv, &zd->z_mutex); + CALLB_CPR_SAFE_END(&cprinfo, &zd->z_mutex); + continue; + } + ASSERT(zd->z_znode_count != 0); + list_remove(&zd->z_znodes, zp); + if (--zd->z_znode_count == 0) + cv_broadcast(&zd->z_quiesce_cv); + mutex_exit(&zd->z_mutex); + zfs_rmnode(zp); + (void) zfs_delete_thread_target(zfsvfs, -1); + mutex_enter(&zd->z_mutex); + } + + ASSERT(zd->z_thread_count != 0); + if (--zd->z_thread_count == 0) + cv_broadcast(&zd->z_cv); + 
+ CALLB_CPR_EXIT(&cprinfo); /* NB: drops z_mutex */ + thread_exit(); +} + +static int zfs_work_per_thread_shift = 11; /* 2048 (2^11) per thread */ + +/* + * Set the target number of delete threads to 'nthreads'. + * If nthreads == -1, choose a number based on current workload. + * If nthreads == 0, don't return until the threads have exited. + */ +int +zfs_delete_thread_target(zfsvfs_t *zfsvfs, int nthreads) +{ + zfs_delete_t *zd = &zfsvfs->z_delete_head; + + mutex_enter(&zd->z_mutex); + + if (nthreads == -1) { + if (zd->z_thread_target == 0) { + mutex_exit(&zd->z_mutex); + return (EBUSY); + } + nthreads = zd->z_znode_count >> zfs_work_per_thread_shift; + nthreads = MIN(nthreads, ncpus << 1); + nthreads = MAX(nthreads, 1); + nthreads += !!zd->z_draining; + } + + zd->z_thread_target = nthreads; + + while (zd->z_thread_count < zd->z_thread_target) { + (void) thread_create(NULL, 0, zfs_delete_thread, zfsvfs, + 0, &p0, TS_RUN, minclsyspri); + zd->z_thread_count++; + } + + while (zd->z_thread_count > zd->z_thread_target && nthreads == 0) { + cv_broadcast(&zd->z_cv); + cv_wait(&zd->z_cv, &zd->z_mutex); + } + + mutex_exit(&zd->z_mutex); + + return (0); +} + +/* + * Wait until everything that's been queued has been deleted. + */ +void +zfs_delete_wait_empty(zfsvfs_t *zfsvfs) +{ + zfs_delete_t *zd = &zfsvfs->z_delete_head; + + mutex_enter(&zd->z_mutex); + ASSERT(zd->z_thread_target != 0); + while (!zd->z_drained || zd->z_znode_count != 0) { + ASSERT(zd->z_thread_target != 0); + cv_wait(&zd->z_quiesce_cv, &zd->z_mutex); + } + mutex_exit(&zd->z_mutex); +} + +void +zfs_rmnode(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os = zfsvfs->z_os; + znode_t *xzp = NULL; + char obj_name[17]; + dmu_tx_t *tx; + uint64_t acl_obj; + int error; + + ASSERT(zp->z_active == 0); + ASSERT(ZTOV(zp)->v_count == 0); + ASSERT(zp->z_phys->zp_links == 0); + + /* + * If this is an attribute directory, purge its contents. 
+ */ + if (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_XATTR)) + if (zfs_purgedir(zp) != 0) { + zfs_delete_t *delq = &zfsvfs->z_delete_head; + /* + * Add this back to the delete list to be retried later. + * + * XXX - this could just busy loop on us... + */ + mutex_enter(&delq->z_mutex); + list_insert_tail(&delq->z_znodes, zp); + delq->z_znode_count++; + mutex_exit(&delq->z_mutex); + return; + } + + /* + * If the file has extended attributes, unlink the xattr dir. + */ + if (zp->z_phys->zp_xattr) { + error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); + ASSERT(error == 0); + } + + acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; + + /* + * Set up the transaction. + */ + tx = dmu_tx_create(os); + dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); + dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, -1); + if (xzp) { + dmu_tx_hold_bonus(tx, xzp->z_id); + dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1); + } + if (acl_obj) + dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + zfs_delete_t *delq = &zfsvfs->z_delete_head; + + dmu_tx_abort(tx); + /* + * Add this back to the delete list to be retried later. + * + * XXX - this could just busy loop on us... + */ + mutex_enter(&delq->z_mutex); + list_insert_tail(&delq->z_znodes, zp); + delq->z_znode_count++; + mutex_exit(&delq->z_mutex); + return; + } + + if (xzp) { + dmu_buf_will_dirty(xzp->z_dbuf, tx); + mutex_enter(&xzp->z_lock); + xzp->z_reap = 1; /* mark xzp for deletion */ + xzp->z_phys->zp_links = 0; /* no more links to it */ + mutex_exit(&xzp->z_lock); + zfs_dq_add(xzp, tx); /* add xzp to delete queue */ + } + + /* + * Remove this znode from delete queue + */ + error = zap_remove(os, zfsvfs->z_dqueue, + zfs_dq_hexname(obj_name, zp->z_id), tx); + ASSERT3U(error, ==, 0); + + zfs_znode_delete(zp, tx); + + dmu_tx_commit(tx); + + if (xzp) + VN_RELE(ZTOV(xzp)); +} + +/* + * Link zp into dl. Can only fail if zp has been reaped. 
+ */ +int +zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) +{ + znode_t *dzp = dl->dl_dzp; + vnode_t *vp = ZTOV(zp); + int zp_is_dir = (vp->v_type == VDIR); + int error; + + dmu_buf_will_dirty(zp->z_dbuf, tx); + mutex_enter(&zp->z_lock); + + if (!(flag & ZRENAMING)) { + if (zp->z_reap) { /* no new links to reaped zp */ + ASSERT(!(flag & (ZNEW | ZEXISTS))); + mutex_exit(&zp->z_lock); + return (ENOENT); + } + zp->z_phys->zp_links++; + } + zp->z_phys->zp_parent = dzp->z_id; /* dzp is now zp's parent */ + + if (!(flag & ZNEW)) + zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + mutex_exit(&zp->z_lock); + + dmu_buf_will_dirty(dzp->z_dbuf, tx); + mutex_enter(&dzp->z_lock); + dzp->z_phys->zp_size++; /* one dirent added */ + dzp->z_phys->zp_links += zp_is_dir; /* ".." link from zp */ + zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx); + mutex_exit(&dzp->z_lock); + + error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, + 8, 1, &zp->z_id, tx); + ASSERT(error == 0); + + return (0); +} + +/* + * Unlink zp from dl, and mark zp for reaping if this was the last link. + * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST). + * If 'reaped_ptr' is NULL, we put reaped znodes on the delete queue. + * If it's non-NULL, we use it to indicate whether the znode needs reaping, + * and it's the caller's job to do it. 
+ */ +int +zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, + int *reaped_ptr) +{ + znode_t *dzp = dl->dl_dzp; + vnode_t *vp = ZTOV(zp); + int zp_is_dir = (vp->v_type == VDIR); + int reaped = 0; + int error; + + if (!(flag & ZRENAMING)) { + dmu_buf_will_dirty(zp->z_dbuf, tx); + + if (vn_vfswlock(vp)) /* prevent new mounts on zp */ + return (EBUSY); + + if (vn_ismntpt(vp)) { /* don't remove mount point */ + vn_vfsunlock(vp); + return (EBUSY); + } + + mutex_enter(&zp->z_lock); + if (zp_is_dir && !zfs_dirempty(zp)) { /* dir not empty */ + mutex_exit(&zp->z_lock); + vn_vfsunlock(vp); + return (EEXIST); + } + ASSERT(zp->z_phys->zp_links > zp_is_dir); + if (--zp->z_phys->zp_links == zp_is_dir) { + zp->z_reap = 1; + zp->z_phys->zp_links = 0; + reaped = 1; + } else { + zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + } + mutex_exit(&zp->z_lock); + vn_vfsunlock(vp); + } + + dmu_buf_will_dirty(dzp->z_dbuf, tx); + mutex_enter(&dzp->z_lock); + dzp->z_phys->zp_size--; /* one dirent removed */ + dzp->z_phys->zp_links -= zp_is_dir; /* ".." link from zp */ + zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx); + mutex_exit(&dzp->z_lock); + + error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, tx); + ASSERT(error == 0); + + if (reaped_ptr != NULL) + *reaped_ptr = reaped; + else if (reaped) + zfs_dq_add(zp, tx); + + return (0); +} + +/* + * Indicate whether the directory is empty. Works with or without z_lock + * held, but can only be consider a hint in the latter case. Returns true + * if only "." and ".." remain and there's no work in progress. 
+ */ +boolean_t +zfs_dirempty(znode_t *dzp) +{ + return (dzp->z_phys->zp_size == 2 && dzp->z_dirlocks == 0); +} + +int +zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + znode_t *xzp; + dmu_tx_t *tx; + uint64_t xoid; + int error; + + *xvpp = NULL; + + if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, cr)) + return (error); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 0); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + dmu_tx_abort(tx); + return (error); + } + zfs_mknode(zp, vap, &xoid, tx, cr, IS_XATTR, &xzp, 0); + ASSERT(xzp->z_id == xoid); + ASSERT(xzp->z_phys->zp_parent == zp->z_id); + dmu_buf_will_dirty(zp->z_dbuf, tx); + zp->z_phys->zp_xattr = xoid; + + (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, ""); + dmu_tx_commit(tx); + + *xvpp = ZTOV(xzp); + + return (0); +} + +/* + * Return a znode for the extended attribute directory for zp. + * ** If the directory does not already exist, it is created ** + * + * IN: zp - znode to obtain attribute directory from + * cr - credentials of caller + * + * OUT: xzpp - pointer to extended attribute znode + * + * RETURN: 0 on success + * error number on failure + */ +int +zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + znode_t *xzp; + zfs_dirlock_t *dl; + vattr_t va; + int error; +top: + error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR); + if (error) + return (error); + + if (xzp != NULL) { + *xvpp = ZTOV(xzp); + zfs_dirent_unlock(dl); + return (0); + } + + ASSERT(zp->z_phys->zp_xattr == 0); + + if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { + zfs_dirent_unlock(dl); + return (EROFS); + } + + /* + * The ability to 'create' files in an attribute + * directory comes from the write_xattr permission on the base file. + * + * The ability to 'search' an attribute directory requires + * read_xattr permission on the base file. 
+ * + * Once in a directory the ability to read/write attributes + * is controlled by the permissions on the attribute file. + */ + va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID; + va.va_type = VDIR; + va.va_mode = S_IFDIR | 0755; + va.va_uid = (uid_t)zp->z_phys->zp_uid; + va.va_gid = (gid_t)zp->z_phys->zp_gid; + + error = zfs_make_xattrdir(zp, &va, xvpp, cr); + zfs_dirent_unlock(dl); + + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0); + goto top; + } + + return (error); +} + +/* + * Decide whether it is okay to remove within a sticky directory. + * + * In sticky directories, write access is not sufficient; + * you can remove entries from a directory only if: + * + * you own the directory, + * you own the entry, + * the entry is a plain file and you have write access, + * or you are privileged (checked in secpolicy...). + * + * The function returns 0 if remove access is granted. + */ +int +zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) +{ + uid_t uid; + + if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL) /* ZIL replay */ + return (0); + + if ((zdp->z_phys->zp_mode & S_ISVTX) == 0 || + (uid = crgetuid(cr)) == zdp->z_phys->zp_uid || + uid == zp->z_phys->zp_uid || + (ZTOV(zp)->v_type == VREG && + zfs_zaccess(zp, ACE_WRITE_DATA, cr) == 0)) + return (0); + else + return (secpolicy_vnode_remove(cr)); +} diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c new file mode 100644 index 0000000000..e8723ffe89 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -0,0 +1,1323 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/uio.h> +#include <sys/buf.h> +#include <sys/modctl.h> +#include <sys/open.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/cmn_err.h> +#include <sys/stat.h> +#include <sys/zfs_ioctl.h> +#include <sys/zap.h> +#include <sys/spa.h> +#include <sys/vdev.h> +#include <sys/dmu.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_prop.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/policy.h> +#include <sys/zone.h> +#include <sys/nvpair.h> +#include <sys/pathname.h> +#include <sys/mount.h> +#include <sys/sdt.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_ctldir.h> + +#include "zfs_namecheck.h" + +extern struct modlfs zfs_modlfs; + +extern void zfs_init(void); +extern void zfs_fini(void); + +ldi_ident_t zfs_li = NULL; +dev_info_t *zfs_dip; + +typedef int zfs_ioc_func_t(zfs_cmd_t *); +typedef int zfs_secpolicy_func_t(const char *, const char *, cred_t *); + +typedef struct zfs_ioc_vec { + zfs_ioc_func_t *zvec_func; + zfs_secpolicy_func_t *zvec_secpolicy; + enum { + no_name, + pool_name, + dataset_name + } zvec_namecheck; +} zfs_ioc_vec_t; + +/* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */ +void +__dprintf(const char *file, const char 
*func, int line, const char *fmt, ...) +{ + const char *newfile; + char buf[256]; + va_list adx; + + /* + * Get rid of annoying "../common/" prefix to filename. + */ + newfile = strrchr(file, '/'); + if (newfile != NULL) { + newfile = newfile + 1; /* Get rid of leading / */ + } else { + newfile = file; + } + + va_start(adx, fmt); + (void) vsnprintf(buf, sizeof (buf), fmt, adx); + va_end(adx); + + /* + * To get this data, use the zfs-dprintf probe as so: + * dtrace -q -n 'zfs-dprintf \ + * /stringof(arg0) == "dbuf.c"/ \ + * {printf("%s: %s", stringof(arg1), stringof(arg3))}' + * arg0 = file name + * arg1 = function name + * arg2 = line number + * arg3 = message + */ + DTRACE_PROBE4(zfs__dprintf, + char *, newfile, char *, func, int, line, char *, buf); +} + +/* + * Policy for top-level read operations (list pools). Requires no privileges, + * and can be used in the local zone, as there is no associated dataset. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_none(const char *unused1, const char *unused2, cred_t *cr) +{ + return (0); +} + +/* + * Policy for dataset read operations (list children, get statistics). Requires + * no privileges, but must be visible in the local zone. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_read(const char *dataset, const char *unused, cred_t *cr) +{ + if (INGLOBALZONE(curproc) || + zone_dataset_visible(dataset, NULL)) + return (0); + + return (ENOENT); +} + +static int +zfs_dozonecheck(const char *dataset, cred_t *cr) +{ + uint64_t zoned; + int writable = 1; + + /* + * The dataset must be visible by this zone -- check this first + * so they don't see EPERM on something they shouldn't know about. + */ + if (!INGLOBALZONE(curproc) && + !zone_dataset_visible(dataset, &writable)) + return (ENOENT); + + if (dsl_prop_get_integer(dataset, "zoned", &zoned, NULL)) + return (ENOENT); + + if (INGLOBALZONE(curproc)) { + /* + * If the fs is zoned, only root can access it from the + * global zone. 
+ */ + if (secpolicy_zfs(cr) && zoned) + return (EPERM); + } else { + /* + * If we are in a local zone, the 'zoned' property must be set. + */ + if (!zoned) + return (EPERM); + + /* must be writable by this zone */ + if (!writable) + return (EPERM); + } + return (0); +} + +/* + * Policy for dataset write operations (create children, set properties, etc). + * Requires SYS_MOUNT privilege, and must be writable in the local zone. + */ +/* ARGSUSED */ +int +zfs_secpolicy_write(const char *dataset, const char *unused, cred_t *cr) +{ + int error; + + if (error = zfs_dozonecheck(dataset, cr)) + return (error); + + return (secpolicy_zfs(cr)); +} + +/* + * Policy for operations that want to write a dataset's parent: + * create, destroy, snapshot, clone, restore. + */ +static int +zfs_secpolicy_parent(const char *dataset, const char *unused, cred_t *cr) +{ + char parentname[MAXNAMELEN]; + char *cp; + + /* + * Remove the @bla or /bla from the end of the name to get the parent. + */ + (void) strncpy(parentname, dataset, sizeof (parentname)); + cp = strrchr(parentname, '@'); + if (cp != NULL) { + cp[0] = '\0'; + } else { + cp = strrchr(parentname, '/'); + if (cp == NULL) + return (ENOENT); + cp[0] = '\0'; + + } + + return (zfs_secpolicy_write(parentname, unused, cr)); +} + +/* + * Policy for dataset write operations (create children, set properties, etc). + * Requires SYS_MOUNT privilege, and must be writable in the local zone. + */ +static int +zfs_secpolicy_setprop(const char *dataset, const char *prop, cred_t *cr) +{ + int error; + + if (error = zfs_dozonecheck(dataset, cr)) + return (error); + + if (strcmp(prop, "zoned") == 0) { + /* + * Disallow setting of 'zoned' from within a local zone. + */ + if (!INGLOBALZONE(curproc)) + return (EPERM); + } + + return (secpolicy_zfs(cr)); +} + +/* + * Security policy for setting the quota. This is the same as + * zfs_secpolicy_write, except that the local zone may not change the quota at + * the zone-property setpoint. 
+ */ +/* ARGSUSED */ +static int +zfs_secpolicy_quota(const char *dataset, const char *unused, cred_t *cr) +{ + int error; + + if (error = zfs_dozonecheck(dataset, cr)) + return (error); + + if (!INGLOBALZONE(curproc)) { + uint64_t zoned; + char setpoint[MAXNAMELEN]; + int dslen; + /* + * Unprivileged users are allowed to modify the quota + * on things *under* (ie. contained by) the thing they + * own. + */ + if (dsl_prop_get_integer(dataset, "zoned", &zoned, setpoint)) + return (EPERM); + if (!zoned) /* this shouldn't happen */ + return (EPERM); + dslen = strlen(dataset); + if (dslen <= strlen(setpoint)) + return (EPERM); + } + + return (secpolicy_zfs(cr)); +} + +/* + * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires + * SYS_CONFIG privilege, which is not available in a local zone. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_config(const char *unused, const char *unused2, cred_t *cr) +{ + if (secpolicy_sys_config(cr, B_FALSE) != 0) + return (EPERM); + + return (0); +} + +/* + * Returns the nvlist as specified by the user in the zfs_cmd_t. + */ +static int +get_config(zfs_cmd_t *zc, nvlist_t **nvp) +{ + char *packed; + size_t size; + int error; + nvlist_t *config = NULL; + + /* + * Read in and unpack the user-supplied nvlist. By this point, we know + * that the user has the SYS_CONFIG privilege, so allocating arbitrary + * sized regions of memory should not be a problem. 
+ */ + if ((size = zc->zc_config_src_size) == 0) + return (EINVAL); + + packed = kmem_alloc(size, KM_SLEEP); + + if ((error = xcopyin((void *)(uintptr_t)zc->zc_config_src, packed, + size)) != 0) { + kmem_free(packed, size); + return (error); + } + + if ((error = nvlist_unpack(packed, size, &config, 0)) != 0) { + kmem_free(packed, size); + return (error); + } + + kmem_free(packed, size); + + *nvp = config; + return (0); +} + +static int +zfs_ioc_pool_create(zfs_cmd_t *zc) +{ + int error; + nvlist_t *config; + + if ((error = get_config(zc, &config)) != 0) + return (error); + + error = spa_create(zc->zc_name, config, zc->zc_root[0] == '\0' ? + NULL : zc->zc_root); + + nvlist_free(config); + + return (error); +} + +static int +zfs_ioc_pool_destroy(zfs_cmd_t *zc) +{ + return (spa_destroy(zc->zc_name)); +} + +static int +zfs_ioc_pool_import(zfs_cmd_t *zc) +{ + int error; + nvlist_t *config; + uint64_t guid; + + if ((error = get_config(zc, &config)) != 0) + return (error); + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || + guid != zc->zc_pool_guid) + error = EINVAL; + else + error = spa_import(zc->zc_name, config, + zc->zc_root[0] == '\0' ? 
NULL : zc->zc_root); + + nvlist_free(config); + + return (error); +} + +static int +zfs_ioc_pool_export(zfs_cmd_t *zc) +{ + return (spa_export(zc->zc_name)); +} + +static int +zfs_ioc_pool_configs(zfs_cmd_t *zc) +{ + nvlist_t *configs; + char *packed = NULL; + size_t size = 0; + int error; + + if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) + return (EEXIST); + + VERIFY(nvlist_pack(configs, &packed, &size, NV_ENCODE_NATIVE, 0) == 0); + + if (size > zc->zc_config_dst_size) + error = ENOMEM; + else + error = xcopyout(packed, (void *)(uintptr_t)zc->zc_config_dst, + size); + + zc->zc_config_dst_size = size; + + kmem_free(packed, size); + nvlist_free(configs); + + return (error); +} + +static int +zfs_ioc_pool_guid(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error == 0) { + zc->zc_pool_guid = spa_guid(spa); + spa_close(spa, FTAG); + } + return (error); +} + +static int +zfs_ioc_pool_stats(zfs_cmd_t *zc) +{ + nvlist_t *config; + char *packed = NULL; + size_t size = 0; + int error; + + error = spa_get_stats(zc->zc_name, &config); + + if (config != NULL) { + VERIFY(nvlist_pack(config, &packed, &size, + NV_ENCODE_NATIVE, 0) == 0); + + if (size > zc->zc_config_dst_size) + error = ENOMEM; + else if (xcopyout(packed, (void *)(uintptr_t)zc->zc_config_dst, + size)) + error = EFAULT; + + zc->zc_config_dst_size = size; + + kmem_free(packed, size); + nvlist_free(config); + } else { + ASSERT(error != 0); + } + + return (error); +} + +/* + * Try to import the given pool, returning pool stats as appropriate so that + * user land knows which devices are available and overall pool health. 
+ */ +static int +zfs_ioc_pool_tryimport(zfs_cmd_t *zc) +{ + nvlist_t *tryconfig, *config; + char *packed = NULL; + size_t size = 0; + int error; + + if ((error = get_config(zc, &tryconfig)) != 0) + return (error); + + config = spa_tryimport(tryconfig); + + nvlist_free(tryconfig); + + if (config == NULL) + return (EINVAL); + + VERIFY(nvlist_pack(config, &packed, &size, NV_ENCODE_NATIVE, 0) == 0); + + if (size > zc->zc_config_dst_size) + error = ENOMEM; + else + error = xcopyout(packed, (void *)(uintptr_t)zc->zc_config_dst, + size); + + zc->zc_config_dst_size = size; + + kmem_free(packed, size); + nvlist_free(config); + + return (error); +} + +static int +zfs_ioc_pool_scrub(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error == 0) { + error = spa_scrub(spa, zc->zc_cookie, B_FALSE); + spa_close(spa, FTAG); + } + return (error); +} + +static int +zfs_ioc_pool_freeze(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error == 0) { + spa_freeze(spa); + spa_close(spa, FTAG); + } + return (error); +} + +static int +zfs_ioc_vdev_add(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + nvlist_t *config; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + return (error); + + if ((error = get_config(zc, &config)) == 0) { + error = spa_vdev_add(spa, config); + nvlist_free(config); + } + + spa_close(spa, FTAG); + return (error); +} + +/* ARGSUSED */ +static int +zfs_ioc_vdev_remove(zfs_cmd_t *zc) +{ + return (ENOTSUP); +} + +static int +zfs_ioc_vdev_online(zfs_cmd_t *zc) +{ + spa_t *spa; + char *path = zc->zc_prop_value; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + return (error); + error = vdev_online(spa, path); + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_offline(zfs_cmd_t *zc) +{ + spa_t *spa; + char *path = zc->zc_prop_value; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + 
return (error); + error = vdev_offline(spa, path); + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_attach(zfs_cmd_t *zc) +{ + spa_t *spa; + char *path = zc->zc_prop_value; + int replacing = zc->zc_cookie; + nvlist_t *config; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + return (error); + + if ((error = get_config(zc, &config)) == 0) { + error = spa_vdev_attach(spa, path, config, replacing); + nvlist_free(config); + } + + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_detach(zfs_cmd_t *zc) +{ + spa_t *spa; + char *path = zc->zc_prop_value; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + return (error); + + error = spa_vdev_detach(spa, path, 0, B_FALSE); + + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_get_stats(zfs_cmd_t *zc) +{ + char *name = zc->zc_name; + zfs_stats_t *zs = &zc->zc_zfs_stats; + int error; + + bzero(zs, sizeof (zfs_stats_t)); + + if ((error = dsl_prop_get_integer(name, "atime", + &zs->zs_atime, zs->zs_atime_setpoint)) != 0 || + (error = dsl_prop_get_integer(name, "recordsize", + &zs->zs_recordsize, zs->zs_recordsize_setpoint)) != 0 || + (error = dsl_prop_get_integer(name, "readonly", + &zs->zs_readonly, zs->zs_readonly_setpoint)) != 0 || + (error = dsl_prop_get_integer(name, "devices", + &zs->zs_devices, zs->zs_devices_setpoint)) != 0 || + (error = dsl_prop_get_integer(name, "setuid", + &zs->zs_setuid, zs->zs_setuid_setpoint)) != 0 || + (error = dsl_prop_get_integer(name, "exec", + &zs->zs_exec, zs->zs_exec_setpoint)) != 0 || + (error = dsl_prop_get_string(name, "mountpoint", zs->zs_mountpoint, + sizeof (zs->zs_mountpoint), zs->zs_mountpoint_setpoint)) != 0 || + (error = dsl_prop_get_string(name, "sharenfs", zs->zs_sharenfs, + sizeof (zs->zs_sharenfs), zs->zs_sharenfs_setpoint)) != 0 || + (error = dsl_prop_get_integer(name, "aclmode", + &zs->zs_acl_mode, zs->zs_acl_mode_setpoint)) != 0 || + (error = 
dsl_prop_get_integer(name, "snapdir", + &zs->zs_snapdir, zs->zs_snapdir_setpoint)) != 0 || + (error = dsl_prop_get_integer(name, "aclinherit", + &zs->zs_acl_inherit, zs->zs_acl_inherit_setpoint)) != 0) + return (error); + + return (0); +} + +static int +zfs_ioc_objset_stats(zfs_cmd_t *zc) +{ + objset_t *os = NULL; + int error; + +retry: + error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, + DS_MODE_STANDARD | DS_MODE_READONLY, &os); + if (error != 0) { + /* + * This is ugly: dmu_objset_open() can return EBUSY if + * the objset is held exclusively. Fortunately this hold is + * only for a short while, so we retry here. + * This avoids user code having to handle EBUSY, + * for example for a "zfs list". + */ + if (error == EBUSY) { + delay(1); + goto retry; + } + return (error); + } + + dmu_objset_stats(os, &zc->zc_objset_stats); + + switch (zc->zc_objset_stats.dds_type) { + + case DMU_OST_ZFS: + error = zfs_get_stats(zc); + break; + + case DMU_OST_ZVOL: + error = zvol_get_stats(zc, os); + break; + } + + dmu_objset_close(os); + return (error); +} + +static int +zfs_ioc_dataset_list_next(zfs_cmd_t *zc) +{ + dsl_dir_t *dd; + zap_cursor_t cursor; + zap_attribute_t attr; + int error; + char *p; + + dd = dsl_dir_open(zc->zc_name, FTAG, NULL); + if (dd == NULL) + return (ESRCH); + + if (dd->dd_phys->dd_child_dir_zapobj == 0) { + dsl_dir_close(dd, FTAG); + return (ESRCH); + } + + p = strrchr(zc->zc_name, '/'); + if (p == NULL || p[1] != '\0') + (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); + p = zc->zc_name + strlen(zc->zc_name); + + do { + zap_cursor_init_serialized(&cursor, dd->dd_pool->dp_meta_objset, + dd->dd_phys->dd_child_dir_zapobj, zc->zc_cookie); + + error = zap_cursor_retrieve(&cursor, &attr); + if (error == ENOENT) + error = ESRCH; + if (error != 0) { + dsl_dir_close(dd, FTAG); + *p = '\0'; + return (error); + } + + (void) strlcpy(p, attr.za_name, sizeof (zc->zc_name) - + (p - zc->zc_name)); + + zap_cursor_advance(&cursor); + zc->zc_cookie = 
zap_cursor_serialize(&cursor); + + } while (!INGLOBALZONE(curproc) && + !zone_dataset_visible(zc->zc_name, NULL)); + + dsl_dir_close(dd, FTAG); + + /* + * If it's a hidden dataset, don't try to get stats for it. + * User land will skip over it. + */ + if (strchr(zc->zc_name, '$') != NULL) + return (0); + + error = zfs_ioc_objset_stats(zc); /* will just fill in the stats */ + return (error); +} + +static int +zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) +{ + zap_cursor_t cursor; + zap_attribute_t attr; + dsl_dataset_t *ds; + int error; + +retry: + error = dsl_dataset_open(zc->zc_name, + DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds); + if (error) { + /* + * This is ugly: dsl_dataset_open() can return EBUSY if + * the objset is held exclusively. Fortunately this hold is + * only for a short while, so we retry here. + * This avoids user code having to handle EBUSY, + * for example for a "zfs list -s". + */ + if (error == EBUSY) { + delay(1); + goto retry; + } + if (error == ENOENT) + return (ESRCH); + return (error); + } + + /* + * If ds_snapnames_zapobj is 0, someone is trying to iterate over + * snapshots of a snapshot. In this case, pretend that it has no + * snapshots; otherwise zap_cursor_retrieve() will blow up. 
+ */ + if (ds->ds_phys->ds_snapnames_zapobj == 0) { + error = ESRCH; + goto out; + } + + zap_cursor_init_serialized(&cursor, + ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_snapnames_zapobj, zc->zc_cookie); + + error = zap_cursor_retrieve(&cursor, &attr); + if (error == ENOENT) + error = ESRCH; + if (error != 0) + goto out; + + if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= + sizeof (zc->zc_name) || + strlcat(zc->zc_name, attr.za_name, sizeof (zc->zc_name)) >= + sizeof (zc->zc_name)) { + error = ENAMETOOLONG; + goto out; + } + + zap_cursor_advance(&cursor); + zc->zc_cookie = zap_cursor_serialize(&cursor); + + error = zfs_ioc_objset_stats(zc); /* will just fill in the stats */ + +out: + dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG); + return (error); +} + +static int +zfs_ioc_set_prop(zfs_cmd_t *zc) +{ + return (dsl_prop_set(zc->zc_name, zc->zc_prop_name, + zc->zc_intsz, zc->zc_numints, zc->zc_prop_value)); +} + +static int +zfs_ioc_set_quota(zfs_cmd_t *zc) +{ + return (dsl_dir_set_quota(zc->zc_name, zc->zc_cookie)); +} + +static int +zfs_ioc_set_reservation(zfs_cmd_t *zc) +{ + return (dsl_dir_set_reservation(zc->zc_name, zc->zc_cookie)); +} + +static int +zfs_ioc_set_volsize(zfs_cmd_t *zc) +{ + return (zvol_set_volsize(zc)); +} + +static int +zfs_ioc_set_volblocksize(zfs_cmd_t *zc) +{ + return (zvol_set_volblocksize(zc)); +} + +static int +zfs_ioc_create_minor(zfs_cmd_t *zc) +{ + return (zvol_create_minor(zc)); +} + +static int +zfs_ioc_remove_minor(zfs_cmd_t *zc) +{ + return (zvol_remove_minor(zc)); +} + +/* + * Search the vfs list for a specified resource. Returns a pointer to it + * or NULL if no suitable entry is found. The caller of this routine + * is responsible for releasing the returned vfs pointer. 
+ */ +static vfs_t * +zfs_get_vfs(const char *resource) +{ + struct vfs *vfsp; + struct vfs *vfs_found = NULL; + + vfs_list_read_lock(); + vfsp = rootvfs; + do { + if (strcmp(refstr_value(vfsp->vfs_resource), resource) == 0) { + VFS_HOLD(vfsp); + vfs_found = vfsp; + break; + } + vfsp = vfsp->vfs_next; + } while (vfsp != rootvfs); + vfs_list_unlock(); + return (vfs_found); +} + +static void +zfs_create_cb(objset_t *os, void *arg, dmu_tx_t *tx) +{ + zfs_cmd_t *zc = arg; + zfs_create_fs(os, (cred_t *)(uintptr_t)zc->zc_cred, tx); +} + +static int +zfs_ioc_create(zfs_cmd_t *zc) +{ + objset_t *clone; + int error = 0; + void (*cbfunc)(objset_t *os, void *arg, dmu_tx_t *tx); + dmu_objset_type_t type = zc->zc_objset_type; + + switch (type) { + + case DMU_OST_ZFS: + cbfunc = zfs_create_cb; + break; + + case DMU_OST_ZVOL: + cbfunc = zvol_create_cb; + break; + + default: + return (EINVAL); + } + + if (zc->zc_filename[0] != '\0') { + /* + * We're creating a clone of an existing snapshot. + */ + zc->zc_filename[sizeof (zc->zc_filename) - 1] = '\0'; + if (dataset_namecheck(zc->zc_filename, NULL, NULL) != 0) + return (EINVAL); + + error = dmu_objset_open(zc->zc_filename, type, + DS_MODE_STANDARD | DS_MODE_READONLY, &clone); + if (error) + return (error); + error = dmu_objset_create(zc->zc_name, type, clone, NULL, NULL); + dmu_objset_close(clone); + } else if (strchr(zc->zc_name, '@') != 0) { + /* + * We're taking a snapshot of an existing dataset. + */ + error = dmu_objset_create(zc->zc_name, type, NULL, NULL, NULL); + } else { + /* + * We're creating a new dataset. 
+ */ + if (type == DMU_OST_ZVOL) { + if ((error = zvol_check_volsize(zc)) != 0) + return (error); + if ((error = zvol_check_volblocksize(zc)) != 0) + return (error); + } + error = dmu_objset_create(zc->zc_name, type, NULL, cbfunc, zc); + } + return (error); +} + +static int +zfs_ioc_destroy(zfs_cmd_t *zc) +{ + if (strchr(zc->zc_name, '@') != NULL && + zc->zc_objset_type == DMU_OST_ZFS) { + vfs_t *vfsp; + int err; + + /* + * Snapshots under .zfs control must be unmounted + * before they can be destroyed. + */ + if ((vfsp = zfs_get_vfs(zc->zc_name)) != NULL) { + /* + * Always force the unmount for snapshots. + */ + int flag = MS_FORCE; + + if ((err = vn_vfswlock(vfsp->vfs_vnodecovered)) != 0) { + VFS_RELE(vfsp); + return (err); + } + VFS_RELE(vfsp); + if ((err = dounmount(vfsp, flag, kcred)) != 0) + return (err); + } + } + + return (dmu_objset_destroy(zc->zc_name)); +} + +static int +zfs_ioc_rollback(zfs_cmd_t *zc) +{ + return (dmu_objset_rollback(zc->zc_name)); +} + +static int +zfs_ioc_rename(zfs_cmd_t *zc) +{ + zc->zc_prop_value[sizeof (zc->zc_prop_value) - 1] = '\0'; + if (dataset_namecheck(zc->zc_prop_value, NULL, NULL) != 0) + return (EINVAL); + + if (strchr(zc->zc_name, '@') != NULL && + zc->zc_objset_type == DMU_OST_ZFS) { + vfs_t *vfsp; + int err; + + /* + * Snapshots under .zfs control must be unmounted + * before they can be renamed. + */ + if ((vfsp = zfs_get_vfs(zc->zc_name)) != NULL) { + /* + * Always force the unmount for snapshots. 
+ */ + int flag = MS_FORCE; + + if ((err = vn_vfswlock(vfsp->vfs_vnodecovered)) != 0) { + VFS_RELE(vfsp); + return (err); + } + VFS_RELE(vfsp); + if ((err = dounmount(vfsp, flag, kcred)) != 0) + return (err); + } + } + + return (dmu_objset_rename(zc->zc_name, zc->zc_prop_value)); +} + +static int +zfs_ioc_recvbackup(zfs_cmd_t *zc) +{ + file_t *fp; + int error, fd; + + fd = zc->zc_cookie; + fp = getf(fd); + if (fp == NULL) + return (EBADF); + error = dmu_recvbackup(&zc->zc_begin_record, &zc->zc_cookie, + fp->f_vnode, fp->f_offset); + releasef(fd); + return (error); +} + +static int +zfs_ioc_sendbackup(zfs_cmd_t *zc) +{ + objset_t *fromsnap = NULL; + objset_t *tosnap; + file_t *fp; + int error; + + error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, + DS_MODE_STANDARD | DS_MODE_READONLY, &tosnap); + if (error) + return (error); + + if (zc->zc_prop_value[0] != '\0') { + error = dmu_objset_open(zc->zc_prop_value, DMU_OST_ANY, + DS_MODE_STANDARD | DS_MODE_READONLY, &fromsnap); + if (error) { + dmu_objset_close(tosnap); + return (error); + } + } + + fp = getf(zc->zc_cookie); + if (fp == NULL) { + dmu_objset_close(tosnap); + if (fromsnap) + dmu_objset_close(fromsnap); + return (EBADF); + } + + error = dmu_sendbackup(tosnap, fromsnap, fp->f_vnode); + + releasef(zc->zc_cookie); + if (fromsnap) + dmu_objset_close(fromsnap); + dmu_objset_close(tosnap); + return (error); +} + +static zfs_ioc_vec_t zfs_ioc_vec[] = { + { zfs_ioc_pool_create, zfs_secpolicy_config, pool_name }, + { zfs_ioc_pool_destroy, zfs_secpolicy_config, pool_name }, + { zfs_ioc_pool_import, zfs_secpolicy_config, pool_name }, + { zfs_ioc_pool_export, zfs_secpolicy_config, pool_name }, + { zfs_ioc_pool_configs, zfs_secpolicy_none, no_name }, + { zfs_ioc_pool_guid, zfs_secpolicy_read, pool_name }, + { zfs_ioc_pool_stats, zfs_secpolicy_read, pool_name }, + { zfs_ioc_pool_tryimport, zfs_secpolicy_config, no_name }, + { zfs_ioc_pool_scrub, zfs_secpolicy_config, pool_name }, + { zfs_ioc_pool_freeze, 
zfs_secpolicy_config, no_name }, + { zfs_ioc_vdev_add, zfs_secpolicy_config, pool_name }, + { zfs_ioc_vdev_remove, zfs_secpolicy_config, pool_name }, + { zfs_ioc_vdev_online, zfs_secpolicy_config, pool_name }, + { zfs_ioc_vdev_offline, zfs_secpolicy_config, pool_name }, + { zfs_ioc_vdev_attach, zfs_secpolicy_config, pool_name }, + { zfs_ioc_vdev_detach, zfs_secpolicy_config, pool_name }, + { zfs_ioc_objset_stats, zfs_secpolicy_read, dataset_name }, + { zfs_ioc_dataset_list_next, zfs_secpolicy_read, dataset_name }, + { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, dataset_name }, + { zfs_ioc_set_prop, zfs_secpolicy_setprop, dataset_name }, + { zfs_ioc_set_quota, zfs_secpolicy_quota, dataset_name }, + { zfs_ioc_set_reservation, zfs_secpolicy_write, dataset_name }, + { zfs_ioc_set_volsize, zfs_secpolicy_config, dataset_name }, + { zfs_ioc_set_volblocksize, zfs_secpolicy_config, dataset_name }, + { zfs_ioc_create_minor, zfs_secpolicy_config, dataset_name }, + { zfs_ioc_remove_minor, zfs_secpolicy_config, dataset_name }, + { zfs_ioc_create, zfs_secpolicy_parent, dataset_name }, + { zfs_ioc_destroy, zfs_secpolicy_parent, dataset_name }, + { zfs_ioc_rollback, zfs_secpolicy_write, dataset_name }, + { zfs_ioc_rename, zfs_secpolicy_write, dataset_name }, + { zfs_ioc_recvbackup, zfs_secpolicy_write, dataset_name }, + { zfs_ioc_sendbackup, zfs_secpolicy_write, dataset_name }, +}; + +static int +zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) +{ + zfs_cmd_t *zc; + uint_t vec; + int error; + + if (getminor(dev) != 0) + return (zvol_ioctl(dev, cmd, arg, flag, cr, rvalp)); + + vec = cmd - ZFS_IOC; + + if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) + return (EINVAL); + + zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); + + error = xcopyin((void *)arg, zc, sizeof (zfs_cmd_t)); + + if (error == 0) { + zc->zc_cred = (uintptr_t)cr; + zc->zc_dev = dev; + error = zfs_ioc_vec[vec].zvec_secpolicy(zc->zc_name, + zc->zc_prop_name, cr); + } + + /* 
+ * Ensure that all pool/dataset names are valid before we pass down to + * the lower layers. + */ + if (error == 0) { + zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; + switch (zfs_ioc_vec[vec].zvec_namecheck) { + case pool_name: + if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) + error = EINVAL; + break; + + case dataset_name: + if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) + error = EINVAL; + break; + } + } + + if (error == 0) + error = zfs_ioc_vec[vec].zvec_func(zc); + + if (error == 0 || error == ENOMEM) { + int rc = xcopyout(zc, (void *)arg, sizeof (zfs_cmd_t)); + if (error == 0) + error = rc; + } + + kmem_free(zc, sizeof (zfs_cmd_t)); + return (error); +} + +static int +zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + if (ddi_create_minor_node(dip, "zfs", S_IFCHR, 0, + DDI_PSEUDO, 0) == DDI_FAILURE) + return (DDI_FAILURE); + + zfs_dip = dip; + + ddi_report_dev(dip); + + return (DDI_SUCCESS); +} + +static int +zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (spa_busy() || zfs_busy() || zvol_busy()) + return (DDI_FAILURE); + + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + zfs_dip = NULL; + + ddi_prop_remove_all(dip); + ddi_remove_minor_node(dip, NULL); + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = zfs_dip; + return (DDI_SUCCESS); + + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)(uintptr_t)getminor((dev_t)arg); + return (DDI_SUCCESS); + } + + return (DDI_FAILURE); +} + +/* + * OK, so this is a little weird. + * + * /dev/zfs is the control node, i.e. minor 0. + * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0. + * + * /dev/zfs has basically nothing to do except serve up ioctls, + * so most of the standard driver entry points are in zvol.c. 
+ */ +static struct cb_ops zfs_cb_ops = { + zvol_open, /* open */ + zvol_close, /* close */ + zvol_strategy, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + zvol_read, /* read */ + zvol_write, /* write */ + zfsdev_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + ddi_prop_op, /* prop_op */ + NULL, /* streamtab */ + D_NEW | D_MP | D_64BIT, /* Driver compatibility flag */ + CB_REV, /* version */ + zvol_aread, /* async read */ + zvol_awrite, /* async write */ +}; + +static struct dev_ops zfs_dev_ops = { + DEVO_REV, /* version */ + 0, /* refcnt */ + zfs_info, /* info */ + nulldev, /* identify */ + nulldev, /* probe */ + zfs_attach, /* attach */ + zfs_detach, /* detach */ + nodev, /* reset */ + &zfs_cb_ops, /* driver operations */ + NULL /* no bus operations */ +}; + +static struct modldrv zfs_modldrv = { + &mod_driverops, "ZFS storage pool version 1", &zfs_dev_ops +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&zfs_modlfs, + (void *)&zfs_modldrv, + NULL +}; + +int +_init(void) +{ + int error; + + if ((error = mod_install(&modlinkage)) != 0) + return (error); + + error = ldi_ident_from_mod(&modlinkage, &zfs_li); + ASSERT(error == 0); + + spa_init(FREAD | FWRITE); + zfs_init(); + zvol_init(); + + return (0); +} + +int +_fini(void) +{ + int error; + + if (spa_busy() || zfs_busy() || zvol_busy()) + return (EBUSY); + + if ((error = mod_remove(&modlinkage)) != 0) + return (error); + + zvol_fini(); + zfs_fini(); + spa_fini(); + + ldi_ident_release(zfs_li); + zfs_li = NULL; + + return (error); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/fs/zfs/zfs_log.c b/usr/src/uts/common/fs/zfs/zfs_log.c new file mode 100644 index 0000000000..dbfd87f67a --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs_log.c @@ -0,0 +1,337 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * 
Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/cmn_err.h> +#include <sys/kmem.h> +#include <sys/thread.h> +#include <sys/file.h> +#include <sys/vfs.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_dir.h> +#include <sys/zil.h> +#include <sys/byteorder.h> +#include <sys/policy.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/acl.h> +#include <sys/dmu.h> +#include <sys/spa.h> +#include <sys/ddi.h> + +/* + * All the functions in this file are used to construct the log entries + * to record transactions. They allocate * a intent log transaction + * structure (itx_t) and save within it all the information necessary to + * possibly replay the transaction. The itx is then assigned a sequence + * number and inserted in the in-memory list anchored in the zilog. + */ + +/* + * zfs_log_create() is used to handle TX_CREATE, TX_MKDIR and TX_MKXATTR + * transactions. 
+ */ +uint64_t +zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *dzp, znode_t *zp, char *name) +{ + itx_t *itx; + uint64_t seq; + lr_create_t *lr; + size_t namesize = strlen(name) + 1; + + if (zilog == NULL) + return (0); + + itx = zil_itx_create(txtype, sizeof (*lr) + namesize); + lr = (lr_create_t *)&itx->itx_lr; + lr->lr_doid = dzp->z_id; + lr->lr_foid = zp->z_id; + lr->lr_mode = zp->z_phys->zp_mode; + lr->lr_uid = zp->z_phys->zp_uid; + lr->lr_gid = zp->z_phys->zp_gid; + lr->lr_gen = zp->z_phys->zp_gen; + lr->lr_crtime[0] = zp->z_phys->zp_crtime[0]; + lr->lr_crtime[1] = zp->z_phys->zp_crtime[1]; + lr->lr_rdev = zp->z_phys->zp_rdev; + bcopy(name, (char *)(lr + 1), namesize); + + seq = zil_itx_assign(zilog, itx, tx); + dzp->z_last_itx = seq; + zp->z_last_itx = seq; + return (seq); +} + +/* + * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions. + */ +uint64_t +zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *dzp, char *name) +{ + itx_t *itx; + uint64_t seq; + lr_remove_t *lr; + size_t namesize = strlen(name) + 1; + + if (zilog == NULL) + return (0); + + itx = zil_itx_create(txtype, sizeof (*lr) + namesize); + lr = (lr_remove_t *)&itx->itx_lr; + lr->lr_doid = dzp->z_id; + bcopy(name, (char *)(lr + 1), namesize); + + seq = zil_itx_assign(zilog, itx, tx); + dzp->z_last_itx = seq; + return (seq); +} + +/* + * zfs_log_link() handles TX_LINK transactions. 
+ */ +uint64_t +zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *dzp, znode_t *zp, char *name) +{ + itx_t *itx; + uint64_t seq; + lr_link_t *lr; + size_t namesize = strlen(name) + 1; + + if (zilog == NULL) + return (0); + + itx = zil_itx_create(txtype, sizeof (*lr) + namesize); + lr = (lr_link_t *)&itx->itx_lr; + lr->lr_doid = dzp->z_id; + lr->lr_link_obj = zp->z_id; + bcopy(name, (char *)(lr + 1), namesize); + + seq = zil_itx_assign(zilog, itx, tx); + dzp->z_last_itx = seq; + zp->z_last_itx = seq; + return (seq); +} + +/* + * zfs_log_symlink() handles TX_SYMLINK transactions. + */ +uint64_t +zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *dzp, znode_t *zp, char *name, char *link) +{ + itx_t *itx; + uint64_t seq; + lr_create_t *lr; + size_t namesize = strlen(name) + 1; + size_t linksize = strlen(link) + 1; + + if (zilog == NULL) + return (0); + + itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize); + lr = (lr_create_t *)&itx->itx_lr; + lr->lr_doid = dzp->z_id; + lr->lr_foid = zp->z_id; + lr->lr_mode = zp->z_phys->zp_mode; + lr->lr_uid = zp->z_phys->zp_uid; + lr->lr_gid = zp->z_phys->zp_gid; + lr->lr_gen = zp->z_phys->zp_gen; + lr->lr_crtime[0] = zp->z_phys->zp_crtime[0]; + lr->lr_crtime[1] = zp->z_phys->zp_crtime[1]; + bcopy(name, (char *)(lr + 1), namesize); + bcopy(link, (char *)(lr + 1) + namesize, linksize); + + seq = zil_itx_assign(zilog, itx, tx); + dzp->z_last_itx = seq; + zp->z_last_itx = seq; + return (seq); +} + +/* + * zfs_log_rename() handles TX_RENAME transactions. 
+ */ +uint64_t +zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp) +{ + itx_t *itx; + uint64_t seq; + lr_rename_t *lr; + size_t snamesize = strlen(sname) + 1; + size_t dnamesize = strlen(dname) + 1; + + if (zilog == NULL) + return (0); + + itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); + lr = (lr_rename_t *)&itx->itx_lr; + lr->lr_sdoid = sdzp->z_id; + lr->lr_tdoid = tdzp->z_id; + bcopy(sname, (char *)(lr + 1), snamesize); + bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize); + + seq = zil_itx_assign(zilog, itx, tx); + sdzp->z_last_itx = seq; + tdzp->z_last_itx = seq; + szp->z_last_itx = seq; + return (seq); +} + +/* + * zfs_log_write() handles TX_WRITE transactions. + * + * We store data in the log buffers if it small enough. + * Otherwise we flush the data out via dmu_sync(). + */ +ssize_t zfs_immediate_write_sz = 65536; + +uint64_t +zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, offset_t off, ssize_t len, int ioflag, uio_t *uio) +{ + itx_t *itx; + uint64_t seq; + lr_write_t *lr; + int dlen, err; + + if (zilog == NULL || zp->z_reap) + return (0); + + dlen = (len <= zfs_immediate_write_sz ? len : 0); + itx = zil_itx_create(txtype, sizeof (*lr) + dlen); + itx->itx_data_copied = 0; + if ((ioflag & FDSYNC) && (dlen != 0)) { + err = xcopyin(uio->uio_iov->iov_base - len, + (char *)itx + offsetof(itx_t, itx_lr) + sizeof (*lr), + len); + /* + * copyin shouldn't fault as we've already successfully + * copied it to a dmu buffer. However if it does we'll get + * the data from the dmu later. 
+ */ + if (!err) + itx->itx_data_copied = 1; + } + lr = (lr_write_t *)&itx->itx_lr; + lr->lr_foid = zp->z_id; + lr->lr_offset = off; + lr->lr_length = len; + lr->lr_blkoff = 0; + BP_ZERO(&lr->lr_blkptr); + + itx->itx_private = zp->z_zfsvfs; + + seq = zil_itx_assign(zilog, itx, tx); + zp->z_last_itx = seq; + return (seq); +} + +/* + * zfs_log_truncate() handles TX_TRUNCATE transactions. + */ +uint64_t +zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, uint64_t off, uint64_t len) +{ + itx_t *itx; + uint64_t seq; + lr_truncate_t *lr; + + if (zilog == NULL || zp->z_reap) + return (0); + + itx = zil_itx_create(txtype, sizeof (*lr)); + lr = (lr_truncate_t *)&itx->itx_lr; + lr->lr_foid = zp->z_id; + lr->lr_offset = off; + lr->lr_length = len; + + seq = zil_itx_assign(zilog, itx, tx); + zp->z_last_itx = seq; + return (seq); +} + +/* + * zfs_log_setattr() handles TX_SETATTR transactions. + */ +uint64_t +zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, vattr_t *vap, uint_t mask_applied) +{ + itx_t *itx; + uint64_t seq; + lr_setattr_t *lr; + + if (zilog == NULL || zp->z_reap) + return (0); + + itx = zil_itx_create(txtype, sizeof (*lr)); + lr = (lr_setattr_t *)&itx->itx_lr; + lr->lr_foid = zp->z_id; + lr->lr_mask = (uint64_t)mask_applied; + lr->lr_mode = (uint64_t)vap->va_mode; + lr->lr_uid = (uint64_t)vap->va_uid; + lr->lr_gid = (uint64_t)vap->va_gid; + lr->lr_size = (uint64_t)vap->va_size; + ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime); + ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime); + + seq = zil_itx_assign(zilog, itx, tx); + zp->z_last_itx = seq; + return (seq); +} + +/* + * zfs_log_acl() handles TX_ACL transactions. 
+ */ +uint64_t +zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, int aclcnt, ace_t *z_ace) +{ + itx_t *itx; + uint64_t seq; + lr_acl_t *lr; + + if (zilog == NULL || zp->z_reap) + return (0); + + itx = zil_itx_create(txtype, sizeof (*lr) + aclcnt * sizeof (ace_t)); + lr = (lr_acl_t *)&itx->itx_lr; + lr->lr_foid = zp->z_id; + lr->lr_aclcnt = (uint64_t)aclcnt; + bcopy(z_ace, (ace_t *)(lr + 1), aclcnt * sizeof (ace_t)); + + seq = zil_itx_assign(zilog, itx, tx); + zp->z_last_itx = seq; + return (seq); +} diff --git a/usr/src/uts/common/fs/zfs/zfs_replay.c b/usr/src/uts/common/fs/zfs/zfs_replay.c new file mode 100644 index 0000000000..cd5a3848cb --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs_replay.c @@ -0,0 +1,337 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/cmn_err.h> +#include <sys/kmem.h> +#include <sys/thread.h> +#include <sys/file.h> +#include <sys/fcntl.h> +#include <sys/vfs.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_acl.h> +#include <sys/spa.h> +#include <sys/zil.h> +#include <sys/byteorder.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/acl.h> +#include <sys/atomic.h> +#include <sys/cred.h> + +/* + * Functions to replay ZFS intent log (ZIL) records + * The functions are called through a function vector (zfs_replay_vector) + * which is indexed by the transaction type. + */ + +static void +zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, + uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid) +{ + bzero(vap, sizeof (*vap)); + vap->va_mask = (uint_t)mask; + vap->va_type = IFTOVT(mode); + vap->va_mode = mode & MODEMASK; + vap->va_uid = (uid_t)uid; + vap->va_gid = (gid_t)gid; + vap->va_rdev = (dev_t)rdev; + vap->va_nodeid = nodeid; +} + +/* ARGSUSED */ +static int +zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap) +{ + return (ENOTSUP); +} + +static int +zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap) +{ + char *name = (char *)(lr + 1); /* name follows lr_create_t */ + char *link; /* symlink content follows name */ + znode_t *dzp; + vnode_t *vp = NULL; + vattr_t va; + int error; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) + return (error); + + zfs_init_vattr(&va, AT_TYPE | AT_MODE | AT_UID | AT_GID, + lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid); + + /* + * All forms of zfs create (create, mkdir, mkxattrdir, symlink) + * eventually end up in zfs_mknode(), which assigns the object's + * creation time and generation number. 
The generic VOP_CREATE() + * doesn't have either concept, so we smuggle the values inside + * the vattr's otherwise unused va_ctime and va_nblocks fields. + */ + ZFS_TIME_DECODE(&va.va_ctime, lr->lr_crtime); + va.va_nblocks = lr->lr_gen; + + switch ((int)lr->lr_common.lrc_txtype) { + case TX_CREATE: + error = VOP_CREATE(ZTOV(dzp), name, &va, 0, 0, &vp, kcred, 0); + break; + case TX_MKDIR: + error = VOP_MKDIR(ZTOV(dzp), name, &va, &vp, kcred); + break; + case TX_MKXATTR: + error = zfs_make_xattrdir(dzp, &va, &vp, kcred); + break; + case TX_SYMLINK: + link = name + strlen(name) + 1; + error = VOP_SYMLINK(ZTOV(dzp), name, &va, link, kcred); + break; + default: + error = ENOTSUP; + } + + if (error == 0 && vp != NULL) + VN_RELE(vp); + + VN_RELE(ZTOV(dzp)); + + return (error); +} + +static int +zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap) +{ + char *name = (char *)(lr + 1); /* name follows lr_remove_t */ + znode_t *dzp; + int error; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) + return (error); + + switch ((int)lr->lr_common.lrc_txtype) { + case TX_REMOVE: + error = VOP_REMOVE(ZTOV(dzp), name, kcred); + break; + case TX_RMDIR: + error = VOP_RMDIR(ZTOV(dzp), name, NULL, kcred); + break; + default: + error = ENOTSUP; + } + + VN_RELE(ZTOV(dzp)); + + return (error); +} + +static int +zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap) +{ + char *name = (char *)(lr + 1); /* name follows lr_link_t */ + znode_t *dzp, *zp; + int error; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) + return (error); + + if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) { + VN_RELE(ZTOV(dzp)); + return (error); + } + + error = VOP_LINK(ZTOV(dzp), ZTOV(zp), name, kcred); + + VN_RELE(ZTOV(zp)); + VN_RELE(ZTOV(dzp)); + + return (error); +} + +static int +zfs_replay_rename(zfsvfs_t *zfsvfs, 
lr_rename_t *lr, boolean_t byteswap) +{ + char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ + char *tname = sname + strlen(sname) + 1; + znode_t *sdzp, *tdzp; + int error; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0) + return (error); + + if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) { + VN_RELE(ZTOV(sdzp)); + return (error); + } + + error = VOP_RENAME(ZTOV(sdzp), sname, ZTOV(tdzp), tname, kcred); + + VN_RELE(ZTOV(tdzp)); + VN_RELE(ZTOV(sdzp)); + + return (error); +} + +static int +zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) +{ + char *data = (char *)(lr + 1); /* data follows lr_write_t */ + znode_t *zp; + int error; + ssize_t resid; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) + return (error); + + error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length, + lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + + VN_RELE(ZTOV(zp)); + + return (error); +} + +static int +zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap) +{ + znode_t *zp; + flock64_t fl; + int error; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) + return (error); + + bzero(&fl, sizeof (fl)); + fl.l_type = F_WRLCK; + fl.l_whence = 0; + fl.l_start = lr->lr_offset; + fl.l_len = lr->lr_length; + + error = VOP_SPACE(ZTOV(zp), F_FREESP, &fl, FWRITE | FOFFMAX, + lr->lr_offset, kcred, NULL); + + VN_RELE(ZTOV(zp)); + + return (error); +} + +static int +zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap) +{ + znode_t *zp; + vattr_t va; + int error; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) + return (error); + + zfs_init_vattr(&va, lr->lr_mask, lr->lr_mode, + lr->lr_uid, lr->lr_gid, 0, 
lr->lr_foid); + + va.va_size = lr->lr_size; + ZFS_TIME_DECODE(&va.va_atime, lr->lr_atime); + ZFS_TIME_DECODE(&va.va_mtime, lr->lr_mtime); + + error = VOP_SETATTR(ZTOV(zp), &va, 0, kcred, NULL); + + VN_RELE(ZTOV(zp)); + + return (error); +} + +static int +zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap) +{ + ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */ + vsecattr_t vsa; + znode_t *zp; + int error; + + if (byteswap) { + byteswap_uint64_array(lr, sizeof (*lr)); + zfs_ace_byteswap(ace, lr->lr_aclcnt); + } + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) + return (error); + + bzero(&vsa, sizeof (vsa)); + vsa.vsa_mask = VSA_ACE | VSA_ACECNT; + vsa.vsa_aclcnt = lr->lr_aclcnt; + vsa.vsa_aclentp = ace; + + error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred); + + VN_RELE(ZTOV(zp)); + + return (error); +} + +/* + * Callback vectors for replaying records + */ +zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { + zfs_replay_error, /* 0 no such transaction type */ + zfs_replay_create, /* TX_CREATE */ + zfs_replay_create, /* TX_MKDIR */ + zfs_replay_create, /* TX_MKXATTR */ + zfs_replay_create, /* TX_SYMLINK */ + zfs_replay_remove, /* TX_REMOVE */ + zfs_replay_remove, /* TX_RMDIR */ + zfs_replay_link, /* TX_LINK */ + zfs_replay_rename, /* TX_RENAME */ + zfs_replay_write, /* TX_WRITE */ + zfs_replay_truncate, /* TX_TRUNCATE */ + zfs_replay_setattr, /* TX_SETATTR */ + zfs_replay_acl, /* TX_ACL */ +}; diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c new file mode 100644 index 0000000000..502bcf39bf --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -0,0 +1,1072 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/acl.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/mntent.h> +#include <sys/mount.h> +#include <sys/cmn_err.h> +#include "fs/fs_subr.h" +#include <sys/zfs_znode.h> +#include <sys/zil.h> +#include <sys/fs/zfs.h> +#include <sys/dmu.h> +#include <sys/dsl_prop.h> +#include <sys/spa.h> +#include <sys/zap.h> +#include <sys/varargs.h> +#include <sys/policy.h> +#include <sys/atomic.h> +#include <sys/mkdev.h> +#include <sys/modctl.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_ctldir.h> + +int zfsfstype; +vfsops_t *zfs_vfsops = NULL; +static major_t zfs_major; +static minor_t zfs_minor; +static kmutex_t zfs_dev_mtx; + +static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr); +static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr); +static int zfs_root(vfs_t *vfsp, vnode_t **vpp); +static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp); +static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp); +static void zfs_freevfs(vfs_t *vfsp); +static void zfs_objset_close(zfsvfs_t *zfsvfs); + +static const 
fs_operation_def_t zfs_vfsops_template[] = { + VFSNAME_MOUNT, zfs_mount, + VFSNAME_UNMOUNT, zfs_umount, + VFSNAME_ROOT, zfs_root, + VFSNAME_STATVFS, zfs_statvfs, + VFSNAME_SYNC, (fs_generic_func_p) zfs_sync, + VFSNAME_VGET, zfs_vget, + VFSNAME_FREEVFS, (fs_generic_func_p) zfs_freevfs, + NULL, NULL +}; + +static const fs_operation_def_t zfs_vfsops_eio_template[] = { + VFSNAME_FREEVFS, (fs_generic_func_p) zfs_freevfs, + NULL, NULL +}; + +/* + * We need to keep a count of active fs's. + * This is necessary to prevent our module + * from being unloaded after a umount -f + */ +static uint32_t zfs_active_fs_count = 0; + +static char *noatime_cancel[] = { MNTOPT_ATIME, NULL }; +static char *atime_cancel[] = { MNTOPT_NOATIME, NULL }; + +static mntopt_t mntopts[] = { + { MNTOPT_XATTR, NULL, NULL, MO_NODISPLAY|MO_DEFAULT, NULL }, + { MNTOPT_NOATIME, noatime_cancel, NULL, MO_DEFAULT, NULL }, + { MNTOPT_ATIME, atime_cancel, NULL, 0, NULL } +}; + +static mntopts_t zfs_mntopts = { + sizeof (mntopts) / sizeof (mntopt_t), + mntopts +}; + +/*ARGSUSED*/ +int +zfs_sync(vfs_t *vfsp, short flag, cred_t *cr) +{ + /* + * Data integrity is job one. We don't want a compromised kernel + * writing to the storage pool, so we never sync during panic. + */ + if (panicstr) + return (0); + + /* + * SYNC_ATTR is used by fsflush() to force old filesystems like UFS + * to sync metadata, which they would otherwise cache indefinitely. + * Semantically, the only requirement is that the sync be initiated. + * The DMU syncs out txgs frequently, so there's nothing to do. + */ + if (flag & SYNC_ATTR) + return (0); + + if (vfsp != NULL) { + /* + * Sync a specific filesystem. + */ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + + ZFS_ENTER(zfsvfs); + if (zfsvfs->z_log != NULL) + zil_commit(zfsvfs->z_log, UINT64_MAX, FSYNC); + else + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + ZFS_EXIT(zfsvfs); + } else { + /* + * Sync all ZFS filesystems. This is what happens when you + * run sync(1M). 
Unlike other filesystems, ZFS honors the + * request by waiting for all pools to commit all dirty data. + */ + spa_sync_allpools(); + } + + return (0); +} + +static void +atime_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == TRUE) { + zfsvfs->z_atime = TRUE; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); + } else { + zfsvfs->z_atime = FALSE; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); + } +} + +static void +blksz_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval < SPA_MINBLOCKSIZE || + newval > SPA_MAXBLOCKSIZE || !ISP2(newval)) + newval = SPA_MAXBLOCKSIZE; + + zfsvfs->z_max_blksz = newval; + zfsvfs->z_vfs->vfs_bsize = newval; +} + +static void +readonly_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval) { + /* XXX locking on vfs_flag? */ + zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); + (void) zfs_delete_thread_target(zfsvfs, 0); + } else { + /* XXX locking on vfs_flag? 
*/ + zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); + (void) zfs_delete_thread_target(zfsvfs, 1); + } +} + +static void +devices_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == FALSE) { + zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0); + } else { + zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0); + } +} + +static void +setuid_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == FALSE) { + zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); + } else { + zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); + } +} + +static void +exec_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == FALSE) { + zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); + } else { + zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); + } +} + +static void +snapdir_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_show_ctldir = newval; +} + +static void +acl_mode_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_acl_mode = newval; +} + +static void +acl_inherit_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_acl_inherit = newval; +} + +/*ARGSUSED*/ +static int +zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + 
zfsvfs_t *zfsvfs = NULL; + znode_t *zp = NULL; + vnode_t *vp = NULL; + objset_t *os = NULL; + struct dsl_dataset *ds; + char *osname; + uint64_t readonly, recordsize; + pathname_t spn; + dev_t mount_dev; + major_t new_major; + int mode; + int error = 0; + uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ? + UIO_SYSSPACE : UIO_USERSPACE; + int canwrite; + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_REMOUNT) == 0 && + (uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* + * ZFS does not support passing unparsed data in via MS_DATA. + * Users should use the MS_OPTIONSTR interface; this means + * that all option parsing is already done and the options struct + * can be interrogated. + */ + if ((uap->flags & MS_DATA) && uap->datalen > 0) + return (EINVAL); + + /* + * When doing a remount, we simply refresh our temporary properties + * according to those options set in the current VFS options. 
+ */ + if (uap->flags & MS_REMOUNT) { + zfsvfs = vfsp->vfs_data; + + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) + readonly_changed_cb(zfsvfs, B_TRUE); + else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { + if (dmu_objset_is_snapshot(zfsvfs->z_os)) + return (EROFS); + readonly_changed_cb(zfsvfs, B_FALSE); + } + + if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { + devices_changed_cb(zfsvfs, B_FALSE); + setuid_changed_cb(zfsvfs, B_FALSE); + } else { + if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) + devices_changed_cb(zfsvfs, B_FALSE); + else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) + devices_changed_cb(zfsvfs, B_TRUE); + + if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) + setuid_changed_cb(zfsvfs, B_FALSE); + else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) + setuid_changed_cb(zfsvfs, B_TRUE); + } + + if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) + exec_changed_cb(zfsvfs, B_FALSE); + else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) + exec_changed_cb(zfsvfs, B_TRUE); + + return (0); + } + + /* + * Get the objset name (the "special" mount argument). + */ + if (error = pn_get(uap->spec, fromspace, &spn)) + return (error); + + osname = spn.pn_path; + + if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) + goto out; + + /* + * Refuse to mount a filesystem if we are in a local zone and the + * dataset is not visible. + */ + if (!INGLOBALZONE(curproc) && + (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { + error = EPERM; + goto out; + } + + /* + * Initialize the zfs-specific filesystem structure. + * Should probably make this a kmem cache, shuffle fields, + * and just bzero upto z_hold_mtx[]. 
+ */ + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + zfsvfs->z_vfs = vfsp; + zfsvfs->z_parent = zfsvfs; + zfsvfs->z_assign = TXG_NOWAIT; + zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; + zfsvfs->z_show_ctldir = VISIBLE; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + rw_init(&zfsvfs->z_um_lock, NULL, RW_DEFAULT, NULL); + + /* + * Initialize the generic filesystem structure. + */ + vfsp->vfs_bcount = 0; + vfsp->vfs_data = NULL; + + /* + * Create a unique device for the mount. + */ + do { + ASSERT3U(zfs_minor, <=, MAXMIN32); + int start = zfs_minor; + do { + mutex_enter(&zfs_dev_mtx); + zfs_minor++; + if (zfs_minor > MAXMIN32) + zfs_minor = 0; + mount_dev = makedevice(zfs_major, zfs_minor); + mutex_exit(&zfs_dev_mtx); + } while (vfs_devismounted(mount_dev) && zfs_minor != start); + if (zfs_minor == start) { + /* + * We are using all ~262,000 minor numbers + * for the current major number. Create a + * new major number. 
+ */ + if ((new_major = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, + "zfs_mount: Can't get unique" + " major device number."); + goto out; + } + mutex_enter(&zfs_dev_mtx); + zfs_major = new_major; + zfs_minor = 0; + mutex_exit(&zfs_dev_mtx); + } else { + break; + } + /* CONSTANTCONDITION */ + } while (1); + + ASSERT(vfs_devismounted(mount_dev) == 0); + + if (dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL) != 0) + recordsize = SPA_MAXBLOCKSIZE; + + vfsp->vfs_dev = mount_dev; + vfsp->vfs_fstype = zfsfstype; + vfsp->vfs_bsize = recordsize; + vfsp->vfs_flag |= VFS_NOTRUNC; + vfsp->vfs_data = zfsvfs; + + error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL); + if (error) + goto out; + + if (readonly) + mode = DS_MODE_PRIMARY | DS_MODE_READONLY; + else + mode = DS_MODE_PRIMARY; + + error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); + if (error == EROFS) { + mode = DS_MODE_PRIMARY | DS_MODE_READONLY; + error = dmu_objset_open(osname, DMU_OST_ZFS, mode, + &zfsvfs->z_os); + } + os = zfsvfs->z_os; + + if (error) + goto out; + + if (error = zfs_init_fs(zfsvfs, &zp, cr)) + goto out; + + if (dmu_objset_is_snapshot(os)) { + ASSERT(mode & DS_MODE_READONLY); + atime_changed_cb(zfsvfs, B_FALSE); + readonly_changed_cb(zfsvfs, B_TRUE); + zfsvfs->z_issnap = B_TRUE; + } else { + int do_readonly = FALSE, readonly; + int do_setuid = FALSE, setuid; + int do_exec = FALSE, exec; + int do_devices = FALSE, devices; + + /* + * Start a delete thread running. + */ + (void) zfs_delete_thread_target(zfsvfs, 1); + + /* + * Parse and replay the intent log. + */ + zil_replay(os, zfsvfs, &zfsvfs->z_assign, zfs_replay_vector, + (void (*)(void *))zfs_delete_wait_empty); + + if (!zil_disable) + zfsvfs->z_log = zil_open(os, zfs_get_data); + + /* + * The act of registering our callbacks will destroy any mount + * options we may have. 
In order to enable temporary overrides + * of mount options, we stash away the current values and + * restore them after we register the callbacks. + */ + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { + readonly = B_TRUE; + do_readonly = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { + readonly = B_FALSE; + do_readonly = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { + devices = B_FALSE; + setuid = B_FALSE; + do_devices = B_TRUE; + do_setuid = B_TRUE; + } else { + if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) { + devices = B_FALSE; + do_devices = B_TRUE; + } else if (vfs_optionisset(vfsp, + MNTOPT_DEVICES, NULL)) { + devices = B_TRUE; + do_devices = B_TRUE; + } + + if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { + setuid = B_FALSE; + do_setuid = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { + setuid = B_TRUE; + do_setuid = B_TRUE; + } + } + if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { + exec = B_FALSE; + do_exec = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { + exec = B_TRUE; + do_exec = B_TRUE; + } + + /* + * Register property callbacks. + */ + ds = dmu_objset_ds(os); + VERIFY(dsl_prop_register(ds, "atime", atime_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_register(ds, "recordsize", blksz_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_register(ds, "readonly", readonly_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_register(ds, "devices", devices_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_register(ds, "setuid", setuid_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_register(ds, "exec", exec_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_register(ds, "snapdir", snapdir_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_register(ds, "aclmode", acl_mode_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_register(ds, "aclinherit", + acl_inherit_changed_cb, zfsvfs) == 0); + + + /* + * Invoke our callbacks to restore temporary mount options. 
+ */ + if (do_readonly) + readonly_changed_cb(zfsvfs, readonly); + if (do_setuid) + setuid_changed_cb(zfsvfs, setuid); + if (do_exec) + exec_changed_cb(zfsvfs, exec); + if (do_devices) + devices_changed_cb(zfsvfs, devices); + } + + vp = ZTOV(zp); + if (!zfsvfs->z_issnap) + zfsctl_create(zfsvfs); +out: + if (error) { + if (zp) + VN_RELE(vp); + + if (zfsvfs) { + if (os) + dmu_objset_close(os); + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + } + } else { + atomic_add_32(&zfs_active_fs_count, 1); + VN_RELE(vp); + } + + pn_free(&spn); + return (error); +} + +static int +zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + dmu_objset_stats_t dstats; + dev32_t d32; + + ZFS_ENTER(zfsvfs); + + dmu_objset_stats(zfsvfs->z_os, &dstats); + + /* + * The underlying storage pool actually uses multiple block sizes. + * We report the fragsize as the smallest block size we support, + * and we report our blocksize as the filesystem's maximum blocksize. + */ + statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT; + statp->f_bsize = zfsvfs->z_max_blksz; + + /* + * The following report "total" blocks of various kinds in the + * file system, but reported in terms of f_frsize - the + * "fragment" size. + */ + + statp->f_blocks = + (dstats.dds_space_refd + dstats.dds_available) >> SPA_MINBLOCKSHIFT; + statp->f_bfree = dstats.dds_available >> SPA_MINBLOCKSHIFT; + statp->f_bavail = statp->f_bfree; /* no root reservation */ + + /* + * statvfs() should really be called statufs(), because it assumes + * static metadata. ZFS doesn't preallocate files, so the best + * we can do is report the max that could possibly fit in f_files, + * and that minus the number actually used in f_ffree. + * For f_ffree, report the smaller of the number of object available + * and the number of blocks (each object will take at least a block). 
+ */ + statp->f_ffree = MIN(dstats.dds_objects_avail, statp->f_bfree); + statp->f_favail = statp->f_ffree; /* no "root reservation" */ + statp->f_files = statp->f_ffree + dstats.dds_objects_used; + + (void) cmpldev(&d32, vfsp->vfs_dev); + statp->f_fsid = d32; + + /* + * We're a zfs filesystem. + */ + (void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name); + + statp->f_flag = 0; + + statp->f_namemax = ZFS_MAXNAMELEN; + + /* + * We have all of 32 characters to stuff a string here. + * Is there anything useful we could/should provide? + */ + bzero(statp->f_fstr, sizeof (statp->f_fstr)); + + ZFS_EXIT(zfsvfs); + return (0); +} + +static int +zfs_root(vfs_t *vfsp, vnode_t **vpp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + znode_t *rootzp; + int error; + + ZFS_ENTER(zfsvfs); + + error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); + if (error == 0) + *vpp = ZTOV(rootzp); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/*ARGSUSED*/ +static int +zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + int ret; + + if ((ret = secpolicy_fs_unmount(cr, vfsp)) != 0) + return (ret); + + /* + * Unmount any snapshots mounted under .zfs before unmounting the + * dataset itself. + */ + if (zfsvfs->z_ctldir != NULL && + (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) + return (ret); + + if (fflag & MS_FORCE) { + vfsp->vfs_flag |= VFS_UNMOUNTED; + zfsvfs->z_unmounted1 = B_TRUE; + + /* + * Wait for all zfs threads to leave zfs. + * Grabbing a rwlock as reader in all vops and + * as writer here doesn't work because it too easy to get + * multiple reader enters as zfs can re-enter itself. + * This can lead to deadlock if there is an intervening + * rw_enter as writer. + * So a file system threads ref count (z_op_cnt) is used. 
+ * A polling loop on z_op_cnt may seem inefficient, but + * - this saves all threads on exit from having to grab a + * mutex in order to cv_signal + * - only occurs on forced unmount in the rare case when + * there are outstanding threads within the file system. + */ + while (zfsvfs->z_op_cnt) { + delay(1); + } + + zfs_objset_close(zfsvfs); + + return (0); + } + + zfs_zcache_flush(zfsvfs); + + /* + * Stop all delete threads. + */ + (void) zfs_delete_thread_target(zfsvfs, 0); + + /* + * Check the number of active vnodes in the file system. + * Our count is maintained in the vfs structure, but the number + * is off by 1 to indicate a hold on the vfs structure itself. + * + * The '.zfs' directory maintains a reference of its own, and any active + * references underneath are reflected in the vnode count. + */ + if (zfsvfs->z_ctldir == NULL) { + if (vfsp->vfs_count > 1) { + if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) + (void) zfs_delete_thread_target(zfsvfs, 1); + return (EBUSY); + } + } else { + if (vfsp->vfs_count > 2 || + (zfsvfs->z_ctldir->v_count > 1 && !(fflag & MS_FORCE))) { + if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) + (void) zfs_delete_thread_target(zfsvfs, 1); + return (EBUSY); + } + } + + vfsp->vfs_flag |= VFS_UNMOUNTED; + zfs_objset_close(zfsvfs); + + /* + * We can now safely destroy the '.zfs' directory node, which will + * release its hold on the vfs_t. 
+ */ + if (zfsvfs->z_ctldir != NULL) + zfsctl_destroy(zfsvfs); + + return (0); +} + +static int +zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + znode_t *zp; + uint64_t object = 0; + uint64_t fid_gen = 0; + uint64_t gen_mask; + uint64_t zp_gen; + int i, err; + + *vpp = NULL; + + ZFS_ENTER(zfsvfs); + + if (fidp->fid_len == LONG_FID_LEN) { + zfid_long_t *zlfid = (zfid_long_t *)fidp; + uint64_t objsetid = 0; + uint64_t setgen = 0; + + for (i = 0; i < sizeof (zlfid->zf_setid); i++) + objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); + + for (i = 0; i < sizeof (zlfid->zf_setgen); i++) + setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); + + ZFS_EXIT(zfsvfs); + + err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); + if (err) + return (EINVAL); + ZFS_ENTER(zfsvfs); + } + + if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { + zfid_short_t *zfid = (zfid_short_t *)fidp; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); + + for (i = 0; i < sizeof (zfid->zf_gen); i++) + fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); + } else { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* A zero fid_gen means we are in the .zfs control directories */ + if (fid_gen == 0 && + (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { + *vpp = zfsvfs->z_ctldir; + ASSERT(*vpp != NULL); + if (object == ZFSCTL_INO_SNAPDIR) { + VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL, + 0, NULL, NULL) == 0); + } else { + VN_HOLD(*vpp); + } + ZFS_EXIT(zfsvfs); + return (0); + } + + gen_mask = -1ULL >> (64 - 8 * i); + + dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); + if (err = zfs_zget(zfsvfs, object, &zp)) { + ZFS_EXIT(zfsvfs); + return (err); + } + zp_gen = zp->z_phys->zp_gen & gen_mask; + if (zp_gen == 0) + zp_gen = 1; + if (zp->z_reap || zp_gen != fid_gen) { + dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); + 
VN_RELE(ZTOV(zp)); + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + *vpp = ZTOV(zp); + ZFS_EXIT(zfsvfs); + return (0); +} + +static void +zfs_objset_close(zfsvfs_t *zfsvfs) +{ + zfs_delete_t *zd = &zfsvfs->z_delete_head; + znode_t *zp, *nextzp; + objset_t *os = zfsvfs->z_os; + struct dsl_dataset *ds; + + /* + * Stop all delete threads. + */ + (void) zfs_delete_thread_target(zfsvfs, 0); + + /* + * For forced unmount, at this point all vops except zfs_inactive + * are erroring EIO. We need to now suspend zfs_inactive threads + * while we are freeing dbufs before switching zfs_inactive + * to use behaviour without a objset. + */ + rw_enter(&zfsvfs->z_um_lock, RW_WRITER); + + zfs_zcache_flush(zfsvfs); + + /* + * Release all delete in progress znodes + * They will be processed when the file system remounts. + */ + mutex_enter(&zd->z_mutex); + while (zp = list_head(&zd->z_znodes)) { + list_remove(&zd->z_znodes, zp); + zp->z_dbuf_held = 0; + dmu_buf_rele(zp->z_dbuf); + } + mutex_exit(&zd->z_mutex); + + /* + * Release all holds on dbufs + * Note, although we have stopped all other vop threads and + * zfs_inactive(), the dmu can callback via znode_pageout_func() + * which can zfs_znode_free() the znode. + * So we lock z_all_znodes; search the list for a held + * dbuf; drop the lock (we know zp can't disappear if we hold + * a dbuf lock; then regrab the lock and restart. + */ + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = nextzp) { + nextzp = list_next(&zfsvfs->z_all_znodes, zp); + if (zp->z_dbuf_held) { + /* dbufs should only be held when force unmounting */ + zp->z_dbuf_held = 0; + mutex_exit(&zfsvfs->z_znodes_lock); + dmu_buf_rele(zp->z_dbuf); + /* Start again */ + mutex_enter(&zfsvfs->z_znodes_lock); + nextzp = list_head(&zfsvfs->z_all_znodes); + } + } + mutex_exit(&zfsvfs->z_znodes_lock); + + /* + * Unregister properties. 
+ */ + if (!dmu_objset_is_snapshot(os)) { + ds = dmu_objset_ds(os); + + VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, + zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "aclinherit", + acl_inherit_changed_cb, zfsvfs) == 0); + } + + /* + * Make the dmu drop all it dbuf holds so that zfs_inactive + * can then safely free znode/vnodes. + */ + txg_wait_synced(dmu_objset_pool(os), 0); + + /* + * Switch zfs_inactive to behaviour without an objset. + * It just tosses cached pages and frees the znode & vnode. + * Then re-enable zfs_inactive threads in that new behaviour. + */ + zfsvfs->z_unmounted2 = B_TRUE; + rw_exit(&zfsvfs->z_um_lock); /* re-enable any zfs_inactive threads */ + + /* + * Close the zil. Can't close the zil while zfs_inactive + * threads are blocked as zil_close can call zfs_inactive. + */ + if (zfsvfs->z_log) { + zil_close(zfsvfs->z_log); + zfsvfs->z_log = NULL; + } + + /* + * Finally close the objset + */ + dmu_objset_close(os); + +} + +static void +zfs_freevfs(vfs_t *vfsp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + + atomic_add_32(&zfs_active_fs_count, -1); +} + +/* + * VFS_INIT() initialization. Note that there is no VFS_FINI(), + * so we can't safely do any non-idempotent initialization here. + * Leave that to zfs_init() and zfs_fini(), which are called + * from the module's _init() and _fini() entry points. 
+ */ +/*ARGSUSED*/ +static int +zfs_vfsinit(int fstype, char *name) +{ + int error; + + zfsfstype = fstype; + + /* + * Setup vfsops and vnodeops tables. + */ + error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops); + if (error != 0) { + cmn_err(CE_WARN, "zfs: bad vfs ops template"); + } + + error = zfs_create_op_tables(); + if (error) { + zfs_remove_op_tables(); + cmn_err(CE_WARN, "zfs: bad vnode ops template"); + (void) vfs_freevfsops_by_type(zfsfstype); + return (error); + } + + mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL); + + /* + * unique major number for all zfs mounts + */ + if ((zfs_major = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, + "zfs_vfsinit: Can't get unique device number."); + zfs_remove_op_tables(); + (void) vfs_freevfsops_by_type(zfsfstype); + return (error); + } + zfs_minor = 0; + + return (0); +} + +void +zfs_init(void) +{ + /* + * Initialize .zfs directory structures + */ + zfsctl_init(); + + /* + * Initialize znode cache, vnode ops, etc... + */ + zfs_znode_init(); +} + +void +zfs_fini(void) +{ + zfsctl_fini(); + zfs_znode_fini(); +} + +int +zfs_busy(void) +{ + return (zfs_active_fs_count != 0); +} + +static vfsdef_t vfw = { + VFSDEF_VERSION, + MNTTYPE_ZFS, + zfs_vfsinit, + VSW_HASPROTO | VSW_CANRWRO | VSW_CANREMOUNT | VSW_VOLATILEDEV, + &zfs_mntopts +}; + +struct modlfs zfs_modlfs = { + &mod_fsops, "ZFS filesystem version 1", &vfw +}; diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c new file mode 100644 index 0000000000..eb9964aa20 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c @@ -0,0 +1,3663 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/resource.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/kmem.h> +#include <sys/taskq.h> +#include <sys/uio.h> +#include <sys/vmsystm.h> +#include <sys/atomic.h> +#include <vm/seg_vn.h> +#include <vm/pvn.h> +#include <vm/as.h> +#include <sys/mman.h> +#include <sys/pathname.h> +#include <sys/cmn_err.h> +#include <sys/errno.h> +#include <sys/unistd.h> +#include <sys/zfs_vfsops.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_acl.h> +#include <sys/zfs_ioctl.h> +#include <sys/fs/zfs.h> +#include <sys/dmu.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/refcount.h> /* temporary for debugging purposes */ +#include <sys/dbuf.h> +#include <sys/zap.h> +#include <sys/dirent.h> +#include <sys/policy.h> +#include <sys/sunddi.h> +#include <sys/filio.h> +#include "fs/fs_subr.h" +#include <sys/zfs_ctldir.h> + +/* + * Programming rules. + * + * Each vnode op performs some logical unit of work. To do this, the ZPL must + * properly lock its in-core state, create a DMU transaction, do the work, + * record this work in the intent log (ZIL), commit the DMU transaction, + * and wait the the intent log to commit if it's is a synchronous operation. 
+ * Morover, the vnode ops must work in both normal and log replay context. + * The ordering of events is important to avoid deadlocks and references + * to freed memory. The example below illustrates the following Big Rules: + * + * (1) A check must be made in each zfs thread for a mounted file system. + * This is done avoiding races using ZFS_ENTER(zfsvfs). + * A ZFS_EXIT(zfsvfs) is needed before all returns. + * + * (2) VN_RELE() should always be the last thing except for zil_commit() + * and ZFS_EXIT(). This is for 3 reasons: + * First, if it's the last reference, the vnode/znode + * can be freed, so the zp may point to freed memory. Second, the last + * reference will call zfs_zinactive(), which may induce a lot of work -- + * pushing cached pages (which requires z_grow_lock) and syncing out + * cached atime changes. Third, zfs_zinactive() may require a new tx, + * which could deadlock the system if you were already holding one. + * + * (3) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign(). + * In normal operation, this will be TXG_NOWAIT. During ZIL replay, + * it will be a specific txg. Either way, dmu_tx_assign() never blocks. + * This is critical because we don't want to block while holding locks. + * Note, in particular, that if a lock is sometimes acquired before + * the tx assigns, and sometimes after (e.g. z_lock), then failing to + * use a non-blocking assign can deadlock the system. The scenario: + * + * Thread A has grabbed a lock before calling dmu_tx_assign(). + * Thread B is in an already-assigned tx, and blocks for this lock. + * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() + * forever, because the previous txg can't quiesce until B's tx commits. + * + * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, + * then drop all locks, call txg_wait_open(), and try again. + * + * (4) If the operation succeeded, generate the intent log entry for it + * before dropping locks. 
This ensures that the ordering of events + * in the intent log matches the order in which they actually occurred. + * + * (5) At the end of each vnode op, the DMU tx must always commit, + * regardless of whether there were any errors. + * + * (6) After dropping all locks, invoke zil_commit(zilog, seq, ioflag) + * to ensure that synchronous semantics are provided when necessary. + * + * In general, this is how things should be ordered in each vnode op: + * + * ZFS_ENTER(zfsvfs); // exit if unmounted + * top: + * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD()) + * rw_enter(...); // grab any other locks you need + * tx = dmu_tx_create(...); // get DMU tx + * dmu_tx_hold_*(); // hold each object you might modify + * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign + * if (error) { + * dmu_tx_abort(tx); // abort DMU tx + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * VN_RELE(...); // release held vnodes + * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + * txg_wait_open(dmu_objset_pool(os), 0); + * goto top; + * } + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // really out of space + * } + * error = do_real_work(); // do whatever this VOP does + * if (error == 0) + * seq = zfs_log_*(...); // on success, make ZIL entry + * dmu_tx_commit(tx); // commit DMU tx -- error or not + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * VN_RELE(...); // release held vnodes + * zil_commit(zilog, seq, ioflag); // synchronous when necessary + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // done, report error + */ + +/* ARGSUSED */ +static int +zfs_open(vnode_t **vpp, int flag, cred_t *cr) +{ + return (0); +} + +/* ARGSUSED */ +static int +zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) +{ + /* + * Clean up any locks held by this process on the vp. 
+ */ + cleanlocks(vp, ddi_get_pid(), 0); + cleanshares(vp, ddi_get_pid()); + + return (0); +} + +/* + * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and + * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. + */ +static int +zfs_holey(vnode_t *vp, int cmd, offset_t *off) +{ + znode_t *zp = VTOZ(vp); + uint64_t noff = (uint64_t)*off; /* new offset */ + uint64_t file_sz; + int error; + boolean_t hole; + + rw_enter(&zp->z_grow_lock, RW_READER); + file_sz = zp->z_phys->zp_size; + if (noff >= file_sz) { + rw_exit(&zp->z_grow_lock); + return (ENXIO); + } + + if (cmd == _FIO_SEEK_HOLE) + hole = B_TRUE; + else + hole = B_FALSE; + + error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); + rw_exit(&zp->z_grow_lock); + + /* end of file? */ + if ((error == ESRCH) || (noff > file_sz)) { + /* + * Handle the virtual hole at the end of file. + */ + if (hole) { + *off = file_sz; + return (0); + } + return (ENXIO); + } + + if (noff < *off) + return (error); + *off = noff; + return (error); +} + +/* ARGSUSED */ +static int +zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred, + int *rvalp) +{ + offset_t off; + int error; + zfsvfs_t *zfsvfs; + + switch (com) { + case _FIOFFS: + return (zfs_sync(vp->v_vfsp, 0, cred)); + + case _FIO_SEEK_DATA: + case _FIO_SEEK_HOLE: + if (ddi_copyin((void *)data, &off, sizeof (off), flag)) + return (EFAULT); + + zfsvfs = VTOZ(vp)->z_zfsvfs; + ZFS_ENTER(zfsvfs); + + /* offset parameter is in/out */ + error = zfs_holey(vp, com, &off); + ZFS_EXIT(zfsvfs); + if (error) + return (error); + if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) + return (EFAULT); + return (0); + } + return (ENOTTY); +} + +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Write: If we find a memory mapped page, we write to *both* + * the page and the dmu buffer. 
+ * + * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when + * the file is memory mapped. + */ +static int +mappedwrite(vnode_t *vp, uint64_t woff, int nbytes, uio_t *uio, dmu_tx_t *tx) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int64_t start, off; + int len = nbytes; + int error = 0; + + start = uio->uio_loffset; + off = start & PAGEOFFSET; + for (start &= PAGEMASK; len > 0; start += PAGESIZE) { + page_t *pp; + uint64_t bytes = MIN(PAGESIZE - off, len); + + /* + * We don't want a new page to "appear" in the middle of + * the file update (because it may not get the write + * update data), so we grab a lock to block + * zfs_getpage(). + */ + rw_enter(&zp->z_map_lock, RW_WRITER); + if (pp = page_lookup(vp, start, SE_SHARED)) { + caddr_t va; + + rw_exit(&zp->z_map_lock); + va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1L); + error = uiomove(va+off, bytes, UIO_WRITE, uio); + if (error == 0) { + dmu_write(zfsvfs->z_os, zp->z_id, + woff, bytes, va+off, tx); + } + ppmapout(va); + page_unlock(pp); + } else { + error = dmu_write_uio(zfsvfs->z_os, zp->z_id, + woff, bytes, uio, tx); + rw_exit(&zp->z_map_lock); + } + len -= bytes; + woff += bytes; + off = 0; + if (error) + break; + } + return (error); +} + +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Read: We "read" preferentially from memory mapped pages, + * else we default from the dmu buffer. + * + * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when + * the file is memory mapped. 
+ */ +static int +mappedread(vnode_t *vp, char *addr, int nbytes, uio_t *uio) +{ + int64_t start, off, bytes; + int len = nbytes; + int error = 0; + + start = uio->uio_loffset; + off = start & PAGEOFFSET; + for (start &= PAGEMASK; len > 0; start += PAGESIZE) { + page_t *pp; + + bytes = MIN(PAGESIZE - off, len); + if (pp = page_lookup(vp, start, SE_SHARED)) { + caddr_t va; + + va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1L); + error = uiomove(va + off, bytes, UIO_READ, uio); + ppmapout(va); + page_unlock(pp); + } else { + /* XXX use dmu_read here? */ + error = uiomove(addr, bytes, UIO_READ, uio); + } + len -= bytes; + addr += bytes; + off = 0; + if (error) + break; + } + return (error); +} + +uint_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ + +/* + * Read bytes from specified file into supplied buffer. + * + * IN: vp - vnode of file to be read from. + * uio - structure supplying read location, range info, + * and return buffer. + * ioflag - SYNC flags; used to provide FRSYNC semantics. + * cr - credentials of caller. + * + * OUT: uio - updated offset and range, buffer filled. 
+ * + * RETURN: 0 if success + * error code if failure + * + * Side Effects: + * vp - atime updated if byte count > 0 + */ +/* ARGSUSED */ +static int +zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t delta; + ssize_t n, size, cnt, ndone; + int error, i, numbufs; + dmu_buf_t *dbp, **dbpp; + + ZFS_ENTER(zfsvfs); + + /* + * Validate file offset + */ + if (uio->uio_loffset < (offset_t)0) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* + * Fasttrack empty reads + */ + if (uio->uio_resid == 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + + /* + * Check for region locks + */ + if (MANDMODE((mode_t)zp->z_phys->zp_mode)) { + if (error = chklock(vp, FREAD, + uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + /* + * If we're in FRSYNC mode, sync out this znode before reading it. + */ + zil_commit(zfsvfs->z_log, zp->z_last_itx, ioflag & FRSYNC); + + /* + * Make sure nobody restructures the file (changes block size) + * in the middle of the read. + */ + rw_enter(&zp->z_grow_lock, RW_READER); + /* + * If we are reading past end-of-file we can skip + * to the end; but we might still need to set atime. + */ + if (uio->uio_loffset >= zp->z_phys->zp_size) { + cnt = 0; + error = 0; + goto out; + } + + cnt = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset); + + for (ndone = 0; ndone < cnt; ndone += zfs_read_chunk_size) { + ASSERT(uio->uio_loffset < zp->z_phys->zp_size); + n = MIN(zfs_read_chunk_size, + zp->z_phys->zp_size - uio->uio_loffset); + n = MIN(n, cnt); + dbpp = dmu_buf_hold_array(zfsvfs->z_os, zp->z_id, + uio->uio_loffset, n, &numbufs); + if (error = dmu_buf_read_array_canfail(dbpp, numbufs)) { + dmu_buf_rele_array(dbpp, numbufs); + goto out; + } + /* + * Compute the adjustment to align the dmu buffers + * with the uio buffer. 
+ */ + delta = uio->uio_loffset - dbpp[0]->db_offset; + + for (i = 0; i < numbufs; i++) { + if (n < 0) + break; + dbp = dbpp[i]; + size = dbp->db_size - delta; + /* + * XXX -- this is correct, but may be suboptimal. + * If the pages are all clean, we don't need to + * go through mappedread(). Maybe the VMODSORT + * stuff can help us here. + */ + if (vn_has_cached_data(vp)) { + error = mappedread(vp, (caddr_t)dbp->db_data + + delta, (n < size ? n : size), uio); + } else { + error = uiomove((caddr_t)dbp->db_data + delta, + (n < size ? n : size), UIO_READ, uio); + } + if (error) { + dmu_buf_rele_array(dbpp, numbufs); + goto out; + } + n -= dbp->db_size; + if (delta) { + n += delta; + delta = 0; + } + } + dmu_buf_rele_array(dbpp, numbufs); + } +out: + rw_exit(&zp->z_grow_lock); + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Fault in the pages of the first n bytes specified by the uio structure. + * 1 byte in each page is touched and the uio struct is unmodified. + * Any error will exit this routine as this is only a best + * attempt to get the pages resident. This is a copy of ufs_trans_touch(). + */ +static void +zfs_prefault_write(ssize_t n, struct uio *uio) +{ + struct iovec *iov; + ulong_t cnt, incr; + caddr_t p; + uint8_t tmp; + + iov = uio->uio_iov; + + while (n) { + cnt = MIN(iov->iov_len, n); + if (cnt == 0) { + /* empty iov entry */ + iov++; + continue; + } + n -= cnt; + /* + * touch each page in this segment. + */ + p = iov->iov_base; + while (cnt) { + switch (uio->uio_segflg) { + case UIO_USERSPACE: + case UIO_USERISPACE: + if (fuword8(p, &tmp)) + return; + break; + case UIO_SYSSPACE: + if (kcopy(p, &tmp, 1)) + return; + break; + } + incr = MIN(cnt, PAGESIZE); + p += incr; + cnt -= incr; + } + /* + * touch the last byte in case it straddles a page. 
+ */ + p--; + switch (uio->uio_segflg) { + case UIO_USERSPACE: + case UIO_USERISPACE: + if (fuword8(p, &tmp)) + return; + break; + case UIO_SYSSPACE: + if (kcopy(p, &tmp, 1)) + return; + break; + } + iov++; + } +} + +/* + * Write the bytes to a file. + * + * IN: vp - vnode of file to be written to. + * uio - structure supplying write location, range info, + * and data buffer. + * ioflag - FAPPEND flag set if in append mode. + * cr - credentials of caller. + * + * OUT: uio - updated offset and range. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * vp - ctime|mtime updated if byte count > 0 + * + * Note: zfs_write() holds z_append_lock across calls to txg_wait_open(). + * It has to because of the semantics of FAPPEND. The implication is that + * we must never grab z_append_lock while in an assigned tx. + */ +/* ARGSUSED */ +static int +zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + rlim64_t limit = uio->uio_llimit; + ssize_t start_resid = uio->uio_resid; + ssize_t tx_bytes; + uint64_t end_size; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + uint64_t seq = 0; + offset_t woff; + ssize_t n, nbytes; + int max_blksz = zfsvfs->z_max_blksz; + int need_append_lock, error; + krw_t grow_rw = RW_READER; + + if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) + limit = MAXOFFSET_T; + + n = start_resid; + + /* + * Fasttrack empty write + */ + if (n == 0) + return (0); + + ZFS_ENTER(zfsvfs); + + /* + * Pre-fault the pages to ensure slow (eg NFS) pages don't hold up txg + */ + zfs_prefault_write(MIN(start_resid, SPA_MAXBLOCKSIZE), uio); + + /* + * If in append mode, set the io offset pointer to eof. 
+ */ + need_append_lock = ioflag & FAPPEND; + if (need_append_lock) { + rw_enter(&zp->z_append_lock, RW_WRITER); + woff = uio->uio_loffset = zp->z_phys->zp_size; + } else { + woff = uio->uio_loffset; + /* + * Validate file offset + */ + if (woff < 0) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* + * If this write could change the file length, + * we need to synchronize with "appenders". + */ + if (woff < limit - n && woff + n > zp->z_phys->zp_size) { + need_append_lock = TRUE; + rw_enter(&zp->z_append_lock, RW_READER); + } + } + + if (woff >= limit) { + error = EFBIG; + goto no_tx_done; + } + + if ((woff + n) > limit || woff > (limit - n)) + n = limit - woff; + + /* + * Check for region locks + */ + if (MANDMODE((mode_t)zp->z_phys->zp_mode) && + (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) + goto no_tx_done; +top: + /* + * Make sure nobody restructures the file (changes block size) + * in the middle of the write. + */ + rw_enter(&zp->z_grow_lock, grow_rw); + + end_size = MAX(zp->z_phys->zp_size, woff + n); + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + dmu_tx_abort(tx); + rw_exit(&zp->z_grow_lock); + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0); + goto top; + } + goto no_tx_done; + } + + if (end_size > zp->z_blksz && + (!ISP2(zp->z_blksz) || zp->z_blksz < max_blksz)) { + uint64_t new_blksz; + /* + * This write will increase the file size beyond + * the current block size so increase the block size. 
+ */ + if (grow_rw == RW_READER && !rw_tryupgrade(&zp->z_grow_lock)) { + dmu_tx_commit(tx); + rw_exit(&zp->z_grow_lock); + grow_rw = RW_WRITER; + goto top; + } + if (zp->z_blksz > max_blksz) { + ASSERT(!ISP2(zp->z_blksz)); + new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE); + } else { + new_blksz = MIN(end_size, max_blksz); + } + error = zfs_grow_blocksize(zp, new_blksz, tx); + if (error) { + tx_bytes = 0; + goto tx_done; + } + } + + if (grow_rw == RW_WRITER) { + rw_downgrade(&zp->z_grow_lock); + grow_rw = RW_READER; + } + + /* + * The file data does not fit in the znode "cache", so we + * will be writing to the file block data buffers. + * Each buffer will be written in a separate transaction; + * this keeps the intent log records small and allows us + * to do more fine-grained space accounting. + */ + while (n > 0) { + /* + * XXX - should we really limit each write to z_max_blksz? + * Perhaps we should use SPA_MAXBLOCKSIZE chunks? + */ + nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); + rw_enter(&zp->z_map_lock, RW_READER); + + tx_bytes = uio->uio_resid; + if (vn_has_cached_data(vp)) { + rw_exit(&zp->z_map_lock); + error = mappedwrite(vp, woff, nbytes, uio, tx); + } else { + error = dmu_write_uio(zfsvfs->z_os, zp->z_id, + woff, nbytes, uio, tx); + rw_exit(&zp->z_map_lock); + } + tx_bytes -= uio->uio_resid; + + if (error) { + /* XXX - do we need to "clean up" the dmu buffer? */ + break; + } + + ASSERT(tx_bytes == nbytes); + + n -= nbytes; + if (n <= 0) + break; + + /* + * We have more work ahead of us, so wrap up this transaction + * and start another. Exact same logic as tx_done below. 
+ */ + while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) { + dmu_buf_will_dirty(zp->z_dbuf, tx); + (void) atomic_cas_64(&zp->z_phys->zp_size, end_size, + uio->uio_loffset); + } + zfs_time_stamper(zp, CONTENT_MODIFIED, tx); + seq = zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, + ioflag, uio); + dmu_tx_commit(tx); + + /* Pre-fault the next set of pages */ + zfs_prefault_write(MIN(n, SPA_MAXBLOCKSIZE), uio); + + /* + * Start another transaction. + */ + woff = uio->uio_loffset; + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + dmu_tx_abort(tx); + rw_exit(&zp->z_grow_lock); + if (error == ERESTART && + zfsvfs->z_assign == TXG_NOWAIT) { + txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0); + goto top; + } + goto no_tx_done; + } + } + +tx_done: + + if (tx_bytes != 0) { + /* + * Update the file size if it has changed; account + * for possible concurrent updates. + */ + while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) { + dmu_buf_will_dirty(zp->z_dbuf, tx); + (void) atomic_cas_64(&zp->z_phys->zp_size, end_size, + uio->uio_loffset); + } + zfs_time_stamper(zp, CONTENT_MODIFIED, tx); + seq = zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, + ioflag, uio); + } + dmu_tx_commit(tx); + + rw_exit(&zp->z_grow_lock); + +no_tx_done: + + if (need_append_lock) + rw_exit(&zp->z_append_lock); + + /* + * If we're in replay mode, or we made no progress, return error. + * Otherwise, it's at least a partial write, so it's successful. + */ + if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) { + ZFS_EXIT(zfsvfs); + return (error); + } + + zil_commit(zilog, seq, ioflag & (FSYNC | FDSYNC)); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Get data to generate a TX_WRITE intent log record. 
+ */ +int +zfs_get_data(void *arg, lr_write_t *lr) +{ + zfsvfs_t *zfsvfs = arg; + objset_t *os = zfsvfs->z_os; + znode_t *zp; + uint64_t off = lr->lr_offset; + int dlen = lr->lr_length; /* length of user data */ + int reclen = lr->lr_common.lrc_reclen; + int error = 0; + + ASSERT(dlen != 0); + + /* + * Nothing to do if the file has been removed or truncated. + */ + if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0) + return (ENOENT); + if (off >= zp->z_phys->zp_size || zp->z_reap) { + VN_RELE(ZTOV(zp)); + return (ENOENT); + } + + /* + * Write records come in two flavors: immediate and indirect. + * For small writes it's cheaper to store the data with the + * log record (immediate); for large writes it's cheaper to + * sync the data and get a pointer to it (indirect) so that + * we don't have to write the data twice. + */ + if (sizeof (lr_write_t) + dlen <= reclen) { /* immediate write */ + rw_enter(&zp->z_grow_lock, RW_READER); + dmu_buf_t *db = dmu_buf_hold(os, lr->lr_foid, off); + dmu_buf_read(db); + bcopy((char *)db->db_data + off - db->db_offset, lr + 1, dlen); + dmu_buf_rele(db); + rw_exit(&zp->z_grow_lock); + } else { + /* + * We have to grab z_grow_lock as RW_WRITER because + * dmu_sync() can't handle concurrent dbuf_dirty() (6313856). + * z_grow_lock will be replaced with a range lock soon, + * which will eliminate the concurrency hit, but dmu_sync() + * really needs more thought. It shouldn't have to rely on + * the caller to provide MT safety. 
+ */ + rw_enter(&zp->z_grow_lock, RW_WRITER); + txg_suspend(dmu_objset_pool(os)); + error = dmu_sync(os, lr->lr_foid, off, &lr->lr_blkoff, + &lr->lr_blkptr, lr->lr_common.lrc_txg); + txg_resume(dmu_objset_pool(os)); + rw_exit(&zp->z_grow_lock); + } + VN_RELE(ZTOV(zp)); + return (error); +} + +/*ARGSUSED*/ +static int +zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + ZFS_ENTER(zfsvfs); + error = zfs_zaccess_rwx(zp, mode, cr); + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Lookup an entry in a directory, or an extended attribute directory. + * If it exists, return a held vnode reference for it. + * + * IN: dvp - vnode of directory to search. + * nm - name of entry to lookup. + * pnp - full pathname to lookup [UNUSED]. + * flags - LOOKUP_XATTR set if looking for an attribute. + * rdir - root directory vnode [UNUSED]. + * cr - credentials of caller. + * + * OUT: vpp - vnode of located entry, NULL if not found. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * NA + */ +/* ARGSUSED */ +static int +zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, + int flags, vnode_t *rdir, cred_t *cr) +{ + + znode_t *zdp = VTOZ(dvp); + zfsvfs_t *zfsvfs = zdp->z_zfsvfs; + int error; + + ZFS_ENTER(zfsvfs); + + *vpp = NULL; + + if (flags & LOOKUP_XATTR) { + /* + * We don't allow recursive attributes.. + * Maybe someday we will. + */ + if (zdp->z_phys->zp_flags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Do we have permission to get into attribute directory? + */ + + if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, cr)) { + VN_RELE(*vpp); + } + + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Check accessibility of directory. 
+ */ + + if (error = zfs_zaccess(zdp, ACE_EXECUTE, cr)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if ((error = zfs_dirlook(zdp, nm, vpp)) == 0) { + + /* + * Convert device special files + */ + if (IS_DEVVP(*vpp)) { + vnode_t *svp; + + svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); + VN_RELE(*vpp); + if (svp == NULL) + error = ENOSYS; + else + *vpp = svp; + } + } + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Attempt to create a new entry in a directory. If the entry + * already exists, truncate the file if permissible, else return + * an error. Return the vp of the created or trunc'd file. + * + * IN: dvp - vnode of directory to put new file entry in. + * name - name of new file entry. + * vap - attributes of new file. + * excl - flag indicating exclusive or non-exclusive mode. + * mode - mode to open file with. + * cr - credentials of caller. + * flag - large file flag [UNUSED]. + * + * OUT: vpp - vnode of created or trunc'd entry. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dvp - ctime|mtime updated if new entry created + * vp - ctime|mtime always, atime if new + */ +/* ARGSUSED */ +static int +zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl, + int mode, vnode_t **vpp, cred_t *cr, int flag) +{ + znode_t *zp, *dzp = VTOZ(dvp); + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + uint64_t seq = 0; + objset_t *os = zfsvfs->z_os; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int error; + uint64_t zoid; + + ZFS_ENTER(zfsvfs); + +top: + *vpp = NULL; + + if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr)) + vap->va_mode &= ~VSVTX; + + if (*name == '\0') { + /* + * Null component name refers to the directory itself. 
+ */ + VN_HOLD(dvp); + zp = dzp; + dl = NULL; + error = 0; + } else { + /* possible VN_HOLD(zp) */ + if (error = zfs_dirent_lock(&dl, dzp, name, &zp, 0)) { + if (strcmp(name, "..") == 0) + error = EISDIR; + ZFS_EXIT(zfsvfs); + return (error); + } + } + + zoid = zp ? zp->z_id : -1ULL; + + if (zp == NULL) { + /* + * Create a new file object and update the directory + * to reference it. + */ + if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { + goto out; + } + + /* + * We only support the creation of regular files in + * extended attribute directories. + */ + if ((dzp->z_phys->zp_flags & ZFS_XATTR) && + (vap->va_type != VREG)) { + error = EINVAL; + goto out; + } + + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_bonus(tx, dzp->z_id); + dmu_tx_hold_zap(tx, dzp->z_id, 1); + if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, SPA_MAXBLOCKSIZE); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + dmu_tx_abort(tx); + zfs_dirent_unlock(dl); + if (error == ERESTART && + zfsvfs->z_assign == TXG_NOWAIT) { + txg_wait_open(dmu_objset_pool(os), 0); + goto top; + } + ZFS_EXIT(zfsvfs); + return (error); + } + zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); + ASSERT(zp->z_id == zoid); + (void) zfs_link_create(dl, zp, tx, ZNEW); + seq = zfs_log_create(zilog, tx, TX_CREATE, dzp, zp, name); + dmu_tx_commit(tx); + } else { + /* + * A directory entry already exists for this name. + */ + /* + * Can't truncate an existing file if in exclusive mode. + */ + if (excl == EXCL) { + error = EEXIST; + goto out; + } + /* + * Can't open a directory for writing. + */ + if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) { + error = EISDIR; + goto out; + } + /* + * Verify requested access to file. + */ + if (mode && (error = zfs_zaccess_rwx(zp, mode, cr))) { + goto out; + } + /* + * Truncate regular files if requested. + */ + + /* + * Need to update dzp->z_seq? 
+ */ + + mutex_enter(&dzp->z_lock); + dzp->z_seq++; + mutex_exit(&dzp->z_lock); + + if ((ZTOV(zp)->v_type == VREG) && (zp->z_phys->zp_size != 0) && + (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { + /* + * Truncate the file. + */ + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, zoid); + dmu_tx_hold_free(tx, zoid, 0, DMU_OBJECT_END); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + dmu_tx_abort(tx); + if (dl) + zfs_dirent_unlock(dl); + VN_RELE(ZTOV(zp)); + if (error == ERESTART && + zfsvfs->z_assign == TXG_NOWAIT) { + txg_wait_open(dmu_objset_pool(os), 0); + goto top; + } + ZFS_EXIT(zfsvfs); + return (error); + } + /* + * Grab the grow_lock to serialize this change with + * respect to other file manipulations. + */ + rw_enter(&zp->z_grow_lock, RW_WRITER); + error = zfs_freesp(zp, 0, 0, mode, tx, cr); + if (error == 0) { + zfs_time_stamper(zp, CONTENT_MODIFIED, tx); + seq = zfs_log_truncate(zilog, tx, + TX_TRUNCATE, zp, 0, 0); + } + rw_exit(&zp->z_grow_lock); + dmu_tx_commit(tx); + } + } +out: + + if (dl) + zfs_dirent_unlock(dl); + + if (error) { + if (zp) + VN_RELE(ZTOV(zp)); + } else { + *vpp = ZTOV(zp); + /* + * If vnode is for a device return a specfs vnode instead. + */ + if (IS_DEVVP(*vpp)) { + struct vnode *svp; + + svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); + VN_RELE(*vpp); + if (svp == NULL) { + error = ENOSYS; + } + *vpp = svp; + } + } + + zil_commit(zilog, seq, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Remove an entry from a directory. + * + * IN: dvp - vnode of directory to remove entry from. + * name - name of entry to remove. + * cr - credentials of caller. 
+ * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dvp - ctime|mtime + * vp - ctime (if nlink > 0) + */ +static int +zfs_remove(vnode_t *dvp, char *name, cred_t *cr) +{ + znode_t *zp, *dzp = VTOZ(dvp); + znode_t *xzp = NULL; + vnode_t *vp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + uint64_t seq = 0; + uint64_t acl_obj, xattr_obj; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int may_delete_now, delete_now = FALSE; + int reaped; + int error; + + ZFS_ENTER(zfsvfs); + +top: + /* + * Attempt to lock directory; fail if entry doesn't exist. + */ + if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + vp = ZTOV(zp); + + if (error = zfs_zaccess_delete(dzp, zp, cr)) { + goto out; + } + + /* + * Check the restrictions that apply on sticky directories. + */ + if (error = zfs_sticky_remove_access(dzp, zp, cr)) + goto out; + + /* + * Need to use rmdir for removing directories. + */ + if (vp->v_type == VDIR) { + error = EPERM; + goto out; + } + + vnevent_remove(vp); + + mutex_enter(&vp->v_lock); + may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp); + mutex_exit(&vp->v_lock); + + /* + * We may delete the znode now, or we may put it on the delete queue; + * it depends on whether we're the last link, and on whether there are + * other holds on the vnode. So we dmu_tx_hold() the right things to + * allow for either case. + */ + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, -1); + dmu_tx_hold_bonus(tx, zp->z_id); + if (may_delete_now) + dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); + + /* are there any extended attributes? */ + if ((xattr_obj = zp->z_phys->zp_xattr) != 0) { + /* + * XXX - There is a possibility that the delete + * of the parent file could succeed, but then we get + * an ENOSPC when we try to delete the xattrs... + * so we would need to re-try the deletes periodically + */ + /* XXX - do we need this if we are deleting? 
*/ + dmu_tx_hold_bonus(tx, xattr_obj); + } + + /* are there any additional acls */ + if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 && + may_delete_now) + dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + + /* charge as an update -- would be nice not to charge at all */ + dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, -1); + + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + dmu_tx_abort(tx); + zfs_dirent_unlock(dl); + VN_RELE(vp); + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0); + goto top; + } + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Remove the directory entry. + */ + error = zfs_link_destroy(dl, zp, tx, 0, &reaped); + + if (error) { + dmu_tx_commit(tx); + goto out; + } + + if (reaped) { + mutex_enter(&vp->v_lock); + delete_now = may_delete_now && + vp->v_count == 1 && !vn_has_cached_data(vp) && + zp->z_phys->zp_xattr == xattr_obj && + zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj; + mutex_exit(&vp->v_lock); + } + + if (delete_now) { + if (zp->z_phys->zp_xattr) { + error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); + ASSERT3U(error, ==, 0); + ASSERT3U(xzp->z_phys->zp_links, ==, 2); + dmu_buf_will_dirty(xzp->z_dbuf, tx); + mutex_enter(&xzp->z_lock); + xzp->z_reap = 1; + xzp->z_phys->zp_links = 0; + mutex_exit(&xzp->z_lock); + zfs_dq_add(xzp, tx); + zp->z_phys->zp_xattr = 0; /* probably unnecessary */ + } + mutex_enter(&zp->z_lock); + mutex_enter(&vp->v_lock); + vp->v_count--; + ASSERT3U(vp->v_count, ==, 0); + mutex_exit(&vp->v_lock); + zp->z_active = 0; + mutex_exit(&zp->z_lock); + zfs_znode_delete(zp, tx); + VFS_RELE(zfsvfs->z_vfs); + } else if (reaped) { + zfs_dq_add(zp, tx); + } + + seq = zfs_log_remove(zilog, tx, TX_REMOVE, dzp, name); + + dmu_tx_commit(tx); +out: + zfs_dirent_unlock(dl); + + if (!delete_now) { + VN_RELE(vp); + } else if (xzp) { + /* this rele delayed to prevent nesting transactions */ + VN_RELE(ZTOV(xzp)); + } + + zil_commit(zilog, seq, 0); + + 
ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Create a new directory and insert it into dvp using the name + * provided. Return a pointer to the inserted directory. + * + * IN: dvp - vnode of directory to add subdir to. + * dirname - name of new directory. + * vap - attributes of new directory. + * cr - credentials of caller. + * + * OUT: vpp - vnode of created directory. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dvp - ctime|mtime updated + * vp - ctime|mtime|atime updated + */ +static int +zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr) +{ + znode_t *zp, *dzp = VTOZ(dvp); + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + uint64_t seq = 0; + zfs_dirlock_t *dl; + uint64_t zoid = 0; + dmu_tx_t *tx; + int error; + + ASSERT(vap->va_type == VDIR); + + ZFS_ENTER(zfsvfs); + + if (dzp->z_phys->zp_flags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } +top: + *vpp = NULL; + if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, cr)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * First make sure the new directory doesn't exist. + */ + if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, ZNEW)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Add a new entry to the directory. + */ + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, 1); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 0); + if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, SPA_MAXBLOCKSIZE); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + dmu_tx_abort(tx); + zfs_dirent_unlock(dl); + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0); + goto top; + } + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Create new node. + */ + zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); + + /* + * Now put new name in parent dir. 
+ */ + (void) zfs_link_create(dl, zp, tx, ZNEW); + + *vpp = ZTOV(zp); + + seq = zfs_log_create(zilog, tx, TX_MKDIR, dzp, zp, dirname); + dmu_tx_commit(tx); + + zfs_dirent_unlock(dl); + + zil_commit(zilog, seq, 0); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Remove a directory subdir entry. If the current working + * directory is the same as the subdir to be removed, the + * remove will fail. + * + * IN: dvp - vnode of directory to remove from. + * name - name of directory to be removed. + * cwd - vnode of current working directory. + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dvp - ctime|mtime updated + */ +static int +zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr) +{ + znode_t *dzp = VTOZ(dvp); + znode_t *zp; + vnode_t *vp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + uint64_t seq = 0; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int error; + + ZFS_ENTER(zfsvfs); + +top: + zp = NULL; + + /* + * Attempt to lock directory; fail if entry doesn't exist. + */ + if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + vp = ZTOV(zp); + + if (error = zfs_zaccess_delete(dzp, zp, cr)) { + goto out; + } + + /* + * Check the restrictions that apply on sticky directories. + */ + if (error = zfs_sticky_remove_access(dzp, zp, cr)) + goto out; + + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + + if (vp == cwd) { + error = EINVAL; + goto out; + } + + vnevent_rmdir(vp); + + /* + * Grab a lock on the parent pointer make sure we play well + * with the treewalk and directory rename code. 
+ */ + rw_enter(&zp->z_parent_lock, RW_WRITER); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, 1); + dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + dmu_tx_abort(tx); + rw_exit(&zp->z_parent_lock); + zfs_dirent_unlock(dl); + VN_RELE(vp); + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0); + goto top; + } + ZFS_EXIT(zfsvfs); + return (error); + } + + error = zfs_link_destroy(dl, zp, tx, 0, NULL); + + if (error == 0) + seq = zfs_log_remove(zilog, tx, TX_RMDIR, dzp, name); + + dmu_tx_commit(tx); + + rw_exit(&zp->z_parent_lock); +out: + zfs_dirent_unlock(dl); + + VN_RELE(vp); + + zil_commit(zilog, seq, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Read as many directory entries as will fit into the provided + * buffer from the given directory cursor position (specified in + * the uio structure. + * + * IN: vp - vnode of directory to read. + * uio - structure supplying read location, range info, + * and return buffer. + * cr - credentials of caller. + * + * OUT: uio - updated offset and range, buffer filled. + * eofp - set to true if end-of-file detected. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * vp - atime updated + * + * Note that the low 4 bits of the cookie returned by zap is always zero. + * This allows us to use the low range for "special" directory entries: + * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, + * we use the offset 2 for the '.zfs' directory. 
+ */ +/* ARGSUSED */ +static int +zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp) +{ + znode_t *zp = VTOZ(vp); + iovec_t *iovp; + dirent64_t *odp; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + caddr_t outbuf; + size_t bufsize; + zap_cursor_t zc; + zap_attribute_t zap; + uint_t bytes_wanted; + ushort_t this_reclen; + uint64_t offset; /* must be unsigned; checks for < 1 */ + off64_t *next; + int local_eof; + int outcount = 0; + int error = 0; + + ZFS_ENTER(zfsvfs); + + /* + * If we are not given an eof variable, + * use a local one. + */ + if (eofp == NULL) + eofp = &local_eof; + + /* + * Check for valid iov_len. + */ + if (uio->uio_iov->iov_len <= 0) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* + * Quit if directory has been removed (posix) + */ + if ((*eofp = zp->z_reap) != 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + + /* + * Initialize the iterator cursor. + */ + offset = uio->uio_loffset; + if (offset <= 3) { + /* + * Start iteration from the beginning of the directory. + */ + zap_cursor_init(&zc, zfsvfs->z_os, zp->z_id); + } else { + /* + * The offset is a serialized cursor. + */ + zap_cursor_init_serialized(&zc, zfsvfs->z_os, zp->z_id, + offset); + } + + /* + * Get space to change directory entries into fs independent format. + */ + iovp = uio->uio_iov; + bytes_wanted = iovp->iov_len; + if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { + bufsize = bytes_wanted; + outbuf = kmem_alloc(bufsize, KM_SLEEP); + odp = (struct dirent64 *)outbuf; + } else { + bufsize = bytes_wanted; + odp = (struct dirent64 *)iovp->iov_base; + } + + /* + * Transform to file-system independent format + */ + outcount = 0; + while (outcount < bytes_wanted) { + /* + * Special case `.', `..', and `.zfs'. 
+ */ + if (offset == 0) { + (void) strcpy(zap.za_name, "."); + zap.za_first_integer = zp->z_id; + this_reclen = DIRENT64_RECLEN(1); + } else if (offset == 1) { + (void) strcpy(zap.za_name, ".."); + zap.za_first_integer = zp->z_phys->zp_parent; + this_reclen = DIRENT64_RECLEN(2); + } else if (offset == 2 && zfs_show_ctldir(zp)) { + (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); + zap.za_first_integer = ZFSCTL_INO_ROOT; + this_reclen = + DIRENT64_RECLEN(sizeof (ZFS_CTLDIR_NAME) - 1); + } else { + /* + * Grab next entry. + */ + if (error = zap_cursor_retrieve(&zc, &zap)) { + if ((*eofp = (error == ENOENT)) != 0) + break; + else + goto update; + } + + if (zap.za_integer_length != 8 || + zap.za_num_integers != 1) { + cmn_err(CE_WARN, "zap_readdir: bad directory " + "entry, obj = %lld, offset = %lld\n", + (u_longlong_t)zp->z_id, + (u_longlong_t)offset); + error = ENXIO; + goto update; + } + this_reclen = DIRENT64_RECLEN(strlen(zap.za_name)); + } + + /* + * Will this entry fit in the buffer? + */ + if (outcount + this_reclen > bufsize) { + /* + * Did we manage to fit anything in the buffer? + */ + if (!outcount) { + error = EINVAL; + goto update; + } + break; + } + /* + * Add this entry: + */ + odp->d_ino = (ino64_t)zap.za_first_integer; + odp->d_reclen = (ushort_t)this_reclen; + /* NOTE: d_off is the offset for the *next* entry */ + next = &(odp->d_off); + (void) strncpy(odp->d_name, zap.za_name, + DIRENT64_NAMELEN(this_reclen)); + outcount += this_reclen; + odp = (dirent64_t *)((intptr_t)odp + this_reclen); + + ASSERT(outcount <= bufsize); + + /* Prefetch znode */ + dmu_prefetch(zfsvfs->z_os, zap.za_first_integer, 0, 0); + + /* + * Move to the next entry, fill in the previous offset. 
+ */ + if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { + zap_cursor_advance(&zc); + offset = zap_cursor_serialize(&zc); + } else { + offset += 1; + } + *next = offset; + } + + if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { + iovp->iov_base += outcount; + iovp->iov_len -= outcount; + uio->uio_resid -= outcount; + } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { + /* + * Reset the pointer. + */ + offset = uio->uio_loffset; + } + +update: + if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) + kmem_free(outbuf, bufsize); + + if (error == ENOENT) + error = 0; + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + + uio->uio_loffset = offset; + ZFS_EXIT(zfsvfs); + return (error); +} + +/* ARGSUSED */ +static int +zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ZFS_ENTER(zfsvfs); + zil_commit(zfsvfs->z_log, zp->z_last_itx, FSYNC); + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Get the requested file attributes and place them in the provided + * vattr structure. + * + * IN: vp - vnode of file. + * vap - va_mask identifies requested attributes. + * flags - [UNUSED] + * cr - credentials of caller. + * + * OUT: vap - attribute values. + * + * RETURN: 0 (always succeeds) + */ +/* ARGSUSED */ +static int +zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + znode_phys_t *pzp = zp->z_phys; + int error; + + ZFS_ENTER(zfsvfs); + + /* + * Return all attributes. It's cheaper to provide the answer + * than to determine whether we were asked the question. + */ + mutex_enter(&zp->z_lock); + + vap->va_type = vp->v_type; + vap->va_mode = pzp->zp_mode & MODEMASK; + vap->va_uid = zp->z_phys->zp_uid; + vap->va_gid = zp->z_phys->zp_gid; + vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; + vap->va_nodeid = zp->z_id; + vap->va_nlink = MIN(pzp->zp_links, UINT32_MAX); /* nlink_t limit! 
*/ + vap->va_size = pzp->zp_size; + vap->va_rdev = pzp->zp_rdev; + vap->va_seq = zp->z_seq; + + ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime); + ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime); + ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime); + + /* + * Owner should be allowed to always read_attributes + */ + if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, cr)) { + if (zp->z_phys->zp_uid != crgetuid(cr)) { + mutex_exit(&zp->z_lock); + ZFS_EXIT(zfsvfs); + return (error); + } + } + + mutex_exit(&zp->z_lock); + + dmu_object_size_from_db(zp->z_dbuf, &vap->va_blksize, &vap->va_nblocks); + + if (zp->z_blksz == 0) { + /* + * Block size hasn't been set; suggest maximal I/O transfers. + */ + vap->va_blksize = zfsvfs->z_max_blksz; + } + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Set the file attributes to the values contained in the + * vattr structure. + * + * IN: vp - vnode of file to be modified. + * vap - new attribute values. + * flags - ATTR_UTIME set if non-default time values provided. + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * vp - ctime updated, mtime updated if size changed. 
+ */ +/* ARGSUSED */ +static int +zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + struct znode *zp = VTOZ(vp); + znode_phys_t *pzp = zp->z_phys; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + uint64_t seq = 0; + dmu_tx_t *tx; + uint_t mask = vap->va_mask; + uint_t mask_applied = 0; + vattr_t oldva; + uint64_t new_mode; + int have_grow_lock; + int need_policy = FALSE; + int err; + + if (mask == 0) + return (0); + + if (mask & AT_NOSET) + return (EINVAL); + + if (mask & AT_SIZE && vp->v_type == VDIR) + return (EISDIR); + + ZFS_ENTER(zfsvfs); + +top: + have_grow_lock = FALSE; + + if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { + ZFS_EXIT(zfsvfs); + return (EROFS); + } + + /* + * First validate permissions + */ + + if (mask & AT_SIZE) { + err = zfs_zaccess(zp, ACE_WRITE_DATA, cr); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + } + + if (mask & (AT_ATIME|AT_MTIME)) + need_policy = zfs_zaccess_v4_perm(zp, ACE_WRITE_ATTRIBUTES, cr); + + if (mask & (AT_UID|AT_GID)) { + int idmask = (mask & (AT_UID|AT_GID)); + int take_owner; + int take_group; + + /* + * Take ownership or chgrp to group we are a member of + */ + + take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); + take_group = (mask & AT_GID) && groupmember(vap->va_gid, cr); + + /* + * If both AT_UID and AT_GID are set then take_owner and + * take_group must both be set in order to allow taking + * ownership. 
+ * + * Otherwise, send the check through secpolicy_vnode_setattr() + * + */ + + if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || + ((idmask == AT_UID) && take_owner) || + ((idmask == AT_GID) && take_group)) { + if (zfs_zaccess_v4_perm(zp, ACE_WRITE_OWNER, cr) == 0) { + /* + * Remove setuid/setgid for non-privileged users + */ + if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 && + secpolicy_vnode_setid_retain(cr, + (vap->va_mode & S_ISUID) != 0 && + (mask & AT_UID) != 0 && + vap->va_uid == 0) != 0) { + vap->va_mode = pzp->zp_mode; + vap->va_mask |= AT_MODE; + vap->va_mode &= ~(S_ISUID|S_ISGID); + } + } else { + need_policy = TRUE; + } + } else { + need_policy = TRUE; + } + } + + if (mask & AT_MODE) + need_policy = TRUE; + + if (need_policy) { + mutex_enter(&zp->z_lock); + oldva.va_mode = pzp->zp_mode; + oldva.va_uid = zp->z_phys->zp_uid; + oldva.va_gid = zp->z_phys->zp_gid; + mutex_exit(&zp->z_lock); + err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, + (int (*)(void *, int, cred_t *))zfs_zaccess_rwx, zp); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + } + + /* + * secpolicy_vnode_setattr, or take ownership may have + * changed va_mask + */ + mask = vap->va_mask; + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, zp->z_id); + + if (mask & AT_MODE) { + + new_mode = (pzp->zp_mode & S_IFMT) | (vap->va_mode & ~S_IFMT); + + if (zp->z_phys->zp_acl.z_acl_extern_obj) + dmu_tx_hold_write(tx, + pzp->zp_acl.z_acl_extern_obj, 0, SPA_MAXBLOCKSIZE); + else + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, ZFS_ACL_SIZE(MAX_ACL_SIZE)); + } + + if (mask & AT_SIZE) { + uint64_t off = vap->va_size; + /* + * Grab the grow_lock to serialize this change with + * respect to other file manipulations. 
+ */ + rw_enter(&zp->z_grow_lock, RW_WRITER); + have_grow_lock = TRUE; + if (off < zp->z_phys->zp_size) + dmu_tx_hold_free(tx, zp->z_id, off, DMU_OBJECT_END); + else if (zp->z_phys->zp_size && + zp->z_blksz < zfsvfs->z_max_blksz && off > zp->z_blksz) + /* we will rewrite this block if we grow */ + dmu_tx_hold_write(tx, zp->z_id, 0, zp->z_phys->zp_size); + } + + err = dmu_tx_assign(tx, zfsvfs->z_assign); + if (err) { + dmu_tx_abort(tx); + if (have_grow_lock) + rw_exit(&zp->z_grow_lock); + if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0); + goto top; + } + ZFS_EXIT(zfsvfs); + return (err); + } + + dmu_buf_will_dirty(zp->z_dbuf, tx); + + /* + * Set each attribute requested. + * We group settings according to the locks they need to acquire. + * + * Note: you cannot set ctime directly, although it will be + * updated as a side-effect of calling this function. + */ + if (mask & AT_SIZE) { + /* + * XXX - Note, we are not providing any open + * mode flags here (like FNDELAY), so we may + * block if there are locks present... this + * should be addressed in openat(). 
+ */ + err = zfs_freesp(zp, vap->va_size, 0, 0, tx, cr); + if (err) { + mutex_enter(&zp->z_lock); + goto out; + } + mask_applied |= AT_SIZE; + } + + mask_applied = mask; /* no errors after this point */ + + mutex_enter(&zp->z_lock); + + if (mask & AT_MODE) { + err = zfs_acl_chmod_setattr(zp, new_mode, tx); + ASSERT3U(err, ==, 0); + } + + if ((mask & AT_UID) && vap->va_uid != oldva.va_uid) + zp->z_phys->zp_uid = (uint64_t)vap->va_uid; + + if ((mask & AT_GID) && vap->va_gid != oldva.va_gid) + zp->z_phys->zp_gid = (uint64_t)vap->va_gid; + + if (mask & AT_ATIME) + ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); + + if (mask & AT_MTIME) + ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); + + if (mask_applied & AT_SIZE) + zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx); + else if (mask_applied != 0) + zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + +out: + if (mask_applied != 0) + seq = zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, + mask_applied); + + mutex_exit(&zp->z_lock); + + if (have_grow_lock) + rw_exit(&zp->z_grow_lock); + + dmu_tx_commit(tx); + + zil_commit(zilog, seq, 0); + + ZFS_EXIT(zfsvfs); + return (err); +} + +/* + * Search back through the directory tree, using the ".." entries. + * Lock each directory in the chain to prevent concurrent renames. + * Fail any attempt to move a directory into one of its own descendants. + * XXX - z_parent_lock can overlap with map or grow locks + */ +typedef struct zfs_zlock { + krwlock_t *zl_rwlock; /* lock we acquired */ + znode_t *zl_znode; /* znode we held */ + struct zfs_zlock *zl_next; /* next in list */ +} zfs_zlock_t; + +static int +zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) +{ + zfs_zlock_t *zl; + znode_t *zp = tdzp; + uint64_t rootid = zp->z_zfsvfs->z_root; + uint64_t *oidp = &zp->z_id; + krwlock_t *rwlp = &szp->z_parent_lock; + krw_t rw = RW_WRITER; + + /* + * First pass write-locks szp and compares to zp->z_id. + * Later passes read-lock zp and compare to zp->z_parent. 
+ */ + do { + zl = kmem_alloc(sizeof (*zl), KM_SLEEP); + zl->zl_rwlock = rwlp; + zl->zl_znode = NULL; + zl->zl_next = *zlpp; + *zlpp = zl; + + rw_enter(rwlp, rw); + + if (*oidp == szp->z_id) /* We're a descendant of szp */ + return (EINVAL); + + if (*oidp == rootid) /* We've hit the top */ + return (0); + + if (rw == RW_READER) { /* i.e. not the first pass */ + int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp); + if (error) + return (error); + zl->zl_znode = zp; + } + oidp = &zp->z_phys->zp_parent; + rwlp = &zp->z_parent_lock; + rw = RW_READER; + + } while (zp->z_id != sdzp->z_id); + + return (0); +} + +/* + * Drop locks and release vnodes that were held by zfs_rename_lock(). + */ +static void +zfs_rename_unlock(zfs_zlock_t **zlpp) +{ + zfs_zlock_t *zl; + + while ((zl = *zlpp) != NULL) { + if (zl->zl_znode != NULL) + VN_RELE(ZTOV(zl->zl_znode)); + rw_exit(zl->zl_rwlock); + *zlpp = zl->zl_next; + kmem_free(zl, sizeof (*zl)); + } +} + +/* + * Move an entry from the provided source directory to the target + * directory. Change the entry name as indicated. + * + * IN: sdvp - Source directory containing the "old entry". + * snm - Old entry name. + * tdvp - Target directory to contain the "new entry". + * tnm - New entry name. + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * sdvp,tdvp - ctime|mtime updated + */ +static int +zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr) +{ + znode_t *tdzp, *szp, *tzp; + znode_t *sdzp = VTOZ(sdvp); + zfsvfs_t *zfsvfs = sdzp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + uint64_t seq = 0; + vnode_t *realvp; + zfs_dirlock_t *sdl, *tdl; + dmu_tx_t *tx; + zfs_zlock_t *zl; + int cmp, serr, terr, error; + + ZFS_ENTER(zfsvfs); + + /* + * Make sure we have the real vp for the target directory. 
+ */ + if (VOP_REALVP(tdvp, &realvp) == 0) + tdvp = realvp; + + if (tdvp->v_vfsp != sdvp->v_vfsp) { + ZFS_EXIT(zfsvfs); + return (EXDEV); + } + + tdzp = VTOZ(tdvp); +top: + szp = NULL; + tzp = NULL; + zl = NULL; + + /* + * This is to prevent the creation of links into attribute space + * by renaming a linked file into/outof an attribute directory. + * See the comment in zfs_link() for why this is considered bad. + */ + if ((tdzp->z_phys->zp_flags & ZFS_XATTR) != + (sdzp->z_phys->zp_flags & ZFS_XATTR)) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* + * Lock source and target directory entries. To prevent deadlock, + * a lock ordering must be defined. We lock the directory with + * the smallest object id first, or if it's a tie, the one with + * the lexically first name. + */ + if (sdzp->z_id < tdzp->z_id) { + cmp = -1; + } else if (sdzp->z_id > tdzp->z_id) { + cmp = 1; + } else { + cmp = strcmp(snm, tnm); + if (cmp == 0) { + /* + * POSIX: "If the old argument and the new argument + * both refer to links to the same existing file, + * the rename() function shall return successfully + * and perform no other action." + */ + ZFS_EXIT(zfsvfs); + return (0); + } + } + if (cmp < 0) { + serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS); + terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0); + } else { + terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0); + serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS); + } + + if (serr) { + /* + * Source entry invalid or not there. + */ + if (!terr) { + zfs_dirent_unlock(tdl); + if (tzp) + VN_RELE(ZTOV(tzp)); + } + if (strcmp(snm, "..") == 0) + serr = EINVAL; + ZFS_EXIT(zfsvfs); + return (serr); + } + if (terr) { + zfs_dirent_unlock(sdl); + VN_RELE(ZTOV(szp)); + if (strcmp(tnm, "..") == 0) + terr = EINVAL; + ZFS_EXIT(zfsvfs); + return (terr); + } + + /* + * Must have write access at the source to remove the old entry + * and write access at the target to create the new entry. 
+ * Note that if target and source are the same, this can be + * done in a single check. + */ + + if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) + goto out; + + if (ZTOV(szp)->v_type == VDIR) { + /* + * Check to make sure rename is valid. + * Can't do a move like this: /usr/a/b to /usr/a/b/c/d + */ + if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl)) + goto out; + } + + /* + * Does target exist? + */ + if (tzp) { + /* + * Source and target must be the same type. + */ + if (ZTOV(szp)->v_type == VDIR) { + if (ZTOV(tzp)->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + } else { + if (ZTOV(tzp)->v_type == VDIR) { + error = EISDIR; + goto out; + } + } + /* + * POSIX dictates that when the source and target + * entries refer to the same file object, rename + * must do nothing and exit without error. + */ + if (szp->z_id == tzp->z_id) { + error = 0; + goto out; + } + } + + vnevent_rename_src(ZTOV(szp)); + if (tzp) + vnevent_rename_dest(ZTOV(tzp)); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */ + dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */ + if (sdzp != tdzp) { + dmu_tx_hold_zap(tx, sdzp->z_id, 1); + dmu_tx_hold_zap(tx, tdzp->z_id, 1); + dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */ + } else { + dmu_tx_hold_zap(tx, sdzp->z_id, 2); + } + if (tzp) { + dmu_tx_hold_bonus(tx, tzp->z_id); /* nlink changes */ + } + dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + dmu_tx_abort(tx); + if (zl != NULL) + zfs_rename_unlock(&zl); + zfs_dirent_unlock(sdl); + zfs_dirent_unlock(tdl); + VN_RELE(ZTOV(szp)); + if (tzp) + VN_RELE(ZTOV(tzp)); + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0); + goto top; + } + ZFS_EXIT(zfsvfs); + return (error); + } + + if (tzp) /* Attempt to remove the existing target */ + error = zfs_link_destroy(tdl, tzp, tx, 0, NULL); + + if (error == 0) { + error = 
zfs_link_create(tdl, szp, tx, ZRENAMING);
		if (error == 0) {
			/* New name is in place; removing the old one must succeed. */
			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
			ASSERT(error == 0);
			seq = zfs_log_rename(zilog, tx, TX_RENAME,
			    sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
		}
	}

	dmu_tx_commit(tx);
out:
	if (zl != NULL)
		zfs_rename_unlock(&zl);

	zfs_dirent_unlock(sdl);
	zfs_dirent_unlock(tdl);

	VN_RELE(ZTOV(szp));
	if (tzp)
		VN_RELE(ZTOV(tzp));

	zil_commit(zilog, seq, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Insert the indicated symbolic reference entry into the directory.
 *
 * IN:	dvp	- Directory to contain new symbolic link.
 *	name	- Name for new symlink entry.
 *	vap	- Attributes of new entry.
 *	link	- Target path of new symlink.
 *	cr	- credentials of caller.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
static int
zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog = zfsvfs->z_log;
	uint64_t	seq = 0;
	uint64_t	zoid;
	int		len = strlen(link);
	int		error;

	ASSERT(vap->va_type == VLNK);

	ZFS_ENTER(zfsvfs);
top:
	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (len > MAXPATHLEN) {
		ZFS_EXIT(zfsvfs);
		return (ENAMETOOLONG);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZNEW)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
	dmu_tx_hold_bonus(tx, dzp->z_id);
	dmu_tx_hold_zap(tx, dzp->z_id, 1);
	if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		dmu_tx_abort(tx);
		zfs_dirent_unlock(dl);
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
			goto top;
		}
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	dmu_buf_will_dirty(dzp->z_dbuf, tx);

	/*
	 * Create a new object for the symlink.
	 * Put the link content into bonus buffer if it will fit;
	 * otherwise, store it just like any other file data.
	 */
	zoid = 0;
	if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, len);
		if (len != 0)
			bcopy(link, zp->z_phys + 1, len);
	} else {
		dmu_buf_t *dbp;
		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);

		rw_enter(&zp->z_grow_lock, RW_WRITER);
		error = zfs_grow_blocksize(zp, len, tx);
		rw_exit(&zp->z_grow_lock);
		if (error)
			goto out;

		dbp = dmu_buf_hold(zfsvfs->z_os, zoid, 0);
		dmu_buf_will_dirty(dbp, tx);

		ASSERT3U(len, <=, dbp->db_size);
		bcopy(link, dbp->db_data, len);
		dmu_buf_rele(dbp);
	}
	zp->z_phys->zp_size = len;

	/*
	 * Insert the new object into the directory.
	 */
	(void) zfs_link_create(dl, zp, tx, ZNEW);
out:
	if (error == 0)
		seq = zfs_log_symlink(zilog, tx, TX_SYMLINK,
		    dzp, zp, name, link);

	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	VN_RELE(ZTOV(zp));

	zil_commit(zilog, seq, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Return, in the buffer contained in the provided uio structure,
 * the symbolic path referred to by vp.
 *
 * IN:	vp	- vnode of symbolic link.
 *	uio	- structure to contain the link path.
 *	cr	- credentials of caller.
 *
 * OUT:	uio	- structure to contain the link path.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - atime updated
 */
/* ARGSUSED */
static int
zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	size_t		bufsz;
	int		error;

	ZFS_ENTER(zfsvfs);

	bufsz = (size_t)zp->z_phys->zp_size;
	if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
		/*
		 * Short links live in the znode's bonus buffer, directly
		 * after the znode_phys_t (see zfs_symlink()).
		 */
		error = uiomove(zp->z_phys + 1,
		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
	} else {
		/* Long links are stored as ordinary file data. */
		dmu_buf_t *dbp = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0);
		if ((error = dmu_buf_read_canfail(dbp)) != 0) {
			dmu_buf_rele(dbp);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
		error = uiomove(dbp->db_data,
		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
		dmu_buf_rele(dbp);
	}

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Insert a new entry into directory tdvp referencing svp.
 *
 * IN:	tdvp	- Directory to contain new entry.
 *	svp	- vnode of new entry.
 *	name	- name of new entry.
 *	cr	- credentials of caller.
+ * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * tdvp - ctime|mtime updated + * svp - ctime updated + */ +/* ARGSUSED */ +static int +zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr) +{ + znode_t *dzp = VTOZ(tdvp); + znode_t *tzp, *szp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + uint64_t seq = 0; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + vnode_t *realvp; + int error; + + ASSERT(tdvp->v_type == VDIR); + + ZFS_ENTER(zfsvfs); + + if (VOP_REALVP(svp, &realvp) == 0) + svp = realvp; + + if (svp->v_vfsp != tdvp->v_vfsp) { + ZFS_EXIT(zfsvfs); + return (EXDEV); + } + + szp = VTOZ(svp); +top: + /* + * We do not support links between attributes and non-attributes + * because of the potential security risk of creating links + * into "normal" file space in order to circumvent restrictions + * imposed in attribute space. + */ + if ((szp->z_phys->zp_flags & ZFS_XATTR) != + (dzp->z_phys->zp_flags & ZFS_XATTR)) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* + * POSIX dictates that we return EPERM here. + * Better choices include ENOTSUP or EISDIR. + */ + if (svp->v_type == VDIR) { + ZFS_EXIT(zfsvfs); + return (EPERM); + } + + if ((uid_t)szp->z_phys->zp_uid != crgetuid(cr) && + secpolicy_basic_link(cr) != 0) { + ZFS_EXIT(zfsvfs); + return (EPERM); + } + + if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Attempt to lock directory; fail if entry already exists. 
+ */ + if (error = zfs_dirent_lock(&dl, dzp, name, &tzp, ZNEW)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, szp->z_id); + dmu_tx_hold_zap(tx, dzp->z_id, 1); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + dmu_tx_abort(tx); + zfs_dirent_unlock(dl); + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0); + goto top; + } + ZFS_EXIT(zfsvfs); + return (error); + } + + error = zfs_link_create(dl, szp, tx, 0); + + if (error == 0) + seq = zfs_log_link(zilog, tx, TX_LINK, dzp, szp, name); + + dmu_tx_commit(tx); + + zfs_dirent_unlock(dl); + + zil_commit(zilog, seq, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * zfs_null_putapage() is used when the file system has been force + * unmounted. It just drops the pages. + */ +/* ARGSUSED */ +static int +zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, + size_t *lenp, int flags, cred_t *cr) +{ + pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR); + return (0); +} + +/* ARGSUSED */ +static int +zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, + size_t *lenp, int flags, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + uint64_t seq = 0; + dmu_tx_t *tx; + u_offset_t off; + ssize_t len; + caddr_t va; + int err; + +top: + rw_enter(&zp->z_grow_lock, RW_READER); + + off = pp->p_offset; + len = MIN(PAGESIZE, zp->z_phys->zp_size - off); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_write(tx, zp->z_id, off, len); + dmu_tx_hold_bonus(tx, zp->z_id); + err = dmu_tx_assign(tx, zfsvfs->z_assign); + if (err != 0) { + dmu_tx_abort(tx); + rw_exit(&zp->z_grow_lock); + if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0); + goto top; + } + goto out; + } + + va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1); + + dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx); + + 
ppmapout(va); + + zfs_time_stamper(zp, CONTENT_MODIFIED, tx); + seq = zfs_log_write(zilog, tx, TX_WRITE, zp, off, len, 0, NULL); + dmu_tx_commit(tx); + + rw_exit(&zp->z_grow_lock); + + pvn_write_done(pp, B_WRITE | flags); + if (offp) + *offp = off; + if (lenp) + *lenp = len; + + zil_commit(zilog, seq, 0); +out: + return (err); +} + +/* + * Copy the portion of the file indicated from pages into the file. + * The pages are stored in a page list attached to the files vnode. + * + * IN: vp - vnode of file to push page data to. + * off - position in file to put data. + * len - amount of data to write. + * flags - flags to control the operation. + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * vp - ctime|mtime updated + */ +static int +zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + page_t *pp; + size_t io_len; + u_offset_t io_off; + int error = 0; + + ZFS_ENTER(zfsvfs); + + ASSERT(zp->z_dbuf_held && zp->z_phys); + + if (len == 0) { + /* + * Search the entire vp list for pages >= off. + */ + error = pvn_vplist_dirty(vp, (u_offset_t)off, zfs_putapage, + flags, cr); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (off > zp->z_phys->zp_size) { + /* past end of file */ + ZFS_EXIT(zfsvfs); + return (0); + } + + len = MIN(len, zp->z_phys->zp_size - off); + + io_off = off; + while (io_off < off + len) { + if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { + pp = page_lookup(vp, io_off, + (flags & (B_INVAL | B_FREE)) ? + SE_EXCL : SE_SHARED); + } else { + pp = page_lookup_nowait(vp, io_off, + (flags & B_FREE) ? 
SE_EXCL : SE_SHARED); + } + + if (pp != NULL && pvn_getdirty(pp, flags)) { + int err; + + /* + * Found a dirty page to push + */ + if (err = + zfs_putapage(vp, pp, &io_off, &io_len, flags, cr)) + error = err; + } else { + io_len = PAGESIZE; + } + io_off += io_len; + } + ZFS_EXIT(zfsvfs); + return (error); +} + +void +zfs_inactive(vnode_t *vp, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + rw_enter(&zfsvfs->z_um_lock, RW_READER); + if (zfsvfs->z_unmounted2) { + ASSERT(zp->z_dbuf_held == 0); + + if (vn_has_cached_data(vp)) { + (void) pvn_vplist_dirty(vp, 0, zfs_null_putapage, + B_INVAL, cr); + } + + vp->v_count = 0; /* count arrives as 1 */ + zfs_znode_free(zp); + rw_exit(&zfsvfs->z_um_lock); + VFS_RELE(zfsvfs->z_vfs); + return; + } + + /* + * Attempt to push any data in the page cache. If this fails + * we will get kicked out later in zfs_zinactive(). + */ + if (vn_has_cached_data(vp)) + (void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL, cr); + + if (zp->z_atime_dirty && zp->z_reap == 0) { + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_bonus(tx, zp->z_id); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + dmu_buf_will_dirty(zp->z_dbuf, tx); + mutex_enter(&zp->z_lock); + zp->z_atime_dirty = 0; + mutex_exit(&zp->z_lock); + dmu_tx_commit(tx); + } + } + + zfs_zinactive(zp); + rw_exit(&zfsvfs->z_um_lock); +} + +/* + * Bounds-check the seek operation. + * + * IN: vp - vnode seeking within + * ooff - old file offset + * noffp - pointer to new file offset + * + * RETURN: 0 if success + * EINVAL if new offset invalid + */ +/* ARGSUSED */ +static int +zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp) +{ + if (vp->v_type == VDIR) + return (0); + return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); +} + +/* + * Pre-filter the generic locking function to trap attempts to place + * a mandatory lock on a memory mapped file. 
+ */ +static int +zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, + flk_callback_t *flk_cbp, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint_t cnt = 1; + int error; + + ZFS_ENTER(zfsvfs); + + /* + * If file is being mapped, disallow frlock. We set the mapcnt to + * -1 here to signal that we are in the process of setting a lock. + * This prevents a race with zfs_map(). + * XXX - well, sort of; since zfs_map() does not change z_mapcnt, + * we could be in the middle of zfs_map() and still call fs_frlock(). + * Also, we are doing no checking in zfs_addmap() (where z_mapcnt + * *is* manipulated). + */ + if (MANDMODE((mode_t)zp->z_phys->zp_mode) && + (int)(cnt = atomic_cas_32(&zp->z_mapcnt, 0, -1)) > 0) { + ZFS_EXIT(zfsvfs); + return (EAGAIN); + } + error = fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr); + ASSERT((cnt != 0) || ((int)atomic_cas_32(&zp->z_mapcnt, -1, 0) == -1)); + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * If we can't find a page in the cache, we will create a new page + * and fill it with file data. For efficiency, we may try to fill + * multiple pages as once (klustering). + */ +static int +zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, + caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw) +{ + znode_t *zp = VTOZ(vp); + page_t *pp, *cur_pp; + objset_t *os = zp->z_zfsvfs->z_os; + caddr_t va; + u_offset_t io_off, total; + uint64_t oid = zp->z_id; + size_t io_len; + int err; + + /* + * If we are only asking for a single page don't bother klustering. + */ + if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE || + off > zp->z_phys->zp_size) { + io_off = off; + io_len = PAGESIZE; + pp = page_create_va(vp, io_off, io_len, PG_WAIT, seg, addr); + } else { + /* + * Try to fill a kluster of pages (a blocks worth). + */ + size_t klen; + u_offset_t koff; + + if (!ISP2(zp->z_blksz)) { + /* Only one block in the file. 
*/ + klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); + koff = 0; + } else { + klen = plsz; + koff = P2ALIGN(off, (u_offset_t)klen); + } + if (klen > zp->z_phys->zp_size) + klen = P2ROUNDUP(zp->z_phys->zp_size, + (uint64_t)PAGESIZE); + pp = pvn_read_kluster(vp, off, seg, addr, &io_off, + &io_len, koff, klen, 0); + } + if (pp == NULL) { + /* + * Some other thread entered the page before us. + * Return to zfs_getpage to retry the lookup. + */ + *pl = NULL; + return (0); + } + + /* + * Fill the pages in the kluster. + */ + cur_pp = pp; + for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { + ASSERT(io_off == cur_pp->p_offset); + va = ppmapin(cur_pp, PROT_READ | PROT_WRITE, (caddr_t)-1); + err = dmu_read_canfail(os, oid, io_off, PAGESIZE, va); + ppmapout(va); + if (err) { + /* On error, toss the entire kluster */ + pvn_read_done(pp, B_ERROR); + return (err); + } + cur_pp = cur_pp->p_next; + } +out: + /* + * Fill in the page list array from the kluster. If + * there are too many pages in the kluster, return + * as many pages as possible starting from the desired + * offset `off'. + * NOTE: the page list will always be null terminated. + */ + pvn_plist_init(pp, pl, plsz, off, io_len, rw); + + return (0); +} + +/* + * Return pointers to the pages for the file region [off, off + len] + * in the pl array. If plsz is greater than len, this function may + * also return page pointers from before or after the specified + * region (i.e. some region [off', off' + plsz]). These additional + * pages are only returned if they are already in the cache, or were + * created as part of a klustered read. + * + * IN: vp - vnode of file to get data from. + * off - position in file to get data from. + * len - amount of data to retrieve. + * plsz - length of provided page list. + * seg - segment to obtain pages for. + * addr - virtual address of fault. + * rw - mode of created pages. + * cr - credentials of caller. + * + * OUT: protp - protection mode of created pages. 
+ * pl - list of pages created. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * vp - atime updated + */ +/* ARGSUSED */ +static int +zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, + page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, + enum seg_rw rw, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + page_t *pp, **pl0 = pl; + int cnt = 0, need_unlock = 0, err = 0; + + ZFS_ENTER(zfsvfs); + + if (protp) + *protp = PROT_ALL; + + ASSERT(zp->z_dbuf_held && zp->z_phys); + + /* no faultahead (for now) */ + if (pl == NULL) { + ZFS_EXIT(zfsvfs); + return (0); + } + + /* can't fault past EOF */ + if (off >= zp->z_phys->zp_size) { + ZFS_EXIT(zfsvfs); + return (EFAULT); + } + + /* + * Make sure nobody restructures the file (changes block size) + * in the middle of the getpage. + */ + rw_enter(&zp->z_grow_lock, RW_READER); + + /* + * If we already own the lock, then we must be page faulting + * in the middle of a write to this file (i.e., we are writing + * to this file using data from a mapped region of the file). + */ + if (!rw_owner(&zp->z_map_lock)) { + rw_enter(&zp->z_map_lock, RW_WRITER); + need_unlock = TRUE; + } + + /* + * Loop through the requested range [off, off + len] looking + * for pages. If we don't find a page, we will need to create + * a new page and fill it with data from the file. + */ + while (len > 0) { + if (plsz < PAGESIZE) + break; + if (pp = page_lookup(vp, off, SE_SHARED)) { + *pl++ = pp; + off += PAGESIZE; + addr += PAGESIZE; + len -= PAGESIZE; + plsz -= PAGESIZE; + } else { + err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw); + /* + * klustering may have changed our region + * to be block aligned. 
+ */ + if (((pp = *pl) != 0) && (off != pp->p_offset)) { + int delta = off - pp->p_offset; + len += delta; + off -= delta; + addr -= delta; + } + while (*pl) { + pl++; + cnt++; + off += PAGESIZE; + addr += PAGESIZE; + plsz -= PAGESIZE; + if (len > PAGESIZE) + len -= PAGESIZE; + else + len = 0; + } + } + if (err) + goto out; + } + + /* + * Fill out the page array with any pages already in the cache. + */ + while (plsz > 0) { + pp = page_lookup_nowait(vp, off, SE_SHARED); + if (pp == NULL) + break; + *pl++ = pp; + off += PAGESIZE; + plsz -= PAGESIZE; + } + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); +out: + if (err) { + /* + * Release any pages we have locked. + */ + while (pl > pl0) + page_unlock(*--pl); + } + *pl = NULL; + + if (need_unlock) + rw_exit(&zp->z_map_lock); + rw_exit(&zp->z_grow_lock); + + ZFS_EXIT(zfsvfs); + return (err); +} + +static int +zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + segvn_crargs_t vn_a; + int error; + + ZFS_ENTER(zfsvfs); + + if (vp->v_flag & VNOMAP) { + ZFS_EXIT(zfsvfs); + return (ENOSYS); + } + + if (off < 0 || len > MAXOFFSET_T - off) { + ZFS_EXIT(zfsvfs); + return (ENXIO); + } + + if (vp->v_type != VREG) { + ZFS_EXIT(zfsvfs); + return (ENODEV); + } + + /* + * If file is locked, disallow mapping. + * XXX - since we don't modify z_mapcnt here, there is nothing + * to stop a file lock being placed immediately after we complete + * this check. 
+ */ + if (MANDMODE((mode_t)zp->z_phys->zp_mode)) { + if (vn_has_flocks(vp) || zp->z_mapcnt == -1) { + ZFS_EXIT(zfsvfs); + return (EAGAIN); + } + } + + as_rangelock(as); + if ((flags & MAP_FIXED) == 0) { + map_addr(addrp, len, off, 1, flags); + if (*addrp == NULL) { + as_rangeunlock(as); + ZFS_EXIT(zfsvfs); + return (ENOMEM); + } + } else { + /* + * User specified address - blow away any previous mappings + */ + (void) as_unmap(as, *addrp, len); + } + + vn_a.vp = vp; + vn_a.offset = (u_offset_t)off; + vn_a.type = flags & MAP_TYPE; + vn_a.prot = prot; + vn_a.maxprot = maxprot; + vn_a.cred = cr; + vn_a.amp = NULL; + vn_a.flags = flags & ~MAP_TYPE; + + error = as_map(as, *addrp, len, segvn_create, &vn_a); + + as_rangeunlock(as); + ZFS_EXIT(zfsvfs); + return (error); +} + +/* ARGSUSED */ +static int +zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr) +{ + /* + * XXX - shouldn't we be checking for file locks here? + */ + ASSERT3U(VTOZ(vp)->z_mapcnt, >=, 0); + atomic_add_32(&VTOZ(vp)->z_mapcnt, btopr(len)); + return (0); +} + +/* ARGSUSED */ +static int +zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr) +{ + atomic_add_32(&VTOZ(vp)->z_mapcnt, -btopr(len)); + ASSERT3U(VTOZ(vp)->z_mapcnt, >=, 0); + return (0); +} + +/* + * Free or allocate space in a file. Currently, this function only + * supports the `F_FREESP' command. However, this command is somewhat + * misnamed, as its functionality includes the ability to allocate as + * well as free space. + * + * IN: vp - vnode of file to free data in. + * cmd - action to take (only F_FREESP supported). + * bfp - section of file to free/alloc. + * flag - current file open mode flags. + * offset - current file offset. + * cr - credentials of caller [UNUSED]. 
+ * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * vp - ctime|mtime updated + * + * NOTE: This function is limited in that it will only permit space to + * be freed at the end of a file. In essence, this function simply + * allows one to set the file size. + */ +/* ARGSUSED */ +static int +zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag, + offset_t offset, cred_t *cr, caller_context_t *ct) +{ + dmu_tx_t *tx; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + uint64_t seq = 0; + uint64_t off, len; + int error; + + ZFS_ENTER(zfsvfs); + +top: + if (cmd != F_FREESP) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + if (error = convoff(vp, bfp, 0, offset)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (bfp->l_len < 0) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + off = bfp->l_start; + len = bfp->l_len; + tx = dmu_tx_create(zfsvfs->z_os); + /* + * Grab the grow_lock to serialize this change with + * respect to other file size changes. + */ + dmu_tx_hold_bonus(tx, zp->z_id); + rw_enter(&zp->z_grow_lock, RW_WRITER); + if (off + len > zp->z_blksz && zp->z_blksz < zfsvfs->z_max_blksz && + off >= zp->z_phys->zp_size) { + /* + * We are increasing the length of the file, + * and this may mean a block size increase. + */ + dmu_tx_hold_write(tx, zp->z_id, 0, + MIN(off + len, zfsvfs->z_max_blksz)); + } else if (off < zp->z_phys->zp_size) { + /* + * If len == 0, we are truncating the file. + */ + dmu_tx_hold_free(tx, zp->z_id, off, len ? 
len : DMU_OBJECT_END); + } + + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + dmu_tx_abort(tx); + rw_exit(&zp->z_grow_lock); + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0); + goto top; + } + ZFS_EXIT(zfsvfs); + return (error); + } + + error = zfs_freesp(zp, off, len, flag, tx, cr); + + if (error == 0) { + zfs_time_stamper(zp, CONTENT_MODIFIED, tx); + seq = zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); + } + + rw_exit(&zp->z_grow_lock); + + dmu_tx_commit(tx); + + zil_commit(zilog, seq, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +static int +zfs_fid(vnode_t *vp, fid_t *fidp) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint32_t gen = (uint32_t)zp->z_phys->zp_gen; + uint64_t object = zp->z_id; + zfid_short_t *zfid; + int size, i; + + ZFS_ENTER(zfsvfs); + + size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; + if (fidp->fid_len < size) { + fidp->fid_len = size; + return (ENOSPC); + } + + zfid = (zfid_short_t *)fidp; + + zfid->zf_len = size; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); + + /* Must have a non-zero generation number to distinguish from .zfs */ + if (gen == 0) + gen = 1; + for (i = 0; i < sizeof (zfid->zf_gen); i++) + zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); + + if (size == LONG_FID_LEN) { + uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); + zfid_long_t *zlfid; + + zlfid = (zfid_long_t *)fidp; + + for (i = 0; i < sizeof (zlfid->zf_setid); i++) + zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); + + /* XXX - this should be the generation number for the objset */ + for (i = 0; i < sizeof (zlfid->zf_setgen); i++) + zlfid->zf_setgen[i] = 0; + } + + ZFS_EXIT(zfsvfs); + return (0); +} + +static int +zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr) +{ + znode_t *zp, *xzp; + zfsvfs_t *zfsvfs; + zfs_dirlock_t *dl; + int error; + + switch (cmd) { + case 
_PC_LINK_MAX: + *valp = ULONG_MAX; + return (0); + + case _PC_FILESIZEBITS: + *valp = 64; + return (0); + + case _PC_XATTR_EXISTS: + zp = VTOZ(vp); + zfsvfs = zp->z_zfsvfs; + ZFS_ENTER(zfsvfs); + *valp = 0; + error = zfs_dirent_lock(&dl, zp, "", &xzp, + ZXATTR | ZEXISTS | ZSHARED); + if (error == 0) { + zfs_dirent_unlock(dl); + if (!zfs_dirempty(xzp)) + *valp = 1; + VN_RELE(ZTOV(xzp)); + } else if (error == ENOENT) { + /* + * If there aren't extended attributes, it's the + * same as having zero of them. + */ + error = 0; + } + ZFS_EXIT(zfsvfs); + return (error); + + case _PC_ACL_ENABLED: + *valp = _ACL_ACE_ENABLED; + return (0); + + case _PC_MIN_HOLE_SIZE: + *valp = (ulong_t)SPA_MINBLOCKSIZE; + return (0); + + default: + return (fs_pathconf(vp, cmd, valp, cr)); + } +} + +/*ARGSUSED*/ +static int +zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + ZFS_ENTER(zfsvfs); + error = zfs_getacl(zp, vsecp, cr); + ZFS_EXIT(zfsvfs); + + return (error); +} + +/*ARGSUSED*/ +static int +zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + ZFS_ENTER(zfsvfs); + error = zfs_setacl(zp, vsecp, cr); + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Predeclare these here so that the compiler assumes that + * this is an "old style" function declaration that does + * not include arguments => we won't get type mismatch errors + * in the initializations that follow. 
+ */ +static int zfs_inval(); +static int zfs_isdir(); + +static int +zfs_inval() +{ + return (EINVAL); +} + +static int +zfs_isdir() +{ + return (EISDIR); +} +/* + * Directory vnode operations template + */ +vnodeops_t *zfs_dvnodeops; +const fs_operation_def_t zfs_dvnodeops_template[] = { + VOPNAME_OPEN, zfs_open, + VOPNAME_CLOSE, zfs_close, + VOPNAME_READ, zfs_isdir, + VOPNAME_WRITE, zfs_isdir, + VOPNAME_IOCTL, zfs_ioctl, + VOPNAME_GETATTR, zfs_getattr, + VOPNAME_SETATTR, zfs_setattr, + VOPNAME_ACCESS, zfs_access, + VOPNAME_LOOKUP, zfs_lookup, + VOPNAME_CREATE, zfs_create, + VOPNAME_REMOVE, zfs_remove, + VOPNAME_LINK, zfs_link, + VOPNAME_RENAME, zfs_rename, + VOPNAME_MKDIR, zfs_mkdir, + VOPNAME_RMDIR, zfs_rmdir, + VOPNAME_READDIR, zfs_readdir, + VOPNAME_SYMLINK, zfs_symlink, + VOPNAME_FSYNC, zfs_fsync, + VOPNAME_INACTIVE, (fs_generic_func_p) zfs_inactive, + VOPNAME_FID, zfs_fid, + VOPNAME_SEEK, zfs_seek, + VOPNAME_PATHCONF, zfs_pathconf, + VOPNAME_GETSECATTR, zfs_getsecattr, + VOPNAME_SETSECATTR, zfs_setsecattr, + NULL, NULL +}; + +/* + * Regular file vnode operations template + */ +vnodeops_t *zfs_fvnodeops; +const fs_operation_def_t zfs_fvnodeops_template[] = { + VOPNAME_OPEN, zfs_open, + VOPNAME_CLOSE, zfs_close, + VOPNAME_READ, zfs_read, + VOPNAME_WRITE, zfs_write, + VOPNAME_IOCTL, zfs_ioctl, + VOPNAME_GETATTR, zfs_getattr, + VOPNAME_SETATTR, zfs_setattr, + VOPNAME_ACCESS, zfs_access, + VOPNAME_LOOKUP, zfs_lookup, + VOPNAME_RENAME, zfs_rename, + VOPNAME_FSYNC, zfs_fsync, + VOPNAME_INACTIVE, (fs_generic_func_p)zfs_inactive, + VOPNAME_FID, zfs_fid, + VOPNAME_SEEK, zfs_seek, + VOPNAME_FRLOCK, zfs_frlock, + VOPNAME_SPACE, zfs_space, + VOPNAME_GETPAGE, zfs_getpage, + VOPNAME_PUTPAGE, zfs_putpage, + VOPNAME_MAP, (fs_generic_func_p) zfs_map, + VOPNAME_ADDMAP, (fs_generic_func_p) zfs_addmap, + VOPNAME_DELMAP, zfs_delmap, + VOPNAME_PATHCONF, zfs_pathconf, + VOPNAME_GETSECATTR, zfs_getsecattr, + VOPNAME_SETSECATTR, zfs_setsecattr, + VOPNAME_VNEVENT, fs_vnevent_support, 
+ NULL, NULL +}; + +/* + * Symbolic link vnode operations template + */ +vnodeops_t *zfs_symvnodeops; +const fs_operation_def_t zfs_symvnodeops_template[] = { + VOPNAME_GETATTR, zfs_getattr, + VOPNAME_SETATTR, zfs_setattr, + VOPNAME_ACCESS, zfs_access, + VOPNAME_RENAME, zfs_rename, + VOPNAME_READLINK, zfs_readlink, + VOPNAME_INACTIVE, (fs_generic_func_p) zfs_inactive, + VOPNAME_FID, zfs_fid, + VOPNAME_PATHCONF, zfs_pathconf, + VOPNAME_VNEVENT, fs_vnevent_support, + NULL, NULL +}; + +/* + * Extended attribute directory vnode operations template + * This template is identical to the directory vnodes + * operation template except for restricted operations: + * VOP_MKDIR() + * VOP_SYMLINK() + * Note that there are other restrictions embedded in: + * zfs_create() - restrict type to VREG + * zfs_link() - no links into/out of attribute space + * zfs_rename() - no moves into/out of attribute space + */ +vnodeops_t *zfs_xdvnodeops; +const fs_operation_def_t zfs_xdvnodeops_template[] = { + VOPNAME_OPEN, zfs_open, + VOPNAME_CLOSE, zfs_close, + VOPNAME_IOCTL, zfs_ioctl, + VOPNAME_GETATTR, zfs_getattr, + VOPNAME_SETATTR, zfs_setattr, + VOPNAME_ACCESS, zfs_access, + VOPNAME_LOOKUP, zfs_lookup, + VOPNAME_CREATE, zfs_create, + VOPNAME_REMOVE, zfs_remove, + VOPNAME_LINK, zfs_link, + VOPNAME_RENAME, zfs_rename, + VOPNAME_MKDIR, zfs_inval, + VOPNAME_RMDIR, zfs_rmdir, + VOPNAME_READDIR, zfs_readdir, + VOPNAME_SYMLINK, zfs_inval, + VOPNAME_FSYNC, zfs_fsync, + VOPNAME_INACTIVE, (fs_generic_func_p) zfs_inactive, + VOPNAME_FID, zfs_fid, + VOPNAME_SEEK, zfs_seek, + VOPNAME_PATHCONF, zfs_pathconf, + VOPNAME_GETSECATTR, zfs_getsecattr, + VOPNAME_SETSECATTR, zfs_setsecattr, + VOPNAME_VNEVENT, fs_vnevent_support, + NULL, NULL +}; + +/* + * Error vnode operations template + */ +vnodeops_t *zfs_evnodeops; +const fs_operation_def_t zfs_evnodeops_template[] = { + VOPNAME_INACTIVE, (fs_generic_func_p) zfs_inactive, + VOPNAME_PATHCONF, zfs_pathconf, + NULL, NULL +}; diff --git 
a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c new file mode 100644 index 0000000000..1ff11e29b8 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs_znode.c @@ -0,0 +1,1286 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/resource.h> +#include <sys/mntent.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/errno.h> +#include <sys/unistd.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/atomic.h> +#include <vm/pvn.h> +#include "fs/fs_subr.h" +#include <sys/zfs_dir.h> +#include <sys/zfs_acl.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_znode.h> +#include <sys/zap.h> +#include <sys/dmu.h> +#include <sys/fs/zfs.h> + +struct kmem_cache *znode_cache = NULL; + +/* + * Note that znodes can be on one of 2 states: + * ZCACHE_mru - recently used, currently cached + * ZCACHE_mfu - frequently used, currently cached + * When there are no active references to the znode, they + * are linked onto one of the lists in zcache. These are the + * only znodes that can be evicted. + */ + +typedef struct zcache_state { + list_t list; /* linked list of evictable znodes in state */ + uint64_t lcnt; /* total number of znodes in the linked list */ + uint64_t cnt; /* total number of all znodes in this state */ + uint64_t hits; + kmutex_t mtx; +} zcache_state_t; + +/* The 2 states: */ +static zcache_state_t ZCACHE_mru; +static zcache_state_t ZCACHE_mfu; + +static struct zcache { + zcache_state_t *mru; + zcache_state_t *mfu; + uint64_t p; /* Target size of mru */ + uint64_t c; /* Target size of cache */ + uint64_t c_max; /* Maximum target cache size */ + + /* performance stats */ + uint64_t missed; + uint64_t evicted; + uint64_t skipped; +} zcache; + +void zcache_kmem_reclaim(void); + +#define ZCACHE_MINTIME (hz>>4) /* 62 ms */ + +/* + * Move the supplied znode to the indicated state. The mutex + * for the znode must be held by the caller. 
+ */ +static void +zcache_change_state(zcache_state_t *new_state, znode_t *zp) +{ + /* ASSERT(MUTEX_HELD(hash_mtx)); */ + ASSERT(zp->z_active); + + if (zp->z_zcache_state) { + ASSERT3U(zp->z_zcache_state->cnt, >=, 1); + atomic_add_64(&zp->z_zcache_state->cnt, -1); + } + atomic_add_64(&new_state->cnt, 1); + zp->z_zcache_state = new_state; +} + +static void +zfs_zcache_evict(znode_t *zp, kmutex_t *hash_mtx) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ASSERT(zp->z_phys); + ASSERT(zp->z_dbuf_held); + + zp->z_dbuf_held = 0; + mutex_exit(&zp->z_lock); + dmu_buf_rele(zp->z_dbuf); + mutex_exit(hash_mtx); + VFS_RELE(zfsvfs->z_vfs); +} + +/* + * Evict znodes from list until we've removed the specified number + */ +static void +zcache_evict_state(zcache_state_t *state, int64_t cnt, zfsvfs_t *zfsvfs) +{ + int znodes_evicted = 0; + znode_t *zp, *zp_prev; + kmutex_t *hash_mtx; + + ASSERT(state == zcache.mru || state == zcache.mfu); + + mutex_enter(&state->mtx); + + for (zp = list_tail(&state->list); zp; zp = zp_prev) { + zp_prev = list_prev(&state->list, zp); + if (zfsvfs && zp->z_zfsvfs != zfsvfs) + continue; + hash_mtx = ZFS_OBJ_MUTEX(zp); + if (mutex_tryenter(hash_mtx)) { + mutex_enter(&zp->z_lock); + list_remove(&zp->z_zcache_state->list, zp); + zp->z_zcache_state->lcnt -= 1; + ASSERT3U(zp->z_zcache_state->cnt, >=, 1); + atomic_add_64(&zp->z_zcache_state->cnt, -1); + zp->z_zcache_state = NULL; + zp->z_zcache_access = 0; + /* drops z_lock and hash_mtx */ + zfs_zcache_evict(zp, hash_mtx); + znodes_evicted += 1; + atomic_add_64(&zcache.evicted, 1); + if (znodes_evicted >= cnt) + break; + } else { + atomic_add_64(&zcache.skipped, 1); + } + } + mutex_exit(&state->mtx); + + if (znodes_evicted < cnt) + dprintf("only evicted %lld znodes from %x", + (longlong_t)znodes_evicted, state); +} + +static void +zcache_adjust(void) +{ + uint64_t mrucnt = zcache.mru->lcnt; + uint64_t mfucnt = zcache.mfu->lcnt; + uint64_t p = zcache.p; + uint64_t c = zcache.c; + + if (mrucnt > p) + 
zcache_evict_state(zcache.mru, mrucnt - p, NULL); + + if (mfucnt > 0 && mrucnt + mfucnt > c) { + int64_t toevict = MIN(mfucnt, mrucnt + mfucnt - c); + zcache_evict_state(zcache.mfu, toevict, NULL); + } +} + +/* + * Flush all *evictable* data from the cache. + * NOTE: this will not touch "active" (i.e. referenced) data. + */ +void +zfs_zcache_flush(zfsvfs_t *zfsvfs) +{ + zcache_evict_state(zcache.mru, zcache.mru->lcnt, zfsvfs); + zcache_evict_state(zcache.mfu, zcache.mfu->lcnt, zfsvfs); +} + +static void +zcache_try_grow(int64_t cnt) +{ + int64_t size; + /* + * If we're almost to the current target cache size, + * increment the target cache size + */ + size = zcache.mru->lcnt + zcache.mfu->lcnt; + if ((zcache.c - size) <= 1) { + atomic_add_64(&zcache.c, cnt); + if (zcache.c > zcache.c_max) + zcache.c = zcache.c_max; + else if (zcache.p + cnt < zcache.c) + atomic_add_64(&zcache.p, cnt); + } +} + +/* + * This routine is called whenever a znode is accessed. + */ +static void +zcache_access(znode_t *zp, kmutex_t *hash_mtx) +{ + ASSERT(MUTEX_HELD(hash_mtx)); + + if (zp->z_zcache_state == NULL) { + /* + * This znode is not in the cache. + * Add the new znode to the MRU state. + */ + + zcache_try_grow(1); + + ASSERT(zp->z_zcache_access == 0); + zp->z_zcache_access = lbolt; + zcache_change_state(zcache.mru, zp); + mutex_exit(hash_mtx); + + /* + * If we are using less than 2/3 of our total target + * cache size, bump up the target size for the MRU + * list. + */ + if (zcache.mru->lcnt + zcache.mfu->lcnt < zcache.c*2/3) { + zcache.p = zcache.mru->lcnt + zcache.c/6; + } + + zcache_adjust(); + + atomic_add_64(&zcache.missed, 1); + } else if (zp->z_zcache_state == zcache.mru) { + /* + * This znode has been "accessed" only once so far, + * Move it to the MFU state. + */ + if (lbolt > zp->z_zcache_access + ZCACHE_MINTIME) { + /* + * More than 125ms have passed since we + * instantiated this buffer. Move it to the + * most frequently used state. 
+ */ + zp->z_zcache_access = lbolt; + zcache_change_state(zcache.mfu, zp); + } + atomic_add_64(&zcache.mru->hits, 1); + mutex_exit(hash_mtx); + } else { + ASSERT(zp->z_zcache_state == zcache.mfu); + /* + * This buffer has been accessed more than once. + * Keep it in the MFU state. + */ + atomic_add_64(&zcache.mfu->hits, 1); + mutex_exit(hash_mtx); + } +} + +static void +zcache_init(void) +{ + zcache.c = 20; + zcache.c_max = 50; + + zcache.mru = &ZCACHE_mru; + zcache.mfu = &ZCACHE_mfu; + + list_create(&zcache.mru->list, sizeof (znode_t), + offsetof(znode_t, z_zcache_node)); + list_create(&zcache.mfu->list, sizeof (znode_t), + offsetof(znode_t, z_zcache_node)); +} + +static void +zcache_fini(void) +{ + zfs_zcache_flush(NULL); + + list_destroy(&zcache.mru->list); + list_destroy(&zcache.mfu->list); +} + +/*ARGSUSED*/ +static void +znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr) +{ + znode_t *zp = user_ptr; + vnode_t *vp = ZTOV(zp); + + if (vp->v_count == 0) { + vn_invalid(vp); + zfs_znode_free(zp); + } +} + +/*ARGSUSED*/ +static int +zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags) +{ + znode_t *zp = buf; + + zp->z_vnode = vn_alloc(KM_SLEEP); + zp->z_vnode->v_data = (caddr_t)zp; + mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zp->z_grow_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zp->z_append_lock, NULL, RW_DEFAULT, NULL); + mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); + zp->z_dbuf_held = 0; + zp->z_dirlocks = 0; + return (0); +} + +/*ARGSUSED*/ +static void +zfs_znode_cache_destructor(void *buf, void *cdarg) +{ + znode_t *zp = buf; + + ASSERT(zp->z_dirlocks == 0); + mutex_destroy(&zp->z_lock); + rw_destroy(&zp->z_map_lock); + rw_destroy(&zp->z_grow_lock); + rw_destroy(&zp->z_append_lock); + mutex_destroy(&zp->z_acl_lock); + + ASSERT(zp->z_dbuf_held == 0); + ASSERT(ZTOV(zp)->v_count == 0); + vn_free(ZTOV(zp)); +} + +void +zfs_znode_init(void) +{ + /* + * Initialize zcache 
+ */ + ASSERT(znode_cache == NULL); + znode_cache = kmem_cache_create("zfs_znode_cache", + sizeof (znode_t), 0, zfs_znode_cache_constructor, + zfs_znode_cache_destructor, NULL, NULL, NULL, 0); + + zcache_init(); +} + +void +zfs_znode_fini(void) +{ + zcache_fini(); + + /* + * Cleanup vfs & vnode ops + */ + zfs_remove_op_tables(); + + /* + * Cleanup zcache + */ + if (znode_cache) + kmem_cache_destroy(znode_cache); + znode_cache = NULL; +} + +struct vnodeops *zfs_dvnodeops; +struct vnodeops *zfs_fvnodeops; +struct vnodeops *zfs_symvnodeops; +struct vnodeops *zfs_xdvnodeops; +struct vnodeops *zfs_evnodeops; + +void +zfs_remove_op_tables() +{ + /* + * Remove vfs ops + */ + ASSERT(zfsfstype); + (void) vfs_freevfsops_by_type(zfsfstype); + zfsfstype = 0; + + /* + * Remove vnode ops + */ + if (zfs_dvnodeops) + vn_freevnodeops(zfs_dvnodeops); + if (zfs_fvnodeops) + vn_freevnodeops(zfs_fvnodeops); + if (zfs_symvnodeops) + vn_freevnodeops(zfs_symvnodeops); + if (zfs_xdvnodeops) + vn_freevnodeops(zfs_xdvnodeops); + if (zfs_evnodeops) + vn_freevnodeops(zfs_evnodeops); + + zfs_dvnodeops = NULL; + zfs_fvnodeops = NULL; + zfs_symvnodeops = NULL; + zfs_xdvnodeops = NULL; + zfs_evnodeops = NULL; +} + +extern const fs_operation_def_t zfs_dvnodeops_template[]; +extern const fs_operation_def_t zfs_fvnodeops_template[]; +extern const fs_operation_def_t zfs_xdvnodeops_template[]; +extern const fs_operation_def_t zfs_symvnodeops_template[]; +extern const fs_operation_def_t zfs_evnodeops_template[]; + +int +zfs_create_op_tables() +{ + int error; + + /* + * zfs_dvnodeops can be set if mod_remove() calls mod_installfs() + * due to a failure to remove the the 2nd modlinkage (zfs_modldrv). + * In this case we just return as the ops vectors are already set up. 
+ */ + if (zfs_dvnodeops) + return (0); + + error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template, + &zfs_dvnodeops); + if (error) + return (error); + + error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template, + &zfs_fvnodeops); + if (error) + return (error); + + error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template, + &zfs_symvnodeops); + if (error) + return (error); + + error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template, + &zfs_xdvnodeops); + if (error) + return (error); + + error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template, + &zfs_evnodeops); + + return (error); +} + +/* + * zfs_init_fs - Initialize the zfsvfs struct and the file system + * incore "master" object. Verify version compatibility. + */ +int +zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr) +{ + extern int zfsfstype; + + objset_t *os = zfsvfs->z_os; + uint64_t zoid; + uint64_t version = ZFS_VERSION; + int i, error; + dmu_object_info_t doi; + dmu_objset_stats_t *stats; + + *zpp = NULL; + + /* + * XXX - hack to auto-create the pool root filesystem at + * the first attempted mount. + */ + if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) { + dmu_tx_t *tx = dmu_tx_create(os); + + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 3); /* master node */ + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 1); /* delete queue */ + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */ + error = dmu_tx_assign(tx, TXG_WAIT); + ASSERT3U(error, ==, 0); + zfs_create_fs(os, cr, tx); + dmu_tx_commit(tx); + } + + if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_VERSION_OBJ, 8, 1, &version)) { + return (EINVAL); + } else if (version != ZFS_VERSION) { + (void) printf("Mismatched versions: File system " + "is version %lld on-disk format, which is " + "incompatible with this software version %lld!", + (u_longlong_t)version, ZFS_VERSION); + return (ENOTSUP); + } + + /* + * The fsid is 64 bits, composed of an 8-bit fs type, which + * separates our fsid from any other filesystem types, and a + * 56-bit objset unique ID. 
The objset unique ID is unique to + * all objsets open on this system, provided by unique_create(). + * The 8-bit fs type must be put in the low bits of fsid[1] + * because that's where other Solaris filesystems put it. + */ + stats = kmem_alloc(sizeof (dmu_objset_stats_t), KM_SLEEP); + dmu_objset_stats(os, stats); + ASSERT((stats->dds_fsid_guid & ~((1ULL<<56)-1)) == 0); + zfsvfs->z_vfs->vfs_fsid.val[0] = stats->dds_fsid_guid; + zfsvfs->z_vfs->vfs_fsid.val[1] = ((stats->dds_fsid_guid>>32) << 8) | + zfsfstype & 0xFF; + kmem_free(stats, sizeof (dmu_objset_stats_t)); + stats = NULL; + + if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zoid)) { + return (EINVAL); + } + ASSERT(zoid != 0); + zfsvfs->z_root = zoid; + + /* + * Create the per mount vop tables. + */ + + /* + * Initialize zget mutex's + */ + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + + error = zfs_zget(zfsvfs, zoid, zpp); + if (error) + return (error); + ASSERT3U((*zpp)->z_id, ==, zoid); + + if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_DELETE_QUEUE, 8, 1, &zoid)) { + return (EINVAL); + } + + zfsvfs->z_dqueue = zoid; + + /* + * Initialize delete head structure + * Thread(s) will be started/stopped via + * readonly_changed_cb() depending + * on whether this is rw/ro mount. + */ + list_create(&zfsvfs->z_delete_head.z_znodes, + sizeof (znode_t), offsetof(znode_t, z_list_node)); + + return (0); +} + +/* + * Construct a new znode/vnode and intialize. 
+ * + * This does not do a call to dmu_set_user() that is + * up to the caller to do, in case you don't want to + * return the znode + */ +znode_t * +zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz) +{ + znode_t *zp; + vnode_t *vp; + + zp = kmem_cache_alloc(znode_cache, KM_SLEEP); + + ASSERT(zp->z_dirlocks == NULL); + + zp->z_phys = db->db_data; + zp->z_zfsvfs = zfsvfs; + zp->z_active = 1; + zp->z_reap = 0; + zp->z_atime_dirty = 0; + zp->z_dbuf_held = 0; + zp->z_mapcnt = 0; + zp->z_last_itx = 0; + zp->z_dbuf = db; + zp->z_id = obj_num; + zp->z_blksz = blksz; + zp->z_seq = 0x7A4653; + + bzero(&zp->z_zcache_node, sizeof (list_node_t)); + + mutex_enter(&zfsvfs->z_znodes_lock); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + mutex_exit(&zfsvfs->z_znodes_lock); + + vp = ZTOV(zp); + vn_reinit(vp); + + vp->v_vfsp = zfsvfs->z_parent->z_vfs; + vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode); + + switch (vp->v_type) { + case VDIR: + if (zp->z_phys->zp_flags & ZFS_XATTR) { + vn_setops(vp, zfs_xdvnodeops); + vp->v_flag |= V_XATTRDIR; + } else + vn_setops(vp, zfs_dvnodeops); + break; + case VBLK: + case VCHR: + vp->v_rdev = (dev_t)zp->z_phys->zp_rdev; + /*FALLTHROUGH*/ + case VFIFO: + case VSOCK: + case VDOOR: + vn_setops(vp, zfs_fvnodeops); + break; + case VREG: + vp->v_flag |= VMODSORT; + vn_setops(vp, zfs_fvnodeops); + break; + case VLNK: + vn_setops(vp, zfs_symvnodeops); + break; + default: + vn_setops(vp, zfs_evnodeops); + break; + } + + return (zp); +} + +static void +zfs_znode_dmu_init(znode_t *zp) +{ + znode_t *nzp; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_buf_t *db = zp->z_dbuf; + + mutex_enter(&zp->z_lock); + + nzp = dmu_buf_set_user(db, zp, &zp->z_phys, znode_pageout_func); + + /* + * there should be no + * concurrent zgets on this object. 
+ */ + ASSERT3P(nzp, ==, NULL); + + /* + * Slap on VROOT if we are the root znode + */ + if (zp->z_id == zfsvfs->z_root) { + ZTOV(zp)->v_flag |= VROOT; + } + + zp->z_zcache_state = NULL; + zp->z_zcache_access = 0; + + ASSERT(zp->z_dbuf_held == 0); + zp->z_dbuf_held = 1; + VFS_HOLD(zfsvfs->z_vfs); + mutex_exit(&zp->z_lock); + vn_exists(ZTOV(zp)); +} + +/* + * Create a new DMU object to hold a zfs znode. + * + * IN: dzp - parent directory for new znode + * vap - file attributes for new znode + * tx - dmu transaction id for zap operations + * cr - credentials of caller + * flag - flags: + * IS_ROOT_NODE - new object will be root + * IS_XATTR - new object is an attribute + * IS_REPLAY - intent log replay + * + * OUT: oid - ID of created object + * + */ +void +zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr, + uint_t flag, znode_t **zpp, int bonuslen) +{ + dmu_buf_t *dbp; + znode_phys_t *pzp; + znode_t *zp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + timestruc_t now; + uint64_t gen; + int err; + + ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); + + if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ + *oid = vap->va_nodeid; + flag |= IS_REPLAY; + now = vap->va_ctime; /* see zfs_replay_create() */ + gen = vap->va_nblocks; /* ditto */ + } else { + *oid = 0; + gethrestime(&now); + gen = dmu_tx_get_txg(tx); + } + + /* + * Create a new DMU object. 
+ */ + if (vap->va_type == VDIR) { + if (flag & IS_REPLAY) { + err = zap_create_claim(zfsvfs->z_os, *oid, + DMU_OT_DIRECTORY_CONTENTS, + DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); + ASSERT3U(err, ==, 0); + } else { + *oid = zap_create(zfsvfs->z_os, + DMU_OT_DIRECTORY_CONTENTS, + DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); + } + } else { + if (flag & IS_REPLAY) { + err = dmu_object_claim(zfsvfs->z_os, *oid, + DMU_OT_PLAIN_FILE_CONTENTS, 0, + DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); + ASSERT3U(err, ==, 0); + } else { + *oid = dmu_object_alloc(zfsvfs->z_os, + DMU_OT_PLAIN_FILE_CONTENTS, 0, + DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); + } + } + dbp = dmu_bonus_hold(zfsvfs->z_os, *oid); + dmu_buf_will_dirty(dbp, tx); + + /* + * Initialize the znode physical data to zero. + */ + ASSERT(dbp->db_size >= sizeof (znode_phys_t)); + bzero(dbp->db_data, dbp->db_size); + pzp = dbp->db_data; + + /* + * If this is the root, fix up the half-initialized parent pointer + * to reference the just-allocated physical data area. + */ + if (flag & IS_ROOT_NODE) { + dzp->z_phys = pzp; + dzp->z_id = *oid; + } + + /* + * If parent is an xattr, so am I. + */ + if (dzp->z_phys->zp_flags & ZFS_XATTR) + flag |= IS_XATTR; + + if (vap->va_type == VBLK || vap->va_type == VCHR) { + pzp->zp_rdev = vap->va_rdev; + } + + if (vap->va_type == VDIR) { + pzp->zp_size = 2; /* contents ("." and "..") */ + pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 
2 : 1; + } + + pzp->zp_parent = dzp->z_id; + if (flag & IS_XATTR) + pzp->zp_flags |= ZFS_XATTR; + + pzp->zp_gen = gen; + + ZFS_TIME_ENCODE(&now, pzp->zp_crtime); + ZFS_TIME_ENCODE(&now, pzp->zp_ctime); + + if (vap->va_mask & AT_ATIME) { + ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); + } else { + ZFS_TIME_ENCODE(&now, pzp->zp_atime); + } + + if (vap->va_mask & AT_MTIME) { + ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); + } else { + ZFS_TIME_ENCODE(&now, pzp->zp_mtime); + } + + pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode); + zp = zfs_znode_alloc(zfsvfs, dbp, *oid, 0); + + zfs_perm_init(zp, dzp, flag, vap, tx, cr); + + if (zpp) { + kmutex_t *hash_mtx = ZFS_OBJ_MUTEX(zp); + + mutex_enter(hash_mtx); + zfs_znode_dmu_init(zp); + zcache_access(zp, hash_mtx); + *zpp = zp; + } else { + ZTOV(zp)->v_count = 0; + dmu_buf_rele(dbp); + zfs_znode_free(zp); + } +} + +int +zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) +{ + dmu_object_info_t doi; + dmu_buf_t *db; + znode_t *zp; + + *zpp = NULL; + + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); + + db = dmu_bonus_hold(zfsvfs->z_os, obj_num); + if (db == NULL) { + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (ENOENT); + } + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_type != DMU_OT_ZNODE || + doi.doi_bonus_size < sizeof (znode_phys_t)) { + dmu_buf_rele(db); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (EINVAL); + } + dmu_buf_read(db); + + ASSERT(db->db_object == obj_num); + ASSERT(db->db_offset == -1); + ASSERT(db->db_data != NULL); + + zp = dmu_buf_get_user(db); + + if (zp != NULL) { + mutex_enter(&zp->z_lock); + + ASSERT3U(zp->z_id, ==, obj_num); + if (zp->z_reap) { + dmu_buf_rele(db); + mutex_exit(&zp->z_lock); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (ENOENT); + } else if (zp->z_dbuf_held) { + dmu_buf_rele(db); + } else { + zp->z_dbuf_held = 1; + VFS_HOLD(zfsvfs->z_vfs); + } + + if (zp->z_active == 0) { + zp->z_active = 1; + if (list_link_active(&zp->z_zcache_node)) { + 
mutex_enter(&zp->z_zcache_state->mtx); + list_remove(&zp->z_zcache_state->list, zp); + zp->z_zcache_state->lcnt -= 1; + mutex_exit(&zp->z_zcache_state->mtx); + } + } + VN_HOLD(ZTOV(zp)); + mutex_exit(&zp->z_lock); + zcache_access(zp, ZFS_OBJ_MUTEX(zp)); + *zpp = zp; + return (0); + } + + /* + * Not found create new znode/vnode + */ + zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size); + ASSERT3U(zp->z_id, ==, obj_num); + zfs_znode_dmu_init(zp); + zcache_access(zp, ZFS_OBJ_MUTEX(zp)); + *zpp = zp; + return (0); +} + +void +zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id); + if (zp->z_phys->zp_acl.z_acl_extern_obj) { + error = dmu_object_free(zfsvfs->z_os, + zp->z_phys->zp_acl.z_acl_extern_obj, tx); + ASSERT3U(error, ==, 0); + } + if (zp->z_zcache_state) { + ASSERT3U(zp->z_zcache_state->cnt, >=, 1); + atomic_add_64(&zp->z_zcache_state->cnt, -1); + } + error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx); + ASSERT3U(error, ==, 0); + zp->z_dbuf_held = 0; + ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id); + dmu_buf_rele(zp->z_dbuf); +} + +void +zfs_zinactive(znode_t *zp) +{ + vnode_t *vp = ZTOV(zp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t z_id = zp->z_id; + + ASSERT(zp->z_dbuf_held && zp->z_phys); + + /* + * Don't allow a zfs_zget() while were trying to release this znode + */ + ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); + + mutex_enter(&zp->z_lock); + mutex_enter(&vp->v_lock); + vp->v_count--; + if (vp->v_count > 0 || vn_has_cached_data(vp)) { + /* + * If the hold count is greater than zero, somebody has + * obtained a new reference on this znode while we were + * processing it here, so we are done. If we still have + * mapped pages then we are also done, since we don't + * want to inactivate the znode until the pages get pushed. + * + * XXX - if vn_has_cached_data(vp) is true, but count == 0, + * this seems like it would leave the znode hanging with + * no chance to go inactive... 
+ */ + mutex_exit(&vp->v_lock); + mutex_exit(&zp->z_lock); + ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); + return; + } + mutex_exit(&vp->v_lock); + zp->z_active = 0; + + /* + * If this was the last reference to a file with no links, + * remove the file from the file system. + */ + if (zp->z_reap) { + mutex_exit(&zp->z_lock); + ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); + ASSERT3U(zp->z_zcache_state->cnt, >=, 1); + atomic_add_64(&zp->z_zcache_state->cnt, -1); + zp->z_zcache_state = NULL; + /* XATTR files are not put on the delete queue */ + if (zp->z_phys->zp_flags & ZFS_XATTR) { + zfs_rmnode(zp); + } else { + mutex_enter(&zfsvfs->z_delete_head.z_mutex); + list_insert_tail(&zfsvfs->z_delete_head.z_znodes, zp); + zfsvfs->z_delete_head.z_znode_count++; + cv_broadcast(&zfsvfs->z_delete_head.z_cv); + mutex_exit(&zfsvfs->z_delete_head.z_mutex); + } + VFS_RELE(zfsvfs->z_vfs); + return; + } + + /* + * If the file system for this znode is no longer mounted, + * evict the znode now, don't put it in the cache. + */ + if (zfsvfs->z_unmounted1) { + zfs_zcache_evict(zp, ZFS_OBJ_MUTEX(zp)); + return; + } + + /* put znode on evictable list */ + mutex_enter(&zp->z_zcache_state->mtx); + list_insert_head(&zp->z_zcache_state->list, zp); + zp->z_zcache_state->lcnt += 1; + mutex_exit(&zp->z_zcache_state->mtx); + mutex_exit(&zp->z_lock); + ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); +} + +void +zfs_znode_free(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + mutex_enter(&zfsvfs->z_znodes_lock); + list_remove(&zfsvfs->z_all_znodes, zp); + mutex_exit(&zfsvfs->z_znodes_lock); + + kmem_cache_free(znode_cache, zp); +} + +void +zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx) +{ + timestruc_t now; + + ASSERT(MUTEX_HELD(&zp->z_lock)); + + gethrestime(&now); + + if (tx) { + dmu_buf_will_dirty(zp->z_dbuf, tx); + zp->z_atime_dirty = 0; + zp->z_seq++; + } else { + zp->z_atime_dirty = 1; + } + + if (flag & AT_ATIME) + ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime); + + if (flag & AT_MTIME) + ZFS_TIME_ENCODE(&now, 
zp->z_phys->zp_mtime); + + if (flag & AT_CTIME) + ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime); +} + +/* + * Update the requested znode timestamps with the current time. + * If we are in a transaction, then go ahead and mark the znode + * dirty in the transaction so the timestamps will go to disk. + * Otherwise, we will get pushed next time the znode is updated + * in a transaction, or when this znode eventually goes inactive. + * + * Why is this OK? + * 1 - Only the ACCESS time is ever updated outside of a transaction. + * 2 - Multiple consecutive updates will be collapsed into a single + * znode update by the transaction grouping semantics of the DMU. + */ +void +zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx) +{ + mutex_enter(&zp->z_lock); + zfs_time_stamper_locked(zp, flag, tx); + mutex_exit(&zp->z_lock); +} + +/* + * Grow the block size for a file. This may involve migrating data + * from the bonus buffer into a data block (when we grow beyond the + * bonus buffer data area). + * + * IN: zp - znode of file to free data in. + * size - requested block size + * tx - open transaction. + * + * RETURN: 0 if success + * error code if failure + * + * NOTE: this function assumes that the znode is write locked. + */ +int +zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) +{ + int error; + u_longlong_t dummy; + + ASSERT(rw_write_held(&zp->z_grow_lock)); + + if (size <= zp->z_blksz) + return (0); + /* + * If the file size is already greater than the current blocksize, + * we will not grow. If there is more than one block in a file, + * the blocksize cannot change. + */ + if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz) + return (0); + + error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, + size, 0, tx); + if (error == ENOTSUP) + return (0); + ASSERT3U(error, ==, 0); + + /* What blocksize did we actually get? 
*/ + dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy); + + return (0); +} + +/* + * This is a dummy interface used when pvn_vplist_dirty() should *not* + * be calling back into the fs for a putpage(). E.g.: when truncating + * a file, the pages being "thrown away* don't need to be written out. + */ +/* ARGSUSED */ +static int +zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, + int flags, cred_t *cr) +{ + ASSERT(0); + return (0); +} + +/* + * Free space in a file. Currently, this function only + * supports freeing space at the end of the file. + * + * IN: zp - znode of file to free data in. + * from - start of section to free. + * len - length of section to free (0 => to EOF). + * flag - current file open mode flags. + * tx - open transaction. + * + * RETURN: 0 if success + * error code if failure + */ +int +zfs_freesp(znode_t *zp, uint64_t from, uint64_t len, int flag, dmu_tx_t *tx, + cred_t *cr) +{ + vnode_t *vp = ZTOV(zp); + uint64_t size = zp->z_phys->zp_size; + uint64_t end = from + len; + int have_grow_lock, error; + + have_grow_lock = RW_WRITE_HELD(&zp->z_grow_lock); + + /* + * Nothing to do if file already at desired length. + */ + if (len == 0 && size == from) { + return (0); + } + + /* + * Check for any locks in the region to be freed. + */ + if (MANDLOCK(vp, (mode_t)zp->z_phys->zp_mode)) { + uint64_t start; + + if (size > from) + start = from; + else + start = size; + if (error = chklock(vp, FWRITE, start, 0, flag, NULL)) + return (error); + } + + if (end > zp->z_blksz && (!ISP2(zp->z_blksz) || + zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) { + uint64_t new_blksz; + /* + * We are growing the file past the current block size. 
+ */ + if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { + ASSERT(!ISP2(zp->z_blksz)); + new_blksz = MIN(end, SPA_MAXBLOCKSIZE); + } else { + new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz); + } + error = zfs_grow_blocksize(zp, new_blksz, tx); + ASSERT(error == 0); + } + if (end > size || len == 0) + zp->z_phys->zp_size = end; + if (from > size) + return (0); + + if (have_grow_lock) + rw_downgrade(&zp->z_grow_lock); + /* + * Clear any mapped pages in the truncated region. + */ + rw_enter(&zp->z_map_lock, RW_WRITER); + if (vn_has_cached_data(vp)) { + page_t *pp; + uint64_t start = from & PAGEMASK; + int off = from & PAGEOFFSET; + + if (off != 0 && (pp = page_lookup(vp, start, SE_SHARED))) { + /* + * We need to zero a partial page. + */ + pagezero(pp, off, PAGESIZE - off); + start += PAGESIZE; + page_unlock(pp); + } + error = pvn_vplist_dirty(vp, start, zfs_no_putpage, + B_INVAL | B_TRUNC, cr); + ASSERT(error == 0); + } + rw_exit(&zp->z_map_lock); + + if (!have_grow_lock) + rw_enter(&zp->z_grow_lock, RW_READER); + + if (len == 0) + len = -1; + else if (end > size) + len = size - from; + dmu_free_range(zp->z_zfsvfs->z_os, zp->z_id, from, len, tx); + + if (!have_grow_lock) + rw_exit(&zp->z_grow_lock); + + return (0); +} + + +void +zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx) +{ + zfsvfs_t zfsvfs; + uint64_t moid, doid, roid = 0; + uint64_t version = ZFS_VERSION; + int error; + znode_t *rootzp = NULL; + vnode_t *vp; + vattr_t vattr; + + /* + * First attempt to create master node. + */ + moid = MASTER_NODE_OBJ; + error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, + DMU_OT_NONE, 0, tx); + ASSERT(error == 0); + + /* + * Set starting attributes. + */ + + error = zap_update(os, moid, ZFS_VERSION_OBJ, 8, 1, &version, tx); + ASSERT(error == 0); + + /* + * Create a delete queue. + */ + doid = zap_create(os, DMU_OT_DELETE_QUEUE, DMU_OT_NONE, 0, tx); + + error = zap_add(os, moid, ZFS_DELETE_QUEUE, 8, 1, &doid, tx); + ASSERT(error == 0); + + /* + * Create root znode. 
Create minimal znode/vnode/zfsvfs + * to allow zfs_mknode to work. + */ + vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; + vattr.va_type = VDIR; + vattr.va_mode = S_IFDIR|0755; + vattr.va_uid = 0; + vattr.va_gid = 3; + + rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); + rootzp->z_zfsvfs = &zfsvfs; + rootzp->z_active = 1; + rootzp->z_reap = 0; + rootzp->z_atime_dirty = 0; + rootzp->z_dbuf_held = 0; + + vp = ZTOV(rootzp); + vn_reinit(vp); + vp->v_type = VDIR; + + bzero(&zfsvfs, sizeof (zfsvfs_t)); + + zfsvfs.z_os = os; + zfsvfs.z_assign = TXG_NOWAIT; + zfsvfs.z_parent = &zfsvfs; + + mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs.z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + + zfs_mknode(rootzp, &vattr, &roid, tx, cr, IS_ROOT_NODE, NULL, 0); + ASSERT3U(rootzp->z_id, ==, roid); + error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &roid, tx); + ASSERT(error == 0); + + ZTOV(rootzp)->v_count = 0; + kmem_cache_free(znode_cache, rootzp); +} diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c new file mode 100644 index 0000000000..1adc8ca3df --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zil.c @@ -0,0 +1,1242 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/dmu.h> +#include <sys/zap.h> +#include <sys/arc.h> +#include <sys/stat.h> +#include <sys/resource.h> +#include <sys/zil.h> +#include <sys/zil_impl.h> +#include <sys/dsl_dataset.h> +#include <sys/vdev.h> + + +/* + * The zfs intent log (ZIL) saves transaction records of system calls + * that change the file system in memory with enough information + * to be able to replay them. These are stored in memory until + * either the DMU transaction group (txg) commits them to the stable pool + * and they can be discarded, or they are flushed to the stable log + * (also in the pool) due to a fsync, O_DSYNC or other synchronous + * requirement. In the event of a panic or power fail then those log + * records (transactions) are replayed. + * + * There is one ZIL per file system. Its on-disk (pool) format consists + * of 3 parts: + * + * - ZIL header + * - ZIL blocks + * - ZIL records + * + * A log record holds a system call transaction. Log blocks can + * hold many log records and the blocks are chained together. + * Each ZIL block contains a block pointer (blkptr_t) to the next + * ZIL block in the chain. The ZIL header points to the first + * block in the chain. Note there is not a fixed place in the pool + * to hold blocks. They are dynamically allocated and freed as + * needed from the blocks available. 
Figure X shows the ZIL structure: + */ + +/* + * These global ZIL switches affect all pools + */ +int zil_disable = 0; /* disable intent logging */ +int zil_always = 0; /* make every transaction synchronous */ +int zil_purge = 0; /* at pool open, just throw everything away */ +int zil_noflush = 0; /* don't flush write cache buffers on disks */ + +static kmem_cache_t *zil_lwb_cache; + +static int +zil_dva_compare(const void *x1, const void *x2) +{ + const dva_t *dva1 = x1; + const dva_t *dva2 = x2; + + if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2)) + return (-1); + if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2)) + return (1); + + if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2)) + return (-1); + if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2)) + return (1); + + return (0); +} + +static void +zil_dva_tree_init(avl_tree_t *t) +{ + avl_create(t, zil_dva_compare, sizeof (zil_dva_node_t), + offsetof(zil_dva_node_t, zn_node)); +} + +static void +zil_dva_tree_fini(avl_tree_t *t) +{ + zil_dva_node_t *zn; + void *cookie = NULL; + + while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(zn, sizeof (zil_dva_node_t)); + + avl_destroy(t); +} + +static int +zil_dva_tree_add(avl_tree_t *t, dva_t *dva) +{ + zil_dva_node_t *zn; + avl_index_t where; + + if (avl_find(t, dva, &where) != NULL) + return (EEXIST); + + zn = kmem_alloc(sizeof (zil_dva_node_t), KM_SLEEP); + zn->zn_dva = *dva; + avl_insert(t, zn, where); + + return (0); +} + +/* + * Read a log block, make sure it's valid, and byteswap it if necessary. 
+ */ +static int +zil_read_log_block(zilog_t *zilog, blkptr_t *bp, char *buf) +{ + uint64_t blksz = BP_GET_LSIZE(bp); + zil_trailer_t *ztp = (zil_trailer_t *)(buf + blksz) - 1; + zio_cksum_t cksum; + int error; + + error = zio_wait(zio_read(NULL, zilog->zl_spa, bp, buf, blksz, + NULL, NULL, ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE)); + if (error) { + dprintf_bp(bp, "zilog %p bp %p read failed, error %d: ", + zilog, bp, error); + return (error); + } + + if (BP_SHOULD_BYTESWAP(bp)) + byteswap_uint64_array(buf, blksz); + + /* + * Sequence numbers should be... sequential. The checksum verifier for + * the next block should be: <logid[0], logid[1], objset id, seq + 1>. + */ + cksum = bp->blk_cksum; + cksum.zc_word[3]++; + if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum)) != 0) { + dprintf_bp(bp, "zilog %p bp %p stale pointer: ", zilog, bp); + return (ESTALE); + } + + if (BP_IS_HOLE(&ztp->zit_next_blk)) { + dprintf_bp(bp, "zilog %p bp %p hole: ", zilog, bp); + return (ENOENT); + } + + if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t))) { + dprintf("zilog %p bp %p nused exceeds blksz\n", zilog, bp); + return (EOVERFLOW); + } + + dprintf_bp(bp, "zilog %p bp %p good block: ", zilog, bp); + + return (0); +} + +/* + * Parse the intent log, and call parse_func for each valid record within. + */ +void +zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, + zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg) +{ + blkptr_t blk; + char *lrbuf, *lrp; + zil_trailer_t *ztp; + int reclen, error; + + blk = zilog->zl_header->zh_log; + if (BP_IS_HOLE(&blk)) + return; + + /* + * Starting at the block pointed to by zh_log we read the log chain. + * For each block in the chain we strongly check that block to + * ensure its validity. We stop when an invalid block is found. + * For each block pointer in the chain we call parse_blk_func(). + * For each record in each valid block we call parse_lr_func(). 
+ */ + zil_dva_tree_init(&zilog->zl_dva_tree); + lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE); + for (;;) { + error = zil_read_log_block(zilog, &blk, lrbuf); + + if (parse_blk_func != NULL) + parse_blk_func(zilog, &blk, arg, txg); + + if (error) + break; + + ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1; + blk = ztp->zit_next_blk; + + if (parse_lr_func == NULL) + continue; + + for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) { + lr_t *lr = (lr_t *)lrp; + reclen = lr->lrc_reclen; + ASSERT3U(reclen, >=, sizeof (lr_t)); + parse_lr_func(zilog, lr, arg, txg); + } + } + zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE); + zil_dva_tree_fini(&zilog->zl_dva_tree); +} + +/* ARGSUSED */ +static void +zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) +{ + spa_t *spa = zilog->zl_spa; + int err; + + dprintf_bp(bp, "first_txg %llu: ", first_txg); + + /* + * Claim log block if not already committed and not already claimed. + */ + if (bp->blk_birth >= first_txg && + zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) { + err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL)); + ASSERT(err == 0); + } +} + +static void +zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) +{ + if (lrc->lrc_txtype == TX_WRITE) { + lr_write_t *lr = (lr_write_t *)lrc; + zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg); + } +} + +/* ARGSUSED */ +static void +zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg) +{ + zio_free_blk(zilog->zl_spa, bp, dmu_tx_get_txg(tx)); +} + +static void +zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg) +{ + /* + * If we previously claimed it, we need to free it. 
+ */ + if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE) { + lr_write_t *lr = (lr_write_t *)lrc; + blkptr_t *bp = &lr->lr_blkptr; + if (bp->blk_birth >= claim_txg && + !zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp))) { + (void) arc_free(NULL, zilog->zl_spa, + dmu_tx_get_txg(tx), bp, NULL, NULL, ARC_WAIT); + } + } +} + +/* + * Create an on-disk intent log. + */ +static void +zil_create(zilog_t *zilog) +{ + lwb_t *lwb; + uint64_t txg; + dmu_tx_t *tx; + blkptr_t blk; + int error; + + ASSERT(zilog->zl_header->zh_claim_txg == 0); + ASSERT(zilog->zl_header->zh_replay_seq == 0); + + /* + * Initialize the log header block. + */ + tx = dmu_tx_create(zilog->zl_os); + (void) dmu_tx_assign(tx, TXG_WAIT); + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + txg = dmu_tx_get_txg(tx); + + /* + * Allocate the first log block and assign its checksum verifier. + */ + error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG, + ZIL_MIN_BLKSZ, &blk, txg); + if (error == 0) { + ZIO_SET_CHECKSUM(&blk.blk_cksum, + spa_get_random(-1ULL), spa_get_random(-1ULL), + dmu_objset_id(zilog->zl_os), 1ULL); + + /* + * Allocate a log write buffer (lwb) for the first log block. + */ + lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); + lwb->lwb_zilog = zilog; + lwb->lwb_blk = blk; + lwb->lwb_nused = 0; + lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk); + lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz); + lwb->lwb_max_txg = txg; + lwb->lwb_seq = 0; + lwb->lwb_state = UNWRITTEN; + mutex_enter(&zilog->zl_lock); + list_insert_tail(&zilog->zl_lwb_list, lwb); + mutex_exit(&zilog->zl_lock); + } + + dmu_tx_commit(tx); + txg_wait_synced(zilog->zl_dmu_pool, txg); +} + +/* + * In one tx, free all log blocks and clear the log header. 
+ */ +void +zil_destroy(zilog_t *zilog) +{ + dmu_tx_t *tx; + uint64_t txg; + + mutex_enter(&zilog->zl_destroy_lock); + + if (BP_IS_HOLE(&zilog->zl_header->zh_log)) { + mutex_exit(&zilog->zl_destroy_lock); + return; + } + + tx = dmu_tx_create(zilog->zl_os); + (void) dmu_tx_assign(tx, TXG_WAIT); + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + txg = dmu_tx_get_txg(tx); + + zil_parse(zilog, zil_free_log_block, zil_free_log_record, tx, + zilog->zl_header->zh_claim_txg); + zilog->zl_destroy_txg = txg; + + dmu_tx_commit(tx); + txg_wait_synced(zilog->zl_dmu_pool, txg); + + mutex_exit(&zilog->zl_destroy_lock); +} + +void +zil_claim(char *osname, void *txarg) +{ + dmu_tx_t *tx = txarg; + uint64_t first_txg = dmu_tx_get_txg(tx); + zilog_t *zilog; + zil_header_t *zh; + objset_t *os; + int error; + + error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_STANDARD, &os); + if (error) { + cmn_err(CE_WARN, "can't process intent log for %s", osname); + return; + } + + zilog = dmu_objset_zil(os); + zh = zilog->zl_header; + + /* + * Claim all log blocks if we haven't already done so. 
+ */ + ASSERT3U(zh->zh_claim_txg, <=, first_txg); + if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { + zh->zh_claim_txg = first_txg; + zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, + tx, first_txg); + dsl_dataset_dirty(dmu_objset_ds(os), tx); + } + ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); + dmu_objset_close(os); +} + +void +zil_add_vdev(zilog_t *zilog, uint64_t vdev, uint64_t seq) +{ + zil_vdev_t *zv; + + if (zil_noflush) + return; + + ASSERT(MUTEX_HELD(&zilog->zl_lock)); + zv = kmem_alloc(sizeof (zil_vdev_t), KM_SLEEP); + zv->vdev = vdev; + zv->seq = seq; + list_insert_tail(&zilog->zl_vdev_list, zv); +} + + +void +zil_flush_vdevs(zilog_t *zilog, uint64_t seq) +{ + vdev_t *vd; + zil_vdev_t *zv, *zv2; + zio_t *zio; + spa_t *spa; + uint64_t vdev; + + if (zil_noflush) + return; + + ASSERT(MUTEX_HELD(&zilog->zl_lock)); + + spa = zilog->zl_spa; + zio = NULL; + + while ((zv = list_head(&zilog->zl_vdev_list)) != NULL && + zv->seq <= seq) { + vdev = zv->vdev; + list_remove(&zilog->zl_vdev_list, zv); + kmem_free(zv, sizeof (zil_vdev_t)); + + /* + * remove all chained entries <= seq with same vdev + */ + zv = list_head(&zilog->zl_vdev_list); + while (zv && zv->seq <= seq) { + zv2 = list_next(&zilog->zl_vdev_list, zv); + if (zv->vdev == vdev) { + list_remove(&zilog->zl_vdev_list, zv); + kmem_free(zv, sizeof (zil_vdev_t)); + } + zv = zv2; + } + + /* flush the write cache for this vdev */ + mutex_exit(&zilog->zl_lock); + if (zio == NULL) + zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + vd = vdev_lookup_top(spa, vdev); + ASSERT(vd); + (void) zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE, + NULL, NULL, ZIO_PRIORITY_NOW, + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); + mutex_enter(&zilog->zl_lock); + } + + /* + * Wait for all the flushes to complete. Not all devices actually + * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails. 
+ */ + if (zio != NULL) + (void) zio_wait(zio); +} + +/* + * Function called when a log block write completes + */ +static void +zil_lwb_write_done(zio_t *zio) +{ + lwb_t *prev; + lwb_t *lwb = zio->io_private; + zilog_t *zilog = lwb->lwb_zilog; + uint64_t max_seq; + + /* + * Now that we've written this log block, we have a stable pointer + * to the next block in the chain, so it's OK to let the txg in + * which we allocated the next block sync. + */ + txg_rele_to_sync(&lwb->lwb_txgh); + + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + mutex_enter(&zilog->zl_lock); + lwb->lwb_buf = NULL; + if (zio->io_error) { + zilog->zl_log_error = B_TRUE; + mutex_exit(&zilog->zl_lock); + cv_broadcast(&zilog->zl_cv_seq); + return; + } + + prev = list_prev(&zilog->zl_lwb_list, lwb); + if (prev && prev->lwb_state != SEQ_COMPLETE) { + /* There's an unwritten buffer in the chain before this one */ + lwb->lwb_state = SEQ_INCOMPLETE; + mutex_exit(&zilog->zl_lock); + return; + } + + max_seq = lwb->lwb_seq; + lwb->lwb_state = SEQ_COMPLETE; + /* + * We must also follow up the chain for already written buffers + * to see if we can set zl_ss_seq even higher. + */ + while (lwb = list_next(&zilog->zl_lwb_list, lwb)) { + if (lwb->lwb_state != SEQ_INCOMPLETE) + break; + lwb->lwb_state = SEQ_COMPLETE; + /* lwb_seq will be zero if we've written an empty buffer */ + if (lwb->lwb_seq) { + ASSERT3U(max_seq, <, lwb->lwb_seq); + max_seq = lwb->lwb_seq; + } + } + zilog->zl_ss_seq = MAX(max_seq, zilog->zl_ss_seq); + mutex_exit(&zilog->zl_lock); + cv_broadcast(&zilog->zl_cv_seq); +} + +/* + * Start a log block write and advance to the next log block. + * Calls are serialized. 
+ */ +static lwb_t * +zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) +{ + lwb_t *nlwb; + zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1; + uint64_t txg; + uint64_t zil_blksz; + int error; + + ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb)); + + /* + * Allocate the next block and save its address in this block + * before writing it in order to establish the log chain. + * Note that if the allocation of nlwb synced before we wrote + * the block that points at it (lwb), we'd leak it if we crashed. + * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done(). + */ + txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh); + txg_rele_to_quiesce(&lwb->lwb_txgh); + + /* + * Pick a ZIL blocksize based upon the size of the outstanding + * in-memory transactions, or if none the same size as the + * last block. + */ + if (zilog->zl_itx_list_sz) { + zil_blksz = zilog->zl_itx_list_sz + sizeof (*ztp); + zil_blksz = P2ROUNDUP(zil_blksz, ZIL_MIN_BLKSZ); + if (zil_blksz > ZIL_MAX_BLKSZ) + zil_blksz = ZIL_MAX_BLKSZ; + zilog->zl_prev_blk_sz = zil_blksz; + } else { + zil_blksz = zilog->zl_prev_blk_sz; + } + + error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG, + zil_blksz, &ztp->zit_next_blk, txg); + if (error) { + txg_rele_to_sync(&lwb->lwb_txgh); + return (NULL); + } + + ASSERT3U(ztp->zit_next_blk.blk_birth, ==, txg); + ztp->zit_nused = lwb->lwb_nused; + ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum; + ztp->zit_next_blk.blk_cksum = lwb->lwb_blk.blk_cksum; + ztp->zit_next_blk.blk_cksum.zc_word[3]++; + + /* + * Allocate a new log write buffer (lwb). 
+ */ + nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); + + nlwb->lwb_zilog = zilog; + nlwb->lwb_blk = ztp->zit_next_blk; + nlwb->lwb_nused = 0; + nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk); + nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz); + nlwb->lwb_max_txg = txg; + nlwb->lwb_seq = 0; + nlwb->lwb_state = UNWRITTEN; + + /* + * Put new lwb at the end of the log chain, + * and record the vdev for later flushing + */ + mutex_enter(&zilog->zl_lock); + list_insert_tail(&zilog->zl_lwb_list, nlwb); + zil_add_vdev(zilog, DVA_GET_VDEV(BP_IDENTITY(&(lwb->lwb_blk))), + lwb->lwb_seq); + mutex_exit(&zilog->zl_lock); + + /* + * write the old log block + */ + dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg); + zio_nowait(zio_rewrite(NULL, zilog->zl_spa, ZIO_CHECKSUM_ZILOG, 0, + &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz, zil_lwb_write_done, lwb, + ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED)); + + return (nlwb); +} + +static lwb_t * +zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) +{ + lr_t *lrc = &itx->itx_lr; /* common log record */ + uint64_t seq = lrc->lrc_seq; + uint64_t txg = lrc->lrc_txg; + uint64_t reclen = lrc->lrc_reclen; + int error; + + if (lwb == NULL) + return (NULL); + ASSERT(lwb->lwb_buf != NULL); + + /* + * If it's a write, fetch the data or get its blkptr as appropriate. 
+ */ + if (lrc->lrc_txtype == TX_WRITE) { + lr_write_t *lr = (lr_write_t *)lrc; + if (txg > spa_freeze_txg(zilog->zl_spa)) + txg_wait_synced(zilog->zl_dmu_pool, txg); + + if (!itx->itx_data_copied && + (error = zilog->zl_get_data(itx->itx_private, lr)) != 0) { + if (error != ENOENT && error != EALREADY) { + txg_wait_synced(zilog->zl_dmu_pool, txg); + mutex_enter(&zilog->zl_lock); + zilog->zl_ss_seq = MAX(seq, zilog->zl_ss_seq); + zil_add_vdev(zilog, + DVA_GET_VDEV(BP_IDENTITY(&(lr->lr_blkptr))), + seq); + mutex_exit(&zilog->zl_lock); + return (lwb); + } + mutex_enter(&zilog->zl_lock); + zil_add_vdev(zilog, + DVA_GET_VDEV(BP_IDENTITY(&(lr->lr_blkptr))), seq); + mutex_exit(&zilog->zl_lock); + return (lwb); + } + } + + /* + * If this record won't fit in the current log block, start a new one. + */ + if (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)) { + lwb = zil_lwb_write_start(zilog, lwb); + if (lwb == NULL) + return (NULL); + if (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)) { + txg_wait_synced(zilog->zl_dmu_pool, txg); + mutex_enter(&zilog->zl_lock); + zilog->zl_ss_seq = MAX(seq, zilog->zl_ss_seq); + mutex_exit(&zilog->zl_lock); + return (lwb); + } + } + + bcopy(lrc, lwb->lwb_buf + lwb->lwb_nused, reclen); + lwb->lwb_nused += reclen; + lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg); + ASSERT3U(lwb->lwb_seq, <, seq); + lwb->lwb_seq = seq; + ASSERT3U(lwb->lwb_nused, <=, ZIL_BLK_DATA_SZ(lwb)); + ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0); + + return (lwb); +} + +itx_t * +zil_itx_create(int txtype, size_t lrsize) +{ + itx_t *itx; + + lrsize = P2ROUNDUP(lrsize, sizeof (uint64_t)); + + itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP); + itx->itx_lr.lrc_txtype = txtype; + itx->itx_lr.lrc_reclen = lrsize; + itx->itx_lr.lrc_seq = 0; /* defensive */ + + return (itx); +} + +uint64_t +zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) +{ + uint64_t seq; + + ASSERT(itx->itx_lr.lrc_seq == 0); + + mutex_enter(&zilog->zl_lock); + 
list_insert_tail(&zilog->zl_itx_list, itx); + zilog->zl_itx_list_sz += itx->itx_lr.lrc_reclen; + itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); + itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq; + mutex_exit(&zilog->zl_lock); + + return (seq); +} + +/* + * Free up all in-memory intent log transactions that have now been synced. + */ +static void +zil_itx_clean(zilog_t *zilog) +{ + uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa); + uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa); + uint64_t max_seq = 0; + itx_t *itx; + + mutex_enter(&zilog->zl_lock); + while ((itx = list_head(&zilog->zl_itx_list)) != NULL && + itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) { + list_remove(&zilog->zl_itx_list, itx); + zilog->zl_itx_list_sz -= itx->itx_lr.lrc_reclen; + ASSERT3U(max_seq, <, itx->itx_lr.lrc_seq); + max_seq = itx->itx_lr.lrc_seq; + kmem_free(itx, offsetof(itx_t, itx_lr) + + itx->itx_lr.lrc_reclen); + } + if (max_seq > zilog->zl_ss_seq) { + zilog->zl_ss_seq = max_seq; + cv_broadcast(&zilog->zl_cv_seq); + } + mutex_exit(&zilog->zl_lock); +} + +void +zil_clean(zilog_t *zilog) +{ + /* + * Check for any log blocks that can be freed. + * Log blocks are only freed when the log block allocation and + * log records contained within are both known to be committed. + */ + mutex_enter(&zilog->zl_lock); + if (list_head(&zilog->zl_itx_list) != NULL) + (void) taskq_dispatch(zilog->zl_clean_taskq, + (void (*)(void *))zil_itx_clean, zilog, TQ_NOSLEEP); + mutex_exit(&zilog->zl_lock); +} + +/* + * Push zfs transactions to stable storage up to the supplied sequence number. 
+ */ +void +zil_commit(zilog_t *zilog, uint64_t seq, int ioflag) +{ + uint64_t txg; + uint64_t max_seq; + uint64_t reclen; + itx_t *itx; + lwb_t *lwb; + spa_t *spa; + + if (zilog == NULL || seq == 0 || + ((ioflag & (FSYNC | FDSYNC | FRSYNC)) == 0 && !zil_always)) + return; + + spa = zilog->zl_spa; + mutex_enter(&zilog->zl_lock); + + seq = MIN(seq, zilog->zl_itx_seq); /* cap seq at largest itx seq */ + + for (;;) { + if (zilog->zl_ss_seq >= seq) { /* already on stable storage */ + cv_signal(&zilog->zl_cv_write); + mutex_exit(&zilog->zl_lock); + return; + } + + if (zilog->zl_writer == B_FALSE) /* no one writing, do it */ + break; + + cv_wait(&zilog->zl_cv_write, &zilog->zl_lock); + } + + zilog->zl_writer = B_TRUE; + max_seq = 0; + + if (zilog->zl_suspend) { + lwb = NULL; + } else { + lwb = list_tail(&zilog->zl_lwb_list); + if (lwb == NULL) { + mutex_exit(&zilog->zl_lock); + zil_create(zilog); + mutex_enter(&zilog->zl_lock); + lwb = list_tail(&zilog->zl_lwb_list); + } + } + + /* + * Loop through in-memory log transactions filling log blocks, + * until we reach the given sequence number and there's no more + * room in the write buffer. 
+ */ + for (;;) { + itx = list_head(&zilog->zl_itx_list); + if (itx == NULL) + break; + + reclen = itx->itx_lr.lrc_reclen; + if ((itx->itx_lr.lrc_seq > seq) && + ((lwb == NULL) || (lwb->lwb_nused + reclen > + ZIL_BLK_DATA_SZ(lwb)))) + break; + + list_remove(&zilog->zl_itx_list, itx); + txg = itx->itx_lr.lrc_txg; + ASSERT(txg); + + mutex_exit(&zilog->zl_lock); + if (txg > spa_last_synced_txg(spa) || + txg > spa_freeze_txg(spa)) + lwb = zil_lwb_commit(zilog, itx, lwb); + else + max_seq = itx->itx_lr.lrc_seq; + kmem_free(itx, offsetof(itx_t, itx_lr) + + itx->itx_lr.lrc_reclen); + mutex_enter(&zilog->zl_lock); + zilog->zl_itx_list_sz -= reclen; + } + + mutex_exit(&zilog->zl_lock); + + /* write the last block out */ + if (lwb != NULL && lwb->lwb_nused != 0) + lwb = zil_lwb_write_start(zilog, lwb); + + /* wake up others waiting to start a write */ + mutex_enter(&zilog->zl_lock); + zilog->zl_writer = B_FALSE; + cv_signal(&zilog->zl_cv_write); + + if (max_seq > zilog->zl_ss_seq) { + zilog->zl_ss_seq = max_seq; + cv_broadcast(&zilog->zl_cv_seq); + } + /* + * Wait if necessary for our seq to be committed. + */ + if (lwb) { + while (zilog->zl_ss_seq < seq && zilog->zl_log_error == 0) + cv_wait(&zilog->zl_cv_seq, &zilog->zl_lock); + zil_flush_vdevs(zilog, seq); + } + if (zilog->zl_log_error || lwb == NULL) { + zilog->zl_log_error = 0; + max_seq = zilog->zl_itx_seq; + mutex_exit(&zilog->zl_lock); + txg_wait_synced(zilog->zl_dmu_pool, 0); + mutex_enter(&zilog->zl_lock); + zilog->zl_ss_seq = MAX(max_seq, zilog->zl_ss_seq); + cv_broadcast(&zilog->zl_cv_seq); + } + mutex_exit(&zilog->zl_lock); +} + +/* + * Called in syncing context to free committed log blocks and update log header. 
+ */ +void +zil_sync(zilog_t *zilog, dmu_tx_t *tx) +{ + uint64_t txg = dmu_tx_get_txg(tx); + spa_t *spa = zilog->zl_spa; + lwb_t *lwb; + + ASSERT(zilog->zl_stop_sync == 0); + + zilog->zl_header->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK]; + + if (zilog->zl_destroy_txg == txg) { + bzero(zilog->zl_header, sizeof (zil_header_t)); + bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq)); + zilog->zl_destroy_txg = 0; + } + + mutex_enter(&zilog->zl_lock); + for (;;) { + lwb = list_head(&zilog->zl_lwb_list); + if (lwb == NULL) { + mutex_exit(&zilog->zl_lock); + return; + } + if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) + break; + list_remove(&zilog->zl_lwb_list, lwb); + zio_free_blk(spa, &lwb->lwb_blk, txg); + kmem_cache_free(zil_lwb_cache, lwb); + } + zilog->zl_header->zh_log = lwb->lwb_blk; + mutex_exit(&zilog->zl_lock); +} + +void +zil_init(void) +{ + zil_lwb_cache = kmem_cache_create("zil_lwb_cache", + sizeof (struct lwb), NULL, NULL, NULL, NULL, NULL, NULL, 0); +} + +void +zil_fini(void) +{ + kmem_cache_destroy(zil_lwb_cache); +} + +zilog_t * +zil_alloc(objset_t *os, zil_header_t *zh_phys) +{ + zilog_t *zilog; + + zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); + + zilog->zl_header = zh_phys; + zilog->zl_os = os; + zilog->zl_spa = dmu_objset_spa(os); + zilog->zl_dmu_pool = dmu_objset_pool(os); + zilog->zl_prev_blk_sz = ZIL_MIN_BLKSZ; + + list_create(&zilog->zl_itx_list, sizeof (itx_t), + offsetof(itx_t, itx_node)); + + list_create(&zilog->zl_lwb_list, sizeof (lwb_t), + offsetof(lwb_t, lwb_node)); + + list_create(&zilog->zl_vdev_list, sizeof (zil_vdev_t), + offsetof(zil_vdev_t, vdev_seq_node)); + + return (zilog); +} + +void +zil_free(zilog_t *zilog) +{ + lwb_t *lwb; + zil_vdev_t *zv; + + zilog->zl_stop_sync = 1; + + while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { + list_remove(&zilog->zl_lwb_list, lwb); + if (lwb->lwb_buf != NULL) + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + kmem_cache_free(zil_lwb_cache, lwb); + } + 
list_destroy(&zilog->zl_lwb_list); + + while ((zv = list_head(&zilog->zl_vdev_list)) != NULL) { + list_remove(&zilog->zl_vdev_list, zv); + kmem_free(zv, sizeof (zil_vdev_t)); + } + list_destroy(&zilog->zl_vdev_list); + + ASSERT(list_head(&zilog->zl_itx_list) == NULL); + list_destroy(&zilog->zl_itx_list); + + kmem_free(zilog, sizeof (zilog_t)); +} + +/* + * Open an intent log. + */ +zilog_t * +zil_open(objset_t *os, zil_get_data_t *get_data) +{ + zilog_t *zilog = dmu_objset_zil(os); + + zilog->zl_get_data = get_data; + zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri, + 2, 2, TASKQ_PREPOPULATE); + + return (zilog); +} + +/* + * Close an intent log. + */ +void +zil_close(zilog_t *zilog) +{ + txg_wait_synced(zilog->zl_dmu_pool, 0); + taskq_destroy(zilog->zl_clean_taskq); + zilog->zl_clean_taskq = NULL; + zilog->zl_get_data = NULL; + + zil_itx_clean(zilog); + ASSERT(list_head(&zilog->zl_itx_list) == NULL); +} + +/* + * Suspend an intent log. While in suspended mode, we still honor + * synchronous semantics, but we rely on txg_wait_synced() to do it. + * We suspend the log briefly when taking a snapshot so that the snapshot + * contains all the data it's supposed to, and has an empty intent log. + */ +int +zil_suspend(zilog_t *zilog) +{ + lwb_t *lwb; + + mutex_enter(&zilog->zl_lock); + if (zilog->zl_header->zh_claim_txg != 0) { /* unplayed log */ + mutex_exit(&zilog->zl_lock); + return (EBUSY); + } + zilog->zl_suspend++; + mutex_exit(&zilog->zl_lock); + + zil_commit(zilog, UINT64_MAX, FSYNC); + + mutex_enter(&zilog->zl_lock); + while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { + if (lwb->lwb_buf != NULL) { + /* + * Wait for the buffer if it's in the process of + * being written. 
+ */ + if ((lwb->lwb_seq != 0) && + (lwb->lwb_state != SEQ_COMPLETE)) { + cv_wait(&zilog->zl_cv_seq, &zilog->zl_lock); + continue; + } + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + } + list_remove(&zilog->zl_lwb_list, lwb); + kmem_cache_free(zil_lwb_cache, lwb); + } + mutex_exit(&zilog->zl_lock); + + zil_destroy(zilog); + + return (0); +} + +void +zil_resume(zilog_t *zilog) +{ + mutex_enter(&zilog->zl_lock); + ASSERT(zilog->zl_suspend != 0); + zilog->zl_suspend--; + mutex_exit(&zilog->zl_lock); +} + +typedef struct zil_replay_arg { + objset_t *zr_os; + zil_replay_func_t **zr_replay; + void *zr_arg; + void (*zr_rm_sync)(void *arg); + uint64_t *zr_txgp; + boolean_t zr_byteswap; + char *zr_lrbuf; +} zil_replay_arg_t; + +static void +zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) +{ + zil_replay_arg_t *zr = zra; + zil_header_t *zh = zilog->zl_header; + uint64_t reclen = lr->lrc_reclen; + uint64_t txtype = lr->lrc_txtype; + int pass, error; + + if (zilog->zl_stop_replay) + return; + + if (lr->lrc_txg < claim_txg) /* already committed */ + return; + + if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ + return; + + /* + * Make a copy of the data so we can revise and extend it. + */ + bcopy(lr, zr->zr_lrbuf, reclen); + + /* + * The log block containing this lr may have been byteswapped + * so that we can easily examine common fields like lrc_txtype. + * However, the log is a mix of different data types, and only the + * replay vectors know how to byteswap their records. Therefore, if + * the lr was byteswapped, undo it before invoking the replay vector. + */ + if (zr->zr_byteswap) + byteswap_uint64_array(zr->zr_lrbuf, reclen); + + /* + * If this is a TX_WRITE with a blkptr, suck in the data. 
+ */ + if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { + lr_write_t *lrw = (lr_write_t *)lr; + blkptr_t *wbp = &lrw->lr_blkptr; + uint64_t wlen = lrw->lr_length; + char *wbuf = zr->zr_lrbuf + reclen; + + if (BP_IS_HOLE(wbp)) { /* compressed to a hole */ + bzero(wbuf, wlen); + } else { + /* + * A subsequent write may have overwritten this block, + * in which case wbp may have been been freed and + * reallocated, and our read of wbp may fail with a + * checksum error. We can safely ignore this because + * the later write will provide the correct data. + */ + (void) zio_wait(zio_read(NULL, zilog->zl_spa, + wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL, + ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE)); + (void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen); + } + } + + /* + * We must now do two things atomically: replay this log record, + * and update the log header to reflect the fact that we did so. + * We use the DMU's ability to assign into a specific txg to do this. + */ + for (pass = 1; /* CONSTANTCONDITION */; pass++) { + uint64_t replay_txg; + dmu_tx_t *replay_tx; + + replay_tx = dmu_tx_create(zr->zr_os); + error = dmu_tx_assign(replay_tx, TXG_WAIT); + if (error) { + dmu_tx_abort(replay_tx); + break; + } + + replay_txg = dmu_tx_get_txg(replay_tx); + + if (txtype == 0 || txtype >= TX_MAX_TYPE) { + error = EINVAL; + } else { + /* + * On the first pass, arrange for the replay vector + * to fail its dmu_tx_assign(). That's the only way + * to ensure that those code paths remain well tested. 
+ */ + *zr->zr_txgp = replay_txg - (pass == 1); + error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf, + zr->zr_byteswap); + *zr->zr_txgp = TXG_NOWAIT; + } + + if (error == 0) { + dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx); + zilog->zl_replay_seq[replay_txg & TXG_MASK] = + lr->lrc_seq; + } + + dmu_tx_commit(replay_tx); + + if (error != ERESTART) + break; + + if (pass != 1) + txg_wait_open(spa_get_dsl(zilog->zl_spa), + replay_txg + 1); + + dprintf("pass %d, retrying\n", pass); + } + + if (error) { + char *name = kmem_alloc(MAXNAMELEN, KM_SLEEP); + dmu_objset_name(zr->zr_os, name); + cmn_err(CE_WARN, "ZFS replay transaction error %d, " + "dataset %s, seq 0x%llx, txtype %llu\n", + error, name, + (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype); + zilog->zl_stop_replay = 1; + kmem_free(name, MAXNAMELEN); + } + + /* + * The DMU's dnode layer doesn't see removes until the txg commits, + * so a subsequent claim can spuriously fail with EEXIST. + * To prevent this, if we might have removed an object, + * wait for the delete thread to delete it, and then + * wait for the transaction group to sync. + */ + if (txtype == TX_REMOVE || txtype == TX_RMDIR || txtype == TX_RENAME) { + if (zr->zr_rm_sync != NULL) + zr->zr_rm_sync(zr->zr_arg); + txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); + } +} + +/* + * If this dataset has an intent log, replay it and destroy it. + */ +void +zil_replay(objset_t *os, void *arg, uint64_t *txgp, + zil_replay_func_t *replay_func[TX_MAX_TYPE], void (*rm_sync)(void *arg)) +{ + zilog_t *zilog = dmu_objset_zil(os); + zil_replay_arg_t zr; + + zr.zr_os = os; + zr.zr_replay = replay_func; + zr.zr_arg = arg; + zr.zr_rm_sync = rm_sync; + zr.zr_txgp = txgp; + zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zilog->zl_header->zh_log); + zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); + + /* + * Wait for in-progress removes to sync before starting replay. 
+ */ + if (rm_sync != NULL) + rm_sync(arg); + txg_wait_synced(zilog->zl_dmu_pool, 0); + + zilog->zl_stop_replay = 0; + zil_parse(zilog, NULL, zil_replay_log_record, &zr, + zilog->zl_header->zh_claim_txg); + kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE); + + zil_destroy(zilog); +} diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c new file mode 100644 index 0000000000..7323292859 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -0,0 +1,1698 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/spa_impl.h> +#include <sys/vdev_impl.h> +#include <sys/zio_impl.h> +#include <sys/zio_compress.h> +#include <sys/zio_checksum.h> + +static void zio_vdev_io_enter(zio_t *zio); +static void zio_vdev_io_exit(zio_t *zio); + +/* + * ========================================================================== + * I/O priority table + * ========================================================================== + */ +uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { + 0, /* ZIO_PRIORITY_NOW */ + 0, /* ZIO_PRIORITY_SYNC_READ */ + 0, /* ZIO_PRIORITY_SYNC_WRITE */ + 6, /* ZIO_PRIORITY_ASYNC_READ */ + 4, /* ZIO_PRIORITY_ASYNC_WRITE */ + 4, /* ZIO_PRIORITY_FREE */ + 0, /* ZIO_PRIORITY_CACHE_FILL */ + 0, /* ZIO_PRIORITY_LOG_WRITE */ + 10, /* ZIO_PRIORITY_RESILVER */ + 20, /* ZIO_PRIORITY_SCRUB */ +}; + +/* + * ========================================================================== + * I/O type descriptions + * ========================================================================== + */ +char *zio_type_name[ZIO_TYPES] = { + "null", "read", "write", "free", "claim", "ioctl" }; + +/* At or above this size, force gang blocking - for testing */ +uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1; + +typedef struct zio_sync_pass { + int zp_defer_free; /* defer frees after this pass */ + int zp_dontcompress; /* don't compress after this pass */ + int zp_rewrite; /* rewrite new bps after this pass */ +} zio_sync_pass_t; + +zio_sync_pass_t zio_sync_pass = { + 1, /* zp_defer_free */ + 4, /* zp_dontcompress */ + 1, /* zp_rewrite */ +}; + +/* + * ========================================================================== + * I/O kmem caches + * ========================================================================== + */ +kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; + +void +zio_init(void) +{ + size_t c; + + /* + * For small 
buffers, we want a cache for each multiple of + * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache + * for each quarter-power of 2. For large buffers, we want + * a cache for each multiple of PAGESIZE. + */ + for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { + size_t size = (c + 1) << SPA_MINBLOCKSHIFT; + size_t p2 = size; + size_t align = 0; + + while (p2 & (p2 - 1)) + p2 &= p2 - 1; + + if (size <= 4 * SPA_MINBLOCKSIZE) { + align = SPA_MINBLOCKSIZE; + } else if (P2PHASE(size, PAGESIZE) == 0) { + align = PAGESIZE; + } else if (P2PHASE(size, p2 >> 2) == 0) { + align = p2 >> 2; + } + + if (align != 0) { + char name[30]; + (void) sprintf(name, "zio_buf_%lu", size); + zio_buf_cache[c] = kmem_cache_create(name, size, + align, NULL, NULL, NULL, NULL, NULL, 0); + dprintf("creating cache for size %5lx align %5lx\n", + size, align); + } + } + + while (--c != 0) { + ASSERT(zio_buf_cache[c] != NULL); + if (zio_buf_cache[c - 1] == NULL) + zio_buf_cache[c - 1] = zio_buf_cache[c]; + } +} + +void +zio_fini(void) +{ + size_t c; + kmem_cache_t *last_cache = NULL; + + for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { + if (zio_buf_cache[c] != last_cache) { + last_cache = zio_buf_cache[c]; + kmem_cache_destroy(zio_buf_cache[c]); + } + zio_buf_cache[c] = NULL; + } +} + +/* + * ========================================================================== + * Allocate and free I/O buffers + * ========================================================================== + */ +void * +zio_buf_alloc(size_t size) +{ + size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; + + ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP)); +} + +void +zio_buf_free(void *buf, size_t size) +{ + size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; + + ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + kmem_cache_free(zio_buf_cache[c], buf); +} + +/* + * ========================================================================== + * Push 
and pop I/O transform buffers + * ========================================================================== + */ +static void +zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize) +{ + zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); + + zt->zt_data = data; + zt->zt_size = size; + zt->zt_bufsize = bufsize; + + zt->zt_next = zio->io_transform_stack; + zio->io_transform_stack = zt; + + zio->io_data = data; + zio->io_size = size; +} + +static void +zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize) +{ + zio_transform_t *zt = zio->io_transform_stack; + + *data = zt->zt_data; + *size = zt->zt_size; + *bufsize = zt->zt_bufsize; + + zio->io_transform_stack = zt->zt_next; + kmem_free(zt, sizeof (zio_transform_t)); + + if ((zt = zio->io_transform_stack) != NULL) { + zio->io_data = zt->zt_data; + zio->io_size = zt->zt_size; + } +} + +static void +zio_clear_transform_stack(zio_t *zio) +{ + void *data; + uint64_t size, bufsize; + + ASSERT(zio->io_transform_stack != NULL); + + zio_pop_transform(zio, &data, &size, &bufsize); + while (zio->io_transform_stack != NULL) { + zio_buf_free(data, bufsize); + zio_pop_transform(zio, &data, &size, &bufsize); + } +} + +/* + * ========================================================================== + * Create the various types of I/O (read, write, free) + * ========================================================================== + */ +static zio_t * +zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + void *data, uint64_t size, zio_done_func_t *done, void *private, + zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline) +{ + zio_t *zio; + + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); + ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); + + zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); + zio->io_parent = pio; + zio->io_spa = spa; + zio->io_txg = txg; + if (bp != NULL) { + zio->io_bp = bp; + zio->io_bp_copy = *bp; + zio->io_bp_orig = *bp; + /* 
XXBP - Need to inherit this when it matters */ + zio->io_dva_index = 0; + } + zio->io_done = done; + zio->io_private = private; + zio->io_type = type; + zio->io_priority = priority; + zio->io_stage = stage; + zio->io_pipeline = pipeline; + zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES; + zio->io_timestamp = lbolt64; + zio->io_flags = flags; + zio_push_transform(zio, data, size, size); + + if (pio == NULL) { + if (!(flags & ZIO_FLAG_CONFIG_HELD)) + spa_config_enter(zio->io_spa, RW_READER); + zio->io_root = zio; + } else { + zio->io_root = pio->io_root; + + mutex_enter(&pio->io_lock); + if (stage < ZIO_STAGE_READY) + pio->io_children_notready++; + pio->io_children_notdone++; + zio->io_sibling_next = pio->io_child; + zio->io_sibling_prev = NULL; + if (pio->io_child != NULL) + pio->io_child->io_sibling_prev = zio; + pio->io_child = zio; + mutex_exit(&pio->io_lock); + } + + return (zio); +} + +zio_t * +zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, + int flags) +{ + zio_t *zio; + + zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, + ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN, + ZIO_WAIT_FOR_CHILDREN_PIPELINE); + + return (zio); +} + +zio_t * +zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) +{ + return (zio_null(NULL, spa, done, private, flags)); +} + +zio_t * +zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, + uint64_t size, zio_done_func_t *done, void *private, + int priority, int flags) +{ + zio_t *zio; + dva_t *dva; + + ASSERT3U(size, ==, BP_GET_LSIZE(bp)); + + zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private, + ZIO_TYPE_READ, priority, flags, ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); + + /* + * Work off our copy of the bp so the caller can free it. 
+ */ + zio->io_bp = &zio->io_bp_copy; + + bp = zio->io_bp; + dva = ZIO_GET_DVA(zio); + + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { + uint64_t csize = BP_GET_PSIZE(bp); + void *cbuf = zio_buf_alloc(csize); + + zio_push_transform(zio, cbuf, csize, csize); + zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; + } + + if (DVA_GET_GANG(dva)) { + uint64_t gsize = SPA_GANGBLOCKSIZE; + void *gbuf = zio_buf_alloc(gsize); + + zio_push_transform(zio, gbuf, gsize, gsize); + zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; + } + + return (zio); +} + +zio_t * +zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, + uint64_t txg, blkptr_t *bp, void *data, uint64_t size, + zio_done_func_t *done, void *private, int priority, int flags) +{ + zio_t *zio; + + ASSERT(checksum >= ZIO_CHECKSUM_OFF && + checksum < ZIO_CHECKSUM_FUNCTIONS); + + ASSERT(compress >= ZIO_COMPRESS_OFF && + compress < ZIO_COMPRESS_FUNCTIONS); + + zio = zio_create(pio, spa, txg, bp, data, size, done, private, + ZIO_TYPE_WRITE, priority, flags, + ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); + + zio->io_checksum = checksum; + zio->io_compress = compress; + + if (compress != ZIO_COMPRESS_OFF) + zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS; + + if (bp->blk_birth != txg) { + /* XXX the bp usually (always?) 
gets re-zeroed later */ + BP_ZERO(bp); + BP_SET_LSIZE(bp, size); + BP_SET_PSIZE(bp, size); + } + + return (zio); +} + +zio_t * +zio_rewrite(zio_t *pio, spa_t *spa, int checksum, + uint64_t txg, blkptr_t *bp, void *data, uint64_t size, + zio_done_func_t *done, void *private, int priority, int flags) +{ + zio_t *zio; + + /* XXBP - We need to re-evaluate when to insert pipeline stages */ + zio = zio_create(pio, spa, txg, bp, data, size, done, private, + ZIO_TYPE_WRITE, priority, flags, + ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); + + zio->io_checksum = checksum; + zio->io_compress = ZIO_COMPRESS_OFF; + + return (zio); +} + +static zio_t * +zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, + uint64_t txg, blkptr_t *bp, void *data, uint64_t size, + zio_done_func_t *done, void *private, int priority, int flags) +{ + zio_t *zio; + + BP_ZERO(bp); + BP_SET_LSIZE(bp, size); + BP_SET_PSIZE(bp, size); + BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + + zio = zio_create(pio, spa, txg, bp, data, size, done, private, + ZIO_TYPE_WRITE, priority, flags, + ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE); + + zio->io_checksum = checksum; + zio->io_compress = ZIO_COMPRESS_OFF; + + return (zio); +} + +zio_t * +zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + zio_done_func_t *done, void *private) +{ + zio_t *zio; + + ASSERT(!BP_IS_HOLE(bp)); + + if (txg == spa->spa_syncing_txg && + spa->spa_sync_pass > zio_sync_pass.zp_defer_free) { + bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); + return (zio_null(pio, spa, NULL, NULL, 0)); + } + + /* XXBP - We need to re-evaluate when to insert pipeline stages */ + zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, + ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, 0, + ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); + + zio->io_bp = &zio->io_bp_copy; + + return (zio); +} + +zio_t * +zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + zio_done_func_t *done, void *private) +{ + zio_t *zio; + + /* + * A claim is an allocation of a specific block. 
Claims are needed + * to support immediate writes in the intent log. The issue is that + * immediate writes contain committed data, but in a txg that was + * *not* committed. Upon opening the pool after an unclean shutdown, + * the intent log claims all blocks that contain immediate write data + * so that the SPA knows they're in use. + * + * All claims *must* be resolved in the first txg -- before the SPA + * starts allocating blocks -- so that nothing is allocated twice. + */ + ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); + ASSERT3U(spa_first_txg(spa), <=, txg); + + /* XXBP - We need to re-evaluate when to insert pipeline stages */ + zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, + ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0, + ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); + + zio->io_bp = &zio->io_bp_copy; + + return (zio); +} + +zio_t * +zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, + zio_done_func_t *done, void *private, int priority, int flags) +{ + zio_t *zio; + int c; + + if (vd->vdev_children == 0) { + zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, + ZIO_TYPE_IOCTL, priority, flags, + ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); + + zio->io_vd = vd; + zio->io_cmd = cmd; + } else { + zio = zio_null(pio, spa, NULL, NULL, flags); + + for (c = 0; c < vd->vdev_children; c++) + zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, + done, private, priority, flags)); + } + + return (zio); +} + +static void +zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size, + int checksum) +{ + ASSERT(vd->vdev_children == 0); + + ASSERT(size <= SPA_MAXBLOCKSIZE); + ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); + ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); + + ASSERT(offset + size <= VDEV_LABEL_START_SIZE || + offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); + ASSERT3U(offset + size, <=, vd->vdev_psize); + + BP_ZERO(bp); + + BP_SET_LSIZE(bp, size); + BP_SET_PSIZE(bp, size); + + BP_SET_CHECKSUM(bp, checksum); + 
BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + + if (checksum != ZIO_CHECKSUM_OFF) + ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0); +} + +zio_t * +zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, + void *data, int checksum, zio_done_func_t *done, void *private, + int priority, int flags) +{ + zio_t *zio; + blkptr_t blk; + + zio_phys_bp_init(vd, &blk, offset, size, checksum); + + zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, + ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, + ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); + + zio->io_vd = vd; + zio->io_offset = offset; + + /* + * Work off our copy of the bp so the caller can free it. + */ + zio->io_bp = &zio->io_bp_copy; + + return (zio); +} + +zio_t * +zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, + void *data, int checksum, zio_done_func_t *done, void *private, + int priority, int flags) +{ + zio_block_tail_t *zbt; + void *wbuf; + zio_t *zio; + blkptr_t blk; + + zio_phys_bp_init(vd, &blk, offset, size, checksum); + + zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, + ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, + ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); + + zio->io_vd = vd; + zio->io_offset = offset; + + zio->io_bp = &zio->io_bp_copy; + zio->io_checksum = checksum; + + if (zio_checksum_table[checksum].ci_zbt) { + /* + * zbt checksums are necessarily destructive -- they modify + * one word of the write buffer to hold the verifier/checksum. + * Therefore, we must make a local copy in case the data is + * being written to multiple places. + */ + wbuf = zio_buf_alloc(size); + bcopy(data, wbuf, size); + zio_push_transform(zio, wbuf, size, size); + + zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1; + zbt->zbt_cksum = blk.blk_cksum; + } + + return (zio); +} + +/* + * Create a child I/O to do some work for us. It has no associated bp. 
+ */ +zio_t * +zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, + void *data, uint64_t size, int type, int priority, int flags, + zio_done_func_t *done, void *private) +{ + uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE; + zio_t *cio; + + if (type == ZIO_TYPE_READ && bp != NULL) { + /* + * If we have the bp, then the child should perform the + * checksum and the parent need not. This pushes error + * detection as close to the leaves as possible and + * eliminates redundant checksums in the interior nodes. + */ + pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY; + zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); + } + + cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size, + done, private, type, priority, + (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags, + ZIO_STAGE_VDEV_IO_SETUP - 1, pipeline); + + cio->io_vd = vd; + cio->io_offset = offset; + + return (cio); +} + +/* + * ========================================================================== + * Initiate I/O, either sync or async + * ========================================================================== + */ +int +zio_wait(zio_t *zio) +{ + int error; + + ASSERT(zio->io_stage == ZIO_STAGE_OPEN); + + zio->io_waiter = curthread; + + zio_next_stage_async(zio); + + mutex_enter(&zio->io_lock); + while (zio->io_stalled != ZIO_STAGE_DONE) + cv_wait(&zio->io_cv, &zio->io_lock); + mutex_exit(&zio->io_lock); + + error = zio->io_error; + + kmem_free(zio, sizeof (zio_t)); + + return (error); +} + +void +zio_nowait(zio_t *zio) +{ + zio_next_stage_async(zio); +} + +/* + * ========================================================================== + * I/O pipeline interlocks: parent/child dependency scoreboarding + * ========================================================================== + */ +static void +zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp) +{ + mutex_enter(&zio->io_lock); + if (*countp == 0) { + ASSERT(zio->io_stalled == 0); + 
mutex_exit(&zio->io_lock); + zio_next_stage(zio); + } else { + if (zio->io_stage == ZIO_STAGE_VDEV_IO_START) + zio_vdev_io_exit(zio); + zio->io_stalled = stage; + mutex_exit(&zio->io_lock); + } +} + +static void +zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp) +{ + zio_t *pio = zio->io_parent; + + mutex_enter(&pio->io_lock); + if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) + pio->io_error = zio->io_error; + if (--*countp == 0 && pio->io_stalled == stage) { + if (pio->io_stage == ZIO_STAGE_VDEV_IO_START) + zio_vdev_io_enter(pio); + pio->io_stalled = 0; + mutex_exit(&pio->io_lock); + zio_next_stage_async(pio); + } else { + mutex_exit(&pio->io_lock); + } +} + +static void +zio_wait_children_ready(zio_t *zio) +{ + zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY, + &zio->io_children_notready); +} + +void +zio_wait_children_done(zio_t *zio) +{ + zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, + &zio->io_children_notdone); +} + +static void +zio_ready(zio_t *zio) +{ + zio_t *pio = zio->io_parent; + + if (pio != NULL) + zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY, + &pio->io_children_notready); + + if (zio->io_bp) + zio->io_bp_copy = *zio->io_bp; + + zio_next_stage(zio); +} + +static void +zio_done(zio_t *zio) +{ + zio_t *pio = zio->io_parent; + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + vdev_t *vd = zio->io_vd; + char blkbuf[300]; + + ASSERT(zio->io_children_notready == 0); + ASSERT(zio->io_children_notdone == 0); + + if (bp != NULL) { + ASSERT(bp->blk_pad[0] == 0); + ASSERT(bp->blk_pad[1] == 0); + ASSERT(bp->blk_pad[2] == 0); + ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0); + if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && + !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) + ASSERT(!BP_SHOULD_BYTESWAP(bp)); + } + + if (vd != NULL) + vdev_stat_update(zio); + + if (zio->io_error) { + sprintf_blkptr(blkbuf, bp ? 
bp : &zio->io_bp_copy); + dprintf("ZFS: %s (%s on %s off %llx: zio %p %s): error %d\n", + zio->io_error == ECKSUM ? "bad checksum" : "I/O failure", + zio_type_name[zio->io_type], + vdev_description(vd), + (u_longlong_t)zio->io_offset, + zio, blkbuf, zio->io_error); + } + + if (zio->io_numerrors != 0 && zio->io_type == ZIO_TYPE_WRITE) { + sprintf_blkptr(blkbuf, bp ? bp : &zio->io_bp_copy); + dprintf("ZFS: %s (%s on %s off %llx: zio %p %s): %d errors\n", + "partial write", + zio_type_name[zio->io_type], + vdev_description(vd), + (u_longlong_t)zio->io_offset, + zio, blkbuf, zio->io_numerrors); + } + + if (zio->io_error && !(zio->io_flags & ZIO_FLAG_CANFAIL)) { + sprintf_blkptr(blkbuf, bp ? bp : &zio->io_bp_copy); + panic("ZFS: %s (%s on %s off %llx: zio %p %s): error %d", + zio->io_error == ECKSUM ? "bad checksum" : "I/O failure", + zio_type_name[zio->io_type], + vdev_description(vd), + (u_longlong_t)zio->io_offset, + zio, blkbuf, zio->io_error); + } + + zio_clear_transform_stack(zio); + + if (zio->io_done) + zio->io_done(zio); + + ASSERT(zio->io_delegate_list == NULL); + ASSERT(zio->io_delegate_next == NULL); + + if (pio != NULL) { + zio_t *next, *prev; + + mutex_enter(&pio->io_lock); + next = zio->io_sibling_next; + prev = zio->io_sibling_prev; + if (next != NULL) + next->io_sibling_prev = prev; + if (prev != NULL) + prev->io_sibling_next = next; + if (pio->io_child == zio) + pio->io_child = next; + mutex_exit(&pio->io_lock); + + zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, + &pio->io_children_notdone); + } + + if (pio == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_HELD)) + spa_config_exit(spa); + + if (zio->io_waiter != NULL) { + mutex_enter(&zio->io_lock); + ASSERT(zio->io_stage == ZIO_STAGE_DONE); + zio->io_stalled = zio->io_stage; + cv_broadcast(&zio->io_cv); + mutex_exit(&zio->io_lock); + } else { + kmem_free(zio, sizeof (zio_t)); + } +} + +/* + * ========================================================================== + * Compression support + * 
========================================================================== + */ +static void +zio_write_compress(zio_t *zio) +{ + int compress = zio->io_compress; + blkptr_t *bp = zio->io_bp; + void *cbuf; + uint64_t lsize = zio->io_size; + uint64_t csize = lsize; + uint64_t cbufsize = 0; + int pass; + + if (bp->blk_birth == zio->io_txg) { + /* + * We're rewriting an existing block, which means we're + * working on behalf of spa_sync(). For spa_sync() to + * converge, it must eventually be the case that we don't + * have to allocate new blocks. But compression changes + * the blocksize, which forces a reallocate, and makes + * convergence take longer. Therefore, after the first + * few passes, stop compressing to ensure convergence. + */ + pass = spa_sync_pass(zio->io_spa); + if (pass > zio_sync_pass.zp_dontcompress) + compress = ZIO_COMPRESS_OFF; + } else { + ASSERT(BP_IS_HOLE(bp)); + pass = 1; + } + + if (compress != ZIO_COMPRESS_OFF) + if (!zio_compress_data(compress, zio->io_data, zio->io_size, + &cbuf, &csize, &cbufsize)) + compress = ZIO_COMPRESS_OFF; + + if (compress != ZIO_COMPRESS_OFF && csize != 0) + zio_push_transform(zio, cbuf, csize, cbufsize); + + /* + * The final pass of spa_sync() must be all rewrites, but the first + * few passes offer a trade-off: allocating blocks defers convergence, + * but newly allocated blocks are sequential, so they can be written + * to disk faster. Therefore, we allow the first few passes of + * spa_sync() to reallocate new blocks, but force rewrites after that. + * There should only be a handful of blocks after pass 1 in any case. 
+ */ + if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize && + pass > zio_sync_pass.zp_rewrite) { + ASSERT(csize != 0); + ASSERT3U(BP_GET_COMPRESS(bp), ==, compress); + ASSERT3U(BP_GET_LSIZE(bp), ==, lsize); + + zio->io_pipeline = ZIO_REWRITE_PIPELINE; + } else { + if (bp->blk_birth == zio->io_txg) { + ASSERT3U(BP_GET_LSIZE(bp), ==, lsize); + bzero(bp, sizeof (blkptr_t)); + } + if (csize == 0) { + BP_ZERO(bp); + zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE; + } else { + BP_SET_LSIZE(bp, lsize); + BP_SET_PSIZE(bp, csize); + BP_SET_COMPRESS(bp, compress); + zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE; + } + } + + zio_next_stage(zio); +} + +static void +zio_read_decompress(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + void *data; + uint64_t size; + uint64_t bufsize; + int compress = BP_GET_COMPRESS(bp); + + ASSERT(compress != ZIO_COMPRESS_OFF); + + zio_pop_transform(zio, &data, &size, &bufsize); + + if (zio_decompress_data(compress, data, size, + zio->io_data, zio->io_size)) + zio->io_error = EIO; + + zio_buf_free(data, bufsize); + + zio_next_stage(zio); +} + +/* + * ========================================================================== + * Gang block support + * ========================================================================== + */ +static void +zio_gang_pipeline(zio_t *zio) +{ + /* + * By default, the pipeline assumes that we're dealing with a gang + * block. If we're not, strip out any gang-specific stages. 
+ */ + if (!DVA_GET_GANG(ZIO_GET_DVA(zio))) + zio->io_pipeline &= ~ZIO_GANG_STAGES; + + zio_next_stage(zio); +} + +static void +zio_gang_byteswap(zio_t *zio) +{ + ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); + + if (BP_SHOULD_BYTESWAP(zio->io_bp)) + byteswap_uint64_array(zio->io_data, zio->io_size); +} + +static void +zio_get_gang_header(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + uint64_t gsize = SPA_GANGBLOCKSIZE; + void *gbuf = zio_buf_alloc(gsize); + + ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio))); + + zio_push_transform(zio, gbuf, gsize, gsize); + + zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize, + NULL, NULL, ZIO_TYPE_READ, zio->io_priority, + zio->io_flags & ZIO_FLAG_GANG_INHERIT, + ZIO_STAGE_OPEN, ZIO_READ_PIPELINE)); + + zio_wait_children_done(zio); +} + +static void +zio_read_gang_members(zio_t *zio) +{ + zio_gbh_phys_t *gbh; + uint64_t gsize, gbufsize, loff, lsize; + int i; + + ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio))); + + zio_gang_byteswap(zio); + zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); + + for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { + blkptr_t *gbp = &gbh->zg_blkptr[i]; + lsize = BP_GET_PSIZE(gbp); + + ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); + ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); + ASSERT3U(loff + lsize, <=, zio->io_size); + ASSERT(i < SPA_GBH_NBLKPTRS); + ASSERT(!BP_IS_HOLE(gbp)); + + zio_nowait(zio_read(zio, zio->io_spa, gbp, + (char *)zio->io_data + loff, lsize, NULL, NULL, + zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT)); + } + + zio_buf_free(gbh, gbufsize); + zio_wait_children_done(zio); +} + +static void +zio_rewrite_gang_members(zio_t *zio) +{ + zio_gbh_phys_t *gbh; + uint64_t gsize, gbufsize, loff, lsize; + int i; + + ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio))); + ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); + + zio_gang_byteswap(zio); + zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); + + ASSERT(gsize == gbufsize); + + for (loff = 0, i = 0; loff != 
zio->io_size; loff += lsize, i++) { + blkptr_t *gbp = &gbh->zg_blkptr[i]; + lsize = BP_GET_PSIZE(gbp); + + ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); + ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); + ASSERT3U(loff + lsize, <=, zio->io_size); + ASSERT(i < SPA_GBH_NBLKPTRS); + ASSERT(!BP_IS_HOLE(gbp)); + + zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum, + zio->io_txg, gbp, (char *)zio->io_data + loff, lsize, + NULL, NULL, zio->io_priority, zio->io_flags)); + } + + zio_push_transform(zio, gbh, gsize, gbufsize); + zio_wait_children_ready(zio); +} + +static void +zio_free_gang_members(zio_t *zio) +{ + zio_gbh_phys_t *gbh; + uint64_t gsize, gbufsize; + int i; + + ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio))); + + zio_gang_byteswap(zio); + zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); + + for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { + blkptr_t *gbp = &gbh->zg_blkptr[i]; + + if (BP_IS_HOLE(gbp)) + continue; + zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, + gbp, NULL, NULL)); + } + + zio_buf_free(gbh, gbufsize); + zio_next_stage(zio); +} + +static void +zio_claim_gang_members(zio_t *zio) +{ + zio_gbh_phys_t *gbh; + uint64_t gsize, gbufsize; + int i; + + ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio))); + + zio_gang_byteswap(zio); + zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); + + for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { + blkptr_t *gbp = &gbh->zg_blkptr[i]; + if (BP_IS_HOLE(gbp)) + continue; + zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg, + gbp, NULL, NULL)); + } + + zio_buf_free(gbh, gbufsize); + zio_next_stage(zio); +} + +static void +zio_write_allocate_gang_member_done(zio_t *zio) +{ + zio_t *pio = zio->io_parent; + dva_t *cdva = ZIO_GET_DVA(zio); + dva_t *pdva = ZIO_GET_DVA(pio); + uint64_t asize; + + ASSERT(DVA_GET_GANG(pdva)); + + /* XXBP - Need to be careful here with multiple DVAs */ + mutex_enter(&pio->io_lock); + asize = DVA_GET_ASIZE(pdva); + asize += DVA_GET_ASIZE(cdva); + DVA_SET_ASIZE(pdva, asize); + mutex_exit(&pio->io_lock); +} + 
+static void +zio_write_allocate_gang_members(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + dva_t *dva = ZIO_GET_DVA(zio); + zio_gbh_phys_t *gbh; + uint64_t resid = zio->io_size; + uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE); + uint64_t gsize, loff, lsize; + uint32_t gbps_left; + int error; + int i; + + gsize = SPA_GANGBLOCKSIZE; + gbps_left = SPA_GBH_NBLKPTRS; + + error = metaslab_alloc(zio->io_spa, gsize, dva, zio->io_txg); + if (error == ENOSPC) + panic("can't allocate gang block header"); + ASSERT(error == 0); + + DVA_SET_GANG(dva, 1); + + bp->blk_birth = zio->io_txg; + + gbh = zio_buf_alloc(gsize); + bzero(gbh, gsize); + + for (loff = 0, i = 0; loff != zio->io_size; + loff += lsize, resid -= lsize, gbps_left--, i++) { + blkptr_t *gbp = &gbh->zg_blkptr[i]; + dva = &gbp->blk_dva[0]; + + ASSERT(gbps_left != 0); + maxalloc = MIN(maxalloc, resid); + + while (resid <= maxalloc * gbps_left) { + error = metaslab_alloc(zio->io_spa, maxalloc, dva, + zio->io_txg); + if (error == 0) + break; + ASSERT3U(error, ==, ENOSPC); + if (maxalloc == SPA_MINBLOCKSIZE) + panic("really out of space"); + maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE); + } + + if (resid <= maxalloc * gbps_left) { + lsize = maxalloc; + BP_SET_LSIZE(gbp, lsize); + BP_SET_PSIZE(gbp, lsize); + BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF); + gbp->blk_birth = zio->io_txg; + zio_nowait(zio_rewrite(zio, zio->io_spa, + zio->io_checksum, zio->io_txg, gbp, + (char *)zio->io_data + loff, lsize, + zio_write_allocate_gang_member_done, NULL, + zio->io_priority, zio->io_flags)); + } else { + lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE); + ASSERT(lsize != SPA_MINBLOCKSIZE); + zio_nowait(zio_write_allocate(zio, zio->io_spa, + zio->io_checksum, zio->io_txg, gbp, + (char *)zio->io_data + loff, lsize, + zio_write_allocate_gang_member_done, NULL, + zio->io_priority, zio->io_flags)); + } + } + + ASSERT(resid == 0 && loff == zio->io_size); + + zio->io_pipeline |= 1U << 
ZIO_STAGE_GANG_CHECKSUM_GENERATE; + + zio_push_transform(zio, gbh, gsize, gsize); + zio_wait_children_done(zio); +} + +/* + * ========================================================================== + * Allocate and free blocks + * ========================================================================== + */ +static void +zio_dva_allocate(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + dva_t *dva = ZIO_GET_DVA(zio); + int error; + + ASSERT(BP_IS_HOLE(bp)); + + /* For testing, make some blocks above a certain size be gang blocks */ + if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) { + zio_write_allocate_gang_members(zio); + return; + } + + ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); + + error = metaslab_alloc(zio->io_spa, zio->io_size, dva, zio->io_txg); + + if (error == 0) { + bp->blk_birth = zio->io_txg; + } else if (error == ENOSPC) { + if (zio->io_size == SPA_MINBLOCKSIZE) + panic("really, truly out of space"); + zio_write_allocate_gang_members(zio); + return; + } else { + zio->io_error = error; + } + zio_next_stage(zio); +} + +static void +zio_dva_free(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + dva_t *dva = ZIO_GET_DVA(zio); + + ASSERT(!BP_IS_HOLE(bp)); + + metaslab_free(zio->io_spa, dva, zio->io_txg); + + BP_ZERO(bp); + + zio_next_stage(zio); +} + +static void +zio_dva_claim(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + dva_t *dva = ZIO_GET_DVA(zio); + + ASSERT(!BP_IS_HOLE(bp)); + + zio->io_error = metaslab_claim(zio->io_spa, dva, zio->io_txg); + + zio_next_stage(zio); +} + +static void +zio_dva_translate(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + dva_t *dva = ZIO_GET_DVA(zio); + uint64_t vdev = DVA_GET_VDEV(dva); + uint64_t offset = DVA_GET_OFFSET(dva); + + ASSERT3U(zio->io_size, ==, ZIO_GET_IOSIZE(zio)); + + zio->io_offset = offset; + + if ((zio->io_vd = vdev_lookup_top(spa, vdev)) == NULL) + zio->io_error = ENXIO; + else if (offset + zio->io_size > zio->io_vd->vdev_asize) + zio->io_error = EOVERFLOW; + + zio_next_stage(zio); +} + +/* + * 
========================================================================== + * Read and write to physical devices + * ========================================================================== + */ +static void +zio_vdev_io_enter(zio_t *zio) +{ + vdev_t *tvd = zio->io_vd->vdev_top; + + mutex_enter(&tvd->vdev_io_lock); + ASSERT(zio->io_pending.list_next == NULL); + list_insert_tail(&tvd->vdev_io_pending, zio); + mutex_exit(&tvd->vdev_io_lock); +} + +static void +zio_vdev_io_exit(zio_t *zio) +{ + vdev_t *tvd = zio->io_vd->vdev_top; + + mutex_enter(&tvd->vdev_io_lock); + ASSERT(zio->io_pending.list_next != NULL); + list_remove(&tvd->vdev_io_pending, zio); + if (list_head(&tvd->vdev_io_pending) == NULL) + cv_broadcast(&tvd->vdev_io_cv); + mutex_exit(&tvd->vdev_io_lock); +} + +static void +zio_vdev_io_retry(void *vdarg) +{ + vdev_t *vd = vdarg; + zio_t *zio, *zq; + + ASSERT(vd == vd->vdev_top); + + /* XXPOLICY */ + delay(hz); + + vdev_reopen(vd, &zq); + + while ((zio = zq) != NULL) { + zq = zio->io_retry_next; + zio->io_retry_next = NULL; + dprintf("async retry #%d for I/O to %s offset %llx\n", + zio->io_retries, vdev_description(vd), zio->io_offset); + zio_next_stage_async(zio); + } +} + +static void +zio_vdev_io_setup(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + + /* XXPOLICY */ + if (zio->io_retries == 0 && vd == vd->vdev_top) + zio->io_flags |= ZIO_FLAG_FAILFAST; + + if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) { + zio->io_flags |= ZIO_FLAG_PHYSICAL; + zio->io_offset += VDEV_LABEL_START_SIZE; + } + + zio_vdev_io_enter(zio); + + zio_next_stage(zio); +} + +static void +zio_vdev_io_start(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + + ASSERT(P2PHASE(zio->io_offset, 1ULL << zio->io_vd->vdev_ashift) == 0); + ASSERT(P2PHASE(zio->io_size, 1ULL << zio->io_vd->vdev_ashift) == 0); + ASSERT(bp == NULL || ZIO_GET_IOSIZE(zio) == zio->io_size); + ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); + + vdev_io_start(zio); + + /* 
zio_next_stage_async() gets called from io completion interrupt */ +} + +static void +zio_vdev_io_done(zio_t *zio) +{ + vdev_io_done(zio); +} + +/* XXPOLICY */ +static boolean_t +zio_should_retry(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + + if (zio->io_error == 0) + return (B_FALSE); + if (zio->io_delegate_list != NULL) + return (B_FALSE); + if (vd != vd->vdev_top) + return (B_FALSE); + if (zio->io_flags & ZIO_FLAG_DONT_RETRY) + return (B_FALSE); + if (zio->io_retries > 300 && + (zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL))) + return (B_FALSE); + if (zio->io_retries > 1 && + (zio->io_error == ECKSUM || zio->io_error == ENXIO)) + return (B_FALSE); + + return (B_TRUE); +} + +static void +zio_vdev_io_assess(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_t *tvd = vd->vdev_top; + + zio_vdev_io_exit(zio); + + ASSERT(zio->io_vsd == NULL); + + /* + * If the I/O failed, determine whether we should attempt to retry it. + */ + /* XXPOLICY */ + if (zio_should_retry(zio)) { + zio_t *zq; + + ASSERT(tvd == vd); + ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)); + + zio->io_retries++; + zio->io_error = 0; + zio->io_flags &= ZIO_FLAG_VDEV_INHERIT; + /* XXPOLICY */ + zio->io_flags &= ~ZIO_FLAG_FAILFAST; + zio->io_flags |= ZIO_FLAG_DONT_CACHE; + zio->io_stage = ZIO_STAGE_VDEV_IO_SETUP - 1; + + dprintf("retry #%d for %s to %s offset %llx\n", + zio->io_retries, zio_type_name[zio->io_type], + vdev_description(vd), zio->io_offset); + + /* + * If this is the first retry, do it immediately. + */ + /* XXPOLICY */ + if (zio->io_retries == 1) { + zio_next_stage_async(zio); + return; + } + + /* + * This was not the first retry, so go through the + * longer enqueue/delay/vdev_reopen() process. 
+ */ + mutex_enter(&tvd->vdev_io_lock); + ASSERT(zio->io_retry_next == NULL); + zio->io_retry_next = zq = tvd->vdev_io_retry; + tvd->vdev_io_retry = zio; + mutex_exit(&tvd->vdev_io_lock); + if (zq == NULL) + (void) taskq_dispatch( + tvd->vdev_spa->spa_vdev_retry_taskq, + zio_vdev_io_retry, tvd, TQ_SLEEP); + return; + } + + zio_next_stage(zio); +} + +void +zio_vdev_io_reissue(zio_t *zio) +{ + ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); + ASSERT(zio->io_error == 0); + + zio->io_stage--; +} + +void +zio_vdev_io_redone(zio_t *zio) +{ + ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); + + zio->io_stage--; +} + +void +zio_vdev_io_bypass(zio_t *zio) +{ + ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); + ASSERT(zio->io_error == 0); + + zio->io_flags |= ZIO_FLAG_IO_BYPASS; + zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1; +} + +/* + * ========================================================================== + * Generate and verify checksums + * ========================================================================== + */ +static void +zio_checksum_generate(zio_t *zio) +{ + int checksum = zio->io_checksum; + blkptr_t *bp = zio->io_bp; + + ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); + + BP_SET_CHECKSUM(bp, checksum); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + + zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size); + + zio_next_stage(zio); +} + +static void +zio_gang_checksum_generate(zio_t *zio) +{ + zio_cksum_t zc; + zio_gbh_phys_t *gbh = zio->io_data; + + ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); + ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio))); + + zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum); + + zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size); + + zio_next_stage(zio); +} + +static void +zio_checksum_verify(zio_t *zio) +{ + if (zio->io_bp != NULL) { + zio->io_error = zio_checksum_error(zio); + if (zio->io_error) { + dprintf("bad checksum on vdev %s\n", + vdev_description(zio->io_vd)); + } + } + + zio_next_stage(zio); 
+} + +/* + * Called by RAID-Z to ensure we don't compute the checksum twice. + */ +void +zio_checksum_verified(zio_t *zio) +{ + zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); +} + +/* + * Set the external verifier for a gang block based on stuff in the bp + */ +void +zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp) +{ + zcp->zc_word[0] = DVA_GET_VDEV(ZIO_GET_DVA(zio)); + zcp->zc_word[1] = DVA_GET_OFFSET(ZIO_GET_DVA(zio)); + zcp->zc_word[2] = zio->io_bp->blk_birth; + zcp->zc_word[3] = 0; +} + +/* + * ========================================================================== + * Define the pipeline + * ========================================================================== + */ +typedef void zio_pipe_stage_t(zio_t *zio); + +static void +zio_badop(zio_t *zio) +{ + panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio); +} + +zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = { + zio_badop, + zio_wait_children_ready, + zio_write_compress, + zio_checksum_generate, + zio_gang_pipeline, + zio_get_gang_header, + zio_rewrite_gang_members, + zio_free_gang_members, + zio_claim_gang_members, + zio_dva_allocate, + zio_dva_free, + zio_dva_claim, + zio_gang_checksum_generate, + zio_ready, + zio_dva_translate, + zio_vdev_io_setup, + zio_vdev_io_start, + zio_vdev_io_done, + zio_vdev_io_assess, + zio_wait_children_done, + zio_checksum_verify, + zio_read_gang_members, + zio_read_decompress, + zio_done, + zio_badop +}; + +/* + * Move an I/O to the next stage of the pipeline and execute that stage. + * There's no locking on io_stage because there's no legitimate way for + * multiple threads to be attempting to process the same I/O. 
+ */ +void +zio_next_stage(zio_t *zio) +{ + uint32_t pipeline = zio->io_pipeline; + + ASSERT(!MUTEX_HELD(&zio->io_lock)); + + if (zio->io_error) { + dprintf("zio %p vdev %s offset %llx stage %d error %d\n", + zio, vdev_description(zio->io_vd), + zio->io_offset, zio->io_stage, zio->io_error); + if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) + pipeline &= ZIO_ERROR_PIPELINE_MASK; + } + + while (((1U << ++zio->io_stage) & pipeline) == 0) + continue; + + ASSERT(zio->io_stage <= ZIO_STAGE_DONE); + ASSERT(zio->io_stalled == 0); + + zio_pipeline[zio->io_stage](zio); +} + +void +zio_next_stage_async(zio_t *zio) +{ + taskq_t *tq; + uint32_t pipeline = zio->io_pipeline; + + ASSERT(!MUTEX_HELD(&zio->io_lock)); + + if (zio->io_error) { + dprintf("zio %p vdev %s offset %llx stage %d error %d\n", + zio, vdev_description(zio->io_vd), + zio->io_offset, zio->io_stage, zio->io_error); + if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) + pipeline &= ZIO_ERROR_PIPELINE_MASK; + } + + while (((1U << ++zio->io_stage) & pipeline) == 0) + continue; + + ASSERT(zio->io_stage <= ZIO_STAGE_DONE); + ASSERT(zio->io_stalled == 0); + + /* + * For performance, we'll probably want two sets of task queues: + * per-CPU issue taskqs and per-CPU completion taskqs. The per-CPU + * part is for read performance: since we have to make a pass over + * the data to checksum it anyway, we want to do this on the same CPU + * that issued the read, because (assuming CPU scheduling affinity) + * that thread is probably still there. Getting this optimization + * right avoids performance-hostile cache-to-cache transfers. + * + * Note that having two sets of task queues is also necessary for + * correctness: if all of the issue threads get bogged down waiting + * for dependent reads (e.g. metaslab freelist) to complete, then + * there won't be any threads available to service I/O completion + * interrupts. 
+ */ + if ((1U << zio->io_stage) & zio->io_async_stages) { + if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE) + tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type]; + else + tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type]; + (void) taskq_dispatch(tq, + (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP); + } else { + zio_pipeline[zio->io_stage](zio); + } +} + +/* + * Try to allocate an intent log block. Return 0 on success, errno on failure. + */ +int +zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp, + uint64_t txg) +{ + int error; + + spa_config_enter(spa, RW_READER); + + BP_ZERO(bp); + + error = metaslab_alloc(spa, size, BP_IDENTITY(bp), txg); + + if (error == 0) { + BP_SET_CHECKSUM(bp, checksum); + BP_SET_LSIZE(bp, size); + BP_SET_PSIZE(bp, size); + BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + BP_SET_TYPE(bp, DMU_OT_INTENT_LOG); + BP_SET_LEVEL(bp, 0); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + bp->blk_birth = txg; + } + + spa_config_exit(spa); + + return (error); +} + +/* + * Free an intent log block. We know it can't be a gang block, so there's + * nothing to do except metaslab_free() it. + */ +void +zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) +{ + ASSERT(DVA_GET_GANG(BP_IDENTITY(bp)) == 0); + + dprintf_bp(bp, "txg %llu: ", txg); + + spa_config_enter(spa, RW_READER); + + metaslab_free(spa, BP_IDENTITY(bp), txg); + + spa_config_exit(spa); +} diff --git a/usr/src/uts/common/fs/zfs/zio_checksum.c b/usr/src/uts/common/fs/zfs/zio_checksum.c new file mode 100644 index 0000000000..dc31527ce8 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zio_checksum.c @@ -0,0 +1,174 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> + +/* + * Checksum vectors. + * + * In the SPA, everything is checksummed. We support checksum vectors + * for three distinct reasons: + * + * 1. Different kinds of data need different levels of protection. + * For SPA metadata, we always want a very strong checksum. + * For user data, we let users make the trade-off between speed + * and checksum strength. + * + * 2. Cryptographic hash and MAC algorithms are an area of active research. + * It is likely that in future hash functions will be at least as strong + * as current best-of-breed, and may be substantially faster as well. + * We want the ability to take advantage of these new hashes as soon as + * they become available. + * + * 3. If someone develops hardware that can compute a strong hash quickly, + * we want the ability to take advantage of that hardware. + * + * Of course, we don't want a checksum upgrade to invalidate existing + * data, so we store the checksum *function* in five bits of the DVA. + * This gives us room for up to 32 different checksum functions. + * + * When writing a block, we always checksum it with the latest-and-greatest + * checksum function of the appropriate strength. 
When reading a block, + * we compare the expected checksum against the actual checksum, which we + * compute via the checksum function specified in the DVA encoding. + */ + +/*ARGSUSED*/ +static void +zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); +} + +zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { + NULL, NULL, 0, 0, "inherit", + NULL, NULL, 0, 0, "on", + zio_checksum_off, zio_checksum_off, 0, 0, "off", + zio_checksum_SHA256, zio_checksum_SHA256, 1, 1, "label", + zio_checksum_SHA256, zio_checksum_SHA256, 1, 1, "gang_header", + fletcher_2_native, fletcher_2_byteswap, 0, 1, "zilog", + fletcher_2_native, fletcher_2_byteswap, 0, 0, "fletcher2", + fletcher_4_native, fletcher_4_byteswap, 1, 0, "fletcher4", + zio_checksum_SHA256, zio_checksum_SHA256, 1, 0, "SHA256", +}; + +uint8_t +zio_checksum_select(uint8_t child, uint8_t parent) +{ + ASSERT(child < ZIO_CHECKSUM_FUNCTIONS); + ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS); + ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON); + + if (child == ZIO_CHECKSUM_INHERIT) + return (parent); + + if (child == ZIO_CHECKSUM_ON) + return (ZIO_CHECKSUM_ON_VALUE); + + return (child); +} + +/* + * Generate the checksum. + */ +void +zio_checksum(uint_t checksum, zio_cksum_t *zcp, void *data, uint64_t size) +{ + zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1; + zio_checksum_info_t *ci = &zio_checksum_table[checksum]; + zio_cksum_t zbt_cksum; + + ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); + ASSERT(ci->ci_func[0] != NULL); + + if (ci->ci_zbt) { + *zcp = zbt->zbt_cksum; + zbt->zbt_magic = ZBT_MAGIC; + ci->ci_func[0](data, size, &zbt_cksum); + zbt->zbt_cksum = zbt_cksum; + } else { + ci->ci_func[0](data, size, zcp); + } +} + +int +zio_checksum_error(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + dva_t *dva = ZIO_GET_DVA(zio); + zio_cksum_t zc = bp->blk_cksum; + uint_t checksum = DVA_GET_GANG(dva) ? 
ZIO_CHECKSUM_GANG_HEADER : + BP_GET_CHECKSUM(bp); + int byteswap = BP_SHOULD_BYTESWAP(bp); + void *data = zio->io_data; + uint64_t size = zio->io_size; + zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1; + zio_checksum_info_t *ci = &zio_checksum_table[checksum]; + zio_cksum_t actual_cksum, expected_cksum; + + if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) + return (EINVAL); + + if (ci->ci_zbt) { + if (checksum == ZIO_CHECKSUM_GANG_HEADER) + zio_set_gang_verifier(zio, &zc); + + if (zbt->zbt_magic == BSWAP_64(ZBT_MAGIC)) { + expected_cksum = zbt->zbt_cksum; + byteswap_uint64_array(&expected_cksum, + sizeof (zio_cksum_t)); + zbt->zbt_cksum = zc; + byteswap_uint64_array(&zbt->zbt_cksum, + sizeof (zio_cksum_t)); + ci->ci_func[1](data, size, &actual_cksum); + zbt->zbt_cksum = expected_cksum; + byteswap_uint64_array(&zbt->zbt_cksum, + sizeof (zio_cksum_t)); + } else { + expected_cksum = zbt->zbt_cksum; + zbt->zbt_cksum = zc; + ci->ci_func[0](data, size, &actual_cksum); + zbt->zbt_cksum = expected_cksum; + } + zc = expected_cksum; + } else { + ASSERT(!DVA_GET_GANG(dva)); + ci->ci_func[byteswap](data, size, &actual_cksum); + } + + if ((actual_cksum.zc_word[0] - zc.zc_word[0]) | + (actual_cksum.zc_word[1] - zc.zc_word[1]) | + (actual_cksum.zc_word[2] - zc.zc_word[2]) | + (actual_cksum.zc_word[3] - zc.zc_word[3])) + return (ECKSUM); + + return (0); +} diff --git a/usr/src/uts/common/fs/zfs/zio_compress.c b/usr/src/uts/common/fs/zfs/zio_compress.c new file mode 100644 index 0000000000..51d85172bb --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zio_compress.c @@ -0,0 +1,134 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/compress.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/zio_compress.h> + +/* + * Compression vectors. + */ + +zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { + NULL, NULL, "inherit", + NULL, NULL, "on", + NULL, NULL, "uncompressed", + lzjb_compress, lzjb_decompress, "lzjb", +}; + +uint8_t +zio_compress_select(uint8_t child, uint8_t parent) +{ + ASSERT(child < ZIO_COMPRESS_FUNCTIONS); + ASSERT(parent < ZIO_COMPRESS_FUNCTIONS); + ASSERT(parent != ZIO_COMPRESS_INHERIT && parent != ZIO_COMPRESS_ON); + + if (child == ZIO_COMPRESS_INHERIT) + return (parent); + + if (child == ZIO_COMPRESS_ON) + return (ZIO_COMPRESS_ON_VALUE); + + return (child); +} + +int +zio_compress_data(int cpfunc, void *src, uint64_t srcsize, void **destp, + uint64_t *destsizep, uint64_t *destbufsizep) +{ + uint64_t *word, *word_end; + uint64_t ciosize, gapsize, destbufsize; + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + char *dest; + uint_t allzero; + + ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); + ASSERT(ci->ci_compress != NULL); + + /* + * If the data is all zeroes, we don't even need to allocate + * a block for it. 
We indicate this by setting *destsizep = 0. + */ + allzero = 1; + word = src; + word_end = (uint64_t *)(uintptr_t)((uintptr_t)word + srcsize); + while (word < word_end) { + if (*word++ != 0) { + allzero = 0; + break; + } + } + if (allzero) { + *destp = NULL; + *destsizep = 0; + *destbufsizep = 0; + return (1); + } + + /* Compress at least 12.5% */ + destbufsize = P2ALIGN(srcsize - (srcsize >> 3), SPA_MINBLOCKSIZE); + if (destbufsize == 0) + return (0); + dest = zio_buf_alloc(destbufsize); + + ciosize = ci->ci_compress(src, dest, (size_t)srcsize, + (size_t)destbufsize); + if (ciosize > destbufsize) { + zio_buf_free(dest, destbufsize); + return (0); + } + + /* Cool. We compressed at least as much as we were hoping to. */ + + /* For security, make sure we don't write random heap crap to disk */ + gapsize = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE) - ciosize; + if (gapsize != 0) { + bzero(dest + ciosize, gapsize); + ciosize += gapsize; + } + + ASSERT3U(ciosize, <=, destbufsize); + ASSERT(P2PHASE(ciosize, SPA_MINBLOCKSIZE) == 0); + *destp = dest; + *destsizep = ciosize; + *destbufsizep = destbufsize; + + return (1); +} + +int +zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, + void *dest, uint64_t destsize) +{ + ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); + + return (zio_compress_table[cpfunc].ci_decompress(src, dest, + srcsize, destsize)); +} diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c new file mode 100644 index 0000000000..ceb9e24d72 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zvol.c @@ -0,0 +1,793 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * ZFS volume emulation driver. + * + * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. + * Volumes are accessed through the symbolic links named: + * + * /dev/zvol/dsk/<pool_name>/<dataset_name> + * /dev/zvol/rdsk/<pool_name>/<dataset_name> + * + * These links are created by the ZFS-specific devfsadm link generator. + * Volumes are persistent through reboot. No user command needs to be + * run before opening and using a device. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/aio_req.h> +#include <sys/uio.h> +#include <sys/buf.h> +#include <sys/modctl.h> +#include <sys/open.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/cmn_err.h> +#include <sys/stat.h> +#include <sys/zap.h> +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/dsl_prop.h> +#include <sys/dkio.h> +#include <sys/efi_partition.h> +#include <sys/byteorder.h> +#include <sys/pathname.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/crc32.h> +#include <sys/dirent.h> +#include <sys/policy.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_ioctl.h> +#include <sys/mkdev.h> + +#include "zfs_namecheck.h" + +#define ZVOL_OBJ 1ULL +#define ZVOL_ZAP_OBJ 2ULL +#define ZVOL_MAX_MINOR MAXMIN32 + +static void *zvol_state; + +/* + * This lock protects the zvol_state structure from being modified + * while it's being used, e.g. an open that comes in before a create + * finishes. It also protects temporary opens of the dataset so that, + * e.g., an open doesn't get a spurious EBUSY. + */ +static kmutex_t zvol_state_lock; +static uint32_t zvol_minors; + +/* + * The in-core state of each volume. 
+ */ +typedef struct zvol_state { + char zv_name[MAXPATHLEN]; /* pool/dd name */ + uint64_t zv_volsize; /* amount of space we advertise */ + minor_t zv_minor; /* minor number */ + uint8_t zv_min_bs; /* minimum addressable block shift */ + uint8_t zv_readonly; /* hard readonly; like write-protect */ + objset_t *zv_objset; /* objset handle */ + uint32_t zv_mode; /* DS_MODE_* flags at open time */ + uint32_t zv_open_count[OTYPCNT]; /* open counts */ + uint32_t zv_total_opens; /* total open count */ +} zvol_state_t; + +static void +zvol_size_changed(zvol_state_t *zv, dev_t dev) +{ + dev = makedevice(getmajor(dev), zv->zv_minor); + + VERIFY(ddi_prop_update_int64(dev, zfs_dip, + "Size", zv->zv_volsize) == DDI_SUCCESS); + VERIFY(ddi_prop_update_int64(dev, zfs_dip, + "Nblocks", lbtodb(zv->zv_volsize)) == DDI_SUCCESS); +} + +int +zvol_check_volsize(zfs_cmd_t *zc) +{ + if (zc->zc_volsize == 0) + return (EINVAL); + + zc->zc_volsize = P2ROUNDUP(zc->zc_volsize, SPA_MAXBLOCKSIZE); +#ifdef _ILP32 + if (zc->zc_volsize - 1 > SPEC_MAXOFFSET_T) + return (EOVERFLOW); +#endif + return (0); +} + +int +zvol_check_volblocksize(zfs_cmd_t *zc) +{ + if (zc->zc_volblocksize < SPA_MINBLOCKSIZE || + zc->zc_volblocksize > SPA_MAXBLOCKSIZE || + !ISP2(zc->zc_volblocksize)) + return (EDOM); + + return (0); +} + +static void +zvol_readonly_changed_cb(void *arg, uint64_t newval) +{ + zvol_state_t *zv = arg; + + zv->zv_readonly = (uint8_t)newval; +} + +int +zvol_get_stats(zfs_cmd_t *zc, objset_t *os) +{ + int error; + dmu_object_info_t doi; + + error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &zc->zc_volsize); + + if (error) + return (error); + + error = dmu_object_info(os, ZVOL_OBJ, &doi); + + if (error == 0) + zc->zc_volblocksize = doi.doi_data_block_size; + + return (error); +} + +/* + * Find a free minor number. 
+ */ +static minor_t +zvol_minor_alloc(void) +{ + minor_t minor; + + ASSERT(MUTEX_HELD(&zvol_state_lock)); + + for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) + if (ddi_get_soft_state(zvol_state, minor) == NULL) + return (minor); + + return (0); +} + +static zvol_state_t * +zvol_minor_lookup(char *name) +{ + minor_t minor; + zvol_state_t *zv; + + ASSERT(MUTEX_HELD(&zvol_state_lock)); + + for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) { + zv = ddi_get_soft_state(zvol_state, minor); + if (zv == NULL) + continue; + if (strcmp(zv->zv_name, name) == 0) + break; + } + + return (zv); +} + +void +zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx) +{ + zfs_cmd_t *zc = arg; + int error; + + error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, zc->zc_volblocksize, + DMU_OT_NONE, 0, tx); + ASSERT(error == 0); + + error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, + DMU_OT_NONE, 0, tx); + ASSERT(error == 0); + + error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &zc->zc_volsize, tx); + ASSERT(error == 0); +} + +/* + * Create a minor node for the specified volume. 
+ */ +int +zvol_create_minor(zfs_cmd_t *zc) +{ + char *name = zc->zc_name; + dev_t dev = zc->zc_dev; + zvol_state_t *zv; + objset_t *os; + uint64_t volsize; + minor_t minor = 0; + struct pathname linkpath; + int ds_mode = DS_MODE_PRIMARY; + vnode_t *vp = NULL; + char *devpath; + size_t devpathlen = strlen(ZVOL_FULL_DEV_DIR) + 1 + strlen(name) + 1; + char chrbuf[30], blkbuf[30]; + int error; + + mutex_enter(&zvol_state_lock); + + if ((zv = zvol_minor_lookup(name)) != NULL) { + mutex_exit(&zvol_state_lock); + return (EEXIST); + } + + if (strchr(name, '@') != 0) + ds_mode |= DS_MODE_READONLY; + + error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os); + + if (error) { + mutex_exit(&zvol_state_lock); + return (error); + } + + error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); + + if (error) { + dmu_objset_close(os); + mutex_exit(&zvol_state_lock); + return (error); + } + + /* + * If there's an existing /dev/zvol symlink, try to use the + * same minor number we used last time. + */ + devpath = kmem_alloc(devpathlen, KM_SLEEP); + + (void) sprintf(devpath, "%s/%s", ZVOL_FULL_DEV_DIR, name); + + error = lookupname(devpath, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp); + + kmem_free(devpath, devpathlen); + + if (error == 0 && vp->v_type != VLNK) + error = EINVAL; + + if (error == 0) { + pn_alloc(&linkpath); + error = pn_getsymlink(vp, &linkpath, kcred); + if (error == 0) { + char *ms = strstr(linkpath.pn_path, ZVOL_PSEUDO_DEV); + if (ms != NULL) { + ms += strlen(ZVOL_PSEUDO_DEV); + minor = stoi(&ms); + } + } + pn_free(&linkpath); + } + + if (vp != NULL) + VN_RELE(vp); + + /* + * If we found a minor but it's already in use, we must pick a new one. 
+ */ + if (minor != 0 && ddi_get_soft_state(zvol_state, minor) != NULL) + minor = 0; + + if (minor == 0) + minor = zvol_minor_alloc(); + + if (minor == 0) { + dmu_objset_close(os); + mutex_exit(&zvol_state_lock); + return (ENXIO); + } + + if (ddi_soft_state_zalloc(zvol_state, minor) != DDI_SUCCESS) { + dmu_objset_close(os); + mutex_exit(&zvol_state_lock); + return (EAGAIN); + } + + (void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME, name); + + (void) sprintf(chrbuf, "%uc,raw", minor); + + if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR, + minor, DDI_PSEUDO, 0) == DDI_FAILURE) { + ddi_soft_state_free(zvol_state, minor); + dmu_objset_close(os); + mutex_exit(&zvol_state_lock); + return (EAGAIN); + } + + (void) sprintf(blkbuf, "%uc", minor); + + if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK, + minor, DDI_PSEUDO, 0) == DDI_FAILURE) { + ddi_remove_minor_node(zfs_dip, chrbuf); + ddi_soft_state_free(zvol_state, minor); + dmu_objset_close(os); + mutex_exit(&zvol_state_lock); + return (EAGAIN); + } + + zv = ddi_get_soft_state(zvol_state, minor); + + (void) strcpy(zv->zv_name, name); + zv->zv_min_bs = DEV_BSHIFT; + zv->zv_minor = minor; + zv->zv_volsize = volsize; + zv->zv_objset = os; + zv->zv_mode = ds_mode; + + zvol_size_changed(zv, dev); + + VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset), + "readonly", zvol_readonly_changed_cb, zv) == 0); + + zvol_minors++; + + mutex_exit(&zvol_state_lock); + + return (0); +} + +/* + * Remove minor node for the specified volume. 
+ */ +int +zvol_remove_minor(zfs_cmd_t *zc) +{ + zvol_state_t *zv; + char namebuf[30]; + + mutex_enter(&zvol_state_lock); + + if ((zv = zvol_minor_lookup(zc->zc_name)) == NULL) { + mutex_exit(&zvol_state_lock); + return (ENXIO); + } + + if (zv->zv_total_opens != 0) { + mutex_exit(&zvol_state_lock); + return (EBUSY); + } + + (void) sprintf(namebuf, "%uc,raw", zv->zv_minor); + ddi_remove_minor_node(zfs_dip, namebuf); + + (void) sprintf(namebuf, "%uc", zv->zv_minor); + ddi_remove_minor_node(zfs_dip, namebuf); + + VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset), + "readonly", zvol_readonly_changed_cb, zv) == 0); + + dmu_objset_close(zv->zv_objset); + + zv->zv_objset = NULL; + + ddi_soft_state_free(zvol_state, zv->zv_minor); + + zvol_minors--; + + mutex_exit(&zvol_state_lock); + + return (0); +} + +int +zvol_set_volsize(zfs_cmd_t *zc) +{ + zvol_state_t *zv; + dev_t dev = zc->zc_dev; + dmu_tx_t *tx; + int error; + + if ((error = zvol_check_volsize(zc)) != 0) + return (error); + + mutex_enter(&zvol_state_lock); + + if ((zv = zvol_minor_lookup(zc->zc_name)) == NULL) { + mutex_exit(&zvol_state_lock); + return (ENXIO); + } + + if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) { + mutex_exit(&zvol_state_lock); + return (EROFS); + } + + tx = dmu_tx_create(zv->zv_objset); + dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, 1); + dmu_tx_hold_free(tx, ZVOL_OBJ, zc->zc_volsize, DMU_OBJECT_END); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + mutex_exit(&zvol_state_lock); + return (error); + } + + error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1, + &zc->zc_volsize, tx); + if (error == 0) + dmu_free_range(zv->zv_objset, ZVOL_OBJ, zc->zc_volsize, + DMU_OBJECT_END, tx); + + dmu_tx_commit(tx); + + if (error == 0) { + zv->zv_volsize = zc->zc_volsize; + zvol_size_changed(zv, dev); + } + + mutex_exit(&zvol_state_lock); + + return (error); +} + +int +zvol_set_volblocksize(zfs_cmd_t *zc) +{ + zvol_state_t *zv; + dmu_tx_t *tx; + int error; + + 
mutex_enter(&zvol_state_lock); + + if ((zv = zvol_minor_lookup(zc->zc_name)) == NULL) { + mutex_exit(&zvol_state_lock); + return (ENXIO); + } + + if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) { + mutex_exit(&zvol_state_lock); + return (EROFS); + } + + tx = dmu_tx_create(zv->zv_objset); + dmu_tx_hold_bonus(tx, ZVOL_OBJ); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ, + zc->zc_volblocksize, 0, tx); + if (error == ENOTSUP) + error = EBUSY; + dmu_tx_commit(tx); + } + + mutex_exit(&zvol_state_lock); + + return (error); +} + +/*ARGSUSED*/ +int +zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr) +{ + minor_t minor = getminor(*devp); + zvol_state_t *zv; + + if (minor == 0) /* This is the control device */ + return (0); + + mutex_enter(&zvol_state_lock); + + zv = ddi_get_soft_state(zvol_state, minor); + if (zv == NULL) { + mutex_exit(&zvol_state_lock); + return (ENXIO); + } + + ASSERT(zv->zv_objset != NULL); + + if ((flag & FWRITE) && + (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY))) { + mutex_exit(&zvol_state_lock); + return (EROFS); + } + + if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) { + zv->zv_open_count[otyp]++; + zv->zv_total_opens++; + } + + mutex_exit(&zvol_state_lock); + + return (0); +} + +/*ARGSUSED*/ +int +zvol_close(dev_t dev, int flag, int otyp, cred_t *cr) +{ + minor_t minor = getminor(dev); + zvol_state_t *zv; + + if (minor == 0) /* This is the control device */ + return (0); + + mutex_enter(&zvol_state_lock); + + zv = ddi_get_soft_state(zvol_state, minor); + if (zv == NULL) { + mutex_exit(&zvol_state_lock); + return (ENXIO); + } + + /* + * The next statement is a workaround for the following DDI bug: + * 6343604 specfs race: multiple "last-close" of the same device + */ + if (zv->zv_total_opens == 0) { + mutex_exit(&zvol_state_lock); + return (0); + } + + /* + * If the open count is zero, this is a spurious close. 
+ * That indicates a bug in the kernel / DDI framework. + */ + ASSERT(zv->zv_open_count[otyp] != 0); + ASSERT(zv->zv_total_opens != 0); + + /* + * You may get multiple opens, but only one close. + */ + zv->zv_open_count[otyp]--; + zv->zv_total_opens--; + + mutex_exit(&zvol_state_lock); + + return (0); +} + +int +zvol_strategy(buf_t *bp) +{ + zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(bp->b_edev)); + uint64_t off, volsize; + size_t size, resid; + char *addr; + int error = 0; + + if (zv == NULL) { + bioerror(bp, ENXIO); + biodone(bp); + return (0); + } + + if (getminor(bp->b_edev) == 0) { + bioerror(bp, EINVAL); + biodone(bp); + return (0); + } + + if (zv->zv_readonly && !(bp->b_flags & B_READ)) { + bioerror(bp, EROFS); + biodone(bp); + return (0); + } + + off = ldbtob(bp->b_blkno); + volsize = zv->zv_volsize; + + ASSERT(zv->zv_objset != NULL); + + bp_mapin(bp); + addr = bp->b_un.b_addr; + resid = bp->b_bcount; + + while (resid != 0 && off < volsize) { + + size = MIN(resid, 1UL << 20); /* cap at 1MB per tx */ + + if (size > volsize - off) /* don't write past the end */ + size = volsize - off; + + if (bp->b_flags & B_READ) { + error = dmu_read_canfail(zv->zv_objset, ZVOL_OBJ, + off, size, addr); + } else { + dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + dmu_tx_hold_write(tx, ZVOL_OBJ, off, size); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + dmu_write(zv->zv_objset, ZVOL_OBJ, + off, size, addr, tx); + dmu_tx_commit(tx); + } + } + if (error) + break; + off += size; + addr += size; + resid -= size; + } + + if ((bp->b_resid = resid) == bp->b_bcount) + bioerror(bp, off > volsize ? 
EINVAL : error); + + biodone(bp); + return (0); +} + +/*ARGSUSED*/ +int +zvol_read(dev_t dev, uio_t *uiop, cred_t *cr) +{ + return (physio(zvol_strategy, NULL, dev, B_READ, minphys, uiop)); +} + +/*ARGSUSED*/ +int +zvol_write(dev_t dev, uio_t *uiop, cred_t *cr) +{ + return (physio(zvol_strategy, NULL, dev, B_WRITE, minphys, uiop)); +} + +/*ARGSUSED*/ +int +zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr) +{ + return (aphysio(zvol_strategy, anocancel, dev, B_READ, minphys, aio)); +} + +/*ARGSUSED*/ +int +zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr) +{ + return (aphysio(zvol_strategy, anocancel, dev, B_WRITE, minphys, aio)); +} + +/* + * Dirtbag ioctls to support mkfs(1M) for UFS filesystems. See dkio(7I). + */ +/*ARGSUSED*/ +int +zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) +{ + zvol_state_t *zv; + struct dk_cinfo dkc; + struct dk_minfo dkm; + dk_efi_t efi; + efi_gpt_t gpt; + efi_gpe_t gpe; + struct uuid uuid = EFI_RESERVED; + uint32_t crc; + int error = 0; + + mutex_enter(&zvol_state_lock); + + zv = ddi_get_soft_state(zvol_state, getminor(dev)); + + if (zv == NULL) { + mutex_exit(&zvol_state_lock); + return (ENXIO); + } + + switch (cmd) { + + case DKIOCINFO: + bzero(&dkc, sizeof (dkc)); + (void) strcpy(dkc.dki_cname, "zvol"); + (void) strcpy(dkc.dki_dname, "zvol"); + dkc.dki_ctype = DKC_UNKNOWN; + dkc.dki_maxtransfer = 1 << 15; + mutex_exit(&zvol_state_lock); + if (ddi_copyout(&dkc, (void *)arg, sizeof (dkc), flag)) + error = EFAULT; + return (error); + + case DKIOCGMEDIAINFO: + bzero(&dkm, sizeof (dkm)); + dkm.dki_lbsize = 1U << zv->zv_min_bs; + dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs; + dkm.dki_media_type = DK_UNKNOWN; + mutex_exit(&zvol_state_lock); + if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag)) + error = EFAULT; + return (error); + + case DKIOCGETEFI: + if (ddi_copyin((void *)arg, &efi, sizeof (dk_efi_t), flag)) { + mutex_exit(&zvol_state_lock); + return (EFAULT); + } + + bzero(&gpt, 
sizeof (gpt)); + bzero(&gpe, sizeof (gpe)); + + efi.dki_data = (void *)(uintptr_t)efi.dki_data_64; + + if (efi.dki_length < sizeof (gpt) + sizeof (gpe)) { + mutex_exit(&zvol_state_lock); + return (EINVAL); + } + + efi.dki_length = sizeof (gpt) + sizeof (gpe); + + gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE); + gpt.efi_gpt_Revision = LE_32(EFI_VERSION102); + gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt)); + gpt.efi_gpt_FirstUsableLBA = LE_64(0ULL); + gpt.efi_gpt_LastUsableLBA = + LE_64((zv->zv_volsize >> zv->zv_min_bs) - 1); + gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1); + gpt.efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (gpe)); + + UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid); + gpe.efi_gpe_StartingLBA = gpt.efi_gpt_FirstUsableLBA; + gpe.efi_gpe_EndingLBA = gpt.efi_gpt_LastUsableLBA; + + CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table); + gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc); + + CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table); + gpt.efi_gpt_HeaderCRC32 = LE_32(~crc); + + mutex_exit(&zvol_state_lock); + if (ddi_copyout(&gpt, efi.dki_data, sizeof (gpt), flag) || + ddi_copyout(&gpe, efi.dki_data + 1, sizeof (gpe), flag)) + error = EFAULT; + return (error); + + default: + error = ENOTSUP; + break; + + } + mutex_exit(&zvol_state_lock); + return (error); +} + +int +zvol_busy(void) +{ + return (zvol_minors != 0); +} + +void +zvol_init(void) +{ + VERIFY(ddi_soft_state_init(&zvol_state, sizeof (zvol_state_t), 1) == 0); + mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL); +} + +void +zvol_fini(void) +{ + mutex_destroy(&zvol_state_lock); + ddi_soft_state_fini(&zvol_state); +} |