Diffstat (limited to 'usr/src')
-rw-r--r-- usr/src/cmd/zdb/zdb.c | 8
-rw-r--r-- usr/src/cmd/zinject/translate.c | 4
-rw-r--r-- usr/src/lib/libavl/mapfile-vers | 4
-rw-r--r-- usr/src/lib/libzpool/common/sys/zfs_context.h | 20
-rw-r--r-- usr/src/uts/common/Makefile.files | 3
-rw-r--r-- usr/src/uts/common/fs/dnlc.c | 27
-rw-r--r-- usr/src/uts/common/fs/zfs/dbuf.c | 293
-rw-r--r-- usr/src/uts/common/fs/zfs/dmu.c | 202
-rw-r--r-- usr/src/uts/common/fs/zfs/dmu_object.c | 6
-rw-r--r-- usr/src/uts/common/fs/zfs/dmu_objset.c | 109
-rw-r--r-- usr/src/uts/common/fs/zfs/dmu_tx.c | 35
-rw-r--r-- usr/src/uts/common/fs/zfs/dnode.c | 562
-rw-r--r-- usr/src/uts/common/fs/zfs/dnode_sync.c | 34
-rw-r--r-- usr/src/uts/common/fs/zfs/refcount.c | 35
-rw-r--r-- usr/src/uts/common/fs/zfs/sa.c | 7
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/dbuf.h | 60
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/dmu.h | 1
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/dmu_objset.h | 20
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/dnode.h | 31
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/refcount.h | 7
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/sa_impl.h | 5
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/zfs_znode.h | 4
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/zrlock.h | 66
-rw-r--r-- usr/src/uts/common/fs/zfs/zfs_znode.c | 20
-rw-r--r-- usr/src/uts/common/fs/zfs/zrlock.c | 194
-rw-r--r-- usr/src/uts/common/sys/dnlc.h | 25
-rw-r--r-- usr/src/uts/common/sys/kmem.h | 9
-rw-r--r-- usr/src/uts/intel/io/dktp/dcdev/dadk.c | 11
-rw-r--r-- usr/src/uts/sun/io/dada/targets/dad.c | 10
29 files changed, 1491 insertions(+), 321 deletions(-)
diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c
index 31681b2d5a..56686fe30a 100644
--- a/usr/src/cmd/zdb/zdb.c
+++ b/usr/src/cmd/zdb/zdb.c
@@ -1459,7 +1459,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
}
if (object == 0) {
- dn = os->os_meta_dnode;
+ dn = DMU_META_DNODE(os);
} else {
error = dmu_bonus_hold(os, object, FTAG, &db);
if (error)
@@ -1467,7 +1467,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
object, error);
bonus = db->db_data;
bsize = db->db_size;
- dn = ((dmu_buf_impl_t *)db)->db_dnode;
+ dn = DB_DNODE((dmu_buf_impl_t *)db);
}
dmu_object_info_from_dnode(dn, &doi);
@@ -1631,8 +1631,8 @@ dump_dir(objset_t *os)
dump_object(os, 0, verbosity, &print_header);
object_count = 0;
- if (os->os_userused_dnode &&
- os->os_userused_dnode->dn_type != 0) {
+ if (DMU_USERUSED_DNODE(os) != NULL &&
+ DMU_USERUSED_DNODE(os)->dn_type != 0) {
dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header);
dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header);
}
diff --git a/usr/src/cmd/zinject/translate.c b/usr/src/cmd/zinject/translate.c
index cd967a8451..87751e315e 100644
--- a/usr/src/cmd/zinject/translate.c
+++ b/usr/src/cmd/zinject/translate.c
@@ -267,7 +267,7 @@ calculate_range(const char *dataset, err_type_t type, int level, char *range,
}
if (record->zi_object == 0) {
- dn = os->os_meta_dnode;
+ dn = DMU_META_DNODE(os);
} else {
err = dnode_hold(os, record->zi_object, FTAG, &dn);
if (err != 0) {
@@ -318,7 +318,7 @@ calculate_range(const char *dataset, err_type_t type, int level, char *range,
ret = 0;
out:
if (dn) {
- if (dn != os->os_meta_dnode)
+ if (dn != DMU_META_DNODE(os))
dnode_rele(dn, FTAG);
}
if (os)
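
Both zdb and zinject stop dereferencing os->os_meta_dnode directly and go through the DMU_META_DNODE() accessor instead. The reason is visible in the diffstat: the objset now embeds dnode handles rather than bare dnode_t pointers, so a relocatable dnode can be swapped behind the handle without invalidating every cached pointer. A minimal sketch of the shape this implies (the exact definitions live in the sys/dnode.h and sys/dmu_objset.h diffs, which are not shown in this listing, so treat the details as assumptions):

        /* sketch: a handle pins a relocatable dnode behind a zrlock */
        typedef struct dnode_handle {
                zrlock_t        dnh_zrlock;     /* serializes against dnode_move() */
                dnode_t         *dnh_dnode;
        } dnode_handle_t;

        /* the special dnodes become handles in objset_t, hence the accessors */
        #define DMU_META_DNODE(os)      ((os)->os_meta_dnode.dnh_dnode)
        #define DMU_USERUSED_DNODE(os)  ((os)->os_userused_dnode.dnh_dnode)
        #define DMU_GROUPUSED_DNODE(os) ((os)->os_groupused_dnode.dnh_dnode)
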
diff --git a/usr/src/lib/libavl/mapfile-vers b/usr/src/lib/libavl/mapfile-vers
index 97433d8c4c..2f5c4641ee 100644
--- a/usr/src/lib/libavl/mapfile-vers
+++ b/usr/src/lib/libavl/mapfile-vers
@@ -19,8 +19,7 @@
# CDDL HEADER END
#
#
-# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
+# Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
#
#
@@ -47,6 +46,7 @@ SUNWprivate_1.1 {
avl_first;
avl_insert;
avl_insert_here;
+ avl_is_empty;
avl_last;
avl_nearest;
avl_numnodes;
diff --git a/usr/src/lib/libzpool/common/sys/zfs_context.h b/usr/src/lib/libzpool/common/sys/zfs_context.h
index 9a6d712e53..51130b86d8 100644
--- a/usr/src/lib/libzpool/common/sys/zfs_context.h
+++ b/usr/src/lib/libzpool/common/sys/zfs_context.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_ZFS_CONTEXT_H
@@ -231,8 +230,10 @@ typedef struct kmutex {
} kmutex_t;
#define MUTEX_DEFAULT USYNC_THREAD
-#undef MUTEX_HELD
+#undef MUTEX_HELD
+#undef MUTEX_NOT_HELD
#define MUTEX_HELD(m) _mutex_held(&(m)->m_lock)
+#define MUTEX_NOT_HELD(m) (!MUTEX_HELD(m))
/*
* Argh -- we have to get cheesy here because the kernel and userland
@@ -323,10 +324,21 @@ extern void kstat_delete(kstat_t *);
#define kmem_cache_alloc(_c, _f) umem_cache_alloc(_c, _f)
#define kmem_cache_free(_c, _b) umem_cache_free(_c, _b)
#define kmem_debugging() 0
-#define kmem_cache_reap_now(c)
+#define kmem_cache_reap_now(_c) /* nothing */
+#define kmem_cache_set_move(_c, _cb) /* nothing */
+#define POINTER_INVALIDATE(_pp) /* nothing */
+#define POINTER_IS_VALID(_p) 0
typedef umem_cache_t kmem_cache_t;
+typedef enum kmem_cbrc {
+ KMEM_CBRC_YES,
+ KMEM_CBRC_NO,
+ KMEM_CBRC_LATER,
+ KMEM_CBRC_DONT_NEED,
+ KMEM_CBRC_DONT_KNOW
+} kmem_cbrc_t;
+
/*
* Task queues
*/
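
The userland context header only stubs these out; in the kernel, kmem_cache_set_move() registers a relocation callback that the slab allocator invokes when it wants to defragment a cache, and kmem_cbrc_t is the callback's verdict. A hedged sketch of the contract (example_t, its idleness check, and the cache name are hypothetical, not part of this change):

        /* sketch of a kmem move callback; the allocator proposes old -> new */
        static kmem_cbrc_t
        example_move(void *old, void *new, size_t size, void *arg)
        {
                example_t *ep = old;

                if (!example_is_idle(ep))       /* hypothetical: object in use */
                        return (KMEM_CBRC_LATER);       /* ask again later */

                bcopy(old, new, size);          /* relocate the object */
                return (KMEM_CBRC_YES);         /* allocator frees old buffer */
        }

        kmem_cache_set_move(example_cache, example_move);

In userland there is no mover, so the new macros collapse to no-ops and assertions written against POINTER_IS_VALID() hold vacuously.
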
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index a3f07f4cc4..ddad8ba466 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -1380,7 +1380,8 @@ ZFS_COMMON_OBJS += \
zio_checksum.o \
zio_compress.o \
zio_inject.o \
- zle.o
+ zle.o \
+ zrlock.o
ZFS_SHARED_OBJS += \
zfs_namecheck.o \
diff --git a/usr/src/uts/common/fs/dnlc.c b/usr/src/uts/common/fs/dnlc.c
index 0941dfb9ac..b45e3b17cb 100644
--- a/usr/src/uts/common/fs/dnlc.c
+++ b/usr/src/uts/common/fs/dnlc.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -278,7 +277,8 @@ vnode_t negative_cache_vnode;
*/
#define DNLC_DIR_HASH(name, hash, namelen) \
{ \
- char Xc, *Xcp; \
+ char Xc; \
+ const char *Xcp; \
hash = *name; \
for (Xcp = (name + 1); (Xc = *Xcp) != 0; Xcp++) \
hash = (hash << 4) + hash + Xc; \
@@ -322,7 +322,8 @@ static dchead_t dc_head; /* anchor of cached directories */
/* Prototypes */
static ncache_t *dnlc_get(uchar_t namlen);
-static ncache_t *dnlc_search(vnode_t *dp, char *name, uchar_t namlen, int hash);
+static ncache_t *dnlc_search(vnode_t *dp, const char *name, uchar_t namlen,
+ int hash);
static void dnlc_dir_reclaim(void *unused);
static void dnlc_dir_abort(dircache_t *dcp);
static void dnlc_dir_adjust_fhash(dircache_t *dcp);
@@ -431,7 +432,7 @@ dnlc_init()
* Add a name to the directory cache.
*/
void
-dnlc_enter(vnode_t *dp, char *name, vnode_t *vp)
+dnlc_enter(vnode_t *dp, const char *name, vnode_t *vp)
{
ncache_t *ncp;
nc_hash_t *hp;
@@ -497,7 +498,7 @@ dnlc_enter(vnode_t *dp, char *name, vnode_t *vp)
* it just frees up the newly allocated dnlc entry.
*/
void
-dnlc_update(vnode_t *dp, char *name, vnode_t *vp)
+dnlc_update(vnode_t *dp, const char *name, vnode_t *vp)
{
ncache_t *ncp;
ncache_t *tcp;
@@ -579,7 +580,7 @@ dnlc_update(vnode_t *dp, char *name, vnode_t *vp)
* lost before the caller can use the vnode.
*/
vnode_t *
-dnlc_lookup(vnode_t *dp, char *name)
+dnlc_lookup(vnode_t *dp, const char *name)
{
ncache_t *ncp;
nc_hash_t *hp;
@@ -660,7 +661,7 @@ dnlc_lookup(vnode_t *dp, char *name)
* Remove an entry in the directory name cache.
*/
void
-dnlc_remove(vnode_t *dp, char *name)
+dnlc_remove(vnode_t *dp, const char *name)
{
ncache_t *ncp;
nc_hash_t *hp;
@@ -968,7 +969,7 @@ dnlc_reverse_lookup(vnode_t *vp, char *buf, size_t buflen)
* ncache entry if found, NULL otherwise.
*/
static ncache_t *
-dnlc_search(vnode_t *dp, char *name, uchar_t namlen, int hash)
+dnlc_search(vnode_t *dp, const char *name, uchar_t namlen, int hash)
{
nc_hash_t *hp;
ncache_t *ncp;
@@ -1141,7 +1142,7 @@ found:
* Look up an entry in a complete or partial directory cache.
*/
dcret_t
-dnlc_dir_lookup(dcanchor_t *dcap, char *name, uint64_t *handle)
+dnlc_dir_lookup(dcanchor_t *dcap, const char *name, uint64_t *handle)
{
dircache_t *dcp;
dcentry_t *dep;
@@ -1282,7 +1283,7 @@ error:
* Add a directory entry to a partial or complete directory cache.
*/
dcret_t
-dnlc_dir_add_entry(dcanchor_t *dcap, char *name, uint64_t handle)
+dnlc_dir_add_entry(dcanchor_t *dcap, const char *name, uint64_t handle)
{
dircache_t *dcp;
dcentry_t **hp, *dep;
@@ -1583,7 +1584,7 @@ dnlc_dir_purge(dcanchor_t *dcap)
* Return the handle if it's non null.
*/
dcret_t
-dnlc_dir_rem_entry(dcanchor_t *dcap, char *name, uint64_t *handlep)
+dnlc_dir_rem_entry(dcanchor_t *dcap, const char *name, uint64_t *handlep)
{
dircache_t *dcp;
dcentry_t **prevpp, *te;
@@ -1782,7 +1783,7 @@ dnlc_dir_rem_space_by_handle(dcanchor_t *dcap, uint64_t handle)
* Update the handle of a directory cache entry.
*/
dcret_t
-dnlc_dir_update(dcanchor_t *dcap, char *name, uint64_t handle)
+dnlc_dir_update(dcanchor_t *dcap, const char *name, uint64_t handle)
{
dircache_t *dcp;
dcentry_t *dep;
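
The dnlc.c changes are const-correctness fixes: lookup names are never modified, so the entry points and DNLC_DIR_HASH now take const char *. The hash itself is untouched; it is a multiply-by-17 accumulation over the name bytes. For reference, an equivalent function form (a sketch matching the macro, not code from the patch):

        /* equivalent of DNLC_DIR_HASH: hash = hash * 17 + c over each byte */
        static int
        dnlc_dir_hash_sketch(const char *name, uchar_t *namelen)
        {
                int hash = *name;
                const char *cp;

                for (cp = name + 1; *cp != '\0'; cp++)
                        hash = (hash << 4) + hash + *cp;  /* hash * 17 + c */
                *namelen = (uchar_t)(cp - name);
                return (hash);
        }
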
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index f3388e1360..b73dbdcfd7 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -217,6 +217,22 @@ dbuf_evict_user(dmu_buf_impl_t *db)
db->db_evict_func = NULL;
}
+boolean_t
+dbuf_is_metadata(dmu_buf_impl_t *db)
+{
+ if (db->db_level > 0) {
+ return (B_TRUE);
+ } else {
+ boolean_t is_metadata;
+
+ DB_DNODE_ENTER(db);
+ is_metadata = dmu_ot[DB_DNODE(db)->dn_type].ot_metadata;
+ DB_DNODE_EXIT(db);
+
+ return (is_metadata);
+ }
+}
+
void
dbuf_evict(dmu_buf_impl_t *db)
{
@@ -281,7 +297,7 @@ dbuf_fini(void)
static void
dbuf_verify(dmu_buf_impl_t *db)
{
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
dbuf_dirty_record_t *dr;
ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -290,6 +306,8 @@ dbuf_verify(dmu_buf_impl_t *db)
return;
ASSERT(db->db_objset != NULL);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
if (dn == NULL) {
ASSERT(db->db_parent == NULL);
ASSERT(db->db_blkptr == NULL);
@@ -297,8 +315,9 @@ dbuf_verify(dmu_buf_impl_t *db)
ASSERT3U(db->db.db_object, ==, dn->dn_object);
ASSERT3P(db->db_objset, ==, dn->dn_objset);
ASSERT3U(db->db_level, <, dn->dn_nlevels);
- ASSERT(db->db_blkid == DMU_BONUS_BLKID || db->db_blkid ==
- DMU_SPILL_BLKID || list_head(&dn->dn_dbufs));
+ ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
+ db->db_blkid == DMU_SPILL_BLKID ||
+ !list_is_empty(&dn->dn_dbufs));
}
if (db->db_blkid == DMU_BONUS_BLKID) {
ASSERT(dn != NULL);
@@ -355,7 +374,7 @@ dbuf_verify(dmu_buf_impl_t *db)
* have the struct_rwlock. XXX indblksz no longer
* grows. safe to do this now?
*/
- if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
+ if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
ASSERT3P(db->db_blkptr, ==,
((blkptr_t *)db->db_parent->db.db_data +
db->db_blkid % epb));
@@ -380,6 +399,7 @@ dbuf_verify(dmu_buf_impl_t *db)
}
}
}
+ DB_DNODE_EXIT(db);
}
#endif
@@ -424,8 +444,11 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
mutex_enter(&db->db_mtx);
if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
int blksz = db->db.db_size;
+ spa_t *spa;
+
mutex_exit(&db->db_mtx);
- abuf = arc_loan_buf(db->db_dnode->dn_objset->os_spa, blksz);
+ DB_GET_SPA(&spa, db);
+ abuf = arc_loan_buf(spa, blksz);
bcopy(db->db.db_data, abuf->b_data, blksz);
} else {
abuf = db->db_buf;
@@ -484,11 +507,14 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
+ spa_t *spa;
zbookmark_t zb;
uint32_t aflags = ARC_NOWAIT;
arc_buf_t *pbuf;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
ASSERT(!refcount_is_zero(&db->db_holds));
/* We need the struct_rwlock to prevent db_blkptr from changing. */
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
@@ -506,6 +532,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
bzero(db->db.db_data, DN_MAX_BONUSLEN);
if (bonuslen)
bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
+ DB_DNODE_EXIT(db);
dbuf_update_data(db);
db->db_state = DB_CACHED;
mutex_exit(&db->db_mtx);
@@ -524,6 +551,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
db->db.db_size, db, type));
+ DB_DNODE_EXIT(db);
bzero(db->db.db_data, db->db.db_size);
db->db_state = DB_CACHED;
*flags |= DB_RF_CACHED;
@@ -531,6 +559,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
return;
}
+ spa = dn->dn_objset->os_spa;
+ DB_DNODE_EXIT(db);
+
db->db_state = DB_READ;
mutex_exit(&db->db_mtx);
@@ -549,7 +580,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
else
pbuf = db->db_objset->os_phys_buf;
- (void) dsl_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf,
+ (void) dsl_read(zio, spa, db->db_blkptr, pbuf,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
(*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
&aflags, &zb);
@@ -563,6 +594,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
int err = 0;
int havepzio = (zio != NULL);
int prefetch;
+ dnode_t *dn;
/*
* We don't have to hold the mutex to check db_state because it
@@ -573,46 +605,51 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
if (db->db_state == DB_NOFILL)
return (EIO);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
- (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL &&
+ (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
DBUF_IS_CACHEABLE(db);
mutex_enter(&db->db_mtx);
if (db->db_state == DB_CACHED) {
mutex_exit(&db->db_mtx);
if (prefetch)
- dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+ dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
db->db.db_size, TRUE);
if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&db->db_dnode->dn_struct_rwlock);
+ rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(db);
} else if (db->db_state == DB_UNCACHED) {
- if (zio == NULL) {
- zio = zio_root(db->db_dnode->dn_objset->os_spa,
- NULL, NULL, ZIO_FLAG_CANFAIL);
- }
+ spa_t *spa = dn->dn_objset->os_spa;
+
+ if (zio == NULL)
+ zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
dbuf_read_impl(db, zio, &flags);
/* dbuf_read_impl has dropped db_mtx for us */
if (prefetch)
- dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+ dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
db->db.db_size, flags & DB_RF_CACHED);
if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&db->db_dnode->dn_struct_rwlock);
+ rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(db);
if (!havepzio)
err = zio_wait(zio);
} else {
mutex_exit(&db->db_mtx);
if (prefetch)
- dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+ dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
db->db.db_size, TRUE);
if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&db->db_dnode->dn_struct_rwlock);
+ rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(db);
mutex_enter(&db->db_mtx);
if ((flags & DB_RF_NEVERWAIT) == 0) {
@@ -642,11 +679,12 @@ dbuf_noread(dmu_buf_impl_t *db)
cv_wait(&db->db_changed, &db->db_mtx);
if (db->db_state == DB_UNCACHED) {
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ spa_t *spa;
ASSERT(db->db_buf == NULL);
ASSERT(db->db.db_data == NULL);
- dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
- db->db.db_size, db, type));
+ DB_GET_SPA(&spa, db);
+ dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
db->db_state = DB_FILL;
} else if (db->db_state == DB_NOFILL) {
dbuf_set_data(db, NULL);
@@ -687,7 +725,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
/*
* If the last dirty record for this dbuf has not yet synced
* and it's referencing the dbuf data, either:
- * reset the reference to point to a new copy,
+ * reset the reference to point to a new copy,
* or (if there are no active holders)
* just null out the current db_data pointer.
*/
@@ -700,8 +738,10 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
int size = db->db.db_size;
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- dr->dt.dl.dr_data = arc_buf_alloc(
- db->db_dnode->dn_objset->os_spa, size, db, type);
+ spa_t *spa;
+
+ DB_GET_SPA(&spa, db);
+ dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
} else {
dbuf_set_data(db, NULL);
@@ -726,9 +766,12 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
ASSERT(db->db_data_pending != dr);
/* free this block */
- if (!BP_IS_HOLE(bp))
- zio_free(db->db_dnode->dn_objset->os_spa, txg, bp);
+ if (!BP_IS_HOLE(bp)) {
+ spa_t *spa;
+ DB_GET_SPA(&spa, db);
+ zio_free(spa, txg, bp);
+ }
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
/*
* Release the already-written buffer, so we leave it in
@@ -884,11 +927,15 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
arc_buf_t *buf, *obuf;
int osize = db->db.db_size;
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ dnode_t *dn;
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
/* XXX does *this* func really need the lock? */
- ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
/*
* This call to dbuf_will_dirty() with the dn_struct_rwlock held
@@ -903,7 +950,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
dbuf_will_dirty(db, tx);
/* create the data buffer for the new block */
- buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);
+ buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
/* copy old block data to the new block */
obuf = db->db_buf;
@@ -923,15 +970,17 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
}
mutex_exit(&db->db_mtx);
- dnode_willuse_space(db->db_dnode, size-osize, tx);
+ dnode_willuse_space(dn, size-osize, tx);
+ DB_DNODE_EXIT(db);
}
void
dbuf_release_bp(dmu_buf_impl_t *db)
{
- objset_t *os = db->db_dnode->dn_objset;
+ objset_t *os;
zbookmark_t zb;
+ DB_GET_OBJSET(&os, db);
ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
ASSERT(arc_released(os->os_phys_buf) ||
list_link_active(&os->os_dsl_dataset->ds_synced_link));
@@ -949,8 +998,8 @@ dbuf_release_bp(dmu_buf_impl_t *db)
dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
- dnode_t *dn = db->db_dnode;
- objset_t *os = dn->dn_objset;
+ dnode_t *dn;
+ objset_t *os;
dbuf_dirty_record_t **drp, *dr;
int drop_struct_lock = FALSE;
boolean_t do_free_accounting = B_FALSE;
@@ -960,6 +1009,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
ASSERT(!refcount_is_zero(&db->db_holds));
DMU_TX_DIRTY_BUF(tx, db);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
/*
* Shouldn't dirty a regular buffer in syncing context. Private
* objects may be dirtied in syncing context, but only if they
@@ -1014,6 +1065,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
drp = &dr->dr_next;
if (dr && dr->dr_txg == tx->tx_txg) {
+ DB_DNODE_EXIT(db);
+
if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
/*
* If this buffer has already been written out,
@@ -1049,6 +1102,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
* we already dirtied it in open context. Hence we must make
* this assertion only if we're not already dirty.
*/
+ os = dn->dn_objset;
ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
ASSERT(db->db.db_size != 0);
@@ -1137,6 +1191,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
mutex_exit(&dn->dn_mtx);
dnode_setdirty(dn, tx);
+ DB_DNODE_EXIT(db);
return (dr);
} else if (do_free_accounting) {
blkptr_t *bp = db->db_blkptr;
@@ -1199,8 +1254,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
} else {
ASSERT(db->db_level+1 == dn->dn_nlevels);
ASSERT(db->db_blkid < dn->dn_nblkptr);
- ASSERT(db->db_parent == NULL ||
- db->db_parent == db->db_dnode->dn_dbuf);
+ ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
mutex_enter(&dn->dn_mtx);
ASSERT(!list_link_active(&dr->dr_dirty_node));
list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
@@ -1210,13 +1264,14 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
}
dnode_setdirty(dn, tx);
+ DB_DNODE_EXIT(db);
return (dr);
}
static int
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
uint64_t txg = tx->tx_txg;
dbuf_dirty_record_t *dr, **drp;
@@ -1237,6 +1292,9 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
ASSERT(dr->dr_txg == txg);
ASSERT(dr->dr_dbuf == db);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
/*
* If this buffer is currently held, we cannot undirty
* it, since one of the current holders may be in the
@@ -1249,6 +1307,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
mutex_enter(&dn->dn_mtx);
dnode_clear_range(dn, db->db_blkid, 1, tx);
mutex_exit(&dn->dn_mtx);
+ DB_DNODE_EXIT(db);
return (0);
}
@@ -1270,6 +1329,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
mutex_exit(&dn->dn_mtx);
}
+ DB_DNODE_EXIT(db);
if (db->db_level == 0) {
if (db->db_state != DB_NOFILL) {
@@ -1315,8 +1375,10 @@ dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
ASSERT(tx->tx_txg != 0);
ASSERT(!refcount_is_zero(&db->db_holds));
- if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
+ DB_DNODE_ENTER(db);
+ if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
rf |= DB_RF_HAVESTRUCT;
+ DB_DNODE_EXIT(db);
(void) dbuf_read(db, NULL, rf);
(void) dbuf_dirty(db, tx);
}
@@ -1378,7 +1440,6 @@ void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
ASSERT(!refcount_is_zero(&db->db_holds));
- ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT);
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(db->db_level == 0);
ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
@@ -1442,7 +1503,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
* in this case. For callers from the DMU we will usually see:
* dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
* For the arc callback, we will usually see:
- * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
+ * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
* Sometimes, though, we will get a mix of these two:
* DMU: dbuf_clear()->arc_buf_evict()
* ARC: dbuf_do_evict()->dbuf_destroy()
@@ -1450,9 +1511,9 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
void
dbuf_clear(dmu_buf_impl_t *db)
{
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
dmu_buf_impl_t *parent = db->db_parent;
- dmu_buf_impl_t *dndb = dn->dn_dbuf;
+ dmu_buf_impl_t *dndb;
int dbuf_gone = FALSE;
ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -1476,10 +1537,26 @@ dbuf_clear(dmu_buf_impl_t *db)
db->db_state = DB_EVICTING;
db->db_blkptr = NULL;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ dndb = dn->dn_dbuf;
if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
list_remove(&dn->dn_dbufs, db);
+ (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
+ membar_producer();
+ DB_DNODE_EXIT(db);
+ /*
+ * Decrementing the dbuf count means that the hold corresponding
+ * to the removed dbuf is no longer discounted in dnode_move(),
+ * so the dnode cannot be moved until after we release the hold.
+ * The membar_producer() ensures visibility of the decremented
+ * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
+ * release any lock.
+ */
dnode_rele(dn, db);
- db->db_dnode = NULL;
+ db->db_dnode_handle = NULL;
+ } else {
+ DB_DNODE_EXIT(db);
}
if (db->db_buf)
@@ -1489,7 +1566,7 @@ dbuf_clear(dmu_buf_impl_t *db)
mutex_exit(&db->db_mtx);
/*
- * If this dbuf is referened from an indirect dbuf,
+ * If this dbuf is referenced from an indirect dbuf,
* decrement the ref count on the indirect dbuf.
*/
if (parent && parent != dndb)
@@ -1581,7 +1658,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db_blkid = blkid;
db->db_last_dirty = NULL;
db->db_dirtycnt = 0;
- db->db_dnode = dn;
+ db->db_dnode_handle = dn->dn_handle;
db->db_parent = parent;
db->db_blkptr = blkptr;
@@ -1638,6 +1715,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
refcount_count(&dn->dn_holds) > 0);
(void) refcount_add(&dn->dn_holds, db);
+ (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
dprintf_dbuf(db, "db=%p\n", db);
@@ -1677,15 +1755,24 @@ dbuf_destroy(dmu_buf_impl_t *db)
* If this dbuf is still on the dn_dbufs list,
* remove it from that list.
*/
- if (db->db_dnode) {
- dnode_t *dn = db->db_dnode;
+ if (db->db_dnode_handle != NULL) {
+ dnode_t *dn;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
mutex_enter(&dn->dn_dbufs_mtx);
list_remove(&dn->dn_dbufs, db);
+ (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
mutex_exit(&dn->dn_dbufs_mtx);
-
+ DB_DNODE_EXIT(db);
+ /*
+ * Decrementing the dbuf count means that the hold
+ * corresponding to the removed dbuf is no longer
+ * discounted in dnode_move(), so the dnode cannot be
+ * moved until after we release the hold.
+ */
dnode_rele(dn, db);
- db->db_dnode = NULL;
+ db->db_dnode_handle = NULL;
}
dbuf_hash_remove(db);
}
@@ -1824,7 +1911,7 @@ top:
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
dbuf_set_data(db,
- arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+ arc_buf_alloc(dn->dn_objset->os_spa,
db->db.db_size, db, type));
bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
db->db.db_size);
@@ -1840,7 +1927,7 @@ top:
if (parent)
dbuf_rele(parent, NULL);
- ASSERT3P(db->db_dnode, ==, dn);
+ ASSERT3P(DB_DNODE(db), ==, dn);
ASSERT3U(db->db_blkid, ==, blkid);
ASSERT3U(db->db_level, ==, level);
*dbp = db;
@@ -1877,6 +1964,8 @@ int
dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+
if (db->db_blkid != DMU_SPILL_BLKID)
return (ENOTSUP);
if (blksz == 0)
@@ -1886,9 +1975,12 @@ dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
else
blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
- rw_enter(&db->db_dnode->dn_struct_rwlock, RW_WRITER);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
dbuf_new_size(db, blksz, tx);
- rw_exit(&db->db_dnode->dn_struct_rwlock);
+ rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(db);
return (0);
}
@@ -1907,6 +1999,13 @@ dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
ASSERT(holds > 1);
}
+/*
+ * If you call dbuf_rele() you had better not be referencing the dnode handle
+ * unless you have some other direct or indirect hold on the dnode. (An indirect
+ * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
+ * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
+ * dnode's parent dbuf evicting its dnode handles.
+ */
#pragma weak dmu_buf_rele = dbuf_rele
void
dbuf_rele(dmu_buf_impl_t *db, void *tag)
@@ -1927,6 +2026,11 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
ASSERT(MUTEX_HELD(&db->db_mtx));
DBUF_VERIFY(db);
+ /*
+ * Remove the reference to the dbuf before removing its hold on the
+ * dnode so we can guarantee in dnode_move() that a referenced bonus
+ * buffer has a corresponding dnode hold.
+ */
holds = refcount_remove(&db->db_holds, tag);
ASSERT(holds >= 0);
@@ -1944,7 +2048,20 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
if (holds == 0) {
if (db->db_blkid == DMU_BONUS_BLKID) {
mutex_exit(&db->db_mtx);
- dnode_rele(db->db_dnode, db);
+
+ /*
+ * If the dnode moves here, we cannot cross this barrier
+ * until the move completes.
+ */
+ DB_DNODE_ENTER(db);
+ (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
+ DB_DNODE_EXIT(db);
+ /*
+ * The bonus buffer's dnode hold is no longer discounted
+ * in dnode_move(). The dnode cannot move until after
+ * the dnode_rele().
+ */
+ dnode_rele(DB_DNODE(db), db);
} else if (db->db_buf == NULL) {
/*
* This is a special case: we never associated this
@@ -2095,7 +2212,7 @@ static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
zio_t *zio;
ASSERT(dmu_tx_is_syncing(tx));
@@ -2113,10 +2230,13 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
mutex_enter(&db->db_mtx);
}
ASSERT3U(db->db_state, ==, DB_CACHED);
- ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
ASSERT(db->db_buf != NULL);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
dbuf_check_blkptr(dn, db);
+ DB_DNODE_EXIT(db);
db->db_data_pending = dr;
@@ -2136,8 +2256,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
arc_buf_t **datap = &dr->dt.dl.dr_data;
dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn = db->db_dnode;
- objset_t *os = dn->dn_objset;
+ dnode_t *dn;
+ objset_t *os;
uint64_t txg = tx->tx_txg;
ASSERT(dmu_tx_is_syncing(tx));
@@ -2160,6 +2280,9 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
}
DBUF_VERIFY(db);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
if (db->db_blkid == DMU_SPILL_BLKID) {
mutex_enter(&dn->dn_mtx);
dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
@@ -2179,6 +2302,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
ASSERT3U(db->db_level, ==, 0);
ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
+ DB_DNODE_EXIT(db);
+
if (*datap != db->db.db_data) {
zio_buf_free(*datap, DN_MAX_BONUSLEN);
arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
@@ -2197,6 +2322,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
return;
}
+ os = dn->dn_objset;
+
/*
* This function may have dropped the db_mtx lock allowing a dmu_sync
* operation to sneak in. As a result, we need to ensure that we
@@ -2206,7 +2333,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dbuf_check_blkptr(dn, db);
/*
- * If this buffer is in the middle of an immdiate write,
+ * If this buffer is in the middle of an immediate write,
* wait for the synchronous IO to complete.
*/
while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
@@ -2243,10 +2370,20 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dbuf_write(dr, *datap, tx);
ASSERT(!list_link_active(&dr->dr_dirty_node));
- if (dn->dn_object == DMU_META_DNODE_OBJECT)
+ if (dn->dn_object == DMU_META_DNODE_OBJECT) {
list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
- else
+ DB_DNODE_EXIT(db);
+ } else {
+ /*
+ * Although zio_nowait() does not "wait for an IO", it does
+ * initiate the IO. If this is an empty write it seems plausible
+ * that the IO could actually be completed before the nowait
+ * returns. We need to DB_DNODE_EXIT() first in case
+ * zio_nowait() invalidates the dbuf.
+ */
+ DB_DNODE_EXIT(db);
zio_nowait(dr->dr_zio);
+ }
}
void
@@ -2280,9 +2417,9 @@ static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
dmu_buf_impl_t *db = vdb;
+ dnode_t *dn;
blkptr_t *bp = zio->io_bp;
blkptr_t *bp_orig = &zio->io_bp_orig;
- dnode_t *dn = db->db_dnode;
spa_t *spa = zio->io_spa;
int64_t delta;
uint64_t fill = 0;
@@ -2290,12 +2427,15 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
ASSERT(db->db_blkptr == bp);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
zio->io_prev_space_delta = delta;
if (BP_IS_HOLE(bp)) {
ASSERT(bp->blk_fill == 0);
+ DB_DNODE_EXIT(db);
return;
}
@@ -2309,7 +2449,6 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
#ifdef ZFS_DEBUG
if (db->db_blkid == DMU_SPILL_BLKID) {
- dnode_t *dn = db->db_dnode;
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
db->db_blkptr == &dn->dn_phys->dn_spill);
@@ -2342,6 +2481,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
fill += ibp->blk_fill;
}
}
+ DB_DNODE_EXIT(db);
bp->blk_fill = fill;
@@ -2355,8 +2495,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
dmu_buf_impl_t *db = vdb;
blkptr_t *bp = zio->io_bp;
blkptr_t *bp_orig = &zio->io_bp_orig;
- dnode_t *dn = db->db_dnode;
- objset_t *os = dn->dn_objset;
uint64_t txg = zio->io_txg;
dbuf_dirty_record_t **drp, *dr;
@@ -2366,8 +2504,13 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
ASSERT(BP_EQUAL(bp, bp_orig));
} else {
- dsl_dataset_t *ds = os->os_dsl_dataset;
- dmu_tx_t *tx = os->os_synctx;
+ objset_t *os;
+ dsl_dataset_t *ds;
+ dmu_tx_t *tx;
+
+ DB_GET_OBJSET(&os, db);
+ ds = os->os_dsl_dataset;
+ tx = os->os_synctx;
(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
dsl_dataset_block_born(ds, bp, tx);
@@ -2388,10 +2531,14 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
#ifdef ZFS_DEBUG
if (db->db_blkid == DMU_SPILL_BLKID) {
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
db->db_blkptr == &dn->dn_phys->dn_spill);
+ DB_DNODE_EXIT(db);
}
#endif
@@ -2406,6 +2553,10 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
arc_set_callback(db->db_buf, dbuf_do_evict, db);
}
} else {
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
if (!BP_IS_HOLE(db->db_blkptr)) {
@@ -2417,6 +2568,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
>> (db->db_level * epbs), >=, db->db_blkid);
arc_set_callback(db->db_buf, dbuf_do_evict, db);
}
+ DB_DNODE_EXIT(db);
mutex_destroy(&dr->dt.di.dr_mtx);
list_destroy(&dr->dt.di.dr_children);
}
@@ -2472,8 +2624,8 @@ static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn = db->db_dnode;
- objset_t *os = dn->dn_objset;
+ dnode_t *dn;
+ objset_t *os;
dmu_buf_impl_t *parent = db->db_parent;
uint64_t txg = tx->tx_txg;
zbookmark_t zb;
@@ -2481,6 +2633,10 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
zio_t *zio;
int wp_flag = 0;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ os = dn->dn_objset;
+
if (db->db_state != DB_NOFILL) {
if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
/*
@@ -2525,6 +2681,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
+ DB_DNODE_EXIT(db);
if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
ASSERT(db->db_state != DB_NOFILL);
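
The bulk of the dbuf.c churn is one mechanical transformation: every read of db->db_dnode becomes a DB_DNODE_ENTER()/DB_DNODE()/DB_DNODE_EXIT() triple, or a DB_GET_SPA()/DB_GET_OBJSET() convenience wrapper when only the spa or objset is needed. The macro bodies are in the sys/dbuf.h diff, not shown in this listing, but the pattern they enforce is presumably along these lines (expansion assumed from the zrlock.c addition in this change):

        DB_DNODE_ENTER(db);     /* zrl_add(&db->db_dnode_handle->dnh_zrlock) */
        dn = DB_DNODE(db);      /* db->db_dnode_handle->dnh_dnode */
        /*
         * While the zrlock is held, dnode_move() cannot relocate dn, so
         * dereferences of dn are safe. Drop it as soon as the last use is
         * done; several hunks above hoist a value (spa, objset) out of the
         * critical section precisely so the lock can be released before
         * blocking calls such as zio_wait().
         */
        DB_DNODE_EXIT(db);      /* zrl_remove(&db->db_dnode_handle->dnh_zrlock) */
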
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 354ce3dc91..39234eba53 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -133,7 +133,7 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
}
dnode_rele(dn, FTAG);
- *dbp = &db->db;
+ *dbp = &db->db; /* NULL db plus first field offset is NULL */
return (err);
}
@@ -144,31 +144,64 @@ dmu_bonus_max(void)
}
int
-dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx)
+dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
{
- dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ int error;
- if (dn->dn_bonus != (dmu_buf_impl_t *)db)
- return (EINVAL);
- if (newsize < 0 || newsize > db->db_size)
- return (EINVAL);
- dnode_setbonuslen(dn, newsize, tx);
- return (0);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (dn->dn_bonus != db) {
+ error = EINVAL;
+ } else if (newsize < 0 || newsize > db_fake->db_size) {
+ error = EINVAL;
+ } else {
+ dnode_setbonuslen(dn, newsize, tx);
+ error = 0;
+ }
+
+ DB_DNODE_EXIT(db);
+ return (error);
}
int
-dmu_set_bonustype(dmu_buf_t *db, dmu_object_type_t type, dmu_tx_t *tx)
+dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
{
- dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ int error;
- if (type > DMU_OT_NUMTYPES)
- return (EINVAL);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (type > DMU_OT_NUMTYPES) {
+ error = EINVAL;
+ } else if (dn->dn_bonus != db) {
+ error = EINVAL;
+ } else {
+ dnode_setbonus_type(dn, type, tx);
+ error = 0;
+ }
- if (dn->dn_bonus != (dmu_buf_impl_t *)db)
- return (EINVAL);
+ DB_DNODE_EXIT(db);
+ return (error);
+}
- dnode_setbonus_type(dn, type, tx);
- return (0);
+dmu_object_type_t
+dmu_get_bonustype(dmu_buf_t *db_fake)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ dmu_object_type_t type;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ type = dn->dn_bonustype;
+ DB_DNODE_EXIT(db);
+
+ return (type);
}
int
@@ -208,11 +241,19 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
dbuf_create_bonus(dn);
}
db = dn->dn_bonus;
- rw_exit(&dn->dn_struct_rwlock);
/* as long as the bonus buf is held, the dnode will be held */
- if (refcount_add(&db->db_holds, tag) == 1)
+ if (refcount_add(&db->db_holds, tag) == 1) {
VERIFY(dnode_add_ref(dn, db));
+ (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
+ }
+
+ /*
+ * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
+ * hold and incrementing the dbuf count to ensure that dnode_move() sees
+ * a dnode hold for every dbuf.
+ */
+ rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
@@ -257,28 +298,45 @@ dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
int
dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
{
- dnode_t *dn = ((dmu_buf_impl_t *)bonus)->db_dnode;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+ dnode_t *dn;
int err;
- if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA)
- return (EINVAL);
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
+ err = EINVAL;
+ } else {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+ if (!dn->dn_have_spill) {
+ err = ENOENT;
+ } else {
+ err = dmu_spill_hold_by_dnode(dn,
+ DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
+ }
- if (!dn->dn_have_spill) {
rw_exit(&dn->dn_struct_rwlock);
- return (ENOENT);
}
- err = dmu_spill_hold_by_dnode(dn,
- DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
- rw_exit(&dn->dn_struct_rwlock);
+
+ DB_DNODE_EXIT(db);
return (err);
}
int
dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
{
- return (dmu_spill_hold_by_dnode(((dmu_buf_impl_t *)bonus)->db_dnode,
- DB_RF_CANFAIL, tag, dbp));
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+ dnode_t *dn;
+ int err;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
+ DB_DNODE_EXIT(db);
+
+ return (err);
}
/*
@@ -400,14 +458,18 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
}
int
-dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
+dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
- dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
int err;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
numbufsp, dbpp, DMU_READ_PREFETCH);
+ DB_DNODE_EXIT(db);
return (err);
}
@@ -440,7 +502,7 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
return;
if (len == 0) { /* they're interested in the bonus buffer */
- dn = os->os_meta_dnode;
+ dn = DMU_META_DNODE(os);
if (object == 0 || object >= DN_MAX_OBJECT)
return;
@@ -1001,11 +1063,19 @@ int
dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
dmu_tx_t *tx)
{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
+ dnode_t *dn;
+ int err;
+
if (size == 0)
return (0);
- return (dmu_write_uio_dnode(((dmu_buf_impl_t *)zdb)->db_dnode,
- uio, size, tx));
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ err = dmu_write_uio_dnode(dn, uio, size, tx);
+ DB_DNODE_EXIT(db);
+
+ return (err);
}
int
@@ -1091,9 +1161,11 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
arc_buf_t *
dmu_request_arcbuf(dmu_buf_t *handle, int size)
{
- dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
+ spa_t *spa;
- return (arc_loan_buf(dn->dn_objset->os_spa, size));
+ DB_GET_SPA(&spa, db);
+ return (arc_loan_buf(spa, size));
}
/*
@@ -1115,23 +1187,35 @@ void
dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
dmu_tx_t *tx)
{
- dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode;
+ dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
+ dnode_t *dn;
dmu_buf_impl_t *db;
uint32_t blksz = (uint32_t)arc_buf_size(buf);
uint64_t blkid;
+ DB_DNODE_ENTER(dbuf);
+ dn = DB_DNODE(dbuf);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
blkid = dbuf_whichblock(dn, offset);
VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
rw_exit(&dn->dn_struct_rwlock);
+ DB_DNODE_EXIT(dbuf);
if (offset == db->db.db_offset && blksz == db->db.db_size) {
dbuf_assign_arcbuf(db, buf, tx);
dbuf_rele(db, FTAG);
} else {
+ objset_t *os;
+ uint64_t object;
+
+ DB_DNODE_ENTER(dbuf);
+ dn = DB_DNODE(dbuf);
+ os = dn->dn_objset;
+ object = dn->dn_object;
+ DB_DNODE_EXIT(dbuf);
+
dbuf_rele(db, FTAG);
- dmu_write(dn->dn_objset, dn->dn_object, offset, blksz,
- buf->b_data, tx);
+ dmu_write(os, object, offset, blksz, buf->b_data, tx);
dmu_return_arcbuf(buf);
XUIOSTAT_BUMP(xuiostat_wbuf_copied);
}
@@ -1150,7 +1234,6 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
{
dmu_sync_arg_t *dsa = varg;
dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
- dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
blkptr_t *bp = zio->io_bp;
if (zio->io_error == 0) {
@@ -1161,7 +1244,6 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
*/
BP_SET_LSIZE(bp, db->db_size);
} else {
- ASSERT(BP_GET_TYPE(bp) == dn->dn_type);
ASSERT(BP_GET_LEVEL(bp) == 0);
bp->blk_fill = 1;
}
@@ -1284,6 +1366,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
dmu_sync_arg_t *dsa;
zbookmark_t zb;
zio_prop_t zp;
+ dnode_t *dn;
ASSERT(pio != NULL);
ASSERT(BP_IS_HOLE(bp));
@@ -1292,7 +1375,10 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
SET_BOOKMARK(&zb, ds->ds_object,
db->db.db_object, db->db_level, db->db_blkid);
- dmu_write_policy(os, db->db_dnode, db->db_level, WP_DMU_SYNC, &zp);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
+ DB_DNODE_EXIT(db);
/*
* If we're frozen (running ziltest), we always need to generate a bp.
@@ -1574,9 +1660,13 @@ dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
* As above, but faster; can be used when you have a held dbuf in hand.
*/
void
-dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi)
+dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
{
- dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi);
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ DB_DNODE_ENTER(db);
+ dmu_object_info_from_dnode(DB_DNODE(db), doi);
+ DB_DNODE_EXIT(db);
}
/*
@@ -1584,14 +1674,20 @@ dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi)
* This is specifically optimized for zfs_getattr().
*/
void
-dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512)
+dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
+ u_longlong_t *nblk512)
{
- dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
*blksize = dn->dn_datablksz;
/* add 1 for dnode space */
*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
SPA_MINBLOCKSHIFT) + 1;
+ DB_DNODE_EXIT(db);
}
void
@@ -1643,23 +1739,25 @@ void
dmu_init(void)
{
zfs_dbgmsg_init();
- dbuf_init();
+ sa_cache_init();
+ xuio_stat_init();
+ dmu_objset_init();
dnode_init();
+ dbuf_init();
zfetch_init();
arc_init();
l2arc_init();
- xuio_stat_init();
- sa_cache_init();
}
void
dmu_fini(void)
{
+ l2arc_fini();
arc_fini();
zfetch_fini();
- dnode_fini();
dbuf_fini();
- l2arc_fini();
+ dnode_fini();
+ dmu_objset_fini();
xuio_stat_fini();
sa_cache_fini();
zfs_dbgmsg_fini();
diff --git a/usr/src/uts/common/fs/zfs/dmu_object.c b/usr/src/uts/common/fs/zfs/dmu_object.c
index 98228d4035..8dff460489 100644
--- a/usr/src/uts/common/fs/zfs/dmu_object.c
+++ b/usr/src/uts/common/fs/zfs/dmu_object.c
@@ -33,7 +33,7 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
{
uint64_t object;
uint64_t L2_dnode_count = DNODES_PER_BLOCK <<
- (os->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT);
+ (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
dnode_t *dn = NULL;
int restarted = B_FALSE;
@@ -49,7 +49,7 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
*/
if (P2PHASE(object, L2_dnode_count) == 0) {
uint64_t offset = restarted ? object << DNODE_SHIFT : 0;
- int error = dnode_next_offset(os->os_meta_dnode,
+ int error = dnode_next_offset(DMU_META_DNODE(os),
DNODE_FIND_HOLE,
&offset, 2, DNODES_PER_BLOCK >> 2, 0);
restarted = B_TRUE;
@@ -187,7 +187,7 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
uint64_t offset = (*objectp + 1) << DNODE_SHIFT;
int error;
- error = dnode_next_offset(os->os_meta_dnode,
+ error = dnode_next_offset(DMU_META_DNODE(os),
(hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);
*objectp = offset >> DNODE_SHIFT;
diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c
index a6d9b7a54a..5554bda8f9 100644
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c
@@ -41,9 +41,26 @@
#include <sys/zil.h>
#include <sys/dmu_impl.h>
#include <sys/zfs_ioctl.h>
-#include <sys/sunddi.h>
#include <sys/sa.h>
+/*
+ * Needed to close a window in dnode_move() that allows the objset to be freed
+ * before it can be safely accessed.
+ */
+krwlock_t os_lock;
+
+void
+dmu_objset_init(void)
+{
+ rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
+}
+
+void
+dmu_objset_fini(void)
+{
+ rw_destroy(&os_lock);
+}
+
spa_t *
dmu_objset_spa(objset_t *os)
{
@@ -368,13 +385,16 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
- os->os_meta_dnode = dnode_special_open(os,
- &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
+ DMU_META_DNODE(os) = dnode_special_open(os,
+ &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT,
+ &os->os_meta_dnode);
if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
- os->os_userused_dnode = dnode_special_open(os,
- &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT);
- os->os_groupused_dnode = dnode_special_open(os,
- &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT);
+ DMU_USERUSED_DNODE(os) = dnode_special_open(os,
+ &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT,
+ &os->os_userused_dnode);
+ DMU_GROUPUSED_DNODE(os) = dnode_special_open(os,
+ &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT,
+ &os->os_groupused_dnode);
}
/*
@@ -470,8 +490,8 @@ dmu_objset_evict_dbufs(objset_t *os)
mutex_enter(&os->os_lock);
/* process the mdn last, since the other dnodes have holds on it */
- list_remove(&os->os_dnodes, os->os_meta_dnode);
- list_insert_tail(&os->os_dnodes, os->os_meta_dnode);
+ list_remove(&os->os_dnodes, DMU_META_DNODE(os));
+ list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os));
/*
* Find the first dnode with holds. We have to do this dance
@@ -497,8 +517,9 @@ dmu_objset_evict_dbufs(objset_t *os)
mutex_enter(&os->os_lock);
dn = next_dn;
}
+ dn = list_head(&os->os_dnodes);
mutex_exit(&os->os_lock);
- return (list_head(&os->os_dnodes) != os->os_meta_dnode);
+ return (dn != DMU_META_DNODE(os));
}
void
@@ -539,16 +560,26 @@ dmu_objset_evict(objset_t *os)
*/
(void) dmu_objset_evict_dbufs(os);
- dnode_special_close(os->os_meta_dnode);
- if (os->os_userused_dnode) {
- dnode_special_close(os->os_userused_dnode);
- dnode_special_close(os->os_groupused_dnode);
+ dnode_special_close(&os->os_meta_dnode);
+ if (DMU_USERUSED_DNODE(os)) {
+ dnode_special_close(&os->os_userused_dnode);
+ dnode_special_close(&os->os_groupused_dnode);
}
zil_free(os->os_zil);
ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1);
+
+ /*
+ * This is a barrier to prevent the objset from going away in
+ * dnode_move() until we can safely ensure that the objset is still in
+ * use. We consider the objset valid before the barrier and invalid
+ * after the barrier.
+ */
+ rw_enter(&os_lock, RW_READER);
+ rw_exit(&os_lock);
+
mutex_destroy(&os->os_lock);
mutex_destroy(&os->os_obj_lock);
mutex_destroy(&os->os_user_ptr_lock);
@@ -575,7 +606,7 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &os));
if (ds)
mutex_exit(&ds->ds_opening_lock);
- mdn = os->os_meta_dnode;
+ mdn = DMU_META_DNODE(os);
dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
@@ -1035,17 +1066,17 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
/*
* Sync special dnodes - the parent IO for the sync is the root block
*/
- os->os_meta_dnode->dn_zio = zio;
- dnode_sync(os->os_meta_dnode, tx);
+ DMU_META_DNODE(os)->dn_zio = zio;
+ dnode_sync(DMU_META_DNODE(os), tx);
os->os_phys->os_flags = os->os_flags;
- if (os->os_userused_dnode &&
- os->os_userused_dnode->dn_type != DMU_OT_NONE) {
- os->os_userused_dnode->dn_zio = zio;
- dnode_sync(os->os_userused_dnode, tx);
- os->os_groupused_dnode->dn_zio = zio;
- dnode_sync(os->os_groupused_dnode, tx);
+ if (DMU_USERUSED_DNODE(os) &&
+ DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
+ DMU_USERUSED_DNODE(os)->dn_zio = zio;
+ dnode_sync(DMU_USERUSED_DNODE(os), tx);
+ DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
+ dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
}
txgoff = tx->tx_txg & TXG_MASK;
@@ -1063,7 +1094,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx);
dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx);
- list = &os->os_meta_dnode->dn_dirty_records[txgoff];
+ list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
while (dr = list_head(list)) {
ASSERT(dr->dr_dbuf->db_level == 0);
list_remove(list, dr);
@@ -1085,7 +1116,7 @@ dmu_objset_is_dirty(objset_t *os, uint64_t txg)
!list_is_empty(&os->os_free_dnodes[txg & TXG_MASK]));
}
-objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
+static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
void
dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
@@ -1097,8 +1128,8 @@ boolean_t
dmu_objset_userused_enabled(objset_t *os)
{
return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
- used_cbs[os->os_phys->os_type] &&
- os->os_userused_dnode);
+ used_cbs[os->os_phys->os_type] != NULL &&
+ DMU_USERUSED_DNODE(os) != NULL);
}
static void
@@ -1132,7 +1163,7 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
DNODE_FLAG_USERUSED_ACCOUNTED);
/* Allocate the user/groupused objects if necessary. */
- if (os->os_userused_dnode->dn_type == DMU_OT_NONE) {
+ if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
VERIFY(0 == zap_create_claim(os,
DMU_USERUSED_OBJECT,
DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
@@ -1201,13 +1232,23 @@ dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
if (dr->dr_txg == tx->tx_txg)
break;
- if (dr == NULL)
+ if (dr == NULL) {
data = NULL;
- else if (dr->dr_dbuf->db_dnode->dn_bonuslen == 0 &&
- dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
- data = dr->dt.dl.dr_data->b_data;
- else
- data = dr->dt.dl.dr_data;
+ } else {
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(dr->dr_dbuf);
+ dn = DB_DNODE(dr->dr_dbuf);
+
+ if (dn->dn_bonuslen == 0 &&
+ dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
+ data = dr->dt.dl.dr_data->b_data;
+ else
+ data = dr->dt.dl.dr_data;
+
+ DB_DNODE_EXIT(dr->dr_dbuf);
+ }
+
return (data);
}
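
dmu_objset_evict() now executes an empty os_lock reader critical section before tearing down the objset. That only makes sense together with the dnode_move() side in dnode.c, beyond what this listing shows: the mover holds os_lock while it checks whether an old dnode's dn_objset still points at a live objset, so acquiring and dropping the lock in evict waits out any in-flight mover, after which the objset may be treated as invalid. A simplified sketch of the two sides (the dnode_move() half is assumed from the design described in the comments):

        /* dnode_move() side, simplified: validate the objset under os_lock */
        rw_enter(&os_lock, RW_WRITER);
        os = odn->dn_objset;
        if (!POINTER_IS_VALID(os)) {            /* objset already evicted */
                rw_exit(&os_lock);
                return (KMEM_CBRC_DONT_KNOW);   /* decline the move */
        }
        /* ... os is safe to inspect here ... */
        rw_exit(&os_lock);

        /* dmu_objset_evict() side: the barrier shown in the hunk above */
        rw_enter(&os_lock, RW_READER);
        rw_exit(&os_lock);
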
diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c
index 5fc062c16b..bd5c71a226 100644
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c
@@ -186,7 +186,7 @@ dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
ASSERT(level != 0);
db = NULL;
} else {
- ASSERT(db->db_dnode == dn);
+ ASSERT(DB_DNODE(db) == dn);
ASSERT(db->db_level == level);
ASSERT(db->db.db_size == space);
ASSERT(db->db_blkid == blkid);
@@ -384,7 +384,7 @@ static void
dmu_tx_count_dnode(dmu_tx_hold_t *txh)
{
dnode_t *dn = txh->txh_dnode;
- dnode_t *mdn = txh->txh_tx->tx_objset->os_meta_dnode;
+ dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
uint64_t space = mdn->dn_datablksz +
((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
@@ -787,18 +787,24 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
{
dmu_tx_hold_t *txh;
int match_object = FALSE, match_offset = FALSE;
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
ASSERT(tx->tx_txg != 0);
ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
ASSERT3U(dn->dn_object, ==, db->db.db_object);
- if (tx->tx_anyobj)
+ if (tx->tx_anyobj) {
+ DB_DNODE_EXIT(db);
return;
+ }
/* XXX No checking on the meta dnode for now */
- if (db->db.db_object == DMU_META_DNODE_OBJECT)
+ if (db->db.db_object == DMU_META_DNODE_OBJECT) {
+ DB_DNODE_EXIT(db);
return;
+ }
for (txh = list_head(&tx->tx_holds); txh;
txh = list_next(&tx->tx_holds, txh)) {
@@ -870,9 +876,12 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
ASSERT(!"bad txh_type");
}
}
- if (match_object && match_offset)
+ if (match_object && match_offset) {
+ DB_DNODE_EXIT(db);
return;
+ }
}
+ DB_DNODE_EXIT(db);
panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
(u_longlong_t)db->db.db_object, db->db_level,
(u_longlong_t)db->db_blkid);
@@ -1355,9 +1364,19 @@ dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
- if (sa->sa_force_spill || may_grow || hdl->sa_spill ||
- ((dmu_buf_impl_t *)hdl->sa_bonus)->db_dnode->dn_have_spill) {
+ if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
ASSERT(tx->tx_txg == 0);
dmu_tx_hold_spill(tx, object);
+ } else {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ if (dn->dn_have_spill) {
+ ASSERT(tx->tx_txg == 0);
+ dmu_tx_hold_spill(tx, object);
+ }
+ DB_DNODE_EXIT(db);
}
}
diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c
index 2b44cd2c96..850dd5816b 100644
--- a/usr/src/uts/common/fs/zfs/dnode.c
+++ b/usr/src/uts/common/fs/zfs/dnode.c
@@ -38,19 +38,33 @@
static int free_range_compar(const void *node1, const void *node2);
static kmem_cache_t *dnode_cache;
+/*
+ * Define DNODE_STATS to turn on statistic gathering. By default, it is only
+ * turned on when DEBUG is also defined.
+ */
+#ifdef DEBUG
+#define DNODE_STATS
+#endif /* DEBUG */
+
+#ifdef DNODE_STATS
+#define DNODE_STAT_ADD(stat) ((stat)++)
+#else
+#define DNODE_STAT_ADD(stat) /* nothing */
+#endif /* DNODE_STATS */
static dnode_phys_t dnode_phys_zero;
int zfs_default_bs = SPA_MINBLOCKSHIFT;
int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
+static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
+
/* ARGSUSED */
static int
dnode_cons(void *arg, void *unused, int kmflag)
{
- int i;
dnode_t *dn = arg;
- bzero(dn, sizeof (dnode_t));
+ int i;
rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
@@ -59,8 +73,18 @@ dnode_cons(void *arg, void *unused, int kmflag)
refcount_create(&dn->dn_holds);
refcount_create(&dn->dn_tx_holds);
+ list_link_init(&dn->dn_link);
+
+ bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
+ bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
+ bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
+ bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
+ bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
+ bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
+ bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
for (i = 0; i < TXG_SIZE; i++) {
+ list_link_init(&dn->dn_dirty_link[i]);
avl_create(&dn->dn_ranges[i], free_range_compar,
sizeof (free_range_t),
offsetof(struct free_range, fr_node));
@@ -69,9 +93,27 @@ dnode_cons(void *arg, void *unused, int kmflag)
offsetof(dbuf_dirty_record_t, dr_dirty_node));
}
+ dn->dn_allocated_txg = 0;
+ dn->dn_free_txg = 0;
+ dn->dn_assigned_txg = 0;
+ dn->dn_dirtyctx = 0;
+ dn->dn_dirtyctx_firstset = NULL;
+ dn->dn_bonus = NULL;
+ dn->dn_have_spill = B_FALSE;
+ dn->dn_zio = NULL;
+ dn->dn_oldused = 0;
+ dn->dn_oldflags = 0;
+ dn->dn_olduid = 0;
+ dn->dn_oldgid = 0;
+ dn->dn_newuid = 0;
+ dn->dn_newgid = 0;
+ dn->dn_id_flags = 0;
+
+ dn->dn_dbufs_count = 0;
list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_link));
+ dn->dn_moved = 0;
return (0);
}
@@ -88,27 +130,56 @@ dnode_dest(void *arg, void *unused)
cv_destroy(&dn->dn_notxholds);
refcount_destroy(&dn->dn_holds);
refcount_destroy(&dn->dn_tx_holds);
+ ASSERT(!list_link_active(&dn->dn_link));
for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
avl_destroy(&dn->dn_ranges[i]);
list_destroy(&dn->dn_dirty_records[i]);
+ ASSERT3U(dn->dn_next_nblkptr[i], ==, 0);
+ ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
+ ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
+ ASSERT3U(dn->dn_next_bonustype[i], ==, 0);
+ ASSERT3U(dn->dn_rm_spillblk[i], ==, 0);
+ ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
+ ASSERT3U(dn->dn_next_blksz[i], ==, 0);
}
+ ASSERT3U(dn->dn_allocated_txg, ==, 0);
+ ASSERT3U(dn->dn_free_txg, ==, 0);
+ ASSERT3U(dn->dn_assigned_txg, ==, 0);
+ ASSERT3U(dn->dn_dirtyctx, ==, 0);
+ ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
+ ASSERT3P(dn->dn_bonus, ==, NULL);
+ ASSERT(!dn->dn_have_spill);
+ ASSERT3P(dn->dn_zio, ==, NULL);
+ ASSERT3U(dn->dn_oldused, ==, 0);
+ ASSERT3U(dn->dn_oldflags, ==, 0);
+ ASSERT3U(dn->dn_olduid, ==, 0);
+ ASSERT3U(dn->dn_oldgid, ==, 0);
+ ASSERT3U(dn->dn_newuid, ==, 0);
+ ASSERT3U(dn->dn_newgid, ==, 0);
+ ASSERT3U(dn->dn_id_flags, ==, 0);
+
+ ASSERT3U(dn->dn_dbufs_count, ==, 0);
list_destroy(&dn->dn_dbufs);
}
void
dnode_init(void)
{
+ ASSERT(dnode_cache == NULL);
dnode_cache = kmem_cache_create("dnode_t",
sizeof (dnode_t),
0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
+ kmem_cache_set_move(dnode_cache, dnode_move);
}
void
dnode_fini(void)
{
kmem_cache_destroy(dnode_cache);
+ dnode_cache = NULL;
}
@@ -120,6 +191,7 @@ dnode_verify(dnode_t *dn)
ASSERT(dn->dn_phys);
ASSERT(dn->dn_objset);
+ ASSERT(dn->dn_handle->dnh_dnode == dn);
ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
@@ -298,18 +370,29 @@ dnode_setdblksz(dnode_t *dn, int size)
static dnode_t *
dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
- uint64_t object)
+ uint64_t object, dnode_handle_t *dnh)
{
dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
- (void) dnode_cons(dn, NULL, 0); /* XXX */
- dn->dn_objset = os;
+ ASSERT(!POINTER_IS_VALID(dn->dn_objset));
+ dn->dn_moved = 0;
+
+ /*
+ * Defer setting dn_objset until the dnode is ready to be a candidate
+ * for the dnode_move() callback.
+ */
dn->dn_object = object;
dn->dn_dbuf = db;
+ dn->dn_handle = dnh;
dn->dn_phys = dnp;
- if (dnp->dn_datablkszsec)
+ if (dnp->dn_datablkszsec) {
dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ } else {
+ dn->dn_datablksz = 0;
+ dn->dn_datablkszsec = 0;
+ dn->dn_datablkshift = 0;
+ }
dn->dn_indblkshift = dnp->dn_indblkshift;
dn->dn_nlevels = dnp->dn_nlevels;
dn->dn_type = dnp->dn_type;
@@ -325,45 +408,65 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
dmu_zfetch_init(&dn->dn_zfetch, dn);
ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
+
mutex_enter(&os->os_lock);
list_insert_head(&os->os_dnodes, dn);
+ membar_producer();
+ /*
+ * Everything else must be valid before assigning dn_objset makes the
+ * dnode eligible for dnode_move().
+ */
+ dn->dn_objset = os;
mutex_exit(&os->os_lock);
arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
return (dn);
}
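
/*
 * [Editorial sketch -- illustration only, not part of this patch.]
 * dnode_create() above publishes the dnode with a store barrier so that
 * dnode_move() either sees a fully constructed dnode or an invalid
 * pointer. A minimal form of the same pattern; obj_t, obj_publish() and
 * obj_retire() are invented names.
 */
typedef struct obj {
	void	*o_owner;		/* designated pointer member */
	int	o_state;
} obj_t;

static void
obj_publish(obj_t *op, void *owner)
{
	op->o_state = 42;		/* initialize everything first... */
	membar_producer();		/* ...order those stores... */
	op->o_owner = owner;		/* ...then publish the object */
}

static void
obj_retire(obj_t *op)
{
	POINTER_INVALIDATE(&op->o_owner);	/* sets a low tag bit */
	ASSERT(!POINTER_IS_VALID(op->o_owner));
}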
+/*
+ * Caller must be holding the dnode handle, which is released upon return.
+ */
static void
dnode_destroy(dnode_t *dn)
{
objset_t *os = dn->dn_objset;
-#ifdef ZFS_DEBUG
- int i;
-
- for (i = 0; i < TXG_SIZE; i++) {
- ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
- ASSERT(NULL == list_head(&dn->dn_dirty_records[i]));
- ASSERT(0 == avl_numnodes(&dn->dn_ranges[i]));
- }
- ASSERT(NULL == list_head(&dn->dn_dbufs));
-#endif
ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
mutex_enter(&os->os_lock);
+ POINTER_INVALIDATE(&dn->dn_objset);
list_remove(&os->os_dnodes, dn);
mutex_exit(&os->os_lock);
- if (dn->dn_dirtyctx_firstset) {
+ /* the dnode can no longer move, so we can release the handle */
+ zrl_remove(&dn->dn_handle->dnh_zrlock);
+
+ dn->dn_allocated_txg = 0;
+ dn->dn_free_txg = 0;
+ dn->dn_assigned_txg = 0;
+
+ dn->dn_dirtyctx = 0;
+ if (dn->dn_dirtyctx_firstset != NULL) {
kmem_free(dn->dn_dirtyctx_firstset, 1);
dn->dn_dirtyctx_firstset = NULL;
}
- dmu_zfetch_rele(&dn->dn_zfetch);
- if (dn->dn_bonus) {
+ if (dn->dn_bonus != NULL) {
mutex_enter(&dn->dn_bonus->db_mtx);
dbuf_evict(dn->dn_bonus);
dn->dn_bonus = NULL;
}
+ dn->dn_zio = NULL;
+
+ dn->dn_have_spill = B_FALSE;
+ dn->dn_oldused = 0;
+ dn->dn_oldflags = 0;
+ dn->dn_olduid = 0;
+ dn->dn_oldgid = 0;
+ dn->dn_newuid = 0;
+ dn->dn_newgid = 0;
+ dn->dn_id_flags = 0;
+
+ dmu_zfetch_rele(&dn->dn_zfetch);
kmem_cache_free(dnode_cache, dn);
arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
}
@@ -408,6 +511,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT3U(dn->dn_next_nblkptr[i], ==, 0);
ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
@@ -522,9 +626,304 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
mutex_exit(&dn->dn_mtx);
}
+#ifdef DNODE_STATS
+static struct {
+ uint64_t dms_dnode_invalid;
+ uint64_t dms_dnode_recheck1;
+ uint64_t dms_dnode_recheck2;
+ uint64_t dms_dnode_special;
+ uint64_t dms_dnode_handle;
+ uint64_t dms_dnode_rwlock;
+ uint64_t dms_dnode_active;
+} dnode_move_stats;
+#endif /* DNODE_STATS */
+
+static void
+dnode_move_impl(dnode_t *odn, dnode_t *ndn)
+{
+ int i;
+
+ ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
+ ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
+ ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
+ ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock));
+
+ /* Copy fields. */
+ ndn->dn_objset = odn->dn_objset;
+ ndn->dn_object = odn->dn_object;
+ ndn->dn_dbuf = odn->dn_dbuf;
+ ndn->dn_handle = odn->dn_handle;
+ ndn->dn_phys = odn->dn_phys;
+ ndn->dn_type = odn->dn_type;
+ ndn->dn_bonuslen = odn->dn_bonuslen;
+ ndn->dn_bonustype = odn->dn_bonustype;
+ ndn->dn_nblkptr = odn->dn_nblkptr;
+ ndn->dn_checksum = odn->dn_checksum;
+ ndn->dn_compress = odn->dn_compress;
+ ndn->dn_nlevels = odn->dn_nlevels;
+ ndn->dn_indblkshift = odn->dn_indblkshift;
+ ndn->dn_datablkshift = odn->dn_datablkshift;
+ ndn->dn_datablkszsec = odn->dn_datablkszsec;
+ ndn->dn_datablksz = odn->dn_datablksz;
+ ndn->dn_maxblkid = odn->dn_maxblkid;
+ bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
+ sizeof (odn->dn_next_nblkptr));
+ bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
+ sizeof (odn->dn_next_nlevels));
+ bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
+ sizeof (odn->dn_next_indblkshift));
+ bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
+ sizeof (odn->dn_next_bonustype));
+ bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
+ sizeof (odn->dn_rm_spillblk));
+ bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
+ sizeof (odn->dn_next_bonuslen));
+ bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
+ sizeof (odn->dn_next_blksz));
+ for (i = 0; i < TXG_SIZE; i++) {
+ list_move_tail(&ndn->dn_dirty_records[i],
+ &odn->dn_dirty_records[i]);
+ }
+ bcopy(&odn->dn_ranges[0], &ndn->dn_ranges[0], sizeof (odn->dn_ranges));
+ ndn->dn_allocated_txg = odn->dn_allocated_txg;
+ ndn->dn_free_txg = odn->dn_free_txg;
+ ndn->dn_assigned_txg = odn->dn_assigned_txg;
+ ndn->dn_dirtyctx = odn->dn_dirtyctx;
+ ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
+ ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
+ refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
+ ASSERT(list_is_empty(&ndn->dn_dbufs));
+ list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
+ ndn->dn_dbufs_count = odn->dn_dbufs_count;
+ ndn->dn_bonus = odn->dn_bonus;
+ ndn->dn_have_spill = odn->dn_have_spill;
+ ndn->dn_zio = odn->dn_zio;
+ ndn->dn_oldused = odn->dn_oldused;
+ ndn->dn_oldflags = odn->dn_oldflags;
+ ndn->dn_olduid = odn->dn_olduid;
+ ndn->dn_oldgid = odn->dn_oldgid;
+ ndn->dn_newuid = odn->dn_newuid;
+ ndn->dn_newgid = odn->dn_newgid;
+ ndn->dn_id_flags = odn->dn_id_flags;
+ dmu_zfetch_init(&ndn->dn_zfetch, NULL);
+ list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
+ ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
+ ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt;
+ ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail;
+
+ /*
+ * Update back pointers. Updating the handle fixes the back pointer of
+ * every descendant dbuf as well as the bonus dbuf.
+ */
+ ASSERT(ndn->dn_handle->dnh_dnode == odn);
+ ndn->dn_handle->dnh_dnode = ndn;
+ if (ndn->dn_zfetch.zf_dnode == odn) {
+ ndn->dn_zfetch.zf_dnode = ndn;
+ }
+
+ /*
+ * Invalidate the original dnode by clearing all of its back pointers.
+ */
+ odn->dn_dbuf = NULL;
+ odn->dn_handle = NULL;
+ list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_link));
+ odn->dn_dbufs_count = 0;
+ odn->dn_bonus = NULL;
+ odn->dn_zfetch.zf_dnode = NULL;
+
+ /*
+ * Set the low bit of the objset pointer to ensure that dnode_move()
+ * recognizes the dnode as invalid in any subsequent callback.
+ */
+ POINTER_INVALIDATE(&odn->dn_objset);
+
+ /*
+ * Satisfy the destructor.
+ */
+ for (i = 0; i < TXG_SIZE; i++) {
+ list_create(&odn->dn_dirty_records[i],
+ sizeof (dbuf_dirty_record_t),
+ offsetof(dbuf_dirty_record_t, dr_dirty_node));
+ odn->dn_ranges[i].avl_root = NULL;
+ odn->dn_ranges[i].avl_numnodes = 0;
+ odn->dn_next_nlevels[i] = 0;
+ odn->dn_next_indblkshift[i] = 0;
+ odn->dn_next_bonustype[i] = 0;
+ odn->dn_rm_spillblk[i] = 0;
+ odn->dn_next_bonuslen[i] = 0;
+ odn->dn_next_blksz[i] = 0;
+ }
+ odn->dn_allocated_txg = 0;
+ odn->dn_free_txg = 0;
+ odn->dn_assigned_txg = 0;
+ odn->dn_dirtyctx = 0;
+ odn->dn_dirtyctx_firstset = NULL;
+ odn->dn_have_spill = B_FALSE;
+ odn->dn_zio = NULL;
+ odn->dn_oldused = 0;
+ odn->dn_oldflags = 0;
+ odn->dn_olduid = 0;
+ odn->dn_oldgid = 0;
+ odn->dn_newuid = 0;
+ odn->dn_newgid = 0;
+ odn->dn_id_flags = 0;
+
+ /*
+ * Mark the dnode.
+ */
+ ndn->dn_moved = 1;
+ odn->dn_moved = (uint8_t)-1;
+}
+
+#ifdef _KERNEL
+/*ARGSUSED*/
+static kmem_cbrc_t
+dnode_move(void *buf, void *newbuf, size_t size, void *arg)
+{
+ dnode_t *odn = buf, *ndn = newbuf;
+ objset_t *os;
+ int64_t refcount;
+ uint32_t dbufs;
+
+ /*
+ * The dnode is on the objset's list of known dnodes if the objset
+ * pointer is valid. We set the low bit of the objset pointer when
+ * freeing the dnode to invalidate it, and the memory patterns written
+ * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
+ * A newly created dnode sets the objset pointer last of all to indicate
+ * that the dnode is known and in a valid state to be moved by this
+ * function.
+ */
+ os = odn->dn_objset;
+ if (!POINTER_IS_VALID(os)) {
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+
+ /*
+ * Ensure that the objset does not go away during the move.
+ */
+ rw_enter(&os_lock, RW_WRITER);
+ if (os != odn->dn_objset) {
+ rw_exit(&os_lock);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+
+ /*
+ * If the dnode is still valid, then so is the objset. We know that no
+ * valid objset can be freed while we hold os_lock, so we can safely
+ * ensure that the objset remains in use.
+ */
+ mutex_enter(&os->os_lock);
+
+ /*
+ * Recheck the objset pointer in case the dnode was removed just before
+ * acquiring the lock.
+ */
+ if (os != odn->dn_objset) {
+ mutex_exit(&os->os_lock);
+ rw_exit(&os_lock);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+
+ /*
+ * At this point we know that as long as we hold os->os_lock, the dnode
+ * cannot be freed and fields within the dnode can be safely accessed.
+ * The objset listing this dnode cannot go away as long as this dnode is
+ * on its list.
+ */
+ rw_exit(&os_lock);
+ if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special);
+ return (KMEM_CBRC_NO);
+ }
+ ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
+
+ /*
+ * Lock the dnode handle to prevent the dnode from obtaining any new
+ * holds. This also prevents the descendant dbufs and the bonus dbuf
+ * from accessing the dnode, so that we can discount their holds. The
+ * handle is safe to access because we know that while the dnode cannot
+ * go away, neither can its handle. Once we hold dnh_zrlock, we can
+ * safely move any dnode referenced only by dbufs.
+ */
+ if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle);
+ return (KMEM_CBRC_LATER);
+ }
+
+ /*
+ * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
+ * We need to guarantee that there is a hold for every dbuf in order to
+ * determine whether the dnode is actively referenced. Falsely matching
+ * a dbuf to an active hold would lead to an unsafe move. It's possible
+ * that a thread already having an active dnode hold is about to add a
+ * dbuf, and we can't compare hold and dbuf counts while the add is in
+ * progress.
+ */
+ if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
+ zrl_exit(&odn->dn_handle->dnh_zrlock);
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock);
+ return (KMEM_CBRC_LATER);
+ }
+
+ /*
+ * A dbuf may be removed (evicted) without an active dnode hold. In that
+ * case, the dbuf count is decremented under the handle lock before the
+ * dbuf's hold is released. This order ensures that if we count the hold
+ * after the dbuf is removed but before its hold is released, we will
+ * treat the unmatched hold as active and exit safely. If we count the
+ * hold before the dbuf is removed, the hold is discounted, and the
+ * removal is blocked until the move completes.
+ */
+ refcount = refcount_count(&odn->dn_holds);
+ ASSERT(refcount >= 0);
+ dbufs = odn->dn_dbufs_count;
+
+ /* We can't have more dbufs than dnode holds. */
+ ASSERT3U(dbufs, <=, refcount);
+ DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
+ uint32_t, dbufs);
+
+ if (refcount > dbufs) {
+ rw_exit(&odn->dn_struct_rwlock);
+ zrl_exit(&odn->dn_handle->dnh_zrlock);
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active);
+ return (KMEM_CBRC_LATER);
+ }
+
+ rw_exit(&odn->dn_struct_rwlock);
+
+ /*
+ * At this point we know that anyone with a hold on the dnode is not
+ * actively referencing it. The dnode is known and in a valid state to
+ * move. We're holding the locks needed to execute the critical section.
+ */
+ dnode_move_impl(odn, ndn);
+
+ list_link_replace(&odn->dn_link, &ndn->dn_link);
+ /* If the dnode was safe to move, the refcount cannot have changed. */
+ ASSERT(refcount == refcount_count(&ndn->dn_holds));
+ ASSERT(dbufs == ndn->dn_dbufs_count);
+ zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
+ mutex_exit(&os->os_lock);
+
+ return (KMEM_CBRC_YES);
+}
+#endif /* _KERNEL */
+
void
-dnode_special_close(dnode_t *dn)
+dnode_special_close(dnode_handle_t *dnh)
{
+ dnode_t *dn = dnh->dnh_dnode;
+
/*
* Wait for final references to the dnode to clear. This can
	 * only happen if the arc is asynchronously evicting state that
@@ -533,13 +932,19 @@ dnode_special_close(dnode_t *dn)
*/
while (refcount_count(&dn->dn_holds) > 0)
delay(1);
- dnode_destroy(dn);
+ zrl_add(&dnh->dnh_zrlock);
+ dnode_destroy(dn); /* implicit zrl_remove() */
+ zrl_destroy(&dnh->dnh_zrlock);
+ dnh->dnh_dnode = NULL;
}
dnode_t *
-dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object)
+dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
+ dnode_handle_t *dnh)
{
- dnode_t *dn = dnode_create(os, dnp, NULL, object);
+ dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh);
+ dnh->dnh_dnode = dn;
+ zrl_init(&dnh->dnh_zrlock);
DNODE_VERIFY(dn);
return (dn);
}
@@ -547,34 +952,43 @@ dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object)
static void
dnode_buf_pageout(dmu_buf_t *db, void *arg)
{
- dnode_t **children_dnodes = arg;
+ dnode_children_t *children_dnodes = arg;
int i;
int epb = db->db_size >> DNODE_SHIFT;
+ ASSERT(epb == children_dnodes->dnc_count);
+
for (i = 0; i < epb; i++) {
- dnode_t *dn = children_dnodes[i];
- int n;
+ dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
+ dnode_t *dn;
- if (dn == NULL)
+ /*
+ * The dnode handle lock guards against the dnode moving to
+ * another valid address, so there is no need here to guard
+ * against changes to or from NULL.
+ */
+ if (dnh->dnh_dnode == NULL) {
+ zrl_destroy(&dnh->dnh_zrlock);
continue;
-#ifdef ZFS_DEBUG
+ }
+
+ zrl_add(&dnh->dnh_zrlock);
+ dn = dnh->dnh_dnode;
/*
* If there are holds on this dnode, then there should
* be holds on the dnode's containing dbuf as well; thus
- * it wouldn't be eligable for eviction and this function
+ * it wouldn't be eligible for eviction and this function
* would not have been called.
*/
ASSERT(refcount_is_zero(&dn->dn_holds));
- ASSERT(list_head(&dn->dn_dbufs) == NULL);
ASSERT(refcount_is_zero(&dn->dn_tx_holds));
- for (n = 0; n < TXG_SIZE; n++)
- ASSERT(!list_link_active(&dn->dn_dirty_link[n]));
-#endif
- children_dnodes[i] = NULL;
- dnode_destroy(dn);
+ dnode_destroy(dn); /* implicit zrl_remove() */
+ zrl_destroy(&dnh->dnh_zrlock);
+ dnh->dnh_dnode = NULL;
}
- kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+ kmem_free(children_dnodes, sizeof (dnode_children_t) +
+ (epb - 1) * sizeof (dnode_handle_t));
}
/*
@@ -593,7 +1007,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
uint64_t blk;
dnode_t *mdn, *dn;
dmu_buf_impl_t *db;
- dnode_t **children_dnodes;
+ dnode_children_t *children_dnodes;
+ dnode_handle_t *dnh;
/*
* If you are holding the spa config lock as writer, you shouldn't
@@ -607,7 +1022,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) {
dn = (object == DMU_USERUSED_OBJECT) ?
- os->os_userused_dnode : os->os_groupused_dnode;
+ DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os);
if (dn == NULL)
return (ENOENT);
type = dn->dn_type;
@@ -624,7 +1039,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
if (object == 0 || object >= DN_MAX_OBJECT)
return (EINVAL);
- mdn = os->os_meta_dnode;
+ mdn = DMU_META_DNODE(os);
+ ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);
DNODE_VERIFY(mdn);
@@ -651,26 +1067,39 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
idx = object & (epb-1);
+ ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
children_dnodes = dmu_buf_get_user(&db->db);
if (children_dnodes == NULL) {
- dnode_t **winner;
- children_dnodes = kmem_zalloc(epb * sizeof (dnode_t *),
- KM_SLEEP);
+ int i;
+ dnode_children_t *winner;
+ children_dnodes = kmem_alloc(sizeof (dnode_children_t) +
+ (epb - 1) * sizeof (dnode_handle_t), KM_SLEEP);
+ children_dnodes->dnc_count = epb;
+ dnh = &children_dnodes->dnc_children[0];
+ for (i = 0; i < epb; i++) {
+ zrl_init(&dnh[i].dnh_zrlock);
+ dnh[i].dnh_dnode = NULL;
+ }
if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
dnode_buf_pageout)) {
- kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+ kmem_free(children_dnodes, sizeof (dnode_children_t) +
+ (epb - 1) * sizeof (dnode_handle_t));
children_dnodes = winner;
}
}
+ ASSERT(children_dnodes->dnc_count == epb);
- if ((dn = children_dnodes[idx]) == NULL) {
- dnode_phys_t *dnp = (dnode_phys_t *)db->db.db_data+idx;
+ dnh = &children_dnodes->dnc_children[idx];
+ zrl_add(&dnh->dnh_zrlock);
+ if ((dn = dnh->dnh_dnode) == NULL) {
+ dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
dnode_t *winner;
- dn = dnode_create(os, dnp, db, object);
- winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn);
+ dn = dnode_create(os, phys, db, object, dnh);
+ winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn);
if (winner != NULL) {
- dnode_destroy(dn);
+ zrl_add(&dnh->dnh_zrlock);
+ dnode_destroy(dn); /* implicit zrl_remove() */
dn = winner;
}
}
@@ -682,13 +1111,16 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
((flag & DNODE_MUST_BE_FREE) &&
(type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
mutex_exit(&dn->dn_mtx);
+ zrl_remove(&dnh->dnh_zrlock);
dbuf_rele(db, FTAG);
return (type == DMU_OT_NONE ? ENOENT : EEXIST);
}
mutex_exit(&dn->dn_mtx);
if (refcount_add(&dn->dn_holds, tag) == 1)
- dbuf_add_ref(db, dn);
+ dbuf_add_ref(db, dnh);
+ /* Now we can rely on the hold to prevent the dnode from moving. */
+ zrl_remove(&dnh->dnh_zrlock);
DNODE_VERIFY(dn);
ASSERT3P(dn->dn_dbuf, ==, db);
@@ -730,13 +1162,37 @@ void
dnode_rele(dnode_t *dn, void *tag)
{
uint64_t refs;
+ /* Get while the hold prevents the dnode from moving. */
+ dmu_buf_impl_t *db = dn->dn_dbuf;
+ dnode_handle_t *dnh = dn->dn_handle;
mutex_enter(&dn->dn_mtx);
refs = refcount_remove(&dn->dn_holds, tag);
mutex_exit(&dn->dn_mtx);
+
+ /*
+ * It's unsafe to release the last hold on a dnode by dnode_rele() or
+ * indirectly by dbuf_rele() while relying on the dnode handle to
+ * prevent the dnode from moving, since releasing the last hold could
+ * result in the dnode's parent dbuf evicting its dnode handles. For
+ * that reason anyone calling dnode_rele() or dbuf_rele() without some
+ * other direct or indirect hold on the dnode must first drop the dnode
+ * handle.
+ */
+ ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
+
/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
- if (refs == 0 && dn->dn_dbuf)
- dbuf_rele(dn->dn_dbuf, dn);
+ if (refs == 0 && db != NULL) {
+ /*
+ * Another thread could add a hold to the dnode handle in
+ * dnode_hold_impl() while holding the parent dbuf. Since the
+ * hold on the parent dbuf prevents the handle from being
+ * destroyed, the hold on the handle is OK. We can't yet assert
+ * that the handle has zero references, but that will be
+ * asserted anyway when the handle gets destroyed.
+ */
+ dbuf_rele(db, dnh);
+ }
}
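
/*
 * [Editorial sketch -- illustration only, not part of this patch.] The
 * rule documented above, in caller form: a thread relying only on the
 * dnode handle must drop it before releasing a hold that could be the
 * last one. use_and_release() is an invented name.
 */
static void
use_and_release(dnode_handle_t *dnh, void *tag)
{
	dnode_t *dn = dnh->dnh_dnode;

	/*
	 * Calling dnode_rele(dn, tag) here, inside the zrl_add() scope,
	 * would be unsafe: dropping the last hold may evict the parent
	 * dbuf and destroy the very handle we still hold.
	 */
	zrl_remove(&dnh->dnh_zrlock);	/* drop the handle first... */
	dnode_rele(dn, tag);		/* ...the hold still pins dn */
}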
void
@@ -755,7 +1211,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
#ifdef ZFS_DEBUG
mutex_enter(&dn->dn_mtx);
ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
- /* ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); */
+ ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
mutex_exit(&dn->dn_mtx);
#endif
@@ -794,7 +1250,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
/*
* The dnode maintains a hold on its containing dbuf as
* long as there are holds on it. Each instantiated child
- * dbuf maintaines a hold on the dnode. When the last child
+ * dbuf maintains a hold on the dnode. When the last child
* drops its hold, the dnode will drop its hold on the
* containing dbuf. We add a "dirty hold" here so that the
* dnode will hang around after we finish processing its
diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c
index f9ec9f6023..2ee990a3b3 100644
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c
@@ -76,7 +76,11 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
if (child == NULL)
continue;
- ASSERT3P(child->db_dnode, ==, dn);
+#ifdef DEBUG
+ DB_DNODE_ENTER(child);
+ ASSERT3P(DB_DNODE(child), ==, dn);
+ DB_DNODE_EXIT(child);
+#endif /* DEBUG */
if (child->db_parent && child->db_parent != dn->dn_dbuf) {
ASSERT(child->db_parent->db_level == db->db_level);
ASSERT(child->db_blkptr !=
@@ -135,15 +139,18 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
int off, num;
int i, err, epbs;
uint64_t txg = tx->tx_txg;
+ dnode_t *dn;
- epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
off = start - (db->db_blkid * 1<<epbs);
num = end - start + 1;
ASSERT3U(off, >=, 0);
ASSERT3U(num, >=, 0);
ASSERT3U(db->db_level, >, 0);
- ASSERT3U(db->db.db_size, ==, 1<<db->db_dnode->dn_phys->dn_indblkshift);
+ ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
ASSERT(db->db_blkptr != NULL);
@@ -155,10 +162,10 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
ASSERT(db->db_level == 1);
- rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
- err = dbuf_hold_impl(db->db_dnode, db->db_level-1,
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ err = dbuf_hold_impl(dn, db->db_level-1,
(db->db_blkid << epbs) + i, TRUE, FTAG, &child);
- rw_exit(&db->db_dnode->dn_struct_rwlock);
+ rw_exit(&dn->dn_struct_rwlock);
if (err == ENOENT)
continue;
ASSERT(err == 0);
@@ -200,6 +207,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
dbuf_rele(child, FTAG);
}
+ DB_DNODE_EXIT(db);
}
#endif
@@ -209,7 +217,7 @@ static int
free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
dmu_tx_t *tx)
{
- dnode_t *dn = db->db_dnode;
+ dnode_t *dn;
blkptr_t *bp;
dmu_buf_impl_t *subdb;
uint64_t start, end, dbstart, dbend, i;
@@ -230,7 +238,9 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
dbuf_release_bp(db);
bp = (blkptr_t *)db->db.db_data;
- epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
shift = (db->db_level - 1) * epbs;
dbstart = db->db_blkid << epbs;
start = blkid >> shift;
@@ -253,6 +263,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
blocks_freed = free_blocks(dn, bp, end-start+1, tx);
arc_buf_freeze(db->db_buf);
ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
+ DB_DNODE_EXIT(db);
return (all ? ALL : blocks_freed);
}
@@ -272,6 +283,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
}
dbuf_rele(subdb, FTAG);
}
+ DB_DNODE_EXIT(db);
arc_buf_freeze(db->db_buf);
#ifdef ZFS_DEBUG
bp -= (end-start)+1;
@@ -375,7 +387,11 @@ dnode_evict_dbufs(dnode_t *dn)
for (; db != &marker; db = list_head(&dn->dn_dbufs)) {
list_remove(&dn->dn_dbufs, db);
list_insert_tail(&dn->dn_dbufs, db);
- ASSERT3P(db->db_dnode, ==, dn);
+#ifdef DEBUG
+ DB_DNODE_ENTER(db);
+ ASSERT3P(DB_DNODE(db), ==, dn);
+ DB_DNODE_EXIT(db);
+#endif /* DEBUG */
mutex_enter(&db->db_mtx);
if (db->db_state == DB_EVICTING) {
diff --git a/usr/src/uts/common/fs/zfs/refcount.c b/usr/src/uts/common/fs/zfs/refcount.c
index 8358b4ceeb..600132f080 100644
--- a/usr/src/uts/common/fs/zfs/refcount.c
+++ b/usr/src/uts/common/fs/zfs/refcount.c
@@ -25,7 +25,7 @@
#include <sys/zfs_context.h>
#include <sys/refcount.h>
-#if defined(DEBUG) || !defined(_KERNEL)
+#ifdef ZFS_DEBUG
#ifdef _KERNEL
int reference_tracking_enable = FALSE; /* runs out of memory too easily */
@@ -189,4 +189,35 @@ refcount_remove(refcount_t *rc, void *holder)
return (refcount_remove_many(rc, 1, holder));
}
-#endif
+void
+refcount_transfer(refcount_t *dst, refcount_t *src)
+{
+ int64_t count, removed_count;
+ list_t list, removed;
+
+ list_create(&list, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+ list_create(&removed, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+
+ mutex_enter(&src->rc_mtx);
+ count = src->rc_count;
+ removed_count = src->rc_removed_count;
+ src->rc_count = 0;
+ src->rc_removed_count = 0;
+ list_move_tail(&list, &src->rc_list);
+ list_move_tail(&removed, &src->rc_removed);
+ mutex_exit(&src->rc_mtx);
+
+ mutex_enter(&dst->rc_mtx);
+ dst->rc_count += count;
+ dst->rc_removed_count += removed_count;
+ list_move_tail(&dst->rc_list, &list);
+ list_move_tail(&dst->rc_removed, &removed);
+ mutex_exit(&dst->rc_mtx);
+
+ list_destroy(&list);
+ list_destroy(&removed);
+}
+
+#endif /* ZFS_DEBUG */
diff --git a/usr/src/uts/common/fs/zfs/sa.c b/usr/src/uts/common/fs/zfs/sa.c
index 403fd86a35..4cb4546b25 100644
--- a/usr/src/uts/common/fs/zfs/sa.c
+++ b/usr/src/uts/common/fs/zfs/sa.c
@@ -1612,6 +1612,8 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
uint16_t buflen, dmu_tx_t *tx)
{
sa_os_t *sa = hdl->sa_os->os_sa;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
+ dnode_t *dn;
sa_bulk_attr_t *attr_desc;
void *old_data[2];
int bonus_attr_count = 0;
@@ -1629,7 +1631,9 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
/* First make of copy of the old data */
- if (((dmu_buf_impl_t *)hdl->sa_bonus)->db_dnode->dn_bonuslen) {
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ if (dn->dn_bonuslen != 0) {
bonus_data_size = hdl->sa_bonus->db_size;
old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP);
bcopy(hdl->sa_bonus->db_data, old_data[0],
@@ -1638,6 +1642,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
} else {
old_data[0] = NULL;
}
+ DB_DNODE_EXIT(db);
/* Bring spill buffer online if it isn't currently */
diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h
index 4c05806e3e..cf1bbc030f 100644
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h
@@ -32,6 +32,7 @@
#include <sys/arc.h>
#include <sys/zfs_context.h>
#include <sys/refcount.h>
+#include <sys/zrlock.h>
#ifdef __cplusplus
extern "C" {
@@ -82,9 +83,6 @@ struct dmu_tx;
* etc.
*/
-#define LIST_LINK_INACTIVE(link) \
- ((link)->list_next == NULL && (link)->list_prev == NULL)
-
struct dmu_buf_impl;
typedef enum override_states {
@@ -149,15 +147,17 @@ typedef struct dmu_buf_impl {
struct objset *db_objset;
/*
- * the dnode we belong to (NULL when evicted)
+ * handle to safely access the dnode we belong to (NULL when evicted)
*/
- struct dnode *db_dnode;
+ struct dnode_handle *db_dnode_handle;
/*
* our parent buffer; if the dnode points to us directly,
- * db_parent == db_dnode->dn_dbuf
+ * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf
* only accessed by sync thread ???
* (NULL when evicted)
+ * May change from NULL to non-NULL under the protection of db_mtx
+ * (see dbuf_check_blkptr())
*/
struct dmu_buf_impl *db_parent;
@@ -284,24 +284,46 @@ void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
+#define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode)
+#define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock)
+#define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db)))
+#define DB_DNODE_EXIT(_db) (zrl_remove(&DB_DNODE_LOCK(_db)))
+#define DB_DNODE_HELD(_db) (!zrl_is_zero(&DB_DNODE_LOCK(_db)))
+#define DB_GET_SPA(_spa_p, _db) { \
+ dnode_t *__dn; \
+ DB_DNODE_ENTER(_db); \
+ __dn = DB_DNODE(_db); \
+ *(_spa_p) = __dn->dn_objset->os_spa; \
+ DB_DNODE_EXIT(_db); \
+}
+#define DB_GET_OBJSET(_os_p, _db) { \
+ dnode_t *__dn; \
+ DB_DNODE_ENTER(_db); \
+ __dn = DB_DNODE(_db); \
+ *(_os_p) = __dn->dn_objset; \
+ DB_DNODE_EXIT(_db); \
+}
+
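+/*
+ * [Editorial sketch -- illustration only, not part of this patch.]
+ * Typical use of the handle macros above: DB_DNODE_ENTER() takes a
+ * zrlock reference that pins the dnode's address, so the pointer from
+ * DB_DNODE() must not be cached past DB_DNODE_EXIT(). The helper name
+ * my_dbuf_blocksize() is invented.
+ *
+ *	static uint32_t
+ *	my_dbuf_blocksize(dmu_buf_impl_t *db)
+ *	{
+ *		dnode_t *dn;
+ *		uint32_t blksz;
+ *
+ *		DB_DNODE_ENTER(db);	(zrl_add(): blocks dnode_move())
+ *		dn = DB_DNODE(db);
+ *		blksz = dn->dn_datablksz; (safe while the handle is held)
+ *		DB_DNODE_EXIT(db);	(zrl_remove(): dn may now move)
+ *
+ *		return (blksz);
+ *	}
+ */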
void dbuf_init(void);
void dbuf_fini(void);
-#define DBUF_IS_METADATA(db) \
- ((db)->db_level > 0 || dmu_ot[(db)->db_dnode->dn_type].ot_metadata)
+boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
+
+#define DBUF_IS_METADATA(_db) \
+ (dbuf_is_metadata(_db))
-#define DBUF_GET_BUFC_TYPE(db) \
- (DBUF_IS_METADATA(db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
+#define DBUF_GET_BUFC_TYPE(_db) \
+ (DBUF_IS_METADATA(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
-#define DBUF_IS_CACHEABLE(db) \
- ((db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \
- (DBUF_IS_METADATA(db) && \
- ((db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
+#define DBUF_IS_CACHEABLE(_db) \
+ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \
+ (DBUF_IS_METADATA(_db) && \
+ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
-#define DBUF_IS_L2CACHEABLE(db) \
- ((db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \
- (DBUF_IS_METADATA(db) && \
- ((db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
+#define DBUF_IS_L2CACHEABLE(_db) \
+ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \
+ (DBUF_IS_METADATA(_db) && \
+ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
#ifdef ZFS_DEBUG
@@ -332,7 +354,7 @@ _NOTE(CONSTCOND) } while (0)
sprintf_blkptr(__blkbuf, bp); \
dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \
kmem_free(__blkbuf, BP_SPRINTF_LEN); \
- } \
+ } \
_NOTE(CONSTCOND) } while (0)
#define DBUF_VERIFY(db) dbuf_verify(db)
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
index b6061cfffc..c504c23310 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -335,6 +335,7 @@ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
int dmu_bonus_max(void);
int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
+dmu_object_type_t dmu_get_bonustype(dmu_buf_t *);
int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
/*
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
index 5c5119a207..12d5c4ddd4 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
@@ -40,6 +40,8 @@
extern "C" {
#endif
+extern krwlock_t os_lock;
+
struct dsl_dataset;
struct dmu_tx;
@@ -68,9 +70,15 @@ struct objset {
spa_t *os_spa;
arc_buf_t *os_phys_buf;
objset_phys_t *os_phys;
- dnode_t *os_meta_dnode;
- dnode_t *os_userused_dnode;
- dnode_t *os_groupused_dnode;
+ /*
+ * The following "special" dnodes have no parent and are exempt from
+ * dnode_move(), but they root their descendents in this objset using
+ * handles anyway, so that all access to dnodes from dbufs consistently
+ * uses handles.
+ */
+ dnode_handle_t os_meta_dnode;
+ dnode_handle_t os_userused_dnode;
+ dnode_handle_t os_groupused_dnode;
zilog_t *os_zil;
/* can change, under dsl_dir's locks: */
@@ -113,6 +121,9 @@ struct objset {
#define DMU_META_OBJSET 0
#define DMU_META_DNODE_OBJECT 0
#define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0)
+#define DMU_META_DNODE(os) ((os)->os_meta_dnode.dnh_dnode)
+#define DMU_USERUSED_DNODE(os) ((os)->os_userused_dnode.dnh_dnode)
+#define DMU_GROUPUSED_DNODE(os) ((os)->os_groupused_dnode.dnh_dnode)
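+/*
+ * [Editorial sketch -- illustration only, not part of this patch.] The
+ * special dnodes are now reached through their embedded handles, so
+ * callers test the macro result rather than a raw dnode_t field.
+ * count_special_dnodes() is an invented name.
+ *
+ *	static int
+ *	count_special_dnodes(objset_t *os)
+ *	{
+ *		int n = 1;	(the meta dnode is always present)
+ *
+ *		ASSERT(DMU_META_DNODE(os) != NULL);
+ *		if (DMU_USERUSED_DNODE(os) != NULL)
+ *			n++;
+ *		if (DMU_GROUPUSED_DNODE(os) != NULL)
+ *			n++;
+ *
+ *		return (n);
+ *	}
+ */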
#define DMU_OS_IS_L2CACHEABLE(os) \
((os)->os_secondary_cache == ZFS_CACHE_ALL || \
@@ -161,6 +172,9 @@ boolean_t dmu_objset_userused_enabled(objset_t *os);
int dmu_objset_userspace_upgrade(objset_t *os);
boolean_t dmu_objset_userspace_present(objset_t *os);
+void dmu_objset_init(void);
+void dmu_objset_fini(void);
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/fs/zfs/sys/dnode.h b/usr/src/uts/common/fs/zfs/sys/dnode.h
index 8bae1602e7..9ad4be36bf 100644
--- a/usr/src/uts/common/fs/zfs/sys/dnode.h
+++ b/usr/src/uts/common/fs/zfs/sys/dnode.h
@@ -32,6 +32,7 @@
#include <sys/zio.h>
#include <sys/refcount.h>
#include <sys/dmu_zfetch.h>
+#include <sys/zrlock.h>
#ifdef __cplusplus
extern "C" {
@@ -156,6 +157,7 @@ typedef struct dnode {
struct objset *dn_objset;
uint64_t dn_object;
struct dmu_buf_impl *dn_dbuf;
+ struct dnode_handle *dn_handle;
dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
/*
@@ -172,6 +174,7 @@ typedef struct dnode {
uint8_t dn_nlevels;
uint8_t dn_indblkshift;
uint8_t dn_datablkshift; /* zero if blksz not power of 2! */
+ uint8_t dn_moved; /* Has this dnode been moved? */
uint16_t dn_datablkszsec; /* in 512b sectors */
uint32_t dn_datablksz; /* in bytes */
uint64_t dn_maxblkid;
@@ -183,6 +186,9 @@ typedef struct dnode {
uint16_t dn_next_bonuslen[TXG_SIZE];
uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */
+ /* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */
+ uint32_t dn_dbufs_count; /* count of dn_dbufs */
+
/* protected by os_lock: */
list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */
@@ -202,8 +208,11 @@ typedef struct dnode {
refcount_t dn_holds;
kmutex_t dn_dbufs_mtx;
- list_t dn_dbufs; /* linked list of descendent dbuf_t's */
+ list_t dn_dbufs; /* descendent dbufs */
+
+ /* protected by dn_struct_rwlock */
struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */
+
boolean_t dn_have_spill; /* have spill or are spilling */
/* parent IO for current sync write */
@@ -220,6 +229,22 @@ typedef struct dnode {
struct zfetch dn_zfetch;
} dnode_t;
+/*
+ * Adds a level of indirection between the dbuf and the dnode to avoid
+ * iterating descendent dbufs in dnode_move(). Handles are not allocated
+ * individually, but as an array of child dnodes in dnode_hold_impl().
+ */
+typedef struct dnode_handle {
+ /* Protects dnh_dnode from modification by dnode_move(). */
+ zrlock_t dnh_zrlock;
+ dnode_t *dnh_dnode;
+} dnode_handle_t;
+
+typedef struct dnode_children {
+ size_t dnc_count; /* number of children */
+ dnode_handle_t dnc_children[1]; /* sized dynamically */
+} dnode_children_t;
+
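+/*
+ * [Editorial sketch -- illustration only, not part of this patch.]
+ * dnc_children[1] is an old-style flexible array, so a block holding
+ * "epb" handles is allocated with (epb - 1) extra elements, exactly as
+ * dnode_hold_impl() and dnode_buf_pageout() size it. The helper name
+ * dnode_children_alloc() is invented.
+ *
+ *	static dnode_children_t *
+ *	dnode_children_alloc(int epb)
+ *	{
+ *		dnode_children_t *dnc;
+ *		int i;
+ *
+ *		dnc = kmem_alloc(sizeof (dnode_children_t) +
+ *		    (epb - 1) * sizeof (dnode_handle_t), KM_SLEEP);
+ *		dnc->dnc_count = epb;
+ *		for (i = 0; i < epb; i++) {
+ *			zrl_init(&dnc->dnc_children[i].dnh_zrlock);
+ *			dnc->dnc_children[i].dnh_dnode = NULL;
+ *		}
+ *		return (dnc);
+ *	}
+ */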
typedef struct free_range {
avl_node_t fr_node;
uint64_t fr_blkid;
@@ -227,8 +252,8 @@ typedef struct free_range {
} free_range_t;
dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
- uint64_t object);
-void dnode_special_close(dnode_t *dn);
+ uint64_t object, dnode_handle_t *dnh);
+void dnode_special_close(dnode_handle_t *dnh);
void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx);
void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx);
diff --git a/usr/src/uts/common/fs/zfs/sys/refcount.h b/usr/src/uts/common/fs/zfs/sys/refcount.h
index bc3ade80f1..1752c64e3e 100644
--- a/usr/src/uts/common/fs/zfs/sys/refcount.h
+++ b/usr/src/uts/common/fs/zfs/sys/refcount.h
@@ -40,7 +40,7 @@ extern "C" {
*/
#define FTAG ((char *)__func__)
-#if defined(DEBUG) || !defined(_KERNEL)
+#ifdef ZFS_DEBUG
typedef struct reference {
list_node_t ref_link;
void *ref_holder;
@@ -67,11 +67,12 @@ int64_t refcount_add(refcount_t *rc, void *holder_tag);
int64_t refcount_remove(refcount_t *rc, void *holder_tag);
int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag);
int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);
+void refcount_transfer(refcount_t *dst, refcount_t *src);
void refcount_init(void);
void refcount_fini(void);
-#else /* DEBUG */
+#else /* ZFS_DEBUG */
typedef struct refcount {
uint64_t rc_count;
@@ -97,7 +98,7 @@ typedef struct refcount {
#define refcount_init()
#define refcount_fini()
-#endif /* DEBUG */
+#endif /* ZFS_DEBUG */
#ifdef __cplusplus
}
diff --git a/usr/src/uts/common/fs/zfs/sys/sa_impl.h b/usr/src/uts/common/fs/zfs/sys/sa_impl.h
index 62497e7025..6661e47cfc 100644
--- a/usr/src/uts/common/fs/zfs/sys/sa_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/sa_impl.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_SA_IMPL_H
@@ -232,7 +231,7 @@ struct sa_handle {
((a == DMU_OT_SA) ? B_TRUE : B_FALSE)
#define SA_BONUSTYPE_FROM_DB(db) \
- (((dmu_buf_impl_t *)db)->db_dnode->dn_bonustype)
+ (dmu_get_bonustype((dmu_buf_t *)db))
#define SA_BLKPTR_SPACE (DN_MAX_BONUSLEN - sizeof (blkptr_t))
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
index 4781ee6862..e5257b89e5 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_FS_ZFS_ZNODE_H
@@ -188,6 +187,7 @@ typedef struct znode {
uint8_t z_unlinked; /* file has been unlinked */
uint8_t z_atime_dirty; /* atime needs to be synced */
uint8_t z_zn_prefetch; /* Prefetch znodes? */
+ uint8_t z_moved; /* Has this znode been moved? */
uint_t z_blksz; /* block size in bytes */
uint_t z_seq; /* modification sequence number */
uint64_t z_mapcnt; /* number of pages mapped to file */
diff --git a/usr/src/uts/common/fs/zfs/sys/zrlock.h b/usr/src/uts/common/fs/zfs/sys/zrlock.h
new file mode 100644
index 0000000000..dcd63f7b5b
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zrlock.h
@@ -0,0 +1,66 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_ZRLOCK_H
+#define _SYS_ZRLOCK_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct zrlock {
+ kmutex_t zr_mtx;
+ volatile int32_t zr_refcount;
+ kcondvar_t zr_cv;
+ uint16_t zr_pad;
+#ifdef ZFS_DEBUG
+ kthread_t *zr_owner;
+ const char *zr_caller;
+#endif
+} zrlock_t;
+
+extern void zrl_init(zrlock_t *);
+extern void zrl_destroy(zrlock_t *);
+#ifdef ZFS_DEBUG
+#define zrl_add(_z) zrl_add_debug((_z), __func__)
+extern void zrl_add_debug(zrlock_t *, const char *);
+#else
+extern void zrl_add(zrlock_t *);
+#endif
+extern void zrl_remove(zrlock_t *);
+extern int zrl_tryenter(zrlock_t *);
+extern void zrl_exit(zrlock_t *);
+extern int zrl_is_zero(zrlock_t *);
+extern int zrl_is_locked(zrlock_t *);
+#ifdef ZFS_DEBUG
+extern kthread_t *zrl_owner(zrlock_t *);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZRLOCK_H */
diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c
index 3c04360a13..f6558f60a5 100644
--- a/usr/src/uts/common/fs/zfs/zfs_znode.c
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c
@@ -81,9 +81,6 @@
#define ZNODE_STAT_ADD(stat) /* nothing */
#endif /* ZNODE_STATS */
-#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3))
-#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))
-
/*
* Functions needed for userland (ie: libzpool) are not put under
* #ifdef_KERNEL; the rest of the functions have dependencies
@@ -136,6 +133,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
zp->z_dirlocks = NULL;
zp->z_acl_cached = NULL;
+ zp->z_moved = 0;
return (0);
}
@@ -228,6 +226,12 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
*/
ozp->z_sa_hdl = NULL;
POINTER_INVALIDATE(&ozp->z_zfsvfs);
+
+ /*
+ * Mark the znode.
+ */
+ nzp->z_moved = 1;
+ ozp->z_moved = (uint8_t)-1;
}
/*ARGSUSED*/
@@ -478,6 +482,8 @@ zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
vattr.va_gid = crgetgid(kcred);
sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+ ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs));
+ sharezp->z_moved = 0;
sharezp->z_unlinked = 0;
sharezp->z_atime_dirty = 0;
sharezp->z_zfsvfs = zfsvfs;
@@ -627,6 +633,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
ASSERT(zp->z_dirlocks == NULL);
ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
+ zp->z_moved = 0;
/*
* Defer setting z_zfsvfs until the znode is ready to be a candidate for
@@ -759,7 +766,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
{
uint64_t crtime[2], atime[2], mtime[2], ctime[2];
uint64_t mode, size, links, parent, pflags;
- uint64_t dzp_pflags = 0;
+ uint64_t dzp_pflags = 0;
uint64_t rdev = 0;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
dmu_buf_t *db;
@@ -794,7 +801,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
*/
/*
* There's currently no mechanism for pre-reading the blocks that will
- * be to needed allocate a new object, so we accept the small chance
+ * be needed to allocate a new object, so we accept the small chance
* that there will be an i/o error and we will fail one of the
* assertions below.
*/
@@ -1807,6 +1814,8 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
vattr.va_gid = crgetgid(cr);
rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+ ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
+ rootzp->z_moved = 0;
rootzp->z_unlinked = 0;
rootzp->z_atime_dirty = 0;
rootzp->z_is_sa = USE_SA(version, os);
@@ -1843,7 +1852,6 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
- ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
rootzp->z_zfsvfs = &zfsvfs;
VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
cr, NULL, &acl_ids));
diff --git a/usr/src/uts/common/fs/zfs/zrlock.c b/usr/src/uts/common/fs/zfs/zrlock.c
new file mode 100644
index 0000000000..ec94b08555
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zrlock.c
@@ -0,0 +1,194 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * A Zero Reference Lock (ZRL) is a reference count that can lock out new
+ * references only when the count is zero and only without waiting if the count
+ * is not already zero. It is similar to a read-write lock in that it allows
+ * multiple readers and only a single writer, but it does not allow a writer to
+ * block while waiting for readers to exit, and therefore the question of
+ * reader/writer priority is moot (no WRWANT bit). Since the equivalent of
+ * rw_enter(&lock, RW_WRITER) is disallowed and only tryenter() is allowed, it
+ * is perfectly safe for the same reader to acquire the same lock multiple
+ * times. The fact that a ZRL is reentrant for readers (through multiple calls
+ * to zrl_add()) makes it convenient for determining whether something is
+ * actively referenced without the fuss of flagging lock ownership across
+ * function calls.
+ */
+#include <sys/zrlock.h>
+
+/*
+ * A ZRL can be locked only while there are zero references, so ZRL_LOCKED is
+ * treated as zero references.
+ */
+#define ZRL_LOCKED ((uint32_t)-1)
+#define ZRL_DESTROYED -2
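+
+/*
+ * [Editorial sketch -- illustration only, not part of this patch.] The
+ * two sides of a ZRL: readers nest freely with zrl_add()/zrl_remove(),
+ * while the exclusive path only ever tries and backs off if any
+ * reference exists. pin_object() and try_move_object() are invented
+ * names.
+ *
+ *	static void
+ *	pin_object(zrlock_t *zrl)
+ *	{
+ *		zrl_add(zrl);	(cheap atomic increment; may nest)
+ *		(... the object cannot be locked out here ...)
+ *		zrl_remove(zrl);
+ *	}
+ *
+ *	static int
+ *	try_move_object(zrlock_t *zrl)
+ *	{
+ *		if (!zrl_tryenter(zrl))	(fails at once if refcount != 0)
+ *			return (0);	(caller backs off, retries later)
+ *		(... relocate the object; no reader can enter ...)
+ *		zrl_exit(zrl);		(wakes waiting zrl_add() callers)
+ *		return (1);
+ *	}
+ */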
+
+void
+zrl_init(zrlock_t *zrl)
+{
+ mutex_init(&zrl->zr_mtx, NULL, MUTEX_DEFAULT, NULL);
+ zrl->zr_refcount = 0;
+ cv_init(&zrl->zr_cv, NULL, CV_DEFAULT, NULL);
+#ifdef ZFS_DEBUG
+ zrl->zr_owner = NULL;
+ zrl->zr_caller = NULL;
+#endif
+}
+
+void
+zrl_destroy(zrlock_t *zrl)
+{
+ ASSERT(zrl->zr_refcount == 0);
+
+ mutex_destroy(&zrl->zr_mtx);
+ zrl->zr_refcount = ZRL_DESTROYED;
+ cv_destroy(&zrl->zr_cv);
+}
+
+void
+#ifdef ZFS_DEBUG
+zrl_add_debug(zrlock_t *zrl, const char *zc)
+#else
+zrl_add(zrlock_t *zrl)
+#endif
+{
+ uint32_t n = (uint32_t)zrl->zr_refcount;
+
+ while (n != ZRL_LOCKED) {
+ uint32_t cas = atomic_cas_32(
+ (uint32_t *)&zrl->zr_refcount, n, n + 1);
+ if (cas == n) {
+ ASSERT((int32_t)n >= 0);
+#ifdef ZFS_DEBUG
+ if (zrl->zr_owner == curthread) {
+ DTRACE_PROBE2(zrlock__reentry,
+ zrlock_t *, zrl, uint32_t, n);
+ }
+ zrl->zr_owner = curthread;
+ zrl->zr_caller = zc;
+#endif
+ return;
+ }
+ n = cas;
+ }
+
+ mutex_enter(&zrl->zr_mtx);
+ while (zrl->zr_refcount == ZRL_LOCKED) {
+ cv_wait(&zrl->zr_cv, &zrl->zr_mtx);
+ }
+ ASSERT(zrl->zr_refcount >= 0);
+ zrl->zr_refcount++;
+#ifdef ZFS_DEBUG
+ zrl->zr_owner = curthread;
+ zrl->zr_caller = zc;
+#endif
+ mutex_exit(&zrl->zr_mtx);
+}
+
+void
+zrl_remove(zrlock_t *zrl)
+{
+ uint32_t n;
+
+ n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount);
+ ASSERT((int32_t)n >= 0);
+#ifdef ZFS_DEBUG
+ if (zrl->zr_owner == curthread) {
+ zrl->zr_owner = NULL;
+ zrl->zr_caller = NULL;
+ }
+#endif
+}
+
+int
+zrl_tryenter(zrlock_t *zrl)
+{
+ uint32_t n = (uint32_t)zrl->zr_refcount;
+
+ if (n == 0) {
+ uint32_t cas = atomic_cas_32(
+ (uint32_t *)&zrl->zr_refcount, 0, ZRL_LOCKED);
+ if (cas == 0) {
+#ifdef ZFS_DEBUG
+ ASSERT(zrl->zr_owner == NULL);
+ zrl->zr_owner = curthread;
+#endif
+ return (1);
+ }
+ }
+
+ ASSERT((int32_t)n > ZRL_DESTROYED);
+
+ return (0);
+}
+
+void
+zrl_exit(zrlock_t *zrl)
+{
+ ASSERT(zrl->zr_refcount == ZRL_LOCKED);
+
+ mutex_enter(&zrl->zr_mtx);
+#ifdef ZFS_DEBUG
+ ASSERT(zrl->zr_owner == curthread);
+ zrl->zr_owner = NULL;
+ membar_producer(); /* make sure the owner store happens first */
+#endif
+ zrl->zr_refcount = 0;
+ cv_broadcast(&zrl->zr_cv);
+ mutex_exit(&zrl->zr_mtx);
+}
+
+int
+zrl_refcount(zrlock_t *zrl)
+{
+ ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+
+ int n = (int)zrl->zr_refcount;
+ return (n <= 0 ? 0 : n);
+}
+
+int
+zrl_is_zero(zrlock_t *zrl)
+{
+ ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+
+ return (zrl->zr_refcount <= 0);
+}
+
+int
+zrl_is_locked(zrlock_t *zrl)
+{
+ ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+
+ return (zrl->zr_refcount == ZRL_LOCKED);
+}
+
+#ifdef ZFS_DEBUG
+kthread_t *
+zrl_owner(zrlock_t *zrl)
+{
+ return (zrl->zr_owner);
+}
+#endif
diff --git a/usr/src/uts/common/sys/dnlc.h b/usr/src/uts/common/sys/dnlc.h
index c58de9c011..070506ee31 100644
--- a/usr/src/uts/common/sys/dnlc.h
+++ b/usr/src/uts/common/sys/dnlc.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -39,8 +38,6 @@
#ifndef _SYS_DNLC_H
#define _SYS_DNLC_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -163,7 +160,8 @@ struct nc_stats {
*/
#define DNLCHASH(name, dvp, hash, namlen) \
{ \
- char Xc, *Xcp; \
+ char Xc; \
+ const char *Xcp; \
hash = (int)((uintptr_t)(dvp)) >> 8; \
for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++) \
(hash) = ((hash) << 4) + (hash) + Xc; \
@@ -181,13 +179,13 @@ extern vnode_t negative_cache_vnode;
#define DNLC_NO_VNODE &negative_cache_vnode
void dnlc_init(void);
-void dnlc_enter(vnode_t *, char *, vnode_t *);
-void dnlc_update(vnode_t *, char *, vnode_t *);
-vnode_t *dnlc_lookup(vnode_t *, char *);
+void dnlc_enter(vnode_t *, const char *, vnode_t *);
+void dnlc_update(vnode_t *, const char *, vnode_t *);
+vnode_t *dnlc_lookup(vnode_t *, const char *);
void dnlc_purge(void);
void dnlc_purge_vp(vnode_t *);
int dnlc_purge_vfsp(vfs_t *, int);
-void dnlc_remove(vnode_t *, char *);
+void dnlc_remove(vnode_t *, const char *);
int dnlc_fs_purge1(struct vnodeops *);
vnode_t *dnlc_reverse_lookup(vnode_t *, char *, size_t);
void dnlc_reduce_cache(void *);
@@ -296,7 +294,7 @@ dcret_t dnlc_dir_start(dcanchor_t *dcap, uint_t num_entries);
* For example, "handle" for ufs holds the inumber and a directory
* entry offset. Returns DOK, DNOCACHE, DTOOBIG.
*/
-dcret_t dnlc_dir_add_entry(dcanchor_t *dcap, char *name, uint64_t handle);
+dcret_t dnlc_dir_add_entry(dcanchor_t *dcap, const char *name, uint64_t handle);
/*
* dnlc_dir_add_space adds free space (length and file system specific
@@ -322,21 +320,22 @@ void dnlc_dir_purge(dcanchor_t *dcap);
* and returns the file system handle specified on dnlc_dir_add_entry()
* in "handlep". Returns DFOUND, DNOENT, DNOCACHE.
*/
-dcret_t dnlc_dir_lookup(dcanchor_t *dcap, char *name, uint64_t *handlep);
+dcret_t dnlc_dir_lookup(dcanchor_t *dcap, const char *name, uint64_t *handlep);
/*
* dnlc_dir_update() amends the handle for an entry in a directory cache
* "handle" is the new file system specific handle for the file "name".
* Returns DFOUND, DNOENT, DNOCACHE.
*/
-dcret_t dnlc_dir_update(dcanchor_t *dcap, char *name, uint64_t handle);
+dcret_t dnlc_dir_update(dcanchor_t *dcap, const char *name, uint64_t handle);
/*
* dnlc_dir_rem_entry() removes an entry form a directory cache.
* Returns the handle if "handlep" non null.
* Returns DFOUND, DNOENT, DNOCACHE.
*/
-dcret_t dnlc_dir_rem_entry(dcanchor_t *dcap, char *name, uint64_t *handlep);
+dcret_t dnlc_dir_rem_entry(dcanchor_t *dcap, const char *name,
+ uint64_t *handlep);
/*
* dnlc_dir_rem_space_by_len() looks up and returns free space in a
diff --git a/usr/src/uts/common/sys/kmem.h b/usr/src/uts/common/sys/kmem.h
index 3a37f63fa2..03d4d24ba3 100644
--- a/usr/src/uts/common/sys/kmem.h
+++ b/usr/src/uts/common/sys/kmem.h
@@ -95,6 +95,15 @@ typedef enum kmem_cbrc {
#ifdef _KERNEL
+/*
+ * Helps clients implementing the move() callback to recognize known objects by
+ * testing a client-designated pointer member. Takes advantage of the fact that
+ * any scribbling to freed memory done by kmem is guaranteed to set one of the
+ * two low order bits.
+ */
+#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3))
+#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))
+
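+/*
+ * [Editorial sketch -- illustration only, not part of this patch.] How a
+ * kmem move callback uses the macros above to classify a buffer: freed
+ * memory is filled with patterns (0xbaddcafe, 0xdeadbeef) that set a low
+ * bit, and the client sets one explicitly on teardown, so an invalid
+ * pointer means "not a known, live object". my_obj_t and my_move() are
+ * invented names.
+ *
+ *	typedef struct my_obj {
+ *		void	*mo_owner;	(client-designated pointer member)
+ *	} my_obj_t;
+ *
+ *	static kmem_cbrc_t
+ *	my_move(void *buf, void *newbuf, size_t size, void *arg)
+ *	{
+ *		my_obj_t *op = buf;
+ *
+ *		if (!POINTER_IS_VALID(op->mo_owner))
+ *			return (KMEM_CBRC_DONT_KNOW);
+ *
+ *		(... revalidate under locks, copy *op to newbuf ...)
+ *		return (KMEM_CBRC_YES);
+ *	}
+ */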
extern int kmem_ready;
extern pgcnt_t kmem_reapahead;
diff --git a/usr/src/uts/intel/io/dktp/dcdev/dadk.c b/usr/src/uts/intel/io/dktp/dcdev/dadk.c
index 12bf34b65d..3fd4477fd1 100644
--- a/usr/src/uts/intel/io/dktp/dcdev/dadk.c
+++ b/usr/src/uts/intel/io/dktp/dcdev/dadk.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -868,11 +867,7 @@ dadk_ioctl(opaque_t objp, dev_t dev, int cmd, intptr_t arg, int flag,
sizeof (struct dk_callback), KM_SLEEP);
bcopy(dkc, dkc2, sizeof (*dkc2));
- /*
- * Borrow b_list to carry private data
- * to the b_iodone func.
- */
- bp->b_list = (struct buf *)dkc2;
+ bp->b_private = dkc2;
bp->b_iodone = dadk_flushdone;
is_sync = 0;
}
@@ -988,7 +983,7 @@ dadk_ioctl(opaque_t objp, dev_t dev, int cmd, intptr_t arg, int flag,
int
dadk_flushdone(struct buf *bp)
{
- struct dk_callback *dkc = (struct dk_callback *)bp->b_list;
+ struct dk_callback *dkc = bp->b_private;
ASSERT(dkc != NULL && dkc->dkc_callback != NULL);
diff --git a/usr/src/uts/sun/io/dada/targets/dad.c b/usr/src/uts/sun/io/dada/targets/dad.c
index 1d71904da5..b3be5f3ea5 100644
--- a/usr/src/uts/sun/io/dada/targets/dad.c
+++ b/usr/src/uts/sun/io/dada/targets/dad.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
*/
@@ -3465,6 +3464,7 @@ dcdioctl(dev_t dev, int cmd, intptr_t arg, int flag,
bp->b_un.b_addr = 0;
bp->b_iodone = NULL;
bp->b_list = NULL;
+ bp->b_private = NULL;
if ((flag & FKIOCTL) && dkc != NULL &&
dkc->dkc_callback != NULL) {
@@ -3472,7 +3472,7 @@ dcdioctl(dev_t dev, int cmd, intptr_t arg, int flag,
kmem_zalloc(sizeof (*dkc2), KM_SLEEP);
bcopy(dkc, dkc2, sizeof (*dkc2));
- bp->b_list = (struct buf *)dkc2;
+ bp->b_private = dkc2;
bp->b_iodone = dcdflushdone;
is_sync = 0;
}
@@ -3500,7 +3500,7 @@ dcdflushdone(struct buf *bp)
struct dcd_disk *un = ddi_get_soft_state(dcd_state,
DCDUNIT(bp->b_edev));
struct dcd_pkt *pkt = BP_PKT(bp);
- struct dk_callback *dkc = (struct dk_callback *)bp->b_list;
+ struct dk_callback *dkc = bp->b_private;
ASSERT(un != NULL);
ASSERT(bp == un->un_sbufp);
@@ -3514,7 +3514,7 @@ dcdflushdone(struct buf *bp)
(*dkc->dkc_callback)(dkc->dkc_cookie, geterror(bp));
kmem_free(dkc, sizeof (*dkc));
bp->b_iodone = NULL;
- bp->b_list = NULL;
+ bp->b_private = NULL;
}
/*