Diffstat (limited to 'usr/src')
29 files changed, 1491 insertions, 321 deletions
diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c index 31681b2d5a..56686fe30a 100644 --- a/usr/src/cmd/zdb/zdb.c +++ b/usr/src/cmd/zdb/zdb.c @@ -1459,7 +1459,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) } if (object == 0) { - dn = os->os_meta_dnode; + dn = DMU_META_DNODE(os); } else { error = dmu_bonus_hold(os, object, FTAG, &db); if (error) @@ -1467,7 +1467,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) object, error); bonus = db->db_data; bsize = db->db_size; - dn = ((dmu_buf_impl_t *)db)->db_dnode; + dn = DB_DNODE((dmu_buf_impl_t *)db); } dmu_object_info_from_dnode(dn, &doi); @@ -1631,8 +1631,8 @@ dump_dir(objset_t *os) dump_object(os, 0, verbosity, &print_header); object_count = 0; - if (os->os_userused_dnode && - os->os_userused_dnode->dn_type != 0) { + if (DMU_USERUSED_DNODE(os) != NULL && + DMU_USERUSED_DNODE(os)->dn_type != 0) { dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header); dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header); } diff --git a/usr/src/cmd/zinject/translate.c b/usr/src/cmd/zinject/translate.c index cd967a8451..87751e315e 100644 --- a/usr/src/cmd/zinject/translate.c +++ b/usr/src/cmd/zinject/translate.c @@ -267,7 +267,7 @@ calculate_range(const char *dataset, err_type_t type, int level, char *range, } if (record->zi_object == 0) { - dn = os->os_meta_dnode; + dn = DMU_META_DNODE(os); } else { err = dnode_hold(os, record->zi_object, FTAG, &dn); if (err != 0) { @@ -318,7 +318,7 @@ calculate_range(const char *dataset, err_type_t type, int level, char *range, ret = 0; out: if (dn) { - if (dn != os->os_meta_dnode) + if (dn != DMU_META_DNODE(os)) dnode_rele(dn, FTAG); } if (os) diff --git a/usr/src/lib/libavl/mapfile-vers b/usr/src/lib/libavl/mapfile-vers index 97433d8c4c..2f5c4641ee 100644 --- a/usr/src/lib/libavl/mapfile-vers +++ b/usr/src/lib/libavl/mapfile-vers @@ -19,8 +19,7 @@ # CDDL HEADER END # # -# Copyright 2009 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. +# Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. # # @@ -47,6 +46,7 @@ SUNWprivate_1.1 { avl_first; avl_insert; avl_insert_here; + avl_is_empty; avl_last; avl_nearest; avl_numnodes; diff --git a/usr/src/lib/libzpool/common/sys/zfs_context.h b/usr/src/lib/libzpool/common/sys/zfs_context.h index 9a6d712e53..51130b86d8 100644 --- a/usr/src/lib/libzpool/common/sys/zfs_context.h +++ b/usr/src/lib/libzpool/common/sys/zfs_context.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #ifndef _SYS_ZFS_CONTEXT_H @@ -231,8 +230,10 @@ typedef struct kmutex { } kmutex_t; #define MUTEX_DEFAULT USYNC_THREAD -#undef MUTEX_HELD +#undef MUTEX_HELD +#undef MUTEX_NOT_HELD #define MUTEX_HELD(m) _mutex_held(&(m)->m_lock) +#define MUTEX_NOT_HELD(m) (!MUTEX_HELD(m)) /* * Argh -- we have to get cheesy here because the kernel and userland @@ -323,10 +324,21 @@ extern void kstat_delete(kstat_t *); #define kmem_cache_alloc(_c, _f) umem_cache_alloc(_c, _f) #define kmem_cache_free(_c, _b) umem_cache_free(_c, _b) #define kmem_debugging() 0 -#define kmem_cache_reap_now(c) +#define kmem_cache_reap_now(_c) /* nothing */ +#define kmem_cache_set_move(_c, _cb) /* nothing */ +#define POINTER_INVALIDATE(_pp) /* nothing */ +#define POINTER_IS_VALID(_p) 0 typedef umem_cache_t kmem_cache_t; +typedef enum kmem_cbrc { + KMEM_CBRC_YES, + KMEM_CBRC_NO, + KMEM_CBRC_LATER, + KMEM_CBRC_DONT_NEED, + KMEM_CBRC_DONT_KNOW +} kmem_cbrc_t; + /* * Task queues */ diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index a3f07f4cc4..ddad8ba466 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -1380,7 +1380,8 @@ ZFS_COMMON_OBJS += \ zio_checksum.o \ zio_compress.o \ zio_inject.o \ - zle.o + zle.o \ + zrlock.o ZFS_SHARED_OBJS += \ zfs_namecheck.o \ diff --git a/usr/src/uts/common/fs/dnlc.c b/usr/src/uts/common/fs/dnlc.c index 0941dfb9ac..b45e3b17cb 100644 --- a/usr/src/uts/common/fs/dnlc.c +++ b/usr/src/uts/common/fs/dnlc.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -278,7 +277,8 @@ vnode_t negative_cache_vnode; */ #define DNLC_DIR_HASH(name, hash, namelen) \ { \ - char Xc, *Xcp; \ + char Xc; \ + const char *Xcp; \ hash = *name; \ for (Xcp = (name + 1); (Xc = *Xcp) != 0; Xcp++) \ hash = (hash << 4) + hash + Xc; \ @@ -322,7 +322,8 @@ static dchead_t dc_head; /* anchor of cached directories */ /* Prototypes */ static ncache_t *dnlc_get(uchar_t namlen); -static ncache_t *dnlc_search(vnode_t *dp, char *name, uchar_t namlen, int hash); +static ncache_t *dnlc_search(vnode_t *dp, const char *name, uchar_t namlen, + int hash); static void dnlc_dir_reclaim(void *unused); static void dnlc_dir_abort(dircache_t *dcp); static void dnlc_dir_adjust_fhash(dircache_t *dcp); @@ -431,7 +432,7 @@ dnlc_init() * Add a name to the directory cache. */ void -dnlc_enter(vnode_t *dp, char *name, vnode_t *vp) +dnlc_enter(vnode_t *dp, const char *name, vnode_t *vp) { ncache_t *ncp; nc_hash_t *hp; @@ -497,7 +498,7 @@ dnlc_enter(vnode_t *dp, char *name, vnode_t *vp) * it just frees up the newly allocated dnlc entry. */ void -dnlc_update(vnode_t *dp, char *name, vnode_t *vp) +dnlc_update(vnode_t *dp, const char *name, vnode_t *vp) { ncache_t *ncp; ncache_t *tcp; @@ -579,7 +580,7 @@ dnlc_update(vnode_t *dp, char *name, vnode_t *vp) * lost before the caller can use the vnode. */ vnode_t * -dnlc_lookup(vnode_t *dp, char *name) +dnlc_lookup(vnode_t *dp, const char *name) { ncache_t *ncp; nc_hash_t *hp; @@ -660,7 +661,7 @@ dnlc_lookup(vnode_t *dp, char *name) * Remove an entry in the directory name cache. 
*/ void -dnlc_remove(vnode_t *dp, char *name) +dnlc_remove(vnode_t *dp, const char *name) { ncache_t *ncp; nc_hash_t *hp; @@ -968,7 +969,7 @@ dnlc_reverse_lookup(vnode_t *vp, char *buf, size_t buflen) * ncache entry if found, NULL otherwise. */ static ncache_t * -dnlc_search(vnode_t *dp, char *name, uchar_t namlen, int hash) +dnlc_search(vnode_t *dp, const char *name, uchar_t namlen, int hash) { nc_hash_t *hp; ncache_t *ncp; @@ -1141,7 +1142,7 @@ found: * Lookup up an entry in a complete or partial directory cache. */ dcret_t -dnlc_dir_lookup(dcanchor_t *dcap, char *name, uint64_t *handle) +dnlc_dir_lookup(dcanchor_t *dcap, const char *name, uint64_t *handle) { dircache_t *dcp; dcentry_t *dep; @@ -1282,7 +1283,7 @@ error: * Add a directopry entry to a partial or complete directory cache. */ dcret_t -dnlc_dir_add_entry(dcanchor_t *dcap, char *name, uint64_t handle) +dnlc_dir_add_entry(dcanchor_t *dcap, const char *name, uint64_t handle) { dircache_t *dcp; dcentry_t **hp, *dep; @@ -1583,7 +1584,7 @@ dnlc_dir_purge(dcanchor_t *dcap) * Return the handle if it's non null. */ dcret_t -dnlc_dir_rem_entry(dcanchor_t *dcap, char *name, uint64_t *handlep) +dnlc_dir_rem_entry(dcanchor_t *dcap, const char *name, uint64_t *handlep) { dircache_t *dcp; dcentry_t **prevpp, *te; @@ -1782,7 +1783,7 @@ dnlc_dir_rem_space_by_handle(dcanchor_t *dcap, uint64_t handle) * Update the handle of an directory cache entry. */ dcret_t -dnlc_dir_update(dcanchor_t *dcap, char *name, uint64_t handle) +dnlc_dir_update(dcanchor_t *dcap, const char *name, uint64_t handle) { dircache_t *dcp; dcentry_t *dep; diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index f3388e1360..b73dbdcfd7 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -217,6 +217,22 @@ dbuf_evict_user(dmu_buf_impl_t *db) db->db_evict_func = NULL; } +boolean_t +dbuf_is_metadata(dmu_buf_impl_t *db) +{ + if (db->db_level > 0) { + return (B_TRUE); + } else { + boolean_t is_metadata; + + DB_DNODE_ENTER(db); + is_metadata = dmu_ot[DB_DNODE(db)->dn_type].ot_metadata; + DB_DNODE_EXIT(db); + + return (is_metadata); + } +} + void dbuf_evict(dmu_buf_impl_t *db) { @@ -281,7 +297,7 @@ dbuf_fini(void) static void dbuf_verify(dmu_buf_impl_t *db) { - dnode_t *dn = db->db_dnode; + dnode_t *dn; dbuf_dirty_record_t *dr; ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -290,6 +306,8 @@ dbuf_verify(dmu_buf_impl_t *db) return; ASSERT(db->db_objset != NULL); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); if (dn == NULL) { ASSERT(db->db_parent == NULL); ASSERT(db->db_blkptr == NULL); @@ -297,8 +315,9 @@ dbuf_verify(dmu_buf_impl_t *db) ASSERT3U(db->db.db_object, ==, dn->dn_object); ASSERT3P(db->db_objset, ==, dn->dn_objset); ASSERT3U(db->db_level, <, dn->dn_nlevels); - ASSERT(db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == - DMU_SPILL_BLKID || list_head(&dn->dn_dbufs)); + ASSERT(db->db_blkid == DMU_BONUS_BLKID || + db->db_blkid == DMU_SPILL_BLKID || + !list_is_empty(&dn->dn_dbufs)); } if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(dn != NULL); @@ -355,7 +374,7 @@ dbuf_verify(dmu_buf_impl_t *db) * have the struct_rwlock. XXX indblksz no longer * grows. safe to do this now? 
*/ - if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) { + if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { ASSERT3P(db->db_blkptr, ==, ((blkptr_t *)db->db_parent->db.db_data + db->db_blkid % epb)); @@ -380,6 +399,7 @@ dbuf_verify(dmu_buf_impl_t *db) } } } + DB_DNODE_EXIT(db); } #endif @@ -424,8 +444,11 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) mutex_enter(&db->db_mtx); if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { int blksz = db->db.db_size; + spa_t *spa; + mutex_exit(&db->db_mtx); - abuf = arc_loan_buf(db->db_dnode->dn_objset->os_spa, blksz); + DB_GET_SPA(&spa, db); + abuf = arc_loan_buf(spa, blksz); bcopy(db->db.db_data, abuf->b_data, blksz); } else { abuf = db->db_buf; @@ -484,11 +507,14 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) static void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) { - dnode_t *dn = db->db_dnode; + dnode_t *dn; + spa_t *spa; zbookmark_t zb; uint32_t aflags = ARC_NOWAIT; arc_buf_t *pbuf; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); ASSERT(!refcount_is_zero(&db->db_holds)); /* We need the struct_rwlock to prevent db_blkptr from changing. */ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); @@ -506,6 +532,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) bzero(db->db.db_data, DN_MAX_BONUSLEN); if (bonuslen) bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); + DB_DNODE_EXIT(db); dbuf_update_data(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); @@ -524,6 +551,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, db->db.db_size, db, type)); + DB_DNODE_EXIT(db); bzero(db->db.db_data, db->db.db_size); db->db_state = DB_CACHED; *flags |= DB_RF_CACHED; @@ -531,6 +559,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) return; } + spa = dn->dn_objset->os_spa; + DB_DNODE_EXIT(db); + db->db_state = DB_READ; mutex_exit(&db->db_mtx); @@ -549,7 +580,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) else pbuf = db->db_objset->os_phys_buf; - (void) dsl_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf, + (void) dsl_read(zio, spa, db->db_blkptr, pbuf, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, (*flags & DB_RF_CANFAIL) ? 
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); @@ -563,6 +594,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) int err = 0; int havepzio = (zio != NULL); int prefetch; + dnode_t *dn; /* * We don't have to hold the mutex to check db_state because it @@ -573,46 +605,51 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) if (db->db_state == DB_NOFILL) return (EIO); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); + rw_enter(&dn->dn_struct_rwlock, RW_READER); prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && - (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL && + (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && DBUF_IS_CACHEABLE(db); mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED) { mutex_exit(&db->db_mtx); if (prefetch) - dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, + dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, db->db.db_size, TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&db->db_dnode->dn_struct_rwlock); + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(db); } else if (db->db_state == DB_UNCACHED) { - if (zio == NULL) { - zio = zio_root(db->db_dnode->dn_objset->os_spa, - NULL, NULL, ZIO_FLAG_CANFAIL); - } + spa_t *spa = dn->dn_objset->os_spa; + + if (zio == NULL) + zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); dbuf_read_impl(db, zio, &flags); /* dbuf_read_impl has dropped db_mtx for us */ if (prefetch) - dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, + dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, db->db.db_size, flags & DB_RF_CACHED); if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&db->db_dnode->dn_struct_rwlock); + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(db); if (!havepzio) err = zio_wait(zio); } else { mutex_exit(&db->db_mtx); if (prefetch) - dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, + dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, db->db.db_size, TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&db->db_dnode->dn_struct_rwlock); + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(db); mutex_enter(&db->db_mtx); if ((flags & DB_RF_NEVERWAIT) == 0) { @@ -642,11 +679,12 @@ dbuf_noread(dmu_buf_impl_t *db) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + spa_t *spa; ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); - dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, - db->db.db_size, db, type)); + DB_GET_SPA(&spa, db); + dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); db->db_state = DB_FILL; } else if (db->db_state == DB_NOFILL) { dbuf_set_data(db, NULL); @@ -687,7 +725,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) /* * If the last dirty record for this dbuf has not yet synced * and its referencing the dbuf data, either: - * reset the reference to point to a new copy, + * reset the reference to point to a new copy, * or (if there a no active holders) * just null out the current db_data pointer. 
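
The copy-or-steal decision described in the comment above is worth seeing in one place. A minimal sketch of the choice dbuf_fix_old_data() makes, with illustrative names standing in for the real dbuf fields and the refcount/ARC calls:

    #include <stdlib.h>
    #include <string.h>

    /* Illustrative stand-in for the dbuf: one data buffer of known size. */
    struct buf {
            void    *data;
            size_t  size;
    };

    /*
     * The last unsynced dirty record must stop sharing the buffer before
     * new writes land. If live holders remain, the record takes a private
     * copy; otherwise it inherits the only reference. Error handling is
     * elided for brevity.
     */
    static void
    fix_old_data(struct buf *db, void **dr_data, int holds, int dirtycnt)
    {
            if (holds > dirtycnt) {
                    /* Active holders still read db->data: copy it. */
                    *dr_data = malloc(db->size);
                    memcpy(*dr_data, db->data, db->size);
            } else {
                    /* No active holders: hand over the only reference. */
                    *dr_data = db->data;
                    db->data = NULL;
            }
    }
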
*/ @@ -700,8 +738,10 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - dr->dt.dl.dr_data = arc_buf_alloc( - db->db_dnode->dn_objset->os_spa, size, db, type); + spa_t *spa; + + DB_GET_SPA(&spa, db); + dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); } else { dbuf_set_data(db, NULL); @@ -726,9 +766,12 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) ASSERT(db->db_data_pending != dr); /* free this block */ - if (!BP_IS_HOLE(bp)) - zio_free(db->db_dnode->dn_objset->os_spa, txg, bp); + if (!BP_IS_HOLE(bp)) { + spa_t *spa; + DB_GET_SPA(&spa, db); + zio_free(spa, txg, bp); + } dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; /* * Release the already-written buffer, so we leave it in @@ -884,11 +927,15 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) arc_buf_t *buf, *obuf; int osize = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + dnode_t *dn; ASSERT(db->db_blkid != DMU_BONUS_BLKID); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + /* XXX does *this* func really need the lock? */ - ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)); + ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); /* * This call to dbuf_will_dirty() with the dn_struct_rwlock held @@ -903,7 +950,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) dbuf_will_dirty(db, tx); /* create the data buffer for the new block */ - buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type); + buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); /* copy old block data to the new block */ obuf = db->db_buf; @@ -923,15 +970,17 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) } mutex_exit(&db->db_mtx); - dnode_willuse_space(db->db_dnode, size-osize, tx); + dnode_willuse_space(dn, size-osize, tx); + DB_DNODE_EXIT(db); } void dbuf_release_bp(dmu_buf_impl_t *db) { - objset_t *os = db->db_dnode->dn_objset; + objset_t *os; zbookmark_t zb; + DB_GET_OBJSET(&os, db); ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); ASSERT(arc_released(os->os_phys_buf) || list_link_active(&os->os_dsl_dataset->ds_synced_link)); @@ -949,8 +998,8 @@ dbuf_release_bp(dmu_buf_impl_t *db) dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { - dnode_t *dn = db->db_dnode; - objset_t *os = dn->dn_objset; + dnode_t *dn; + objset_t *os; dbuf_dirty_record_t **drp, *dr; int drop_struct_lock = FALSE; boolean_t do_free_accounting = B_FALSE; @@ -960,6 +1009,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(!refcount_is_zero(&db->db_holds)); DMU_TX_DIRTY_BUF(tx, db); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); /* * Shouldn't dirty a regular buffer in syncing context. Private * objects may be dirtied in syncing context, but only if they @@ -1014,6 +1065,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) drp = &dr->dr_next; if (dr && dr->dr_txg == tx->tx_txg) { + DB_DNODE_EXIT(db); + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { /* * If this buffer has already been written out, @@ -1049,6 +1102,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * we already dirtied it in open context. Hence we must make * this assertion only if we're not already dirty. 
*/ + os = dn->dn_objset; ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); ASSERT(db->db.db_size != 0); @@ -1137,6 +1191,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); + DB_DNODE_EXIT(db); return (dr); } else if (do_free_accounting) { blkptr_t *bp = db->db_blkptr; @@ -1199,8 +1254,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) } else { ASSERT(db->db_level+1 == dn->dn_nlevels); ASSERT(db->db_blkid < dn->dn_nblkptr); - ASSERT(db->db_parent == NULL || - db->db_parent == db->db_dnode->dn_dbuf); + ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); @@ -1210,13 +1264,14 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) } dnode_setdirty(dn, tx); + DB_DNODE_EXIT(db); return (dr); } static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { - dnode_t *dn = db->db_dnode; + dnode_t *dn; uint64_t txg = tx->tx_txg; dbuf_dirty_record_t *dr, **drp; @@ -1237,6 +1292,9 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(dr->dr_txg == txg); ASSERT(dr->dr_dbuf == db); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + /* * If this buffer is currently held, we cannot undirty * it, since one of the current holders may be in the @@ -1249,6 +1307,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) mutex_enter(&dn->dn_mtx); dnode_clear_range(dn, db->db_blkid, 1, tx); mutex_exit(&dn->dn_mtx); + DB_DNODE_EXIT(db); return (0); } @@ -1270,6 +1329,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); mutex_exit(&dn->dn_mtx); } + DB_DNODE_EXIT(db); if (db->db_level == 0) { if (db->db_state != DB_NOFILL) { @@ -1315,8 +1375,10 @@ dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); - if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) + DB_DNODE_ENTER(db); + if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) rf |= DB_RF_HAVESTRUCT; + DB_DNODE_EXIT(db); (void) dbuf_read(db, NULL, rf); (void) dbuf_dirty(db, tx); } @@ -1378,7 +1440,6 @@ void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) { ASSERT(!refcount_is_zero(&db->db_holds)); - ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_level == 0); ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); @@ -1442,7 +1503,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) * in this case. 
For callers from the DMU we will usually see: * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy() * For the arc callback, we will usually see: - * dbuf_do_evict()->dbuf_clear();dbuf_destroy() + * dbuf_do_evict()->dbuf_clear();dbuf_destroy() * Sometimes, though, we will get a mix of these two: * DMU: dbuf_clear()->arc_buf_evict() * ARC: dbuf_do_evict()->dbuf_destroy() @@ -1450,9 +1511,9 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) void dbuf_clear(dmu_buf_impl_t *db) { - dnode_t *dn = db->db_dnode; + dnode_t *dn; dmu_buf_impl_t *parent = db->db_parent; - dmu_buf_impl_t *dndb = dn->dn_dbuf; + dmu_buf_impl_t *dndb; int dbuf_gone = FALSE; ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -1476,10 +1537,26 @@ dbuf_clear(dmu_buf_impl_t *db) db->db_state = DB_EVICTING; db->db_blkptr = NULL; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + dndb = dn->dn_dbuf; if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { list_remove(&dn->dn_dbufs, db); + (void) atomic_dec_32_nv(&dn->dn_dbufs_count); + membar_producer(); + DB_DNODE_EXIT(db); + /* + * Decrementing the dbuf count means that the hold corresponding + * to the removed dbuf is no longer discounted in dnode_move(), + * so the dnode cannot be moved until after we release the hold. + * The membar_producer() ensures visibility of the decremented + * value in dnode_move(), since DB_DNODE_EXIT doesn't actually + * release any lock. + */ dnode_rele(dn, db); - db->db_dnode = NULL; + db->db_dnode_handle = NULL; + } else { + DB_DNODE_EXIT(db); } if (db->db_buf) @@ -1489,7 +1566,7 @@ dbuf_clear(dmu_buf_impl_t *db) mutex_exit(&db->db_mtx); /* - * If this dbuf is referened from an indirect dbuf, + * If this dbuf is referenced from an indirect dbuf, * decrement the ref count on the indirect dbuf. */ if (parent && parent != dndb) @@ -1581,7 +1658,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_blkid = blkid; db->db_last_dirty = NULL; db->db_dirtycnt = 0; - db->db_dnode = dn; + db->db_dnode_handle = dn->dn_handle; db->db_parent = parent; db->db_blkptr = blkptr; @@ -1638,6 +1715,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || refcount_count(&dn->dn_holds) > 0); (void) refcount_add(&dn->dn_holds, db); + (void) atomic_inc_32_nv(&dn->dn_dbufs_count); dprintf_dbuf(db, "db=%p\n", db); @@ -1677,15 +1755,24 @@ dbuf_destroy(dmu_buf_impl_t *db) * If this dbuf is still on the dn_dbufs list, * remove it from that list. */ - if (db->db_dnode) { - dnode_t *dn = db->db_dnode; + if (db->db_dnode_handle != NULL) { + dnode_t *dn; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); mutex_enter(&dn->dn_dbufs_mtx); list_remove(&dn->dn_dbufs, db); + (void) atomic_dec_32_nv(&dn->dn_dbufs_count); mutex_exit(&dn->dn_dbufs_mtx); - + DB_DNODE_EXIT(db); + /* + * Decrementing the dbuf count means that the hold + * corresponding to the removed dbuf is no longer + * discounted in dnode_move(), so the dnode cannot be + * moved until after we release the hold. 
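
The ordering this comment defends, decrement dn_dbufs_count first and dnode_rele() second, exists because dnode_move() counts a dnode as idle only when every hold is matched by a dbuf. A userland sketch of that discounting check, with C11 atomics standing in for refcount_t, atomic_dec_32_nv(), and membar_producer() (all names below are illustrative):

    #include <stdatomic.h>
    #include <stdbool.h>

    struct dn_model {
            atomic_uint holds;        /* models dn_holds */
            atomic_uint dbufs_count;  /* models dn_dbufs_count */
    };

    /*
     * A mover may relocate the dnode only when every hold is discounted
     * by a dbuf, i.e. no active holder is dereferencing dnode fields.
     */
    static bool
    mover_can_move(struct dn_model *dn)
    {
            return (atomic_load(&dn->holds) ==
                atomic_load(&dn->dbufs_count));
    }

    /*
     * Teardown shrinks dbufs_count before dropping the hold, so the mover
     * transiently sees holds > dbufs_count and keeps its hands off. The
     * release ordering plays the role of membar_producer() above.
     */
    static void
    teardown(struct dn_model *dn)
    {
            atomic_fetch_sub_explicit(&dn->dbufs_count, 1,
                memory_order_release);
            atomic_fetch_sub_explicit(&dn->holds, 1, memory_order_release);
    }
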
+ */ dnode_rele(dn, db); - db->db_dnode = NULL; + db->db_dnode_handle = NULL; } dbuf_hash_remove(db); } @@ -1824,7 +1911,7 @@ top: arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dbuf_set_data(db, - arc_buf_alloc(db->db_dnode->dn_objset->os_spa, + arc_buf_alloc(dn->dn_objset->os_spa, db->db.db_size, db, type)); bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, db->db.db_size); @@ -1840,7 +1927,7 @@ top: if (parent) dbuf_rele(parent, NULL); - ASSERT3P(db->db_dnode, ==, dn); + ASSERT3P(DB_DNODE(db), ==, dn); ASSERT3U(db->db_blkid, ==, blkid); ASSERT3U(db->db_level, ==, level); *dbp = db; @@ -1877,6 +1964,8 @@ int dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + if (db->db_blkid != DMU_SPILL_BLKID) return (ENOTSUP); if (blksz == 0) @@ -1886,9 +1975,12 @@ dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) else blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); - rw_enter(&db->db_dnode->dn_struct_rwlock, RW_WRITER); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dbuf_new_size(db, blksz, tx); - rw_exit(&db->db_dnode->dn_struct_rwlock); + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(db); return (0); } @@ -1907,6 +1999,13 @@ dbuf_add_ref(dmu_buf_impl_t *db, void *tag) ASSERT(holds > 1); } +/* + * If you call dbuf_rele() you had better not be referencing the dnode handle + * unless you have some other direct or indirect hold on the dnode. (An indirect + * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) + * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the + * dnode's parent dbuf evicting its dnode handles. + */ #pragma weak dmu_buf_rele = dbuf_rele void dbuf_rele(dmu_buf_impl_t *db, void *tag) @@ -1927,6 +2026,11 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) ASSERT(MUTEX_HELD(&db->db_mtx)); DBUF_VERIFY(db); + /* + * Remove the reference to the dbuf before removing its hold on the + * dnode so we can guarantee in dnode_move() that a referenced bonus + * buffer has a corresponding dnode hold. + */ holds = refcount_remove(&db->db_holds, tag); ASSERT(holds >= 0); @@ -1944,7 +2048,20 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) if (holds == 0) { if (db->db_blkid == DMU_BONUS_BLKID) { mutex_exit(&db->db_mtx); - dnode_rele(db->db_dnode, db); + + /* + * If the dnode moves here, we cannot cross this barrier + * until the move completes. + */ + DB_DNODE_ENTER(db); + (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count); + DB_DNODE_EXIT(db); + /* + * The bonus buffer's dnode hold is no longer discounted + * in dnode_move(). The dnode cannot move until after + * the dnode_rele(). 
+ */ + dnode_rele(DB_DNODE(db), db); } else if (db->db_buf == NULL) { /* * This is a special case: we never associated this @@ -2095,7 +2212,7 @@ static void dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn = db->db_dnode; + dnode_t *dn; zio_t *zio; ASSERT(dmu_tx_is_syncing(tx)); @@ -2113,10 +2230,13 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) mutex_enter(&db->db_mtx); } ASSERT3U(db->db_state, ==, DB_CACHED); - ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); ASSERT(db->db_buf != NULL); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); dbuf_check_blkptr(dn, db); + DB_DNODE_EXIT(db); db->db_data_pending = dr; @@ -2136,8 +2256,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { arc_buf_t **datap = &dr->dt.dl.dr_data; dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn = db->db_dnode; - objset_t *os = dn->dn_objset; + dnode_t *dn; + objset_t *os; uint64_t txg = tx->tx_txg; ASSERT(dmu_tx_is_syncing(tx)); @@ -2160,6 +2280,9 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) } DBUF_VERIFY(db); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; @@ -2179,6 +2302,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT3U(db->db_level, ==, 0); ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); + DB_DNODE_EXIT(db); + if (*datap != db->db.db_data) { zio_buf_free(*datap, DN_MAX_BONUSLEN); arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); @@ -2197,6 +2322,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) return; } + os = dn->dn_objset; + /* * This function may have dropped the db_mtx lock allowing a dmu_sync * operation to sneak in. As a result, we need to ensure that we @@ -2206,7 +2333,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dbuf_check_blkptr(dn, db); /* - * If this buffer is in the middle of an immdiate write, + * If this buffer is in the middle of an immediate write, * wait for the synchronous IO to complete. */ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { @@ -2243,10 +2370,20 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dbuf_write(dr, *datap, tx); ASSERT(!list_link_active(&dr->dr_dirty_node)); - if (dn->dn_object == DMU_META_DNODE_OBJECT) + if (dn->dn_object == DMU_META_DNODE_OBJECT) { list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); - else + DB_DNODE_EXIT(db); + } else { + /* + * Although zio_nowait() does not "wait for an IO", it does + * initiate the IO. If this is an empty write it seems plausible + * that the IO could actually be completed before the nowait + * returns. We need to DB_DNODE_EXIT() first in case + * zio_nowait() invalidates the dbuf. 
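
The DB_DNODE_EXIT()-before-zio_nowait() ordering argued for in the comment above is an instance of a general rule: release the handle before starting any operation whose completion may destroy the object behind it. A compact sketch of the rule, with hypothetical names in place of the dbuf and zio interfaces:

    struct io;
    struct dbuf { struct io *db_io; };

    void handle_enter(struct dbuf *);
    void handle_exit(struct dbuf *);
    void start_io(struct io *);     /* completion may free the dbuf */

    static void
    sync_leaf(struct dbuf *db)
    {
            struct io *io;

            handle_enter(db);
            io = db->db_io;         /* capture while the handle is held */
            handle_exit(db);        /* db may now vanish at any moment */
            start_io(io);           /* safe: db is not touched again */
    }
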
+ */ + DB_DNODE_EXIT(db); zio_nowait(dr->dr_zio); + } } void @@ -2280,9 +2417,9 @@ static void dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; + dnode_t *dn; blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; - dnode_t *dn = db->db_dnode; spa_t *spa = zio->io_spa; int64_t delta; uint64_t fill = 0; @@ -2290,12 +2427,15 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT(db->db_blkptr == bp); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); dnode_diduse_space(dn, delta - zio->io_prev_space_delta); zio->io_prev_space_delta = delta; if (BP_IS_HOLE(bp)) { ASSERT(bp->blk_fill == 0); + DB_DNODE_EXIT(db); return; } @@ -2309,7 +2449,6 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { - dnode_t *dn = db->db_dnode; ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && db->db_blkptr == &dn->dn_phys->dn_spill); @@ -2342,6 +2481,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) fill += ibp->blk_fill; } } + DB_DNODE_EXIT(db); bp->blk_fill = fill; @@ -2355,8 +2495,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) dmu_buf_impl_t *db = vdb; blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; - dnode_t *dn = db->db_dnode; - objset_t *os = dn->dn_objset; uint64_t txg = zio->io_txg; dbuf_dirty_record_t **drp, *dr; @@ -2366,8 +2504,13 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { - dsl_dataset_t *ds = os->os_dsl_dataset; - dmu_tx_t *tx = os->os_synctx; + objset_t *os; + dsl_dataset_t *ds; + dmu_tx_t *tx; + + DB_GET_OBJSET(&os, db); + ds = os->os_dsl_dataset; + tx = os->os_synctx; (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, bp, tx); @@ -2388,10 +2531,14 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { - dnode_t *dn = db->db_dnode; + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && db->db_blkptr == &dn->dn_phys->dn_spill); + DB_DNODE_EXIT(db); } #endif @@ -2406,6 +2553,10 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) arc_set_callback(db->db_buf, dbuf_do_evict, db); } } else { + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); if (!BP_IS_HOLE(db->db_blkptr)) { @@ -2417,6 +2568,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) >> (db->db_level * epbs), >=, db->db_blkid); arc_set_callback(db->db_buf, dbuf_do_evict, db); } + DB_DNODE_EXIT(db); mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } @@ -2472,8 +2624,8 @@ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn = db->db_dnode; - objset_t *os = dn->dn_objset; + dnode_t *dn; + objset_t *os; dmu_buf_impl_t *parent = db->db_parent; uint64_t txg = tx->tx_txg; zbookmark_t zb; @@ -2481,6 +2633,10 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) zio_t *zio; int wp_flag = 0; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + os = dn->dn_objset; + if (db->db_state != DB_NOFILL) { if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { /* @@ -2525,6 +2681,7 
@@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); + DB_DNODE_EXIT(db); if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { ASSERT(db->db_state != DB_NOFILL); diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index 354ce3dc91..39234eba53 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -133,7 +133,7 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, } dnode_rele(dn, FTAG); - *dbp = &db->db; + *dbp = &db->db; /* NULL db plus first field offset is NULL */ return (err); } @@ -144,31 +144,64 @@ dmu_bonus_max(void) } int -dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx) +dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) { - dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + int error; - if (dn->dn_bonus != (dmu_buf_impl_t *)db) - return (EINVAL); - if (newsize < 0 || newsize > db->db_size) - return (EINVAL); - dnode_setbonuslen(dn, newsize, tx); - return (0); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (dn->dn_bonus != db) { + error = EINVAL; + } else if (newsize < 0 || newsize > db_fake->db_size) { + error = EINVAL; + } else { + dnode_setbonuslen(dn, newsize, tx); + error = 0; + } + + DB_DNODE_EXIT(db); + return (error); } int -dmu_set_bonustype(dmu_buf_t *db, dmu_object_type_t type, dmu_tx_t *tx) +dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) { - dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + int error; - if (type > DMU_OT_NUMTYPES) - return (EINVAL); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (type > DMU_OT_NUMTYPES) { + error = EINVAL; + } else if (dn->dn_bonus != db) { + error = EINVAL; + } else { + dnode_setbonus_type(dn, type, tx); + error = 0; + } - if (dn->dn_bonus != (dmu_buf_impl_t *)db) - return (EINVAL); + DB_DNODE_EXIT(db); + return (error); +} - dnode_setbonus_type(dn, type, tx); - return (0); +dmu_object_type_t +dmu_get_bonustype(dmu_buf_t *db_fake) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + dmu_object_type_t type; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + type = dn->dn_bonustype; + DB_DNODE_EXIT(db); + + return (type); } int @@ -208,11 +241,19 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) dbuf_create_bonus(dn); } db = dn->dn_bonus; - rw_exit(&dn->dn_struct_rwlock); /* as long as the bonus buf is held, the dnode will be held */ - if (refcount_add(&db->db_holds, tag) == 1) + if (refcount_add(&db->db_holds, tag) == 1) { VERIFY(dnode_add_ref(dn, db)); + (void) atomic_inc_32_nv(&dn->dn_dbufs_count); + } + + /* + * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's + * hold and incrementing the dbuf count to ensure that dnode_move() sees + * a dnode hold for every dbuf. 
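
The dmu_bonus_hold() change is purely a reordering: the refcount_add() and the dn_dbufs_count increment now both happen before rw_exit(), so any dnode_move() that runs once the lock drops already sees a hold covering the bonus dbuf. The same establish-then-publish discipline in a userland sketch (pthread and C11 names; the counters are illustrative):

    #include <pthread.h>
    #include <stdatomic.h>

    struct dn_model {
            pthread_rwlock_t struct_lock;  /* models dn_struct_rwlock */
            atomic_uint holds;             /* models dn_holds */
            atomic_uint dbufs_count;       /* models dn_dbufs_count */
    };

    static void
    bonus_hold(struct dn_model *dn)
    {
            pthread_rwlock_rdlock(&dn->struct_lock);
            /* ... locate or create the bonus dbuf ... */
            atomic_fetch_add(&dn->holds, 1);
            atomic_fetch_add(&dn->dbufs_count, 1);
            /*
             * Only now release the lock: the invariant "every dbuf is
             * matched by a hold" is already true for any mover that
             * observes the dnode after this point.
             */
            pthread_rwlock_unlock(&dn->struct_lock);
    }
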
+ */ + rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); @@ -257,28 +298,45 @@ dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) { - dnode_t *dn = ((dmu_buf_impl_t *)bonus)->db_dnode; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; + dnode_t *dn; int err; - if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) - return (EINVAL); - rw_enter(&dn->dn_struct_rwlock, RW_READER); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { + err = EINVAL; + } else { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + + if (!dn->dn_have_spill) { + err = ENOENT; + } else { + err = dmu_spill_hold_by_dnode(dn, + DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); + } - if (!dn->dn_have_spill) { rw_exit(&dn->dn_struct_rwlock); - return (ENOENT); } - err = dmu_spill_hold_by_dnode(dn, - DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); - rw_exit(&dn->dn_struct_rwlock); + + DB_DNODE_EXIT(db); return (err); } int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) { - return (dmu_spill_hold_by_dnode(((dmu_buf_impl_t *)bonus)->db_dnode, - DB_RF_CANFAIL, tag, dbp)); + dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; + dnode_t *dn; + int err; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp); + DB_DNODE_EXIT(db); + + return (err); } /* @@ -400,14 +458,18 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, } int -dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, +dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { - dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; int err; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); + DB_DNODE_EXIT(db); return (err); } @@ -440,7 +502,7 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) return; if (len == 0) { /* they're interested in the bonus buffer */ - dn = os->os_meta_dnode; + dn = DMU_META_DNODE(os); if (object == 0 || object >= DN_MAX_OBJECT) return; @@ -1001,11 +1063,19 @@ int dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, dmu_tx_t *tx) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; + dnode_t *dn; + int err; + if (size == 0) return (0); - return (dmu_write_uio_dnode(((dmu_buf_impl_t *)zdb)->db_dnode, - uio, size, tx)); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + err = dmu_write_uio_dnode(dn, uio, size, tx); + DB_DNODE_EXIT(db); + + return (err); } int @@ -1091,9 +1161,11 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, arc_buf_t * dmu_request_arcbuf(dmu_buf_t *handle, int size) { - dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; + spa_t *spa; - return (arc_loan_buf(dn->dn_objset->os_spa, size)); + DB_GET_SPA(&spa, db); + return (arc_loan_buf(spa, size)); } /* @@ -1115,23 +1187,35 @@ void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { - dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode; + dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; + dnode_t *dn; dmu_buf_impl_t *db; uint32_t blksz = (uint32_t)arc_buf_size(buf); uint64_t blkid; + DB_DNODE_ENTER(dbuf); + dn = DB_DNODE(dbuf); rw_enter(&dn->dn_struct_rwlock, RW_READER); 
blkid = dbuf_whichblock(dn, offset); VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(dbuf); if (offset == db->db.db_offset && blksz == db->db.db_size) { dbuf_assign_arcbuf(db, buf, tx); dbuf_rele(db, FTAG); } else { + objset_t *os; + uint64_t object; + + DB_DNODE_ENTER(dbuf); + dn = DB_DNODE(dbuf); + os = dn->dn_objset; + object = dn->dn_object; + DB_DNODE_EXIT(dbuf); + dbuf_rele(db, FTAG); - dmu_write(dn->dn_objset, dn->dn_object, offset, blksz, - buf->b_data, tx); + dmu_write(os, object, offset, blksz, buf->b_data, tx); dmu_return_arcbuf(buf); XUIOSTAT_BUMP(xuiostat_wbuf_copied); } @@ -1150,7 +1234,6 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { dmu_sync_arg_t *dsa = varg; dmu_buf_t *db = dsa->dsa_zgd->zgd_db; - dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; blkptr_t *bp = zio->io_bp; if (zio->io_error == 0) { @@ -1161,7 +1244,6 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) */ BP_SET_LSIZE(bp, db->db_size); } else { - ASSERT(BP_GET_TYPE(bp) == dn->dn_type); ASSERT(BP_GET_LEVEL(bp) == 0); bp->blk_fill = 1; } @@ -1284,6 +1366,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) dmu_sync_arg_t *dsa; zbookmark_t zb; zio_prop_t zp; + dnode_t *dn; ASSERT(pio != NULL); ASSERT(BP_IS_HOLE(bp)); @@ -1292,7 +1375,10 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) SET_BOOKMARK(&zb, ds->ds_object, db->db.db_object, db->db_level, db->db_blkid); - dmu_write_policy(os, db->db_dnode, db->db_level, WP_DMU_SYNC, &zp); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); + DB_DNODE_EXIT(db); /* * If we're frozen (running ziltest), we always need to generate a bp. @@ -1574,9 +1660,13 @@ dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) * As above, but faster; can be used when you have a held dbuf in hand. */ void -dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi) +dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) { - dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi); + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + DB_DNODE_ENTER(db); + dmu_object_info_from_dnode(DB_DNODE(db), doi); + DB_DNODE_EXIT(db); } /* @@ -1584,14 +1674,20 @@ dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi) * This is specifically optimized for zfs_getattr(). 
*/ void -dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512) +dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, + u_longlong_t *nblk512) { - dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); *blksize = dn->dn_datablksz; /* add 1 for dnode space */ *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT) + 1; + DB_DNODE_EXIT(db); } void @@ -1643,23 +1739,25 @@ void dmu_init(void) { zfs_dbgmsg_init(); - dbuf_init(); + sa_cache_init(); + xuio_stat_init(); + dmu_objset_init(); dnode_init(); + dbuf_init(); zfetch_init(); arc_init(); l2arc_init(); - xuio_stat_init(); - sa_cache_init(); } void dmu_fini(void) { + l2arc_fini(); arc_fini(); zfetch_fini(); - dnode_fini(); dbuf_fini(); - l2arc_fini(); + dnode_fini(); + dmu_objset_fini(); xuio_stat_fini(); sa_cache_fini(); zfs_dbgmsg_fini(); diff --git a/usr/src/uts/common/fs/zfs/dmu_object.c b/usr/src/uts/common/fs/zfs/dmu_object.c index 98228d4035..8dff460489 100644 --- a/usr/src/uts/common/fs/zfs/dmu_object.c +++ b/usr/src/uts/common/fs/zfs/dmu_object.c @@ -33,7 +33,7 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, { uint64_t object; uint64_t L2_dnode_count = DNODES_PER_BLOCK << - (os->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT); + (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT); dnode_t *dn = NULL; int restarted = B_FALSE; @@ -49,7 +49,7 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, */ if (P2PHASE(object, L2_dnode_count) == 0) { uint64_t offset = restarted ? object << DNODE_SHIFT : 0; - int error = dnode_next_offset(os->os_meta_dnode, + int error = dnode_next_offset(DMU_META_DNODE(os), DNODE_FIND_HOLE, &offset, 2, DNODES_PER_BLOCK >> 2, 0); restarted = B_TRUE; @@ -187,7 +187,7 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) uint64_t offset = (*objectp + 1) << DNODE_SHIFT; int error; - error = dnode_next_offset(os->os_meta_dnode, + error = dnode_next_offset(DMU_META_DNODE(os), (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg); *objectp = offset >> DNODE_SHIFT; diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c index a6d9b7a54a..5554bda8f9 100644 --- a/usr/src/uts/common/fs/zfs/dmu_objset.c +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c @@ -41,9 +41,26 @@ #include <sys/zil.h> #include <sys/dmu_impl.h> #include <sys/zfs_ioctl.h> -#include <sys/sunddi.h> #include <sys/sa.h> +/* + * Needed to close a window in dnode_move() that allows the objset to be freed + * before it can be safely accessed. 
+ */ +krwlock_t os_lock; + +void +dmu_objset_init(void) +{ + rw_init(&os_lock, NULL, RW_DEFAULT, NULL); +} + +void +dmu_objset_fini(void) +{ + rw_destroy(&os_lock); +} + spa_t * dmu_objset_spa(objset_t *os) { @@ -368,13 +385,16 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); - os->os_meta_dnode = dnode_special_open(os, - &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT); + DMU_META_DNODE(os) = dnode_special_open(os, + &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT, + &os->os_meta_dnode); if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { - os->os_userused_dnode = dnode_special_open(os, - &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT); - os->os_groupused_dnode = dnode_special_open(os, - &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT); + DMU_USERUSED_DNODE(os) = dnode_special_open(os, + &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT, + &os->os_userused_dnode); + DMU_GROUPUSED_DNODE(os) = dnode_special_open(os, + &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT, + &os->os_groupused_dnode); } /* @@ -470,8 +490,8 @@ dmu_objset_evict_dbufs(objset_t *os) mutex_enter(&os->os_lock); /* process the mdn last, since the other dnodes have holds on it */ - list_remove(&os->os_dnodes, os->os_meta_dnode); - list_insert_tail(&os->os_dnodes, os->os_meta_dnode); + list_remove(&os->os_dnodes, DMU_META_DNODE(os)); + list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os)); /* * Find the first dnode with holds. We have to do this dance @@ -497,8 +517,9 @@ dmu_objset_evict_dbufs(objset_t *os) mutex_enter(&os->os_lock); dn = next_dn; } + dn = list_head(&os->os_dnodes); mutex_exit(&os->os_lock); - return (list_head(&os->os_dnodes) != os->os_meta_dnode); + return (dn != DMU_META_DNODE(os)); } void @@ -539,16 +560,26 @@ dmu_objset_evict(objset_t *os) */ (void) dmu_objset_evict_dbufs(os); - dnode_special_close(os->os_meta_dnode); - if (os->os_userused_dnode) { - dnode_special_close(os->os_userused_dnode); - dnode_special_close(os->os_groupused_dnode); + dnode_special_close(&os->os_meta_dnode); + if (DMU_USERUSED_DNODE(os)) { + dnode_special_close(&os->os_userused_dnode); + dnode_special_close(&os->os_groupused_dnode); } zil_free(os->os_zil); ASSERT3P(list_head(&os->os_dnodes), ==, NULL); VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1); + + /* + * This is a barrier to prevent the objset from going away in + * dnode_move() until we can safely ensure that the objset is still in + * use. We consider the objset valid before the barrier and invalid + * after the barrier. 
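
The rw_enter()/rw_exit() pair that follows has an empty critical section on purpose: it is a barrier, not a lock. Eviction cannot pass it until any dnode_move() already inside os_lock has finished deciding whether the objset is still usable. A userland rendering of the idiom (pthread names; mover_check() is a hypothetical stand-in for the dnode_move() side):

    #include <pthread.h>

    static pthread_rwlock_t os_lock_model = PTHREAD_RWLOCK_INITIALIZER;

    /* Mover side: examine the objset only while holding the lock. */
    static void
    mover_check(void)
    {
            pthread_rwlock_wrlock(&os_lock_model);
            /* ... decide whether the objset may still be referenced ... */
            pthread_rwlock_unlock(&os_lock_model);
    }

    /* Evict side: wait out any in-flight mover, then finish teardown. */
    static void
    evict_barrier(void)
    {
            pthread_rwlock_rdlock(&os_lock_model);
            pthread_rwlock_unlock(&os_lock_model);
            /* Every mover from here on must see the objset as invalid. */
    }
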
+ */ + rw_enter(&os_lock, RW_READER); + rw_exit(&os_lock); + mutex_destroy(&os->os_lock); mutex_destroy(&os->os_obj_lock); mutex_destroy(&os->os_user_ptr_lock); @@ -575,7 +606,7 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &os)); if (ds) mutex_exit(&ds->ds_opening_lock); - mdn = os->os_meta_dnode; + mdn = DMU_META_DNODE(os); dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx); @@ -1035,17 +1066,17 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) /* * Sync special dnodes - the parent IO for the sync is the root block */ - os->os_meta_dnode->dn_zio = zio; - dnode_sync(os->os_meta_dnode, tx); + DMU_META_DNODE(os)->dn_zio = zio; + dnode_sync(DMU_META_DNODE(os), tx); os->os_phys->os_flags = os->os_flags; - if (os->os_userused_dnode && - os->os_userused_dnode->dn_type != DMU_OT_NONE) { - os->os_userused_dnode->dn_zio = zio; - dnode_sync(os->os_userused_dnode, tx); - os->os_groupused_dnode->dn_zio = zio; - dnode_sync(os->os_groupused_dnode, tx); + if (DMU_USERUSED_DNODE(os) && + DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) { + DMU_USERUSED_DNODE(os)->dn_zio = zio; + dnode_sync(DMU_USERUSED_DNODE(os), tx); + DMU_GROUPUSED_DNODE(os)->dn_zio = zio; + dnode_sync(DMU_GROUPUSED_DNODE(os), tx); } txgoff = tx->tx_txg & TXG_MASK; @@ -1063,7 +1094,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx); dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx); - list = &os->os_meta_dnode->dn_dirty_records[txgoff]; + list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; while (dr = list_head(list)) { ASSERT(dr->dr_dbuf->db_level == 0); list_remove(list, dr); @@ -1085,7 +1116,7 @@ dmu_objset_is_dirty(objset_t *os, uint64_t txg) !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK])); } -objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; +static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; void dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) @@ -1097,8 +1128,8 @@ boolean_t dmu_objset_userused_enabled(objset_t *os) { return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && - used_cbs[os->os_phys->os_type] && - os->os_userused_dnode); + used_cbs[os->os_phys->os_type] != NULL && + DMU_USERUSED_DNODE(os) != NULL); } static void @@ -1132,7 +1163,7 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) DNODE_FLAG_USERUSED_ACCOUNTED); /* Allocate the user/groupused objects if necessary. 
*/ - if (os->os_userused_dnode->dn_type == DMU_OT_NONE) { + if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { VERIFY(0 == zap_create_claim(os, DMU_USERUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); @@ -1201,13 +1232,23 @@ dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) if (dr->dr_txg == tx->tx_txg) break; - if (dr == NULL) + if (dr == NULL) { data = NULL; - else if (dr->dr_dbuf->db_dnode->dn_bonuslen == 0 && - dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) - data = dr->dt.dl.dr_data->b_data; - else - data = dr->dt.dl.dr_data; + } else { + dnode_t *dn; + + DB_DNODE_ENTER(dr->dr_dbuf); + dn = DB_DNODE(dr->dr_dbuf); + + if (dn->dn_bonuslen == 0 && + dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) + data = dr->dt.dl.dr_data->b_data; + else + data = dr->dt.dl.dr_data; + + DB_DNODE_EXIT(dr->dr_dbuf); + } + return (data); } diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c index 5fc062c16b..bd5c71a226 100644 --- a/usr/src/uts/common/fs/zfs/dmu_tx.c +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c @@ -186,7 +186,7 @@ dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db, ASSERT(level != 0); db = NULL; } else { - ASSERT(db->db_dnode == dn); + ASSERT(DB_DNODE(db) == dn); ASSERT(db->db_level == level); ASSERT(db->db.db_size == space); ASSERT(db->db_blkid == blkid); @@ -384,7 +384,7 @@ static void dmu_tx_count_dnode(dmu_tx_hold_t *txh) { dnode_t *dn = txh->txh_dnode; - dnode_t *mdn = txh->txh_tx->tx_objset->os_meta_dnode; + dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset); uint64_t space = mdn->dn_datablksz + ((mdn->dn_nlevels-1) << mdn->dn_indblkshift); @@ -787,18 +787,24 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) { dmu_tx_hold_t *txh; int match_object = FALSE, match_offset = FALSE; - dnode_t *dn = db->db_dnode; + dnode_t *dn; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); ASSERT(tx->tx_txg != 0); ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); ASSERT3U(dn->dn_object, ==, db->db.db_object); - if (tx->tx_anyobj) + if (tx->tx_anyobj) { + DB_DNODE_EXIT(db); return; + } /* XXX No checking on the meta dnode for now */ - if (db->db.db_object == DMU_META_DNODE_OBJECT) + if (db->db.db_object == DMU_META_DNODE_OBJECT) { + DB_DNODE_EXIT(db); return; + } for (txh = list_head(&tx->tx_holds); txh; txh = list_next(&tx->tx_holds, txh)) { @@ -870,9 +876,12 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) ASSERT(!"bad txh_type"); } } - if (match_object && match_offset) + if (match_object && match_offset) { + DB_DNODE_EXIT(db); return; + } } + DB_DNODE_EXIT(db); panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", (u_longlong_t)db->db.db_object, db->db_level, (u_longlong_t)db->db_blkid); @@ -1355,9 +1364,19 @@ dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj) dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); - if (sa->sa_force_spill || may_grow || hdl->sa_spill || - ((dmu_buf_impl_t *)hdl->sa_bonus)->db_dnode->dn_have_spill) { + if (sa->sa_force_spill || may_grow || hdl->sa_spill) { ASSERT(tx->tx_txg == 0); dmu_tx_hold_spill(tx, object); + } else { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (dn->dn_have_spill) { + ASSERT(tx->tx_txg == 0); + dmu_tx_hold_spill(tx, object); + } + DB_DNODE_EXIT(db); } } diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c index 2b44cd2c96..850dd5816b 100644 --- 
a/usr/src/uts/common/fs/zfs/dnode.c +++ b/usr/src/uts/common/fs/zfs/dnode.c @@ -38,19 +38,33 @@ static int free_range_compar(const void *node1, const void *node2); static kmem_cache_t *dnode_cache; +/* + * Define DNODE_STATS to turn on statistic gathering. By default, it is only + * turned on when DEBUG is also defined. + */ +#ifdef DEBUG +#define DNODE_STATS +#endif /* DEBUG */ + +#ifdef DNODE_STATS +#define DNODE_STAT_ADD(stat) ((stat)++) +#else +#define DNODE_STAT_ADD(stat) /* nothing */ +#endif /* DNODE_STATS */ static dnode_phys_t dnode_phys_zero; int zfs_default_bs = SPA_MINBLOCKSHIFT; int zfs_default_ibs = DN_MAX_INDBLKSHIFT; +static kmem_cbrc_t dnode_move(void *, void *, size_t, void *); + /* ARGSUSED */ static int dnode_cons(void *arg, void *unused, int kmflag) { - int i; dnode_t *dn = arg; - bzero(dn, sizeof (dnode_t)); + int i; rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); @@ -59,8 +73,18 @@ dnode_cons(void *arg, void *unused, int kmflag) refcount_create(&dn->dn_holds); refcount_create(&dn->dn_tx_holds); + list_link_init(&dn->dn_link); + + bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr)); + bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels)); + bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift)); + bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype)); + bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk)); + bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen)); + bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz)); for (i = 0; i < TXG_SIZE; i++) { + list_link_init(&dn->dn_dirty_link[i]); avl_create(&dn->dn_ranges[i], free_range_compar, sizeof (free_range_t), offsetof(struct free_range, fr_node)); @@ -69,9 +93,27 @@ dnode_cons(void *arg, void *unused, int kmflag) offsetof(dbuf_dirty_record_t, dr_dirty_node)); } + dn->dn_allocated_txg = 0; + dn->dn_free_txg = 0; + dn->dn_assigned_txg = 0; + dn->dn_dirtyctx = 0; + dn->dn_dirtyctx_firstset = NULL; + dn->dn_bonus = NULL; + dn->dn_have_spill = B_FALSE; + dn->dn_zio = NULL; + dn->dn_oldused = 0; + dn->dn_oldflags = 0; + dn->dn_olduid = 0; + dn->dn_oldgid = 0; + dn->dn_newuid = 0; + dn->dn_newgid = 0; + dn->dn_id_flags = 0; + + dn->dn_dbufs_count = 0; list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); + dn->dn_moved = 0; return (0); } @@ -88,27 +130,56 @@ dnode_dest(void *arg, void *unused) cv_destroy(&dn->dn_notxholds); refcount_destroy(&dn->dn_holds); refcount_destroy(&dn->dn_tx_holds); + ASSERT(!list_link_active(&dn->dn_link)); for (i = 0; i < TXG_SIZE; i++) { + ASSERT(!list_link_active(&dn->dn_dirty_link[i])); avl_destroy(&dn->dn_ranges[i]); list_destroy(&dn->dn_dirty_records[i]); + ASSERT3U(dn->dn_next_nblkptr[i], ==, 0); + ASSERT3U(dn->dn_next_nlevels[i], ==, 0); + ASSERT3U(dn->dn_next_indblkshift[i], ==, 0); + ASSERT3U(dn->dn_next_bonustype[i], ==, 0); + ASSERT3U(dn->dn_rm_spillblk[i], ==, 0); + ASSERT3U(dn->dn_next_bonuslen[i], ==, 0); + ASSERT3U(dn->dn_next_blksz[i], ==, 0); } + ASSERT3U(dn->dn_allocated_txg, ==, 0); + ASSERT3U(dn->dn_free_txg, ==, 0); + ASSERT3U(dn->dn_assigned_txg, ==, 0); + ASSERT3U(dn->dn_dirtyctx, ==, 0); + ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL); + ASSERT3P(dn->dn_bonus, ==, NULL); + ASSERT(!dn->dn_have_spill); + ASSERT3P(dn->dn_zio, ==, NULL); + ASSERT3U(dn->dn_oldused, ==, 0); + ASSERT3U(dn->dn_oldflags, ==, 0); + ASSERT3U(dn->dn_olduid, ==, 0); + ASSERT3U(dn->dn_oldgid, ==, 0); + ASSERT3U(dn->dn_newuid, ==, 0); + 
ASSERT3U(dn->dn_newgid, ==, 0); + ASSERT3U(dn->dn_id_flags, ==, 0); + + ASSERT3U(dn->dn_dbufs_count, ==, 0); list_destroy(&dn->dn_dbufs); } void dnode_init(void) { + ASSERT(dnode_cache == NULL); dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t), 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0); + kmem_cache_set_move(dnode_cache, dnode_move); } void dnode_fini(void) { kmem_cache_destroy(dnode_cache); + dnode_cache = NULL; } @@ -120,6 +191,7 @@ dnode_verify(dnode_t *dn) ASSERT(dn->dn_phys); ASSERT(dn->dn_objset); + ASSERT(dn->dn_handle->dnh_dnode == dn); ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); @@ -298,18 +370,29 @@ dnode_setdblksz(dnode_t *dn, int size) static dnode_t * dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, - uint64_t object) + uint64_t object, dnode_handle_t *dnh) { dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); - (void) dnode_cons(dn, NULL, 0); /* XXX */ - dn->dn_objset = os; + ASSERT(!POINTER_IS_VALID(dn->dn_objset)); + dn->dn_moved = 0; + + /* + * Defer setting dn_objset until the dnode is ready to be a candidate + * for the dnode_move() callback. + */ dn->dn_object = object; dn->dn_dbuf = db; + dn->dn_handle = dnh; dn->dn_phys = dnp; - if (dnp->dn_datablkszsec) + if (dnp->dn_datablkszsec) { dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); + } else { + dn->dn_datablksz = 0; + dn->dn_datablkszsec = 0; + dn->dn_datablkshift = 0; + } dn->dn_indblkshift = dnp->dn_indblkshift; dn->dn_nlevels = dnp->dn_nlevels; dn->dn_type = dnp->dn_type; @@ -325,45 +408,65 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, dmu_zfetch_init(&dn->dn_zfetch, dn); ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); + mutex_enter(&os->os_lock); list_insert_head(&os->os_dnodes, dn); + membar_producer(); + /* + * Everything else must be valid before assigning dn_objset makes the + * dnode eligible for dnode_move(). + */ + dn->dn_objset = os; mutex_exit(&os->os_lock); arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER); return (dn); } +/* + * Caller must be holding the dnode handle, which is released upon return. 
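Note the ordering dance in dnode_create() here: the dnode is fully initialized and on the os_dnodes list before membar_producer() and the final store of dn_objset publish it to dnode_move(). A user-land sketch of the same publish-last pattern, using a C11 release/acquire pair in place of the kernel barrier (the struct and names are invented for illustration, not ZFS code):

#include <stdatomic.h>

struct item {
        int field_a;                    /* stands in for dn_object, dn_phys, ... */
        int field_b;
        _Atomic(void *) owner;          /* plays the role of dn_objset */
};

/* Initialize everything first, then publish with a release store. */
static void
item_publish(struct item *it, void *owner)
{
        it->field_a = 1;
        it->field_b = 2;
        atomic_store_explicit(&it->owner, owner, memory_order_release);
}

/* A scanner such as dnode_move() trusts the item only once owner is set. */
static int
item_is_known(struct item *it, void *owner)
{
        return (atomic_load_explicit(&it->owner, memory_order_acquire) ==
            owner);
}

Until that release store is visible, a mover reading a garbage owner pointer simply answers KMEM_CBRC_DONT_KNOW rather than touching half-constructed state.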
+ */ static void dnode_destroy(dnode_t *dn) { objset_t *os = dn->dn_objset; -#ifdef ZFS_DEBUG - int i; - - for (i = 0; i < TXG_SIZE; i++) { - ASSERT(!list_link_active(&dn->dn_dirty_link[i])); - ASSERT(NULL == list_head(&dn->dn_dirty_records[i])); - ASSERT(0 == avl_numnodes(&dn->dn_ranges[i])); - } - ASSERT(NULL == list_head(&dn->dn_dbufs)); -#endif ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0); mutex_enter(&os->os_lock); + POINTER_INVALIDATE(&dn->dn_objset); list_remove(&os->os_dnodes, dn); mutex_exit(&os->os_lock); - if (dn->dn_dirtyctx_firstset) { + /* the dnode can no longer move, so we can release the handle */ + zrl_remove(&dn->dn_handle->dnh_zrlock); + + dn->dn_allocated_txg = 0; + dn->dn_free_txg = 0; + dn->dn_assigned_txg = 0; + + dn->dn_dirtyctx = 0; + if (dn->dn_dirtyctx_firstset != NULL) { kmem_free(dn->dn_dirtyctx_firstset, 1); dn->dn_dirtyctx_firstset = NULL; } - dmu_zfetch_rele(&dn->dn_zfetch); - if (dn->dn_bonus) { + if (dn->dn_bonus != NULL) { mutex_enter(&dn->dn_bonus->db_mtx); dbuf_evict(dn->dn_bonus); dn->dn_bonus = NULL; } + dn->dn_zio = NULL; + + dn->dn_have_spill = B_FALSE; + dn->dn_oldused = 0; + dn->dn_oldflags = 0; + dn->dn_olduid = 0; + dn->dn_oldgid = 0; + dn->dn_newuid = 0; + dn->dn_newgid = 0; + dn->dn_id_flags = 0; + + dmu_zfetch_rele(&dn->dn_zfetch); kmem_cache_free(dnode_cache, dn); arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER); } @@ -408,6 +511,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); for (i = 0; i < TXG_SIZE; i++) { + ASSERT3U(dn->dn_next_nblkptr[i], ==, 0); ASSERT3U(dn->dn_next_nlevels[i], ==, 0); ASSERT3U(dn->dn_next_indblkshift[i], ==, 0); ASSERT3U(dn->dn_next_bonuslen[i], ==, 0); @@ -522,9 +626,304 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, mutex_exit(&dn->dn_mtx); } +#ifdef DNODE_STATS +static struct { + uint64_t dms_dnode_invalid; + uint64_t dms_dnode_recheck1; + uint64_t dms_dnode_recheck2; + uint64_t dms_dnode_special; + uint64_t dms_dnode_handle; + uint64_t dms_dnode_rwlock; + uint64_t dms_dnode_active; +} dnode_move_stats; +#endif /* DNODE_STATS */ + +static void +dnode_move_impl(dnode_t *odn, dnode_t *ndn) +{ + int i; + + ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock)); + ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx)); + ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx)); + ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock)); + + /* Copy fields. 
*/ + ndn->dn_objset = odn->dn_objset; + ndn->dn_object = odn->dn_object; + ndn->dn_dbuf = odn->dn_dbuf; + ndn->dn_handle = odn->dn_handle; + ndn->dn_phys = odn->dn_phys; + ndn->dn_type = odn->dn_type; + ndn->dn_bonuslen = odn->dn_bonuslen; + ndn->dn_bonustype = odn->dn_bonustype; + ndn->dn_nblkptr = odn->dn_nblkptr; + ndn->dn_checksum = odn->dn_checksum; + ndn->dn_compress = odn->dn_compress; + ndn->dn_nlevels = odn->dn_nlevels; + ndn->dn_indblkshift = odn->dn_indblkshift; + ndn->dn_datablkshift = odn->dn_datablkshift; + ndn->dn_datablkszsec = odn->dn_datablkszsec; + ndn->dn_datablksz = odn->dn_datablksz; + ndn->dn_maxblkid = odn->dn_maxblkid; + bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0], + sizeof (odn->dn_next_nblkptr)); + bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0], + sizeof (odn->dn_next_nlevels)); + bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0], + sizeof (odn->dn_next_indblkshift)); + bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0], + sizeof (odn->dn_next_bonustype)); + bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0], + sizeof (odn->dn_rm_spillblk)); + bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0], + sizeof (odn->dn_next_bonuslen)); + bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0], + sizeof (odn->dn_next_blksz)); + for (i = 0; i < TXG_SIZE; i++) { + list_move_tail(&ndn->dn_dirty_records[i], + &odn->dn_dirty_records[i]); + } + bcopy(&odn->dn_ranges[0], &ndn->dn_ranges[0], sizeof (odn->dn_ranges)); + ndn->dn_allocated_txg = odn->dn_allocated_txg; + ndn->dn_free_txg = odn->dn_free_txg; + ndn->dn_assigned_txg = odn->dn_assigned_txg; + ndn->dn_dirtyctx = odn->dn_dirtyctx; + ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; + ASSERT(refcount_count(&odn->dn_tx_holds) == 0); + refcount_transfer(&ndn->dn_holds, &odn->dn_holds); + ASSERT(list_is_empty(&ndn->dn_dbufs)); + list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs); + ndn->dn_dbufs_count = odn->dn_dbufs_count; + ndn->dn_bonus = odn->dn_bonus; + ndn->dn_have_spill = odn->dn_have_spill; + ndn->dn_zio = odn->dn_zio; + ndn->dn_oldused = odn->dn_oldused; + ndn->dn_oldflags = odn->dn_oldflags; + ndn->dn_olduid = odn->dn_olduid; + ndn->dn_oldgid = odn->dn_oldgid; + ndn->dn_newuid = odn->dn_newuid; + ndn->dn_newgid = odn->dn_newgid; + ndn->dn_id_flags = odn->dn_id_flags; + dmu_zfetch_init(&ndn->dn_zfetch, NULL); + list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream); + ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode; + ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt; + ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail; + + /* + * Update back pointers. Updating the handle fixes the back pointer of + * every descendant dbuf as well as the bonus dbuf. + */ + ASSERT(ndn->dn_handle->dnh_dnode == odn); + ndn->dn_handle->dnh_dnode = ndn; + if (ndn->dn_zfetch.zf_dnode == odn) { + ndn->dn_zfetch.zf_dnode = ndn; + } + + /* + * Invalidate the original dnode by clearing all of its back pointers. + */ + odn->dn_dbuf = NULL; + odn->dn_handle = NULL; + list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_link)); + odn->dn_dbufs_count = 0; + odn->dn_bonus = NULL; + odn->dn_zfetch.zf_dnode = NULL; + + /* + * Set the low bit of the objset pointer to ensure that dnode_move() + * recognizes the dnode as invalid in any subsequent callback. + */ + POINTER_INVALIDATE(&odn->dn_objset); + + /* + * Satisfy the destructor. 
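The "satisfy the destructor" step exists because the evacuated buffer eventually goes back to the cache and must still pass dnode_dest(), whose assertions demand a freshly constructed object. A toy rendition of that contract, with invented types standing in for the dnode's lists and per-txg arrays:

#include <assert.h>
#include <string.h>

#define TXG_SLOTS       4

struct cached {
        void *head;                     /* stands in for dn_dbufs et al. */
        unsigned count;
        int per_txg[TXG_SLOTS];         /* stands in for the per-txg arrays */
};

/* Mirrors dnode_dest(): the object must look freshly constructed. */
static void
cached_dest(struct cached *c)
{
        assert(c->head == NULL);
        assert(c->count == 0);
        for (int i = 0; i < TXG_SLOTS; i++)
                assert(c->per_txg[i] == 0);
}

/* Mirrors dnode_move_impl(): copy out, then reset the source. */
static void
cached_move(struct cached *from, struct cached *to)
{
        *to = *from;
        from->head = NULL;
        from->count = 0;
        memset(from->per_txg, 0, sizeof (from->per_txg));
        cached_dest(from);              /* would now pass the destructor */
}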
+ */ + for (i = 0; i < TXG_SIZE; i++) { + list_create(&odn->dn_dirty_records[i], + sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dirty_node)); + odn->dn_ranges[i].avl_root = NULL; + odn->dn_ranges[i].avl_numnodes = 0; + odn->dn_next_nlevels[i] = 0; + odn->dn_next_indblkshift[i] = 0; + odn->dn_next_bonustype[i] = 0; + odn->dn_rm_spillblk[i] = 0; + odn->dn_next_bonuslen[i] = 0; + odn->dn_next_blksz[i] = 0; + } + odn->dn_allocated_txg = 0; + odn->dn_free_txg = 0; + odn->dn_assigned_txg = 0; + odn->dn_dirtyctx = 0; + odn->dn_dirtyctx_firstset = NULL; + odn->dn_have_spill = B_FALSE; + odn->dn_zio = NULL; + odn->dn_oldused = 0; + odn->dn_oldflags = 0; + odn->dn_olduid = 0; + odn->dn_oldgid = 0; + odn->dn_newuid = 0; + odn->dn_newgid = 0; + odn->dn_id_flags = 0; + + /* + * Mark the dnode. + */ + ndn->dn_moved = 1; + odn->dn_moved = (uint8_t)-1; +} + +#ifdef _KERNEL +/*ARGSUSED*/ +static kmem_cbrc_t +dnode_move(void *buf, void *newbuf, size_t size, void *arg) +{ + dnode_t *odn = buf, *ndn = newbuf; + objset_t *os; + int64_t refcount; + uint32_t dbufs; + + /* + * The dnode is on the objset's list of known dnodes if the objset + * pointer is valid. We set the low bit of the objset pointer when + * freeing the dnode to invalidate it, and the memory patterns written + * by kmem (baddcafe and deadbeef) set at least one of the two low bits. + * A newly created dnode sets the objset pointer last of all to indicate + * that the dnode is known and in a valid state to be moved by this + * function. + */ + os = odn->dn_objset; + if (!POINTER_IS_VALID(os)) { + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * Ensure that the objset does not go away during the move. + */ + rw_enter(&os_lock, RW_WRITER); + if (os != odn->dn_objset) { + rw_exit(&os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * If the dnode is still valid, then so is the objset. We know that no + * valid objset can be freed while we hold os_lock, so we can safely + * ensure that the objset remains in use. + */ + mutex_enter(&os->os_lock); + + /* + * Recheck the objset pointer in case the dnode was removed just before + * acquiring the lock. + */ + if (os != odn->dn_objset) { + mutex_exit(&os->os_lock); + rw_exit(&os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * At this point we know that as long as we hold os->os_lock, the dnode + * cannot be freed and fields within the dnode can be safely accessed. + * The objset listing this dnode cannot go away as long as this dnode is + * on its list. + */ + rw_exit(&os_lock); + if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) { + mutex_exit(&os->os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special); + return (KMEM_CBRC_NO); + } + ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */ + + /* + * Lock the dnode handle to prevent the dnode from obtaining any new + * holds. This also prevents the descendant dbufs and the bonus dbuf + * from accessing the dnode, so that we can discount their holds. The + * handle is safe to access because we know that while the dnode cannot + * go away, neither can its handle. Once we hold dnh_zrlock, we can + * safely move any dnode referenced only by dbufs. 
+ */ + if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) { + mutex_exit(&os->os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle); + return (KMEM_CBRC_LATER); + } + + /* + * Ensure a consistent view of the dnode's holds and the dnode's dbufs. + * We need to guarantee that there is a hold for every dbuf in order to + * determine whether the dnode is actively referenced. Falsely matching + * a dbuf to an active hold would lead to an unsafe move. It's possible + * that a thread already having an active dnode hold is about to add a + * dbuf, and we can't compare hold and dbuf counts while the add is in + * progress. + */ + if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) { + zrl_exit(&odn->dn_handle->dnh_zrlock); + mutex_exit(&os->os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock); + return (KMEM_CBRC_LATER); + } + + /* + * A dbuf may be removed (evicted) without an active dnode hold. In that + * case, the dbuf count is decremented under the handle lock before the + * dbuf's hold is released. This order ensures that if we count the hold + * after the dbuf is removed but before its hold is released, we will + * treat the unmatched hold as active and exit safely. If we count the + * hold before the dbuf is removed, the hold is discounted, and the + * removal is blocked until the move completes. + */ + refcount = refcount_count(&odn->dn_holds); + ASSERT(refcount >= 0); + dbufs = odn->dn_dbufs_count; + + /* We can't have more dbufs than dnode holds. */ + ASSERT3U(dbufs, <=, refcount); + DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount, + uint32_t, dbufs); + + if (refcount > dbufs) { + rw_exit(&odn->dn_struct_rwlock); + zrl_exit(&odn->dn_handle->dnh_zrlock); + mutex_exit(&os->os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active); + return (KMEM_CBRC_LATER); + } + + rw_exit(&odn->dn_struct_rwlock); + + /* + * At this point we know that anyone with a hold on the dnode is not + * actively referencing it. The dnode is known and in a valid state to + * move. We're holding the locks needed to execute the critical section. + */ + dnode_move_impl(odn, ndn); + + list_link_replace(&odn->dn_link, &ndn->dn_link); + /* If the dnode was safe to move, the refcount cannot have changed. */ + ASSERT(refcount == refcount_count(&ndn->dn_holds)); + ASSERT(dbufs == ndn->dn_dbufs_count); + zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */ + mutex_exit(&os->os_lock); + + return (KMEM_CBRC_YES); +} +#endif /* _KERNEL */ + void -dnode_special_close(dnode_t *dn) +dnode_special_close(dnode_handle_t *dnh) { + dnode_t *dn = dnh->dnh_dnode; + /* * Wait for final references to the dnode to clear. 
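Stepping back from dnode_move() for a moment: it is an instance of the general shape a kmem move callback takes, namely classify the buffer cheaply, refuse or defer whenever safety cannot be proven without waiting, and relocate only when every reference is provably passive. A compressed, self-contained sketch of that ladder (the struct and its predicates are invented; only the kmem_cbrc_t values come from this patch, and the holds-versus-passive comparison mirrors the refcount/dbufs test above):

#include <stddef.h>
#include <stdint.h>

typedef enum kmem_cbrc {
        KMEM_CBRC_YES,          /* moved */
        KMEM_CBRC_NO,           /* never movable */
        KMEM_CBRC_LATER,        /* movable, but not right now */
        KMEM_CBRC_DONT_NEED,    /* buffer no longer in use */
        KMEM_CBRC_DONT_KNOW     /* cannot tell whether this is a live object */
} kmem_cbrc_t;

struct movable {
        void *owner;            /* NULL or low bits set => not a live object */
        int pinned;             /* analogous to a "special" dnode */
        int trylock_fails;      /* stand-in for the zrl/rwlock tryenters */
        int64_t holds;          /* total references */
        int64_t passive;        /* references owned by child structures */
};

static kmem_cbrc_t
example_move(void *buf, void *newbuf, size_t size, void *arg)
{
        struct movable *old = buf, *new = newbuf;

        (void) size;
        (void) arg;
        if (old->owner == NULL || ((uintptr_t)old->owner & 0x3))
                return (KMEM_CBRC_DONT_KNOW);   /* freed or unconstructed */
        if (old->pinned)
                return (KMEM_CBRC_NO);          /* pinned forever */
        if (old->trylock_fails)
                return (KMEM_CBRC_LATER);       /* never block in a callback */
        if (old->holds != old->passive)
                return (KMEM_CBRC_LATER);       /* an active user exists */
        *new = *old;                            /* relocate */
        old->owner = NULL;                      /* invalidate the source */
        return (KMEM_CBRC_YES);
}

The two LATER cases are deliberate: the callback runs on behalf of the allocator and cannot afford to sleep on the objects it is asked to move, so any lock it cannot tryenter is grounds to retry on a later pass rather than risk deadlock.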
This can * only happen if the arc is asynchronously evicting state that @@ -533,13 +932,19 @@ dnode_special_close(dnode_t *dn) */ while (refcount_count(&dn->dn_holds) > 0) delay(1); - dnode_destroy(dn); + zrl_add(&dnh->dnh_zrlock); + dnode_destroy(dn); /* implicit zrl_remove() */ + zrl_destroy(&dnh->dnh_zrlock); + dnh->dnh_dnode = NULL; } dnode_t * -dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object) +dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, + dnode_handle_t *dnh) { - dnode_t *dn = dnode_create(os, dnp, NULL, object); + dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh); + dnh->dnh_dnode = dn; + zrl_init(&dnh->dnh_zrlock); DNODE_VERIFY(dn); return (dn); } @@ -547,34 +952,43 @@ static void dnode_buf_pageout(dmu_buf_t *db, void *arg) { - dnode_t **children_dnodes = arg; + dnode_children_t *children_dnodes = arg; int i; int epb = db->db_size >> DNODE_SHIFT; + ASSERT(epb == children_dnodes->dnc_count); + for (i = 0; i < epb; i++) { - dnode_t *dn = children_dnodes[i]; - int n; + dnode_handle_t *dnh = &children_dnodes->dnc_children[i]; + dnode_t *dn; - if (dn == NULL) + /* + * The dnode handle lock guards against the dnode moving to + * another valid address, so there is no need here to guard + * against changes to or from NULL. + */ + if (dnh->dnh_dnode == NULL) { + zrl_destroy(&dnh->dnh_zrlock); continue; -#ifdef ZFS_DEBUG + } + + zrl_add(&dnh->dnh_zrlock); + dn = dnh->dnh_dnode; /* * If there are holds on this dnode, then there should * be holds on the dnode's containing dbuf as well; thus - * it wouldn't be eligable for eviction and this function + * it wouldn't be eligible for eviction and this function * would not have been called. */ ASSERT(refcount_is_zero(&dn->dn_holds)); - ASSERT(list_head(&dn->dn_dbufs) == NULL); ASSERT(refcount_is_zero(&dn->dn_tx_holds)); - for (n = 0; n < TXG_SIZE; n++) - ASSERT(!list_link_active(&dn->dn_dirty_link[n])); -#endif - children_dnodes[i] = NULL; - dnode_destroy(dn); + dnode_destroy(dn); /* implicit zrl_remove() */ + zrl_destroy(&dnh->dnh_zrlock); + dnh->dnh_dnode = NULL; } - kmem_free(children_dnodes, epb * sizeof (dnode_t *)); + kmem_free(children_dnodes, sizeof (dnode_children_t) + + (epb - 1) * sizeof (dnode_handle_t)); } /* @@ -593,7 +1007,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, uint64_t blk; dnode_t *mdn, *dn; dmu_buf_impl_t *db; - dnode_t **children_dnodes; + dnode_children_t *children_dnodes; + dnode_handle_t *dnh; /* * If you are holding the spa config lock as writer, you shouldn't @@ -607,7 +1022,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) { dn = (object == DMU_USERUSED_OBJECT) ?
- os->os_userused_dnode : os->os_groupused_dnode; + DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os); if (dn == NULL) return (ENOENT); type = dn->dn_type; @@ -624,7 +1039,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, if (object == 0 || object >= DN_MAX_OBJECT) return (EINVAL); - mdn = os->os_meta_dnode; + mdn = DMU_META_DNODE(os); + ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT); DNODE_VERIFY(mdn); @@ -651,26 +1067,39 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, idx = object & (epb-1); + ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE); children_dnodes = dmu_buf_get_user(&db->db); if (children_dnodes == NULL) { - dnode_t **winner; - children_dnodes = kmem_zalloc(epb * sizeof (dnode_t *), - KM_SLEEP); + int i; + dnode_children_t *winner; + children_dnodes = kmem_alloc(sizeof (dnode_children_t) + + (epb - 1) * sizeof (dnode_handle_t), KM_SLEEP); + children_dnodes->dnc_count = epb; + dnh = &children_dnodes->dnc_children[0]; + for (i = 0; i < epb; i++) { + zrl_init(&dnh[i].dnh_zrlock); + dnh[i].dnh_dnode = NULL; + } if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL, dnode_buf_pageout)) { - kmem_free(children_dnodes, epb * sizeof (dnode_t *)); + kmem_free(children_dnodes, sizeof (dnode_children_t) + + (epb - 1) * sizeof (dnode_handle_t)); children_dnodes = winner; } } + ASSERT(children_dnodes->dnc_count == epb); - if ((dn = children_dnodes[idx]) == NULL) { - dnode_phys_t *dnp = (dnode_phys_t *)db->db.db_data+idx; + dnh = &children_dnodes->dnc_children[idx]; + zrl_add(&dnh->dnh_zrlock); + if ((dn = dnh->dnh_dnode) == NULL) { + dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx; dnode_t *winner; - dn = dnode_create(os, dnp, db, object); - winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn); + dn = dnode_create(os, phys, db, object, dnh); + winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn); if (winner != NULL) { - dnode_destroy(dn); + zrl_add(&dnh->dnh_zrlock); + dnode_destroy(dn); /* implicit zrl_remove() */ dn = winner; } } @@ -682,13 +1111,16 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, ((flag & DNODE_MUST_BE_FREE) && (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) { mutex_exit(&dn->dn_mtx); + zrl_remove(&dnh->dnh_zrlock); dbuf_rele(db, FTAG); return (type == DMU_OT_NONE ? ENOENT : EEXIST); } mutex_exit(&dn->dn_mtx); if (refcount_add(&dn->dn_holds, tag) == 1) - dbuf_add_ref(db, dn); + dbuf_add_ref(db, dnh); + /* Now we can rely on the hold to prevent the dnode from moving. */ + zrl_remove(&dnh->dnh_zrlock); DNODE_VERIFY(dn); ASSERT3P(dn->dn_dbuf, ==, db); @@ -730,13 +1162,37 @@ void dnode_rele(dnode_t *dn, void *tag) { uint64_t refs; + /* Get while the hold prevents the dnode from moving. */ + dmu_buf_impl_t *db = dn->dn_dbuf; + dnode_handle_t *dnh = dn->dn_handle; mutex_enter(&dn->dn_mtx); refs = refcount_remove(&dn->dn_holds, tag); mutex_exit(&dn->dn_mtx); + + /* + * It's unsafe to release the last hold on a dnode by dnode_rele() or + * indirectly by dbuf_rele() while relying on the dnode handle to + * prevent the dnode from moving, since releasing the last hold could + * result in the dnode's parent dbuf evicting its dnode handles. For + * that reason anyone calling dnode_rele() or dbuf_rele() without some + * other direct or indirect hold on the dnode must first drop the dnode + * handle. 
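Two idioms in dnode_hold_impl() above are worth isolating. The dnode_children_t is sized with the pre-C99 one-declared-element array trick, hence the (epb - 1) * sizeof (dnode_handle_t) arithmetic, and a freshly built array or dnode is then published with a winner-takes-all race (dmu_buf_set_user() for the array, atomic_cas_ptr() for the dnode itself) in which losers destroy their copy and adopt the winner's. A user-land sketch of both together, with all names invented:

#include <stdatomic.h>
#include <stdlib.h>

typedef struct handle { void *h_obj; } handle_t;

typedef struct children {
        size_t c_count;
        handle_t c_children[1];         /* sized dynamically, pre-C99 style */
} children_t;

static children_t *
children_alloc(size_t n)
{
        /* the one declared element accounts for a handle, so n - 1 more */
        children_t *c = calloc(1, sizeof (children_t) +
            (n - 1) * sizeof (handle_t));

        if (c != NULL)
                c->c_count = n;
        return (c);
}

/* Publish into a shared slot; the CAS loser frees its copy. */
static children_t *
children_get(_Atomic(children_t *) *slot, size_t n)
{
        children_t *c = atomic_load(slot);
        children_t *expected = NULL;

        if (c != NULL)
                return (c);
        if ((c = children_alloc(n)) == NULL)
                return (NULL);
        if (!atomic_compare_exchange_strong(slot, &expected, c)) {
                free(c);                /* lost the race; use the winner */
                c = expected;
        }
        return (c);
}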
+ */ + ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread); + /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ - if (refs == 0 && dn->dn_dbuf) - dbuf_rele(dn->dn_dbuf, dn); + if (refs == 0 && db != NULL) { + /* + * Another thread could add a hold to the dnode handle in + * dnode_hold_impl() while holding the parent dbuf. Since the + * hold on the parent dbuf prevents the handle from being + * destroyed, the hold on the handle is OK. We can't yet assert + * that the handle has zero references, but that will be + * asserted anyway when the handle gets destroyed. + */ + dbuf_rele(db, dnh); + } } void @@ -755,7 +1211,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) #ifdef ZFS_DEBUG mutex_enter(&dn->dn_mtx); ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg); - /* ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); */ + ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); mutex_exit(&dn->dn_mtx); #endif @@ -794,7 +1250,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) /* * The dnode maintains a hold on its containing dbuf as * long as there are holds on it. Each instantiated child - * dbuf maintaines a hold on the dnode. When the last child + * dbuf maintains a hold on the dnode. When the last child * drops its hold, the dnode will drop its hold on the * containing dbuf. We add a "dirty hold" here so that the * dnode will hang around after we finish processing its diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c index f9ec9f6023..2ee990a3b3 100644 --- a/usr/src/uts/common/fs/zfs/dnode_sync.c +++ b/usr/src/uts/common/fs/zfs/dnode_sync.c @@ -76,7 +76,11 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) if (child == NULL) continue; - ASSERT3P(child->db_dnode, ==, dn); +#ifdef DEBUG + DB_DNODE_ENTER(child); + ASSERT3P(DB_DNODE(child), ==, dn); + DB_DNODE_EXIT(child); +#endif /* DEBUG */ if (child->db_parent && child->db_parent != dn->dn_dbuf) { ASSERT(child->db_parent->db_level == db->db_level); ASSERT(child->db_blkptr != @@ -135,15 +139,18 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) int off, num; int i, err, epbs; uint64_t txg = tx->tx_txg; + dnode_t *dn; - epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; off = start - (db->db_blkid * 1<<epbs); num = end - start + 1; ASSERT3U(off, >=, 0); ASSERT3U(num, >=, 0); ASSERT3U(db->db_level, >, 0); - ASSERT3U(db->db.db_size, ==, 1<<db->db_dnode->dn_phys->dn_indblkshift); + ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT); ASSERT(db->db_blkptr != NULL); @@ -155,10 +162,10 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) ASSERT(db->db_level == 1); - rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(db->db_dnode, db->db_level-1, + rw_enter(&dn->dn_struct_rwlock, RW_READER); + err = dbuf_hold_impl(dn, db->db_level-1, (db->db_blkid << epbs) + i, TRUE, FTAG, &child); - rw_exit(&db->db_dnode->dn_struct_rwlock); + rw_exit(&dn->dn_struct_rwlock); if (err == ENOENT) continue; ASSERT(err == 0); @@ -200,6 +207,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) dbuf_rele(child, FTAG); } + DB_DNODE_EXIT(db); } #endif @@ -209,7 +217,7 @@ static int free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, dmu_tx_t *tx) { - dnode_t *dn = db->db_dnode; + dnode_t *dn; blkptr_t *bp; 
dmu_buf_impl_t *subdb; uint64_t start, end, dbstart, dbend, i; @@ -230,7 +238,9 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, dbuf_release_bp(db); bp = (blkptr_t *)db->db.db_data; - epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; shift = (db->db_level - 1) * epbs; dbstart = db->db_blkid << epbs; start = blkid >> shift; @@ -253,6 +263,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, blocks_freed = free_blocks(dn, bp, end-start+1, tx); arc_buf_freeze(db->db_buf); ASSERT(all || blocks_freed == 0 || db->db_last_dirty); + DB_DNODE_EXIT(db); return (all ? ALL : blocks_freed); } @@ -272,6 +283,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, } dbuf_rele(subdb, FTAG); } + DB_DNODE_EXIT(db); arc_buf_freeze(db->db_buf); #ifdef ZFS_DEBUG bp -= (end-start)+1; @@ -375,7 +387,11 @@ dnode_evict_dbufs(dnode_t *dn) for (; db != &marker; db = list_head(&dn->dn_dbufs)) { list_remove(&dn->dn_dbufs, db); list_insert_tail(&dn->dn_dbufs, db); - ASSERT3P(db->db_dnode, ==, dn); +#ifdef DEBUG + DB_DNODE_ENTER(db); + ASSERT3P(DB_DNODE(db), ==, dn); + DB_DNODE_EXIT(db); +#endif /* DEBUG */ mutex_enter(&db->db_mtx); if (db->db_state == DB_EVICTING) { diff --git a/usr/src/uts/common/fs/zfs/refcount.c b/usr/src/uts/common/fs/zfs/refcount.c index 8358b4ceeb..600132f080 100644 --- a/usr/src/uts/common/fs/zfs/refcount.c +++ b/usr/src/uts/common/fs/zfs/refcount.c @@ -25,7 +25,7 @@ #include <sys/zfs_context.h> #include <sys/refcount.h> -#if defined(DEBUG) || !defined(_KERNEL) +#ifdef ZFS_DEBUG #ifdef _KERNEL int reference_tracking_enable = FALSE; /* runs out of memory too easily */ @@ -189,4 +189,35 @@ refcount_remove(refcount_t *rc, void *holder) return (refcount_remove_many(rc, 1, holder)); } -#endif +void +refcount_transfer(refcount_t *dst, refcount_t *src) +{ + int64_t count, removed_count; + list_t list, removed; + + list_create(&list, sizeof (reference_t), + offsetof(reference_t, ref_link)); + list_create(&removed, sizeof (reference_t), + offsetof(reference_t, ref_link)); + + mutex_enter(&src->rc_mtx); + count = src->rc_count; + removed_count = src->rc_removed_count; + src->rc_count = 0; + src->rc_removed_count = 0; + list_move_tail(&list, &src->rc_list); + list_move_tail(&removed, &src->rc_removed); + mutex_exit(&src->rc_mtx); + + mutex_enter(&dst->rc_mtx); + dst->rc_count += count; + dst->rc_removed_count += removed_count; + list_move_tail(&dst->rc_list, &list); + list_move_tail(&dst->rc_removed, &removed); + mutex_exit(&dst->rc_mtx); + + list_destroy(&list); + list_destroy(&removed); +} + +#endif /* ZFS_DEBUG */ diff --git a/usr/src/uts/common/fs/zfs/sa.c b/usr/src/uts/common/fs/zfs/sa.c index 403fd86a35..4cb4546b25 100644 --- a/usr/src/uts/common/fs/zfs/sa.c +++ b/usr/src/uts/common/fs/zfs/sa.c @@ -1612,6 +1612,8 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, uint16_t buflen, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; + dnode_t *dn; sa_bulk_attr_t *attr_desc; void *old_data[2]; int bonus_attr_count = 0; @@ -1629,7 +1631,9 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, /* First make a copy of the old data */ - if (((dmu_buf_impl_t *)hdl->sa_bonus)->db_dnode->dn_bonuslen) { + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (dn->dn_bonuslen != 0) { bonus_data_size = hdl->sa_bonus->db_size; old_data[0] =
kmem_alloc(bonus_data_size, KM_SLEEP); bcopy(hdl->sa_bonus->db_data, old_data[0], @@ -1638,6 +1642,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, } else { old_data[0] = NULL; } + DB_DNODE_EXIT(db); /* Bring spill buffer online if it isn't currently */ diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h index 4c05806e3e..cf1bbc030f 100644 --- a/usr/src/uts/common/fs/zfs/sys/dbuf.h +++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h @@ -32,6 +32,7 @@ #include <sys/arc.h> #include <sys/zfs_context.h> #include <sys/refcount.h> +#include <sys/zrlock.h> #ifdef __cplusplus extern "C" { @@ -82,9 +83,6 @@ struct dmu_tx; * etc. */ -#define LIST_LINK_INACTIVE(link) \ - ((link)->list_next == NULL && (link)->list_prev == NULL) - struct dmu_buf_impl; typedef enum override_states { @@ -149,15 +147,17 @@ typedef struct dmu_buf_impl { struct objset *db_objset; /* - * the dnode we belong to (NULL when evicted) + * handle to safely access the dnode we belong to (NULL when evicted) */ - struct dnode *db_dnode; + struct dnode_handle *db_dnode_handle; /* * our parent buffer; if the dnode points to us directly, - * db_parent == db_dnode->dn_dbuf + * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf * only accessed by sync thread ??? * (NULL when evicted) + * May change from NULL to non-NULL under the protection of db_mtx + * (see dbuf_check_blkptr()) */ struct dmu_buf_impl *db_parent; @@ -284,24 +284,46 @@ void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end, void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); +#define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode) +#define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock) +#define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db))) +#define DB_DNODE_EXIT(_db) (zrl_remove(&DB_DNODE_LOCK(_db))) +#define DB_DNODE_HELD(_db) (!zrl_is_zero(&DB_DNODE_LOCK(_db))) +#define DB_GET_SPA(_spa_p, _db) { \ + dnode_t *__dn; \ + DB_DNODE_ENTER(_db); \ + __dn = DB_DNODE(_db); \ + *(_spa_p) = __dn->dn_objset->os_spa; \ + DB_DNODE_EXIT(_db); \ +} +#define DB_GET_OBJSET(_os_p, _db) { \ + dnode_t *__dn; \ + DB_DNODE_ENTER(_db); \ + __dn = DB_DNODE(_db); \ + *(_os_p) = __dn->dn_objset; \ + DB_DNODE_EXIT(_db); \ +} + void dbuf_init(void); void dbuf_fini(void); -#define DBUF_IS_METADATA(db) \ - ((db)->db_level > 0 || dmu_ot[(db)->db_dnode->dn_type].ot_metadata) +boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); + +#define DBUF_IS_METADATA(_db) \ + (dbuf_is_metadata(_db)) -#define DBUF_GET_BUFC_TYPE(db) \ - (DBUF_IS_METADATA(db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) +#define DBUF_GET_BUFC_TYPE(_db) \ + (DBUF_IS_METADATA(_db) ? 
ARC_BUFC_METADATA : ARC_BUFC_DATA) -#define DBUF_IS_CACHEABLE(db) \ - ((db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \ - (DBUF_IS_METADATA(db) && \ - ((db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) +#define DBUF_IS_CACHEABLE(_db) \ + ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \ + (DBUF_IS_METADATA(_db) && \ + ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) -#define DBUF_IS_L2CACHEABLE(db) \ - ((db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \ - (DBUF_IS_METADATA(db) && \ - ((db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) +#define DBUF_IS_L2CACHEABLE(_db) \ + ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \ + (DBUF_IS_METADATA(_db) && \ + ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) #ifdef ZFS_DEBUG @@ -332,7 +354,7 @@ _NOTE(CONSTCOND) } while (0) sprintf_blkptr(__blkbuf, bp); \ dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \ kmem_free(__blkbuf, BP_SPRINTF_LEN); \ - } \ + } \ _NOTE(CONSTCOND) } while (0) #define DBUF_VERIFY(db) dbuf_verify(db) diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h index b6061cfffc..c504c23310 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h @@ -335,6 +335,7 @@ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); int dmu_bonus_max(void); int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *); +dmu_object_type_t dmu_get_bonustype(dmu_buf_t *); int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *); /* diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h index 5c5119a207..12d5c4ddd4 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h @@ -40,6 +40,8 @@ extern "C" { #endif +extern krwlock_t os_lock; + struct dsl_dataset; struct dmu_tx; @@ -68,9 +70,15 @@ struct objset { spa_t *os_spa; arc_buf_t *os_phys_buf; objset_phys_t *os_phys; - dnode_t *os_meta_dnode; - dnode_t *os_userused_dnode; - dnode_t *os_groupused_dnode; + /* + * The following "special" dnodes have no parent and are exempt from + * dnode_move(), but they root their descendents in this objset using + * handles anyway, so that all access to dnodes from dbufs consistently + * uses handles. 
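That consistency is enforced by a bracket discipline around the dbuf.h macros introduced earlier in this patch: DB_DNODE_ENTER pins the handle, DB_DNODE dereferences it, and every path out of the region must run DB_DNODE_EXIT, which is why dmu_tx_dirty_buf() grew an exit before each early return. A stand-alone sketch of the shape, with mocked single-threaded types so it compiles outside the kernel:

#include <errno.h>

typedef struct zrlock { int zr_refcount; } zrlock_t;           /* mock */
typedef struct dnode { int dn_have_spill; } dnode_t;           /* mock */
typedef struct dnode_handle {
        zrlock_t dnh_zrlock;
        dnode_t *dnh_dnode;
} dnode_handle_t;
typedef struct dmu_buf_impl {
        dnode_handle_t *db_dnode_handle;
} dmu_buf_impl_t;

static void zrl_add(zrlock_t *z)    { z->zr_refcount++; }      /* mock */
static void zrl_remove(zrlock_t *z) { z->zr_refcount--; }      /* mock */

#define DB_DNODE(_db)       ((_db)->db_dnode_handle->dnh_dnode)
#define DB_DNODE_ENTER(_db) (zrl_add(&(_db)->db_dnode_handle->dnh_zrlock))
#define DB_DNODE_EXIT(_db)  (zrl_remove(&(_db)->db_dnode_handle->dnh_zrlock))

/* Hypothetical consumer: every return path leaves the bracket first. */
static int
db_check_spill(dmu_buf_impl_t *db)
{
        dnode_t *dn;
        int err = 0;

        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);
        if (!dn->dn_have_spill)
                err = ENOENT;           /* no early return while entered */
        DB_DNODE_EXIT(db);
        return (err);
}

Funnelling the error to a single exit point, rather than returning mid-bracket, is the same design choice the patch makes in dmu_tx_dirty_buf() and sa_modify_attrs().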
+ */ + dnode_handle_t os_meta_dnode; + dnode_handle_t os_userused_dnode; + dnode_handle_t os_groupused_dnode; zilog_t *os_zil; /* can change, under dsl_dir's locks: */ @@ -113,6 +121,9 @@ struct objset { #define DMU_META_OBJSET 0 #define DMU_META_DNODE_OBJECT 0 #define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0) +#define DMU_META_DNODE(os) ((os)->os_meta_dnode.dnh_dnode) +#define DMU_USERUSED_DNODE(os) ((os)->os_userused_dnode.dnh_dnode) +#define DMU_GROUPUSED_DNODE(os) ((os)->os_groupused_dnode.dnh_dnode) #define DMU_OS_IS_L2CACHEABLE(os) \ ((os)->os_secondary_cache == ZFS_CACHE_ALL || \ @@ -161,6 +172,9 @@ boolean_t dmu_objset_userused_enabled(objset_t *os); int dmu_objset_userspace_upgrade(objset_t *os); boolean_t dmu_objset_userspace_present(objset_t *os); +void dmu_objset_init(void); +void dmu_objset_fini(void); + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/fs/zfs/sys/dnode.h b/usr/src/uts/common/fs/zfs/sys/dnode.h index 8bae1602e7..9ad4be36bf 100644 --- a/usr/src/uts/common/fs/zfs/sys/dnode.h +++ b/usr/src/uts/common/fs/zfs/sys/dnode.h @@ -32,6 +32,7 @@ #include <sys/zio.h> #include <sys/refcount.h> #include <sys/dmu_zfetch.h> +#include <sys/zrlock.h> #ifdef __cplusplus extern "C" { @@ -156,6 +157,7 @@ typedef struct dnode { struct objset *dn_objset; uint64_t dn_object; struct dmu_buf_impl *dn_dbuf; + struct dnode_handle *dn_handle; dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */ /* @@ -172,6 +174,7 @@ typedef struct dnode { uint8_t dn_nlevels; uint8_t dn_indblkshift; uint8_t dn_datablkshift; /* zero if blksz not power of 2! */ + uint8_t dn_moved; /* Has this dnode been moved? */ uint16_t dn_datablkszsec; /* in 512b sectors */ uint32_t dn_datablksz; /* in bytes */ uint64_t dn_maxblkid; @@ -183,6 +186,9 @@ typedef struct dnode { uint16_t dn_next_bonuslen[TXG_SIZE]; uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */ + /* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */ + uint32_t dn_dbufs_count; /* count of dn_dbufs */ + /* protected by os_lock: */ list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */ @@ -202,8 +208,11 @@ typedef struct dnode { refcount_t dn_holds; kmutex_t dn_dbufs_mtx; - list_t dn_dbufs; /* linked list of descendent dbuf_t's */ + list_t dn_dbufs; /* descendent dbufs */ + + /* protected by dn_struct_rwlock */ struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */ + boolean_t dn_have_spill; /* have spill or are spilling */ /* parent IO for current sync write */ @@ -220,6 +229,22 @@ typedef struct dnode { struct zfetch dn_zfetch; } dnode_t; +/* + * Adds a level of indirection between the dbuf and the dnode to avoid + * iterating descendent dbufs in dnode_move(). Handles are not allocated + * individually, but as an array of child dnodes in dnode_hold_impl(). + */ +typedef struct dnode_handle { + /* Protects dnh_dnode from modification by dnode_move(). 
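The payoff of this extra indirection is that dnode_move_impl() can retarget every descendent dbuf with one store: dbufs reference the dnode_handle_t, never the dnode_t directly, so updating dnh_dnode moves them all at once. A minimal illustration with invented mock types:

#include <assert.h>

typedef struct node { int payload; } node_t;
typedef struct node_handle { node_t *nh_node; } node_handle_t;
typedef struct ref { node_handle_t *r_handle; } ref_t;  /* like a dbuf */

static void
handle_retarget(node_handle_t *h, node_t *newnode)
{
        h->nh_node = newnode;   /* every ref now sees the new address */
}

int
main(void)
{
        node_t oldn = { 7 }, newn = { 7 };      /* newn: relocated copy */
        node_handle_t h = { &oldn };
        ref_t a = { &h }, b = { &h };

        handle_retarget(&h, &newn);
        assert(a.r_handle->nh_node == &newn);
        assert(b.r_handle->nh_node == &newn);
        return (0);
}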
*/ + zrlock_t dnh_zrlock; + dnode_t *dnh_dnode; +} dnode_handle_t; + +typedef struct dnode_children { + size_t dnc_count; /* number of children */ + dnode_handle_t dnc_children[1]; /* sized dynamically */ +} dnode_children_t; + typedef struct free_range { avl_node_t fr_node; uint64_t fr_blkid; @@ -227,8 +252,8 @@ typedef struct free_range { } free_range_t; dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp, - uint64_t object); -void dnode_special_close(dnode_t *dn); + uint64_t object, dnode_handle_t *dnh); +void dnode_special_close(dnode_handle_t *dnh); void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx); void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx); diff --git a/usr/src/uts/common/fs/zfs/sys/refcount.h b/usr/src/uts/common/fs/zfs/sys/refcount.h index bc3ade80f1..1752c64e3e 100644 --- a/usr/src/uts/common/fs/zfs/sys/refcount.h +++ b/usr/src/uts/common/fs/zfs/sys/refcount.h @@ -40,7 +40,7 @@ extern "C" { */ #define FTAG ((char *)__func__) -#if defined(DEBUG) || !defined(_KERNEL) +#ifdef ZFS_DEBUG typedef struct reference { list_node_t ref_link; void *ref_holder; @@ -67,11 +67,12 @@ int64_t refcount_add(refcount_t *rc, void *holder_tag); int64_t refcount_remove(refcount_t *rc, void *holder_tag); int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag); int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag); +void refcount_transfer(refcount_t *dst, refcount_t *src); void refcount_init(void); void refcount_fini(void); -#else /* DEBUG */ +#else /* ZFS_DEBUG */ typedef struct refcount { uint64_t rc_count; @@ -97,7 +98,7 @@ typedef struct refcount { #define refcount_init() #define refcount_fini() -#endif /* DEBUG */ +#endif /* ZFS_DEBUG */ #ifdef __cplusplus } diff --git a/usr/src/uts/common/fs/zfs/sys/sa_impl.h b/usr/src/uts/common/fs/zfs/sys/sa_impl.h index 62497e7025..6661e47cfc 100644 --- a/usr/src/uts/common/fs/zfs/sys/sa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/sa_impl.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_SA_IMPL_H @@ -232,7 +231,7 @@ struct sa_handle { ((a == DMU_OT_SA) ? B_TRUE : B_FALSE) #define SA_BONUSTYPE_FROM_DB(db) \ - (((dmu_buf_impl_t *)db)->db_dnode->dn_bonustype) + (dmu_get_bonustype((dmu_buf_t *)db)) #define SA_BLKPTR_SPACE (DN_MAX_BONUSLEN - sizeof (blkptr_t)) diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h index 4781ee6862..e5257b89e5 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_FS_ZFS_ZNODE_H @@ -188,6 +187,7 @@ typedef struct znode { uint8_t z_unlinked; /* file has been unlinked */ uint8_t z_atime_dirty; /* atime needs to be synced */ uint8_t z_zn_prefetch; /* Prefetch znodes? */ + uint8_t z_moved; /* Has this znode been moved? 
*/ uint_t z_blksz; /* block size in bytes */ uint_t z_seq; /* modification sequence number */ uint64_t z_mapcnt; /* number of pages mapped to file */ diff --git a/usr/src/uts/common/fs/zfs/sys/zrlock.h b/usr/src/uts/common/fs/zfs/sys/zrlock.h new file mode 100644 index 0000000000..dcd63f7b5b --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zrlock.h @@ -0,0 +1,66 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_ZRLOCK_H +#define _SYS_ZRLOCK_H + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct zrlock { + kmutex_t zr_mtx; + volatile int32_t zr_refcount; + kcondvar_t zr_cv; + uint16_t zr_pad; +#ifdef ZFS_DEBUG + kthread_t *zr_owner; + const char *zr_caller; +#endif +} zrlock_t; + +extern void zrl_init(zrlock_t *); +extern void zrl_destroy(zrlock_t *); +#ifdef ZFS_DEBUG +#define zrl_add(_z) zrl_add_debug((_z), __func__) +extern void zrl_add_debug(zrlock_t *, const char *); +#else +extern void zrl_add(zrlock_t *); +#endif +extern void zrl_remove(zrlock_t *); +extern int zrl_tryenter(zrlock_t *); +extern void zrl_exit(zrlock_t *); +extern int zrl_is_zero(zrlock_t *); +extern int zrl_is_locked(zrlock_t *); +#ifdef ZFS_DEBUG +extern kthread_t *zrl_owner(zrlock_t *); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZRLOCK_H */ diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c index 3c04360a13..f6558f60a5 100644 --- a/usr/src/uts/common/fs/zfs/zfs_znode.c +++ b/usr/src/uts/common/fs/zfs/zfs_znode.c @@ -81,9 +81,6 @@ #define ZNODE_STAT_ADD(stat) /* nothing */ #endif /* ZNODE_STATS */ -#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3)) -#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1)) - /* * Functions needed for userland (ie: libzpool) are not put under * #ifdef_KERNEL; the rest of the functions have dependencies @@ -136,6 +133,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) zp->z_dirlocks = NULL; zp->z_acl_cached = NULL; + zp->z_moved = 0; return (0); } @@ -228,6 +226,12 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) */ ozp->z_sa_hdl = NULL; POINTER_INVALIDATE(&ozp->z_zfsvfs); + + /* + * Mark the znode. 
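The marking step just below is purely diagnostic, but the values are chosen so a crash dump can tell the cases apart: 0 means never moved, 1 marks a move destination, and (uint8_t)-1 (0xff) brands an evacuated source, so a stale pointer into the old buffer is self-incriminating. A tiny self-contained sketch of the same bookkeeping (names invented):

#include <assert.h>
#include <stdint.h>

#define NEVER_MOVED     0
#define MOVE_DST        1
#define MOVE_SRC        ((uint8_t)-1)   /* 0xff: evacuated buffer */

struct marked { uint8_t moved; };

static void
mark_move(struct marked *src, struct marked *dst)
{
        dst->moved = MOVE_DST;
        src->moved = MOVE_SRC;
}

int
main(void)
{
        struct marked src = { NEVER_MOVED }, dst = { NEVER_MOVED };

        mark_move(&src, &dst);
        assert(src.moved == 0xff && dst.moved == 1);
        return (0);
}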
+ */ + nzp->z_moved = 1; + ozp->z_moved = (uint8_t)-1; } /*ARGSUSED*/ @@ -478,6 +482,8 @@ zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) vattr.va_gid = crgetgid(kcred); sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP); + ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs)); + sharezp->z_moved = 0; sharezp->z_unlinked = 0; sharezp->z_atime_dirty = 0; sharezp->z_zfsvfs = zfsvfs; @@ -627,6 +633,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, ASSERT(zp->z_dirlocks == NULL); ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); + zp->z_moved = 0; /* * Defer setting z_zfsvfs until the znode is ready to be a candidate for @@ -759,7 +766,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, { uint64_t crtime[2], atime[2], mtime[2], ctime[2]; uint64_t mode, size, links, parent, pflags; - uint64_t dzp_pflags = 0; + uint64_t dzp_pflags = 0; uint64_t rdev = 0; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; dmu_buf_t *db; @@ -794,7 +801,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, */ /* * There's currently no mechanism for pre-reading the blocks that will - * be to needed allocate a new object, so we accept the small chance + * be needed to allocate a new object, so we accept the small chance * that there will be an i/o error and we will fail one of the * assertions below. */ @@ -1807,6 +1814,8 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) vattr.va_gid = crgetgid(cr); rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); + ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); + rootzp->z_moved = 0; rootzp->z_unlinked = 0; rootzp->z_atime_dirty = 0; rootzp->z_is_sa = USE_SA(version, os); @@ -1843,7 +1852,6 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); - ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); rootzp->z_zfsvfs = &zfsvfs; VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, cr, NULL, &acl_ids)); diff --git a/usr/src/uts/common/fs/zfs/zrlock.c b/usr/src/uts/common/fs/zfs/zrlock.c new file mode 100644 index 0000000000..ec94b08555 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zrlock.c @@ -0,0 +1,194 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * A Zero Reference Lock (ZRL) is a reference count that can lock out new + * references only when the count is zero and only without waiting if the count + * is not already zero. 
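That zero-or-locked rule compresses to a few lines of compare-and-swap. A user-land approximation with C11 atomics (mock names throughout; the kernel zrl_add() below additionally sleeps on zr_cv under zr_mtx once the fast path fails instead of spinning, and tracks zr_owner under ZFS_DEBUG):

#include <stdatomic.h>

#define MYZRL_LOCKED    (-1)            /* mirrors ZRL_LOCKED */

typedef struct myzrl { atomic_int zr_refcount; } myzrl_t;

static void
myzrl_add(myzrl_t *z)                   /* reader: always succeeds eventually */
{
        int n = atomic_load(&z->zr_refcount);

        for (;;) {
                while (n == MYZRL_LOCKED)       /* kernel version sleeps here */
                        n = atomic_load(&z->zr_refcount);
                if (atomic_compare_exchange_weak(&z->zr_refcount, &n, n + 1))
                        return;
                /* a failed CAS reloaded n; retry */
        }
}

static void
myzrl_remove(myzrl_t *z)
{
        atomic_fetch_sub(&z->zr_refcount, 1);
}

static int
myzrl_tryenter(myzrl_t *z)              /* "writer": only succeeds at zero */
{
        int zero = 0;

        return (atomic_compare_exchange_strong(&z->zr_refcount, &zero,
            MYZRL_LOCKED));
}

static void
myzrl_exit(myzrl_t *z)
{
        atomic_store(&z->zr_refcount, 0);
}

Because ZRL_LOCKED is a sentinel count rather than a separate flag word, readers and the would-be mover contend on a single word, and the "no blocking writer" property falls out for free: there is no wait bit for a writer to set, which is why zrl_exit() in the kernel version can simply reset the count and broadcast.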
It is similar to a read-write lock in that it allows + * multiple readers and only a single writer, but it does not allow a writer to + * block while waiting for readers to exit, and therefore the question of + * reader/writer priority is moot (no WRWANT bit). Since the equivalent of + * rw_enter(&lock, RW_WRITER) is disallowed and only tryenter() is allowed, it + * is perfectly safe for the same reader to acquire the same lock multiple + * times. The fact that a ZRL is reentrant for readers (through multiple calls + * to zrl_add()) makes it convenient for determining whether something is + * actively referenced without the fuss of flagging lock ownership across + * function calls. + */ +#include <sys/zrlock.h> + +/* + * A ZRL can be locked only while there are zero references, so ZRL_LOCKED is + * treated as zero references. + */ +#define ZRL_LOCKED ((uint32_t)-1) +#define ZRL_DESTROYED -2 + +void +zrl_init(zrlock_t *zrl) +{ + mutex_init(&zrl->zr_mtx, NULL, MUTEX_DEFAULT, NULL); + zrl->zr_refcount = 0; + cv_init(&zrl->zr_cv, NULL, CV_DEFAULT, NULL); +#ifdef ZFS_DEBUG + zrl->zr_owner = NULL; + zrl->zr_caller = NULL; +#endif +} + +void +zrl_destroy(zrlock_t *zrl) +{ + ASSERT(zrl->zr_refcount == 0); + + mutex_destroy(&zrl->zr_mtx); + zrl->zr_refcount = ZRL_DESTROYED; + cv_destroy(&zrl->zr_cv); +} + +void +#ifdef ZFS_DEBUG +zrl_add_debug(zrlock_t *zrl, const char *zc) +#else +zrl_add(zrlock_t *zrl) +#endif +{ + uint32_t n = (uint32_t)zrl->zr_refcount; + + while (n != ZRL_LOCKED) { + uint32_t cas = atomic_cas_32( + (uint32_t *)&zrl->zr_refcount, n, n + 1); + if (cas == n) { + ASSERT((int32_t)n >= 0); +#ifdef ZFS_DEBUG + if (zrl->zr_owner == curthread) { + DTRACE_PROBE2(zrlock__reentry, + zrlock_t *, zrl, uint32_t, n); + } + zrl->zr_owner = curthread; + zrl->zr_caller = zc; +#endif + return; + } + n = cas; + } + + mutex_enter(&zrl->zr_mtx); + while (zrl->zr_refcount == ZRL_LOCKED) { + cv_wait(&zrl->zr_cv, &zrl->zr_mtx); + } + ASSERT(zrl->zr_refcount >= 0); + zrl->zr_refcount++; +#ifdef ZFS_DEBUG + zrl->zr_owner = curthread; + zrl->zr_caller = zc; +#endif + mutex_exit(&zrl->zr_mtx); +} + +void +zrl_remove(zrlock_t *zrl) +{ + uint32_t n; + + n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount); + ASSERT((int32_t)n >= 0); +#ifdef ZFS_DEBUG + if (zrl->zr_owner == curthread) { + zrl->zr_owner = NULL; + zrl->zr_caller = NULL; + } +#endif +} + +int +zrl_tryenter(zrlock_t *zrl) +{ + uint32_t n = (uint32_t)zrl->zr_refcount; + + if (n == 0) { + uint32_t cas = atomic_cas_32( + (uint32_t *)&zrl->zr_refcount, 0, ZRL_LOCKED); + if (cas == 0) { +#ifdef ZFS_DEBUG + ASSERT(zrl->zr_owner == NULL); + zrl->zr_owner = curthread; +#endif + return (1); + } + } + + ASSERT((int32_t)n > ZRL_DESTROYED); + + return (0); +} + +void +zrl_exit(zrlock_t *zrl) +{ + ASSERT(zrl->zr_refcount == ZRL_LOCKED); + + mutex_enter(&zrl->zr_mtx); +#ifdef ZFS_DEBUG + ASSERT(zrl->zr_owner == curthread); + zrl->zr_owner = NULL; + membar_producer(); /* make sure the owner store happens first */ +#endif + zrl->zr_refcount = 0; + cv_broadcast(&zrl->zr_cv); + mutex_exit(&zrl->zr_mtx); +} + +int +zrl_refcount(zrlock_t *zrl) +{ + ASSERT(zrl->zr_refcount > ZRL_DESTROYED); + + int n = (int)zrl->zr_refcount; + return (n <= 0 ? 
0 : n); } + +int +zrl_is_zero(zrlock_t *zrl) +{ + ASSERT(zrl->zr_refcount > ZRL_DESTROYED); + + return (zrl->zr_refcount <= 0); +} + +int +zrl_is_locked(zrlock_t *zrl) +{ + ASSERT(zrl->zr_refcount > ZRL_DESTROYED); + + return (zrl->zr_refcount == ZRL_LOCKED); +} + +#ifdef ZFS_DEBUG +kthread_t * +zrl_owner(zrlock_t *zrl) +{ + return (zrl->zr_owner); +} +#endif diff --git a/usr/src/uts/common/sys/dnlc.h b/usr/src/uts/common/sys/dnlc.h index c58de9c011..070506ee31 100644 --- a/usr/src/uts/common/sys/dnlc.h +++ b/usr/src/uts/common/sys/dnlc.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -39,8 +38,6 @@ #ifndef _SYS_DNLC_H #define _SYS_DNLC_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -163,7 +160,8 @@ struct nc_stats { */ #define DNLCHASH(name, dvp, hash, namlen) \ { \ - char Xc, *Xcp; \ + char Xc; \ + const char *Xcp; \ hash = (int)((uintptr_t)(dvp)) >> 8; \ for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++) \ (hash) = ((hash) << 4) + (hash) + Xc; \ @@ -181,13 +179,13 @@ extern vnode_t negative_cache_vnode; #define DNLC_NO_VNODE &negative_cache_vnode void dnlc_init(void); -void dnlc_enter(vnode_t *, char *, vnode_t *); -void dnlc_update(vnode_t *, char *, vnode_t *); -vnode_t *dnlc_lookup(vnode_t *, char *); +void dnlc_enter(vnode_t *, const char *, vnode_t *); +void dnlc_update(vnode_t *, const char *, vnode_t *); +vnode_t *dnlc_lookup(vnode_t *, const char *); void dnlc_purge(void); void dnlc_purge_vp(vnode_t *); int dnlc_purge_vfsp(vfs_t *, int); -void dnlc_remove(vnode_t *, char *); +void dnlc_remove(vnode_t *, const char *); int dnlc_fs_purge1(struct vnodeops *); vnode_t *dnlc_reverse_lookup(vnode_t *, char *, size_t); void dnlc_reduce_cache(void *); @@ -296,7 +294,7 @@ dcret_t dnlc_dir_start(dcanchor_t *dcap, uint_t num_entries); * For example, "handle" for ufs holds the inumber and a directory * entry offset. Returns DOK, DNOCACHE, DTOOBIG. */ -dcret_t dnlc_dir_add_entry(dcanchor_t *dcap, char *name, uint64_t handle); +dcret_t dnlc_dir_add_entry(dcanchor_t *dcap, const char *name, uint64_t handle); /* * dnlc_dir_add_space adds free space (length and file system specific @@ -322,21 +320,22 @@ void dnlc_dir_purge(dcanchor_t *dcap); * and returns the file system handle specified on dnlc_dir_add_entry() * in "handlep". Returns DFOUND, DNOENT, DNOCACHE. */ -dcret_t dnlc_dir_lookup(dcanchor_t *dcap, char *name, uint64_t *handlep); +dcret_t dnlc_dir_lookup(dcanchor_t *dcap, const char *name, uint64_t *handlep); /* * dnlc_dir_update() amends the handle for an entry in a directory cache * "handle" is the new file system specific handle for the file "name". * Returns DFOUND, DNOENT, DNOCACHE. */ -dcret_t dnlc_dir_update(dcanchor_t *dcap, char *name, uint64_t handle); +dcret_t dnlc_dir_update(dcanchor_t *dcap, const char *name, uint64_t handle); /* * dnlc_dir_rem_entry() removes an entry from a directory cache. * Returns the handle if "handlep" non null. * Returns DFOUND, DNOENT, DNOCACHE.
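The constification running through these dnlc changes starts at the hash walks: once DNLCHASH and DNLC_DIR_HASH scan the name through a const char * cursor, every entry point that only reads the name can advertise const char * and callers no longer cast away qualifiers. A stand-alone rendition of the same hash recurrence (hash = hash * 17 + c, seeded from the directory pointer as in DNLCHASH; the function name is hypothetical):

#include <stdint.h>

static int
name_hash(const void *dvp, const char *name)
{
        int hash = (int)((uintptr_t)dvp) >> 8;
        char c;
        const char *cp;                 /* const cursor, as in the macro */

        for (cp = name; (c = *cp) != 0; cp++)
                hash = (hash << 4) + hash + c;  /* hash = hash * 17 + c */
        return (hash);
}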
*/ -dcret_t dnlc_dir_rem_entry(dcanchor_t *dcap, char *name, uint64_t *handlep); +dcret_t dnlc_dir_rem_entry(dcanchor_t *dcap, const char *name, + uint64_t *handlep); /* * dnlc_dir_rem_space_by_len() looks up and returns free space in a diff --git a/usr/src/uts/common/sys/kmem.h b/usr/src/uts/common/sys/kmem.h index 3a37f63fa2..03d4d24ba3 100644 --- a/usr/src/uts/common/sys/kmem.h +++ b/usr/src/uts/common/sys/kmem.h @@ -95,6 +95,15 @@ typedef enum kmem_cbrc { #ifdef _KERNEL +/* + * Helps clients implementing the move() callback to recognize known objects by + * testing a client-designated pointer member. Takes advantage of the fact that + * any scribbling to freed memory done by kmem is guaranteed to set one of the + * two low order bits. + */ +#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3)) +#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1)) + extern int kmem_ready; extern pgcnt_t kmem_reapahead; diff --git a/usr/src/uts/intel/io/dktp/dcdev/dadk.c b/usr/src/uts/intel/io/dktp/dcdev/dadk.c index 12bf34b65d..3fd4477fd1 100644 --- a/usr/src/uts/intel/io/dktp/dcdev/dadk.c +++ b/usr/src/uts/intel/io/dktp/dcdev/dadk.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. */ /* @@ -868,11 +867,7 @@ dadk_ioctl(opaque_t objp, dev_t dev, int cmd, intptr_t arg, int flag, sizeof (struct dk_callback), KM_SLEEP); bcopy(dkc, dkc2, sizeof (*dkc2)); - /* - * Borrow b_list to carry private data - * to the b_iodone func. - */ - bp->b_list = (struct buf *)dkc2; + bp->b_private = dkc2; bp->b_iodone = dadk_flushdone; is_sync = 0; } @@ -988,7 +983,7 @@ dadk_ioctl(opaque_t objp, dev_t dev, int cmd, intptr_t arg, int flag, int dadk_flushdone(struct buf *bp) { - struct dk_callback *dkc = (struct dk_callback *)bp->b_list; + struct dk_callback *dkc = bp->b_private; ASSERT(dkc != NULL && dkc->dkc_callback != NULL); diff --git a/usr/src/uts/sun/io/dada/targets/dad.c b/usr/src/uts/sun/io/dada/targets/dad.c index 1d71904da5..b3be5f3ea5 100644 --- a/usr/src/uts/sun/io/dada/targets/dad.c +++ b/usr/src/uts/sun/io/dada/targets/dad.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. */ @@ -3465,6 +3464,7 @@ dcdioctl(dev_t dev, int cmd, intptr_t arg, int flag, bp->b_un.b_addr = 0; bp->b_iodone = NULL; bp->b_list = NULL; + bp->b_private = NULL; if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback != NULL) { @@ -3472,7 +3472,7 @@ dcdioctl(dev_t dev, int cmd, intptr_t arg, int flag, kmem_zalloc(sizeof (*dkc2), KM_SLEEP); bcopy(dkc, dkc2, sizeof (*dkc2)); - bp->b_list = (struct buf *)dkc2; + bp->b_private = dkc2; bp->b_iodone = dcdflushdone; is_sync = 0; } @@ -3500,7 +3500,7 @@ dcdflushdone(struct buf *bp) struct dcd_disk *un = ddi_get_soft_state(dcd_state, DCDUNIT(bp->b_edev)); struct dcd_pkt *pkt = BP_PKT(bp); - struct dk_callback *dkc = (struct dk_callback *)bp->b_list; + struct dk_callback *dkc = bp->b_private; ASSERT(un != NULL); ASSERT(bp == un->un_sbufp); @@ -3514,7 +3514,7 @@ dcdflushdone(struct buf *bp) (*dkc->dkc_callback)(dkc->dkc_cookie, geterror(bp)); kmem_free(dkc, sizeof (*dkc)); bp->b_iodone = NULL; - bp->b_list = NULL; + bp->b_private = NULL; } /* |
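The dadk and dcd changes close the section with a small but telling cleanup: the flush callback used to be smuggled through bp->b_list, a field whose real job is linking bufs, and now rides in b_private, which exists for exactly this kind of driver-private data. A user-land sketch of the resulting pattern (simplified mock structs; the real drivers allocate with kmem_zalloc and report geterror(bp) to the callback):

#include <stdlib.h>
#include <string.h>

struct dk_callback {
        void (*dkc_callback)(void *, int);
        void *dkc_cookie;
};

struct buf {
        void *b_private;                /* driver-private data */
        int (*b_iodone)(struct buf *);
};

static int
flushdone(struct buf *bp)
{
        struct dk_callback *dkc = bp->b_private;

        (*dkc->dkc_callback)(dkc->dkc_cookie, 0);
        free(dkc);
        bp->b_private = NULL;
        return (0);
}

static void
issue_flush(struct buf *bp, const struct dk_callback *dkc)
{
        struct dk_callback *copy = malloc(sizeof (*copy));

        if (copy == NULL)
                return;
        memcpy(copy, dkc, sizeof (*copy));
        bp->b_private = copy;           /* survives until b_iodone runs */
        bp->b_iodone = flushdone;
}

Copying the caller's dk_callback before the I/O is issued is what makes the asynchronous case safe: the original may live on the caller's stack, while the copy in b_private lives until flushdone() frees it.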