summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/fs
diff options
context:
space:
mode:
author: maybee <none@none> 2007-08-26 11:19:04 -0700
committer: maybee <none@none> 2007-08-26 11:19:04 -0700
commit: 1934e92fc930c49429ad71a8ca97340f33227e78 (patch)
tree: 80cf4c2e87b71ea83603ec7087e5c7d837b21df7 /usr/src/uts/common/fs
parent: f2cd0f027a2c872a297e602d646cc147ce718534 (diff)
download: illumos-gate-1934e92fc930c49429ad71a8ca97340f33227e78.tar.gz
6569719 panic dangling dbufs (dn=ffffffff28814d30, dbuf=ffffffff20756008)
6573361 panic turnstile_block, unowned mutex
6584864 $MOS is not properly bounded by pool size
6585265 need bonus resize interface
6587723 BAD TRAP: type=e (#pf Page fault) occurred in module "zfs" due to a NULL pointer dereference
6589799 dangling dbuf after zinject
6594025 panic: dangling dbufs during shutdown
Diffstat (limited to 'usr/src/uts/common/fs')
-rw-r--r--  usr/src/uts/common/fs/zfs/dbuf.c            97
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu.c             38
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_objset.c      21
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_send.c         6
-rw-r--r--  usr/src/uts/common/fs/zfs/dnode.c           86
-rw-r--r--  usr/src/uts/common/fs/zfs/dnode_sync.c      34
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_dir.c         18
-rw-r--r--  usr/src/uts/common/fs/zfs/metaslab.c         4
-rw-r--r--  usr/src/uts/common/fs/zfs/spa_misc.c         7
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dbuf.h         2
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu.h          3
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu_objset.h   2
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dnode.h        7
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev.c            12
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_vfsops.c       7
15 files changed, 176 insertions, 168 deletions
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index 3ee7e0b21e..e55bb059b4 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -307,7 +307,7 @@ dbuf_verify(dmu_buf_impl_t *db)
}
if (db->db_blkid == DB_BONUS_BLKID) {
ASSERT(dn != NULL);
- ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
+ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
} else {
ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
@@ -468,13 +468,15 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
ASSERT(db->db_buf == NULL);
if (db->db_blkid == DB_BONUS_BLKID) {
- ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size);
+ int bonuslen = db->db_dnode->dn_bonuslen;
+
+ ASSERT3U(bonuslen, <=, db->db.db_size);
db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
arc_space_consume(DN_MAX_BONUSLEN);
- if (db->db.db_size < DN_MAX_BONUSLEN)
+ if (bonuslen < DN_MAX_BONUSLEN)
bzero(db->db.db_data, DN_MAX_BONUSLEN);
bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data,
- db->db.db_size);
+ bonuslen);
dbuf_update_data(db);
db->db_state = DB_CACHED;
mutex_exit(&db->db_mtx);
@@ -781,31 +783,28 @@ dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
}
static int
-dbuf_new_block(dmu_buf_impl_t *db)
+dbuf_block_freeable(dmu_buf_impl_t *db)
{
dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
uint64_t birth_txg = 0;
- /* Don't count meta-objects */
- if (ds == NULL)
- return (FALSE);
-
/*
* We don't need any locking to protect db_blkptr:
* If it's syncing, then db_last_dirty will be set
* so we'll ignore db_blkptr.
*/
ASSERT(MUTEX_HELD(&db->db_mtx));
- /* If we have been dirtied since the last snapshot, its not new */
if (db->db_last_dirty)
birth_txg = db->db_last_dirty->dr_txg;
else if (db->db_blkptr)
birth_txg = db->db_blkptr->blk_birth;
+ /* If we don't exist or are in a snapshot, we can't be freed */
if (birth_txg)
- return (!dsl_dataset_block_freeable(ds, birth_txg));
+ return (ds == NULL ||
+ dsl_dataset_block_freeable(ds, birth_txg));
else
- return (TRUE);
+ return (FALSE);
}
void
@@ -964,6 +963,27 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+ if (db->db_blkid != DB_BONUS_BLKID) {
+ /*
+ * Update the accounting.
+ */
+ if (dbuf_block_freeable(db)) {
+ blkptr_t *bp = db->db_blkptr;
+ int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
+ bp_get_dasize(os->os_spa, bp) : db->db.db_size;
+ /*
+ * This is only a guess -- if the dbuf is dirty
+ * in a previous txg, we don't know how much
+ * space it will use on disk yet. We should
+ * really have the struct_rwlock to access
+ * db_blkptr, but since this is just a guess,
+ * it's OK if we get an odd answer.
+ */
+ dnode_willuse_space(dn, -willfree, tx);
+ }
+ dnode_willuse_space(dn, db->db.db_size, tx);
+ }
+
/*
* If this buffer is dirty in an old transaction group we need
* to make a copy of it so that the changes we make in this
@@ -1013,25 +1033,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
db->db_freed_in_flight = FALSE;
}
- if (db->db_blkid != DB_BONUS_BLKID) {
- /*
- * Update the accounting.
- */
- if (!dbuf_new_block(db) && db->db_blkptr) {
- /*
- * This is only a guess -- if the dbuf is dirty
- * in a previous txg, we don't know how much
- * space it will use on disk yet. We should
- * really have the struct_rwlock to access
- * db_blkptr, but since this is just a guess,
- * it's OK if we get an odd answer.
- */
- dnode_willuse_space(dn,
- -bp_get_dasize(os->os_spa, db->db_blkptr), tx);
- }
- dnode_willuse_space(dn, db->db.db_size, tx);
- }
-
/*
* This buffer is now part of this txg
*/
@@ -1297,6 +1298,7 @@ dbuf_clear(dmu_buf_impl_t *db)
if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
list_remove(&dn->dn_dbufs, db);
dnode_rele(dn, db);
+ db->db_dnode = NULL;
}
if (db->db_buf)
@@ -1397,7 +1399,9 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
if (blkid == DB_BONUS_BLKID) {
ASSERT3P(parent, ==, dn->dn_dbuf);
- db->db.db_size = dn->dn_bonuslen;
+ db->db.db_size = DN_MAX_BONUSLEN -
+ (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
db->db.db_offset = DB_BONUS_BLKID;
db->db_state = DB_UNCACHED;
/* the bonus dbuf is not placed in the hash table */
@@ -1471,29 +1475,23 @@ dbuf_destroy(dmu_buf_impl_t *db)
ASSERT(refcount_is_zero(&db->db_holds));
if (db->db_blkid != DB_BONUS_BLKID) {
- dnode_t *dn = db->db_dnode;
- boolean_t need_mutex = !MUTEX_HELD(&dn->dn_dbufs_mtx);
-
- if (need_mutex)
- mutex_enter(&dn->dn_dbufs_mtx);
-
/*
* If this dbuf is still on the dn_dbufs list,
* remove it from that list.
*/
- if (list_link_active(&db->db_link)) {
- ASSERT(need_mutex);
+ if (db->db_dnode) {
+ dnode_t *dn = db->db_dnode;
+
+ mutex_enter(&dn->dn_dbufs_mtx);
list_remove(&dn->dn_dbufs, db);
mutex_exit(&dn->dn_dbufs_mtx);
dnode_rele(dn, db);
- } else if (need_mutex) {
- mutex_exit(&dn->dn_dbufs_mtx);
+ db->db_dnode = NULL;
}
dbuf_hash_remove(db);
}
db->db_parent = NULL;
- db->db_dnode = NULL;
db->db_buf = NULL;
ASSERT(!list_link_active(&db->db_link));
@@ -1662,16 +1660,13 @@ dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
return (err ? NULL : db);
}
-dmu_buf_impl_t *
+void
dbuf_create_bonus(dnode_t *dn)
{
- dmu_buf_impl_t *db = dn->dn_bonus;
-
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
ASSERT(dn->dn_bonus == NULL);
- db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
- return (db);
+ dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
}
#pragma weak dmu_buf_add_ref = dbuf_add_ref
@@ -1919,11 +1914,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
*/
if (db->db_blkid == DB_BONUS_BLKID) {
dbuf_dirty_record_t **drp;
- /*
- * Use dn_phys->dn_bonuslen since db.db_size is the length
- * of the bonus buffer in the open transaction rather than
- * the syncing transaction.
- */
+
ASSERT(*datap != NULL);
ASSERT3U(db->db_level, ==, 0);
ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 4fbba8ab92..b41458bd15 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -119,6 +119,19 @@ dmu_bonus_max(void)
return (DN_MAX_BONUSLEN);
}
+int
+dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx)
+{
+ dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+
+ if (dn->dn_bonus != (dmu_buf_impl_t *)db)
+ return (EINVAL);
+ if (newsize < 0 || newsize > db->db_size)
+ return (EINVAL);
+ dnode_setbonuslen(dn, newsize, tx);
+ return (0);
+}
+
/*
* returns ENOENT, EIO, or 0.
*/
@@ -126,27 +139,27 @@ int
dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
{
dnode_t *dn;
- int err, count;
dmu_buf_impl_t *db;
+ int error;
- err = dnode_hold(os->os, object, FTAG, &dn);
- if (err)
- return (err);
+ error = dnode_hold(os->os, object, FTAG, &dn);
+ if (error)
+ return (error);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_bonus == NULL) {
rw_exit(&dn->dn_struct_rwlock);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
if (dn->dn_bonus == NULL)
- dn->dn_bonus = dbuf_create_bonus(dn);
+ dbuf_create_bonus(dn);
}
db = dn->dn_bonus;
rw_exit(&dn->dn_struct_rwlock);
- mutex_enter(&db->db_mtx);
- count = refcount_add(&db->db_holds, tag);
- mutex_exit(&db->db_mtx);
- if (count == 1)
- dnode_add_ref(dn, db);
+
+ /* as long as the bonus buf is held, the dnode will be held */
+ if (refcount_add(&db->db_holds, tag) == 1)
+ VERIFY(dnode_add_ref(dn, db));
+
dnode_rele(dn, FTAG);
VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
@@ -388,7 +401,6 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
while (size > 0) {
uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
- int err;
/*
* NB: we could do this block-at-a-time, but it's nice
@@ -397,7 +409,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
TRUE, FTAG, &numbufs, &dbp);
if (err)
- return (err);
+ break;
for (i = 0; i < numbufs; i++) {
int tocpy;
@@ -418,7 +430,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_buf_rele_array(dbp, numbufs, FTAG);
}
dnode_rele(dn, FTAG);
- return (0);
+ return (err);
}
void
diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c
index fa03d16a15..eaaaf29223 100644
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c
@@ -309,7 +309,7 @@ dmu_objset_close(objset_t *os)
}
int
-dmu_objset_evict_dbufs(objset_t *os, boolean_t try)
+dmu_objset_evict_dbufs(objset_t *os)
{
objset_impl_t *osi = os->os;
dnode_t *dn;
@@ -327,34 +327,25 @@ dmu_objset_evict_dbufs(objset_t *os, boolean_t try)
* skip.
*/
for (dn = list_head(&osi->os_dnodes);
- dn && refcount_is_zero(&dn->dn_holds);
+ dn && !dnode_add_ref(dn, FTAG);
dn = list_next(&osi->os_dnodes, dn))
continue;
- if (dn)
- dnode_add_ref(dn, FTAG);
while (dn) {
dnode_t *next_dn = dn;
do {
next_dn = list_next(&osi->os_dnodes, next_dn);
- } while (next_dn && refcount_is_zero(&next_dn->dn_holds));
- if (next_dn)
- dnode_add_ref(next_dn, FTAG);
+ } while (next_dn && !dnode_add_ref(next_dn, FTAG));
mutex_exit(&osi->os_lock);
- if (dnode_evict_dbufs(dn, try)) {
- dnode_rele(dn, FTAG);
- if (next_dn)
- dnode_rele(next_dn, FTAG);
- return (1);
- }
+ dnode_evict_dbufs(dn);
dnode_rele(dn, FTAG);
mutex_enter(&osi->os_lock);
dn = next_dn;
}
mutex_exit(&osi->os_lock);
- return (0);
+ return (list_head(&osi->os_dnodes) != osi->os_meta_dnode);
}
void
@@ -383,7 +374,7 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg)
* nothing can be added to the list at this point.
*/
os.os = osi;
- (void) dmu_objset_evict_dbufs(&os, 0);
+ (void) dmu_objset_evict_dbufs(&os);
ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode);
ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode);
diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c
index 4107286fd4..d1b5cc1ecc 100644
--- a/usr/src/uts/common/fs/zfs/dmu_send.c
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c
@@ -610,13 +610,13 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
dmu_buf_will_dirty(db, tx);
- ASSERT3U(db->db_size, ==, drro->drr_bonuslen);
- data = restore_read(ra, P2ROUNDUP(db->db_size, 8));
+ ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
+ data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
if (data == NULL) {
dmu_tx_commit(tx);
return (ra->err);
}
- bcopy(data, db->db_data, db->db_size);
+ bcopy(data, db->db_data, drro->drr_bonuslen);
if (ra->byteswap) {
dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
drro->drr_bonuslen);
diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c
index 8e3ded7f94..cca94cfa50 100644
--- a/usr/src/uts/common/fs/zfs/dnode.c
+++ b/usr/src/uts/common/fs/zfs/dnode.c
@@ -240,6 +240,23 @@ free_range_compar(const void *node1, const void *node2)
else return (0);
}
+void
+dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
+{
+ ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
+
+ dnode_setdirty(dn, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
+ (dn->dn_nblkptr-1) * sizeof (blkptr_t));
+ dn->dn_bonuslen = newsize;
+ if (newsize == 0)
+ dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
+ else
+ dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
static void
dnode_setdblksz(dnode_t *dn, int size)
{
@@ -363,6 +380,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
for (i = 0; i < TXG_SIZE; i++) {
ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
+ ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
ASSERT3U(dn->dn_next_blksz[i], ==, 0);
ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
@@ -390,6 +408,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
dnode_setdirty(dn, tx);
dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
+ dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
}
@@ -397,7 +416,7 @@ void
dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
- int i;
+ int i, old_nblkptr;
dmu_buf_impl_t *db = NULL;
ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
@@ -414,7 +433,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
/* clean up any unreferenced dbufs */
- (void) dnode_evict_dbufs(dn, 0);
+ dnode_evict_dbufs(dn);
ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
/*
@@ -437,38 +456,18 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
}
dnode_setdblksz(dn, blocksize);
dnode_setdirty(dn, tx);
+ dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
rw_exit(&dn->dn_struct_rwlock);
- if (db) {
+ if (db)
dbuf_rele(db, FTAG);
- db = NULL;
- }
/* change type */
dn->dn_type = ot;
- if (dn->dn_bonuslen != bonuslen) {
- /* change bonus size */
- if (bonuslen == 0)
- bonuslen = 1; /* XXX */
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- if (dn->dn_bonus == NULL)
- dn->dn_bonus = dbuf_create_bonus(dn);
- db = dn->dn_bonus;
- rw_exit(&dn->dn_struct_rwlock);
- if (refcount_add(&db->db_holds, FTAG) == 1)
- dnode_add_ref(dn, db);
- VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
- mutex_enter(&db->db_mtx);
- ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
- ASSERT(db->db.db_data != NULL);
- db->db.db_size = bonuslen;
- mutex_exit(&db->db_mtx);
- (void) dbuf_dirty(db, tx);
- }
-
/* change bonus size and type */
mutex_enter(&dn->dn_mtx);
+ old_nblkptr = dn->dn_nblkptr;
dn->dn_bonustype = bonustype;
dn->dn_bonuslen = bonuslen;
dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
@@ -476,12 +475,15 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
dn->dn_compress = ZIO_COMPRESS_INHERIT;
ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
- /*
- * NB: we have to do the dbuf_rele after we've changed the
- * dn_bonuslen, for the sake of dbuf_verify().
- */
- if (db)
- dbuf_rele(db, FTAG);
+ /* XXX - for now, we can't make nblkptr smaller */
+ ASSERT3U(dn->dn_nblkptr, >=, old_nblkptr);
+
+ /* fix up the bonus db_size if dn_nblkptr has changed */
+ if (dn->dn_bonus && dn->dn_bonuslen != old_nblkptr) {
+ dn->dn_bonus->db.db_size =
+ DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+ ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
+ }
dn->dn_allocated_txg = tx->tx_txg;
mutex_exit(&dn->dn_mtx);
@@ -646,11 +648,22 @@ dnode_hold(objset_impl_t *os, uint64_t object, void *tag, dnode_t **dnp)
return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
}
-void
+/*
+ * Can only add a reference if there is already at least one
+ * reference on the dnode. Returns FALSE if unable to add a
+ * new reference.
+ */
+boolean_t
dnode_add_ref(dnode_t *dn, void *tag)
{
- ASSERT(refcount_count(&dn->dn_holds) > 0);
- (void) refcount_add(&dn->dn_holds, tag);
+ mutex_enter(&dn->dn_mtx);
+ if (refcount_is_zero(&dn->dn_holds)) {
+ mutex_exit(&dn->dn_mtx);
+ return (FALSE);
+ }
+ VERIFY(1 < refcount_add(&dn->dn_holds, tag));
+ mutex_exit(&dn->dn_mtx);
+ return (TRUE);
}
void
@@ -658,7 +671,9 @@ dnode_rele(dnode_t *dn, void *tag)
{
uint64_t refs;
+ mutex_enter(&dn->dn_mtx);
refs = refcount_remove(&dn->dn_holds, tag);
+ mutex_exit(&dn->dn_mtx);
/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
if (refs == 0 && dn->dn_dbuf)
dbuf_rele(dn->dn_dbuf, dn);
@@ -694,6 +709,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
ASSERT(dn->dn_datablksz != 0);
+ ASSERT3U(dn->dn_next_bonuslen[txg&TXG_MASK], ==, 0);
ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0);
dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
@@ -716,7 +732,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
* dnode will hang around after we finish processing its
* children.
*/
- dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg);
+ VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
(void) dbuf_dirty(dn->dn_dbuf, tx);
diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c
index 135adcfde6..8cc30cb811 100644
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c
@@ -349,8 +349,8 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
/*
* Try to kick all the dnodes dbufs out of the cache...
*/
-int
-dnode_evict_dbufs(dnode_t *dn, boolean_t try)
+void
+dnode_evict_dbufs(dnode_t *dn)
{
int progress;
int pass = 0;
@@ -397,21 +397,6 @@ dnode_evict_dbufs(dnode_t *dn, boolean_t try)
ASSERT(pass < 100); /* sanity check */
} while (progress);
- /*
- * This function works fine even if it can't evict everything.
- * If were only asked to try to evict everything then
- * return an error if we can't. Otherwise panic as the caller
- * expects total eviction.
- */
- if (list_head(&dn->dn_dbufs) != NULL) {
- if (try) {
- return (1);
- } else {
- panic("dangling dbufs (dn=%p, dbuf=%p)\n",
- dn, list_head(&dn->dn_dbufs));
- }
- }
-
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
mutex_enter(&dn->dn_bonus->db_mtx);
@@ -419,7 +404,6 @@ dnode_evict_dbufs(dnode_t *dn, boolean_t try)
dn->dn_bonus = NULL;
}
rw_exit(&dn->dn_struct_rwlock);
- return (0);
}
static void
@@ -459,7 +443,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
ASSERT(dmu_tx_is_syncing(tx));
dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
- (void) dnode_evict_dbufs(dn, 0);
+ dnode_evict_dbufs(dn);
ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
/*
@@ -565,6 +549,15 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dn->dn_next_blksz[txgoff] = 0;
}
+ if (dn->dn_next_bonuslen[txgoff]) {
+ if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN)
+ dnp->dn_bonuslen = 0;
+ else
+ dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff];
+ ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN);
+ dn->dn_next_bonuslen[txgoff] = 0;
+ }
+
if (dn->dn_next_indblkshift[txgoff]) {
ASSERT(dnp->dn_nlevels == 1);
dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
@@ -588,15 +581,16 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dnode_sync_free_range(dn,
rp->fr_blkid, rp->fr_nblks, tx);
}
+ /* grab the mutex so we don't race with dnode_block_freed() */
mutex_enter(&dn->dn_mtx);
for (rp = avl_first(&dn->dn_ranges[txgoff]); rp; ) {
+
free_range_t *last = rp;
rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp);
avl_remove(&dn->dn_ranges[txgoff], last);
kmem_free(last, sizeof (free_range_t));
}
mutex_exit(&dn->dn_mtx);
-
if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) {
dnode_sync_free(dn, tx);
return;
diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c
index ccb0d8d8bf..90c6ca4e15 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c
@@ -691,14 +691,13 @@ struct tempreserve {
* be canceled, using dsl_dir_tempreserve_clear().
*/
static int
-dsl_dir_tempreserve_impl(dsl_dir_t *dd,
- uint64_t asize, boolean_t netfree, list_t *tr_list, dmu_tx_t *tx)
+dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize,
+ boolean_t netfree, boolean_t noquota, list_t *tr_list, dmu_tx_t *tx)
{
uint64_t txg = tx->tx_txg;
uint64_t est_used, quota, parent_rsrv;
int edquot = EDQUOT;
int txgidx = txg & TXG_MASK;
- boolean_t ismos;
int i;
struct tempreserve *tr;
@@ -718,7 +717,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd,
* If this transaction will result in a net free of space, we want
* to let it through.
*/
- if (netfree || dd->dd_phys->dd_quota == 0)
+ if (netfree || noquota || dd->dd_phys->dd_quota == 0)
quota = UINT64_MAX;
else
quota = dd->dd_phys->dd_quota;
@@ -732,8 +731,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd,
* we're very close to full, this will allow a steady trickle of
* removes to get through.
*/
- ismos = (dd->dd_phys->dd_head_dataset_obj == 0);
- if (dd->dd_parent == NULL || ismos) {
+ if (dd->dd_parent == NULL) {
uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
if (poolsize < quota) {
quota = poolsize;
@@ -773,9 +771,11 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd,
list_insert_tail(tr_list, tr);
/* see if it's OK with our parent */
- if (dd->dd_parent && parent_rsrv && !ismos) {
+ if (dd->dd_parent && parent_rsrv) {
+ boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0);
+
return (dsl_dir_tempreserve_impl(dd->dd_parent,
- parent_rsrv, netfree, tr_list, tx));
+ parent_rsrv, netfree, ismos, tr_list, tx));
} else {
return (0);
}
@@ -800,7 +800,7 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize,
ASSERT3S(asize, >=, 0);
ASSERT3S(fsize, >=, 0);
- err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
+ err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, FALSE,
tr_list, tx);
if (err == 0) {
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 8be0fe18ff..4fcc6bfd79 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -534,8 +534,8 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
dmu_buf_will_dirty(db, tx);
- ASSERT3U(db->db_size, ==, sizeof (*smo));
- bcopy(smo, db->db_data, db->db_size);
+ ASSERT3U(db->db_size, >=, sizeof (*smo));
+ bcopy(smo, db->db_data, sizeof (*smo));
dmu_buf_rele(db, FTAG);
dmu_tx_commit(tx);
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index ec2cd64dc3..0106fc5cf4 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -1016,12 +1016,15 @@ bp_get_dasize(spa_t *spa, const blkptr_t *bp)
if (!spa->spa_deflate)
return (BP_GET_ASIZE(bp));
+ spa_config_enter(spa, RW_READER, FTAG);
for (i = 0; i < SPA_DVAS_PER_BP; i++) {
vdev_t *vd =
vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i]));
- sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >> SPA_MINBLOCKSHIFT) *
- vd->vdev_deflate_ratio;
+ if (vd)
+ sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >>
+ SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
}
+ spa_config_exit(spa, FTAG);
return (sz);
}
diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h
index d33657b9e6..9d5afdde86 100644
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h
@@ -239,7 +239,7 @@ typedef struct dbuf_hash_table {
uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
-dmu_buf_impl_t *dbuf_create_bonus(struct dnode *dn);
+void dbuf_create_bonus(struct dnode *dn);
dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
index 89a36b319e..6c9867df8a 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -157,7 +157,7 @@ void zfs_znode_byteswap(void *buf, size_t size);
int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
objset_t **osp);
void dmu_objset_close(objset_t *os);
-int dmu_objset_evict_dbufs(objset_t *os, boolean_t try);
+int dmu_objset_evict_dbufs(objset_t *os);
int dmu_objset_create(const char *name, dmu_objset_type_t type,
objset_t *clone_parent,
void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
@@ -294,6 +294,7 @@ int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb,
*/
int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
int dmu_bonus_max(void);
+int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
/*
* Obtain the DMU buffer from the specified object which contains the
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
index fbc1450b47..775a777eef 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
@@ -108,7 +108,7 @@ uint64_t dmu_objset_fsid_guid(objset_t *os);
int dmu_objset_find(char *name, int func(char *, void *), void *arg,
int flags);
void dmu_objset_byteswap(void *buf, size_t size);
-int dmu_objset_evict_dbufs(objset_t *os, boolean_t try);
+int dmu_objset_evict_dbufs(objset_t *os);
/* called from dsl */
void dmu_objset_sync(objset_impl_t *os, zio_t *zio, dmu_tx_t *tx);
diff --git a/usr/src/uts/common/fs/zfs/sys/dnode.h b/usr/src/uts/common/fs/zfs/sys/dnode.h
index 02f9de3f06..01b1afe557 100644
--- a/usr/src/uts/common/fs/zfs/sys/dnode.h
+++ b/usr/src/uts/common/fs/zfs/sys/dnode.h
@@ -64,6 +64,7 @@ extern "C" {
#define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT)
+#define DN_ZERO_BONUSLEN (DN_MAX_BONUSLEN + 1)
#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
@@ -156,6 +157,7 @@ typedef struct dnode {
uint64_t dn_maxblkid;
uint8_t dn_next_nlevels[TXG_SIZE];
uint8_t dn_next_indblkshift[TXG_SIZE];
+ uint16_t dn_next_bonuslen[TXG_SIZE];
uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */
/* protected by os_lock: */
@@ -197,11 +199,12 @@ dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp,
uint64_t object);
void dnode_special_close(dnode_t *dn);
+void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx);
int dnode_hold(struct objset_impl *dd, uint64_t object,
void *ref, dnode_t **dnp);
int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag,
void *ref, dnode_t **dnp);
-void dnode_add_ref(dnode_t *dn, void *ref);
+boolean_t dnode_add_ref(dnode_t *dn, void *ref);
void dnode_rele(dnode_t *dn, void *ref);
void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
@@ -226,7 +229,7 @@ void dnode_init(void);
void dnode_fini(void);
int dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *off, int minlvl,
uint64_t blkfill, uint64_t txg);
-int dnode_evict_dbufs(dnode_t *dn, boolean_t try);
+void dnode_evict_dbufs(dnode_t *dn);
#ifdef ZFS_DEBUG
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index ffcc7dcd00..4bcaaf368d 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -765,8 +765,8 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
error = dmu_bonus_hold(mos, object, FTAG, &db);
if (error)
return (error);
- ASSERT3U(db->db_size, ==, sizeof (smo));
- bcopy(db->db_data, &smo, db->db_size);
+ ASSERT3U(db->db_size, >=, sizeof (smo));
+ bcopy(db->db_data, &smo, sizeof (smo));
ASSERT3U(smo.smo_object, ==, object);
dmu_buf_rele(db, FTAG);
}
@@ -1234,8 +1234,8 @@ vdev_dtl_load(vdev_t *vd)
if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
return (error);
- ASSERT3U(db->db_size, ==, sizeof (*smo));
- bcopy(db->db_data, smo, db->db_size);
+ ASSERT3U(db->db_size, >=, sizeof (*smo));
+ bcopy(db->db_data, smo, sizeof (*smo));
dmu_buf_rele(db, FTAG);
mutex_enter(&vd->vdev_dtl_lock);
@@ -1305,8 +1305,8 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
dmu_buf_will_dirty(db, tx);
- ASSERT3U(db->db_size, ==, sizeof (*smo));
- bcopy(smo, db->db_data, db->db_size);
+ ASSERT3U(db->db_size, >=, sizeof (*smo));
+ bcopy(smo, db->db_data, sizeof (*smo));
dmu_buf_rele(db, FTAG);
dmu_tx_commit(tx);
diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
index 1b7661f485..c9723f4858 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
@@ -1132,12 +1132,9 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
}
/*
- * Evict all dbufs so that cached znodes will be freed
+ * Evict cached data
*/
- if (dmu_objset_evict_dbufs(os, B_TRUE)) {
- txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
- (void) dmu_objset_evict_dbufs(os, B_FALSE);
- }
+ (void) dmu_objset_evict_dbufs(os);
/*
* Finally close the objset