summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Paul Dagnelie <pcd@delphix.com> 2016-09-20 10:02:29 -0700
committer Prakash Surya <prakash.surya@delphix.com> 2018-06-19 12:28:21 -0700
commit 738e2a3ce3b2579222d6855e7fe75b5bcfcddf8d (patch)
tree 96d8cf01f46a0b50aae9d7cd9f51da778de7fe59
parent 0dd498c0020c4a0dd34d350b07f97399a475d102 (diff)
downloadillumos-joyent-738e2a3ce3b2579222d6855e7fe75b5bcfcddf8d.tar.gz
9438 Holes can lose birth time info if a block has a mix of birth times
Reviewed by: Matt Ahrens <matt@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
-rw-r--r-- usr/src/uts/common/fs/zfs/dmu_object.c 4
-rw-r--r-- usr/src/uts/common/fs/zfs/dnode.c 68
-rw-r--r-- usr/src/uts/common/fs/zfs/dnode_sync.c 54
-rw-r--r-- usr/src/uts/common/fs/zfs/zfs_znode.c 3
4 files changed, 104 insertions, 25 deletions
diff --git a/usr/src/uts/common/fs/zfs/dmu_object.c b/usr/src/uts/common/fs/zfs/dmu_object.c
index aede315502..b853081e8b 100644
--- a/usr/src/uts/common/fs/zfs/dmu_object.c
+++ b/usr/src/uts/common/fs/zfs/dmu_object.c
@@ -167,6 +167,10 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
return (err);
ASSERT(dn->dn_type != DMU_OT_NONE);
+ /*
+ * If we don't create this free range, we'll leak indirect blocks when
+ * we get to freeing the dnode in syncing context.
+ */
dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
dnode_free(dn, tx);
dnode_rele(dn, FTAG);
diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c
index aa00acec8e..53a530da0a 100644
--- a/usr/src/uts/common/fs/zfs/dnode.c
+++ b/usr/src/uts/common/fs/zfs/dnode.c
@@ -1507,6 +1507,72 @@ dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
}
}
+/*
+ * Dirty the in-core level-1 dbufs between start_blkid and end_blkid; both
+ * ends are excluded (search starts at start_blkid + 1, stops at end_blkid).
+ */
+static void
+dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t db_search;
+ dmu_buf_impl_t *db;
+ avl_index_t where;
+
+ mutex_enter(&dn->dn_dbufs_mtx);
+
+ db_search.db_level = 1;
+ db_search.db_blkid = start_blkid + 1;
+ db_search.db_state = DB_SEARCH;
+ for (;;) {
+
+ db = avl_find(&dn->dn_dbufs, &db_search, &where);
+ if (db == NULL)
+ db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+
+ if (db == NULL || db->db_level != 1 ||
+ db->db_blkid >= end_blkid) {
+ break;
+ }
+
+ /*
+ * Set up the next blkid we want to search for.
+ */
+ db_search.db_blkid = db->db_blkid + 1;
+ ASSERT3U(db->db_blkid, >=, start_blkid);
+
+ /*
+ * If the dbuf transitions to DB_EVICTING while we're trying
+ * to dirty it, then we will be unable to discover it in
+ * the dbuf hash table. This will result in a call to
+ * dbuf_create() which needs to acquire the dn_dbufs_mtx
+ * lock. To avoid a deadlock, we drop the lock before
+ * dirtying the level-1 dbuf.
+ */
+ mutex_exit(&dn->dn_dbufs_mtx);
+ dnode_dirty_l1(dn, db->db_blkid, tx);
+ mutex_enter(&dn->dn_dbufs_mtx);
+ }
+
+#ifdef ZFS_DEBUG
+ /*
+ * Walk the in-core level-1 dbufs in range; verify each has been dirtied.
+ */
+ db_search.db_level = 1;
+ db_search.db_blkid = start_blkid + 1;
+ db_search.db_state = DB_SEARCH;
+ db = avl_find(&dn->dn_dbufs, &db_search, &where);
+ if (db == NULL)
+ db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+ for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
+ if (db->db_level != 1 || db->db_blkid >= end_blkid)
+ break;
+ ASSERT(db->db_dirtycnt > 0);
+ }
+#endif
+ mutex_exit(&dn->dn_dbufs_mtx);
+}
+
void
dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
{
@@ -1658,6 +1724,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
if (last != first)
dnode_dirty_l1(dn, last, tx);
+ dnode_dirty_l1range(dn, first, last, tx);
+
int shift = dn->dn_datablkshift + dn->dn_indblkshift -
SPA_BLKPTRSHIFT;
for (uint64_t i = first + 1; i < last; i++) {
diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c
index 2fcaf7927d..033e30dc82 100644
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c
@@ -229,9 +229,24 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
}
#endif
+/*
+ * We don't usually free the indirect blocks here. If in one txg we have a
+ * free_range and a write to the same indirect block, it's important that we
+ * preserve the hole's birth times. Therefore, we don't free any indirect
+ * blocks in free_children(). If an indirect block happens to turn into all
+ * holes, it will be freed by dbuf_write_children_ready, which happens at a
+ * point in the syncing process where we know for certain the contents of the
+ * indirect block.
+ *
+ * However, if we're freeing a dnode, its space accounting must go to zero
+ * before we actually try to free the dnode, or we will trip an assertion. In
+ * addition, we know the case described above cannot occur, because the dnode is
+ * being freed. Therefore, we free the indirect blocks immediately in that
+ * case.
+ */
static void
free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
- dmu_tx_t *tx)
+ boolean_t free_indirects, dmu_tx_t *tx)
{
dnode_t *dn;
blkptr_t *bp;
@@ -283,32 +298,16 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
rw_exit(&dn->dn_struct_rwlock);
ASSERT3P(bp, ==, subdb->db_blkptr);
- free_children(subdb, blkid, nblks, tx);
+ free_children(subdb, blkid, nblks, free_indirects, tx);
dbuf_rele(subdb, FTAG);
}
}
- /* If this whole block is free, free ourself too. */
- for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
- if (!BP_IS_HOLE(bp))
- break;
- }
- if (i == 1 << epbs) {
- /*
- * We only found holes. Grab the rwlock to prevent
- * anybody from reading the blocks we're about to
- * zero out.
- */
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (free_indirects) {
+ for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++)
+ ASSERT(BP_IS_HOLE(bp));
bzero(db->db.db_data, db->db.db_size);
- rw_exit(&dn->dn_struct_rwlock);
free_blocks(dn, db->db_blkptr, 1, tx);
- } else {
- /*
- * Partial block free; must be marked dirty so that it
- * will be written out.
- */
- ASSERT(db->db_dirtycnt > 0);
}
DB_DNODE_EXIT(db);
@@ -321,7 +320,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
*/
static void
dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
- dmu_tx_t *tx)
+ boolean_t free_indirects, dmu_tx_t *tx)
{
blkptr_t *bp = dn->dn_phys->dn_blkptr;
int dnlevel = dn->dn_phys->dn_nlevels;
@@ -361,7 +360,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
TRUE, FALSE, FTAG, &db));
rw_exit(&dn->dn_struct_rwlock);
- free_children(db, blkid, nblks, tx);
+ free_children(db, blkid, nblks, free_indirects, tx);
dbuf_rele(db, FTAG);
}
}
@@ -380,6 +379,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
typedef struct dnode_sync_free_range_arg {
dnode_t *dsfra_dnode;
dmu_tx_t *dsfra_tx;
+ boolean_t dsfra_free_indirects; /* set when freeing the dnode */
} dnode_sync_free_range_arg_t;
static void
@@ -389,7 +389,8 @@ dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
dnode_t *dn = dsfra->dsfra_dnode;
mutex_exit(&dn->dn_mtx);
- dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_tx);
+ dnode_sync_free_range_impl(dn, blkid, nblks,
+ dsfra->dsfra_free_indirects, dsfra->dsfra_tx);
mutex_enter(&dn->dn_mtx);
}
@@ -670,6 +671,11 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dnode_sync_free_range_arg_t dsfra;
dsfra.dsfra_dnode = dn;
dsfra.dsfra_tx = tx;
+ dsfra.dsfra_free_indirects = freeing_dnode;
+ if (freeing_dnode) {
+ ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff],
+ 0, dn->dn_maxblkid + 1));
+ }
mutex_enter(&dn->dn_mtx);
range_tree_vacate(dn->dn_free_ranges[txgoff],
dnode_sync_free_range, &dsfra);
diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c
index 73e17a4a3c..93545ee4a1 100644
--- a/usr/src/uts/common/fs/zfs/zfs_znode.c
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c
@@ -1603,7 +1603,8 @@ zfs_trunc(znode_t *zp, uint64_t end)
return (0);
}
- error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1);
+ error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
+ DMU_OBJECT_END);
if (error) {
zfs_range_unlock(rl);
return (error);