diff options
author | Paul Dagnelie <pcd@delphix.com> | 2016-09-20 10:02:29 -0700 |
---|---|---|
committer | Prakash Surya <prakash.surya@delphix.com> | 2018-06-19 12:28:21 -0700 |
commit | 738e2a3ce3b2579222d6855e7fe75b5bcfcddf8d (patch) | |
tree | 96d8cf01f46a0b50aae9d7cd9f51da778de7fe59 | |
parent | 0dd498c0020c4a0dd34d350b07f97399a475d102 (diff) | |
download | illumos-joyent-738e2a3ce3b2579222d6855e7fe75b5bcfcddf8d.tar.gz |
9438 Holes can lose birth time info if a block has a mix of birth times
Reviewed by: Matt Ahrens <matt@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
-rw-r--r-- | usr/src/uts/common/fs/zfs/dmu_object.c | 4 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/dnode.c | 68 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/dnode_sync.c | 54 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/zfs_znode.c | 3 |
4 files changed, 104 insertions, 25 deletions
diff --git a/usr/src/uts/common/fs/zfs/dmu_object.c b/usr/src/uts/common/fs/zfs/dmu_object.c index aede315502..b853081e8b 100644 --- a/usr/src/uts/common/fs/zfs/dmu_object.c +++ b/usr/src/uts/common/fs/zfs/dmu_object.c @@ -167,6 +167,10 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) return (err); ASSERT(dn->dn_type != DMU_OT_NONE); + /* + * If we don't create this free range, we'll leak indirect blocks when + * we get to freeing the dnode in syncing context. + */ dnode_free_range(dn, 0, DMU_OBJECT_END, tx); dnode_free(dn, tx); dnode_rele(dn, FTAG); diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c index aa00acec8e..53a530da0a 100644 --- a/usr/src/uts/common/fs/zfs/dnode.c +++ b/usr/src/uts/common/fs/zfs/dnode.c @@ -1507,6 +1507,72 @@ dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx) } } +/* + * Dirty all the in-core level-1 dbufs in the range specified by start_blkid + * and end_blkid. + */ +static void +dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, + dmu_tx_t *tx) +{ + dmu_buf_impl_t db_search; + dmu_buf_impl_t *db; + avl_index_t where; + + mutex_enter(&dn->dn_dbufs_mtx); + + db_search.db_level = 1; + db_search.db_blkid = start_blkid + 1; + db_search.db_state = DB_SEARCH; + for (;;) { + + db = avl_find(&dn->dn_dbufs, &db_search, &where); + if (db == NULL) + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); + + if (db == NULL || db->db_level != 1 || + db->db_blkid >= end_blkid) { + break; + } + + /* + * Setup the next blkid we want to search for. + */ + db_search.db_blkid = db->db_blkid + 1; + ASSERT3U(db->db_blkid, >=, start_blkid); + + /* + * If the dbuf transitions to DB_EVICTING while we're trying + * to dirty it, then we will be unable to discover it in + * the dbuf hash table. This will result in a call to + * dbuf_create() which needs to acquire the dn_dbufs_mtx + * lock. To avoid a deadlock, we drop the lock before + * dirtying the level-1 dbuf. + */ + mutex_exit(&dn->dn_dbufs_mtx); + dnode_dirty_l1(dn, db->db_blkid, tx); + mutex_enter(&dn->dn_dbufs_mtx); + } + +#ifdef ZFS_DEBUG + /* + * Walk all the in-core level-1 dbufs and verify they have been dirtied. + */ + db_search.db_level = 1; + db_search.db_blkid = start_blkid + 1; + db_search.db_state = DB_SEARCH; + db = avl_find(&dn->dn_dbufs, &db_search, &where); + if (db == NULL) + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); + for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) { + if (db->db_level != 1 || db->db_blkid >= end_blkid) + break; + ASSERT(db->db_dirtycnt > 0); + } +#endif + mutex_exit(&dn->dn_dbufs_mtx); +} + void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) { @@ -1658,6 +1724,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) if (last != first) dnode_dirty_l1(dn, last, tx); + dnode_dirty_l1range(dn, first, last, tx); + int shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (uint64_t i = first + 1; i < last; i++) { diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c index 2fcaf7927d..033e30dc82 100644 --- a/usr/src/uts/common/fs/zfs/dnode_sync.c +++ b/usr/src/uts/common/fs/zfs/dnode_sync.c @@ -229,9 +229,24 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) } #endif +/* + * We don't usually free the indirect blocks here. If in one txg we have a + * free_range and a write to the same indirect block, it's important that we + * preserve the hole's birth times. Therefore, we don't free any any indirect + * blocks in free_children(). If an indirect block happens to turn into all + * holes, it will be freed by dbuf_write_children_ready, which happens at a + * point in the syncing process where we know for certain the contents of the + * indirect block. + * + * However, if we're freeing a dnode, its space accounting must go to zero + * before we actually try to free the dnode, or we will trip an assertion. In + * addition, we know the case described above cannot occur, because the dnode is + * being freed. Therefore, we free the indirect blocks immediately in that + * case. + */ static void free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, - dmu_tx_t *tx) + boolean_t free_indirects, dmu_tx_t *tx) { dnode_t *dn; blkptr_t *bp; @@ -283,32 +298,16 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, rw_exit(&dn->dn_struct_rwlock); ASSERT3P(bp, ==, subdb->db_blkptr); - free_children(subdb, blkid, nblks, tx); + free_children(subdb, blkid, nblks, free_indirects, tx); dbuf_rele(subdb, FTAG); } } - /* If this whole block is free, free ourself too. */ - for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) { - if (!BP_IS_HOLE(bp)) - break; - } - if (i == 1 << epbs) { - /* - * We only found holes. Grab the rwlock to prevent - * anybody from reading the blocks we're about to - * zero out. - */ - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (free_indirects) { + for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) + ASSERT(BP_IS_HOLE(bp)); bzero(db->db.db_data, db->db.db_size); - rw_exit(&dn->dn_struct_rwlock); free_blocks(dn, db->db_blkptr, 1, tx); - } else { - /* - * Partial block free; must be marked dirty so that it - * will be written out. - */ - ASSERT(db->db_dirtycnt > 0); } DB_DNODE_EXIT(db); @@ -321,7 +320,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, */ static void dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, - dmu_tx_t *tx) + boolean_t free_indirects, dmu_tx_t *tx) { blkptr_t *bp = dn->dn_phys->dn_blkptr; int dnlevel = dn->dn_phys->dn_nlevels; @@ -361,7 +360,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, TRUE, FALSE, FTAG, &db)); rw_exit(&dn->dn_struct_rwlock); - free_children(db, blkid, nblks, tx); + free_children(db, blkid, nblks, free_indirects, tx); dbuf_rele(db, FTAG); } } @@ -380,6 +379,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, typedef struct dnode_sync_free_range_arg { dnode_t *dsfra_dnode; dmu_tx_t *dsfra_tx; + boolean_t dsfra_free_indirects; } dnode_sync_free_range_arg_t; static void @@ -389,7 +389,8 @@ dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks) dnode_t *dn = dsfra->dsfra_dnode; mutex_exit(&dn->dn_mtx); - dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_tx); + dnode_sync_free_range_impl(dn, blkid, nblks, + dsfra->dsfra_free_indirects, dsfra->dsfra_tx); mutex_enter(&dn->dn_mtx); } @@ -670,6 +671,11 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dnode_sync_free_range_arg_t dsfra; dsfra.dsfra_dnode = dn; dsfra.dsfra_tx = tx; + dsfra.dsfra_free_indirects = freeing_dnode; + if (freeing_dnode) { + ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff], + 0, dn->dn_maxblkid + 1)); + } mutex_enter(&dn->dn_mtx); range_tree_vacate(dn->dn_free_ranges[txgoff], dnode_sync_free_range, &dsfra); diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c index 73e17a4a3c..93545ee4a1 100644 --- a/usr/src/uts/common/fs/zfs/zfs_znode.c +++ b/usr/src/uts/common/fs/zfs/zfs_znode.c @@ -1603,7 +1603,8 @@ zfs_trunc(znode_t *zp, uint64_t end) return (0); } - error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1); + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, + DMU_OBJECT_END); if (error) { zfs_range_unlock(rl); return (error); |