diff options
-rw-r--r-- | usr/src/cmd/zdb/zdb.c | 52 | ||||
-rw-r--r-- | usr/src/cmd/ztest/ztest.c | 4 | ||||
-rw-r--r-- | usr/src/common/zfs/zpool_prop.c | 4 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/ddt.c | 73 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/dmu_traverse.c | 8 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/dsl_pool.c | 14 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/dsl_scrub.c | 157 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/spa.c | 4 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/spa_misc.c | 4 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/ddt.h | 23 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/dmu.h | 4 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/dmu_traverse.h | 7 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/dsl_pool.h | 10 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/zio.c | 6 |
14 files changed, 275 insertions, 95 deletions
diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c index c05d20643f..2ca0460605 100644 --- a/usr/src/cmd/zdb/zdb.c +++ b/usr/src/cmd/zdb/zdb.c @@ -94,14 +94,15 @@ static void usage(void) { (void) fprintf(stderr, - "Usage: %s [-CumdibcsvhL] [-S user:cksumalg] " + "Usage: %s [-CumdibcsvhL] " "poolname [object...]\n" " %s [-div] dataset [object...]\n" " %s -m [-L] poolname [vdev [metaslab...]]\n" " %s -R poolname vdev:offset:size[:flags]\n" + " %s -S poolname\n" " %s -l device\n" " %s -C\n\n", - cmdname, cmdname, cmdname, cmdname, cmdname, cmdname); + cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname); (void) fprintf(stderr, " Dataset name must include at least one " "separator character '/' or '@'\n"); @@ -549,7 +550,7 @@ dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index) for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0) continue; - ddt_bp_create(ddt, ddk, ddp, &blk); + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); sprintf_blkptr(blkbuf, &blk); (void) printf("index %llx refcnt %llu %s %s\n", (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt, @@ -686,7 +687,7 @@ dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class) (void) printf("%s contents:\n\n", name); - while ((error = ddt_object_walk(ddt, type, class, &dde, &walk)) == 0) + while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0) dump_dde(ddt, &dde, walk); ASSERT(error == ENOENT); @@ -1344,7 +1345,8 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) nicenum(doi.doi_physical_blocks_512 << 9, asize); nicenum(doi.doi_bonus_size, bonus_size); (void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count * - doi.doi_data_block_size / doi.doi_max_offset); + doi.doi_data_block_size / (object == 0 ? 
DNODES_PER_BLOCK : 1) / + doi.doi_max_offset); aux[0] = '\0'; @@ -1865,26 +1867,28 @@ static space_map_ops_t zdb_space_map_ops = { }; static void -zdb_ddt_leak_init(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - zdb_cb_t *zcb) +zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) { - uint64_t walk = 0; + ddt_bookmark_t ddb = { 0 }; ddt_entry_t dde; int error; - if (class == DDT_CLASS_UNIQUE || !ddt_object_exists(ddt, type, class)) - return; - - while ((error = ddt_object_walk(ddt, type, class, &dde, &walk)) == 0) { + while ((error = ddt_walk(spa, &ddb, &dde)) == 0) { blkptr_t blk; ddt_phys_t *ddp = dde.dde_phys; + + if (ddb.ddb_class == DDT_CLASS_UNIQUE) + return; + ASSERT(ddt_phys_total_refcnt(&dde) > 1); + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0) continue; - ddt_bp_create(ddt, &dde.dde_key, ddp, &blk); + ddt_bp_create(ddb.ddb_checksum, + &dde.dde_key, ddp, &blk); if (p == DDT_PHYS_DITTO) { - zdb_count_block(ddt->ddt_spa, NULL, zcb, &blk, + zdb_count_block(spa, NULL, zcb, &blk, ZDB_OT_DITTO); } else { zcb->zcb_dedup_asize += @@ -1893,6 +1897,7 @@ zdb_ddt_leak_init(ddt_t *ddt, enum ddt_type type, enum ddt_class class, } } if (!dump_opt['L']) { + ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; ddt_enter(ddt); VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); ddt_exit(ddt); @@ -1924,12 +1929,7 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) - for (enum ddt_type type = 0; type < DDT_TYPES; type++) - for (enum ddt_class class = 0; class < DDT_CLASSES; - class++) - zdb_ddt_leak_init(spa->spa_ddt[c], - type, class, zcb); + zdb_ddt_leak_init(spa, zcb); spa_config_exit(spa, SCL_CONFIG, FTAG); } @@ -1957,6 +1957,7 @@ dump_block_stats(spa_t *spa) zdb_cb_t zcb = { 0 }; zdb_blkstats_t *zb, *tzb; uint64_t norm_alloc, norm_space, total_alloc, total_found; + int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA; int leaks = 0; (void) 
printf("\nTraversing all blocks %s%s%s%s%s...\n", @@ -2000,7 +2001,10 @@ dump_block_stats(spa_t *spa) bplist_close(bpl); } - zcb.zcb_haderrors |= traverse_pool(spa, zdb_blkptr_cb, &zcb, 0); + if (dump_opt['c'] > 1) + flags |= TRAVERSE_PREFETCH_DATA; + + zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); if (zcb.zcb_haderrors) { (void) printf("\nError counts:\n\n"); @@ -2170,7 +2174,8 @@ zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, avl_numnodes(t)); } - if (BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) + if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF || + BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) return (0); ddt_key_fill(&zdde_search.zdde_key, bp); @@ -2205,7 +2210,8 @@ dump_simulated_ddt(spa_t *spa) spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - (void) traverse_pool(spa, zdb_ddt_add_cb, &t, 0); + (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, + zdb_ddt_add_cb, &t); spa_config_exit(spa, SCL_CONFIG, FTAG); diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c index 215aad2bea..67164993d2 100644 --- a/usr/src/cmd/ztest/ztest.c +++ b/usr/src/cmd/ztest/ztest.c @@ -837,7 +837,6 @@ ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, return (error); } -#if 0 static int ztest_spa_prop_set_uint64(ztest_shared_t *zs, zpool_prop_t prop, uint64_t value) { @@ -860,7 +859,6 @@ ztest_spa_prop_set_uint64(ztest_shared_t *zs, zpool_prop_t prop, uint64_t value) return (error); } -#endif static void ztest_rll_init(rll_t *rll) @@ -4134,10 +4132,8 @@ ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) (void) rw_rdlock(&zs->zs_name_lock); -#if 0 (void) ztest_spa_prop_set_uint64(zs, ZPOOL_PROP_DEDUPDITTO, ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN)); -#endif VERIFY3U(spa_prop_get(zs->zs_spa, &props), ==, 0); diff --git a/usr/src/common/zfs/zpool_prop.c b/usr/src/common/zfs/zpool_prop.c index 428afbb092..c8a3ca205f 100644 --- 
a/usr/src/common/zfs/zpool_prop.c +++ b/usr/src/common/zfs/zpool_prop.c @@ -90,6 +90,8 @@ zpool_prop_init(void) /* default number properties */ register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION, PROP_DEFAULT, ZFS_TYPE_POOL, "<version>", "VERSION"); + register_number(ZPOOL_PROP_DEDUPDITTO, "dedupditto", 0, + PROP_DEFAULT, ZFS_TYPE_POOL, "<threshold (min 100)>", "DEDUPDITTO"); /* default index (boolean) properties */ register_index(ZPOOL_PROP_DELEGATION, "delegation", 1, PROP_DEFAULT, @@ -109,8 +111,6 @@ zpool_prop_init(void) /* hidden properties */ register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING, PROP_READONLY, ZFS_TYPE_POOL, "NAME"); - register_hidden(ZPOOL_PROP_DEDUPDITTO, "dedupditto", PROP_TYPE_NUMBER, - PROP_READONLY, ZFS_TYPE_POOL, "DEDUPDITTO"); } /* diff --git a/usr/src/uts/common/fs/zfs/ddt.c b/usr/src/uts/common/fs/zfs/ddt.c index 200ed48dbd..289d5490eb 100644 --- a/usr/src/uts/common/fs/zfs/ddt.c +++ b/usr/src/uts/common/fs/zfs/ddt.c @@ -32,6 +32,7 @@ #include <sys/zap.h> #include <sys/dmu_tx.h> #include <sys/arc.h> +#include <sys/dsl_pool.h> #include <sys/zio_checksum.h> #include <sys/zio_compress.h> @@ -158,7 +159,7 @@ ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class, int ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - ddt_entry_t *dde, uint64_t *walk) + uint64_t *walk, ddt_entry_t *dde) { ASSERT(ddt_object_exists(ddt, type, class)); @@ -212,8 +213,8 @@ ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) } void -ddt_bp_create(const ddt_t *ddt, const ddt_key_t *ddk, const ddt_phys_t *ddp, - blkptr_t *bp) +ddt_bp_create(enum zio_checksum checksum, + const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp) { BP_ZERO(bp); @@ -225,7 +226,7 @@ ddt_bp_create(const ddt_t *ddt, const ddt_key_t *ddk, const ddt_phys_t *ddp, BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); - BP_SET_CHECKSUM(bp, ddt->ddt_checksum); + 
BP_SET_CHECKSUM(bp, checksum); BP_SET_TYPE(bp, DMU_OT_NONE); BP_SET_LEVEL(bp, 0); BP_SET_DEDUP(bp, 0); @@ -277,7 +278,7 @@ ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) { blkptr_t blk; - ddt_bp_create(ddt, ddk, ddp, &blk); + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); ddt_phys_clear(ddp); zio_free(ddt->ddt_spa, txg, &blk); } @@ -750,6 +751,30 @@ ddt_unload(spa_t *spa) } } +boolean_t +ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) +{ + ddt_t *ddt; + ddt_entry_t dde; + + if (!BP_GET_DEDUP(bp)) + return (B_FALSE); + + if (max_class == DDT_CLASS_UNIQUE) + return (B_TRUE); + + ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; + + ddt_key_fill(&dde.dde_key, bp); + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) + for (enum ddt_class class = 0; class <= max_class; class++) + if (ddt_object_lookup(ddt, type, class, &dde) == 0) + return (B_TRUE); + + return (B_FALSE); +} + ddt_entry_t * ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) { @@ -820,7 +845,7 @@ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) ddp->ddp_phys_birth != rddp->ddp_phys_birth || bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) continue; - ddt_bp_create(ddt, ddk, ddp, &blk); + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); @@ -845,7 +870,7 @@ ddt_repair_table(ddt_t *ddt, zio_t *rio) rdde_next = AVL_NEXT(t, rdde); avl_remove(&ddt->ddt_repair_tree, rdde); ddt_exit(ddt); - ddt_bp_create(ddt, &rdde->dde_key, NULL, &blk); + ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk); dde = ddt_repair_start(ddt, &blk); ddt_repair_entry(ddt, dde, rdde, rio); ddt_repair_done(ddt, dde); @@ -857,6 +882,7 @@ ddt_repair_table(ddt_t *ddt, zio_t *rio) static void ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) { + dsl_pool_t 
*dp = ddt->ddt_spa->spa_dsl_pool; ddt_phys_t *ddp = dde->dde_phys; ddt_key_t *ddk = &dde->dde_key; enum ddt_type otype = dde->dde_type; @@ -905,6 +931,11 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) if (!ddt_object_exists(ddt, ntype, nclass)) ddt_object_create(ddt, ntype, nclass, tx); VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0); + + if (dp->dp_scrub_func != SCRUB_FUNC_NONE && + oclass > dp->dp_scrub_ddt_class_max && + nclass <= dp->dp_scrub_ddt_class_max) + dsl_pool_scrub_ddt_entry(dp, ddt->ddt_checksum, dde); } } @@ -968,3 +999,31 @@ ddt_sync(spa_t *spa, uint64_t txg) dmu_tx_commit(tx); } + +int +ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) +{ + do { + do { + do { + ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum]; + int error = ENOENT; + if (ddt_object_exists(ddt, ddb->ddb_type, + ddb->ddb_class)) { + error = ddt_object_walk(ddt, + ddb->ddb_type, ddb->ddb_class, + &ddb->ddb_cursor, dde); + } + if (error == 0) + return (0); + if (error != ENOENT) + return (error); + ddb->ddb_cursor = 0; + } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS); + ddb->ddb_checksum = 0; + } while (++ddb->ddb_type < DDT_TYPES); + ddb->ddb_type = 0; + } while (++ddb->ddb_class < DDT_CLASSES); + + return (ENOENT); +} diff --git a/usr/src/uts/common/fs/zfs/dmu_traverse.c b/usr/src/uts/common/fs/zfs/dmu_traverse.c index b85a018c24..d0dbef075e 100644 --- a/usr/src/uts/common/fs/zfs/dmu_traverse.c +++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c @@ -375,7 +375,8 @@ traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, * NB: pool must not be changing on-disk (eg, from zdb or sync context). 
*/ int -traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg, uint64_t txg_start) +traverse_pool(spa_t *spa, uint64_t txg_start, int flags, + blkptr_cb_t func, void *arg) { int err; uint64_t obj; @@ -384,7 +385,7 @@ traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg, uint64_t txg_start) /* visit the MOS */ err = traverse_impl(spa, 0, spa_get_rootblkptr(spa), - txg_start, TRAVERSE_PRE | TRAVERSE_PREFETCH, func, arg); + txg_start, flags, func, arg); if (err) return (err); @@ -408,8 +409,7 @@ traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg, uint64_t txg_start) return (err); if (ds->ds_phys->ds_prev_snap_txg > txg) txg = ds->ds_phys->ds_prev_snap_txg; - err = traverse_dataset(ds, txg, - TRAVERSE_PRE | TRAVERSE_PREFETCH, func, arg); + err = traverse_dataset(ds, txg, flags, func, arg); dsl_dataset_rele(ds, FTAG); if (err) return (err); diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c index 629393915c..ed465f76a7 100644 --- a/usr/src/uts/common/fs/zfs/dsl_pool.c +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -171,11 +171,23 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) if (err) goto out; err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, + DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), + sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t), &dp->dp_scrub_bookmark); if (err) goto out; err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t), + sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t), + &dp->dp_scrub_ddt_bookmark); + if (err && err != ENOENT) + goto out; + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1, + &dp->dp_scrub_ddt_class_max); + if (err && err != ENOENT) + goto out; + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, &spa->spa_scrub_errors); if (err) 
diff --git a/usr/src/uts/common/fs/zfs/dsl_scrub.c b/usr/src/uts/common/fs/zfs/dsl_scrub.c index d511bb841a..48c900cbc3 100644 --- a/usr/src/uts/common/fs/zfs/dsl_scrub.c +++ b/usr/src/uts/common/fs/zfs/dsl_scrub.c @@ -53,6 +53,7 @@ static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */ int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */ boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ +enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; extern int zfs_txg_timeout; @@ -78,6 +79,7 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dp->dp_scrub_min_txg = 0; dp->dp_scrub_max_txg = tx->tx_txg; + dp->dp_scrub_ddt_class_max = zfs_scrub_ddt_class_max; if (*funcp == SCRUB_FUNC_CLEAN) { vdev_t *rvd = dp->dp_spa->spa_root_vdev; @@ -101,6 +103,14 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : POOL_SCRUB_EVERYTHING, B_FALSE); + /* + * If this is an incremental scrub, limit the DDT scrub phase + * to just the auto-ditto class (for correctness); the rest + * of the scrub should go faster using top-down pruning. + */ + if (dp->dp_scrub_min_txg > TXG_INITIAL) + dp->dp_scrub_ddt_class_max = DDT_CLASS_DITTO; + dp->dp_spa->spa_scrub_started = B_TRUE; } @@ -119,8 +129,8 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset, ot ? 
ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx); bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); + bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t)); dp->dp_scrub_restart = B_FALSE; - dp->dp_scrub_ditto = B_FALSE; dp->dp_spa->spa_scrub_errors = 0; VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, @@ -136,9 +146,17 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1, &dp->dp_scrub_max_txg, tx)); VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, + DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), + sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t), &dp->dp_scrub_bookmark, tx)); VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t), + sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t), + &dp->dp_scrub_ddt_bookmark, tx)); + VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1, + &dp->dp_scrub_ddt_class_max, tx)); + VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, &dp->dp_spa->spa_scrub_errors, tx)); @@ -186,6 +204,7 @@ dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dp->dp_scrub_queue_obj, tx)); dp->dp_scrub_queue_obj = 0; bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); + bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t)); VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCRUB_QUEUE, tx)); @@ -200,6 +219,11 @@ dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCRUB_ERRORS, tx)); + (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_BOOKMARK, tx); + (void) zap_remove(dp->dp_meta_objset, 
DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_CLASS_MAX, tx); + spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr, "complete=%u", *completep); @@ -296,7 +320,7 @@ bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1, } static boolean_t -scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb) +scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb, const ddt_bookmark_t *ddb) { int elapsed_ticks; int mintime; @@ -308,7 +332,7 @@ scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb) return (B_FALSE); /* we're resuming */ /* We only know how to resume from level-0 blocks. */ - if (zb->zb_level != 0) + if (zb != NULL && zb->zb_level != 0) return (B_FALSE); mintime = dp->dp_scrub_isresilver ? zfs_resilver_min_time : @@ -316,11 +340,23 @@ scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb) elapsed_ticks = ddi_get_lbolt64() - dp->dp_scrub_start_time; if (elapsed_ticks > hz * zfs_txg_timeout || (elapsed_ticks > hz * mintime && txg_sync_waiting(dp))) { - dprintf("pausing at %llx/%llx/%llx/%llx\n", - (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, - (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); + if (zb) { + dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + dp->dp_scrub_bookmark = *zb; + } + if (ddb) { + dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n", + (longlong_t)ddb->ddb_class, + (longlong_t)ddb->ddb_type, + (longlong_t)ddb->ddb_checksum, + (longlong_t)ddb->ddb_cursor); + ASSERT(&dp->dp_scrub_ddt_bookmark == ddb); + } dp->dp_scrub_pausing = B_TRUE; - dp->dp_scrub_bookmark = *zb; return (B_TRUE); } return (B_FALSE); @@ -423,7 +459,7 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, if (bp->blk_birth <= dp->dp_scrub_min_txg) return; - if (scrub_pause(dp, zb)) + if (scrub_pause(dp, zb, NULL)) return; if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) { @@ -525,7 +561,13 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, } } - 
(void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb); + /* + * If dsl_pool_scrub_ddt() has already scrubbed this block, + * don't scrub it again. + */ + if (!ddt_class_contains(dp->dp_spa, dp->dp_scrub_ddt_class_max, bp)) + (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb); + if (buf) (void) arc_buf_remove_ref(buf, &buf); } @@ -542,7 +584,6 @@ scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb); } - } static void @@ -564,8 +605,8 @@ dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) return; if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { - SET_BOOKMARK(&dp->dp_scrub_bookmark, ZB_DESTROYED_OBJSET, - 0, 0, 0); + SET_BOOKMARK(&dp->dp_scrub_bookmark, + ZB_DESTROYED_OBJSET, 0, 0, 0); } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, ds->ds_object, tx) != 0) { return; @@ -771,34 +812,65 @@ enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) return (0); } +/* + * Scrub/dedup interaction. + * + * If there are N references to a deduped block, we don't want to scrub it + * N times -- ideally, we should scrub it exactly once. + * + * To prevent excess scrubbing, the scrub begins by walking the DDT + * to find all blocks with refcnt > 1, and scrubs each of these once. + * Then the top-down scrub begins, only visiting blocks with refcnt == 1. + * + * There would be nothing more to say if a block's refcnt couldn't change + * during a scrub, but of course it can. There are two cases to consider. + * + * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 + * when visited during the top-down scrub phase, it will be scrubbed twice. + * This negates our scrub optimization, but is otherwise harmless. + * + * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 + * on each visit during the top-down scrub phase, it will never be scrubbed.
+ * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's + * reference count changes; if it transitions from refcnt == 1 to refcnt > 1 + * while a scrub is in progress, it scrubs the block right then. + * + * The code does not actually use the refcnt directly, but rather uses the + * dde's replication class (enum ddt_class), which serves the same purpose. + */ static void -dsl_pool_scrub_ddt(dsl_pool_t *dp, enum zio_checksum c, enum ddt_type type, - enum ddt_class class) +dsl_pool_scrub_ddt(dsl_pool_t *dp) { - ddt_t *ddt = ddt_select_by_checksum(dp->dp_spa, c); + ddt_bookmark_t *ddb = &dp->dp_scrub_ddt_bookmark; ddt_entry_t dde; - blkptr_t blk; - zbookmark_t zb = { 0 }; - uint64_t walk = 0; int error; - if (!ddt_object_exists(ddt, type, class)) - return; - - while ((error = ddt_object_walk(ddt, type, class, &dde, &walk)) == 0) { - int p = DDT_PHYS_DITTO; - ddt_bp_create(ddt, &dde.dde_key, &dde.dde_phys[p], &blk); - scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb); + while ((error = ddt_walk(dp->dp_spa, ddb, &dde)) == 0) { + if (ddb->ddb_class > dp->dp_scrub_ddt_class_max) + return; + dsl_pool_scrub_ddt_entry(dp, ddb->ddb_checksum, &dde); + if (scrub_pause(dp, NULL, ddb)) + return; } ASSERT(error == ENOENT); + ASSERT(ddb->ddb_class > dp->dp_scrub_ddt_class_max); } -static void -dsl_pool_scrub_ditto(dsl_pool_t *dp) +void +dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum, + const ddt_entry_t *dde) { - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) - for (enum ddt_type type = 0; type < DDT_TYPES; type++) - dsl_pool_scrub_ddt(dp, c, type, DDT_CLASS_DITTO); + const ddt_key_t *ddk = &dde->dde_key; + const ddt_phys_t *ddp = dde->dde_phys; + blkptr_t blk; + zbookmark_t zb = { 0 }; + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0) + continue; + ddt_bp_create(checksum, ddk, ddp, &blk); + scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb); + } } void @@ -840,9 +912,10 @@ 
dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0); spa->spa_scrub_active = B_TRUE; - if (!dp->dp_scrub_ditto) { - dsl_pool_scrub_ditto(dp); - dp->dp_scrub_ditto = B_TRUE; + if (dp->dp_scrub_ddt_bookmark.ddb_class <= dp->dp_scrub_ddt_class_max) { + dsl_pool_scrub_ddt(dp); + if (dp->dp_scrub_pausing) + goto out; } if (dp->dp_scrub_bookmark.zb_objset == DMU_META_OBJSET) { @@ -895,12 +968,18 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx); return; out: - VERIFY(0 == zap_update(dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, + VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), + sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t), &dp->dp_scrub_bookmark, tx)); - VERIFY(0 == zap_update(dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, + VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t), + sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t), + &dp->dp_scrub_ddt_bookmark, tx)); + VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1, + &dp->dp_scrub_ddt_class_max, tx)); + VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, &spa->spa_scrub_errors, tx)); diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index 29b7bacfc8..2fa77924a1 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -1230,8 +1230,8 @@ spa_load_verify(spa_t *spa) rio = zio_root(spa, NULL, &sle, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); - error = traverse_pool(spa, spa_load_verify_cb, rio, - spa->spa_verify_min_txg); + error = traverse_pool(spa, spa->spa_verify_min_txg, + TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, 
rio); (void) zio_wait(rio); diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index 59e5ca04c4..02d6ff35c4 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -840,8 +840,8 @@ spa_l2cache_activate(vdev_t *vd) uint64_t spa_vdev_enter(spa_t *spa) { - mutex_enter(&spa_namespace_lock); mutex_enter(&spa->spa_vdev_top_lock); + mutex_enter(&spa_namespace_lock); return (spa_vdev_config_enter(spa)); } @@ -937,8 +937,8 @@ int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) { spa_vdev_config_exit(spa, vd, txg, error, FTAG); - mutex_exit(&spa->spa_vdev_top_lock); mutex_exit(&spa_namespace_lock); + mutex_exit(&spa->spa_vdev_top_lock); return (error); } diff --git a/usr/src/uts/common/fs/zfs/sys/ddt.h b/usr/src/uts/common/fs/zfs/sys/ddt.h index aed141eb81..7dbb62a597 100644 --- a/usr/src/uts/common/fs/zfs/sys/ddt.h +++ b/usr/src/uts/common/fs/zfs/sys/ddt.h @@ -153,6 +153,19 @@ struct ddt { avl_node_t ddt_node; }; +/* + * In-core and on-disk bookmark for DDT walks + */ +typedef struct ddt_bookmark { + uint64_t ddb_class; + uint64_t ddb_type; + uint64_t ddb_checksum; + uint64_t ddb_cursor; +} ddt_bookmark_t; + +/* + * Ops vector to access a specific DDT object type. 
+ */ typedef struct ddt_ops { char ddt_op_name[32]; int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx, @@ -173,7 +186,7 @@ typedef struct ddt_ops { extern void ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, char *name); extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type, - enum ddt_class class, ddt_entry_t *dde, uint64_t *walk); + enum ddt_class class, uint64_t *walk, ddt_entry_t *dde); extern uint64_t ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class); extern int ddt_object_info(ddt_t *ddt, enum ddt_type type, @@ -183,7 +196,7 @@ extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type, extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg); -extern void ddt_bp_create(const ddt_t *ddt, const ddt_key_t *ddk, +extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp); extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); @@ -214,13 +227,14 @@ extern size_t ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len); extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len); extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp); -extern ddt_t *ddt_select_by_checksum(spa_t *spa, enum zio_checksum c); - extern void ddt_enter(ddt_t *ddt); extern void ddt_exit(ddt_t *ddt); extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add); extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde); +extern boolean_t ddt_class_contains(spa_t *spa, enum ddt_class max_class, + const blkptr_t *bp); + extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp); extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde); @@ -230,6 +244,7 @@ extern void ddt_create(spa_t *spa); extern int ddt_load(spa_t *spa); extern void ddt_unload(spa_t *spa); extern void ddt_sync(spa_t *spa, uint64_t txg); +extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde); extern const 
ddt_ops_t ddt_zap_ops; diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h index 61366986a9..f7a3b2cd44 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h @@ -211,6 +211,10 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); /* 4x8 zbookmark_t */ #define DMU_POOL_SCRUB_BOOKMARK "scrub_bookmark" +/* 4x8 ddt_bookmark_t */ +#define DMU_POOL_SCRUB_DDT_BOOKMARK "scrub_ddt_bookmark" +/* 1x8 max_class */ +#define DMU_POOL_SCRUB_DDT_CLASS_MAX "scrub_ddt_class_max" /* 1x8 zap obj DMU_OT_SCRUB_QUEUE */ #define DMU_POOL_SCRUB_QUEUE "scrub_queue" /* 1x8 txg */ diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h index 04b793dc40..e51d9bb5a7 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h @@ -47,9 +47,10 @@ typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, #define TRAVERSE_PREFETCH_DATA (1<<3) #define TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA) -int traverse_dataset(struct dsl_dataset *ds, uint64_t txg_start, - int flags, blkptr_cb_t func, void *arg); -int traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg, uint64_t txg_start); +int traverse_dataset(struct dsl_dataset *ds, + uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); +int traverse_pool(spa_t *spa, + uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); #ifdef __cplusplus } diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h index 78d97526db..1afbc3813b 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h @@ -32,6 +32,7 @@ #include <sys/zfs_context.h> #include <sys/zio.h> #include <sys/dnode.h> +#include <sys/ddt.h> #ifdef __cplusplus extern "C" { @@ -96,13 +97,14 @@ typedef struct dsl_pool { uint64_t dp_scrub_queue_obj; uint64_t dp_scrub_min_txg; 
uint64_t dp_scrub_max_txg; + uint64_t dp_scrub_start_time; + uint64_t dp_scrub_ddt_class_max; zbookmark_t dp_scrub_bookmark; + ddt_bookmark_t dp_scrub_ddt_bookmark; boolean_t dp_scrub_pausing; boolean_t dp_scrub_isresilver; - uint64_t dp_scrub_start_time; - kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */ boolean_t dp_scrub_restart; - boolean_t dp_scrub_ditto; + kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */ /* Has its own locking */ tx_state_t dp_tx; @@ -145,6 +147,8 @@ int dsl_pool_scrub_cancel(dsl_pool_t *dp); int dsl_pool_scrub_clean(dsl_pool_t *dp); void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx); void dsl_pool_scrub_restart(dsl_pool_t *dp); +void dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum, + const ddt_entry_t *dde); taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp); diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 3216427dfe..18b3cbf693 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -867,6 +867,9 @@ zio_read_bp_init(zio_t *zio) if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) zio->io_flags |= ZIO_FLAG_DONT_CACHE; + if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) + zio->io_flags |= ZIO_FLAG_DONT_CACHE; + if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_pipeline = ZIO_DDT_READ_PIPELINE; @@ -1736,7 +1739,8 @@ zio_ddt_read_start(zio_t *zio) for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) continue; - ddt_bp_create(ddt, &dde->dde_key, ddp, &blk); + ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, + &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, zio_buf_alloc(zio->io_size), zio->io_size, zio_ddt_child_read_done, dde, zio->io_priority, |