diff options
Diffstat (limited to 'usr/src/uts/common/fs/zfs/dsl_scrub.c')
-rw-r--r-- | usr/src/uts/common/fs/zfs/dsl_scrub.c | 157 |
1 files changed, 118 insertions, 39 deletions
diff --git a/usr/src/uts/common/fs/zfs/dsl_scrub.c b/usr/src/uts/common/fs/zfs/dsl_scrub.c index d511bb841a..48c900cbc3 100644 --- a/usr/src/uts/common/fs/zfs/dsl_scrub.c +++ b/usr/src/uts/common/fs/zfs/dsl_scrub.c @@ -53,6 +53,7 @@ static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */ int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */ boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ +enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; extern int zfs_txg_timeout; @@ -78,6 +79,7 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dp->dp_scrub_min_txg = 0; dp->dp_scrub_max_txg = tx->tx_txg; + dp->dp_scrub_ddt_class_max = zfs_scrub_ddt_class_max; if (*funcp == SCRUB_FUNC_CLEAN) { vdev_t *rvd = dp->dp_spa->spa_root_vdev; @@ -101,6 +103,14 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : POOL_SCRUB_EVERYTHING, B_FALSE); + /* + * If this is an incremental scrub, limit the DDT scrub phase + * to just the auto-ditto class (for correctness); the rest + * of the scrub should go faster using top-down pruning. + */ + if (dp->dp_scrub_min_txg > TXG_INITIAL) + dp->dp_scrub_ddt_class_max = DDT_CLASS_DITTO; + dp->dp_spa->spa_scrub_started = B_TRUE; } @@ -119,8 +129,8 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset, ot ? 
ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx); bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); + bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t)); dp->dp_scrub_restart = B_FALSE; - dp->dp_scrub_ditto = B_FALSE; dp->dp_spa->spa_scrub_errors = 0; VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, @@ -136,9 +146,17 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1, &dp->dp_scrub_max_txg, tx)); VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, + DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), + sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t), &dp->dp_scrub_bookmark, tx)); VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t), + sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t), + &dp->dp_scrub_ddt_bookmark, tx)); + VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1, + &dp->dp_scrub_ddt_class_max, tx)); + VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, &dp->dp_spa->spa_scrub_errors, tx)); @@ -186,6 +204,7 @@ dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dp->dp_scrub_queue_obj, tx)); dp->dp_scrub_queue_obj = 0; bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); + bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t)); VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCRUB_QUEUE, tx)); @@ -200,6 +219,11 @@ dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCRUB_ERRORS, tx)); + (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_BOOKMARK, tx); + (void) zap_remove(dp->dp_meta_objset, 
DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_CLASS_MAX, tx); + spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr, "complete=%u", *completep); @@ -296,7 +320,7 @@ bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1, } static boolean_t -scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb) +scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb, const ddt_bookmark_t *ddb) { int elapsed_ticks; int mintime; @@ -308,7 +332,7 @@ scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb) return (B_FALSE); /* we're resuming */ /* We only know how to resume from level-0 blocks. */ - if (zb->zb_level != 0) + if (zb != NULL && zb->zb_level != 0) return (B_FALSE); mintime = dp->dp_scrub_isresilver ? zfs_resilver_min_time : @@ -316,11 +340,23 @@ scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb) elapsed_ticks = ddi_get_lbolt64() - dp->dp_scrub_start_time; if (elapsed_ticks > hz * zfs_txg_timeout || (elapsed_ticks > hz * mintime && txg_sync_waiting(dp))) { - dprintf("pausing at %llx/%llx/%llx/%llx\n", - (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, - (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); + if (zb) { + dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + dp->dp_scrub_bookmark = *zb; + } + if (ddb) { + dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n", + (longlong_t)ddb->ddb_class, + (longlong_t)ddb->ddb_type, + (longlong_t)ddb->ddb_checksum, + (longlong_t)ddb->ddb_cursor); + ASSERT(&dp->dp_scrub_ddt_bookmark == ddb); + } dp->dp_scrub_pausing = B_TRUE; - dp->dp_scrub_bookmark = *zb; return (B_TRUE); } return (B_FALSE); @@ -423,7 +459,7 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, if (bp->blk_birth <= dp->dp_scrub_min_txg) return; - if (scrub_pause(dp, zb)) + if (scrub_pause(dp, zb, NULL)) return; if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) { @@ -525,7 +561,13 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, } } - 
(void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb); + /* + * If dsl_pool_scrub_ddt() has already scrubbed this block, + * don't scrub it again. + */ + if (!ddt_class_contains(dp->dp_spa, dp->dp_scrub_ddt_class_max, bp)) + (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb); + if (buf) (void) arc_buf_remove_ref(buf, &buf); } @@ -542,7 +584,6 @@ scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb); } - } static void @@ -564,8 +605,8 @@ dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) return; if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { - SET_BOOKMARK(&dp->dp_scrub_bookmark, ZB_DESTROYED_OBJSET, - 0, 0, 0); + SET_BOOKMARK(&dp->dp_scrub_bookmark, + ZB_DESTROYED_OBJSET, 0, 0, 0); } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, ds->ds_object, tx) != 0) { return; @@ -771,34 +812,65 @@ enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) return (0); } +/* + * Scrub/dedup interaction. + * + * If there are N references to a deduped block, we don't want to scrub it + * N times -- ideally, we should scrub it exactly once. + * + * To prevent excess scrubbing, the scrub begins by walking the DDT + * to find all blocks with refcnt > 1, and scrubs each of these once. + * Then the top-down scrub begins, only visiting blocks with refcnt == 1. + * + * There would be nothing more to say if a block's refcnt couldn't change + * during a scrub, but of course it can. There are two cases to consider. + * + * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 + * when visited during the top-down scrub phase, it will be scrubbed twice. + * This negates our scrub optimization, but is otherwise harmless. + * + * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 + * on each visit during the top-down scrub phase, it will never be scrubbed. 
+ * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's + * reference count changes; if it transitions from refcnt == 1 to refcnt > 1 + * while a scrub is in progress, it scrubs the block right then. + * + * The code does not actually use the refcnt directly, but rather uses the + * dde's replication class (enum ddt_class), which serves the same purpose. + */ static void -dsl_pool_scrub_ddt(dsl_pool_t *dp, enum zio_checksum c, enum ddt_type type, - enum ddt_class class) +dsl_pool_scrub_ddt(dsl_pool_t *dp) { - ddt_t *ddt = ddt_select_by_checksum(dp->dp_spa, c); + ddt_bookmark_t *ddb = &dp->dp_scrub_ddt_bookmark; ddt_entry_t dde; - blkptr_t blk; - zbookmark_t zb = { 0 }; - uint64_t walk = 0; int error; - if (!ddt_object_exists(ddt, type, class)) - return; - - while ((error = ddt_object_walk(ddt, type, class, &dde, &walk)) == 0) { - int p = DDT_PHYS_DITTO; - ddt_bp_create(ddt, &dde.dde_key, &dde.dde_phys[p], &blk); - scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb); + while ((error = ddt_walk(dp->dp_spa, ddb, &dde)) == 0) { + if (ddb->ddb_class > dp->dp_scrub_ddt_class_max) + return; + dsl_pool_scrub_ddt_entry(dp, ddb->ddb_checksum, &dde); + if (scrub_pause(dp, NULL, ddb)) + return; } ASSERT(error == ENOENT); + ASSERT(ddb->ddb_class > dp->dp_scrub_ddt_class_max); } -static void -dsl_pool_scrub_ditto(dsl_pool_t *dp) +void +dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum, + const ddt_entry_t *dde) { - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) - for (enum ddt_type type = 0; type < DDT_TYPES; type++) - dsl_pool_scrub_ddt(dp, c, type, DDT_CLASS_DITTO); + const ddt_key_t *ddk = &dde->dde_key; + const ddt_phys_t *ddp = dde->dde_phys; + blkptr_t blk; + zbookmark_t zb = { 0 }; + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0) + continue; + ddt_bp_create(checksum, ddk, ddp, &blk); + scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb); + } } void @@ -840,9 +912,10 @@ 
dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0); spa->spa_scrub_active = B_TRUE; - if (!dp->dp_scrub_ditto) { - dsl_pool_scrub_ditto(dp); - dp->dp_scrub_ditto = B_TRUE; + if (dp->dp_scrub_ddt_bookmark.ddb_class <= dp->dp_scrub_ddt_class_max) { + dsl_pool_scrub_ddt(dp); + if (dp->dp_scrub_pausing) + goto out; } if (dp->dp_scrub_bookmark.zb_objset == DMU_META_OBJSET) { @@ -895,12 +968,18 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx); return; out: - VERIFY(0 == zap_update(dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, + VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), + sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t), &dp->dp_scrub_bookmark, tx)); - VERIFY(0 == zap_update(dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, + VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t), + sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t), + &dp->dp_scrub_ddt_bookmark, tx)); + VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1, + &dp->dp_scrub_ddt_class_max, tx)); + VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, &spa->spa_scrub_errors, tx)); |